xref: /openbmc/qemu/target/i386/kvm/xen-emu.c (revision 0cc14182aba961f4c34a21dd202ce6e4a87470f5)
1 /*
2  * Xen HVM emulation support in KVM
3  *
4  * Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
5  * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
6  *
7  * This work is licensed under the terms of the GNU GPL, version 2 or later.
8  * See the COPYING file in the top-level directory.
9  *
10  */
11 
12 #include "qemu/osdep.h"
13 #include "qemu/log.h"
14 #include "qemu/main-loop.h"
15 #include "qemu/error-report.h"
16 #include "hw/xen/xen.h"
17 #include "sysemu/kvm_int.h"
18 #include "sysemu/kvm_xen.h"
19 #include "kvm/kvm_i386.h"
20 #include "exec/address-spaces.h"
21 #include "xen-emu.h"
22 #include "trace.h"
23 #include "sysemu/runstate.h"
24 
25 #include "hw/pci/msi.h"
26 #include "hw/i386/apic-msidef.h"
27 #include "hw/i386/e820_memory_layout.h"
28 #include "hw/i386/kvm/xen_overlay.h"
29 #include "hw/i386/kvm/xen_evtchn.h"
30 #include "hw/i386/kvm/xen_gnttab.h"
31 #include "hw/i386/kvm/xen_primary_console.h"
32 #include "hw/i386/kvm/xen_xenstore.h"
33 
34 #include "hw/xen/interface/version.h"
35 #include "hw/xen/interface/sched.h"
36 #include "hw/xen/interface/memory.h"
37 #include "hw/xen/interface/hvm/hvm_op.h"
38 #include "hw/xen/interface/hvm/params.h"
39 #include "hw/xen/interface/vcpu.h"
40 #include "hw/xen/interface/event_channel.h"
41 #include "hw/xen/interface/grant_table.h"
42 
43 #include "xen-compat.h"
44 
/*
 * Forward declarations. The two timer callbacks are registered by
 * kvm_xen_init_vcpu() and the single-shot stop helper is called from
 * do_vcpu_soft_reset(), all of which appear before the definitions.
 */
static void xen_vcpu_singleshot_timer_event(void *opaque);
static void xen_vcpu_periodic_timer_event(void *opaque);
static int vcpuop_stop_singleshot_timer(CPUState *cs);
48 
/*
 * hypercall_compat32(longmode): true when a hypercall's argument
 * structures have to be read using the 32-bit compat layouts. On an
 * x86_64 target that is whenever the 'longmode' flag of the hypercall
 * exit is clear; on other targets the macro is constant false.
 */
#ifdef TARGET_X86_64
#define hypercall_compat32(longmode) (!(longmode))
#else
#define hypercall_compat32(longmode) (false)
#endif
54 
55 static bool kvm_gva_to_gpa(CPUState *cs, uint64_t gva, uint64_t *gpa,
56                            size_t *len, bool is_write)
57 {
58         struct kvm_translation tr = {
59             .linear_address = gva,
60         };
61 
62         if (len) {
63             *len = TARGET_PAGE_SIZE - (gva & ~TARGET_PAGE_MASK);
64         }
65 
66         if (kvm_vcpu_ioctl(cs, KVM_TRANSLATE, &tr) || !tr.valid ||
67             (is_write && !tr.writeable)) {
68             return false;
69         }
70         *gpa = tr.physical_address;
71         return true;
72 }
73 
74 static int kvm_gva_rw(CPUState *cs, uint64_t gva, void *_buf, size_t sz,
75                       bool is_write)
76 {
77     uint8_t *buf = (uint8_t *)_buf;
78     uint64_t gpa;
79     size_t len;
80 
81     while (sz) {
82         if (!kvm_gva_to_gpa(cs, gva, &gpa, &len, is_write)) {
83             return -EFAULT;
84         }
85         if (len > sz) {
86             len = sz;
87         }
88 
89         cpu_physical_memory_rw(gpa, buf, len, is_write);
90 
91         buf += len;
92         sz -= len;
93         gva += len;
94     }
95 
96     return 0;
97 }
98 
99 static inline int kvm_copy_from_gva(CPUState *cs, uint64_t gva, void *buf,
100                                     size_t sz)
101 {
102     return kvm_gva_rw(cs, gva, buf, sz, false);
103 }
104 
105 static inline int kvm_copy_to_gva(CPUState *cs, uint64_t gva, void *buf,
106                                   size_t sz)
107 {
108     return kvm_gva_rw(cs, gva, buf, sz, true);
109 }
110 
111 int kvm_xen_init(KVMState *s, uint32_t hypercall_msr)
112 {
113     const int required_caps = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
114         KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | KVM_XEN_HVM_CONFIG_SHARED_INFO;
115     struct kvm_xen_hvm_config cfg = {
116         .msr = hypercall_msr,
117         .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
118     };
119     int xen_caps, ret;
120 
121     xen_caps = kvm_check_extension(s, KVM_CAP_XEN_HVM);
122     if (required_caps & ~xen_caps) {
123         error_report("kvm: Xen HVM guest support not present or insufficient");
124         return -ENOSYS;
125     }
126 
127     if (xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND) {
128         struct kvm_xen_hvm_attr ha = {
129             .type = KVM_XEN_ATTR_TYPE_XEN_VERSION,
130             .u.xen_version = s->xen_version,
131         };
132         (void)kvm_vm_ioctl(s, KVM_XEN_HVM_SET_ATTR, &ha);
133 
134         cfg.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
135     }
136 
137     ret = kvm_vm_ioctl(s, KVM_XEN_HVM_CONFIG, &cfg);
138     if (ret < 0) {
139         error_report("kvm: Failed to enable Xen HVM support: %s",
140                      strerror(-ret));
141         return ret;
142     }
143 
144     /* If called a second time, don't repeat the rest of the setup. */
145     if (s->xen_caps) {
146         return 0;
147     }
148 
149     /*
150      * Event channel delivery via GSI/PCI_INTX needs to poll the vcpu_info
151      * of vCPU0 to deassert the IRQ when ->evtchn_upcall_pending is cleared.
152      *
153      * In the kernel, there's a notifier hook on the PIC/IOAPIC which allows
154      * such things to be polled at precisely the right time. We *could* do
155      * it nicely in the kernel: check vcpu_info[0]->evtchn_upcall_pending at
156      * the moment the IRQ is acked, and see if it should be reasserted.
157      *
158      * But the in-kernel irqchip is deprecated, so we're unlikely to add
159      * that support in the kernel. Insist on using the split irqchip mode
160      * instead.
161      *
162      * This leaves us polling for the level going low in QEMU, which lacks
163      * the appropriate hooks in its PIC/IOAPIC code. Even VFIO is sending a
164      * spurious 'ack' to an INTX IRQ every time there's any MMIO access to
165      * the device (for which it has to unmap the device and trap access, for
166      * some period after an IRQ!!). In the Xen case, we do it on exit from
167      * KVM_RUN, if the flag is set to say that the GSI is currently asserted.
168      * Which is kind of icky, but less so than the VFIO one. I may fix them
169      * both later...
170      */
171     if (!kvm_kernel_irqchip_split()) {
172         error_report("kvm: Xen support requires kernel-irqchip=split");
173         return -EINVAL;
174     }
175 
176     s->xen_caps = xen_caps;
177 
178     /* Tell fw_cfg to notify the BIOS to reserve the range. */
179     ret = e820_add_entry(XEN_SPECIAL_AREA_ADDR, XEN_SPECIAL_AREA_SIZE,
180                          E820_RESERVED);
181     if (ret < 0) {
182         fprintf(stderr, "e820_add_entry() table is full\n");
183         return ret;
184     }
185 
186     /* The pages couldn't be overlaid until KVM was initialized */
187     xen_primary_console_reset();
188     xen_xenstore_reset();
189 
190     return 0;
191 }
192 
193 int kvm_xen_init_vcpu(CPUState *cs)
194 {
195     X86CPU *cpu = X86_CPU(cs);
196     CPUX86State *env = &cpu->env;
197     int err;
198 
199     /*
200      * The kernel needs to know the Xen/ACPI vCPU ID because that's
201      * what the guest uses in hypercalls such as timers. It doesn't
202      * match the APIC ID which is generally used for talking to the
203      * kernel about vCPUs. And if vCPU threads race with creating
204      * their KVM vCPUs out of order, it doesn't necessarily match
205      * with the kernel's internal vCPU indices either.
206      */
207     if (kvm_xen_has_cap(EVTCHN_SEND)) {
208         struct kvm_xen_vcpu_attr va = {
209             .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID,
210             .u.vcpu_id = cs->cpu_index,
211         };
212         err = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &va);
213         if (err) {
214             error_report("kvm: Failed to set Xen vCPU ID attribute: %s",
215                          strerror(-err));
216             return err;
217         }
218     }
219 
220     env->xen_vcpu_info_gpa = INVALID_GPA;
221     env->xen_vcpu_info_default_gpa = INVALID_GPA;
222     env->xen_vcpu_time_info_gpa = INVALID_GPA;
223     env->xen_vcpu_runstate_gpa = INVALID_GPA;
224 
225     qemu_mutex_init(&env->xen_timers_lock);
226     env->xen_singleshot_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
227                                              xen_vcpu_singleshot_timer_event,
228                                              cpu);
229     if (!env->xen_singleshot_timer) {
230         return -ENOMEM;
231     }
232     env->xen_singleshot_timer->opaque = cs;
233 
234     env->xen_periodic_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
235                                            xen_vcpu_periodic_timer_event,
236                                            cpu);
237     if (!env->xen_periodic_timer) {
238         return -ENOMEM;
239     }
240     env->xen_periodic_timer->opaque = cs;
241 
242     return 0;
243 }
244 
245 uint32_t kvm_xen_get_caps(void)
246 {
247     return kvm_state->xen_caps;
248 }
249 
250 static bool kvm_xen_hcall_xen_version(struct kvm_xen_exit *exit, X86CPU *cpu,
251                                      int cmd, uint64_t arg)
252 {
253     int err = 0;
254 
255     switch (cmd) {
256     case XENVER_get_features: {
257         struct xen_feature_info fi;
258 
259         /* No need for 32/64 compat handling */
260         qemu_build_assert(sizeof(fi) == 8);
261 
262         err = kvm_copy_from_gva(CPU(cpu), arg, &fi, sizeof(fi));
263         if (err) {
264             break;
265         }
266 
267         fi.submap = 0;
268         if (fi.submap_idx == 0) {
269             fi.submap |= 1 << XENFEAT_writable_page_tables |
270                          1 << XENFEAT_writable_descriptor_tables |
271                          1 << XENFEAT_auto_translated_physmap |
272                          1 << XENFEAT_hvm_callback_vector |
273                          1 << XENFEAT_hvm_safe_pvclock |
274                          1 << XENFEAT_hvm_pirqs;
275         }
276 
277         err = kvm_copy_to_gva(CPU(cpu), arg, &fi, sizeof(fi));
278         break;
279     }
280 
281     default:
282         return false;
283     }
284 
285     exit->u.hcall.result = err;
286     return true;
287 }
288 
289 static int kvm_xen_set_vcpu_attr(CPUState *cs, uint16_t type, uint64_t gpa)
290 {
291     struct kvm_xen_vcpu_attr xhsi;
292 
293     xhsi.type = type;
294     xhsi.u.gpa = gpa;
295 
296     trace_kvm_xen_set_vcpu_attr(cs->cpu_index, type, gpa);
297 
298     return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &xhsi);
299 }
300 
301 static int kvm_xen_set_vcpu_callback_vector(CPUState *cs)
302 {
303     uint8_t vector = X86_CPU(cs)->env.xen_vcpu_callback_vector;
304     struct kvm_xen_vcpu_attr xva;
305 
306     xva.type = KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR;
307     xva.u.vector = vector;
308 
309     trace_kvm_xen_set_vcpu_callback(cs->cpu_index, vector);
310 
311     return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &xva);
312 }
313 
314 static void do_set_vcpu_callback_vector(CPUState *cs, run_on_cpu_data data)
315 {
316     X86CPU *cpu = X86_CPU(cs);
317     CPUX86State *env = &cpu->env;
318 
319     env->xen_vcpu_callback_vector = data.host_int;
320 
321     if (kvm_xen_has_cap(EVTCHN_SEND)) {
322         kvm_xen_set_vcpu_callback_vector(cs);
323     }
324 }
325 
326 static int set_vcpu_info(CPUState *cs, uint64_t gpa)
327 {
328     X86CPU *cpu = X86_CPU(cs);
329     CPUX86State *env = &cpu->env;
330     MemoryRegionSection mrs = { .mr = NULL };
331     void *vcpu_info_hva = NULL;
332     int ret;
333 
334     ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO, gpa);
335     if (ret || gpa == INVALID_GPA) {
336         goto out;
337     }
338 
339     mrs = memory_region_find(get_system_memory(), gpa,
340                              sizeof(struct vcpu_info));
341     if (mrs.mr && mrs.mr->ram_block &&
342         !int128_lt(mrs.size, int128_make64(sizeof(struct vcpu_info)))) {
343         vcpu_info_hva = qemu_map_ram_ptr(mrs.mr->ram_block,
344                                          mrs.offset_within_region);
345     }
346     if (!vcpu_info_hva) {
347         if (mrs.mr) {
348             memory_region_unref(mrs.mr);
349             mrs.mr = NULL;
350         }
351         ret = -EINVAL;
352     }
353 
354  out:
355     if (env->xen_vcpu_info_mr) {
356         memory_region_unref(env->xen_vcpu_info_mr);
357     }
358     env->xen_vcpu_info_hva = vcpu_info_hva;
359     env->xen_vcpu_info_mr = mrs.mr;
360     return ret;
361 }
362 
363 static void do_set_vcpu_info_default_gpa(CPUState *cs, run_on_cpu_data data)
364 {
365     X86CPU *cpu = X86_CPU(cs);
366     CPUX86State *env = &cpu->env;
367 
368     env->xen_vcpu_info_default_gpa = data.host_ulong;
369 
370     /* Changing the default does nothing if a vcpu_info was explicitly set. */
371     if (env->xen_vcpu_info_gpa == INVALID_GPA) {
372         set_vcpu_info(cs, env->xen_vcpu_info_default_gpa);
373     }
374 }
375 
376 static void do_set_vcpu_info_gpa(CPUState *cs, run_on_cpu_data data)
377 {
378     X86CPU *cpu = X86_CPU(cs);
379     CPUX86State *env = &cpu->env;
380 
381     env->xen_vcpu_info_gpa = data.host_ulong;
382 
383     set_vcpu_info(cs, env->xen_vcpu_info_gpa);
384 }
385 
386 void *kvm_xen_get_vcpu_info_hva(uint32_t vcpu_id)
387 {
388     CPUState *cs = qemu_get_cpu(vcpu_id);
389     if (!cs) {
390         return NULL;
391     }
392 
393     return X86_CPU(cs)->env.xen_vcpu_info_hva;
394 }
395 
396 void kvm_xen_maybe_deassert_callback(CPUState *cs)
397 {
398     CPUX86State *env = &X86_CPU(cs)->env;
399     struct vcpu_info *vi = env->xen_vcpu_info_hva;
400     if (!vi) {
401         return;
402     }
403 
404     /* If the evtchn_upcall_pending flag is cleared, turn the GSI off. */
405     if (!vi->evtchn_upcall_pending) {
406         bql_lock();
407         /*
408          * Check again now we have the lock, because it may have been
409          * asserted in the interim. And we don't want to take the lock
410          * every time because this is a fast path.
411          */
412         if (!vi->evtchn_upcall_pending) {
413             X86_CPU(cs)->env.xen_callback_asserted = false;
414             xen_evtchn_set_callback_level(0);
415         }
416         bql_unlock();
417     }
418 }
419 
420 void kvm_xen_set_callback_asserted(void)
421 {
422     CPUState *cs = qemu_get_cpu(0);
423 
424     if (cs) {
425         X86_CPU(cs)->env.xen_callback_asserted = true;
426     }
427 }
428 
429 bool kvm_xen_has_vcpu_callback_vector(void)
430 {
431     CPUState *cs = qemu_get_cpu(0);
432 
433     return cs && !!X86_CPU(cs)->env.xen_vcpu_callback_vector;
434 }
435 
436 void kvm_xen_inject_vcpu_callback_vector(uint32_t vcpu_id, int type)
437 {
438     CPUState *cs = qemu_get_cpu(vcpu_id);
439     uint8_t vector;
440 
441     if (!cs) {
442         return;
443     }
444 
445     vector = X86_CPU(cs)->env.xen_vcpu_callback_vector;
446     if (vector) {
447         /*
448          * The per-vCPU callback vector injected via lapic. Just
449          * deliver it as an MSI.
450          */
451         MSIMessage msg = {
452             .address = APIC_DEFAULT_ADDRESS |
453                        (X86_CPU(cs)->apic_id << MSI_ADDR_DEST_ID_SHIFT),
454             .data = vector | (1UL << MSI_DATA_LEVEL_SHIFT),
455         };
456         kvm_irqchip_send_msi(kvm_state, msg);
457         return;
458     }
459 
460     switch (type) {
461     case HVM_PARAM_CALLBACK_TYPE_VECTOR:
462         /*
463          * If the evtchn_upcall_pending field in the vcpu_info is set, then
464          * KVM will automatically deliver the vector on entering the vCPU
465          * so all we have to do is kick it out.
466          */
467         qemu_cpu_kick(cs);
468         break;
469 
470     case HVM_PARAM_CALLBACK_TYPE_GSI:
471     case HVM_PARAM_CALLBACK_TYPE_PCI_INTX:
472         if (vcpu_id == 0) {
473             xen_evtchn_set_callback_level(1);
474         }
475         break;
476     }
477 }
478 
479 /* Must always be called with xen_timers_lock held */
480 static int kvm_xen_set_vcpu_timer(CPUState *cs)
481 {
482     X86CPU *cpu = X86_CPU(cs);
483     CPUX86State *env = &cpu->env;
484 
485     struct kvm_xen_vcpu_attr va = {
486         .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
487         .u.timer.port = env->xen_virq[VIRQ_TIMER],
488         .u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
489         .u.timer.expires_ns = env->xen_singleshot_timer_ns,
490     };
491 
492     return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &va);
493 }
494 
495 static void do_set_vcpu_timer_virq(CPUState *cs, run_on_cpu_data data)
496 {
497     QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock);
498     kvm_xen_set_vcpu_timer(cs);
499 }
500 
501 int kvm_xen_set_vcpu_virq(uint32_t vcpu_id, uint16_t virq, uint16_t port)
502 {
503     CPUState *cs = qemu_get_cpu(vcpu_id);
504 
505     if (!cs) {
506         return -ENOENT;
507     }
508 
509     /* cpu.h doesn't include the actual Xen header. */
510     qemu_build_assert(NR_VIRQS == XEN_NR_VIRQS);
511 
512     if (virq >= NR_VIRQS) {
513         return -EINVAL;
514     }
515 
516     if (port && X86_CPU(cs)->env.xen_virq[virq]) {
517         return -EEXIST;
518     }
519 
520     X86_CPU(cs)->env.xen_virq[virq] = port;
521     if (virq == VIRQ_TIMER && kvm_xen_has_cap(EVTCHN_SEND)) {
522         async_run_on_cpu(cs, do_set_vcpu_timer_virq,
523                          RUN_ON_CPU_HOST_INT(port));
524     }
525     return 0;
526 }
527 
528 static void do_set_vcpu_time_info_gpa(CPUState *cs, run_on_cpu_data data)
529 {
530     X86CPU *cpu = X86_CPU(cs);
531     CPUX86State *env = &cpu->env;
532 
533     env->xen_vcpu_time_info_gpa = data.host_ulong;
534 
535     kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
536                           env->xen_vcpu_time_info_gpa);
537 }
538 
539 static void do_set_vcpu_runstate_gpa(CPUState *cs, run_on_cpu_data data)
540 {
541     X86CPU *cpu = X86_CPU(cs);
542     CPUX86State *env = &cpu->env;
543 
544     env->xen_vcpu_runstate_gpa = data.host_ulong;
545 
546     kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
547                           env->xen_vcpu_runstate_gpa);
548 }
549 
550 static void do_vcpu_soft_reset(CPUState *cs, run_on_cpu_data data)
551 {
552     X86CPU *cpu = X86_CPU(cs);
553     CPUX86State *env = &cpu->env;
554 
555     env->xen_vcpu_info_gpa = INVALID_GPA;
556     env->xen_vcpu_info_default_gpa = INVALID_GPA;
557     env->xen_vcpu_time_info_gpa = INVALID_GPA;
558     env->xen_vcpu_runstate_gpa = INVALID_GPA;
559     env->xen_vcpu_callback_vector = 0;
560     memset(env->xen_virq, 0, sizeof(env->xen_virq));
561 
562     set_vcpu_info(cs, INVALID_GPA);
563     kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
564                           INVALID_GPA);
565     kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
566                           INVALID_GPA);
567     if (kvm_xen_has_cap(EVTCHN_SEND)) {
568         kvm_xen_set_vcpu_callback_vector(cs);
569 
570         QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock);
571         env->xen_singleshot_timer_ns = 0;
572         kvm_xen_set_vcpu_timer(cs);
573     } else {
574         vcpuop_stop_singleshot_timer(cs);
575     };
576 
577 }
578 
579 static int xen_set_shared_info(uint64_t gfn)
580 {
581     uint64_t gpa = gfn << TARGET_PAGE_BITS;
582     int i, err;
583 
584     BQL_LOCK_GUARD();
585 
586     /*
587      * The xen_overlay device tells KVM about it too, since it had to
588      * do that on migration load anyway (unless we're going to jump
589      * through lots of hoops to maintain the fiction that this isn't
590      * KVM-specific.
591      */
592     err = xen_overlay_map_shinfo_page(gpa);
593     if (err) {
594             return err;
595     }
596 
597     trace_kvm_xen_set_shared_info(gfn);
598 
599     for (i = 0; i < XEN_LEGACY_MAX_VCPUS; i++) {
600         CPUState *cpu = qemu_get_cpu(i);
601         if (cpu) {
602             async_run_on_cpu(cpu, do_set_vcpu_info_default_gpa,
603                              RUN_ON_CPU_HOST_ULONG(gpa));
604         }
605         gpa += sizeof(vcpu_info_t);
606     }
607 
608     return err;
609 }
610 
611 static int add_to_physmap_one(uint32_t space, uint64_t idx, uint64_t gfn)
612 {
613     switch (space) {
614     case XENMAPSPACE_shared_info:
615         if (idx > 0) {
616             return -EINVAL;
617         }
618         return xen_set_shared_info(gfn);
619 
620     case XENMAPSPACE_grant_table:
621         return xen_gnttab_map_page(idx, gfn);
622 
623     case XENMAPSPACE_gmfn:
624     case XENMAPSPACE_gmfn_range:
625         return -ENOTSUP;
626 
627     case XENMAPSPACE_gmfn_foreign:
628     case XENMAPSPACE_dev_mmio:
629         return -EPERM;
630 
631     default:
632         return -EINVAL;
633     }
634 }
635 
636 static int do_add_to_physmap(struct kvm_xen_exit *exit, X86CPU *cpu,
637                              uint64_t arg)
638 {
639     struct xen_add_to_physmap xatp;
640     CPUState *cs = CPU(cpu);
641 
642     if (hypercall_compat32(exit->u.hcall.longmode)) {
643         struct compat_xen_add_to_physmap xatp32;
644 
645         qemu_build_assert(sizeof(struct compat_xen_add_to_physmap) == 16);
646         if (kvm_copy_from_gva(cs, arg, &xatp32, sizeof(xatp32))) {
647             return -EFAULT;
648         }
649         xatp.domid = xatp32.domid;
650         xatp.size = xatp32.size;
651         xatp.space = xatp32.space;
652         xatp.idx = xatp32.idx;
653         xatp.gpfn = xatp32.gpfn;
654     } else {
655         if (kvm_copy_from_gva(cs, arg, &xatp, sizeof(xatp))) {
656             return -EFAULT;
657         }
658     }
659 
660     if (xatp.domid != DOMID_SELF && xatp.domid != xen_domid) {
661         return -ESRCH;
662     }
663 
664     return add_to_physmap_one(xatp.space, xatp.idx, xatp.gpfn);
665 }
666 
667 static int do_add_to_physmap_batch(struct kvm_xen_exit *exit, X86CPU *cpu,
668                                    uint64_t arg)
669 {
670     struct xen_add_to_physmap_batch xatpb;
671     unsigned long idxs_gva, gpfns_gva, errs_gva;
672     CPUState *cs = CPU(cpu);
673     size_t op_sz;
674 
675     if (hypercall_compat32(exit->u.hcall.longmode)) {
676         struct compat_xen_add_to_physmap_batch xatpb32;
677 
678         qemu_build_assert(sizeof(struct compat_xen_add_to_physmap_batch) == 20);
679         if (kvm_copy_from_gva(cs, arg, &xatpb32, sizeof(xatpb32))) {
680             return -EFAULT;
681         }
682         xatpb.domid = xatpb32.domid;
683         xatpb.space = xatpb32.space;
684         xatpb.size = xatpb32.size;
685 
686         idxs_gva = xatpb32.idxs.c;
687         gpfns_gva = xatpb32.gpfns.c;
688         errs_gva = xatpb32.errs.c;
689         op_sz = sizeof(uint32_t);
690     } else {
691         if (kvm_copy_from_gva(cs, arg, &xatpb, sizeof(xatpb))) {
692             return -EFAULT;
693         }
694         op_sz = sizeof(unsigned long);
695         idxs_gva = (unsigned long)xatpb.idxs.p;
696         gpfns_gva = (unsigned long)xatpb.gpfns.p;
697         errs_gva = (unsigned long)xatpb.errs.p;
698     }
699 
700     if (xatpb.domid != DOMID_SELF && xatpb.domid != xen_domid) {
701         return -ESRCH;
702     }
703 
704     /* Explicitly invalid for the batch op. Not that we implement it anyway. */
705     if (xatpb.space == XENMAPSPACE_gmfn_range) {
706         return -EINVAL;
707     }
708 
709     while (xatpb.size--) {
710         unsigned long idx = 0;
711         unsigned long gpfn = 0;
712         int err;
713 
714         /* For 32-bit compat this only copies the low 32 bits of each */
715         if (kvm_copy_from_gva(cs, idxs_gva, &idx, op_sz) ||
716             kvm_copy_from_gva(cs, gpfns_gva, &gpfn, op_sz)) {
717             return -EFAULT;
718         }
719         idxs_gva += op_sz;
720         gpfns_gva += op_sz;
721 
722         err = add_to_physmap_one(xatpb.space, idx, gpfn);
723 
724         if (kvm_copy_to_gva(cs, errs_gva, &err, sizeof(err))) {
725             return -EFAULT;
726         }
727         errs_gva += sizeof(err);
728     }
729     return 0;
730 }
731 
732 static bool kvm_xen_hcall_memory_op(struct kvm_xen_exit *exit, X86CPU *cpu,
733                                    int cmd, uint64_t arg)
734 {
735     int err;
736 
737     switch (cmd) {
738     case XENMEM_add_to_physmap:
739         err = do_add_to_physmap(exit, cpu, arg);
740         break;
741 
742     case XENMEM_add_to_physmap_batch:
743         err = do_add_to_physmap_batch(exit, cpu, arg);
744         break;
745 
746     default:
747         return false;
748     }
749 
750     exit->u.hcall.result = err;
751     return true;
752 }
753 
754 static bool handle_set_param(struct kvm_xen_exit *exit, X86CPU *cpu,
755                              uint64_t arg)
756 {
757     CPUState *cs = CPU(cpu);
758     struct xen_hvm_param hp;
759     int err = 0;
760 
761     /* No need for 32/64 compat handling */
762     qemu_build_assert(sizeof(hp) == 16);
763 
764     if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) {
765         err = -EFAULT;
766         goto out;
767     }
768 
769     if (hp.domid != DOMID_SELF && hp.domid != xen_domid) {
770         err = -ESRCH;
771         goto out;
772     }
773 
774     switch (hp.index) {
775     case HVM_PARAM_CALLBACK_IRQ:
776         bql_lock();
777         err = xen_evtchn_set_callback_param(hp.value);
778         bql_unlock();
779         xen_set_long_mode(exit->u.hcall.longmode);
780         break;
781     default:
782         return false;
783     }
784 
785 out:
786     exit->u.hcall.result = err;
787     return true;
788 }
789 
790 static bool handle_get_param(struct kvm_xen_exit *exit, X86CPU *cpu,
791                              uint64_t arg)
792 {
793     CPUState *cs = CPU(cpu);
794     struct xen_hvm_param hp;
795     int err = 0;
796 
797     /* No need for 32/64 compat handling */
798     qemu_build_assert(sizeof(hp) == 16);
799 
800     if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) {
801         err = -EFAULT;
802         goto out;
803     }
804 
805     if (hp.domid != DOMID_SELF && hp.domid != xen_domid) {
806         err = -ESRCH;
807         goto out;
808     }
809 
810     switch (hp.index) {
811     case HVM_PARAM_STORE_PFN:
812         hp.value = XEN_SPECIAL_PFN(XENSTORE);
813         break;
814     case HVM_PARAM_STORE_EVTCHN:
815         hp.value = xen_xenstore_get_port();
816         break;
817     case HVM_PARAM_CONSOLE_PFN:
818         hp.value = xen_primary_console_get_pfn();
819         if (!hp.value) {
820             err = -EINVAL;
821         }
822         break;
823     case HVM_PARAM_CONSOLE_EVTCHN:
824         hp.value = xen_primary_console_get_port();
825         if (!hp.value) {
826             err = -EINVAL;
827         }
828         break;
829     default:
830         return false;
831     }
832 
833     if (!err && kvm_copy_to_gva(cs, arg, &hp, sizeof(hp))) {
834         err = -EFAULT;
835     }
836 out:
837     exit->u.hcall.result = err;
838     return true;
839 }
840 
841 static int kvm_xen_hcall_evtchn_upcall_vector(struct kvm_xen_exit *exit,
842                                               X86CPU *cpu, uint64_t arg)
843 {
844     struct xen_hvm_evtchn_upcall_vector up;
845     CPUState *target_cs;
846 
847     /* No need for 32/64 compat handling */
848     qemu_build_assert(sizeof(up) == 8);
849 
850     if (kvm_copy_from_gva(CPU(cpu), arg, &up, sizeof(up))) {
851         return -EFAULT;
852     }
853 
854     if (up.vector < 0x10) {
855         return -EINVAL;
856     }
857 
858     target_cs = qemu_get_cpu(up.vcpu);
859     if (!target_cs) {
860         return -EINVAL;
861     }
862 
863     async_run_on_cpu(target_cs, do_set_vcpu_callback_vector,
864                      RUN_ON_CPU_HOST_INT(up.vector));
865     return 0;
866 }
867 
868 static bool kvm_xen_hcall_hvm_op(struct kvm_xen_exit *exit, X86CPU *cpu,
869                                  int cmd, uint64_t arg)
870 {
871     int ret = -ENOSYS;
872     switch (cmd) {
873     case HVMOP_set_evtchn_upcall_vector:
874         ret = kvm_xen_hcall_evtchn_upcall_vector(exit, cpu, arg);
875         break;
876 
877     case HVMOP_pagetable_dying:
878         ret = -ENOSYS;
879         break;
880 
881     case HVMOP_set_param:
882         return handle_set_param(exit, cpu, arg);
883 
884     case HVMOP_get_param:
885         return handle_get_param(exit, cpu, arg);
886 
887     default:
888         return false;
889     }
890 
891     exit->u.hcall.result = ret;
892     return true;
893 }
894 
895 static int vcpuop_register_vcpu_info(CPUState *cs, CPUState *target,
896                                      uint64_t arg)
897 {
898     struct vcpu_register_vcpu_info rvi;
899     uint64_t gpa;
900 
901     /* No need for 32/64 compat handling */
902     qemu_build_assert(sizeof(rvi) == 16);
903     qemu_build_assert(sizeof(struct vcpu_info) == 64);
904 
905     if (!target) {
906         return -ENOENT;
907     }
908 
909     if (kvm_copy_from_gva(cs, arg, &rvi, sizeof(rvi))) {
910         return -EFAULT;
911     }
912 
913     if (rvi.offset > TARGET_PAGE_SIZE - sizeof(struct vcpu_info)) {
914         return -EINVAL;
915     }
916 
917     gpa = ((rvi.mfn << TARGET_PAGE_BITS) + rvi.offset);
918     async_run_on_cpu(target, do_set_vcpu_info_gpa, RUN_ON_CPU_HOST_ULONG(gpa));
919     return 0;
920 }
921 
922 static int vcpuop_register_vcpu_time_info(CPUState *cs, CPUState *target,
923                                           uint64_t arg)
924 {
925     struct vcpu_register_time_memory_area tma;
926     uint64_t gpa;
927     size_t len;
928 
929     /* No need for 32/64 compat handling */
930     qemu_build_assert(sizeof(tma) == 8);
931     qemu_build_assert(sizeof(struct vcpu_time_info) == 32);
932 
933     if (!target) {
934         return -ENOENT;
935     }
936 
937     if (kvm_copy_from_gva(cs, arg, &tma, sizeof(tma))) {
938         return -EFAULT;
939     }
940 
941     /*
942      * Xen actually uses the GVA and does the translation through the guest
943      * page tables each time. But Linux/KVM uses the GPA, on the assumption
944      * that guests only ever use *global* addresses (kernel virtual addresses)
945      * for it. If Linux is changed to redo the GVA→GPA translation each time,
946      * it will offer a new vCPU attribute for that, and we'll use it instead.
947      */
948     if (!kvm_gva_to_gpa(cs, tma.addr.p, &gpa, &len, false) ||
949         len < sizeof(struct vcpu_time_info)) {
950         return -EFAULT;
951     }
952 
953     async_run_on_cpu(target, do_set_vcpu_time_info_gpa,
954                      RUN_ON_CPU_HOST_ULONG(gpa));
955     return 0;
956 }
957 
958 static int vcpuop_register_runstate_info(CPUState *cs, CPUState *target,
959                                          uint64_t arg)
960 {
961     struct vcpu_register_runstate_memory_area rma;
962     uint64_t gpa;
963     size_t len;
964 
965     /* No need for 32/64 compat handling */
966     qemu_build_assert(sizeof(rma) == 8);
967     /* The runstate area actually does change size, but Linux copes. */
968 
969     if (!target) {
970         return -ENOENT;
971     }
972 
973     if (kvm_copy_from_gva(cs, arg, &rma, sizeof(rma))) {
974         return -EFAULT;
975     }
976 
977     /* As with vcpu_time_info, Xen actually uses the GVA but KVM doesn't. */
978     if (!kvm_gva_to_gpa(cs, rma.addr.p, &gpa, &len, false)) {
979         return -EFAULT;
980     }
981 
982     async_run_on_cpu(target, do_set_vcpu_runstate_gpa,
983                      RUN_ON_CPU_HOST_ULONG(gpa));
984     return 0;
985 }
986 
987 static uint64_t kvm_get_current_ns(void)
988 {
989     struct kvm_clock_data data;
990     int ret;
991 
992     ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
993     if (ret < 0) {
994         fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
995                 abort();
996     }
997 
998     return data.clock;
999 }
1000 
1001 static void xen_vcpu_singleshot_timer_event(void *opaque)
1002 {
1003     CPUState *cpu = opaque;
1004     CPUX86State *env = &X86_CPU(cpu)->env;
1005     uint16_t port = env->xen_virq[VIRQ_TIMER];
1006 
1007     if (likely(port)) {
1008         xen_evtchn_set_port(port);
1009     }
1010 
1011     qemu_mutex_lock(&env->xen_timers_lock);
1012     env->xen_singleshot_timer_ns = 0;
1013     qemu_mutex_unlock(&env->xen_timers_lock);
1014 }
1015 
1016 static void xen_vcpu_periodic_timer_event(void *opaque)
1017 {
1018     CPUState *cpu = opaque;
1019     CPUX86State *env = &X86_CPU(cpu)->env;
1020     uint16_t port = env->xen_virq[VIRQ_TIMER];
1021     int64_t qemu_now;
1022 
1023     if (likely(port)) {
1024         xen_evtchn_set_port(port);
1025     }
1026 
1027     qemu_mutex_lock(&env->xen_timers_lock);
1028 
1029     qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
1030     timer_mod_ns(env->xen_periodic_timer,
1031                  qemu_now + env->xen_periodic_timer_period);
1032 
1033     qemu_mutex_unlock(&env->xen_timers_lock);
1034 }
1035 
1036 static int do_set_periodic_timer(CPUState *target, uint64_t period_ns)
1037 {
1038     CPUX86State *tenv = &X86_CPU(target)->env;
1039     int64_t qemu_now;
1040 
1041     timer_del(tenv->xen_periodic_timer);
1042 
1043     qemu_mutex_lock(&tenv->xen_timers_lock);
1044 
1045     qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
1046     timer_mod_ns(tenv->xen_periodic_timer, qemu_now + period_ns);
1047     tenv->xen_periodic_timer_period = period_ns;
1048 
1049     qemu_mutex_unlock(&tenv->xen_timers_lock);
1050     return 0;
1051 }
1052 
/* Convert milliseconds / microseconds to signed 64-bit nanoseconds. */
#define MILLISECS(_ms)  ((int64_t)((_ms) * 1000000ULL))
#define MICROSECS(_us)  ((int64_t)((_us) * 1000ULL))
/*
 * Largest signed 64-bit nanosecond time. The shift has to be performed on
 * the unsigned value: right-shifting (int64_t)~0ull just yields -1 again.
 */
#define STIME_MAX ((int64_t)((uint64_t)~0ull >> 1))
/* Chosen so (NOW() + delta) won't overflow without an uptime of 200 years */
#define STIME_DELTA_MAX ((int64_t)((uint64_t)~0ull >> 2))
1058 
1059 static int vcpuop_set_periodic_timer(CPUState *cs, CPUState *target,
1060                                      uint64_t arg)
1061 {
1062     struct vcpu_set_periodic_timer spt;
1063 
1064     qemu_build_assert(sizeof(spt) == 8);
1065     if (kvm_copy_from_gva(cs, arg, &spt, sizeof(spt))) {
1066         return -EFAULT;
1067     }
1068 
1069     if (spt.period_ns < MILLISECS(1) || spt.period_ns > STIME_DELTA_MAX) {
1070         return -EINVAL;
1071     }
1072 
1073     return do_set_periodic_timer(target, spt.period_ns);
1074 }
1075 
1076 static int vcpuop_stop_periodic_timer(CPUState *target)
1077 {
1078     CPUX86State *tenv = &X86_CPU(target)->env;
1079 
1080     qemu_mutex_lock(&tenv->xen_timers_lock);
1081 
1082     timer_del(tenv->xen_periodic_timer);
1083     tenv->xen_periodic_timer_period = 0;
1084 
1085     qemu_mutex_unlock(&tenv->xen_timers_lock);
1086     return 0;
1087 }
1088 
1089 /*
1090  * Userspace handling of timer, for older kernels.
1091  * Must always be called with xen_timers_lock held.
1092  */
1093 static int do_set_singleshot_timer(CPUState *cs, uint64_t timeout_abs,
1094                                    bool linux_wa)
1095 {
1096     CPUX86State *env = &X86_CPU(cs)->env;
1097     int64_t now = kvm_get_current_ns();
1098     int64_t qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
1099     int64_t delta = timeout_abs - now;
1100 
1101     if (linux_wa && unlikely((int64_t)timeout_abs < 0 ||
1102                              (delta > 0 && (uint32_t)(delta >> 50) != 0))) {
1103         /*
1104          * Xen has a 'Linux workaround' in do_set_timer_op() which checks
1105          * for negative absolute timeout values (caused by integer
1106          * overflow), and for values about 13 days in the future (2^50ns)
1107          * which would be caused by jiffies overflow. For those cases, it
1108          * sets the timeout 100ms in the future (not *too* soon, since if
1109          * a guest really did set a long timeout on purpose we don't want
1110          * to keep churning CPU time by waking it up).
1111          */
1112         delta = (100 * SCALE_MS);
1113         timeout_abs = now + delta;
1114     }
1115 
1116     timer_mod_ns(env->xen_singleshot_timer, qemu_now + delta);
1117     env->xen_singleshot_timer_ns = now + delta;
1118     return 0;
1119 }
1120 
1121 static int vcpuop_set_singleshot_timer(CPUState *cs, uint64_t arg)
1122 {
1123     struct vcpu_set_singleshot_timer sst = { 0 };
1124 
1125     /*
1126      * The struct is a uint64_t followed by a uint32_t. On 32-bit that
1127      * makes it 12 bytes. On 64-bit it gets padded to 16. The parts
1128      * that get used are identical, and there's four bytes of padding
1129      * unused at the end. For true Xen compatibility we should attempt
1130      * to copy the full 16 bytes from 64-bit guests, and return -EFAULT
1131      * if we can't get the padding too. But that's daft. Just copy what
1132      * we need.
1133      */
1134     qemu_build_assert(offsetof(struct vcpu_set_singleshot_timer, flags) == 8);
1135     qemu_build_assert(sizeof(sst) >= 12);
1136 
1137     if (kvm_copy_from_gva(cs, arg, &sst, 12)) {
1138         return -EFAULT;
1139     }
1140 
1141     QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock);
1142 
1143     /*
1144      * We ignore the VCPU_SSHOTTMR_future flag, just as Xen now does.
1145      * The only guest that ever used it, got it wrong.
1146      * https://xenbits.xen.org/gitweb/?p=xen.git;a=commitdiff;h=19c6cbd909
1147      */
1148     return do_set_singleshot_timer(cs, sst.timeout_abs_ns, false);
1149 }
1150 
1151 static int vcpuop_stop_singleshot_timer(CPUState *cs)
1152 {
1153     CPUX86State *env = &X86_CPU(cs)->env;
1154 
1155     qemu_mutex_lock(&env->xen_timers_lock);
1156 
1157     timer_del(env->xen_singleshot_timer);
1158     env->xen_singleshot_timer_ns = 0;
1159 
1160     qemu_mutex_unlock(&env->xen_timers_lock);
1161     return 0;
1162 }
1163 
1164 static bool kvm_xen_hcall_set_timer_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1165                                        uint64_t timeout)
1166 {
1167     int err;
1168 
1169     if (unlikely(timeout == 0)) {
1170         err = vcpuop_stop_singleshot_timer(CPU(cpu));
1171     } else {
1172         QEMU_LOCK_GUARD(&X86_CPU(cpu)->env.xen_timers_lock);
1173         err = do_set_singleshot_timer(CPU(cpu), timeout, true);
1174     }
1175     exit->u.hcall.result = err;
1176     return true;
1177 }
1178 
1179 static bool kvm_xen_hcall_vcpu_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1180                                   int cmd, int vcpu_id, uint64_t arg)
1181 {
1182     CPUState *cs = CPU(cpu);
1183     CPUState *dest = cs->cpu_index == vcpu_id ? cs : qemu_get_cpu(vcpu_id);
1184     int err;
1185 
1186     if (!dest) {
1187         err = -ENOENT;
1188         goto out;
1189     }
1190 
1191     switch (cmd) {
1192     case VCPUOP_register_runstate_memory_area:
1193         err = vcpuop_register_runstate_info(cs, dest, arg);
1194         break;
1195     case VCPUOP_register_vcpu_time_memory_area:
1196         err = vcpuop_register_vcpu_time_info(cs, dest, arg);
1197         break;
1198     case VCPUOP_register_vcpu_info:
1199         err = vcpuop_register_vcpu_info(cs, dest, arg);
1200         break;
1201     case VCPUOP_set_singleshot_timer: {
1202         if (cs->cpu_index == vcpu_id) {
1203             err = vcpuop_set_singleshot_timer(dest, arg);
1204         } else {
1205             err = -EINVAL;
1206         }
1207         break;
1208     }
1209     case VCPUOP_stop_singleshot_timer:
1210         if (cs->cpu_index == vcpu_id) {
1211             err = vcpuop_stop_singleshot_timer(dest);
1212         } else {
1213             err = -EINVAL;
1214         }
1215         break;
1216     case VCPUOP_set_periodic_timer: {
1217         err = vcpuop_set_periodic_timer(cs, dest, arg);
1218         break;
1219     }
1220     case VCPUOP_stop_periodic_timer:
1221         err = vcpuop_stop_periodic_timer(dest);
1222         break;
1223 
1224     default:
1225         return false;
1226     }
1227 
1228  out:
1229     exit->u.hcall.result = err;
1230     return true;
1231 }
1232 
1233 static bool kvm_xen_hcall_evtchn_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1234                                     int cmd, uint64_t arg)
1235 {
1236     CPUState *cs = CPU(cpu);
1237     int err = -ENOSYS;
1238 
1239     switch (cmd) {
1240     case EVTCHNOP_init_control:
1241     case EVTCHNOP_expand_array:
1242     case EVTCHNOP_set_priority:
1243         /* We do not support FIFO channels at this point */
1244         err = -ENOSYS;
1245         break;
1246 
1247     case EVTCHNOP_status: {
1248         struct evtchn_status status;
1249 
1250         qemu_build_assert(sizeof(status) == 24);
1251         if (kvm_copy_from_gva(cs, arg, &status, sizeof(status))) {
1252             err = -EFAULT;
1253             break;
1254         }
1255 
1256         err = xen_evtchn_status_op(&status);
1257         if (!err && kvm_copy_to_gva(cs, arg, &status, sizeof(status))) {
1258             err = -EFAULT;
1259         }
1260         break;
1261     }
1262     case EVTCHNOP_close: {
1263         struct evtchn_close close;
1264 
1265         qemu_build_assert(sizeof(close) == 4);
1266         if (kvm_copy_from_gva(cs, arg, &close, sizeof(close))) {
1267             err = -EFAULT;
1268             break;
1269         }
1270 
1271         err = xen_evtchn_close_op(&close);
1272         break;
1273     }
1274     case EVTCHNOP_unmask: {
1275         struct evtchn_unmask unmask;
1276 
1277         qemu_build_assert(sizeof(unmask) == 4);
1278         if (kvm_copy_from_gva(cs, arg, &unmask, sizeof(unmask))) {
1279             err = -EFAULT;
1280             break;
1281         }
1282 
1283         err = xen_evtchn_unmask_op(&unmask);
1284         break;
1285     }
1286     case EVTCHNOP_bind_virq: {
1287         struct evtchn_bind_virq virq;
1288 
1289         qemu_build_assert(sizeof(virq) == 12);
1290         if (kvm_copy_from_gva(cs, arg, &virq, sizeof(virq))) {
1291             err = -EFAULT;
1292             break;
1293         }
1294 
1295         err = xen_evtchn_bind_virq_op(&virq);
1296         if (!err && kvm_copy_to_gva(cs, arg, &virq, sizeof(virq))) {
1297             err = -EFAULT;
1298         }
1299         break;
1300     }
1301     case EVTCHNOP_bind_pirq: {
1302         struct evtchn_bind_pirq pirq;
1303 
1304         qemu_build_assert(sizeof(pirq) == 12);
1305         if (kvm_copy_from_gva(cs, arg, &pirq, sizeof(pirq))) {
1306             err = -EFAULT;
1307             break;
1308         }
1309 
1310         err = xen_evtchn_bind_pirq_op(&pirq);
1311         if (!err && kvm_copy_to_gva(cs, arg, &pirq, sizeof(pirq))) {
1312             err = -EFAULT;
1313         }
1314         break;
1315     }
1316     case EVTCHNOP_bind_ipi: {
1317         struct evtchn_bind_ipi ipi;
1318 
1319         qemu_build_assert(sizeof(ipi) == 8);
1320         if (kvm_copy_from_gva(cs, arg, &ipi, sizeof(ipi))) {
1321             err = -EFAULT;
1322             break;
1323         }
1324 
1325         err = xen_evtchn_bind_ipi_op(&ipi);
1326         if (!err && kvm_copy_to_gva(cs, arg, &ipi, sizeof(ipi))) {
1327             err = -EFAULT;
1328         }
1329         break;
1330     }
1331     case EVTCHNOP_send: {
1332         struct evtchn_send send;
1333 
1334         qemu_build_assert(sizeof(send) == 4);
1335         if (kvm_copy_from_gva(cs, arg, &send, sizeof(send))) {
1336             err = -EFAULT;
1337             break;
1338         }
1339 
1340         err = xen_evtchn_send_op(&send);
1341         break;
1342     }
1343     case EVTCHNOP_alloc_unbound: {
1344         struct evtchn_alloc_unbound alloc;
1345 
1346         qemu_build_assert(sizeof(alloc) == 8);
1347         if (kvm_copy_from_gva(cs, arg, &alloc, sizeof(alloc))) {
1348             err = -EFAULT;
1349             break;
1350         }
1351 
1352         err = xen_evtchn_alloc_unbound_op(&alloc);
1353         if (!err && kvm_copy_to_gva(cs, arg, &alloc, sizeof(alloc))) {
1354             err = -EFAULT;
1355         }
1356         break;
1357     }
1358     case EVTCHNOP_bind_interdomain: {
1359         struct evtchn_bind_interdomain interdomain;
1360 
1361         qemu_build_assert(sizeof(interdomain) == 12);
1362         if (kvm_copy_from_gva(cs, arg, &interdomain, sizeof(interdomain))) {
1363             err = -EFAULT;
1364             break;
1365         }
1366 
1367         err = xen_evtchn_bind_interdomain_op(&interdomain);
1368         if (!err &&
1369             kvm_copy_to_gva(cs, arg, &interdomain, sizeof(interdomain))) {
1370             err = -EFAULT;
1371         }
1372         break;
1373     }
1374     case EVTCHNOP_bind_vcpu: {
1375         struct evtchn_bind_vcpu vcpu;
1376 
1377         qemu_build_assert(sizeof(vcpu) == 8);
1378         if (kvm_copy_from_gva(cs, arg, &vcpu, sizeof(vcpu))) {
1379             err = -EFAULT;
1380             break;
1381         }
1382 
1383         err = xen_evtchn_bind_vcpu_op(&vcpu);
1384         break;
1385     }
1386     case EVTCHNOP_reset: {
1387         struct evtchn_reset reset;
1388 
1389         qemu_build_assert(sizeof(reset) == 2);
1390         if (kvm_copy_from_gva(cs, arg, &reset, sizeof(reset))) {
1391             err = -EFAULT;
1392             break;
1393         }
1394 
1395         err = xen_evtchn_reset_op(&reset);
1396         break;
1397     }
1398     default:
1399         return false;
1400     }
1401 
1402     exit->u.hcall.result = err;
1403     return true;
1404 }
1405 
1406 int kvm_xen_soft_reset(void)
1407 {
1408     CPUState *cpu;
1409     int err;
1410 
1411     assert(bql_locked());
1412 
1413     trace_kvm_xen_soft_reset();
1414 
1415     err = xen_evtchn_soft_reset();
1416     if (err) {
1417         return err;
1418     }
1419 
1420     /*
1421      * Zero is the reset/startup state for HVM_PARAM_CALLBACK_IRQ. Strictly,
1422      * it maps to HVM_PARAM_CALLBACK_TYPE_GSI with GSI#0, but Xen refuses to
1423      * to deliver to the timer interrupt and treats that as 'disabled'.
1424      */
1425     err = xen_evtchn_set_callback_param(0);
1426     if (err) {
1427         return err;
1428     }
1429 
1430     CPU_FOREACH(cpu) {
1431         async_run_on_cpu(cpu, do_vcpu_soft_reset, RUN_ON_CPU_NULL);
1432     }
1433 
1434     err = xen_overlay_map_shinfo_page(INVALID_GFN);
1435     if (err) {
1436         return err;
1437     }
1438 
1439     err = xen_gnttab_reset();
1440     if (err) {
1441         return err;
1442     }
1443 
1444     err = xen_primary_console_reset();
1445     if (err) {
1446         return err;
1447     }
1448 
1449     err = xen_xenstore_reset();
1450     if (err) {
1451         return err;
1452     }
1453 
1454     return 0;
1455 }
1456 
1457 static int schedop_shutdown(CPUState *cs, uint64_t arg)
1458 {
1459     struct sched_shutdown shutdown;
1460     int ret = 0;
1461 
1462     /* No need for 32/64 compat handling */
1463     qemu_build_assert(sizeof(shutdown) == 4);
1464 
1465     if (kvm_copy_from_gva(cs, arg, &shutdown, sizeof(shutdown))) {
1466         return -EFAULT;
1467     }
1468 
1469     switch (shutdown.reason) {
1470     case SHUTDOWN_crash:
1471         cpu_dump_state(cs, stderr, CPU_DUMP_CODE);
1472         qemu_system_guest_panicked(NULL);
1473         break;
1474 
1475     case SHUTDOWN_reboot:
1476         qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
1477         break;
1478 
1479     case SHUTDOWN_poweroff:
1480         qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
1481         break;
1482 
1483     case SHUTDOWN_soft_reset:
1484         bql_lock();
1485         ret = kvm_xen_soft_reset();
1486         bql_unlock();
1487         break;
1488 
1489     default:
1490         ret = -EINVAL;
1491         break;
1492     }
1493 
1494     return ret;
1495 }
1496 
1497 static bool kvm_xen_hcall_sched_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1498                                    int cmd, uint64_t arg)
1499 {
1500     CPUState *cs = CPU(cpu);
1501     int err = -ENOSYS;
1502 
1503     switch (cmd) {
1504     case SCHEDOP_shutdown:
1505         err = schedop_shutdown(cs, arg);
1506         break;
1507 
1508     case SCHEDOP_poll:
1509         /*
1510          * Linux will panic if this doesn't work. Just yield; it's not
1511          * worth overthinking it because with event channel handling
1512          * in KVM, the kernel will intercept this and it will never
1513          * reach QEMU anyway. The semantics of the hypercall explicltly
1514          * permit spurious wakeups.
1515          */
1516     case SCHEDOP_yield:
1517         sched_yield();
1518         err = 0;
1519         break;
1520 
1521     default:
1522         return false;
1523     }
1524 
1525     exit->u.hcall.result = err;
1526     return true;
1527 }
1528 
1529 static bool kvm_xen_hcall_gnttab_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1530                                     int cmd, uint64_t arg, int count)
1531 {
1532     CPUState *cs = CPU(cpu);
1533     int err;
1534 
1535     switch (cmd) {
1536     case GNTTABOP_set_version: {
1537         struct gnttab_set_version set;
1538 
1539         qemu_build_assert(sizeof(set) == 4);
1540         if (kvm_copy_from_gva(cs, arg, &set, sizeof(set))) {
1541             err = -EFAULT;
1542             break;
1543         }
1544 
1545         err = xen_gnttab_set_version_op(&set);
1546         if (!err && kvm_copy_to_gva(cs, arg, &set, sizeof(set))) {
1547             err = -EFAULT;
1548         }
1549         break;
1550     }
1551     case GNTTABOP_get_version: {
1552         struct gnttab_get_version get;
1553 
1554         qemu_build_assert(sizeof(get) == 8);
1555         if (kvm_copy_from_gva(cs, arg, &get, sizeof(get))) {
1556             err = -EFAULT;
1557             break;
1558         }
1559 
1560         err = xen_gnttab_get_version_op(&get);
1561         if (!err && kvm_copy_to_gva(cs, arg, &get, sizeof(get))) {
1562             err = -EFAULT;
1563         }
1564         break;
1565     }
1566     case GNTTABOP_query_size: {
1567         struct gnttab_query_size size;
1568 
1569         qemu_build_assert(sizeof(size) == 16);
1570         if (kvm_copy_from_gva(cs, arg, &size, sizeof(size))) {
1571             err = -EFAULT;
1572             break;
1573         }
1574 
1575         err = xen_gnttab_query_size_op(&size);
1576         if (!err && kvm_copy_to_gva(cs, arg, &size, sizeof(size))) {
1577             err = -EFAULT;
1578         }
1579         break;
1580     }
1581     case GNTTABOP_setup_table:
1582     case GNTTABOP_copy:
1583     case GNTTABOP_map_grant_ref:
1584     case GNTTABOP_unmap_grant_ref:
1585     case GNTTABOP_swap_grant_ref:
1586         return false;
1587 
1588     default:
1589         /* Xen explicitly returns -ENOSYS to HVM guests for all others */
1590         err = -ENOSYS;
1591         break;
1592     }
1593 
1594     exit->u.hcall.result = err;
1595     return true;
1596 }
1597 
1598 static bool kvm_xen_hcall_physdev_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1599                                      int cmd, uint64_t arg)
1600 {
1601     CPUState *cs = CPU(cpu);
1602     int err;
1603 
1604     switch (cmd) {
1605     case PHYSDEVOP_map_pirq: {
1606         struct physdev_map_pirq map;
1607 
1608         if (hypercall_compat32(exit->u.hcall.longmode)) {
1609             struct compat_physdev_map_pirq *map32 = (void *)&map;
1610 
1611             if (kvm_copy_from_gva(cs, arg, map32, sizeof(*map32))) {
1612                 return -EFAULT;
1613             }
1614 
1615             /*
1616              * The only thing that's different is the alignment of the
1617              * uint64_t table_base at the end, which gets padding to make
1618              * it 64-bit aligned in the 64-bit version.
1619              */
1620             qemu_build_assert(sizeof(*map32) == 36);
1621             qemu_build_assert(offsetof(struct physdev_map_pirq, entry_nr) ==
1622                               offsetof(struct compat_physdev_map_pirq, entry_nr));
1623             memmove(&map.table_base, &map32->table_base, sizeof(map.table_base));
1624         } else {
1625             if (kvm_copy_from_gva(cs, arg, &map, sizeof(map))) {
1626                 err = -EFAULT;
1627                 break;
1628             }
1629         }
1630         err = xen_physdev_map_pirq(&map);
1631         /*
1632          * Since table_base is an IN parameter and won't be changed, just
1633          * copy the size of the compat structure back to the guest.
1634          */
1635         if (!err && kvm_copy_to_gva(cs, arg, &map,
1636                                     sizeof(struct compat_physdev_map_pirq))) {
1637             err = -EFAULT;
1638         }
1639         break;
1640     }
1641     case PHYSDEVOP_unmap_pirq: {
1642         struct physdev_unmap_pirq unmap;
1643 
1644         qemu_build_assert(sizeof(unmap) == 8);
1645         if (kvm_copy_from_gva(cs, arg, &unmap, sizeof(unmap))) {
1646             err = -EFAULT;
1647             break;
1648         }
1649 
1650         err = xen_physdev_unmap_pirq(&unmap);
1651         if (!err && kvm_copy_to_gva(cs, arg, &unmap, sizeof(unmap))) {
1652             err = -EFAULT;
1653         }
1654         break;
1655     }
1656     case PHYSDEVOP_eoi: {
1657         struct physdev_eoi eoi;
1658 
1659         qemu_build_assert(sizeof(eoi) == 4);
1660         if (kvm_copy_from_gva(cs, arg, &eoi, sizeof(eoi))) {
1661             err = -EFAULT;
1662             break;
1663         }
1664 
1665         err = xen_physdev_eoi_pirq(&eoi);
1666         if (!err && kvm_copy_to_gva(cs, arg, &eoi, sizeof(eoi))) {
1667             err = -EFAULT;
1668         }
1669         break;
1670     }
1671     case PHYSDEVOP_irq_status_query: {
1672         struct physdev_irq_status_query query;
1673 
1674         qemu_build_assert(sizeof(query) == 8);
1675         if (kvm_copy_from_gva(cs, arg, &query, sizeof(query))) {
1676             err = -EFAULT;
1677             break;
1678         }
1679 
1680         err = xen_physdev_query_pirq(&query);
1681         if (!err && kvm_copy_to_gva(cs, arg, &query, sizeof(query))) {
1682             err = -EFAULT;
1683         }
1684         break;
1685     }
1686     case PHYSDEVOP_get_free_pirq: {
1687         struct physdev_get_free_pirq get;
1688 
1689         qemu_build_assert(sizeof(get) == 8);
1690         if (kvm_copy_from_gva(cs, arg, &get, sizeof(get))) {
1691             err = -EFAULT;
1692             break;
1693         }
1694 
1695         err = xen_physdev_get_free_pirq(&get);
1696         if (!err && kvm_copy_to_gva(cs, arg, &get, sizeof(get))) {
1697             err = -EFAULT;
1698         }
1699         break;
1700     }
1701     case PHYSDEVOP_pirq_eoi_gmfn_v2: /* FreeBSD 13 makes this hypercall */
1702         err = -ENOSYS;
1703         break;
1704 
1705     default:
1706         return false;
1707     }
1708 
1709     exit->u.hcall.result = err;
1710     return true;
1711 }
1712 
1713 static bool do_kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
1714 {
1715     uint16_t code = exit->u.hcall.input;
1716 
1717     if (exit->u.hcall.cpl > 0) {
1718         exit->u.hcall.result = -EPERM;
1719         return true;
1720     }
1721 
1722     switch (code) {
1723     case __HYPERVISOR_set_timer_op:
1724         if (exit->u.hcall.longmode) {
1725             return kvm_xen_hcall_set_timer_op(exit, cpu,
1726                                               exit->u.hcall.params[0]);
1727         } else {
1728             /* In 32-bit mode, the 64-bit timer value is in two args. */
1729             uint64_t val = ((uint64_t)exit->u.hcall.params[1]) << 32 |
1730                 (uint32_t)exit->u.hcall.params[0];
1731             return kvm_xen_hcall_set_timer_op(exit, cpu, val);
1732         }
1733     case __HYPERVISOR_grant_table_op:
1734         return kvm_xen_hcall_gnttab_op(exit, cpu, exit->u.hcall.params[0],
1735                                        exit->u.hcall.params[1],
1736                                        exit->u.hcall.params[2]);
1737     case __HYPERVISOR_sched_op:
1738         return kvm_xen_hcall_sched_op(exit, cpu, exit->u.hcall.params[0],
1739                                       exit->u.hcall.params[1]);
1740     case __HYPERVISOR_event_channel_op:
1741         return kvm_xen_hcall_evtchn_op(exit, cpu, exit->u.hcall.params[0],
1742                                        exit->u.hcall.params[1]);
1743     case __HYPERVISOR_vcpu_op:
1744         return kvm_xen_hcall_vcpu_op(exit, cpu,
1745                                      exit->u.hcall.params[0],
1746                                      exit->u.hcall.params[1],
1747                                      exit->u.hcall.params[2]);
1748     case __HYPERVISOR_hvm_op:
1749         return kvm_xen_hcall_hvm_op(exit, cpu, exit->u.hcall.params[0],
1750                                     exit->u.hcall.params[1]);
1751     case __HYPERVISOR_memory_op:
1752         return kvm_xen_hcall_memory_op(exit, cpu, exit->u.hcall.params[0],
1753                                        exit->u.hcall.params[1]);
1754     case __HYPERVISOR_physdev_op:
1755         return kvm_xen_hcall_physdev_op(exit, cpu, exit->u.hcall.params[0],
1756                                         exit->u.hcall.params[1]);
1757     case __HYPERVISOR_xen_version:
1758         return kvm_xen_hcall_xen_version(exit, cpu, exit->u.hcall.params[0],
1759                                          exit->u.hcall.params[1]);
1760     default:
1761         return false;
1762     }
1763 }
1764 
1765 int kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
1766 {
1767     if (exit->type != KVM_EXIT_XEN_HCALL) {
1768         return -1;
1769     }
1770 
1771     /*
1772      * The kernel latches the guest 32/64 mode when the MSR is used to fill
1773      * the hypercall page. So if we see a hypercall in a mode that doesn't
1774      * match our own idea of the guest mode, fetch the kernel's idea of the
1775      * "long mode" to remain in sync.
1776      */
1777     if (exit->u.hcall.longmode != xen_is_long_mode()) {
1778         xen_sync_long_mode();
1779     }
1780 
1781     if (!do_kvm_xen_handle_exit(cpu, exit)) {
1782         /*
1783          * Some hypercalls will be deliberately "implemented" by returning
1784          * -ENOSYS. This case is for hypercalls which are unexpected.
1785          */
1786         exit->u.hcall.result = -ENOSYS;
1787         qemu_log_mask(LOG_UNIMP, "Unimplemented Xen hypercall %"
1788                       PRId64 " (0x%" PRIx64 " 0x%" PRIx64 " 0x%" PRIx64 ")\n",
1789                       (uint64_t)exit->u.hcall.input,
1790                       (uint64_t)exit->u.hcall.params[0],
1791                       (uint64_t)exit->u.hcall.params[1],
1792                       (uint64_t)exit->u.hcall.params[2]);
1793     }
1794 
1795     trace_kvm_xen_hypercall(CPU(cpu)->cpu_index, exit->u.hcall.cpl,
1796                             exit->u.hcall.input, exit->u.hcall.params[0],
1797                             exit->u.hcall.params[1], exit->u.hcall.params[2],
1798                             exit->u.hcall.result);
1799     return 0;
1800 }
1801 
1802 uint16_t kvm_xen_get_gnttab_max_frames(void)
1803 {
1804     KVMState *s = KVM_STATE(current_accel());
1805     return s->xen_gnttab_max_frames;
1806 }
1807 
1808 uint16_t kvm_xen_get_evtchn_max_pirq(void)
1809 {
1810     KVMState *s = KVM_STATE(current_accel());
1811     return s->xen_evtchn_max_pirq;
1812 }
1813 
1814 int kvm_put_xen_state(CPUState *cs)
1815 {
1816     X86CPU *cpu = X86_CPU(cs);
1817     CPUX86State *env = &cpu->env;
1818     uint64_t gpa;
1819     int ret;
1820 
1821     gpa = env->xen_vcpu_info_gpa;
1822     if (gpa == INVALID_GPA) {
1823         gpa = env->xen_vcpu_info_default_gpa;
1824     }
1825 
1826     if (gpa != INVALID_GPA) {
1827         ret = set_vcpu_info(cs, gpa);
1828         if (ret < 0) {
1829             return ret;
1830         }
1831     }
1832 
1833     gpa = env->xen_vcpu_time_info_gpa;
1834     if (gpa != INVALID_GPA) {
1835         ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
1836                                     gpa);
1837         if (ret < 0) {
1838             return ret;
1839         }
1840     }
1841 
1842     gpa = env->xen_vcpu_runstate_gpa;
1843     if (gpa != INVALID_GPA) {
1844         ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
1845                                     gpa);
1846         if (ret < 0) {
1847             return ret;
1848         }
1849     }
1850 
1851     if (env->xen_periodic_timer_period) {
1852         ret = do_set_periodic_timer(cs, env->xen_periodic_timer_period);
1853         if (ret < 0) {
1854             return ret;
1855         }
1856     }
1857 
1858     if (!kvm_xen_has_cap(EVTCHN_SEND)) {
1859         /*
1860          * If the kernel has EVTCHN_SEND support then it handles timers too,
1861          * so the timer will be restored by kvm_xen_set_vcpu_timer() below.
1862          */
1863         QEMU_LOCK_GUARD(&env->xen_timers_lock);
1864         if (env->xen_singleshot_timer_ns) {
1865             ret = do_set_singleshot_timer(cs, env->xen_singleshot_timer_ns,
1866                                           false);
1867             if (ret < 0) {
1868                 return ret;
1869             }
1870         }
1871         return 0;
1872     }
1873 
1874     if (env->xen_vcpu_callback_vector) {
1875         ret = kvm_xen_set_vcpu_callback_vector(cs);
1876         if (ret < 0) {
1877             return ret;
1878         }
1879     }
1880 
1881     if (env->xen_virq[VIRQ_TIMER]) {
1882         do_set_vcpu_timer_virq(cs,
1883                                RUN_ON_CPU_HOST_INT(env->xen_virq[VIRQ_TIMER]));
1884     }
1885     return 0;
1886 }
1887 
1888 int kvm_get_xen_state(CPUState *cs)
1889 {
1890     X86CPU *cpu = X86_CPU(cs);
1891     CPUX86State *env = &cpu->env;
1892     uint64_t gpa;
1893     int ret;
1894 
1895     /*
1896      * The kernel does not mark vcpu_info as dirty when it delivers interrupts
1897      * to it. It's up to userspace to *assume* that any page shared thus is
1898      * always considered dirty. The shared_info page is different since it's
1899      * an overlay and migrated separately anyway.
1900      */
1901     gpa = env->xen_vcpu_info_gpa;
1902     if (gpa == INVALID_GPA) {
1903         gpa = env->xen_vcpu_info_default_gpa;
1904     }
1905     if (gpa != INVALID_GPA) {
1906         MemoryRegionSection mrs = memory_region_find(get_system_memory(),
1907                                                      gpa,
1908                                                      sizeof(struct vcpu_info));
1909         if (mrs.mr &&
1910             !int128_lt(mrs.size, int128_make64(sizeof(struct vcpu_info)))) {
1911             memory_region_set_dirty(mrs.mr, mrs.offset_within_region,
1912                                     sizeof(struct vcpu_info));
1913         }
1914     }
1915 
1916     if (!kvm_xen_has_cap(EVTCHN_SEND)) {
1917         return 0;
1918     }
1919 
1920     /*
1921      * If the kernel is accelerating timers, read out the current value of the
1922      * singleshot timer deadline.
1923      */
1924     if (env->xen_virq[VIRQ_TIMER]) {
1925         struct kvm_xen_vcpu_attr va = {
1926             .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
1927         };
1928         ret = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_GET_ATTR, &va);
1929         if (ret < 0) {
1930             return ret;
1931         }
1932 
1933         /*
1934          * This locking is fairly pointless, and is here to appease Coverity.
1935          * There is an unavoidable race condition if a different vCPU sets a
1936          * timer for this vCPU after the value has been read out. But that's
1937          * OK in practice because *all* the vCPUs need to be stopped before
1938          * we set about migrating their state.
1939          */
1940         QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock);
1941         env->xen_singleshot_timer_ns = va.u.timer.expires_ns;
1942     }
1943 
1944     return 0;
1945 }
1946