1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
 * xen_shinfo_test
4  *
5  * Copyright © 2021 Amazon.com, Inc. or its affiliates.
6  *
7  * Xen shared_info / pvclock testing
8  */
9 
10 #include "test_util.h"
11 #include "kvm_util.h"
12 #include "processor.h"
13 
14 #include <stdint.h>
15 #include <time.h>
16 #include <sched.h>
17 #include <signal.h>
18 #include <pthread.h>
19 
20 #include <sys/eventfd.h>
21 
/* Mirrors the definition in include/linux/kvm_types.h */
#define GPA_INVALID		(~(ulong)0)

/*
 * Memslot holding the pages used by this test.  main() maps it so that
 * guest-virtual and guest-physical addresses are identical.
 */
#define SHINFO_REGION_GVA	0xc0000000ULL
#define SHINFO_REGION_GPA	0xc0000000ULL
#define SHINFO_REGION_SLOT	10

/* Extra memslot that main() adds mid-test to force a memslot change */
#define DUMMY_REGION_SLOT	11
#define DUMMY_REGION_GPA	(SHINFO_REGION_GPA + (2 * PAGE_SIZE))

/* Guest-physical addresses handed to KVM */
#define SHINFO_ADDR	(SHINFO_REGION_GPA)
#define VCPU_INFO_ADDR	(SHINFO_ADDR + 0x40)
#define PVTIME_ADDR	(SHINFO_ADDR + PAGE_SIZE)
#define RUNSTATE_ADDR	(PVTIME_ADDR + 0x20)

/* Guest-virtual addresses dereferenced by the guest code */
#define SHINFO_VADDR	(SHINFO_REGION_GVA)
#define VCPU_INFO_VADDR	(SHINFO_VADDR + 0x40)
#define RUNSTATE_VADDR	(SHINFO_VADDR + PAGE_SIZE + 0x20)

/* Interrupt vector used for event channel upcalls */
#define EVTCHN_VECTOR	0x10

/* Event channel ports used by the test */
#define EVTCHN_TIMER 13
#define EVTCHN_TEST1 15
#define EVTCHN_TEST2 66

#define XEN_HYPERCALL_MSR	0x40000000

/* Lower bound the guest asserts on 'runnable' time after the steal-time step */
#define MIN_STEAL_TIME		50000

#define SHINFO_RACE_TIMEOUT	2	/* seconds */

/* Hypercall numbers placed in RAX by the guest */
#define __HYPERVISOR_set_timer_op	15
#define __HYPERVISOR_sched_op		29
#define __HYPERVISOR_event_channel_op	32

/* Sub-operations passed as the first hypercall argument */
#define SCHEDOP_poll			3
#define EVTCHNOP_send			4

#define EVTCHNSTAT_interdomain		2
62 
63 struct evtchn_send {
64 	u32 port;
65 };
66 
67 struct sched_poll {
68 	u32 *ports;
69 	unsigned int nr_ports;
70 	u64 timeout;
71 };
72 
73 struct pvclock_vcpu_time_info {
74 	u32   version;
75 	u32   pad0;
76 	u64   tsc_timestamp;
77 	u64   system_time;
78 	u32   tsc_to_system_mul;
79 	s8    tsc_shift;
80 	u8    flags;
81 	u8    pad[2];
82 } __attribute__((__packed__)); /* 32 bytes */
83 
84 struct pvclock_wall_clock {
85 	u32   version;
86 	u32   sec;
87 	u32   nsec;
88 } __attribute__((__packed__));
89 
/*
 * Runstate area shared with the guest at RUNSTATE_ADDR.  The test asserts
 * that @state_entry_time equals the sum of all four @time entries.
 */
struct vcpu_runstate_info {
	uint32_t state;			/* one of RUNSTATE_* */
	uint64_t state_entry_time;
	uint64_t time[4];		/* indexed by RUNSTATE_* */
};
95 
96 struct arch_vcpu_info {
97     unsigned long cr2;
98     unsigned long pad; /* sizeof(vcpu_info_t) == 64 */
99 };
100 
101 struct vcpu_info {
102 	uint8_t evtchn_upcall_pending;
103 	uint8_t evtchn_upcall_mask;
104 	unsigned long evtchn_pending_sel;
105 	struct arch_vcpu_info arch;
106 	struct pvclock_vcpu_time_info time;
107 }; /* 64 bytes (x86) */
108 
109 struct shared_info {
110 	struct vcpu_info vcpu_info[32];
111 	unsigned long evtchn_pending[64];
112 	unsigned long evtchn_mask[64];
113 	struct pvclock_wall_clock wc;
114 	uint32_t wc_sec_hi;
115 	/* arch_shared_info here */
116 };
117 
/*
 * Runstate identifiers.  Each value doubles as an index into
 * vcpu_runstate_info.time[] and into runstate_names[].
 */
#define RUNSTATE_running  0
#define RUNSTATE_runnable 1
#define RUNSTATE_blocked  2
#define RUNSTATE_offline  3

/* Printable name for each RUNSTATE_* value (verbose output only) */
static const char *runstate_names[] = {
	[RUNSTATE_running]  = "running",
	[RUNSTATE_runnable] = "runnable",
	[RUNSTATE_blocked]  = "blocked",
	[RUNSTATE_offline]  = "offline",
};
129 
130 struct {
131 	struct kvm_irq_routing info;
132 	struct kvm_irq_routing_entry entries[2];
133 } irq_routes;
134 
135 static volatile bool guest_saw_irq;
136 
137 static void evtchn_handler(struct ex_regs *regs)
138 {
139 	struct vcpu_info *vi = (void *)VCPU_INFO_VADDR;
140 	vi->evtchn_upcall_pending = 0;
141 	vi->evtchn_pending_sel = 0;
142 	guest_saw_irq = true;
143 
144 	GUEST_SYNC(0x20);
145 }
146 
147 static void guest_wait_for_irq(void)
148 {
149 	while (!guest_saw_irq)
150 		__asm__ __volatile__ ("rep nop" : : : "memory");
151 	guest_saw_irq = false;
152 }
153 
154 static void guest_code(void)
155 {
156 	struct vcpu_runstate_info *rs = (void *)RUNSTATE_VADDR;
157 	int i;
158 
159 	__asm__ __volatile__(
160 		"sti\n"
161 		"nop\n"
162 	);
163 
164 	/* Trigger an interrupt injection */
165 	GUEST_SYNC(0);
166 
167 	guest_wait_for_irq();
168 
169 	/* Test having the host set runstates manually */
170 	GUEST_SYNC(RUNSTATE_runnable);
171 	GUEST_ASSERT(rs->time[RUNSTATE_runnable] != 0);
172 	GUEST_ASSERT(rs->state == 0);
173 
174 	GUEST_SYNC(RUNSTATE_blocked);
175 	GUEST_ASSERT(rs->time[RUNSTATE_blocked] != 0);
176 	GUEST_ASSERT(rs->state == 0);
177 
178 	GUEST_SYNC(RUNSTATE_offline);
179 	GUEST_ASSERT(rs->time[RUNSTATE_offline] != 0);
180 	GUEST_ASSERT(rs->state == 0);
181 
182 	/* Test runstate time adjust */
183 	GUEST_SYNC(4);
184 	GUEST_ASSERT(rs->time[RUNSTATE_blocked] == 0x5a);
185 	GUEST_ASSERT(rs->time[RUNSTATE_offline] == 0x6b6b);
186 
187 	/* Test runstate time set */
188 	GUEST_SYNC(5);
189 	GUEST_ASSERT(rs->state_entry_time >= 0x8000);
190 	GUEST_ASSERT(rs->time[RUNSTATE_runnable] == 0);
191 	GUEST_ASSERT(rs->time[RUNSTATE_blocked] == 0x6b6b);
192 	GUEST_ASSERT(rs->time[RUNSTATE_offline] == 0x5a);
193 
194 	/* sched_yield() should result in some 'runnable' time */
195 	GUEST_SYNC(6);
196 	GUEST_ASSERT(rs->time[RUNSTATE_runnable] >= MIN_STEAL_TIME);
197 
198 	/* Attempt to deliver a *masked* interrupt */
199 	GUEST_SYNC(7);
200 
201 	/* Wait until we see the bit set */
202 	struct shared_info *si = (void *)SHINFO_VADDR;
203 	while (!si->evtchn_pending[0])
204 		__asm__ __volatile__ ("rep nop" : : : "memory");
205 
206 	/* Now deliver an *unmasked* interrupt */
207 	GUEST_SYNC(8);
208 
209 	guest_wait_for_irq();
210 
211 	/* Change memslots and deliver an interrupt */
212 	GUEST_SYNC(9);
213 
214 	guest_wait_for_irq();
215 
216 	/* Deliver event channel with KVM_XEN_HVM_EVTCHN_SEND */
217 	GUEST_SYNC(10);
218 
219 	guest_wait_for_irq();
220 
221 	GUEST_SYNC(11);
222 
223 	/* Our turn. Deliver event channel (to ourselves) with
224 	 * EVTCHNOP_send hypercall. */
225 	unsigned long rax;
226 	struct evtchn_send s = { .port = 127 };
227 	__asm__ __volatile__ ("vmcall" :
228 			      "=a" (rax) :
229 			      "a" (__HYPERVISOR_event_channel_op),
230 			      "D" (EVTCHNOP_send),
231 			      "S" (&s));
232 
233 	GUEST_ASSERT(rax == 0);
234 
235 	guest_wait_for_irq();
236 
237 	GUEST_SYNC(12);
238 
239 	/* Deliver "outbound" event channel to an eventfd which
240 	 * happens to be one of our own irqfds. */
241 	s.port = 197;
242 	__asm__ __volatile__ ("vmcall" :
243 			      "=a" (rax) :
244 			      "a" (__HYPERVISOR_event_channel_op),
245 			      "D" (EVTCHNOP_send),
246 			      "S" (&s));
247 
248 	GUEST_ASSERT(rax == 0);
249 
250 	guest_wait_for_irq();
251 
252 	GUEST_SYNC(13);
253 
254 	/* Set a timer 100ms in the future. */
255 	__asm__ __volatile__ ("vmcall" :
256 			      "=a" (rax) :
257 			      "a" (__HYPERVISOR_set_timer_op),
258 			      "D" (rs->state_entry_time + 100000000));
259 	GUEST_ASSERT(rax == 0);
260 
261 	GUEST_SYNC(14);
262 
263 	/* Now wait for the timer */
264 	guest_wait_for_irq();
265 
266 	GUEST_SYNC(15);
267 
268 	/* The host has 'restored' the timer. Just wait for it. */
269 	guest_wait_for_irq();
270 
271 	GUEST_SYNC(16);
272 
273 	/* Poll for an event channel port which is already set */
274 	u32 ports[1] = { EVTCHN_TIMER };
275 	struct sched_poll p = {
276 		.ports = ports,
277 		.nr_ports = 1,
278 		.timeout = 0,
279 	};
280 
281 	__asm__ __volatile__ ("vmcall" :
282 			      "=a" (rax) :
283 			      "a" (__HYPERVISOR_sched_op),
284 			      "D" (SCHEDOP_poll),
285 			      "S" (&p));
286 
287 	GUEST_ASSERT(rax == 0);
288 
289 	GUEST_SYNC(17);
290 
291 	/* Poll for an unset port and wait for the timeout. */
292 	p.timeout = 100000000;
293 	__asm__ __volatile__ ("vmcall" :
294 			      "=a" (rax) :
295 			      "a" (__HYPERVISOR_sched_op),
296 			      "D" (SCHEDOP_poll),
297 			      "S" (&p));
298 
299 	GUEST_ASSERT(rax == 0);
300 
301 	GUEST_SYNC(18);
302 
303 	/* A timer will wake the masked port we're waiting on, while we poll */
304 	p.timeout = 0;
305 	__asm__ __volatile__ ("vmcall" :
306 			      "=a" (rax) :
307 			      "a" (__HYPERVISOR_sched_op),
308 			      "D" (SCHEDOP_poll),
309 			      "S" (&p));
310 
311 	GUEST_ASSERT(rax == 0);
312 
313 	GUEST_SYNC(19);
314 
315 	/* A timer wake an *unmasked* port which should wake us with an
316 	 * actual interrupt, while we're polling on a different port. */
317 	ports[0]++;
318 	p.timeout = 0;
319 	__asm__ __volatile__ ("vmcall" :
320 			      "=a" (rax) :
321 			      "a" (__HYPERVISOR_sched_op),
322 			      "D" (SCHEDOP_poll),
323 			      "S" (&p));
324 
325 	GUEST_ASSERT(rax == 0);
326 
327 	guest_wait_for_irq();
328 
329 	GUEST_SYNC(20);
330 
331 	/* Timer should have fired already */
332 	guest_wait_for_irq();
333 
334 	GUEST_SYNC(21);
335 	/* Racing host ioctls */
336 
337 	guest_wait_for_irq();
338 
339 	GUEST_SYNC(22);
340 	/* Racing vmcall against host ioctl */
341 
342 	ports[0] = 0;
343 
344 	p = (struct sched_poll) {
345 		.ports = ports,
346 		.nr_ports = 1,
347 		.timeout = 0
348 	};
349 
350 wait_for_timer:
351 	/*
352 	 * Poll for a timer wake event while the worker thread is mucking with
353 	 * the shared info.  KVM XEN drops timer IRQs if the shared info is
354 	 * invalid when the timer expires.  Arbitrarily poll 100 times before
355 	 * giving up and asking the VMM to re-arm the timer.  100 polls should
356 	 * consume enough time to beat on KVM without taking too long if the
357 	 * timer IRQ is dropped due to an invalid event channel.
358 	 */
359 	for (i = 0; i < 100 && !guest_saw_irq; i++)
360 		asm volatile("vmcall"
361 			     : "=a" (rax)
362 			     : "a" (__HYPERVISOR_sched_op),
363 			       "D" (SCHEDOP_poll),
364 			       "S" (&p)
365 			     : "memory");
366 
367 	/*
368 	 * Re-send the timer IRQ if it was (likely) dropped due to the timer
369 	 * expiring while the event channel was invalid.
370 	 */
371 	if (!guest_saw_irq) {
372 		GUEST_SYNC(23);
373 		goto wait_for_timer;
374 	}
375 	guest_saw_irq = false;
376 
377 	GUEST_SYNC(24);
378 }
379 
/*
 * Three-way comparison of two timespecs: seconds first, nanoseconds as the
 * tie-breaker.  Returns 1 if @a is later than @b, -1 if earlier, 0 if equal.
 */
static int cmp_timespec(struct timespec *a, struct timespec *b)
{
	if (a->tv_sec != b->tv_sec)
		return a->tv_sec > b->tv_sec ? 1 : -1;

	if (a->tv_nsec != b->tv_nsec)
		return a->tv_nsec > b->tv_nsec ? 1 : -1;

	return 0;
}
393 
394 static struct vcpu_info *vinfo;
395 static struct kvm_vcpu *vcpu;
396 
397 static void handle_alrm(int sig)
398 {
399 	if (vinfo)
400 		printf("evtchn_upcall_pending 0x%x\n", vinfo->evtchn_upcall_pending);
401 	vcpu_dump(stdout, vcpu, 0);
402 	TEST_FAIL("IRQ delivery timed out");
403 }
404 
405 static void *juggle_shinfo_state(void *arg)
406 {
407 	struct kvm_vm *vm = (struct kvm_vm *)arg;
408 
409 	struct kvm_xen_hvm_attr cache_init = {
410 		.type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
411 		.u.shared_info.gfn = SHINFO_REGION_GPA / PAGE_SIZE
412 	};
413 
414 	struct kvm_xen_hvm_attr cache_destroy = {
415 		.type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
416 		.u.shared_info.gfn = GPA_INVALID
417 	};
418 
419 	for (;;) {
420 		__vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_init);
421 		__vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_destroy);
422 		pthread_testcancel();
423 	};
424 
425 	return NULL;
426 }
427 
428 int main(int argc, char *argv[])
429 {
430 	struct timespec min_ts, max_ts, vm_ts;
431 	struct kvm_vm *vm;
432 	pthread_t thread;
433 	bool verbose;
434 	int ret;
435 
436 	verbose = argc > 1 && (!strncmp(argv[1], "-v", 3) ||
437 			       !strncmp(argv[1], "--verbose", 10));
438 
439 	int xen_caps = kvm_check_cap(KVM_CAP_XEN_HVM);
440 	TEST_REQUIRE(xen_caps & KVM_XEN_HVM_CONFIG_SHARED_INFO);
441 
442 	bool do_runstate_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_RUNSTATE);
443 	bool do_eventfd_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL);
444 	bool do_evtchn_tests = do_eventfd_tests && !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND);
445 
446 	clock_gettime(CLOCK_REALTIME, &min_ts);
447 
448 	vm = vm_create_with_one_vcpu(&vcpu, guest_code);
449 
450 	/* Map a region for the shared_info page */
451 	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
452 				    SHINFO_REGION_GPA, SHINFO_REGION_SLOT, 2, 0);
453 	virt_map(vm, SHINFO_REGION_GVA, SHINFO_REGION_GPA, 2);
454 
455 	struct shared_info *shinfo = addr_gpa2hva(vm, SHINFO_VADDR);
456 
457 	int zero_fd = open("/dev/zero", O_RDONLY);
458 	TEST_ASSERT(zero_fd != -1, "Failed to open /dev/zero");
459 
460 	struct kvm_xen_hvm_config hvmc = {
461 		.flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
462 		.msr = XEN_HYPERCALL_MSR,
463 	};
464 
465 	/* Let the kernel know that we *will* use it for sending all
466 	 * event channels, which lets it intercept SCHEDOP_poll */
467 	if (do_evtchn_tests)
468 		hvmc.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
469 
470 	vm_ioctl(vm, KVM_XEN_HVM_CONFIG, &hvmc);
471 
472 	struct kvm_xen_hvm_attr lm = {
473 		.type = KVM_XEN_ATTR_TYPE_LONG_MODE,
474 		.u.long_mode = 1,
475 	};
476 	vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &lm);
477 
478 	struct kvm_xen_hvm_attr ha = {
479 		.type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
480 		.u.shared_info.gfn = SHINFO_REGION_GPA / PAGE_SIZE,
481 	};
482 	vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &ha);
483 
484 	/*
485 	 * Test what happens when the HVA of the shinfo page is remapped after
486 	 * the kernel has a reference to it. But make sure we copy the clock
487 	 * info over since that's only set at setup time, and we test it later.
488 	 */
489 	struct pvclock_wall_clock wc_copy = shinfo->wc;
490 	void *m = mmap(shinfo, PAGE_SIZE, PROT_READ|PROT_WRITE, MAP_FIXED|MAP_PRIVATE, zero_fd, 0);
491 	TEST_ASSERT(m == shinfo, "Failed to map /dev/zero over shared info");
492 	shinfo->wc = wc_copy;
493 
494 	struct kvm_xen_vcpu_attr vi = {
495 		.type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO,
496 		.u.gpa = VCPU_INFO_ADDR,
497 	};
498 	vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &vi);
499 
500 	struct kvm_xen_vcpu_attr pvclock = {
501 		.type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
502 		.u.gpa = PVTIME_ADDR,
503 	};
504 	vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &pvclock);
505 
506 	struct kvm_xen_hvm_attr vec = {
507 		.type = KVM_XEN_ATTR_TYPE_UPCALL_VECTOR,
508 		.u.vector = EVTCHN_VECTOR,
509 	};
510 	vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &vec);
511 
512 	vm_init_descriptor_tables(vm);
513 	vcpu_init_descriptor_tables(vcpu);
514 	vm_install_exception_handler(vm, EVTCHN_VECTOR, evtchn_handler);
515 
516 	if (do_runstate_tests) {
517 		struct kvm_xen_vcpu_attr st = {
518 			.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
519 			.u.gpa = RUNSTATE_ADDR,
520 		};
521 		vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &st);
522 	}
523 
524 	int irq_fd[2] = { -1, -1 };
525 
526 	if (do_eventfd_tests) {
527 		irq_fd[0] = eventfd(0, 0);
528 		irq_fd[1] = eventfd(0, 0);
529 
530 		/* Unexpected, but not a KVM failure */
531 		if (irq_fd[0] == -1 || irq_fd[1] == -1)
532 			do_evtchn_tests = do_eventfd_tests = false;
533 	}
534 
535 	if (do_eventfd_tests) {
536 		irq_routes.info.nr = 2;
537 
538 		irq_routes.entries[0].gsi = 32;
539 		irq_routes.entries[0].type = KVM_IRQ_ROUTING_XEN_EVTCHN;
540 		irq_routes.entries[0].u.xen_evtchn.port = EVTCHN_TEST1;
541 		irq_routes.entries[0].u.xen_evtchn.vcpu = vcpu->id;
542 		irq_routes.entries[0].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
543 
544 		irq_routes.entries[1].gsi = 33;
545 		irq_routes.entries[1].type = KVM_IRQ_ROUTING_XEN_EVTCHN;
546 		irq_routes.entries[1].u.xen_evtchn.port = EVTCHN_TEST2;
547 		irq_routes.entries[1].u.xen_evtchn.vcpu = vcpu->id;
548 		irq_routes.entries[1].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
549 
550 		vm_ioctl(vm, KVM_SET_GSI_ROUTING, &irq_routes.info);
551 
552 		struct kvm_irqfd ifd = { };
553 
554 		ifd.fd = irq_fd[0];
555 		ifd.gsi = 32;
556 		vm_ioctl(vm, KVM_IRQFD, &ifd);
557 
558 		ifd.fd = irq_fd[1];
559 		ifd.gsi = 33;
560 		vm_ioctl(vm, KVM_IRQFD, &ifd);
561 
562 		struct sigaction sa = { };
563 		sa.sa_handler = handle_alrm;
564 		sigaction(SIGALRM, &sa, NULL);
565 	}
566 
567 	struct kvm_xen_vcpu_attr tmr = {
568 		.type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
569 		.u.timer.port = EVTCHN_TIMER,
570 		.u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
571 		.u.timer.expires_ns = 0
572 	};
573 
574 	if (do_evtchn_tests) {
575 		struct kvm_xen_hvm_attr inj = {
576 			.type = KVM_XEN_ATTR_TYPE_EVTCHN,
577 			.u.evtchn.send_port = 127,
578 			.u.evtchn.type = EVTCHNSTAT_interdomain,
579 			.u.evtchn.flags = 0,
580 			.u.evtchn.deliver.port.port = EVTCHN_TEST1,
581 			.u.evtchn.deliver.port.vcpu = vcpu->id + 1,
582 			.u.evtchn.deliver.port.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
583 		};
584 		vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj);
585 
586 		/* Test migration to a different vCPU */
587 		inj.u.evtchn.flags = KVM_XEN_EVTCHN_UPDATE;
588 		inj.u.evtchn.deliver.port.vcpu = vcpu->id;
589 		vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj);
590 
591 		inj.u.evtchn.send_port = 197;
592 		inj.u.evtchn.deliver.eventfd.port = 0;
593 		inj.u.evtchn.deliver.eventfd.fd = irq_fd[1];
594 		inj.u.evtchn.flags = 0;
595 		vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj);
596 
597 		vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr);
598 	}
599 	vinfo = addr_gpa2hva(vm, VCPU_INFO_VADDR);
600 	vinfo->evtchn_upcall_pending = 0;
601 
602 	struct vcpu_runstate_info *rs = addr_gpa2hva(vm, RUNSTATE_ADDR);
603 	rs->state = 0x5a;
604 
605 	bool evtchn_irq_expected = false;
606 
607 	for (;;) {
608 		volatile struct kvm_run *run = vcpu->run;
609 		struct ucall uc;
610 
611 		vcpu_run(vcpu);
612 
613 		TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
614 			    "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n",
615 			    run->exit_reason,
616 			    exit_reason_str(run->exit_reason));
617 
618 		switch (get_ucall(vcpu, &uc)) {
619 		case UCALL_ABORT:
620 			REPORT_GUEST_ASSERT(uc);
621 			/* NOT REACHED */
622 		case UCALL_SYNC: {
623 			struct kvm_xen_vcpu_attr rst;
624 			long rundelay;
625 
626 			if (do_runstate_tests)
627 				TEST_ASSERT(rs->state_entry_time == rs->time[0] +
628 					    rs->time[1] + rs->time[2] + rs->time[3],
629 					    "runstate times don't add up");
630 
631 			switch (uc.args[1]) {
632 			case 0:
633 				if (verbose)
634 					printf("Delivering evtchn upcall\n");
635 				evtchn_irq_expected = true;
636 				vinfo->evtchn_upcall_pending = 1;
637 				break;
638 
639 			case RUNSTATE_runnable...RUNSTATE_offline:
640 				TEST_ASSERT(!evtchn_irq_expected, "Event channel IRQ not seen");
641 				if (!do_runstate_tests)
642 					goto done;
643 				if (verbose)
644 					printf("Testing runstate %s\n", runstate_names[uc.args[1]]);
645 				rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT;
646 				rst.u.runstate.state = uc.args[1];
647 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &rst);
648 				break;
649 
650 			case 4:
651 				if (verbose)
652 					printf("Testing RUNSTATE_ADJUST\n");
653 				rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST;
654 				memset(&rst.u, 0, sizeof(rst.u));
655 				rst.u.runstate.state = (uint64_t)-1;
656 				rst.u.runstate.time_blocked =
657 					0x5a - rs->time[RUNSTATE_blocked];
658 				rst.u.runstate.time_offline =
659 					0x6b6b - rs->time[RUNSTATE_offline];
660 				rst.u.runstate.time_runnable = -rst.u.runstate.time_blocked -
661 					rst.u.runstate.time_offline;
662 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &rst);
663 				break;
664 
665 			case 5:
666 				if (verbose)
667 					printf("Testing RUNSTATE_DATA\n");
668 				rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA;
669 				memset(&rst.u, 0, sizeof(rst.u));
670 				rst.u.runstate.state = RUNSTATE_running;
671 				rst.u.runstate.state_entry_time = 0x6b6b + 0x5a;
672 				rst.u.runstate.time_blocked = 0x6b6b;
673 				rst.u.runstate.time_offline = 0x5a;
674 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &rst);
675 				break;
676 
677 			case 6:
678 				if (verbose)
679 					printf("Testing steal time\n");
680 				/* Yield until scheduler delay exceeds target */
681 				rundelay = get_run_delay() + MIN_STEAL_TIME;
682 				do {
683 					sched_yield();
684 				} while (get_run_delay() < rundelay);
685 				break;
686 
687 			case 7:
688 				if (!do_eventfd_tests)
689 					goto done;
690 				if (verbose)
691 					printf("Testing masked event channel\n");
692 				shinfo->evtchn_mask[0] = 1UL << EVTCHN_TEST1;
693 				eventfd_write(irq_fd[0], 1UL);
694 				alarm(1);
695 				break;
696 
697 			case 8:
698 				if (verbose)
699 					printf("Testing unmasked event channel\n");
700 				/* Unmask that, but deliver the other one */
701 				shinfo->evtchn_pending[0] = 0;
702 				shinfo->evtchn_mask[0] = 0;
703 				eventfd_write(irq_fd[1], 1UL);
704 				evtchn_irq_expected = true;
705 				alarm(1);
706 				break;
707 
708 			case 9:
709 				TEST_ASSERT(!evtchn_irq_expected,
710 					    "Expected event channel IRQ but it didn't happen");
711 				shinfo->evtchn_pending[1] = 0;
712 				if (verbose)
713 					printf("Testing event channel after memslot change\n");
714 				vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
715 							    DUMMY_REGION_GPA, DUMMY_REGION_SLOT, 1, 0);
716 				eventfd_write(irq_fd[0], 1UL);
717 				evtchn_irq_expected = true;
718 				alarm(1);
719 				break;
720 
721 			case 10:
722 				TEST_ASSERT(!evtchn_irq_expected,
723 					    "Expected event channel IRQ but it didn't happen");
724 				if (!do_evtchn_tests)
725 					goto done;
726 
727 				shinfo->evtchn_pending[0] = 0;
728 				if (verbose)
729 					printf("Testing injection with KVM_XEN_HVM_EVTCHN_SEND\n");
730 
731 				struct kvm_irq_routing_xen_evtchn e;
732 				e.port = EVTCHN_TEST2;
733 				e.vcpu = vcpu->id;
734 				e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
735 
736 				vm_ioctl(vm, KVM_XEN_HVM_EVTCHN_SEND, &e);
737 				evtchn_irq_expected = true;
738 				alarm(1);
739 				break;
740 
741 			case 11:
742 				TEST_ASSERT(!evtchn_irq_expected,
743 					    "Expected event channel IRQ but it didn't happen");
744 				shinfo->evtchn_pending[1] = 0;
745 
746 				if (verbose)
747 					printf("Testing guest EVTCHNOP_send direct to evtchn\n");
748 				evtchn_irq_expected = true;
749 				alarm(1);
750 				break;
751 
752 			case 12:
753 				TEST_ASSERT(!evtchn_irq_expected,
754 					    "Expected event channel IRQ but it didn't happen");
755 				shinfo->evtchn_pending[0] = 0;
756 
757 				if (verbose)
758 					printf("Testing guest EVTCHNOP_send to eventfd\n");
759 				evtchn_irq_expected = true;
760 				alarm(1);
761 				break;
762 
763 			case 13:
764 				TEST_ASSERT(!evtchn_irq_expected,
765 					    "Expected event channel IRQ but it didn't happen");
766 				shinfo->evtchn_pending[1] = 0;
767 
768 				if (verbose)
769 					printf("Testing guest oneshot timer\n");
770 				break;
771 
772 			case 14:
773 				memset(&tmr, 0, sizeof(tmr));
774 				tmr.type = KVM_XEN_VCPU_ATTR_TYPE_TIMER;
775 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &tmr);
776 				TEST_ASSERT(tmr.u.timer.port == EVTCHN_TIMER,
777 					    "Timer port not returned");
778 				TEST_ASSERT(tmr.u.timer.priority == KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
779 					    "Timer priority not returned");
780 				TEST_ASSERT(tmr.u.timer.expires_ns > rs->state_entry_time,
781 					    "Timer expiry not returned");
782 				evtchn_irq_expected = true;
783 				alarm(1);
784 				break;
785 
786 			case 15:
787 				TEST_ASSERT(!evtchn_irq_expected,
788 					    "Expected event channel IRQ but it didn't happen");
789 				shinfo->evtchn_pending[0] = 0;
790 
791 				if (verbose)
792 					printf("Testing restored oneshot timer\n");
793 
794 				tmr.u.timer.expires_ns = rs->state_entry_time + 100000000;
795 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr);
796 				evtchn_irq_expected = true;
797 				alarm(1);
798 				break;
799 
800 			case 16:
801 				TEST_ASSERT(!evtchn_irq_expected,
802 					    "Expected event channel IRQ but it didn't happen");
803 
804 				if (verbose)
805 					printf("Testing SCHEDOP_poll with already pending event\n");
806 				shinfo->evtchn_pending[0] = shinfo->evtchn_mask[0] = 1UL << EVTCHN_TIMER;
807 				alarm(1);
808 				break;
809 
810 			case 17:
811 				if (verbose)
812 					printf("Testing SCHEDOP_poll timeout\n");
813 				shinfo->evtchn_pending[0] = 0;
814 				alarm(1);
815 				break;
816 
817 			case 18:
818 				if (verbose)
819 					printf("Testing SCHEDOP_poll wake on masked event\n");
820 
821 				tmr.u.timer.expires_ns = rs->state_entry_time + 100000000;
822 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr);
823 				alarm(1);
824 				break;
825 
826 			case 19:
827 				shinfo->evtchn_pending[0] = shinfo->evtchn_mask[0] = 0;
828 				if (verbose)
829 					printf("Testing SCHEDOP_poll wake on unmasked event\n");
830 
831 				evtchn_irq_expected = true;
832 				tmr.u.timer.expires_ns = rs->state_entry_time + 100000000;
833 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr);
834 
835 				/* Read it back and check the pending time is reported correctly */
836 				tmr.u.timer.expires_ns = 0;
837 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &tmr);
838 				TEST_ASSERT(tmr.u.timer.expires_ns == rs->state_entry_time + 100000000,
839 					    "Timer not reported pending");
840 				alarm(1);
841 				break;
842 
843 			case 20:
844 				TEST_ASSERT(!evtchn_irq_expected,
845 					    "Expected event channel IRQ but it didn't happen");
846 				/* Read timer and check it is no longer pending */
847 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &tmr);
848 				TEST_ASSERT(!tmr.u.timer.expires_ns, "Timer still reported pending");
849 
850 				shinfo->evtchn_pending[0] = 0;
851 				if (verbose)
852 					printf("Testing timer in the past\n");
853 
854 				evtchn_irq_expected = true;
855 				tmr.u.timer.expires_ns = rs->state_entry_time - 100000000ULL;
856 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr);
857 				alarm(1);
858 				break;
859 
860 			case 21:
861 				TEST_ASSERT(!evtchn_irq_expected,
862 					    "Expected event channel IRQ but it didn't happen");
863 				alarm(0);
864 
865 				if (verbose)
866 					printf("Testing shinfo lock corruption (KVM_XEN_HVM_EVTCHN_SEND)\n");
867 
868 				ret = pthread_create(&thread, NULL, &juggle_shinfo_state, (void *)vm);
869 				TEST_ASSERT(ret == 0, "pthread_create() failed: %s", strerror(ret));
870 
871 				struct kvm_irq_routing_xen_evtchn uxe = {
872 					.port = 1,
873 					.vcpu = vcpu->id,
874 					.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL
875 				};
876 
877 				evtchn_irq_expected = true;
878 				for (time_t t = time(NULL) + SHINFO_RACE_TIMEOUT; time(NULL) < t;)
879 					__vm_ioctl(vm, KVM_XEN_HVM_EVTCHN_SEND, &uxe);
880 				break;
881 
882 			case 22:
883 				TEST_ASSERT(!evtchn_irq_expected,
884 					    "Expected event channel IRQ but it didn't happen");
885 
886 				if (verbose)
887 					printf("Testing shinfo lock corruption (SCHEDOP_poll)\n");
888 
889 				shinfo->evtchn_pending[0] = 1;
890 
891 				evtchn_irq_expected = true;
892 				tmr.u.timer.expires_ns = rs->state_entry_time +
893 							 SHINFO_RACE_TIMEOUT * 1000000000ULL;
894 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr);
895 				break;
896 
897 			case 23:
898 				/*
899 				 * Optional and possibly repeated sync point.
900 				 * Injecting the timer IRQ may fail if the
901 				 * shinfo is invalid when the timer expires.
902 				 * If the timer has expired but the IRQ hasn't
903 				 * been delivered, rearm the timer and retry.
904 				 */
905 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &tmr);
906 
907 				/* Resume the guest if the timer is still pending. */
908 				if (tmr.u.timer.expires_ns)
909 					break;
910 
911 				/* All done if the IRQ was delivered. */
912 				if (!evtchn_irq_expected)
913 					break;
914 
915 				tmr.u.timer.expires_ns = rs->state_entry_time +
916 							 SHINFO_RACE_TIMEOUT * 1000000000ULL;
917 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr);
918 				break;
919 			case 24:
920 				TEST_ASSERT(!evtchn_irq_expected,
921 					    "Expected event channel IRQ but it didn't happen");
922 
923 				ret = pthread_cancel(thread);
924 				TEST_ASSERT(ret == 0, "pthread_cancel() failed: %s", strerror(ret));
925 
926 				ret = pthread_join(thread, 0);
927 				TEST_ASSERT(ret == 0, "pthread_join() failed: %s", strerror(ret));
928 				goto done;
929 
930 			case 0x20:
931 				TEST_ASSERT(evtchn_irq_expected, "Unexpected event channel IRQ");
932 				evtchn_irq_expected = false;
933 				break;
934 			}
935 			break;
936 		}
937 		case UCALL_DONE:
938 			goto done;
939 		default:
940 			TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd);
941 		}
942 	}
943 
944  done:
945 	alarm(0);
946 	clock_gettime(CLOCK_REALTIME, &max_ts);
947 
948 	/*
949 	 * Just a *really* basic check that things are being put in the
950 	 * right place. The actual calculations are much the same for
951 	 * Xen as they are for the KVM variants, so no need to check.
952 	 */
953 	struct pvclock_wall_clock *wc;
954 	struct pvclock_vcpu_time_info *ti, *ti2;
955 
956 	wc = addr_gpa2hva(vm, SHINFO_REGION_GPA + 0xc00);
957 	ti = addr_gpa2hva(vm, SHINFO_REGION_GPA + 0x40 + 0x20);
958 	ti2 = addr_gpa2hva(vm, PVTIME_ADDR);
959 
960 	if (verbose) {
961 		printf("Wall clock (v %d) %d.%09d\n", wc->version, wc->sec, wc->nsec);
962 		printf("Time info 1: v %u tsc %" PRIu64 " time %" PRIu64 " mul %u shift %u flags %x\n",
963 		       ti->version, ti->tsc_timestamp, ti->system_time, ti->tsc_to_system_mul,
964 		       ti->tsc_shift, ti->flags);
965 		printf("Time info 2: v %u tsc %" PRIu64 " time %" PRIu64 " mul %u shift %u flags %x\n",
966 		       ti2->version, ti2->tsc_timestamp, ti2->system_time, ti2->tsc_to_system_mul,
967 		       ti2->tsc_shift, ti2->flags);
968 	}
969 
970 	vm_ts.tv_sec = wc->sec;
971 	vm_ts.tv_nsec = wc->nsec;
972 	TEST_ASSERT(wc->version && !(wc->version & 1),
973 		    "Bad wallclock version %x", wc->version);
974 	TEST_ASSERT(cmp_timespec(&min_ts, &vm_ts) <= 0, "VM time too old");
975 	TEST_ASSERT(cmp_timespec(&max_ts, &vm_ts) >= 0, "VM time too new");
976 
977 	TEST_ASSERT(ti->version && !(ti->version & 1),
978 		    "Bad time_info version %x", ti->version);
979 	TEST_ASSERT(ti2->version && !(ti2->version & 1),
980 		    "Bad time_info version %x", ti->version);
981 
982 	if (do_runstate_tests) {
983 		/*
984 		 * Fetch runstate and check sanity. Strictly speaking in the
985 		 * general case we might not expect the numbers to be identical
986 		 * but in this case we know we aren't running the vCPU any more.
987 		 */
988 		struct kvm_xen_vcpu_attr rst = {
989 			.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA,
990 		};
991 		vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &rst);
992 
993 		if (verbose) {
994 			printf("Runstate: %s(%d), entry %" PRIu64 " ns\n",
995 			       rs->state <= RUNSTATE_offline ? runstate_names[rs->state] : "unknown",
996 			       rs->state, rs->state_entry_time);
997 			for (int i = RUNSTATE_running; i <= RUNSTATE_offline; i++) {
998 				printf("State %s: %" PRIu64 " ns\n",
999 				       runstate_names[i], rs->time[i]);
1000 			}
1001 		}
1002 		TEST_ASSERT(rs->state == rst.u.runstate.state, "Runstate mismatch");
1003 		TEST_ASSERT(rs->state_entry_time == rst.u.runstate.state_entry_time,
1004 			    "State entry time mismatch");
1005 		TEST_ASSERT(rs->time[RUNSTATE_running] == rst.u.runstate.time_running,
1006 			    "Running time mismatch");
1007 		TEST_ASSERT(rs->time[RUNSTATE_runnable] == rst.u.runstate.time_runnable,
1008 			    "Runnable time mismatch");
1009 		TEST_ASSERT(rs->time[RUNSTATE_blocked] == rst.u.runstate.time_blocked,
1010 			    "Blocked time mismatch");
1011 		TEST_ASSERT(rs->time[RUNSTATE_offline] == rst.u.runstate.time_offline,
1012 			    "Offline time mismatch");
1013 
1014 		TEST_ASSERT(rs->state_entry_time == rs->time[0] +
1015 			    rs->time[1] + rs->time[2] + rs->time[3],
1016 			    "runstate times don't add up");
1017 	}
1018 	kvm_vm_free(vm);
1019 	return 0;
1020 }
1021