1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
 * xen_shinfo_test
4  *
5  * Copyright © 2021 Amazon.com, Inc. or its affiliates.
6  *
7  * Xen shared_info / pvclock testing
8  */
9 
10 #include "test_util.h"
11 #include "kvm_util.h"
12 #include "processor.h"
13 
14 #include <stdint.h>
15 #include <time.h>
16 #include <sched.h>
17 #include <signal.h>
18 #include <pthread.h>
19 
20 #include <sys/eventfd.h>
21 
/*
 * The shared_info test region. The guest-virtual and guest-physical base
 * addresses are deliberately identical, and main() backs the region with a
 * dedicated memslot.
 */
#define SHINFO_REGION_GVA	0xc0000000ULL
#define SHINFO_REGION_GPA	0xc0000000ULL
#define SHINFO_REGION_SLOT	10

/*
 * A second memslot starting three pages above the shinfo region base (main()
 * sizes the shinfo region at three pages). It is only added at sync point 9,
 * to change the memslot layout while event channels are in use.
 */
#define DUMMY_REGION_GPA	(SHINFO_REGION_GPA + (3 * PAGE_SIZE))
#define DUMMY_REGION_SLOT	11

/*
 * Guest-physical addresses of the structures registered with KVM:
 *  - shared_info at the start of the region,
 *  - vcpu_info 0x40 bytes in,
 *  - the pvclock time info at the start of the second page,
 *  - the runstate area 15 bytes before the end of the second page, so that
 *    the structure crosses into the third page.
 */
#define SHINFO_ADDR	(SHINFO_REGION_GPA)
#define VCPU_INFO_ADDR	(SHINFO_REGION_GPA + 0x40)
#define PVTIME_ADDR	(SHINFO_REGION_GPA + PAGE_SIZE)
#define RUNSTATE_ADDR	(SHINFO_REGION_GPA + PAGE_SIZE + PAGE_SIZE - 15)

/* Guest-virtual counterparts of the addresses above, used by guest code. */
#define SHINFO_VADDR	(SHINFO_REGION_GVA)
#define VCPU_INFO_VADDR	(SHINFO_REGION_GVA + 0x40)
#define RUNSTATE_VADDR	(SHINFO_REGION_GVA + PAGE_SIZE + PAGE_SIZE - 15)

/* Interrupt vector registered as the Xen upcall vector; see evtchn_handler(). */
#define EVTCHN_VECTOR	0x10

/* Event channel port numbers used for the two irqfd routes and the timer. */
#define EVTCHN_TEST1 15
#define EVTCHN_TEST2 66
#define EVTCHN_TIMER 13

/* Passed as the .msr field of the KVM_XEN_HVM_CONFIG ioctl in main(). */
#define XEN_HYPERCALL_MSR	0x40000000

/*
 * Minimum amount of 'runnable' time the guest expects to see after the host
 * has yielded at sync point 6 (presumably nanoseconds, matching the host's
 * run delay accounting -- confirm against get_run_delay()).
 */
#define MIN_STEAL_TIME		50000

#define SHINFO_RACE_TIMEOUT	2	/* seconds */

/* Hypercall numbers the guest loads into RAX before VMCALL. */
#define __HYPERVISOR_set_timer_op	15
#define __HYPERVISOR_sched_op		29
#define __HYPERVISOR_event_channel_op	32

/* Sub-operation of __HYPERVISOR_sched_op, passed as first argument. */
#define SCHEDOP_poll			3

/* Sub-operation of __HYPERVISOR_event_channel_op, passed as first argument. */
#define EVTCHNOP_send			4

/* Value used for .u.evtchn.type when main() registers outbound ports. */
#define EVTCHNSTAT_interdomain		2
59 
/* Argument block for the EVTCHNOP_send hypercall issued by guest_code(). */
struct evtchn_send {
	u32 port;	/* port on which to send the event */
};
63 
/* Argument block for the SCHEDOP_poll hypercall issued by guest_code(). */
struct sched_poll {
	u32 *ports;		/* array of ports to poll */
	unsigned int nr_ports;	/* number of entries in @ports */
	u64 timeout;		/* guest_code() uses 0 and 100000000 here */
};
69 
/*
 * Per-vCPU pvclock record. main() reads two copies of it: the one embedded
 * in vcpu_info and the one at PVTIME_ADDR. It checks only that the version
 * is non-zero and even.
 */
struct pvclock_vcpu_time_info {
	u32   version;
	u32   pad0;
	u64   tsc_timestamp;
	u64   system_time;
	u32   tsc_to_system_mul;
	s8    tsc_shift;
	u8    flags;
	u8    pad[2];
} __attribute__((__packed__)); /* 32 bytes */
80 
/*
 * Wall clock record embedded in shared_info. main() checks that its version
 * is non-zero and even and that sec/nsec fall between the host timestamps
 * taken before VM creation and after the run loop.
 */
struct pvclock_wall_clock {
	u32   version;
	u32   sec;
	u32   nsec;
} __attribute__((__packed__));
86 
/*
 * Runstate area as seen in 64-bit mode (naturally aligned, so there is
 * padding between @state and @state_entry_time). Only time[0..3] correspond
 * to RUNSTATE_* values; time[4] is a canary that main() fills with 0xa5 and
 * expects the kernel to leave untouched.
 */
struct vcpu_runstate_info {
	uint32_t state;
	uint64_t state_entry_time;
	uint64_t time[5]; /* Extra field for overrun check */
};
92 
/*
 * Runstate area as seen in compatibility (32-bit) mode: the same fields as
 * struct vcpu_runstate_info but packed, so @state_entry_time immediately
 * follows @state with no padding. As in the 64-bit variant, time[4] is an
 * extra canary slot used by main() to detect overruns.
 */
struct compat_vcpu_runstate_info {
	uint32_t state;
	uint64_t state_entry_time;
	uint64_t time[5];
} __attribute__((__packed__));
98 
/* Architecture-specific part of struct vcpu_info; not touched by this test. */
struct arch_vcpu_info {
	unsigned long cr2;
	unsigned long pad; /* sizeof(vcpu_info_t) == 64 */
};
103 
/*
 * Per-vCPU area registered at VCPU_INFO_ADDR. The host sets
 * evtchn_upcall_pending to request an upcall; evtchn_handler() clears it
 * together with evtchn_pending_sel.
 */
struct vcpu_info {
	uint8_t evtchn_upcall_pending;
	uint8_t evtchn_upcall_mask;
	unsigned long evtchn_pending_sel;
	struct arch_vcpu_info arch;
	struct pvclock_vcpu_time_info time;
}; /* 64 bytes (x86) */
111 
/*
 * Layout of the shared_info page registered at SHINFO_ADDR. The host side of
 * the test manipulates evtchn_pending[] and evtchn_mask[] directly to mask,
 * unmask and clear ports, and reads the wall clock from @wc.
 */
struct shared_info {
	struct vcpu_info vcpu_info[32];
	unsigned long evtchn_pending[64];
	unsigned long evtchn_mask[64];
	struct pvclock_wall_clock wc;
	uint32_t wc_sec_hi;
	/* arch_shared_info here */
};
120 
/* Runstate values; they double as indices into the runstate time[] array. */
#define RUNSTATE_running  0
#define RUNSTATE_runnable 1
#define RUNSTATE_blocked  2
#define RUNSTATE_offline  3

/* Printable name for each runstate, indexed by the RUNSTATE_* value. */
static const char *runstate_names[] = {
	[RUNSTATE_running]  = "running",
	[RUNSTATE_runnable] = "runnable",
	[RUNSTATE_blocked]  = "blocked",
	[RUNSTATE_offline]  = "offline",
};
132 
/*
 * GSI routing table handed to KVM_SET_GSI_ROUTING in main(): the routing
 * header followed by storage for the two Xen event channel routes
 * (GSI 32 -> EVTCHN_TEST1, GSI 33 -> EVTCHN_TEST2).
 */
struct {
	struct kvm_irq_routing info;
	struct kvm_irq_routing_entry entries[2];
} irq_routes;
137 
/* Set by evtchn_handler(); consumed and cleared by guest_wait_for_irq(). */
static volatile bool guest_saw_irq;
139 
140 static void evtchn_handler(struct ex_regs *regs)
141 {
142 	struct vcpu_info *vi = (void *)VCPU_INFO_VADDR;
143 	vi->evtchn_upcall_pending = 0;
144 	vi->evtchn_pending_sel = 0;
145 	guest_saw_irq = true;
146 
147 	GUEST_SYNC(0x20);
148 }
149 
150 static void guest_wait_for_irq(void)
151 {
152 	while (!guest_saw_irq)
153 		__asm__ __volatile__ ("rep nop" : : : "memory");
154 	guest_saw_irq = false;
155 }
156 
157 static void guest_code(void)
158 {
159 	struct vcpu_runstate_info *rs = (void *)RUNSTATE_VADDR;
160 	int i;
161 
162 	__asm__ __volatile__(
163 		"sti\n"
164 		"nop\n"
165 	);
166 
167 	/* Trigger an interrupt injection */
168 	GUEST_SYNC(0);
169 
170 	guest_wait_for_irq();
171 
172 	/* Test having the host set runstates manually */
173 	GUEST_SYNC(RUNSTATE_runnable);
174 	GUEST_ASSERT(rs->time[RUNSTATE_runnable] != 0);
175 	GUEST_ASSERT(rs->state == 0);
176 
177 	GUEST_SYNC(RUNSTATE_blocked);
178 	GUEST_ASSERT(rs->time[RUNSTATE_blocked] != 0);
179 	GUEST_ASSERT(rs->state == 0);
180 
181 	GUEST_SYNC(RUNSTATE_offline);
182 	GUEST_ASSERT(rs->time[RUNSTATE_offline] != 0);
183 	GUEST_ASSERT(rs->state == 0);
184 
185 	/* Test runstate time adjust */
186 	GUEST_SYNC(4);
187 	GUEST_ASSERT(rs->time[RUNSTATE_blocked] == 0x5a);
188 	GUEST_ASSERT(rs->time[RUNSTATE_offline] == 0x6b6b);
189 
190 	/* Test runstate time set */
191 	GUEST_SYNC(5);
192 	GUEST_ASSERT(rs->state_entry_time >= 0x8000);
193 	GUEST_ASSERT(rs->time[RUNSTATE_runnable] == 0);
194 	GUEST_ASSERT(rs->time[RUNSTATE_blocked] == 0x6b6b);
195 	GUEST_ASSERT(rs->time[RUNSTATE_offline] == 0x5a);
196 
197 	/* sched_yield() should result in some 'runnable' time */
198 	GUEST_SYNC(6);
199 	GUEST_ASSERT(rs->time[RUNSTATE_runnable] >= MIN_STEAL_TIME);
200 
201 	/* Attempt to deliver a *masked* interrupt */
202 	GUEST_SYNC(7);
203 
204 	/* Wait until we see the bit set */
205 	struct shared_info *si = (void *)SHINFO_VADDR;
206 	while (!si->evtchn_pending[0])
207 		__asm__ __volatile__ ("rep nop" : : : "memory");
208 
209 	/* Now deliver an *unmasked* interrupt */
210 	GUEST_SYNC(8);
211 
212 	guest_wait_for_irq();
213 
214 	/* Change memslots and deliver an interrupt */
215 	GUEST_SYNC(9);
216 
217 	guest_wait_for_irq();
218 
219 	/* Deliver event channel with KVM_XEN_HVM_EVTCHN_SEND */
220 	GUEST_SYNC(10);
221 
222 	guest_wait_for_irq();
223 
224 	GUEST_SYNC(11);
225 
226 	/* Our turn. Deliver event channel (to ourselves) with
227 	 * EVTCHNOP_send hypercall. */
228 	unsigned long rax;
229 	struct evtchn_send s = { .port = 127 };
230 	__asm__ __volatile__ ("vmcall" :
231 			      "=a" (rax) :
232 			      "a" (__HYPERVISOR_event_channel_op),
233 			      "D" (EVTCHNOP_send),
234 			      "S" (&s));
235 
236 	GUEST_ASSERT(rax == 0);
237 
238 	guest_wait_for_irq();
239 
240 	GUEST_SYNC(12);
241 
242 	/* Deliver "outbound" event channel to an eventfd which
243 	 * happens to be one of our own irqfds. */
244 	s.port = 197;
245 	__asm__ __volatile__ ("vmcall" :
246 			      "=a" (rax) :
247 			      "a" (__HYPERVISOR_event_channel_op),
248 			      "D" (EVTCHNOP_send),
249 			      "S" (&s));
250 
251 	GUEST_ASSERT(rax == 0);
252 
253 	guest_wait_for_irq();
254 
255 	GUEST_SYNC(13);
256 
257 	/* Set a timer 100ms in the future. */
258 	__asm__ __volatile__ ("vmcall" :
259 			      "=a" (rax) :
260 			      "a" (__HYPERVISOR_set_timer_op),
261 			      "D" (rs->state_entry_time + 100000000));
262 	GUEST_ASSERT(rax == 0);
263 
264 	GUEST_SYNC(14);
265 
266 	/* Now wait for the timer */
267 	guest_wait_for_irq();
268 
269 	GUEST_SYNC(15);
270 
271 	/* The host has 'restored' the timer. Just wait for it. */
272 	guest_wait_for_irq();
273 
274 	GUEST_SYNC(16);
275 
276 	/* Poll for an event channel port which is already set */
277 	u32 ports[1] = { EVTCHN_TIMER };
278 	struct sched_poll p = {
279 		.ports = ports,
280 		.nr_ports = 1,
281 		.timeout = 0,
282 	};
283 
284 	__asm__ __volatile__ ("vmcall" :
285 			      "=a" (rax) :
286 			      "a" (__HYPERVISOR_sched_op),
287 			      "D" (SCHEDOP_poll),
288 			      "S" (&p));
289 
290 	GUEST_ASSERT(rax == 0);
291 
292 	GUEST_SYNC(17);
293 
294 	/* Poll for an unset port and wait for the timeout. */
295 	p.timeout = 100000000;
296 	__asm__ __volatile__ ("vmcall" :
297 			      "=a" (rax) :
298 			      "a" (__HYPERVISOR_sched_op),
299 			      "D" (SCHEDOP_poll),
300 			      "S" (&p));
301 
302 	GUEST_ASSERT(rax == 0);
303 
304 	GUEST_SYNC(18);
305 
306 	/* A timer will wake the masked port we're waiting on, while we poll */
307 	p.timeout = 0;
308 	__asm__ __volatile__ ("vmcall" :
309 			      "=a" (rax) :
310 			      "a" (__HYPERVISOR_sched_op),
311 			      "D" (SCHEDOP_poll),
312 			      "S" (&p));
313 
314 	GUEST_ASSERT(rax == 0);
315 
316 	GUEST_SYNC(19);
317 
318 	/* A timer wake an *unmasked* port which should wake us with an
319 	 * actual interrupt, while we're polling on a different port. */
320 	ports[0]++;
321 	p.timeout = 0;
322 	__asm__ __volatile__ ("vmcall" :
323 			      "=a" (rax) :
324 			      "a" (__HYPERVISOR_sched_op),
325 			      "D" (SCHEDOP_poll),
326 			      "S" (&p));
327 
328 	GUEST_ASSERT(rax == 0);
329 
330 	guest_wait_for_irq();
331 
332 	GUEST_SYNC(20);
333 
334 	/* Timer should have fired already */
335 	guest_wait_for_irq();
336 
337 	GUEST_SYNC(21);
338 	/* Racing host ioctls */
339 
340 	guest_wait_for_irq();
341 
342 	GUEST_SYNC(22);
343 	/* Racing vmcall against host ioctl */
344 
345 	ports[0] = 0;
346 
347 	p = (struct sched_poll) {
348 		.ports = ports,
349 		.nr_ports = 1,
350 		.timeout = 0
351 	};
352 
353 wait_for_timer:
354 	/*
355 	 * Poll for a timer wake event while the worker thread is mucking with
356 	 * the shared info.  KVM XEN drops timer IRQs if the shared info is
357 	 * invalid when the timer expires.  Arbitrarily poll 100 times before
358 	 * giving up and asking the VMM to re-arm the timer.  100 polls should
359 	 * consume enough time to beat on KVM without taking too long if the
360 	 * timer IRQ is dropped due to an invalid event channel.
361 	 */
362 	for (i = 0; i < 100 && !guest_saw_irq; i++)
363 		asm volatile("vmcall"
364 			     : "=a" (rax)
365 			     : "a" (__HYPERVISOR_sched_op),
366 			       "D" (SCHEDOP_poll),
367 			       "S" (&p)
368 			     : "memory");
369 
370 	/*
371 	 * Re-send the timer IRQ if it was (likely) dropped due to the timer
372 	 * expiring while the event channel was invalid.
373 	 */
374 	if (!guest_saw_irq) {
375 		GUEST_SYNC(23);
376 		goto wait_for_timer;
377 	}
378 	guest_saw_irq = false;
379 
380 	GUEST_SYNC(24);
381 }
382 
/*
 * Three-way comparison of two timespecs.
 *
 * Returns 1 if @a is later than @b, -1 if it is earlier and 0 if both the
 * seconds and the nanoseconds fields are equal. Seconds take precedence;
 * nanoseconds are only consulted when the seconds match.
 */
static int cmp_timespec(struct timespec *a, struct timespec *b)
{
	if (a->tv_sec != b->tv_sec)
		return a->tv_sec > b->tv_sec ? 1 : -1;

	if (a->tv_nsec != b->tv_nsec)
		return a->tv_nsec > b->tv_nsec ? 1 : -1;

	return 0;
}
396 
/*
 * Host-side handles shared between main() and the SIGALRM handler: the host
 * mapping of the guest's vcpu_info (NULL until main() has looked it up) and
 * the vCPU under test.
 */
static struct vcpu_info *vinfo;
static struct kvm_vcpu *vcpu;
399 
400 static void handle_alrm(int sig)
401 {
402 	if (vinfo)
403 		printf("evtchn_upcall_pending 0x%x\n", vinfo->evtchn_upcall_pending);
404 	vcpu_dump(stdout, vcpu, 0);
405 	TEST_FAIL("IRQ delivery timed out");
406 }
407 
408 static void *juggle_shinfo_state(void *arg)
409 {
410 	struct kvm_vm *vm = (struct kvm_vm *)arg;
411 
412 	struct kvm_xen_hvm_attr cache_activate = {
413 		.type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
414 		.u.shared_info.gfn = SHINFO_REGION_GPA / PAGE_SIZE
415 	};
416 
417 	struct kvm_xen_hvm_attr cache_deactivate = {
418 		.type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
419 		.u.shared_info.gfn = KVM_XEN_INVALID_GFN
420 	};
421 
422 	for (;;) {
423 		__vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_activate);
424 		__vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_deactivate);
425 		pthread_testcancel();
426 	}
427 
428 	return NULL;
429 }
430 
431 int main(int argc, char *argv[])
432 {
433 	struct timespec min_ts, max_ts, vm_ts;
434 	struct kvm_xen_hvm_attr evt_reset;
435 	struct kvm_vm *vm;
436 	pthread_t thread;
437 	bool verbose;
438 	int ret;
439 
440 	verbose = argc > 1 && (!strncmp(argv[1], "-v", 3) ||
441 			       !strncmp(argv[1], "--verbose", 10));
442 
443 	int xen_caps = kvm_check_cap(KVM_CAP_XEN_HVM);
444 	TEST_REQUIRE(xen_caps & KVM_XEN_HVM_CONFIG_SHARED_INFO);
445 
446 	bool do_runstate_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_RUNSTATE);
447 	bool do_runstate_flag = !!(xen_caps & KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG);
448 	bool do_eventfd_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL);
449 	bool do_evtchn_tests = do_eventfd_tests && !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND);
450 
451 	clock_gettime(CLOCK_REALTIME, &min_ts);
452 
453 	vm = vm_create_with_one_vcpu(&vcpu, guest_code);
454 
455 	/* Map a region for the shared_info page */
456 	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
457 				    SHINFO_REGION_GPA, SHINFO_REGION_SLOT, 3, 0);
458 	virt_map(vm, SHINFO_REGION_GVA, SHINFO_REGION_GPA, 3);
459 
460 	struct shared_info *shinfo = addr_gpa2hva(vm, SHINFO_VADDR);
461 
462 	int zero_fd = open("/dev/zero", O_RDONLY);
463 	TEST_ASSERT(zero_fd != -1, "Failed to open /dev/zero");
464 
465 	struct kvm_xen_hvm_config hvmc = {
466 		.flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
467 		.msr = XEN_HYPERCALL_MSR,
468 	};
469 
470 	/* Let the kernel know that we *will* use it for sending all
471 	 * event channels, which lets it intercept SCHEDOP_poll */
472 	if (do_evtchn_tests)
473 		hvmc.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
474 
475 	vm_ioctl(vm, KVM_XEN_HVM_CONFIG, &hvmc);
476 
477 	struct kvm_xen_hvm_attr lm = {
478 		.type = KVM_XEN_ATTR_TYPE_LONG_MODE,
479 		.u.long_mode = 1,
480 	};
481 	vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &lm);
482 
483 	if (do_runstate_flag) {
484 		struct kvm_xen_hvm_attr ruf = {
485 			.type = KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG,
486 			.u.runstate_update_flag = 1,
487 		};
488 		vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &ruf);
489 
490 		ruf.u.runstate_update_flag = 0;
491 		vm_ioctl(vm, KVM_XEN_HVM_GET_ATTR, &ruf);
492 		TEST_ASSERT(ruf.u.runstate_update_flag == 1,
493 			    "Failed to read back RUNSTATE_UPDATE_FLAG attr");
494 	}
495 
496 	struct kvm_xen_hvm_attr ha = {
497 		.type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
498 		.u.shared_info.gfn = SHINFO_REGION_GPA / PAGE_SIZE,
499 	};
500 	vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &ha);
501 
502 	/*
503 	 * Test what happens when the HVA of the shinfo page is remapped after
504 	 * the kernel has a reference to it. But make sure we copy the clock
505 	 * info over since that's only set at setup time, and we test it later.
506 	 */
507 	struct pvclock_wall_clock wc_copy = shinfo->wc;
508 	void *m = mmap(shinfo, PAGE_SIZE, PROT_READ|PROT_WRITE, MAP_FIXED|MAP_PRIVATE, zero_fd, 0);
509 	TEST_ASSERT(m == shinfo, "Failed to map /dev/zero over shared info");
510 	shinfo->wc = wc_copy;
511 
512 	struct kvm_xen_vcpu_attr vi = {
513 		.type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO,
514 		.u.gpa = VCPU_INFO_ADDR,
515 	};
516 	vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &vi);
517 
518 	struct kvm_xen_vcpu_attr pvclock = {
519 		.type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
520 		.u.gpa = PVTIME_ADDR,
521 	};
522 	vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &pvclock);
523 
524 	struct kvm_xen_hvm_attr vec = {
525 		.type = KVM_XEN_ATTR_TYPE_UPCALL_VECTOR,
526 		.u.vector = EVTCHN_VECTOR,
527 	};
528 	vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &vec);
529 
530 	vm_init_descriptor_tables(vm);
531 	vcpu_init_descriptor_tables(vcpu);
532 	vm_install_exception_handler(vm, EVTCHN_VECTOR, evtchn_handler);
533 
534 	if (do_runstate_tests) {
535 		struct kvm_xen_vcpu_attr st = {
536 			.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
537 			.u.gpa = RUNSTATE_ADDR,
538 		};
539 		vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &st);
540 	}
541 
542 	int irq_fd[2] = { -1, -1 };
543 
544 	if (do_eventfd_tests) {
545 		irq_fd[0] = eventfd(0, 0);
546 		irq_fd[1] = eventfd(0, 0);
547 
548 		/* Unexpected, but not a KVM failure */
549 		if (irq_fd[0] == -1 || irq_fd[1] == -1)
550 			do_evtchn_tests = do_eventfd_tests = false;
551 	}
552 
553 	if (do_eventfd_tests) {
554 		irq_routes.info.nr = 2;
555 
556 		irq_routes.entries[0].gsi = 32;
557 		irq_routes.entries[0].type = KVM_IRQ_ROUTING_XEN_EVTCHN;
558 		irq_routes.entries[0].u.xen_evtchn.port = EVTCHN_TEST1;
559 		irq_routes.entries[0].u.xen_evtchn.vcpu = vcpu->id;
560 		irq_routes.entries[0].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
561 
562 		irq_routes.entries[1].gsi = 33;
563 		irq_routes.entries[1].type = KVM_IRQ_ROUTING_XEN_EVTCHN;
564 		irq_routes.entries[1].u.xen_evtchn.port = EVTCHN_TEST2;
565 		irq_routes.entries[1].u.xen_evtchn.vcpu = vcpu->id;
566 		irq_routes.entries[1].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
567 
568 		vm_ioctl(vm, KVM_SET_GSI_ROUTING, &irq_routes.info);
569 
570 		struct kvm_irqfd ifd = { };
571 
572 		ifd.fd = irq_fd[0];
573 		ifd.gsi = 32;
574 		vm_ioctl(vm, KVM_IRQFD, &ifd);
575 
576 		ifd.fd = irq_fd[1];
577 		ifd.gsi = 33;
578 		vm_ioctl(vm, KVM_IRQFD, &ifd);
579 
580 		struct sigaction sa = { };
581 		sa.sa_handler = handle_alrm;
582 		sigaction(SIGALRM, &sa, NULL);
583 	}
584 
585 	struct kvm_xen_vcpu_attr tmr = {
586 		.type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
587 		.u.timer.port = EVTCHN_TIMER,
588 		.u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
589 		.u.timer.expires_ns = 0
590 	};
591 
592 	if (do_evtchn_tests) {
593 		struct kvm_xen_hvm_attr inj = {
594 			.type = KVM_XEN_ATTR_TYPE_EVTCHN,
595 			.u.evtchn.send_port = 127,
596 			.u.evtchn.type = EVTCHNSTAT_interdomain,
597 			.u.evtchn.flags = 0,
598 			.u.evtchn.deliver.port.port = EVTCHN_TEST1,
599 			.u.evtchn.deliver.port.vcpu = vcpu->id + 1,
600 			.u.evtchn.deliver.port.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
601 		};
602 		vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj);
603 
604 		/* Test migration to a different vCPU */
605 		inj.u.evtchn.flags = KVM_XEN_EVTCHN_UPDATE;
606 		inj.u.evtchn.deliver.port.vcpu = vcpu->id;
607 		vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj);
608 
609 		inj.u.evtchn.send_port = 197;
610 		inj.u.evtchn.deliver.eventfd.port = 0;
611 		inj.u.evtchn.deliver.eventfd.fd = irq_fd[1];
612 		inj.u.evtchn.flags = 0;
613 		vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj);
614 
615 		vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr);
616 	}
617 	vinfo = addr_gpa2hva(vm, VCPU_INFO_VADDR);
618 	vinfo->evtchn_upcall_pending = 0;
619 
620 	struct vcpu_runstate_info *rs = addr_gpa2hva(vm, RUNSTATE_ADDR);
621 	rs->state = 0x5a;
622 
623 	bool evtchn_irq_expected = false;
624 
625 	for (;;) {
626 		volatile struct kvm_run *run = vcpu->run;
627 		struct ucall uc;
628 
629 		vcpu_run(vcpu);
630 
631 		TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
632 			    "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n",
633 			    run->exit_reason,
634 			    exit_reason_str(run->exit_reason));
635 
636 		switch (get_ucall(vcpu, &uc)) {
637 		case UCALL_ABORT:
638 			REPORT_GUEST_ASSERT(uc);
639 			/* NOT REACHED */
640 		case UCALL_SYNC: {
641 			struct kvm_xen_vcpu_attr rst;
642 			long rundelay;
643 
644 			if (do_runstate_tests)
645 				TEST_ASSERT(rs->state_entry_time == rs->time[0] +
646 					    rs->time[1] + rs->time[2] + rs->time[3],
647 					    "runstate times don't add up");
648 
649 			switch (uc.args[1]) {
650 			case 0:
651 				if (verbose)
652 					printf("Delivering evtchn upcall\n");
653 				evtchn_irq_expected = true;
654 				vinfo->evtchn_upcall_pending = 1;
655 				break;
656 
657 			case RUNSTATE_runnable...RUNSTATE_offline:
658 				TEST_ASSERT(!evtchn_irq_expected, "Event channel IRQ not seen");
659 				if (!do_runstate_tests)
660 					goto done;
661 				if (verbose)
662 					printf("Testing runstate %s\n", runstate_names[uc.args[1]]);
663 				rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT;
664 				rst.u.runstate.state = uc.args[1];
665 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &rst);
666 				break;
667 
668 			case 4:
669 				if (verbose)
670 					printf("Testing RUNSTATE_ADJUST\n");
671 				rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST;
672 				memset(&rst.u, 0, sizeof(rst.u));
673 				rst.u.runstate.state = (uint64_t)-1;
674 				rst.u.runstate.time_blocked =
675 					0x5a - rs->time[RUNSTATE_blocked];
676 				rst.u.runstate.time_offline =
677 					0x6b6b - rs->time[RUNSTATE_offline];
678 				rst.u.runstate.time_runnable = -rst.u.runstate.time_blocked -
679 					rst.u.runstate.time_offline;
680 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &rst);
681 				break;
682 
683 			case 5:
684 				if (verbose)
685 					printf("Testing RUNSTATE_DATA\n");
686 				rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA;
687 				memset(&rst.u, 0, sizeof(rst.u));
688 				rst.u.runstate.state = RUNSTATE_running;
689 				rst.u.runstate.state_entry_time = 0x6b6b + 0x5a;
690 				rst.u.runstate.time_blocked = 0x6b6b;
691 				rst.u.runstate.time_offline = 0x5a;
692 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &rst);
693 				break;
694 
695 			case 6:
696 				if (verbose)
697 					printf("Testing steal time\n");
698 				/* Yield until scheduler delay exceeds target */
699 				rundelay = get_run_delay() + MIN_STEAL_TIME;
700 				do {
701 					sched_yield();
702 				} while (get_run_delay() < rundelay);
703 				break;
704 
705 			case 7:
706 				if (!do_eventfd_tests)
707 					goto done;
708 				if (verbose)
709 					printf("Testing masked event channel\n");
710 				shinfo->evtchn_mask[0] = 1UL << EVTCHN_TEST1;
711 				eventfd_write(irq_fd[0], 1UL);
712 				alarm(1);
713 				break;
714 
715 			case 8:
716 				if (verbose)
717 					printf("Testing unmasked event channel\n");
718 				/* Unmask that, but deliver the other one */
719 				shinfo->evtchn_pending[0] = 0;
720 				shinfo->evtchn_mask[0] = 0;
721 				eventfd_write(irq_fd[1], 1UL);
722 				evtchn_irq_expected = true;
723 				alarm(1);
724 				break;
725 
726 			case 9:
727 				TEST_ASSERT(!evtchn_irq_expected,
728 					    "Expected event channel IRQ but it didn't happen");
729 				shinfo->evtchn_pending[1] = 0;
730 				if (verbose)
731 					printf("Testing event channel after memslot change\n");
732 				vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
733 							    DUMMY_REGION_GPA, DUMMY_REGION_SLOT, 1, 0);
734 				eventfd_write(irq_fd[0], 1UL);
735 				evtchn_irq_expected = true;
736 				alarm(1);
737 				break;
738 
739 			case 10:
740 				TEST_ASSERT(!evtchn_irq_expected,
741 					    "Expected event channel IRQ but it didn't happen");
742 				if (!do_evtchn_tests)
743 					goto done;
744 
745 				shinfo->evtchn_pending[0] = 0;
746 				if (verbose)
747 					printf("Testing injection with KVM_XEN_HVM_EVTCHN_SEND\n");
748 
749 				struct kvm_irq_routing_xen_evtchn e;
750 				e.port = EVTCHN_TEST2;
751 				e.vcpu = vcpu->id;
752 				e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
753 
754 				vm_ioctl(vm, KVM_XEN_HVM_EVTCHN_SEND, &e);
755 				evtchn_irq_expected = true;
756 				alarm(1);
757 				break;
758 
759 			case 11:
760 				TEST_ASSERT(!evtchn_irq_expected,
761 					    "Expected event channel IRQ but it didn't happen");
762 				shinfo->evtchn_pending[1] = 0;
763 
764 				if (verbose)
765 					printf("Testing guest EVTCHNOP_send direct to evtchn\n");
766 				evtchn_irq_expected = true;
767 				alarm(1);
768 				break;
769 
770 			case 12:
771 				TEST_ASSERT(!evtchn_irq_expected,
772 					    "Expected event channel IRQ but it didn't happen");
773 				shinfo->evtchn_pending[0] = 0;
774 
775 				if (verbose)
776 					printf("Testing guest EVTCHNOP_send to eventfd\n");
777 				evtchn_irq_expected = true;
778 				alarm(1);
779 				break;
780 
781 			case 13:
782 				TEST_ASSERT(!evtchn_irq_expected,
783 					    "Expected event channel IRQ but it didn't happen");
784 				shinfo->evtchn_pending[1] = 0;
785 
786 				if (verbose)
787 					printf("Testing guest oneshot timer\n");
788 				break;
789 
790 			case 14:
791 				memset(&tmr, 0, sizeof(tmr));
792 				tmr.type = KVM_XEN_VCPU_ATTR_TYPE_TIMER;
793 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &tmr);
794 				TEST_ASSERT(tmr.u.timer.port == EVTCHN_TIMER,
795 					    "Timer port not returned");
796 				TEST_ASSERT(tmr.u.timer.priority == KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
797 					    "Timer priority not returned");
798 				TEST_ASSERT(tmr.u.timer.expires_ns > rs->state_entry_time,
799 					    "Timer expiry not returned");
800 				evtchn_irq_expected = true;
801 				alarm(1);
802 				break;
803 
804 			case 15:
805 				TEST_ASSERT(!evtchn_irq_expected,
806 					    "Expected event channel IRQ but it didn't happen");
807 				shinfo->evtchn_pending[0] = 0;
808 
809 				if (verbose)
810 					printf("Testing restored oneshot timer\n");
811 
812 				tmr.u.timer.expires_ns = rs->state_entry_time + 100000000;
813 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr);
814 				evtchn_irq_expected = true;
815 				alarm(1);
816 				break;
817 
818 			case 16:
819 				TEST_ASSERT(!evtchn_irq_expected,
820 					    "Expected event channel IRQ but it didn't happen");
821 
822 				if (verbose)
823 					printf("Testing SCHEDOP_poll with already pending event\n");
824 				shinfo->evtchn_pending[0] = shinfo->evtchn_mask[0] = 1UL << EVTCHN_TIMER;
825 				alarm(1);
826 				break;
827 
828 			case 17:
829 				if (verbose)
830 					printf("Testing SCHEDOP_poll timeout\n");
831 				shinfo->evtchn_pending[0] = 0;
832 				alarm(1);
833 				break;
834 
835 			case 18:
836 				if (verbose)
837 					printf("Testing SCHEDOP_poll wake on masked event\n");
838 
839 				tmr.u.timer.expires_ns = rs->state_entry_time + 100000000;
840 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr);
841 				alarm(1);
842 				break;
843 
844 			case 19:
845 				shinfo->evtchn_pending[0] = shinfo->evtchn_mask[0] = 0;
846 				if (verbose)
847 					printf("Testing SCHEDOP_poll wake on unmasked event\n");
848 
849 				evtchn_irq_expected = true;
850 				tmr.u.timer.expires_ns = rs->state_entry_time + 100000000;
851 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr);
852 
853 				/* Read it back and check the pending time is reported correctly */
854 				tmr.u.timer.expires_ns = 0;
855 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &tmr);
856 				TEST_ASSERT(tmr.u.timer.expires_ns == rs->state_entry_time + 100000000,
857 					    "Timer not reported pending");
858 				alarm(1);
859 				break;
860 
861 			case 20:
862 				TEST_ASSERT(!evtchn_irq_expected,
863 					    "Expected event channel IRQ but it didn't happen");
864 				/* Read timer and check it is no longer pending */
865 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &tmr);
866 				TEST_ASSERT(!tmr.u.timer.expires_ns, "Timer still reported pending");
867 
868 				shinfo->evtchn_pending[0] = 0;
869 				if (verbose)
870 					printf("Testing timer in the past\n");
871 
872 				evtchn_irq_expected = true;
873 				tmr.u.timer.expires_ns = rs->state_entry_time - 100000000ULL;
874 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr);
875 				alarm(1);
876 				break;
877 
878 			case 21:
879 				TEST_ASSERT(!evtchn_irq_expected,
880 					    "Expected event channel IRQ but it didn't happen");
881 				alarm(0);
882 
883 				if (verbose)
884 					printf("Testing shinfo lock corruption (KVM_XEN_HVM_EVTCHN_SEND)\n");
885 
886 				ret = pthread_create(&thread, NULL, &juggle_shinfo_state, (void *)vm);
887 				TEST_ASSERT(ret == 0, "pthread_create() failed: %s", strerror(ret));
888 
889 				struct kvm_irq_routing_xen_evtchn uxe = {
890 					.port = 1,
891 					.vcpu = vcpu->id,
892 					.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL
893 				};
894 
895 				evtchn_irq_expected = true;
896 				for (time_t t = time(NULL) + SHINFO_RACE_TIMEOUT; time(NULL) < t;)
897 					__vm_ioctl(vm, KVM_XEN_HVM_EVTCHN_SEND, &uxe);
898 				break;
899 
900 			case 22:
901 				TEST_ASSERT(!evtchn_irq_expected,
902 					    "Expected event channel IRQ but it didn't happen");
903 
904 				if (verbose)
905 					printf("Testing shinfo lock corruption (SCHEDOP_poll)\n");
906 
907 				shinfo->evtchn_pending[0] = 1;
908 
909 				evtchn_irq_expected = true;
910 				tmr.u.timer.expires_ns = rs->state_entry_time +
911 							 SHINFO_RACE_TIMEOUT * 1000000000ULL;
912 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr);
913 				break;
914 
915 			case 23:
916 				/*
917 				 * Optional and possibly repeated sync point.
918 				 * Injecting the timer IRQ may fail if the
919 				 * shinfo is invalid when the timer expires.
920 				 * If the timer has expired but the IRQ hasn't
921 				 * been delivered, rearm the timer and retry.
922 				 */
923 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &tmr);
924 
925 				/* Resume the guest if the timer is still pending. */
926 				if (tmr.u.timer.expires_ns)
927 					break;
928 
929 				/* All done if the IRQ was delivered. */
930 				if (!evtchn_irq_expected)
931 					break;
932 
933 				tmr.u.timer.expires_ns = rs->state_entry_time +
934 							 SHINFO_RACE_TIMEOUT * 1000000000ULL;
935 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr);
936 				break;
937 			case 24:
938 				TEST_ASSERT(!evtchn_irq_expected,
939 					    "Expected event channel IRQ but it didn't happen");
940 
941 				ret = pthread_cancel(thread);
942 				TEST_ASSERT(ret == 0, "pthread_cancel() failed: %s", strerror(ret));
943 
944 				ret = pthread_join(thread, 0);
945 				TEST_ASSERT(ret == 0, "pthread_join() failed: %s", strerror(ret));
946 				goto done;
947 
948 			case 0x20:
949 				TEST_ASSERT(evtchn_irq_expected, "Unexpected event channel IRQ");
950 				evtchn_irq_expected = false;
951 				break;
952 			}
953 			break;
954 		}
955 		case UCALL_DONE:
956 			goto done;
957 		default:
958 			TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd);
959 		}
960 	}
961 
962  done:
963 	evt_reset.type = KVM_XEN_ATTR_TYPE_EVTCHN;
964 	evt_reset.u.evtchn.flags = KVM_XEN_EVTCHN_RESET;
965 	vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &evt_reset);
966 
967 	alarm(0);
968 	clock_gettime(CLOCK_REALTIME, &max_ts);
969 
970 	/*
971 	 * Just a *really* basic check that things are being put in the
972 	 * right place. The actual calculations are much the same for
973 	 * Xen as they are for the KVM variants, so no need to check.
974 	 */
975 	struct pvclock_wall_clock *wc;
976 	struct pvclock_vcpu_time_info *ti, *ti2;
977 
978 	wc = addr_gpa2hva(vm, SHINFO_REGION_GPA + 0xc00);
979 	ti = addr_gpa2hva(vm, SHINFO_REGION_GPA + 0x40 + 0x20);
980 	ti2 = addr_gpa2hva(vm, PVTIME_ADDR);
981 
982 	if (verbose) {
983 		printf("Wall clock (v %d) %d.%09d\n", wc->version, wc->sec, wc->nsec);
984 		printf("Time info 1: v %u tsc %" PRIu64 " time %" PRIu64 " mul %u shift %u flags %x\n",
985 		       ti->version, ti->tsc_timestamp, ti->system_time, ti->tsc_to_system_mul,
986 		       ti->tsc_shift, ti->flags);
987 		printf("Time info 2: v %u tsc %" PRIu64 " time %" PRIu64 " mul %u shift %u flags %x\n",
988 		       ti2->version, ti2->tsc_timestamp, ti2->system_time, ti2->tsc_to_system_mul,
989 		       ti2->tsc_shift, ti2->flags);
990 	}
991 
992 	vm_ts.tv_sec = wc->sec;
993 	vm_ts.tv_nsec = wc->nsec;
994 	TEST_ASSERT(wc->version && !(wc->version & 1),
995 		    "Bad wallclock version %x", wc->version);
996 	TEST_ASSERT(cmp_timespec(&min_ts, &vm_ts) <= 0, "VM time too old");
997 	TEST_ASSERT(cmp_timespec(&max_ts, &vm_ts) >= 0, "VM time too new");
998 
999 	TEST_ASSERT(ti->version && !(ti->version & 1),
1000 		    "Bad time_info version %x", ti->version);
1001 	TEST_ASSERT(ti2->version && !(ti2->version & 1),
1002 		    "Bad time_info version %x", ti->version);
1003 
1004 	if (do_runstate_tests) {
1005 		/*
1006 		 * Fetch runstate and check sanity. Strictly speaking in the
1007 		 * general case we might not expect the numbers to be identical
1008 		 * but in this case we know we aren't running the vCPU any more.
1009 		 */
1010 		struct kvm_xen_vcpu_attr rst = {
1011 			.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA,
1012 		};
1013 		vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &rst);
1014 
1015 		if (verbose) {
1016 			printf("Runstate: %s(%d), entry %" PRIu64 " ns\n",
1017 			       rs->state <= RUNSTATE_offline ? runstate_names[rs->state] : "unknown",
1018 			       rs->state, rs->state_entry_time);
1019 			for (int i = RUNSTATE_running; i <= RUNSTATE_offline; i++) {
1020 				printf("State %s: %" PRIu64 " ns\n",
1021 				       runstate_names[i], rs->time[i]);
1022 			}
1023 		}
1024 
1025 		/*
1026 		 * Exercise runstate info at all points across the page boundary, in
1027 		 * 32-bit and 64-bit mode. In particular, test the case where it is
1028 		 * configured in 32-bit mode and then switched to 64-bit mode while
1029 		 * active, which takes it onto the second page.
1030 		 */
1031 		unsigned long runstate_addr;
1032 		struct compat_vcpu_runstate_info *crs;
1033 		for (runstate_addr = SHINFO_REGION_GPA + PAGE_SIZE + PAGE_SIZE - sizeof(*rs) - 4;
1034 		     runstate_addr < SHINFO_REGION_GPA + PAGE_SIZE + PAGE_SIZE + 4; runstate_addr++) {
1035 
1036 			rs = addr_gpa2hva(vm, runstate_addr);
1037 			crs = (void *)rs;
1038 
1039 			memset(rs, 0xa5, sizeof(*rs));
1040 
1041 			/* Set to compatibility mode */
1042 			lm.u.long_mode = 0;
1043 			vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &lm);
1044 
1045 			/* Set runstate to new address (kernel will write it) */
1046 			struct kvm_xen_vcpu_attr st = {
1047 				.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
1048 				.u.gpa = runstate_addr,
1049 			};
1050 			vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &st);
1051 
1052 			if (verbose)
1053 				printf("Compatibility runstate at %08lx\n", runstate_addr);
1054 
1055 			TEST_ASSERT(crs->state == rst.u.runstate.state, "Runstate mismatch");
1056 			TEST_ASSERT(crs->state_entry_time == rst.u.runstate.state_entry_time,
1057 				    "State entry time mismatch");
1058 			TEST_ASSERT(crs->time[RUNSTATE_running] == rst.u.runstate.time_running,
1059 				    "Running time mismatch");
1060 			TEST_ASSERT(crs->time[RUNSTATE_runnable] == rst.u.runstate.time_runnable,
1061 				    "Runnable time mismatch");
1062 			TEST_ASSERT(crs->time[RUNSTATE_blocked] == rst.u.runstate.time_blocked,
1063 				    "Blocked time mismatch");
1064 			TEST_ASSERT(crs->time[RUNSTATE_offline] == rst.u.runstate.time_offline,
1065 				    "Offline time mismatch");
1066 			TEST_ASSERT(crs->time[RUNSTATE_offline + 1] == 0xa5a5a5a5a5a5a5a5ULL,
1067 				    "Structure overrun");
1068 			TEST_ASSERT(crs->state_entry_time == crs->time[0] +
1069 				    crs->time[1] + crs->time[2] + crs->time[3],
1070 				    "runstate times don't add up");
1071 
1072 
1073 			/* Now switch to 64-bit mode */
1074 			lm.u.long_mode = 1;
1075 			vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &lm);
1076 
1077 			memset(rs, 0xa5, sizeof(*rs));
1078 
1079 			/* Don't change the address, just trigger a write */
1080 			struct kvm_xen_vcpu_attr adj = {
1081 				.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST,
1082 				.u.runstate.state = (uint64_t)-1
1083 			};
1084 			vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &adj);
1085 
1086 			if (verbose)
1087 				printf("64-bit runstate at %08lx\n", runstate_addr);
1088 
1089 			TEST_ASSERT(rs->state == rst.u.runstate.state, "Runstate mismatch");
1090 			TEST_ASSERT(rs->state_entry_time == rst.u.runstate.state_entry_time,
1091 				    "State entry time mismatch");
1092 			TEST_ASSERT(rs->time[RUNSTATE_running] == rst.u.runstate.time_running,
1093 				    "Running time mismatch");
1094 			TEST_ASSERT(rs->time[RUNSTATE_runnable] == rst.u.runstate.time_runnable,
1095 				    "Runnable time mismatch");
1096 			TEST_ASSERT(rs->time[RUNSTATE_blocked] == rst.u.runstate.time_blocked,
1097 				    "Blocked time mismatch");
1098 			TEST_ASSERT(rs->time[RUNSTATE_offline] == rst.u.runstate.time_offline,
1099 				    "Offline time mismatch");
1100 			TEST_ASSERT(rs->time[RUNSTATE_offline + 1] == 0xa5a5a5a5a5a5a5a5ULL,
1101 				    "Structure overrun");
1102 
1103 			TEST_ASSERT(rs->state_entry_time == rs->time[0] +
1104 				    rs->time[1] + rs->time[2] + rs->time[3],
1105 				    "runstate times don't add up");
1106 		}
1107 	}
1108 
1109 	kvm_vm_free(vm);
1110 	return 0;
1111 }
1112