1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
 * xen_shinfo_test
4  *
5  * Copyright © 2021 Amazon.com, Inc. or its affiliates.
6  *
7  * Xen shared_info / pvclock testing
8  */
9 
10 #include "test_util.h"
11 #include "kvm_util.h"
12 #include "processor.h"
13 
14 #include <stdint.h>
15 #include <time.h>
16 #include <sched.h>
17 #include <signal.h>
18 #include <pthread.h>
19 
20 #include <sys/eventfd.h>
21 
22 /* Defined in include/linux/kvm_types.h */
23 #define GPA_INVALID		(~(ulong)0)
24 
25 #define SHINFO_REGION_GVA	0xc0000000ULL
26 #define SHINFO_REGION_GPA	0xc0000000ULL
27 #define SHINFO_REGION_SLOT	10
28 
29 #define DUMMY_REGION_GPA	(SHINFO_REGION_GPA + (3 * PAGE_SIZE))
30 #define DUMMY_REGION_SLOT	11
31 
32 #define SHINFO_ADDR	(SHINFO_REGION_GPA)
33 #define VCPU_INFO_ADDR	(SHINFO_REGION_GPA + 0x40)
34 #define PVTIME_ADDR	(SHINFO_REGION_GPA + PAGE_SIZE)
35 #define RUNSTATE_ADDR	(SHINFO_REGION_GPA + PAGE_SIZE + PAGE_SIZE - 15)
36 
37 #define SHINFO_VADDR	(SHINFO_REGION_GVA)
38 #define VCPU_INFO_VADDR	(SHINFO_REGION_GVA + 0x40)
39 #define RUNSTATE_VADDR	(SHINFO_REGION_GVA + PAGE_SIZE + PAGE_SIZE - 15)
40 
41 #define EVTCHN_VECTOR	0x10
42 
43 #define EVTCHN_TEST1 15
44 #define EVTCHN_TEST2 66
45 #define EVTCHN_TIMER 13
46 
47 #define XEN_HYPERCALL_MSR	0x40000000
48 
49 #define MIN_STEAL_TIME		50000
50 
51 #define SHINFO_RACE_TIMEOUT	2	/* seconds */
52 
53 #define __HYPERVISOR_set_timer_op	15
54 #define __HYPERVISOR_sched_op		29
55 #define __HYPERVISOR_event_channel_op	32
56 
57 #define SCHEDOP_poll			3
58 
59 #define EVTCHNOP_send			4
60 
61 #define EVTCHNSTAT_interdomain		2
62 
/*
 * Argument block for the EVTCHNOP_send event-channel hypercall; guest_code()
 * passes a pointer to one of these in RSI.
 */
struct evtchn_send {
	u32 port;	/* event channel port to signal (guest_code() uses 127 and 197) */
};
66 
/*
 * Argument block for the SCHEDOP_poll sched_op hypercall; guest_code()
 * passes a pointer to one of these in RSI.
 */
struct sched_poll {
	u32 *ports;		/* array of event channel ports to poll on */
	unsigned int nr_ports;	/* number of entries in ports[] */
	u64 timeout;		/* guest_code() uses 0 or 100000000; presumably ns - TODO confirm */
};
72 
/*
 * Per-vCPU pvclock time record. main() locates one copy inside vcpu_info and
 * one at PVTIME_ADDR, checks that 'version' is non-zero and even, and prints
 * the remaining fields in verbose mode.
 */
struct pvclock_vcpu_time_info {
	u32   version;
	u32   pad0;
	u64   tsc_timestamp;
	u64   system_time;
	u32   tsc_to_system_mul;
	s8    tsc_shift;
	u8    flags;
	u8    pad[2];
} __attribute__((__packed__)); /* 32 bytes */
83 
/*
 * Wall clock record embedded in struct shared_info. main() checks that
 * 'version' is non-zero and even, and that sec/nsec fall between two host
 * CLOCK_REALTIME samples taken around the test.
 */
struct pvclock_wall_clock {
	u32   version;
	u32   sec;
	u32   nsec;
} __attribute__((__packed__));
89 
/*
 * Runstate record as accessed by the guest and by main() in 64-bit mode
 * (natural alignment, so padding follows 'state'). time[] is indexed by
 * the RUNSTATE_* values; the fifth slot is never expected to be written
 * and is used by main() to detect overruns.
 */
struct vcpu_runstate_info {
	uint32_t state;
	uint64_t state_entry_time;
	uint64_t time[5]; /* Extra field for overrun check */
};
95 
/*
 * Packed variant of struct vcpu_runstate_info, used by main() to inspect the
 * runstate area after switching the VM to compatibility (non-long) mode:
 * state_entry_time directly follows 'state' with no padding. As in the
 * 64-bit variant, time[4] is an extra slot used only as an overrun canary.
 */
struct compat_vcpu_runstate_info {
	uint32_t state;
	uint64_t state_entry_time;
	uint64_t time[5];
} __attribute__((__packed__));
101 
/* Architecture-specific part of struct vcpu_info; not accessed by this test. */
struct arch_vcpu_info {
	unsigned long cr2;
	unsigned long pad; /* sizeof(vcpu_info_t) == 64 */
};
106 
/*
 * Per-vCPU area inside shared_info. The host sets evtchn_upcall_pending to
 * request an upcall; evtchn_handler() clears it together with
 * evtchn_pending_sel. 'time' is the first pvclock record checked by main().
 */
struct vcpu_info {
	uint8_t evtchn_upcall_pending;
	uint8_t evtchn_upcall_mask;
	unsigned long evtchn_pending_sel;
	struct arch_vcpu_info arch;
	struct pvclock_vcpu_time_info time;
}; /* 64 bytes (x86) */
114 
/*
 * Layout of the Xen shared_info page as used by this test. main() pokes
 * evtchn_pending[] / evtchn_mask[] directly through its host mapping and
 * saves/restores 'wc' around remapping the page.
 */
struct shared_info {
	struct vcpu_info vcpu_info[32];
	unsigned long evtchn_pending[64];	/* one bit per event channel port */
	unsigned long evtchn_mask[64];		/* one bit per event channel port */
	struct pvclock_wall_clock wc;
	uint32_t wc_sec_hi;
	/* arch_shared_info here */
};
123 
/*
 * Runstate identifiers. They double as indices into runstate_names[] and
 * into the time[] array of the runstate structures.
 */
#define RUNSTATE_running  0
#define RUNSTATE_runnable 1
#define RUNSTATE_blocked  2
#define RUNSTATE_offline  3

/* Printable name for each RUNSTATE_* value, used in verbose output. */
static const char *runstate_names[] = {
	[RUNSTATE_running]  = "running",
	[RUNSTATE_runnable] = "runnable",
	[RUNSTATE_blocked]  = "blocked",
	[RUNSTATE_offline]  = "offline",
};
135 
/*
 * GSI routing table passed to KVM_SET_GSI_ROUTING by main(): the routing
 * header immediately followed by the two Xen event channel entries
 * (GSI 32 and GSI 33) that main() fills in.
 */
struct {
	struct kvm_irq_routing info;
	struct kvm_irq_routing_entry entries[2];
} irq_routes;

/*
 * Set by evtchn_handler() when the upcall vector fires; consumed (and
 * cleared) by guest_wait_for_irq() and guest_code().
 */
static volatile bool guest_saw_irq;
142 
143 static void evtchn_handler(struct ex_regs *regs)
144 {
145 	struct vcpu_info *vi = (void *)VCPU_INFO_VADDR;
146 	vi->evtchn_upcall_pending = 0;
147 	vi->evtchn_pending_sel = 0;
148 	guest_saw_irq = true;
149 
150 	GUEST_SYNC(0x20);
151 }
152 
153 static void guest_wait_for_irq(void)
154 {
155 	while (!guest_saw_irq)
156 		__asm__ __volatile__ ("rep nop" : : : "memory");
157 	guest_saw_irq = false;
158 }
159 
160 static void guest_code(void)
161 {
162 	struct vcpu_runstate_info *rs = (void *)RUNSTATE_VADDR;
163 	int i;
164 
165 	__asm__ __volatile__(
166 		"sti\n"
167 		"nop\n"
168 	);
169 
170 	/* Trigger an interrupt injection */
171 	GUEST_SYNC(0);
172 
173 	guest_wait_for_irq();
174 
175 	/* Test having the host set runstates manually */
176 	GUEST_SYNC(RUNSTATE_runnable);
177 	GUEST_ASSERT(rs->time[RUNSTATE_runnable] != 0);
178 	GUEST_ASSERT(rs->state == 0);
179 
180 	GUEST_SYNC(RUNSTATE_blocked);
181 	GUEST_ASSERT(rs->time[RUNSTATE_blocked] != 0);
182 	GUEST_ASSERT(rs->state == 0);
183 
184 	GUEST_SYNC(RUNSTATE_offline);
185 	GUEST_ASSERT(rs->time[RUNSTATE_offline] != 0);
186 	GUEST_ASSERT(rs->state == 0);
187 
188 	/* Test runstate time adjust */
189 	GUEST_SYNC(4);
190 	GUEST_ASSERT(rs->time[RUNSTATE_blocked] == 0x5a);
191 	GUEST_ASSERT(rs->time[RUNSTATE_offline] == 0x6b6b);
192 
193 	/* Test runstate time set */
194 	GUEST_SYNC(5);
195 	GUEST_ASSERT(rs->state_entry_time >= 0x8000);
196 	GUEST_ASSERT(rs->time[RUNSTATE_runnable] == 0);
197 	GUEST_ASSERT(rs->time[RUNSTATE_blocked] == 0x6b6b);
198 	GUEST_ASSERT(rs->time[RUNSTATE_offline] == 0x5a);
199 
200 	/* sched_yield() should result in some 'runnable' time */
201 	GUEST_SYNC(6);
202 	GUEST_ASSERT(rs->time[RUNSTATE_runnable] >= MIN_STEAL_TIME);
203 
204 	/* Attempt to deliver a *masked* interrupt */
205 	GUEST_SYNC(7);
206 
207 	/* Wait until we see the bit set */
208 	struct shared_info *si = (void *)SHINFO_VADDR;
209 	while (!si->evtchn_pending[0])
210 		__asm__ __volatile__ ("rep nop" : : : "memory");
211 
212 	/* Now deliver an *unmasked* interrupt */
213 	GUEST_SYNC(8);
214 
215 	guest_wait_for_irq();
216 
217 	/* Change memslots and deliver an interrupt */
218 	GUEST_SYNC(9);
219 
220 	guest_wait_for_irq();
221 
222 	/* Deliver event channel with KVM_XEN_HVM_EVTCHN_SEND */
223 	GUEST_SYNC(10);
224 
225 	guest_wait_for_irq();
226 
227 	GUEST_SYNC(11);
228 
229 	/* Our turn. Deliver event channel (to ourselves) with
230 	 * EVTCHNOP_send hypercall. */
231 	unsigned long rax;
232 	struct evtchn_send s = { .port = 127 };
233 	__asm__ __volatile__ ("vmcall" :
234 			      "=a" (rax) :
235 			      "a" (__HYPERVISOR_event_channel_op),
236 			      "D" (EVTCHNOP_send),
237 			      "S" (&s));
238 
239 	GUEST_ASSERT(rax == 0);
240 
241 	guest_wait_for_irq();
242 
243 	GUEST_SYNC(12);
244 
245 	/* Deliver "outbound" event channel to an eventfd which
246 	 * happens to be one of our own irqfds. */
247 	s.port = 197;
248 	__asm__ __volatile__ ("vmcall" :
249 			      "=a" (rax) :
250 			      "a" (__HYPERVISOR_event_channel_op),
251 			      "D" (EVTCHNOP_send),
252 			      "S" (&s));
253 
254 	GUEST_ASSERT(rax == 0);
255 
256 	guest_wait_for_irq();
257 
258 	GUEST_SYNC(13);
259 
260 	/* Set a timer 100ms in the future. */
261 	__asm__ __volatile__ ("vmcall" :
262 			      "=a" (rax) :
263 			      "a" (__HYPERVISOR_set_timer_op),
264 			      "D" (rs->state_entry_time + 100000000));
265 	GUEST_ASSERT(rax == 0);
266 
267 	GUEST_SYNC(14);
268 
269 	/* Now wait for the timer */
270 	guest_wait_for_irq();
271 
272 	GUEST_SYNC(15);
273 
274 	/* The host has 'restored' the timer. Just wait for it. */
275 	guest_wait_for_irq();
276 
277 	GUEST_SYNC(16);
278 
279 	/* Poll for an event channel port which is already set */
280 	u32 ports[1] = { EVTCHN_TIMER };
281 	struct sched_poll p = {
282 		.ports = ports,
283 		.nr_ports = 1,
284 		.timeout = 0,
285 	};
286 
287 	__asm__ __volatile__ ("vmcall" :
288 			      "=a" (rax) :
289 			      "a" (__HYPERVISOR_sched_op),
290 			      "D" (SCHEDOP_poll),
291 			      "S" (&p));
292 
293 	GUEST_ASSERT(rax == 0);
294 
295 	GUEST_SYNC(17);
296 
297 	/* Poll for an unset port and wait for the timeout. */
298 	p.timeout = 100000000;
299 	__asm__ __volatile__ ("vmcall" :
300 			      "=a" (rax) :
301 			      "a" (__HYPERVISOR_sched_op),
302 			      "D" (SCHEDOP_poll),
303 			      "S" (&p));
304 
305 	GUEST_ASSERT(rax == 0);
306 
307 	GUEST_SYNC(18);
308 
309 	/* A timer will wake the masked port we're waiting on, while we poll */
310 	p.timeout = 0;
311 	__asm__ __volatile__ ("vmcall" :
312 			      "=a" (rax) :
313 			      "a" (__HYPERVISOR_sched_op),
314 			      "D" (SCHEDOP_poll),
315 			      "S" (&p));
316 
317 	GUEST_ASSERT(rax == 0);
318 
319 	GUEST_SYNC(19);
320 
321 	/* A timer wake an *unmasked* port which should wake us with an
322 	 * actual interrupt, while we're polling on a different port. */
323 	ports[0]++;
324 	p.timeout = 0;
325 	__asm__ __volatile__ ("vmcall" :
326 			      "=a" (rax) :
327 			      "a" (__HYPERVISOR_sched_op),
328 			      "D" (SCHEDOP_poll),
329 			      "S" (&p));
330 
331 	GUEST_ASSERT(rax == 0);
332 
333 	guest_wait_for_irq();
334 
335 	GUEST_SYNC(20);
336 
337 	/* Timer should have fired already */
338 	guest_wait_for_irq();
339 
340 	GUEST_SYNC(21);
341 	/* Racing host ioctls */
342 
343 	guest_wait_for_irq();
344 
345 	GUEST_SYNC(22);
346 	/* Racing vmcall against host ioctl */
347 
348 	ports[0] = 0;
349 
350 	p = (struct sched_poll) {
351 		.ports = ports,
352 		.nr_ports = 1,
353 		.timeout = 0
354 	};
355 
356 wait_for_timer:
357 	/*
358 	 * Poll for a timer wake event while the worker thread is mucking with
359 	 * the shared info.  KVM XEN drops timer IRQs if the shared info is
360 	 * invalid when the timer expires.  Arbitrarily poll 100 times before
361 	 * giving up and asking the VMM to re-arm the timer.  100 polls should
362 	 * consume enough time to beat on KVM without taking too long if the
363 	 * timer IRQ is dropped due to an invalid event channel.
364 	 */
365 	for (i = 0; i < 100 && !guest_saw_irq; i++)
366 		asm volatile("vmcall"
367 			     : "=a" (rax)
368 			     : "a" (__HYPERVISOR_sched_op),
369 			       "D" (SCHEDOP_poll),
370 			       "S" (&p)
371 			     : "memory");
372 
373 	/*
374 	 * Re-send the timer IRQ if it was (likely) dropped due to the timer
375 	 * expiring while the event channel was invalid.
376 	 */
377 	if (!guest_saw_irq) {
378 		GUEST_SYNC(23);
379 		goto wait_for_timer;
380 	}
381 	guest_saw_irq = false;
382 
383 	GUEST_SYNC(24);
384 }
385 
/*
 * Three-way comparison of two timespecs.
 *
 * Returns 1 if *a is later than *b, -1 if it is earlier, and 0 if both the
 * seconds and nanoseconds fields are equal. Seconds take precedence over
 * nanoseconds.
 */
static int cmp_timespec(struct timespec *a, struct timespec *b)
{
	if (a->tv_sec != b->tv_sec)
		return a->tv_sec > b->tv_sec ? 1 : -1;

	if (a->tv_nsec != b->tv_nsec)
		return a->tv_nsec > b->tv_nsec ? 1 : -1;

	return 0;
}
399 
/* Host mapping of the guest's vcpu_info; set in main(), read by handle_alrm(). */
static struct vcpu_info *vinfo;
/* The single vCPU created in main(); file scope so handle_alrm() can dump it. */
static struct kvm_vcpu *vcpu;
402 
403 static void handle_alrm(int sig)
404 {
405 	if (vinfo)
406 		printf("evtchn_upcall_pending 0x%x\n", vinfo->evtchn_upcall_pending);
407 	vcpu_dump(stdout, vcpu, 0);
408 	TEST_FAIL("IRQ delivery timed out");
409 }
410 
411 static void *juggle_shinfo_state(void *arg)
412 {
413 	struct kvm_vm *vm = (struct kvm_vm *)arg;
414 
415 	struct kvm_xen_hvm_attr cache_init = {
416 		.type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
417 		.u.shared_info.gfn = SHINFO_REGION_GPA / PAGE_SIZE
418 	};
419 
420 	struct kvm_xen_hvm_attr cache_destroy = {
421 		.type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
422 		.u.shared_info.gfn = GPA_INVALID
423 	};
424 
425 	for (;;) {
426 		__vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_init);
427 		__vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_destroy);
428 		pthread_testcancel();
429 	};
430 
431 	return NULL;
432 }
433 
434 int main(int argc, char *argv[])
435 {
436 	struct timespec min_ts, max_ts, vm_ts;
437 	struct kvm_vm *vm;
438 	pthread_t thread;
439 	bool verbose;
440 	int ret;
441 
442 	verbose = argc > 1 && (!strncmp(argv[1], "-v", 3) ||
443 			       !strncmp(argv[1], "--verbose", 10));
444 
445 	int xen_caps = kvm_check_cap(KVM_CAP_XEN_HVM);
446 	TEST_REQUIRE(xen_caps & KVM_XEN_HVM_CONFIG_SHARED_INFO);
447 
448 	bool do_runstate_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_RUNSTATE);
449 	bool do_runstate_flag = !!(xen_caps & KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG);
450 	bool do_eventfd_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL);
451 	bool do_evtchn_tests = do_eventfd_tests && !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND);
452 
453 	clock_gettime(CLOCK_REALTIME, &min_ts);
454 
455 	vm = vm_create_with_one_vcpu(&vcpu, guest_code);
456 
457 	/* Map a region for the shared_info page */
458 	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
459 				    SHINFO_REGION_GPA, SHINFO_REGION_SLOT, 3, 0);
460 	virt_map(vm, SHINFO_REGION_GVA, SHINFO_REGION_GPA, 3);
461 
462 	struct shared_info *shinfo = addr_gpa2hva(vm, SHINFO_VADDR);
463 
464 	int zero_fd = open("/dev/zero", O_RDONLY);
465 	TEST_ASSERT(zero_fd != -1, "Failed to open /dev/zero");
466 
467 	struct kvm_xen_hvm_config hvmc = {
468 		.flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
469 		.msr = XEN_HYPERCALL_MSR,
470 	};
471 
472 	/* Let the kernel know that we *will* use it for sending all
473 	 * event channels, which lets it intercept SCHEDOP_poll */
474 	if (do_evtchn_tests)
475 		hvmc.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
476 
477 	vm_ioctl(vm, KVM_XEN_HVM_CONFIG, &hvmc);
478 
479 	struct kvm_xen_hvm_attr lm = {
480 		.type = KVM_XEN_ATTR_TYPE_LONG_MODE,
481 		.u.long_mode = 1,
482 	};
483 	vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &lm);
484 
485 	if (do_runstate_flag) {
486 		struct kvm_xen_hvm_attr ruf = {
487 			.type = KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG,
488 			.u.runstate_update_flag = 1,
489 		};
490 		vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &ruf);
491 
492 		ruf.u.runstate_update_flag = 0;
493 		vm_ioctl(vm, KVM_XEN_HVM_GET_ATTR, &ruf);
494 		TEST_ASSERT(ruf.u.runstate_update_flag == 1,
495 			    "Failed to read back RUNSTATE_UPDATE_FLAG attr");
496 	}
497 
498 	struct kvm_xen_hvm_attr ha = {
499 		.type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
500 		.u.shared_info.gfn = SHINFO_REGION_GPA / PAGE_SIZE,
501 	};
502 	vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &ha);
503 
504 	/*
505 	 * Test what happens when the HVA of the shinfo page is remapped after
506 	 * the kernel has a reference to it. But make sure we copy the clock
507 	 * info over since that's only set at setup time, and we test it later.
508 	 */
509 	struct pvclock_wall_clock wc_copy = shinfo->wc;
510 	void *m = mmap(shinfo, PAGE_SIZE, PROT_READ|PROT_WRITE, MAP_FIXED|MAP_PRIVATE, zero_fd, 0);
511 	TEST_ASSERT(m == shinfo, "Failed to map /dev/zero over shared info");
512 	shinfo->wc = wc_copy;
513 
514 	struct kvm_xen_vcpu_attr vi = {
515 		.type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO,
516 		.u.gpa = VCPU_INFO_ADDR,
517 	};
518 	vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &vi);
519 
520 	struct kvm_xen_vcpu_attr pvclock = {
521 		.type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
522 		.u.gpa = PVTIME_ADDR,
523 	};
524 	vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &pvclock);
525 
526 	struct kvm_xen_hvm_attr vec = {
527 		.type = KVM_XEN_ATTR_TYPE_UPCALL_VECTOR,
528 		.u.vector = EVTCHN_VECTOR,
529 	};
530 	vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &vec);
531 
532 	vm_init_descriptor_tables(vm);
533 	vcpu_init_descriptor_tables(vcpu);
534 	vm_install_exception_handler(vm, EVTCHN_VECTOR, evtchn_handler);
535 
536 	if (do_runstate_tests) {
537 		struct kvm_xen_vcpu_attr st = {
538 			.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
539 			.u.gpa = RUNSTATE_ADDR,
540 		};
541 		vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &st);
542 	}
543 
544 	int irq_fd[2] = { -1, -1 };
545 
546 	if (do_eventfd_tests) {
547 		irq_fd[0] = eventfd(0, 0);
548 		irq_fd[1] = eventfd(0, 0);
549 
550 		/* Unexpected, but not a KVM failure */
551 		if (irq_fd[0] == -1 || irq_fd[1] == -1)
552 			do_evtchn_tests = do_eventfd_tests = false;
553 	}
554 
555 	if (do_eventfd_tests) {
556 		irq_routes.info.nr = 2;
557 
558 		irq_routes.entries[0].gsi = 32;
559 		irq_routes.entries[0].type = KVM_IRQ_ROUTING_XEN_EVTCHN;
560 		irq_routes.entries[0].u.xen_evtchn.port = EVTCHN_TEST1;
561 		irq_routes.entries[0].u.xen_evtchn.vcpu = vcpu->id;
562 		irq_routes.entries[0].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
563 
564 		irq_routes.entries[1].gsi = 33;
565 		irq_routes.entries[1].type = KVM_IRQ_ROUTING_XEN_EVTCHN;
566 		irq_routes.entries[1].u.xen_evtchn.port = EVTCHN_TEST2;
567 		irq_routes.entries[1].u.xen_evtchn.vcpu = vcpu->id;
568 		irq_routes.entries[1].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
569 
570 		vm_ioctl(vm, KVM_SET_GSI_ROUTING, &irq_routes.info);
571 
572 		struct kvm_irqfd ifd = { };
573 
574 		ifd.fd = irq_fd[0];
575 		ifd.gsi = 32;
576 		vm_ioctl(vm, KVM_IRQFD, &ifd);
577 
578 		ifd.fd = irq_fd[1];
579 		ifd.gsi = 33;
580 		vm_ioctl(vm, KVM_IRQFD, &ifd);
581 
582 		struct sigaction sa = { };
583 		sa.sa_handler = handle_alrm;
584 		sigaction(SIGALRM, &sa, NULL);
585 	}
586 
587 	struct kvm_xen_vcpu_attr tmr = {
588 		.type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
589 		.u.timer.port = EVTCHN_TIMER,
590 		.u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
591 		.u.timer.expires_ns = 0
592 	};
593 
594 	if (do_evtchn_tests) {
595 		struct kvm_xen_hvm_attr inj = {
596 			.type = KVM_XEN_ATTR_TYPE_EVTCHN,
597 			.u.evtchn.send_port = 127,
598 			.u.evtchn.type = EVTCHNSTAT_interdomain,
599 			.u.evtchn.flags = 0,
600 			.u.evtchn.deliver.port.port = EVTCHN_TEST1,
601 			.u.evtchn.deliver.port.vcpu = vcpu->id + 1,
602 			.u.evtchn.deliver.port.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
603 		};
604 		vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj);
605 
606 		/* Test migration to a different vCPU */
607 		inj.u.evtchn.flags = KVM_XEN_EVTCHN_UPDATE;
608 		inj.u.evtchn.deliver.port.vcpu = vcpu->id;
609 		vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj);
610 
611 		inj.u.evtchn.send_port = 197;
612 		inj.u.evtchn.deliver.eventfd.port = 0;
613 		inj.u.evtchn.deliver.eventfd.fd = irq_fd[1];
614 		inj.u.evtchn.flags = 0;
615 		vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj);
616 
617 		vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr);
618 	}
619 	vinfo = addr_gpa2hva(vm, VCPU_INFO_VADDR);
620 	vinfo->evtchn_upcall_pending = 0;
621 
622 	struct vcpu_runstate_info *rs = addr_gpa2hva(vm, RUNSTATE_ADDR);
623 	rs->state = 0x5a;
624 
625 	bool evtchn_irq_expected = false;
626 
627 	for (;;) {
628 		volatile struct kvm_run *run = vcpu->run;
629 		struct ucall uc;
630 
631 		vcpu_run(vcpu);
632 
633 		TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
634 			    "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n",
635 			    run->exit_reason,
636 			    exit_reason_str(run->exit_reason));
637 
638 		switch (get_ucall(vcpu, &uc)) {
639 		case UCALL_ABORT:
640 			REPORT_GUEST_ASSERT(uc);
641 			/* NOT REACHED */
642 		case UCALL_SYNC: {
643 			struct kvm_xen_vcpu_attr rst;
644 			long rundelay;
645 
646 			if (do_runstate_tests)
647 				TEST_ASSERT(rs->state_entry_time == rs->time[0] +
648 					    rs->time[1] + rs->time[2] + rs->time[3],
649 					    "runstate times don't add up");
650 
651 			switch (uc.args[1]) {
652 			case 0:
653 				if (verbose)
654 					printf("Delivering evtchn upcall\n");
655 				evtchn_irq_expected = true;
656 				vinfo->evtchn_upcall_pending = 1;
657 				break;
658 
659 			case RUNSTATE_runnable...RUNSTATE_offline:
660 				TEST_ASSERT(!evtchn_irq_expected, "Event channel IRQ not seen");
661 				if (!do_runstate_tests)
662 					goto done;
663 				if (verbose)
664 					printf("Testing runstate %s\n", runstate_names[uc.args[1]]);
665 				rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT;
666 				rst.u.runstate.state = uc.args[1];
667 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &rst);
668 				break;
669 
670 			case 4:
671 				if (verbose)
672 					printf("Testing RUNSTATE_ADJUST\n");
673 				rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST;
674 				memset(&rst.u, 0, sizeof(rst.u));
675 				rst.u.runstate.state = (uint64_t)-1;
676 				rst.u.runstate.time_blocked =
677 					0x5a - rs->time[RUNSTATE_blocked];
678 				rst.u.runstate.time_offline =
679 					0x6b6b - rs->time[RUNSTATE_offline];
680 				rst.u.runstate.time_runnable = -rst.u.runstate.time_blocked -
681 					rst.u.runstate.time_offline;
682 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &rst);
683 				break;
684 
685 			case 5:
686 				if (verbose)
687 					printf("Testing RUNSTATE_DATA\n");
688 				rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA;
689 				memset(&rst.u, 0, sizeof(rst.u));
690 				rst.u.runstate.state = RUNSTATE_running;
691 				rst.u.runstate.state_entry_time = 0x6b6b + 0x5a;
692 				rst.u.runstate.time_blocked = 0x6b6b;
693 				rst.u.runstate.time_offline = 0x5a;
694 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &rst);
695 				break;
696 
697 			case 6:
698 				if (verbose)
699 					printf("Testing steal time\n");
700 				/* Yield until scheduler delay exceeds target */
701 				rundelay = get_run_delay() + MIN_STEAL_TIME;
702 				do {
703 					sched_yield();
704 				} while (get_run_delay() < rundelay);
705 				break;
706 
707 			case 7:
708 				if (!do_eventfd_tests)
709 					goto done;
710 				if (verbose)
711 					printf("Testing masked event channel\n");
712 				shinfo->evtchn_mask[0] = 1UL << EVTCHN_TEST1;
713 				eventfd_write(irq_fd[0], 1UL);
714 				alarm(1);
715 				break;
716 
717 			case 8:
718 				if (verbose)
719 					printf("Testing unmasked event channel\n");
720 				/* Unmask that, but deliver the other one */
721 				shinfo->evtchn_pending[0] = 0;
722 				shinfo->evtchn_mask[0] = 0;
723 				eventfd_write(irq_fd[1], 1UL);
724 				evtchn_irq_expected = true;
725 				alarm(1);
726 				break;
727 
728 			case 9:
729 				TEST_ASSERT(!evtchn_irq_expected,
730 					    "Expected event channel IRQ but it didn't happen");
731 				shinfo->evtchn_pending[1] = 0;
732 				if (verbose)
733 					printf("Testing event channel after memslot change\n");
734 				vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
735 							    DUMMY_REGION_GPA, DUMMY_REGION_SLOT, 1, 0);
736 				eventfd_write(irq_fd[0], 1UL);
737 				evtchn_irq_expected = true;
738 				alarm(1);
739 				break;
740 
741 			case 10:
742 				TEST_ASSERT(!evtchn_irq_expected,
743 					    "Expected event channel IRQ but it didn't happen");
744 				if (!do_evtchn_tests)
745 					goto done;
746 
747 				shinfo->evtchn_pending[0] = 0;
748 				if (verbose)
749 					printf("Testing injection with KVM_XEN_HVM_EVTCHN_SEND\n");
750 
751 				struct kvm_irq_routing_xen_evtchn e;
752 				e.port = EVTCHN_TEST2;
753 				e.vcpu = vcpu->id;
754 				e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
755 
756 				vm_ioctl(vm, KVM_XEN_HVM_EVTCHN_SEND, &e);
757 				evtchn_irq_expected = true;
758 				alarm(1);
759 				break;
760 
761 			case 11:
762 				TEST_ASSERT(!evtchn_irq_expected,
763 					    "Expected event channel IRQ but it didn't happen");
764 				shinfo->evtchn_pending[1] = 0;
765 
766 				if (verbose)
767 					printf("Testing guest EVTCHNOP_send direct to evtchn\n");
768 				evtchn_irq_expected = true;
769 				alarm(1);
770 				break;
771 
772 			case 12:
773 				TEST_ASSERT(!evtchn_irq_expected,
774 					    "Expected event channel IRQ but it didn't happen");
775 				shinfo->evtchn_pending[0] = 0;
776 
777 				if (verbose)
778 					printf("Testing guest EVTCHNOP_send to eventfd\n");
779 				evtchn_irq_expected = true;
780 				alarm(1);
781 				break;
782 
783 			case 13:
784 				TEST_ASSERT(!evtchn_irq_expected,
785 					    "Expected event channel IRQ but it didn't happen");
786 				shinfo->evtchn_pending[1] = 0;
787 
788 				if (verbose)
789 					printf("Testing guest oneshot timer\n");
790 				break;
791 
792 			case 14:
793 				memset(&tmr, 0, sizeof(tmr));
794 				tmr.type = KVM_XEN_VCPU_ATTR_TYPE_TIMER;
795 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &tmr);
796 				TEST_ASSERT(tmr.u.timer.port == EVTCHN_TIMER,
797 					    "Timer port not returned");
798 				TEST_ASSERT(tmr.u.timer.priority == KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
799 					    "Timer priority not returned");
800 				TEST_ASSERT(tmr.u.timer.expires_ns > rs->state_entry_time,
801 					    "Timer expiry not returned");
802 				evtchn_irq_expected = true;
803 				alarm(1);
804 				break;
805 
806 			case 15:
807 				TEST_ASSERT(!evtchn_irq_expected,
808 					    "Expected event channel IRQ but it didn't happen");
809 				shinfo->evtchn_pending[0] = 0;
810 
811 				if (verbose)
812 					printf("Testing restored oneshot timer\n");
813 
814 				tmr.u.timer.expires_ns = rs->state_entry_time + 100000000;
815 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr);
816 				evtchn_irq_expected = true;
817 				alarm(1);
818 				break;
819 
820 			case 16:
821 				TEST_ASSERT(!evtchn_irq_expected,
822 					    "Expected event channel IRQ but it didn't happen");
823 
824 				if (verbose)
825 					printf("Testing SCHEDOP_poll with already pending event\n");
826 				shinfo->evtchn_pending[0] = shinfo->evtchn_mask[0] = 1UL << EVTCHN_TIMER;
827 				alarm(1);
828 				break;
829 
830 			case 17:
831 				if (verbose)
832 					printf("Testing SCHEDOP_poll timeout\n");
833 				shinfo->evtchn_pending[0] = 0;
834 				alarm(1);
835 				break;
836 
837 			case 18:
838 				if (verbose)
839 					printf("Testing SCHEDOP_poll wake on masked event\n");
840 
841 				tmr.u.timer.expires_ns = rs->state_entry_time + 100000000;
842 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr);
843 				alarm(1);
844 				break;
845 
846 			case 19:
847 				shinfo->evtchn_pending[0] = shinfo->evtchn_mask[0] = 0;
848 				if (verbose)
849 					printf("Testing SCHEDOP_poll wake on unmasked event\n");
850 
851 				evtchn_irq_expected = true;
852 				tmr.u.timer.expires_ns = rs->state_entry_time + 100000000;
853 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr);
854 
855 				/* Read it back and check the pending time is reported correctly */
856 				tmr.u.timer.expires_ns = 0;
857 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &tmr);
858 				TEST_ASSERT(tmr.u.timer.expires_ns == rs->state_entry_time + 100000000,
859 					    "Timer not reported pending");
860 				alarm(1);
861 				break;
862 
863 			case 20:
864 				TEST_ASSERT(!evtchn_irq_expected,
865 					    "Expected event channel IRQ but it didn't happen");
866 				/* Read timer and check it is no longer pending */
867 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &tmr);
868 				TEST_ASSERT(!tmr.u.timer.expires_ns, "Timer still reported pending");
869 
870 				shinfo->evtchn_pending[0] = 0;
871 				if (verbose)
872 					printf("Testing timer in the past\n");
873 
874 				evtchn_irq_expected = true;
875 				tmr.u.timer.expires_ns = rs->state_entry_time - 100000000ULL;
876 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr);
877 				alarm(1);
878 				break;
879 
880 			case 21:
881 				TEST_ASSERT(!evtchn_irq_expected,
882 					    "Expected event channel IRQ but it didn't happen");
883 				alarm(0);
884 
885 				if (verbose)
886 					printf("Testing shinfo lock corruption (KVM_XEN_HVM_EVTCHN_SEND)\n");
887 
888 				ret = pthread_create(&thread, NULL, &juggle_shinfo_state, (void *)vm);
889 				TEST_ASSERT(ret == 0, "pthread_create() failed: %s", strerror(ret));
890 
891 				struct kvm_irq_routing_xen_evtchn uxe = {
892 					.port = 1,
893 					.vcpu = vcpu->id,
894 					.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL
895 				};
896 
897 				evtchn_irq_expected = true;
898 				for (time_t t = time(NULL) + SHINFO_RACE_TIMEOUT; time(NULL) < t;)
899 					__vm_ioctl(vm, KVM_XEN_HVM_EVTCHN_SEND, &uxe);
900 				break;
901 
902 			case 22:
903 				TEST_ASSERT(!evtchn_irq_expected,
904 					    "Expected event channel IRQ but it didn't happen");
905 
906 				if (verbose)
907 					printf("Testing shinfo lock corruption (SCHEDOP_poll)\n");
908 
909 				shinfo->evtchn_pending[0] = 1;
910 
911 				evtchn_irq_expected = true;
912 				tmr.u.timer.expires_ns = rs->state_entry_time +
913 							 SHINFO_RACE_TIMEOUT * 1000000000ULL;
914 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr);
915 				break;
916 
917 			case 23:
918 				/*
919 				 * Optional and possibly repeated sync point.
920 				 * Injecting the timer IRQ may fail if the
921 				 * shinfo is invalid when the timer expires.
922 				 * If the timer has expired but the IRQ hasn't
923 				 * been delivered, rearm the timer and retry.
924 				 */
925 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &tmr);
926 
927 				/* Resume the guest if the timer is still pending. */
928 				if (tmr.u.timer.expires_ns)
929 					break;
930 
931 				/* All done if the IRQ was delivered. */
932 				if (!evtchn_irq_expected)
933 					break;
934 
935 				tmr.u.timer.expires_ns = rs->state_entry_time +
936 							 SHINFO_RACE_TIMEOUT * 1000000000ULL;
937 				vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr);
938 				break;
939 			case 24:
940 				TEST_ASSERT(!evtchn_irq_expected,
941 					    "Expected event channel IRQ but it didn't happen");
942 
943 				ret = pthread_cancel(thread);
944 				TEST_ASSERT(ret == 0, "pthread_cancel() failed: %s", strerror(ret));
945 
946 				ret = pthread_join(thread, 0);
947 				TEST_ASSERT(ret == 0, "pthread_join() failed: %s", strerror(ret));
948 				goto done;
949 
950 			case 0x20:
951 				TEST_ASSERT(evtchn_irq_expected, "Unexpected event channel IRQ");
952 				evtchn_irq_expected = false;
953 				break;
954 			}
955 			break;
956 		}
957 		case UCALL_DONE:
958 			goto done;
959 		default:
960 			TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd);
961 		}
962 	}
963 
964  done:
965 	struct kvm_xen_hvm_attr evt_reset = {
966 		.type = KVM_XEN_ATTR_TYPE_EVTCHN,
967 		.u.evtchn.flags = KVM_XEN_EVTCHN_RESET,
968 	};
969 	vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &evt_reset);
970 
971 	alarm(0);
972 	clock_gettime(CLOCK_REALTIME, &max_ts);
973 
974 	/*
975 	 * Just a *really* basic check that things are being put in the
976 	 * right place. The actual calculations are much the same for
977 	 * Xen as they are for the KVM variants, so no need to check.
978 	 */
979 	struct pvclock_wall_clock *wc;
980 	struct pvclock_vcpu_time_info *ti, *ti2;
981 
982 	wc = addr_gpa2hva(vm, SHINFO_REGION_GPA + 0xc00);
983 	ti = addr_gpa2hva(vm, SHINFO_REGION_GPA + 0x40 + 0x20);
984 	ti2 = addr_gpa2hva(vm, PVTIME_ADDR);
985 
986 	if (verbose) {
987 		printf("Wall clock (v %d) %d.%09d\n", wc->version, wc->sec, wc->nsec);
988 		printf("Time info 1: v %u tsc %" PRIu64 " time %" PRIu64 " mul %u shift %u flags %x\n",
989 		       ti->version, ti->tsc_timestamp, ti->system_time, ti->tsc_to_system_mul,
990 		       ti->tsc_shift, ti->flags);
991 		printf("Time info 2: v %u tsc %" PRIu64 " time %" PRIu64 " mul %u shift %u flags %x\n",
992 		       ti2->version, ti2->tsc_timestamp, ti2->system_time, ti2->tsc_to_system_mul,
993 		       ti2->tsc_shift, ti2->flags);
994 	}
995 
996 	vm_ts.tv_sec = wc->sec;
997 	vm_ts.tv_nsec = wc->nsec;
998 	TEST_ASSERT(wc->version && !(wc->version & 1),
999 		    "Bad wallclock version %x", wc->version);
1000 	TEST_ASSERT(cmp_timespec(&min_ts, &vm_ts) <= 0, "VM time too old");
1001 	TEST_ASSERT(cmp_timespec(&max_ts, &vm_ts) >= 0, "VM time too new");
1002 
1003 	TEST_ASSERT(ti->version && !(ti->version & 1),
1004 		    "Bad time_info version %x", ti->version);
1005 	TEST_ASSERT(ti2->version && !(ti2->version & 1),
1006 		    "Bad time_info version %x", ti->version);
1007 
1008 	if (do_runstate_tests) {
1009 		/*
1010 		 * Fetch runstate and check sanity. Strictly speaking in the
1011 		 * general case we might not expect the numbers to be identical
1012 		 * but in this case we know we aren't running the vCPU any more.
1013 		 */
1014 		struct kvm_xen_vcpu_attr rst = {
1015 			.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA,
1016 		};
1017 		vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &rst);
1018 
1019 		if (verbose) {
1020 			printf("Runstate: %s(%d), entry %" PRIu64 " ns\n",
1021 			       rs->state <= RUNSTATE_offline ? runstate_names[rs->state] : "unknown",
1022 			       rs->state, rs->state_entry_time);
1023 			for (int i = RUNSTATE_running; i <= RUNSTATE_offline; i++) {
1024 				printf("State %s: %" PRIu64 " ns\n",
1025 				       runstate_names[i], rs->time[i]);
1026 			}
1027 		}
1028 
1029 		/*
1030 		 * Exercise runstate info at all points across the page boundary, in
1031 		 * 32-bit and 64-bit mode. In particular, test the case where it is
1032 		 * configured in 32-bit mode and then switched to 64-bit mode while
1033 		 * active, which takes it onto the second page.
1034 		 */
1035 		unsigned long runstate_addr;
1036 		struct compat_vcpu_runstate_info *crs;
1037 		for (runstate_addr = SHINFO_REGION_GPA + PAGE_SIZE + PAGE_SIZE - sizeof(*rs) - 4;
1038 		     runstate_addr < SHINFO_REGION_GPA + PAGE_SIZE + PAGE_SIZE + 4; runstate_addr++) {
1039 
1040 			rs = addr_gpa2hva(vm, runstate_addr);
1041 			crs = (void *)rs;
1042 
1043 			memset(rs, 0xa5, sizeof(*rs));
1044 
1045 			/* Set to compatibility mode */
1046 			lm.u.long_mode = 0;
1047 			vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &lm);
1048 
1049 			/* Set runstate to new address (kernel will write it) */
1050 			struct kvm_xen_vcpu_attr st = {
1051 				.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
1052 				.u.gpa = runstate_addr,
1053 			};
1054 			vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &st);
1055 
1056 			if (verbose)
1057 				printf("Compatibility runstate at %08lx\n", runstate_addr);
1058 
1059 			TEST_ASSERT(crs->state == rst.u.runstate.state, "Runstate mismatch");
1060 			TEST_ASSERT(crs->state_entry_time == rst.u.runstate.state_entry_time,
1061 				    "State entry time mismatch");
1062 			TEST_ASSERT(crs->time[RUNSTATE_running] == rst.u.runstate.time_running,
1063 				    "Running time mismatch");
1064 			TEST_ASSERT(crs->time[RUNSTATE_runnable] == rst.u.runstate.time_runnable,
1065 				    "Runnable time mismatch");
1066 			TEST_ASSERT(crs->time[RUNSTATE_blocked] == rst.u.runstate.time_blocked,
1067 				    "Blocked time mismatch");
1068 			TEST_ASSERT(crs->time[RUNSTATE_offline] == rst.u.runstate.time_offline,
1069 				    "Offline time mismatch");
1070 			TEST_ASSERT(crs->time[RUNSTATE_offline + 1] == 0xa5a5a5a5a5a5a5a5ULL,
1071 				    "Structure overrun");
1072 			TEST_ASSERT(crs->state_entry_time == crs->time[0] +
1073 				    crs->time[1] + crs->time[2] + crs->time[3],
1074 				    "runstate times don't add up");
1075 
1076 
1077 			/* Now switch to 64-bit mode */
1078 			lm.u.long_mode = 1;
1079 			vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &lm);
1080 
1081 			memset(rs, 0xa5, sizeof(*rs));
1082 
1083 			/* Don't change the address, just trigger a write */
1084 			struct kvm_xen_vcpu_attr adj = {
1085 				.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST,
1086 				.u.runstate.state = (uint64_t)-1
1087 			};
1088 			vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &adj);
1089 
1090 			if (verbose)
1091 				printf("64-bit runstate at %08lx\n", runstate_addr);
1092 
1093 			TEST_ASSERT(rs->state == rst.u.runstate.state, "Runstate mismatch");
1094 			TEST_ASSERT(rs->state_entry_time == rst.u.runstate.state_entry_time,
1095 				    "State entry time mismatch");
1096 			TEST_ASSERT(rs->time[RUNSTATE_running] == rst.u.runstate.time_running,
1097 				    "Running time mismatch");
1098 			TEST_ASSERT(rs->time[RUNSTATE_runnable] == rst.u.runstate.time_runnable,
1099 				    "Runnable time mismatch");
1100 			TEST_ASSERT(rs->time[RUNSTATE_blocked] == rst.u.runstate.time_blocked,
1101 				    "Blocked time mismatch");
1102 			TEST_ASSERT(rs->time[RUNSTATE_offline] == rst.u.runstate.time_offline,
1103 				    "Offline time mismatch");
1104 			TEST_ASSERT(rs->time[RUNSTATE_offline + 1] == 0xa5a5a5a5a5a5a5a5ULL,
1105 				    "Structure overrun");
1106 
1107 			TEST_ASSERT(rs->state_entry_time == rs->time[0] +
1108 				    rs->time[1] + rs->time[2] + rs->time[3],
1109 				    "runstate times don't add up");
1110 		}
1111 	}
1112 
1113 	kvm_vm_free(vm);
1114 	return 0;
1115 }
1116