1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   * tools/testing/selftests/kvm/lib/kvm_util.c
4   *
5   * Copyright (C) 2018, Google LLC.
6   */
7  
8  #define _GNU_SOURCE /* for program_invocation_name */
9  #include "test_util.h"
10  #include "kvm_util.h"
11  #include "processor.h"
12  
13  #include <assert.h>
14  #include <sched.h>
15  #include <sys/mman.h>
16  #include <sys/types.h>
17  #include <sys/stat.h>
18  #include <unistd.h>
19  #include <linux/kernel.h>
20  
21  #define KVM_UTIL_MIN_PFN	2
22  
23  static int vcpu_mmap_sz(void);
24  
25  int open_path_or_exit(const char *path, int flags)
26  {
27  	int fd;
28  
29  	fd = open(path, flags);
30  	__TEST_REQUIRE(fd >= 0, "%s not available (errno: %d)", path, errno);
31  
32  	return fd;
33  }
34  
35  /*
36   * Open KVM_DEV_PATH if available, otherwise exit the entire program.
37   *
38   * Input Args:
39   *   flags - The flags to pass when opening KVM_DEV_PATH.
40   *
41   * Return:
42   *   The opened file descriptor of /dev/kvm.
43   */
44  static int _open_kvm_dev_path_or_exit(int flags)
45  {
46  	return open_path_or_exit(KVM_DEV_PATH, flags);
47  }
48  
49  int open_kvm_dev_path_or_exit(void)
50  {
51  	return _open_kvm_dev_path_or_exit(O_RDONLY);
52  }
53  
54  static bool get_module_param_bool(const char *module_name, const char *param)
55  {
56  	const int path_size = 128;
57  	char path[path_size];
58  	char value;
59  	ssize_t r;
60  	int fd;
61  
62  	r = snprintf(path, path_size, "/sys/module/%s/parameters/%s",
63  		     module_name, param);
64  	TEST_ASSERT(r < path_size,
65  		    "Failed to construct sysfs path in %d bytes.", path_size);
66  
67  	fd = open_path_or_exit(path, O_RDONLY);
68  
69  	r = read(fd, &value, 1);
70  	TEST_ASSERT(r == 1, "read(%s) failed", path);
71  
72  	r = close(fd);
73  	TEST_ASSERT(!r, "close(%s) failed", path);
74  
75  	if (value == 'Y')
76  		return true;
77  	else if (value == 'N')
78  		return false;
79  
80  	TEST_FAIL("Unrecognized value '%c' for boolean module param", value);
81  }
82  
83  bool get_kvm_param_bool(const char *param)
84  {
85  	return get_module_param_bool("kvm", param);
86  }
87  
88  bool get_kvm_intel_param_bool(const char *param)
89  {
90  	return get_module_param_bool("kvm_intel", param);
91  }
92  
93  bool get_kvm_amd_param_bool(const char *param)
94  {
95  	return get_module_param_bool("kvm_amd", param);
96  }
97  
98  /*
99   * Capability
100   *
101   * Input Args:
102   *   cap - Capability
103   *
104   * Output Args: None
105   *
106   * Return:
107   *   On success, the value corresponding to the capability (KVM_CAP_*)
108   *   specified by cap.  On failure, a TEST_ASSERT failure is
109   *   produced.
110   *
111   * Looks up and returns the value corresponding to the capability
112   * (KVM_CAP_*) given by cap.
113   */
114  unsigned int kvm_check_cap(long cap)
115  {
116  	int ret;
117  	int kvm_fd;
118  
119  	kvm_fd = open_kvm_dev_path_or_exit();
120  	ret = __kvm_ioctl(kvm_fd, KVM_CHECK_EXTENSION, (void *)cap);
121  	TEST_ASSERT(ret >= 0, KVM_IOCTL_ERROR(KVM_CHECK_EXTENSION, ret));
122  
123  	close(kvm_fd);
124  
125  	return (unsigned int)ret;
126  }
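
/*
 * Illustrative usage (not part of the library): query capabilities before
 * relying on them.  A minimal sketch; example_check_caps is a hypothetical
 * helper a test might carry, and the capability constants come from the KVM
 * uapi headers already pulled in via kvm_util.h.
 */
static void __attribute__((unused)) example_check_caps(void)
{
	unsigned int max_vcpus;

	/* Skip the test entirely if dirty ring logging isn't supported. */
	TEST_REQUIRE(kvm_has_cap(KVM_CAP_DIRTY_LOG_RING));

	/* KVM_CHECK_EXTENSION may also return a value, e.g. a limit. */
	max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS);
	pr_info("KVM supports up to %u vCPUs\n", max_vcpus);
}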
127  
128  void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size)
129  {
130  	if (vm_check_cap(vm, KVM_CAP_DIRTY_LOG_RING_ACQ_REL))
131  		vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING_ACQ_REL, ring_size);
132  	else
133  		vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING, ring_size);
134  	vm->dirty_ring_size = ring_size;
135  }
136  
137  static void vm_open(struct kvm_vm *vm)
138  {
139  	vm->kvm_fd = _open_kvm_dev_path_or_exit(O_RDWR);
140  
141  	TEST_REQUIRE(kvm_has_cap(KVM_CAP_IMMEDIATE_EXIT));
142  
143  	vm->fd = __kvm_ioctl(vm->kvm_fd, KVM_CREATE_VM, (void *)vm->type);
144  	TEST_ASSERT(vm->fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VM, vm->fd));
145  }
146  
147  const char *vm_guest_mode_string(uint32_t i)
148  {
149  	static const char * const strings[] = {
150  		[VM_MODE_P52V48_4K]	= "PA-bits:52,  VA-bits:48,  4K pages",
151  		[VM_MODE_P52V48_64K]	= "PA-bits:52,  VA-bits:48, 64K pages",
152  		[VM_MODE_P48V48_4K]	= "PA-bits:48,  VA-bits:48,  4K pages",
153  		[VM_MODE_P48V48_16K]	= "PA-bits:48,  VA-bits:48, 16K pages",
154  		[VM_MODE_P48V48_64K]	= "PA-bits:48,  VA-bits:48, 64K pages",
155  		[VM_MODE_P40V48_4K]	= "PA-bits:40,  VA-bits:48,  4K pages",
156  		[VM_MODE_P40V48_16K]	= "PA-bits:40,  VA-bits:48, 16K pages",
157  		[VM_MODE_P40V48_64K]	= "PA-bits:40,  VA-bits:48, 64K pages",
158  		[VM_MODE_PXXV48_4K]	= "PA-bits:ANY, VA-bits:48,  4K pages",
159  		[VM_MODE_P47V64_4K]	= "PA-bits:47,  VA-bits:64,  4K pages",
160  		[VM_MODE_P44V64_4K]	= "PA-bits:44,  VA-bits:64,  4K pages",
161  		[VM_MODE_P36V48_4K]	= "PA-bits:36,  VA-bits:48,  4K pages",
162  		[VM_MODE_P36V48_16K]	= "PA-bits:36,  VA-bits:48, 16K pages",
163  		[VM_MODE_P36V48_64K]	= "PA-bits:36,  VA-bits:48, 64K pages",
164  		[VM_MODE_P36V47_16K]	= "PA-bits:36,  VA-bits:47, 16K pages",
165  	};
166  	_Static_assert(sizeof(strings)/sizeof(char *) == NUM_VM_MODES,
167  		       "Missing new mode strings?");
168  
169  	TEST_ASSERT(i < NUM_VM_MODES, "Guest mode ID %d too big", i);
170  
171  	return strings[i];
172  }
173  
174  const struct vm_guest_mode_params vm_guest_mode_params[] = {
175  	[VM_MODE_P52V48_4K]	= { 52, 48,  0x1000, 12 },
176  	[VM_MODE_P52V48_64K]	= { 52, 48, 0x10000, 16 },
177  	[VM_MODE_P48V48_4K]	= { 48, 48,  0x1000, 12 },
178  	[VM_MODE_P48V48_16K]	= { 48, 48,  0x4000, 14 },
179  	[VM_MODE_P48V48_64K]	= { 48, 48, 0x10000, 16 },
180  	[VM_MODE_P40V48_4K]	= { 40, 48,  0x1000, 12 },
181  	[VM_MODE_P40V48_16K]	= { 40, 48,  0x4000, 14 },
182  	[VM_MODE_P40V48_64K]	= { 40, 48, 0x10000, 16 },
183  	[VM_MODE_PXXV48_4K]	= {  0,  0,  0x1000, 12 },
184  	[VM_MODE_P47V64_4K]	= { 47, 64,  0x1000, 12 },
185  	[VM_MODE_P44V64_4K]	= { 44, 64,  0x1000, 12 },
186  	[VM_MODE_P36V48_4K]	= { 36, 48,  0x1000, 12 },
187  	[VM_MODE_P36V48_16K]	= { 36, 48,  0x4000, 14 },
188  	[VM_MODE_P36V48_64K]	= { 36, 48, 0x10000, 16 },
189  	[VM_MODE_P36V47_16K]	= { 36, 47,  0x4000, 14 },
190  };
191  _Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) == NUM_VM_MODES,
192  	       "Missing new mode params?");
193  
194  /*
195   * Initializes vm->vpages_valid to match the canonical VA space of the
196   * architecture.
197   *
198   * The default implementation is valid for architectures which split the
199   * range addressed by a single page table into a low and high region
200   * based on the MSB of the VA. On architectures with this behavior
201   * the VA region spans [0, 2^(va_bits - 1)) and [-(2^(va_bits - 1)), -1].
202   */
203  __weak void vm_vaddr_populate_bitmap(struct kvm_vm *vm)
204  {
205  	sparsebit_set_num(vm->vpages_valid,
206  		0, (1ULL << (vm->va_bits - 1)) >> vm->page_shift);
207  	sparsebit_set_num(vm->vpages_valid,
208  		(~((1ULL << (vm->va_bits - 1)) - 1)) >> vm->page_shift,
209  		(1ULL << (vm->va_bits - 1)) >> vm->page_shift);
210  }
211  
212  struct kvm_vm *____vm_create(enum vm_guest_mode mode)
213  {
214  	struct kvm_vm *vm;
215  
216  	vm = calloc(1, sizeof(*vm));
217  	TEST_ASSERT(vm != NULL, "Insufficient Memory");
218  
219  	INIT_LIST_HEAD(&vm->vcpus);
220  	vm->regions.gpa_tree = RB_ROOT;
221  	vm->regions.hva_tree = RB_ROOT;
222  	hash_init(vm->regions.slot_hash);
223  
224  	vm->mode = mode;
225  	vm->type = 0;
226  
227  	vm->pa_bits = vm_guest_mode_params[mode].pa_bits;
228  	vm->va_bits = vm_guest_mode_params[mode].va_bits;
229  	vm->page_size = vm_guest_mode_params[mode].page_size;
230  	vm->page_shift = vm_guest_mode_params[mode].page_shift;
231  
232  	/* Setup mode specific traits. */
233  	switch (vm->mode) {
234  	case VM_MODE_P52V48_4K:
235  		vm->pgtable_levels = 4;
236  		break;
237  	case VM_MODE_P52V48_64K:
238  		vm->pgtable_levels = 3;
239  		break;
240  	case VM_MODE_P48V48_4K:
241  		vm->pgtable_levels = 4;
242  		break;
243  	case VM_MODE_P48V48_64K:
244  		vm->pgtable_levels = 3;
245  		break;
246  	case VM_MODE_P40V48_4K:
247  	case VM_MODE_P36V48_4K:
248  		vm->pgtable_levels = 4;
249  		break;
250  	case VM_MODE_P40V48_64K:
251  	case VM_MODE_P36V48_64K:
252  		vm->pgtable_levels = 3;
253  		break;
254  	case VM_MODE_P48V48_16K:
255  	case VM_MODE_P40V48_16K:
256  	case VM_MODE_P36V48_16K:
257  		vm->pgtable_levels = 4;
258  		break;
259  	case VM_MODE_P36V47_16K:
260  		vm->pgtable_levels = 3;
261  		break;
262  	case VM_MODE_PXXV48_4K:
263  #ifdef __x86_64__
264  		kvm_get_cpu_address_width(&vm->pa_bits, &vm->va_bits);
265  		/*
266  		 * Ignore KVM support for 5-level paging (vm->va_bits == 57),
267  		 * it doesn't take effect unless CR4.LA57 is set, which it
268  		 * isn't for this VM_MODE.
269  		 */
270  		TEST_ASSERT(vm->va_bits == 48 || vm->va_bits == 57,
271  			    "Linear address width (%d bits) not supported",
272  			    vm->va_bits);
273  		pr_debug("Guest physical address width detected: %d\n",
274  			 vm->pa_bits);
275  		vm->pgtable_levels = 4;
276  		vm->va_bits = 48;
277  #else
278  		TEST_FAIL("VM_MODE_PXXV48_4K not supported on non-x86 platforms");
279  #endif
280  		break;
281  	case VM_MODE_P47V64_4K:
282  		vm->pgtable_levels = 5;
283  		break;
284  	case VM_MODE_P44V64_4K:
285  		vm->pgtable_levels = 5;
286  		break;
287  	default:
288  		TEST_FAIL("Unknown guest mode, mode: 0x%x", mode);
289  	}
290  
291  #ifdef __aarch64__
292  	if (vm->pa_bits != 40)
293  		vm->type = KVM_VM_TYPE_ARM_IPA_SIZE(vm->pa_bits);
294  #endif
295  
296  	vm_open(vm);
297  
298  	/* Limit to VA-bit canonical virtual addresses. */
299  	vm->vpages_valid = sparsebit_alloc();
300  	vm_vaddr_populate_bitmap(vm);
301  
302  	/* Limit physical addresses to PA-bits. */
303  	vm->max_gfn = vm_compute_max_gfn(vm);
304  
305  	/* Allocate and setup memory for guest. */
306  	vm->vpages_mapped = sparsebit_alloc();
307  
308  	return vm;
309  }
310  
311  static uint64_t vm_nr_pages_required(enum vm_guest_mode mode,
312  				     uint32_t nr_runnable_vcpus,
313  				     uint64_t extra_mem_pages)
314  {
315  	uint64_t page_size = vm_guest_mode_params[mode].page_size;
316  	uint64_t nr_pages;
317  
318  	TEST_ASSERT(nr_runnable_vcpus,
319  		    "Use vm_create_barebones() for VMs that _never_ have vCPUs\n");
320  
321  	TEST_ASSERT(nr_runnable_vcpus <= kvm_check_cap(KVM_CAP_MAX_VCPUS),
322  		    "nr_vcpus = %d too large for host, max-vcpus = %d",
323  		    nr_runnable_vcpus, kvm_check_cap(KVM_CAP_MAX_VCPUS));
324  
325  	/*
326  	 * Arbitrarily allocate 512 pages (2MB when the page size is 4KB) for the
327  	 * test code and other per-VM assets that will be loaded into memslot0.
328  	 */
329  	nr_pages = 512;
330  
331  	/* Account for the per-vCPU stacks on behalf of the test. */
332  	nr_pages += nr_runnable_vcpus * DEFAULT_STACK_PGS;
333  
334  	/*
335  	 * Account for the number of pages needed for the page tables.  The
336  	 * maximum page table size for a memory region will be when the
337  	 * smallest page size is used. Considering each page contains x page
338  	 * table descriptors, the total extra size for page tables (for extra
339  	 * N pages) will be: N/x+N/x^2+N/x^3+... which is definitely smaller
340  	 * than N/x*2.
341  	 */
342  	nr_pages += (nr_pages + extra_mem_pages) / PTES_PER_MIN_PAGE * 2;
343  
344  	/* Account for the number of pages needed by ucall. */
345  	nr_pages += ucall_nr_pages_required(page_size);
346  
347  	return vm_adjust_num_guest_pages(mode, nr_pages);
348  }
349  
350  struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint32_t nr_runnable_vcpus,
351  			   uint64_t nr_extra_pages)
352  {
353  	uint64_t nr_pages = vm_nr_pages_required(mode, nr_runnable_vcpus,
354  						 nr_extra_pages);
355  	struct userspace_mem_region *slot0;
356  	struct kvm_vm *vm;
357  	int i;
358  
359  	pr_debug("%s: mode='%s' pages='%ld'\n", __func__,
360  		 vm_guest_mode_string(mode), nr_pages);
361  
362  	vm = ____vm_create(mode);
363  
364  	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, nr_pages, 0);
365  	for (i = 0; i < NR_MEM_REGIONS; i++)
366  		vm->memslots[i] = 0;
367  
368  	kvm_vm_elf_load(vm, program_invocation_name);
369  
370  	/*
371  	 * TODO: Add proper defines to protect the library's memslots, and then
372  	 * carve out memslot1 for the ucall MMIO address.  KVM treats writes to
373  	 * read-only memslots as MMIO, and creating a read-only memslot for the
374  	 * MMIO region would prevent silently clobbering the MMIO region.
375  	 */
376  	slot0 = memslot2region(vm, 0);
377  	ucall_init(vm, slot0->region.guest_phys_addr + slot0->region.memory_size);
378  
379  	kvm_arch_vm_post_create(vm);
380  
381  	return vm;
382  }
383  
384  /*
385   * VM Create with customized parameters
386   *
387   * Input Args:
388   *   mode - VM Mode (e.g. VM_MODE_P52V48_4K)
389   *   nr_vcpus - VCPU count
390   *   extra_mem_pages - Non-slot0 physical memory total size
391   *   guest_code - Guest entry point
392   *   vcpus - Array that is filled with the created vCPUs
393   *
394   * Output Args: None
395   *
396   * Return:
397   *   Pointer to opaque structure that describes the created VM.
398   *
399   * Creates a VM with the mode specified by mode (e.g. VM_MODE_P52V48_4K).
400   * extra_mem_pages is only used to calculate the maximum page table size;
401   * this function does not allocate any real memory for non-slot0 regions.
402   */
403  struct kvm_vm *__vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus,
404  				      uint64_t extra_mem_pages,
405  				      void *guest_code, struct kvm_vcpu *vcpus[])
406  {
407  	struct kvm_vm *vm;
408  	int i;
409  
410  	TEST_ASSERT(!nr_vcpus || vcpus, "Must provide vCPU array");
411  
412  	vm = __vm_create(mode, nr_vcpus, extra_mem_pages);
413  
414  	for (i = 0; i < nr_vcpus; ++i)
415  		vcpus[i] = vm_vcpu_add(vm, i, guest_code);
416  
417  	return vm;
418  }
419  
420  struct kvm_vm *__vm_create_with_one_vcpu(struct kvm_vcpu **vcpu,
421  					 uint64_t extra_mem_pages,
422  					 void *guest_code)
423  {
424  	struct kvm_vcpu *vcpus[1];
425  	struct kvm_vm *vm;
426  
427  	vm = __vm_create_with_vcpus(VM_MODE_DEFAULT, 1, extra_mem_pages,
428  				    guest_code, vcpus);
429  
430  	*vcpu = vcpus[0];
431  	return vm;
432  }
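
/*
 * Illustrative usage (not part of the library): the typical create/run/free
 * flow for a single-vCPU test.  A minimal sketch; example_guest_code and
 * example_single_vcpu_test are hypothetical names a test would provide, and
 * the ucall helpers come from ucall_common.h via kvm_util.h.
 */
static void __attribute__((unused)) example_guest_code(void)
{
	GUEST_DONE();
}

static void __attribute__((unused)) example_single_vcpu_test(void)
{
	struct kvm_vcpu *vcpu;
	struct kvm_vm *vm;

	vm = __vm_create_with_one_vcpu(&vcpu, 0, example_guest_code);

	vcpu_run(vcpu);
	TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE);

	kvm_vm_free(vm);
}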
433  
434  /*
435   * VM Restart
436   *
437   * Input Args:
438   *   vm - VM that has been released before
439   *
440   * Output Args: None
441   *
442   * Reopens the file descriptors associated with the VM and reinstates the
443   * global state, such as the irqchip and the memory regions that are mapped
444   * into the guest.
445   */
446  void kvm_vm_restart(struct kvm_vm *vmp)
447  {
448  	int ctr;
449  	struct userspace_mem_region *region;
450  
451  	vm_open(vmp);
452  	if (vmp->has_irqchip)
453  		vm_create_irqchip(vmp);
454  
455  	hash_for_each(vmp->regions.slot_hash, ctr, region, slot_node) {
456  		int ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION, &region->region);
457  		TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n"
458  			    "  rc: %i errno: %i\n"
459  			    "  slot: %u flags: 0x%x\n"
460  			    "  guest_phys_addr: 0x%llx size: 0x%llx",
461  			    ret, errno, region->region.slot,
462  			    region->region.flags,
463  			    region->region.guest_phys_addr,
464  			    region->region.memory_size);
465  	}
466  }
467  
468  __weak struct kvm_vcpu *vm_arch_vcpu_recreate(struct kvm_vm *vm,
469  					      uint32_t vcpu_id)
470  {
471  	return __vm_vcpu_add(vm, vcpu_id);
472  }
473  
474  struct kvm_vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm)
475  {
476  	kvm_vm_restart(vm);
477  
478  	return vm_vcpu_recreate(vm, 0);
479  }
480  
481  void kvm_pin_this_task_to_pcpu(uint32_t pcpu)
482  {
483  	cpu_set_t mask;
484  	int r;
485  
486  	CPU_ZERO(&mask);
487  	CPU_SET(pcpu, &mask);
488  	r = sched_setaffinity(0, sizeof(mask), &mask);
489  	TEST_ASSERT(!r, "sched_setaffinity() failed for pCPU '%u'.\n", pcpu);
490  }
491  
492  static uint32_t parse_pcpu(const char *cpu_str, const cpu_set_t *allowed_mask)
493  {
494  	uint32_t pcpu = atoi_non_negative("CPU number", cpu_str);
495  
496  	TEST_ASSERT(CPU_ISSET(pcpu, allowed_mask),
497  		    "Not allowed to run on pCPU '%d', check cgroups?\n", pcpu);
498  	return pcpu;
499  }
500  
501  void kvm_print_vcpu_pinning_help(void)
502  {
503  	const char *name = program_invocation_name;
504  
505  	printf(" -c: Pin tasks to physical CPUs.  Takes a list of comma separated\n"
506  	       "     values (target pCPU), one for each vCPU, plus an optional\n"
507  	       "     entry for the main application task (specified via entry\n"
508  	       "     <nr_vcpus + 1>).  If used, entries must be provided for all\n"
509  	       "     vCPUs, i.e. pinning vCPUs is all or nothing.\n\n"
510  	       "     E.g. to create 3 vCPUs, pin vCPU0=>pCPU22, vCPU1=>pCPU23,\n"
511  	       "     vCPU2=>pCPU24, and pin the application task to pCPU50:\n\n"
512  	       "         %s -v 3 -c 22,23,24,50\n\n"
513  	       "     To leave the application task unpinned, drop the final entry:\n\n"
514  	       "         %s -v 3 -c 22,23,24\n\n"
515  	       "     (default: no pinning)\n", name, name);
516  }
517  
518  void kvm_parse_vcpu_pinning(const char *pcpus_string, uint32_t vcpu_to_pcpu[],
519  			    int nr_vcpus)
520  {
521  	cpu_set_t allowed_mask;
522  	char *cpu, *cpu_list;
523  	char delim[2] = ",";
524  	int i, r;
525  
526  	cpu_list = strdup(pcpus_string);
527  	TEST_ASSERT(cpu_list, "strdup() allocation failed.\n");
528  
529  	r = sched_getaffinity(0, sizeof(allowed_mask), &allowed_mask);
530  	TEST_ASSERT(!r, "sched_getaffinity() failed");
531  
532  	cpu = strtok(cpu_list, delim);
533  
534  	/* 1. Get all pcpus for vcpus. */
535  	for (i = 0; i < nr_vcpus; i++) {
536  		TEST_ASSERT(cpu, "pCPU not provided for vCPU '%d'\n", i);
537  		vcpu_to_pcpu[i] = parse_pcpu(cpu, &allowed_mask);
538  		cpu = strtok(NULL, delim);
539  	}
540  
541  	/* 2. Check if the main worker needs to be pinned. */
542  	if (cpu) {
543  		kvm_pin_this_task_to_pcpu(parse_pcpu(cpu, &allowed_mask));
544  		cpu = strtok(NULL, delim);
545  	}
546  
547  	TEST_ASSERT(!cpu, "pCPU list contains trailing garbage characters '%s'", cpu);
548  	free(cpu_list);
549  }
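
/*
 * Illustrative usage (not part of the library): wiring the pinning helpers
 * into a test's option parsing.  A minimal sketch; example_pin_vcpus is a
 * hypothetical helper, and pcpu_list stands in for the string a test would
 * receive via a -c style command line option.
 */
static void __attribute__((unused)) example_pin_vcpus(const char *pcpu_list,
						      int nr_vcpus)
{
	uint32_t vcpu_to_pcpu[8];

	TEST_ASSERT(nr_vcpus <= 8, "Example only sized for 8 vCPUs");

	/* Parses one pCPU per vCPU, plus an optional entry for this task. */
	kvm_parse_vcpu_pinning(pcpu_list, vcpu_to_pcpu, nr_vcpus);

	/* Each vCPU worker thread would then pin itself, e.g. for vCPU 0: */
	kvm_pin_this_task_to_pcpu(vcpu_to_pcpu[0]);
}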
550  
551  /*
552   * Userspace Memory Region Find
553   *
554   * Input Args:
555   *   vm - Virtual Machine
556   *   start - Starting VM physical address
557   *   end - Ending VM physical address, inclusive.
558   *
559   * Output Args: None
560   *
561   * Return:
562   *   Pointer to overlapping region, NULL if no such region.
563   *
564   * Searches for a region with any physical memory that overlaps with
565   * any portion of the guest physical addresses from start to end
566   * inclusive.  If multiple overlapping regions exist, a pointer to any
567   * of the regions is returned.  Null is returned only when no overlapping
568   * region exists.
569   */
570  static struct userspace_mem_region *
571  userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end)
572  {
573  	struct rb_node *node;
574  
575  	for (node = vm->regions.gpa_tree.rb_node; node; ) {
576  		struct userspace_mem_region *region =
577  			container_of(node, struct userspace_mem_region, gpa_node);
578  		uint64_t existing_start = region->region.guest_phys_addr;
579  		uint64_t existing_end = region->region.guest_phys_addr
580  			+ region->region.memory_size - 1;
581  		if (start <= existing_end && end >= existing_start)
582  			return region;
583  
584  		if (start < existing_start)
585  			node = node->rb_left;
586  		else
587  			node = node->rb_right;
588  	}
589  
590  	return NULL;
591  }
592  
593  /*
594   * KVM Userspace Memory Region Find
595   *
596   * Input Args:
597   *   vm - Virtual Machine
598   *   start - Starting VM physical address
599   *   end - Ending VM physical address, inclusive.
600   *
601   * Output Args: None
602   *
603   * Return:
604   *   Pointer to overlapping region, NULL if no such region.
605   *
606   * Public interface to userspace_mem_region_find. Allows tests to look up
607   * the memslot data structure for a given range of guest physical memory.
608   */
609  struct kvm_userspace_memory_region *
610  kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start,
611  				 uint64_t end)
612  {
613  	struct userspace_mem_region *region;
614  
615  	region = userspace_mem_region_find(vm, start, end);
616  	if (!region)
617  		return NULL;
618  
619  	return &region->region;
620  }
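
/*
 * Illustrative usage (not part of the library): look up the memslot backing
 * a guest physical address, e.g. to sanity check a test's memory layout.
 * A minimal sketch; example_find_region is a hypothetical helper.
 */
static void __attribute__((unused)) example_find_region(struct kvm_vm *vm,
							uint64_t gpa)
{
	struct kvm_userspace_memory_region *region;

	region = kvm_userspace_memory_region_find(vm, gpa, gpa);
	TEST_ASSERT(region, "No memslot contains GPA 0x%lx", gpa);

	pr_info("GPA 0x%lx is in slot %u ([0x%llx, 0x%llx))\n", gpa,
		region->slot, region->guest_phys_addr,
		region->guest_phys_addr + region->memory_size);
}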
621  
622  __weak void vcpu_arch_free(struct kvm_vcpu *vcpu)
623  {
624  
625  }
626  
627  /*
628   * VM VCPU Remove
629   *
630   * Input Args:
631   *   vcpu - VCPU to remove
632   *
633   * Output Args: None
634   *
635   * Return: None, TEST_ASSERT failures for all error conditions
636   *
637   * Removes a vCPU from a VM and frees its resources.
638   */
639  static void vm_vcpu_rm(struct kvm_vm *vm, struct kvm_vcpu *vcpu)
640  {
641  	int ret;
642  
643  	if (vcpu->dirty_gfns) {
644  		ret = munmap(vcpu->dirty_gfns, vm->dirty_ring_size);
645  		TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret));
646  		vcpu->dirty_gfns = NULL;
647  	}
648  
649  	ret = munmap(vcpu->run, vcpu_mmap_sz());
650  	TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret));
651  
652  	ret = close(vcpu->fd);
653  	TEST_ASSERT(!ret,  __KVM_SYSCALL_ERROR("close()", ret));
654  
655  	list_del(&vcpu->list);
656  
657  	vcpu_arch_free(vcpu);
658  	free(vcpu);
659  }
660  
661  void kvm_vm_release(struct kvm_vm *vmp)
662  {
663  	struct kvm_vcpu *vcpu, *tmp;
664  	int ret;
665  
666  	list_for_each_entry_safe(vcpu, tmp, &vmp->vcpus, list)
667  		vm_vcpu_rm(vmp, vcpu);
668  
669  	ret = close(vmp->fd);
670  	TEST_ASSERT(!ret,  __KVM_SYSCALL_ERROR("close()", ret));
671  
672  	ret = close(vmp->kvm_fd);
673  	TEST_ASSERT(!ret,  __KVM_SYSCALL_ERROR("close()", ret));
674  }
675  
676  static void __vm_mem_region_delete(struct kvm_vm *vm,
677  				   struct userspace_mem_region *region,
678  				   bool unlink)
679  {
680  	int ret;
681  
682  	if (unlink) {
683  		rb_erase(&region->gpa_node, &vm->regions.gpa_tree);
684  		rb_erase(&region->hva_node, &vm->regions.hva_tree);
685  		hash_del(&region->slot_node);
686  	}
687  
688  	region->region.memory_size = 0;
689  	vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region->region);
690  
691  	sparsebit_free(&region->unused_phy_pages);
692  	ret = munmap(region->mmap_start, region->mmap_size);
693  	TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret));
694  	if (region->fd >= 0) {
695  		/* There's an extra map when using shared memory. */
696  		ret = munmap(region->mmap_alias, region->mmap_size);
697  		TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret));
698  		close(region->fd);
699  	}
700  
701  	free(region);
702  }
703  
704  /*
705   * Destroys and frees the VM pointed to by vmp.
706   */
707  void kvm_vm_free(struct kvm_vm *vmp)
708  {
709  	int ctr;
710  	struct hlist_node *node;
711  	struct userspace_mem_region *region;
712  
713  	if (vmp == NULL)
714  		return;
715  
716  	/* Free cached stats metadata and close FD */
717  	if (vmp->stats_fd) {
718  		free(vmp->stats_desc);
719  		close(vmp->stats_fd);
720  	}
721  
722  	/* Free userspace_mem_regions. */
723  	hash_for_each_safe(vmp->regions.slot_hash, ctr, node, region, slot_node)
724  		__vm_mem_region_delete(vmp, region, false);
725  
726  	/* Free sparsebit arrays. */
727  	sparsebit_free(&vmp->vpages_valid);
728  	sparsebit_free(&vmp->vpages_mapped);
729  
730  	kvm_vm_release(vmp);
731  
732  	/* Free the structure describing the VM. */
733  	free(vmp);
734  }
735  
736  int kvm_memfd_alloc(size_t size, bool hugepages)
737  {
738  	int memfd_flags = MFD_CLOEXEC;
739  	int fd, r;
740  
741  	if (hugepages)
742  		memfd_flags |= MFD_HUGETLB;
743  
744  	fd = memfd_create("kvm_selftest", memfd_flags);
745  	TEST_ASSERT(fd != -1, __KVM_SYSCALL_ERROR("memfd_create()", fd));
746  
747  	r = ftruncate(fd, size);
748  	TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("ftruncate()", r));
749  
750  	r = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, size);
751  	TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r));
752  
753  	return fd;
754  }
755  
756  /*
757   * Memory Compare, host virtual to guest virtual
758   *
759   * Input Args:
760   *   hva - Starting host virtual address
761   *   vm - Virtual Machine
762   *   gva - Starting guest virtual address
763   *   len - number of bytes to compare
764   *
765   * Output Args: None
766   *
767   * Input/Output Args: None
768   *
769   * Return:
770   *   Returns 0 if the bytes starting at hva for a length of len
771   *   are equal to the guest virtual bytes starting at gva.  Returns
772   *   a value < 0 if the bytes at hva are less than those at gva.
773   *   Otherwise a value > 0 is returned.
774   *
775   * Compares the bytes starting at the host virtual address hva, for
776   * a length of len, to the guest bytes starting at the guest virtual
777   * address given by gva.
778   */
779  int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, vm_vaddr_t gva, size_t len)
780  {
781  	size_t amt;
782  
783  	/*
784  	 * Compare a batch of bytes until either a match is found
785  	 * or all the bytes have been compared.
786  	 */
787  	for (uintptr_t offset = 0; offset < len; offset += amt) {
788  		uintptr_t ptr1 = (uintptr_t)hva + offset;
789  
790  		/*
791  		 * Determine host address for guest virtual address
792  		 * at offset.
793  		 */
794  		uintptr_t ptr2 = (uintptr_t)addr_gva2hva(vm, gva + offset);
795  
796  		/*
797  		 * Determine amount to compare on this pass.
798  		 * Don't allow the comparison to cross a page boundary.
799  		 */
800  		amt = len - offset;
801  		if ((ptr1 >> vm->page_shift) != ((ptr1 + amt) >> vm->page_shift))
802  			amt = vm->page_size - (ptr1 % vm->page_size);
803  		if ((ptr2 >> vm->page_shift) != ((ptr2 + amt) >> vm->page_shift))
804  			amt = vm->page_size - (ptr2 % vm->page_size);
805  
806  		assert((ptr1 >> vm->page_shift) == ((ptr1 + amt - 1) >> vm->page_shift));
807  		assert((ptr2 >> vm->page_shift) == ((ptr2 + amt - 1) >> vm->page_shift));
808  
809  		/*
810  		 * Perform the comparison.  If there is a difference
811  		 * return that result to the caller, otherwise need
812  		 * to continue on looking for a mismatch.
813  		 */
814  		int ret = memcmp((void *)ptr1, (void *)ptr2, amt);
815  		if (ret != 0)
816  			return ret;
817  	}
818  
819  	/*
820  	 * No mismatch found.  Let the caller know the two memory
821  	 * areas are equal.
822  	 */
823  	return 0;
824  }
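
/*
 * Illustrative usage (not part of the library): verify that a host buffer
 * and a guest virtual mapping hold the same bytes, e.g. after the guest has
 * filled a shared buffer.  A minimal sketch; example_check_guest_buffer is a
 * hypothetical helper and gva is assumed to already be mapped in the guest.
 */
static void __attribute__((unused)) example_check_guest_buffer(struct kvm_vm *vm,
								vm_vaddr_t gva,
								void *expected,
								size_t len)
{
	TEST_ASSERT(!kvm_memcmp_hva_gva(expected, vm, gva, len),
		    "Guest buffer at gva 0x%lx does not match expected data",
		    gva);
}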
825  
826  static void vm_userspace_mem_region_gpa_insert(struct rb_root *gpa_tree,
827  					       struct userspace_mem_region *region)
828  {
829  	struct rb_node **cur, *parent;
830  
831  	for (cur = &gpa_tree->rb_node, parent = NULL; *cur; ) {
832  		struct userspace_mem_region *cregion;
833  
834  		cregion = container_of(*cur, typeof(*cregion), gpa_node);
835  		parent = *cur;
836  		if (region->region.guest_phys_addr <
837  		    cregion->region.guest_phys_addr)
838  			cur = &(*cur)->rb_left;
839  		else {
840  			TEST_ASSERT(region->region.guest_phys_addr !=
841  				    cregion->region.guest_phys_addr,
842  				    "Duplicate GPA in region tree");
843  
844  			cur = &(*cur)->rb_right;
845  		}
846  	}
847  
848  	rb_link_node(&region->gpa_node, parent, cur);
849  	rb_insert_color(&region->gpa_node, gpa_tree);
850  }
851  
852  static void vm_userspace_mem_region_hva_insert(struct rb_root *hva_tree,
853  					       struct userspace_mem_region *region)
854  {
855  	struct rb_node **cur, *parent;
856  
857  	for (cur = &hva_tree->rb_node, parent = NULL; *cur; ) {
858  		struct userspace_mem_region *cregion;
859  
860  		cregion = container_of(*cur, typeof(*cregion), hva_node);
861  		parent = *cur;
862  		if (region->host_mem < cregion->host_mem)
863  			cur = &(*cur)->rb_left;
864  		else {
865  			TEST_ASSERT(region->host_mem !=
866  				    cregion->host_mem,
867  				    "Duplicate HVA in region tree");
868  
869  			cur = &(*cur)->rb_right;
870  		}
871  	}
872  
873  	rb_link_node(&region->hva_node, parent, cur);
874  	rb_insert_color(&region->hva_node, hva_tree);
875  }
876  
877  
878  int __vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags,
879  				uint64_t gpa, uint64_t size, void *hva)
880  {
881  	struct kvm_userspace_memory_region region = {
882  		.slot = slot,
883  		.flags = flags,
884  		.guest_phys_addr = gpa,
885  		.memory_size = size,
886  		.userspace_addr = (uintptr_t)hva,
887  	};
888  
889  	return ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, &region);
890  }
891  
892  void vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags,
893  			       uint64_t gpa, uint64_t size, void *hva)
894  {
895  	int ret = __vm_set_user_memory_region(vm, slot, flags, gpa, size, hva);
896  
897  	TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION failed, errno = %d (%s)",
898  		    errno, strerror(errno));
899  }
900  
901  /*
902   * VM Userspace Memory Region Add
903   *
904   * Input Args:
905   *   vm - Virtual Machine
906   *   src_type - Storage source for this region.
907   *              NULL to use anonymous memory.
908   *   guest_paddr - Starting guest physical address
909   *   slot - KVM region slot
910   *   npages - Number of physical pages
911   *   flags - KVM memory region flags (e.g. KVM_MEM_LOG_DIRTY_PAGES)
912   *
913   * Output Args: None
914   *
915   * Return: None
916   *
917   * Allocates a memory area of the number of pages specified by npages
918   * and maps it to the VM specified by vm, at a starting physical address
919   * given by guest_paddr.  The region is created with a KVM region slot
920   * given by slot, which must be unique and < KVM_MEM_SLOTS_NUM.  The
921   * region is created with the flags given by flags.
922   */
923  void vm_userspace_mem_region_add(struct kvm_vm *vm,
924  	enum vm_mem_backing_src_type src_type,
925  	uint64_t guest_paddr, uint32_t slot, uint64_t npages,
926  	uint32_t flags)
927  {
928  	int ret;
929  	struct userspace_mem_region *region;
930  	size_t backing_src_pagesz = get_backing_src_pagesz(src_type);
931  	size_t alignment;
932  
933  	TEST_ASSERT(vm_adjust_num_guest_pages(vm->mode, npages) == npages,
934  		"Number of guest pages is not compatible with the host. "
935  		"Try npages=%d", vm_adjust_num_guest_pages(vm->mode, npages));
936  
937  	TEST_ASSERT((guest_paddr % vm->page_size) == 0, "Guest physical "
938  		"address not on a page boundary.\n"
939  		"  guest_paddr: 0x%lx vm->page_size: 0x%x",
940  		guest_paddr, vm->page_size);
941  	TEST_ASSERT((((guest_paddr >> vm->page_shift) + npages) - 1)
942  		<= vm->max_gfn, "Physical range beyond maximum "
943  		"supported physical address,\n"
944  		"  guest_paddr: 0x%lx npages: 0x%lx\n"
945  		"  vm->max_gfn: 0x%lx vm->page_size: 0x%x",
946  		guest_paddr, npages, vm->max_gfn, vm->page_size);
947  
948  	/*
949  	 * Confirm a mem region with an overlapping address doesn't
950  	 * already exist.
951  	 */
952  	region = (struct userspace_mem_region *) userspace_mem_region_find(
953  		vm, guest_paddr, (guest_paddr + npages * vm->page_size) - 1);
954  	if (region != NULL)
955  		TEST_FAIL("overlapping userspace_mem_region already "
956  			"exists\n"
957  			"  requested guest_paddr: 0x%lx npages: 0x%lx "
958  			"page_size: 0x%x\n"
959  			"  existing guest_paddr: 0x%lx size: 0x%lx",
960  			guest_paddr, npages, vm->page_size,
961  			(uint64_t) region->region.guest_phys_addr,
962  			(uint64_t) region->region.memory_size);
963  
964  	/* Confirm no region with the requested slot already exists. */
965  	hash_for_each_possible(vm->regions.slot_hash, region, slot_node,
966  			       slot) {
967  		if (region->region.slot != slot)
968  			continue;
969  
970  		TEST_FAIL("A mem region with the requested slot "
971  			"already exists.\n"
972  			"  requested slot: %u paddr: 0x%lx npages: 0x%lx\n"
973  			"  existing slot: %u paddr: 0x%lx size: 0x%lx",
974  			slot, guest_paddr, npages,
975  			region->region.slot,
976  			(uint64_t) region->region.guest_phys_addr,
977  			(uint64_t) region->region.memory_size);
978  	}
979  
980  	/* Allocate and initialize new mem region structure. */
981  	region = calloc(1, sizeof(*region));
982  	TEST_ASSERT(region != NULL, "Insufficient Memory");
983  	region->mmap_size = npages * vm->page_size;
984  
985  #ifdef __s390x__
986  	/* On s390x, the host address must be aligned to 1M (due to PGSTEs) */
987  	alignment = 0x100000;
988  #else
989  	alignment = 1;
990  #endif
991  
992  	/*
993  	 * When using THP, mmap is not guaranteed to return a hugepage-aligned
994  	 * address so we have to pad the mmap. Padding is not needed for HugeTLB
995  	 * because mmap will always return an address aligned to the HugeTLB
996  	 * page size.
997  	 */
998  	if (src_type == VM_MEM_SRC_ANONYMOUS_THP)
999  		alignment = max(backing_src_pagesz, alignment);
1000  
1001  	TEST_ASSERT_EQ(guest_paddr, align_up(guest_paddr, backing_src_pagesz));
1002  
1003  	/* Add enough memory to align up if necessary */
1004  	if (alignment > 1)
1005  		region->mmap_size += alignment;
1006  
1007  	region->fd = -1;
1008  	if (backing_src_is_shared(src_type))
1009  		region->fd = kvm_memfd_alloc(region->mmap_size,
1010  					     src_type == VM_MEM_SRC_SHARED_HUGETLB);
1011  
1012  	region->mmap_start = mmap(NULL, region->mmap_size,
1013  				  PROT_READ | PROT_WRITE,
1014  				  vm_mem_backing_src_alias(src_type)->flag,
1015  				  region->fd, 0);
1016  	TEST_ASSERT(region->mmap_start != MAP_FAILED,
1017  		    __KVM_SYSCALL_ERROR("mmap()", (int)(unsigned long)MAP_FAILED));
1018  
1019  	TEST_ASSERT(!is_backing_src_hugetlb(src_type) ||
1020  		    region->mmap_start == align_ptr_up(region->mmap_start, backing_src_pagesz),
1021  		    "mmap_start %p is not aligned to HugeTLB page size 0x%lx",
1022  		    region->mmap_start, backing_src_pagesz);
1023  
1024  	/* Align host address */
1025  	region->host_mem = align_ptr_up(region->mmap_start, alignment);
1026  
1027  	/* As needed perform madvise */
1028  	if ((src_type == VM_MEM_SRC_ANONYMOUS ||
1029  	     src_type == VM_MEM_SRC_ANONYMOUS_THP) && thp_configured()) {
1030  		ret = madvise(region->host_mem, npages * vm->page_size,
1031  			      src_type == VM_MEM_SRC_ANONYMOUS ? MADV_NOHUGEPAGE : MADV_HUGEPAGE);
1032  		TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 0x%lx src_type: %s",
1033  			    region->host_mem, npages * vm->page_size,
1034  			    vm_mem_backing_src_alias(src_type)->name);
1035  	}
1036  
1037  	region->backing_src_type = src_type;
1038  	region->unused_phy_pages = sparsebit_alloc();
1039  	sparsebit_set_num(region->unused_phy_pages,
1040  		guest_paddr >> vm->page_shift, npages);
1041  	region->region.slot = slot;
1042  	region->region.flags = flags;
1043  	region->region.guest_phys_addr = guest_paddr;
1044  	region->region.memory_size = npages * vm->page_size;
1045  	region->region.userspace_addr = (uintptr_t) region->host_mem;
1046  	ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region->region);
1047  	TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n"
1048  		"  rc: %i errno: %i\n"
1049  		"  slot: %u flags: 0x%x\n"
1050  		"  guest_phys_addr: 0x%lx size: 0x%lx",
1051  		ret, errno, slot, flags,
1052  		guest_paddr, (uint64_t) region->region.memory_size);
1053  
1054  	/* Add to quick lookup data structures */
1055  	vm_userspace_mem_region_gpa_insert(&vm->regions.gpa_tree, region);
1056  	vm_userspace_mem_region_hva_insert(&vm->regions.hva_tree, region);
1057  	hash_add(vm->regions.slot_hash, &region->slot_node, slot);
1058  
1059  	/* If shared memory, create an alias. */
1060  	if (region->fd >= 0) {
1061  		region->mmap_alias = mmap(NULL, region->mmap_size,
1062  					  PROT_READ | PROT_WRITE,
1063  					  vm_mem_backing_src_alias(src_type)->flag,
1064  					  region->fd, 0);
1065  		TEST_ASSERT(region->mmap_alias != MAP_FAILED,
1066  			    __KVM_SYSCALL_ERROR("mmap()",  (int)(unsigned long)MAP_FAILED));
1067  
1068  		/* Align host alias address */
1069  		region->host_alias = align_ptr_up(region->mmap_alias, alignment);
1070  	}
1071  }
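
/*
 * Illustrative usage (not part of the library): add a dirty-logged memslot
 * and map it into the guest at the same address.  A minimal sketch;
 * example_add_test_memslot is a hypothetical helper and the slot number,
 * GPA and size are arbitrary values chosen for the example.
 */
static void __attribute__((unused)) example_add_test_memslot(struct kvm_vm *vm)
{
	const uint32_t slot = 10;
	const uint64_t gpa = 0x10000000;
	const uint64_t npages = 512;

	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, gpa, slot,
				    npages, KVM_MEM_LOG_DIRTY_PAGES);

	/* Identity map the new range so guest code can touch it. */
	virt_map(vm, gpa, gpa, npages);
}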
1072  
1073  /*
1074   * Memslot to region
1075   *
1076   * Input Args:
1077   *   vm - Virtual Machine
1078   *   memslot - KVM memory slot ID
1079   *
1080   * Output Args: None
1081   *
1082   * Return:
1083   *   Pointer to the memory region structure that describes the memory
1084   *   region using the KVM memory slot ID given by memslot.  TEST_ASSERT
1085   *   failure on error (e.g. no memory region currently uses memslot as a
1086   *   KVM memory slot ID).
1087   */
1088  struct userspace_mem_region *
1089  memslot2region(struct kvm_vm *vm, uint32_t memslot)
1090  {
1091  	struct userspace_mem_region *region;
1092  
1093  	hash_for_each_possible(vm->regions.slot_hash, region, slot_node,
1094  			       memslot)
1095  		if (region->region.slot == memslot)
1096  			return region;
1097  
1098  	fprintf(stderr, "No mem region with the requested slot found,\n"
1099  		"  requested slot: %u\n", memslot);
1100  	fputs("---- vm dump ----\n", stderr);
1101  	vm_dump(stderr, vm, 2);
1102  	TEST_FAIL("Mem region not found");
1103  	return NULL;
1104  }
1105  
1106  /*
1107   * VM Memory Region Flags Set
1108   *
1109   * Input Args:
1110   *   vm - Virtual Machine
1111   *   slot - Slot of the memory region; flags - new flags for the region
1112   *
1113   * Output Args: None
1114   *
1115   * Return: None
1116   *
1117   * Sets the flags of the memory region specified by the value of slot,
1118   * to the values given by flags.
1119   */
1120  void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags)
1121  {
1122  	int ret;
1123  	struct userspace_mem_region *region;
1124  
1125  	region = memslot2region(vm, slot);
1126  
1127  	region->region.flags = flags;
1128  
1129  	ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region->region);
1130  
1131  	TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n"
1132  		"  rc: %i errno: %i slot: %u flags: 0x%x",
1133  		ret, errno, slot, flags);
1134  }
1135  
1136  /*
1137   * VM Memory Region Move
1138   *
1139   * Input Args:
1140   *   vm - Virtual Machine
1141   *   slot - Slot of the memory region to move
1142   *   new_gpa - Starting guest physical address
1143   *
1144   * Output Args: None
1145   *
1146   * Return: None
1147   *
1148   * Change the gpa of a memory region.
1149   */
1150  void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa)
1151  {
1152  	struct userspace_mem_region *region;
1153  	int ret;
1154  
1155  	region = memslot2region(vm, slot);
1156  
1157  	region->region.guest_phys_addr = new_gpa;
1158  
1159  	ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region->region);
1160  
1161  	TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION failed\n"
1162  		    "ret: %i errno: %i slot: %u new_gpa: 0x%lx",
1163  		    ret, errno, slot, new_gpa);
1164  }
1165  
1166  /*
1167   * VM Memory Region Delete
1168   *
1169   * Input Args:
1170   *   vm - Virtual Machine
1171   *   slot - Slot of the memory region to delete
1172   *
1173   * Output Args: None
1174   *
1175   * Return: None
1176   *
1177   * Delete a memory region.
1178   */
1179  void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot)
1180  {
1181  	__vm_mem_region_delete(vm, memslot2region(vm, slot), true);
1182  }
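
/*
 * Illustrative usage (not part of the library): the memslot update helpers
 * above are typically used to exercise KVM's handling of memslot changes.
 * A minimal sketch; example_churn_memslot is a hypothetical helper, slot 10
 * is assumed to have been created earlier with vm_userspace_mem_region_add()
 * and the destination GPA is arbitrary.
 */
static void __attribute__((unused)) example_churn_memslot(struct kvm_vm *vm)
{
	const uint32_t slot = 10;

	/* Toggle dirty logging off and back on. */
	vm_mem_region_set_flags(vm, slot, 0);
	vm_mem_region_set_flags(vm, slot, KVM_MEM_LOG_DIRTY_PAGES);

	/* Move the region to a new, non-overlapping GPA. */
	vm_mem_region_move(vm, slot, 0x20000000);

	/* And finally remove it. */
	vm_mem_region_delete(vm, slot);
}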
1183  
1184  /* Returns the size of a vCPU's kvm_run structure. */
1185  static int vcpu_mmap_sz(void)
1186  {
1187  	int dev_fd, ret;
1188  
1189  	dev_fd = open_kvm_dev_path_or_exit();
1190  
1191  	ret = ioctl(dev_fd, KVM_GET_VCPU_MMAP_SIZE, NULL);
1192  	TEST_ASSERT(ret >= sizeof(struct kvm_run),
1193  		    KVM_IOCTL_ERROR(KVM_GET_VCPU_MMAP_SIZE, ret));
1194  
1195  	close(dev_fd);
1196  
1197  	return ret;
1198  }
1199  
1200  static bool vcpu_exists(struct kvm_vm *vm, uint32_t vcpu_id)
1201  {
1202  	struct kvm_vcpu *vcpu;
1203  
1204  	list_for_each_entry(vcpu, &vm->vcpus, list) {
1205  		if (vcpu->id == vcpu_id)
1206  			return true;
1207  	}
1208  
1209  	return false;
1210  }
1211  
1212  /*
1213   * Adds a virtual CPU to the VM specified by vm with the ID given by vcpu_id.
1214   * No additional vCPU setup is done.  Returns the vCPU.
1215   */
1216  struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id)
1217  {
1218  	struct kvm_vcpu *vcpu;
1219  
1220  	/* Confirm a vcpu with the specified id doesn't already exist. */
1221  	TEST_ASSERT(!vcpu_exists(vm, vcpu_id), "vCPU%d already exists\n", vcpu_id);
1222  
1223  	/* Allocate and initialize new vcpu structure. */
1224  	vcpu = calloc(1, sizeof(*vcpu));
1225  	TEST_ASSERT(vcpu != NULL, "Insufficient Memory");
1226  
1227  	vcpu->vm = vm;
1228  	vcpu->id = vcpu_id;
1229  	vcpu->fd = __vm_ioctl(vm, KVM_CREATE_VCPU, (void *)(unsigned long)vcpu_id);
1230  	TEST_ASSERT(vcpu->fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VCPU, vcpu->fd));
1231  
1232  	TEST_ASSERT(vcpu_mmap_sz() >= sizeof(*vcpu->run), "vcpu mmap size "
1233  		"smaller than expected, vcpu_mmap_sz: %i expected_min: %zi",
1234  		vcpu_mmap_sz(), sizeof(*vcpu->run));
1235  	vcpu->run = (struct kvm_run *) mmap(NULL, vcpu_mmap_sz(),
1236  		PROT_READ | PROT_WRITE, MAP_SHARED, vcpu->fd, 0);
1237  	TEST_ASSERT(vcpu->run != MAP_FAILED,
1238  		    __KVM_SYSCALL_ERROR("mmap()", (int)(unsigned long)MAP_FAILED));
1239  
1240  	/* Add to linked-list of VCPUs. */
1241  	list_add(&vcpu->list, &vm->vcpus);
1242  
1243  	return vcpu;
1244  }
1245  
1246  /*
1247   * VM Virtual Address Unused Gap
1248   *
1249   * Input Args:
1250   *   vm - Virtual Machine
1251   *   sz - Size (bytes)
1252   *   vaddr_min - Minimum Virtual Address
1253   *
1254   * Output Args: None
1255   *
1256   * Return:
1257   *   Lowest virtual address at or above vaddr_min, with at least
1258   *   sz unused bytes.  TEST_ASSERT failure if no area of at least
1259   *   size sz is available.
1260   *
1261   * Within the VM specified by vm, locates the lowest starting virtual
1262   * address >= vaddr_min, that has at least sz unallocated bytes.  A
1263   * TEST_ASSERT failure occurs for invalid input or no area of at least
1264   * sz unallocated bytes >= vaddr_min is available.
1265   */
1266  vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz,
1267  			       vm_vaddr_t vaddr_min)
1268  {
1269  	uint64_t pages = (sz + vm->page_size - 1) >> vm->page_shift;
1270  
1271  	/* Determine lowest permitted virtual page index. */
1272  	uint64_t pgidx_start = (vaddr_min + vm->page_size - 1) >> vm->page_shift;
1273  	if ((pgidx_start * vm->page_size) < vaddr_min)
1274  		goto no_va_found;
1275  
1276  	/* Loop over section with enough valid virtual page indexes. */
1277  	if (!sparsebit_is_set_num(vm->vpages_valid,
1278  		pgidx_start, pages))
1279  		pgidx_start = sparsebit_next_set_num(vm->vpages_valid,
1280  			pgidx_start, pages);
1281  	do {
1282  		/*
1283  		 * Are there enough unused virtual pages available at
1284  		 * the currently proposed starting virtual page index.
1285  		 * If not, adjust proposed starting index to next
1286  		 * possible.
1287  		 */
1288  		if (sparsebit_is_clear_num(vm->vpages_mapped,
1289  			pgidx_start, pages))
1290  			goto va_found;
1291  		pgidx_start = sparsebit_next_clear_num(vm->vpages_mapped,
1292  			pgidx_start, pages);
1293  		if (pgidx_start == 0)
1294  			goto no_va_found;
1295  
1296  		/*
1297  		 * If needed, adjust proposed starting virtual address,
1298  		 * to next range of valid virtual addresses.
1299  		 */
1300  		if (!sparsebit_is_set_num(vm->vpages_valid,
1301  			pgidx_start, pages)) {
1302  			pgidx_start = sparsebit_next_set_num(
1303  				vm->vpages_valid, pgidx_start, pages);
1304  			if (pgidx_start == 0)
1305  				goto no_va_found;
1306  		}
1307  	} while (pgidx_start != 0);
1308  
1309  no_va_found:
1310  	TEST_FAIL("No vaddr of specified pages available, pages: 0x%lx", pages);
1311  
1312  	/* NOT REACHED */
1313  	return -1;
1314  
1315  va_found:
1316  	TEST_ASSERT(sparsebit_is_set_num(vm->vpages_valid,
1317  		pgidx_start, pages),
1318  		"Unexpected, invalid virtual page index range,\n"
1319  		"  pgidx_start: 0x%lx\n"
1320  		"  pages: 0x%lx",
1321  		pgidx_start, pages);
1322  	TEST_ASSERT(sparsebit_is_clear_num(vm->vpages_mapped,
1323  		pgidx_start, pages),
1324  		"Unexpected, pages already mapped,\n"
1325  		"  pgidx_start: 0x%lx\n"
1326  		"  pages: 0x%lx",
1327  		pgidx_start, pages);
1328  
1329  	return pgidx_start * vm->page_size;
1330  }
1331  
1332  vm_vaddr_t __vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
1333  			    enum kvm_mem_region_type type)
1334  {
1335  	uint64_t pages = (sz >> vm->page_shift) + ((sz % vm->page_size) != 0);
1336  
1337  	virt_pgd_alloc(vm);
1338  	vm_paddr_t paddr = vm_phy_pages_alloc(vm, pages,
1339  					      KVM_UTIL_MIN_PFN * vm->page_size,
1340  					      vm->memslots[type]);
1341  
1342  	/*
1343  	 * Find an unused range of virtual page addresses of at least
1344  	 * pages in length.
1345  	 */
1346  	vm_vaddr_t vaddr_start = vm_vaddr_unused_gap(vm, sz, vaddr_min);
1347  
1348  	/* Map the virtual pages. */
1349  	for (vm_vaddr_t vaddr = vaddr_start; pages > 0;
1350  		pages--, vaddr += vm->page_size, paddr += vm->page_size) {
1351  
1352  		virt_pg_map(vm, vaddr, paddr);
1353  
1354  		sparsebit_set(vm->vpages_mapped, vaddr >> vm->page_shift);
1355  	}
1356  
1357  	return vaddr_start;
1358  }
1359  
1360  /*
1361   * VM Virtual Address Allocate
1362   *
1363   * Input Args:
1364   *   vm - Virtual Machine
1365   *   sz - Size in bytes
1366   *   vaddr_min - Minimum starting virtual address
1367   *
1368   * Output Args: None
1369   *
1370   * Return:
1371   *   Starting guest virtual address
1372   *
1373   * Allocates at least sz bytes within the virtual address space of the vm
1374   * given by vm.  The allocated bytes are mapped to a virtual address >=
1375   * the address given by vaddr_min.  Note that each allocation uses
1376   * a unique set of pages, with the minimum real allocation being at least
1377   * a page. The allocated physical space comes from the TEST_DATA memory region.
1378   */
1379  vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min)
1380  {
1381  	return __vm_vaddr_alloc(vm, sz, vaddr_min, MEM_REGION_TEST_DATA);
1382  }
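
/*
 * Illustrative usage (not part of the library): allocate guest virtual
 * memory and initialize it from the host before the guest runs.  A minimal
 * sketch; example_alloc_guest_buffer is a hypothetical helper, and
 * addr_gva2hva() translates the new gva to a host pointer.
 */
static vm_vaddr_t __attribute__((unused)) example_alloc_guest_buffer(struct kvm_vm *vm,
								     size_t size)
{
	vm_vaddr_t gva = vm_vaddr_alloc(vm, size, KVM_UTIL_MIN_VADDR);

	/* Zero the backing memory via its host mapping. */
	memset(addr_gva2hva(vm, gva), 0, size);

	return gva;
}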
1383  
1384  /*
1385   * VM Virtual Address Allocate Pages
1386   *
1387   * Input Args:
1388   *   vm - Virtual Machine
1389   *
1390   * Output Args: None
1391   *
1392   * Return:
1393   *   Starting guest virtual address
1394   *
1395   * Allocates at least N system pages worth of bytes within the virtual address
1396   * space of the vm.
1397   */
1398  vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm *vm, int nr_pages)
1399  {
1400  	return vm_vaddr_alloc(vm, nr_pages * getpagesize(), KVM_UTIL_MIN_VADDR);
1401  }
1402  
1403  vm_vaddr_t __vm_vaddr_alloc_page(struct kvm_vm *vm, enum kvm_mem_region_type type)
1404  {
1405  	return __vm_vaddr_alloc(vm, getpagesize(), KVM_UTIL_MIN_VADDR, type);
1406  }
1407  
1408  /*
1409   * VM Virtual Address Allocate Page
1410   *
1411   * Input Args:
1412   *   vm - Virtual Machine
1413   *
1414   * Output Args: None
1415   *
1416   * Return:
1417   *   Starting guest virtual address
1418   *
1419   * Allocates at least one system page worth of bytes within the virtual address
1420   * space of the vm.
1421   */
1422  vm_vaddr_t vm_vaddr_alloc_page(struct kvm_vm *vm)
1423  {
1424  	return vm_vaddr_alloc_pages(vm, 1);
1425  }
1426  
1427  /*
1428   * Map a range of VM virtual address to the VM's physical address
1429   *
1430   * Input Args:
1431   *   vm - Virtual Machine
1432   *   vaddr - Virtual address to map
1433   *   paddr - VM Physical Address
1434   *   npages - The number of pages to map
1435   *
1436   * Output Args: None
1437   *
1438   * Return: None
1439   *
1440   * Within the VM given by @vm, creates a virtual translation for
1441   * @npages starting at @vaddr to the page range starting at @paddr.
1442   */
1443  void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
1444  	      unsigned int npages)
1445  {
1446  	size_t page_size = vm->page_size;
1447  	size_t size = npages * page_size;
1448  
1449  	TEST_ASSERT(vaddr + size > vaddr, "Vaddr overflow");
1450  	TEST_ASSERT(paddr + size > paddr, "Paddr overflow");
1451  
1452  	while (npages--) {
1453  		virt_pg_map(vm, vaddr, paddr);
1454  		sparsebit_set(vm->vpages_mapped, vaddr >> vm->page_shift);
1455  
1456  		vaddr += page_size;
1457  		paddr += page_size;
1458  	}
1459  }
1460  
1461  /*
1462   * Address VM Physical to Host Virtual
1463   *
1464   * Input Args:
1465   *   vm - Virtual Machine
1466   *   gpa - VM physical address
1467   *
1468   * Output Args: None
1469   *
1470   * Return:
1471   *   Equivalent host virtual address
1472   *
1473   * Locates the memory region containing the VM physical address given
1474   * by gpa, within the VM given by vm.  When found, the host virtual
1475   * address providing the memory to the vm physical address is returned.
1476   * A TEST_ASSERT failure occurs if no region containing gpa exists.
1477   */
1478  void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa)
1479  {
1480  	struct userspace_mem_region *region;
1481  
1482  	region = userspace_mem_region_find(vm, gpa, gpa);
1483  	if (!region) {
1484  		TEST_FAIL("No vm physical memory at 0x%lx", gpa);
1485  		return NULL;
1486  	}
1487  
1488  	return (void *)((uintptr_t)region->host_mem
1489  		+ (gpa - region->region.guest_phys_addr));
1490  }
1491  
1492  /*
1493   * Address Host Virtual to VM Physical
1494   *
1495   * Input Args:
1496   *   vm - Virtual Machine
1497   *   hva - Host virtual address
1498   *
1499   * Output Args: None
1500   *
1501   * Return:
1502   *   Equivalent VM physical address
1503   *
1504   * Locates the memory region containing the host virtual address given
1505   * by hva, within the VM given by vm.  When found, the equivalent
1506   * VM physical address is returned. A TEST_ASSERT failure occurs if no
1507   * region containing hva exists.
1508   */
1509  vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva)
1510  {
1511  	struct rb_node *node;
1512  
1513  	for (node = vm->regions.hva_tree.rb_node; node; ) {
1514  		struct userspace_mem_region *region =
1515  			container_of(node, struct userspace_mem_region, hva_node);
1516  
1517  		if (hva >= region->host_mem) {
1518  			if (hva <= (region->host_mem
1519  				+ region->region.memory_size - 1))
1520  				return (vm_paddr_t)((uintptr_t)
1521  					region->region.guest_phys_addr
1522  					+ (hva - (uintptr_t)region->host_mem));
1523  
1524  			node = node->rb_right;
1525  		} else
1526  			node = node->rb_left;
1527  	}
1528  
1529  	TEST_FAIL("No mapping to a guest physical address, hva: %p", hva);
1530  	return -1;
1531  }
1532  
1533  /*
1534   * Address VM physical to Host Virtual *alias*.
1535   *
1536   * Input Args:
1537   *   vm - Virtual Machine
1538   *   gpa - VM physical address
1539   *
1540   * Output Args: None
1541   *
1542   * Return:
1543   *   Equivalent address within the host virtual *alias* area, or NULL
1544   *   (without failing the test) if the guest memory is not shared (so
1545   *   no alias exists).
1546   *
1547   * Create a writable, shared virtual=>physical alias for the specific GPA.
1548   * The primary use case is to allow the host selftest to manipulate guest
1549   * memory without mapping said memory in the guest's address space. And, for
1550   * userfaultfd-based demand paging, to do so without triggering userfaults.
1551   */
1552  void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa)
1553  {
1554  	struct userspace_mem_region *region;
1555  	uintptr_t offset;
1556  
1557  	region = userspace_mem_region_find(vm, gpa, gpa);
1558  	if (!region)
1559  		return NULL;
1560  
1561  	if (!region->host_alias)
1562  		return NULL;
1563  
1564  	offset = gpa - region->region.guest_phys_addr;
1565  	return (void *) ((uintptr_t) region->host_alias + offset);
1566  }
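
/*
 * Illustrative usage (not part of the library): the three translation
 * helpers above in one place.  A minimal sketch; example_translate is a
 * hypothetical helper, the GPA is assumed to be covered by an existing
 * memslot, and the alias is only non-NULL for shared-memory backed regions.
 */
static void __attribute__((unused)) example_translate(struct kvm_vm *vm,
						      vm_paddr_t gpa)
{
	void *hva = addr_gpa2hva(vm, gpa);
	void *alias = addr_gpa2alias(vm, gpa);

	/* The reverse translation must land back on the original GPA. */
	TEST_ASSERT_EQ(addr_hva2gpa(vm, hva), gpa);

	if (alias)
		pr_info("GPA 0x%lx: hva %p, writable alias %p\n", gpa, hva, alias);
}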
1567  
1568  /* Create an interrupt controller chip for the specified VM. */
1569  void vm_create_irqchip(struct kvm_vm *vm)
1570  {
1571  	vm_ioctl(vm, KVM_CREATE_IRQCHIP, NULL);
1572  
1573  	vm->has_irqchip = true;
1574  }
1575  
1576  int _vcpu_run(struct kvm_vcpu *vcpu)
1577  {
1578  	int rc;
1579  
1580  	do {
1581  		rc = __vcpu_run(vcpu);
1582  	} while (rc == -1 && errno == EINTR);
1583  
1584  	assert_on_unhandled_exception(vcpu);
1585  
1586  	return rc;
1587  }
1588  
1589  /*
1590   * Invoke KVM_RUN on a vCPU until KVM returns something other than -EINTR.
1591   * Assert if KVM returns an error (other than -EINTR).
1592   */
1593  void vcpu_run(struct kvm_vcpu *vcpu)
1594  {
1595  	int ret = _vcpu_run(vcpu);
1596  
1597  	TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_RUN, ret));
1598  }
1599  
1600  void vcpu_run_complete_io(struct kvm_vcpu *vcpu)
1601  {
1602  	int ret;
1603  
1604  	vcpu->run->immediate_exit = 1;
1605  	ret = __vcpu_run(vcpu);
1606  	vcpu->run->immediate_exit = 0;
1607  
1608  	TEST_ASSERT(ret == -1 && errno == EINTR,
1609  		    "KVM_RUN IOCTL didn't exit immediately, rc: %i, errno: %i",
1610  		    ret, errno);
1611  }
1612  
1613  /*
1614   * Get the list of guest registers which are supported for
1615   * KVM_GET_ONE_REG/KVM_SET_ONE_REG ioctls.  Returns a kvm_reg_list pointer;
1616   * it is the caller's responsibility to free the list.
1617   */
1618  struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vcpu *vcpu)
1619  {
1620  	struct kvm_reg_list reg_list_n = { .n = 0 }, *reg_list;
1621  	int ret;
1622  
1623  	ret = __vcpu_ioctl(vcpu, KVM_GET_REG_LIST, &reg_list_n);
1624  	TEST_ASSERT(ret == -1 && errno == E2BIG, "KVM_GET_REG_LIST n=0");
1625  
1626  	reg_list = calloc(1, sizeof(*reg_list) + reg_list_n.n * sizeof(__u64));
1627  	reg_list->n = reg_list_n.n;
1628  	vcpu_ioctl(vcpu, KVM_GET_REG_LIST, reg_list);
1629  	return reg_list;
1630  }
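
/*
 * Illustrative sketch (not part of this library): walking the register list
 * and releasing it, since the caller owns the allocation.  pr_info() is the
 * selftest logging helper from test_util.h.
 */
static __attribute__((unused)) void example_dump_reg_ids(struct kvm_vcpu *vcpu)
{
	struct kvm_reg_list *list = vcpu_get_reg_list(vcpu);
	__u64 i;

	for (i = 0; i < list->n; i++)
		pr_info("reg[%llu] = 0x%llx\n", (unsigned long long)i,
			(unsigned long long)list->reg[i]);
	free(list);
}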
1631  
1632  void *vcpu_map_dirty_ring(struct kvm_vcpu *vcpu)
1633  {
1634  	uint32_t page_size = getpagesize();
1635  	uint32_t size = vcpu->vm->dirty_ring_size;
1636  
1637  	TEST_ASSERT(size > 0, "Should enable dirty ring first");
1638  
1639  	if (!vcpu->dirty_gfns) {
1640  		void *addr;
1641  
1642  		addr = mmap(NULL, size, PROT_READ, MAP_PRIVATE, vcpu->fd,
1643  			    page_size * KVM_DIRTY_LOG_PAGE_OFFSET);
1644  		TEST_ASSERT(addr == MAP_FAILED, "Dirty ring mapped private");
1645  
1646  		addr = mmap(NULL, size, PROT_READ | PROT_EXEC, MAP_PRIVATE, vcpu->fd,
1647  			    page_size * KVM_DIRTY_LOG_PAGE_OFFSET);
1648  		TEST_ASSERT(addr == MAP_FAILED, "Dirty ring mapped exec");
1649  
1650  		addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpu->fd,
1651  			    page_size * KVM_DIRTY_LOG_PAGE_OFFSET);
1652  		TEST_ASSERT(addr != MAP_FAILED, "Dirty ring map failed");
1653  
1654  		vcpu->dirty_gfns = addr;
1655  		vcpu->dirty_gfns_count = size / sizeof(struct kvm_dirty_gfn);
1656  	}
1657  
1658  	return vcpu->dirty_gfns;
1659  }
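
/*
 * Illustrative sketch (not part of this library): harvesting dirty GFNs from
 * the mapped ring.  'fetch' is the caller's ring cursor; the load-acquire /
 * store-release ordering a real harvester needs is elided for brevity.
 */
static __attribute__((unused)) uint32_t example_harvest_dirty_gfns(struct kvm_vcpu *vcpu,
								   uint32_t *fetch)
{
	struct kvm_dirty_gfn *ring = vcpu_map_dirty_ring(vcpu);
	uint32_t count = 0;

	for (;;) {
		struct kvm_dirty_gfn *gfn = &ring[*fetch % vcpu->dirty_gfns_count];

		if (!(gfn->flags & KVM_DIRTY_GFN_F_DIRTY))
			break;
		/* gfn->slot and gfn->offset identify the dirty page. */
		gfn->flags = KVM_DIRTY_GFN_F_RESET;
		(*fetch)++;
		count++;
	}
	/* Ask KVM to collect the reset entries and re-arm the ring. */
	vm_ioctl(vcpu->vm, KVM_RESET_DIRTY_RINGS, NULL);
	return count;
}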
1660  
1661  /*
1662   * Device Ioctl
1663   */
1664  
1665  int __kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr)
1666  {
1667  	struct kvm_device_attr attribute = {
1668  		.group = group,
1669  		.attr = attr,
1670  		.flags = 0,
1671  	};
1672  
1673  	return ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute);
1674  }
1675  
1676  int __kvm_test_create_device(struct kvm_vm *vm, uint64_t type)
1677  {
1678  	struct kvm_create_device create_dev = {
1679  		.type = type,
1680  		.flags = KVM_CREATE_DEVICE_TEST,
1681  	};
1682  
1683  	return __vm_ioctl(vm, KVM_CREATE_DEVICE, &create_dev);
1684  }
1685  
1686  int __kvm_create_device(struct kvm_vm *vm, uint64_t type)
1687  {
1688  	struct kvm_create_device create_dev = {
1689  		.type = type,
1690  		.fd = -1,
1691  		.flags = 0,
1692  	};
1693  	int err;
1694  
1695  	err = __vm_ioctl(vm, KVM_CREATE_DEVICE, &create_dev);
1696  	TEST_ASSERT(err <= 0, "KVM_CREATE_DEVICE shouldn't return a positive value");
1697  	return err ? : create_dev.fd;
1698  }
1699  
1700  int __kvm_device_attr_get(int dev_fd, uint32_t group, uint64_t attr, void *val)
1701  {
1702  	struct kvm_device_attr kvmattr = {
1703  		.group = group,
1704  		.attr = attr,
1705  		.flags = 0,
1706  		.addr = (uintptr_t)val,
1707  	};
1708  
1709  	return __kvm_ioctl(dev_fd, KVM_GET_DEVICE_ATTR, &kvmattr);
1710  }
1711  
1712  int __kvm_device_attr_set(int dev_fd, uint32_t group, uint64_t attr, void *val)
1713  {
1714  	struct kvm_device_attr kvmattr = {
1715  		.group = group,
1716  		.attr = attr,
1717  		.flags = 0,
1718  		.addr = (uintptr_t)val,
1719  	};
1720  
1721  	return __kvm_ioctl(dev_fd, KVM_SET_DEVICE_ATTR, &kvmattr);
1722  }
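
/*
 * Illustrative sketch (not part of this library): probing and updating a
 * 64-bit device attribute on a device fd returned by __kvm_create_device().
 * The 'group' and 'attr' values are placeholders; real values depend on the
 * device type being exercised.
 */
static __attribute__((unused)) void example_device_attr(int dev_fd,
							 uint32_t group,
							 uint64_t attr)
{
	uint64_t val;

	if (__kvm_has_device_attr(dev_fd, group, attr))
		return;		/* Attribute not supported by this device. */

	TEST_ASSERT(!__kvm_device_attr_get(dev_fd, group, attr, &val),
		    "KVM_GET_DEVICE_ATTR failed");
	val |= 1;
	TEST_ASSERT(!__kvm_device_attr_set(dev_fd, group, attr, &val),
		    "KVM_SET_DEVICE_ATTR failed");
}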
1723  
1724  /*
1725   * IRQ related functions.
1726   */
1727  
1728  int _kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level)
1729  {
1730  	struct kvm_irq_level irq_level = {
1731  		.irq    = irq,
1732  		.level  = level,
1733  	};
1734  
1735  	return __vm_ioctl(vm, KVM_IRQ_LINE, &irq_level);
1736  }
1737  
1738  void kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level)
1739  {
1740  	int ret = _kvm_irq_line(vm, irq, level);
1741  
1742  	TEST_ASSERT(ret >= 0, KVM_IOCTL_ERROR(KVM_IRQ_LINE, ret));
1743  }
1744  
1745  struct kvm_irq_routing *kvm_gsi_routing_create(void)
1746  {
1747  	struct kvm_irq_routing *routing;
1748  	size_t size;
1749  
1750  	size = sizeof(struct kvm_irq_routing);
1751  	/* Allocate space for the max number of entries: this wastes roughly 196 KB. */
1752  	size += KVM_MAX_IRQ_ROUTES * sizeof(struct kvm_irq_routing_entry);
1753  	routing = calloc(1, size);
1754  	assert(routing);
1755  
1756  	return routing;
1757  }
1758  
1759  void kvm_gsi_routing_irqchip_add(struct kvm_irq_routing *routing,
1760  		uint32_t gsi, uint32_t pin)
1761  {
1762  	int i;
1763  
1764  	assert(routing);
1765  	assert(routing->nr < KVM_MAX_IRQ_ROUTES);
1766  
1767  	i = routing->nr;
1768  	routing->entries[i].gsi = gsi;
1769  	routing->entries[i].type = KVM_IRQ_ROUTING_IRQCHIP;
1770  	routing->entries[i].flags = 0;
1771  	routing->entries[i].u.irqchip.irqchip = 0;
1772  	routing->entries[i].u.irqchip.pin = pin;
1773  	routing->nr++;
1774  }
1775  
1776  int _kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing)
1777  {
1778  	int ret;
1779  
1780  	assert(routing);
1781  	ret = __vm_ioctl(vm, KVM_SET_GSI_ROUTING, routing);
1782  	free(routing);
1783  
1784  	return ret;
1785  }
1786  
1787  void kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing)
1788  {
1789  	int ret;
1790  
1791  	ret = _kvm_gsi_routing_write(vm, routing);
1792  	TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_SET_GSI_ROUTING, ret));
1793  }
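
/*
 * Illustrative sketch (not part of this library): building a routing table
 * that maps GSIs 0..7 1:1 onto irqchip pins and handing it to KVM.  Assumes
 * vm_create_irqchip() has already been called; note that
 * kvm_gsi_routing_write() frees the table, so 'routing' must not be reused.
 */
static __attribute__((unused)) void example_setup_gsi_routing(struct kvm_vm *vm)
{
	struct kvm_irq_routing *routing = kvm_gsi_routing_create();
	uint32_t gsi;

	for (gsi = 0; gsi < 8; gsi++)
		kvm_gsi_routing_irqchip_add(routing, gsi, gsi);
	kvm_gsi_routing_write(vm, routing);	/* Consumes 'routing'. */
}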
1794  
1795  /*
1796   * VM Dump
1797   *
1798   * Input Args:
1799   *   vm - Virtual Machine
1800   *   indent - Left margin indent amount
1801   *
1802   * Output Args:
1803   *   stream - Output FILE stream
1804   *
1805   * Return: None
1806   *
1807   * Dumps the current state of the VM given by vm, to the FILE stream
1808   * given by stream.
1809   */
1810  void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
1811  {
1812  	int ctr;
1813  	struct userspace_mem_region *region;
1814  	struct kvm_vcpu *vcpu;
1815  
1816  	fprintf(stream, "%*smode: 0x%x\n", indent, "", vm->mode);
1817  	fprintf(stream, "%*sfd: %i\n", indent, "", vm->fd);
1818  	fprintf(stream, "%*spage_size: 0x%x\n", indent, "", vm->page_size);
1819  	fprintf(stream, "%*sMem Regions:\n", indent, "");
1820  	hash_for_each(vm->regions.slot_hash, ctr, region, slot_node) {
1821  		fprintf(stream, "%*sguest_phys: 0x%lx size: 0x%lx "
1822  			"host_virt: %p\n", indent + 2, "",
1823  			(uint64_t) region->region.guest_phys_addr,
1824  			(uint64_t) region->region.memory_size,
1825  			region->host_mem);
1826  		fprintf(stream, "%*sunused_phy_pages: ", indent + 2, "");
1827  		sparsebit_dump(stream, region->unused_phy_pages, 0);
1828  	}
1829  	fprintf(stream, "%*sMapped Virtual Pages:\n", indent, "");
1830  	sparsebit_dump(stream, vm->vpages_mapped, indent + 2);
1831  	fprintf(stream, "%*spgd_created: %u\n", indent, "",
1832  		vm->pgd_created);
1833  	if (vm->pgd_created) {
1834  		fprintf(stream, "%*sVirtual Translation Tables:\n",
1835  			indent + 2, "");
1836  		virt_dump(stream, vm, indent + 4);
1837  	}
1838  	fprintf(stream, "%*sVCPUs:\n", indent, "");
1839  
1840  	list_for_each_entry(vcpu, &vm->vcpus, list)
1841  		vcpu_dump(stream, vcpu, indent + 2);
1842  }
1843  
1844  #define KVM_EXIT_STRING(x) {KVM_EXIT_##x, #x}
1845  
1846  /* Known KVM exit reasons */
1847  static struct exit_reason {
1848  	unsigned int reason;
1849  	const char *name;
1850  } exit_reasons_known[] = {
1851  	KVM_EXIT_STRING(UNKNOWN),
1852  	KVM_EXIT_STRING(EXCEPTION),
1853  	KVM_EXIT_STRING(IO),
1854  	KVM_EXIT_STRING(HYPERCALL),
1855  	KVM_EXIT_STRING(DEBUG),
1856  	KVM_EXIT_STRING(HLT),
1857  	KVM_EXIT_STRING(MMIO),
1858  	KVM_EXIT_STRING(IRQ_WINDOW_OPEN),
1859  	KVM_EXIT_STRING(SHUTDOWN),
1860  	KVM_EXIT_STRING(FAIL_ENTRY),
1861  	KVM_EXIT_STRING(INTR),
1862  	KVM_EXIT_STRING(SET_TPR),
1863  	KVM_EXIT_STRING(TPR_ACCESS),
1864  	KVM_EXIT_STRING(S390_SIEIC),
1865  	KVM_EXIT_STRING(S390_RESET),
1866  	KVM_EXIT_STRING(DCR),
1867  	KVM_EXIT_STRING(NMI),
1868  	KVM_EXIT_STRING(INTERNAL_ERROR),
1869  	KVM_EXIT_STRING(OSI),
1870  	KVM_EXIT_STRING(PAPR_HCALL),
1871  	KVM_EXIT_STRING(S390_UCONTROL),
1872  	KVM_EXIT_STRING(WATCHDOG),
1873  	KVM_EXIT_STRING(S390_TSCH),
1874  	KVM_EXIT_STRING(EPR),
1875  	KVM_EXIT_STRING(SYSTEM_EVENT),
1876  	KVM_EXIT_STRING(S390_STSI),
1877  	KVM_EXIT_STRING(IOAPIC_EOI),
1878  	KVM_EXIT_STRING(HYPERV),
1879  	KVM_EXIT_STRING(ARM_NISV),
1880  	KVM_EXIT_STRING(X86_RDMSR),
1881  	KVM_EXIT_STRING(X86_WRMSR),
1882  	KVM_EXIT_STRING(DIRTY_RING_FULL),
1883  	KVM_EXIT_STRING(AP_RESET_HOLD),
1884  	KVM_EXIT_STRING(X86_BUS_LOCK),
1885  	KVM_EXIT_STRING(XEN),
1886  	KVM_EXIT_STRING(RISCV_SBI),
1887  	KVM_EXIT_STRING(RISCV_CSR),
1888  	KVM_EXIT_STRING(NOTIFY),
1889  #ifdef KVM_EXIT_MEMORY_NOT_PRESENT
1890  	KVM_EXIT_STRING(MEMORY_NOT_PRESENT),
1891  #endif
1892  };
1893  
1894  /*
1895   * Exit Reason String
1896   *
1897   * Input Args:
1898   *   exit_reason - Exit reason
1899   *
1900   * Output Args: None
1901   *
1902   * Return:
1903   *   Constant string pointer describing the exit reason.
1904   *
1905   * Locates and returns a constant string that describes the KVM exit
1906   * reason given by exit_reason.  If no such string is found, a constant
1907   * string of "Unknown" is returned.
1908   */
1909  const char *exit_reason_str(unsigned int exit_reason)
1910  {
1911  	unsigned int n1;
1912  
1913  	for (n1 = 0; n1 < ARRAY_SIZE(exit_reasons_known); n1++) {
1914  		if (exit_reason == exit_reasons_known[n1].reason)
1915  			return exit_reasons_known[n1].name;
1916  	}
1917  
1918  	return "Unknown";
1919  }
1920  
1921  /*
1922   * Physical Contiguous Page Allocator
1923   *
1924   * Input Args:
1925   *   vm - Virtual Machine
1926   *   num - number of pages
1927   *   paddr_min - Physical address minimum
1928   *   memslot - Memory region to allocate page from
1929   *
1930   * Output Args: None
1931   *
1932   * Return:
1933   *   Starting physical address
1934   *
1935   * Within the VM specified by vm, locates a range of available physical
1936   * pages at or above paddr_min. If found, the pages are marked as in use
1937   * and their base address is returned. A TEST_ASSERT failure occurs if
1938   * not enough pages are available at or above paddr_min.
1939   */
1940  vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num,
1941  			      vm_paddr_t paddr_min, uint32_t memslot)
1942  {
1943  	struct userspace_mem_region *region;
1944  	sparsebit_idx_t pg, base;
1945  
1946  	TEST_ASSERT(num > 0, "Must allocate at least one page");
1947  
1948  	TEST_ASSERT((paddr_min % vm->page_size) == 0, "Min physical address "
1949  		"not divisible by page size.\n"
1950  		"  paddr_min: 0x%lx page_size: 0x%x",
1951  		paddr_min, vm->page_size);
1952  
1953  	region = memslot2region(vm, memslot);
1954  	base = pg = paddr_min >> vm->page_shift;
1955  
1956  	do {
1957  		for (; pg < base + num; ++pg) {
1958  			if (!sparsebit_is_set(region->unused_phy_pages, pg)) {
1959  				base = pg = sparsebit_next_set(region->unused_phy_pages, pg);
1960  				break;
1961  			}
1962  		}
1963  	} while (pg && pg != base + num);
1964  
1965  	if (pg == 0) {
1966  		fprintf(stderr, "No guest physical page available, "
1967  			"paddr_min: 0x%lx page_size: 0x%x memslot: %u\n",
1968  			paddr_min, vm->page_size, memslot);
1969  		fputs("---- vm dump ----\n", stderr);
1970  		vm_dump(stderr, vm, 2);
1971  		abort();
1972  	}
1973  
1974  	for (pg = base; pg < base + num; ++pg)
1975  		sparsebit_clear(region->unused_phy_pages, pg);
1976  
1977  	return base * vm->page_size;
1978  }
1979  
1980  vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min,
1981  			     uint32_t memslot)
1982  {
1983  	return vm_phy_pages_alloc(vm, 1, paddr_min, memslot);
1984  }
1985  
1986  vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm)
1987  {
1988  	return vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR,
1989  				 vm->memslots[MEM_REGION_PT]);
1990  }
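
/*
 * Illustrative sketch (not part of this library): carving out a physically
 * contiguous four-page scratch buffer from memslot 0 and zeroing it through
 * the host mapping.  KVM_UTIL_MIN_PFN keeps the allocation clear of the
 * lowest guest frames.
 */
static __attribute__((unused)) vm_paddr_t example_alloc_scratch(struct kvm_vm *vm)
{
	vm_paddr_t gpa = vm_phy_pages_alloc(vm, 4,
					    KVM_UTIL_MIN_PFN * vm->page_size, 0);

	memset(addr_gpa2hva(vm, gpa), 0, 4 * vm->page_size);
	return gpa;
}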
1991  
1992  /*
1993   * Address Guest Virtual to Host Virtual
1994   *
1995   * Input Args:
1996   *   vm - Virtual Machine
1997   *   gva - VM virtual address
1998   *
1999   * Output Args: None
2000   *
2001   * Return:
2002   *   Equivalent host virtual address
2003   */
2004  void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva)
2005  {
2006  	return addr_gpa2hva(vm, addr_gva2gpa(vm, gva));
2007  }
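
/*
 * Illustrative sketch (not part of this library): seeding a guest-virtual
 * allocation from the host before running the guest.  vm_vaddr_alloc_page()
 * is the existing selftest allocator for one page of guest virtual address
 * space.
 */
static __attribute__((unused)) vm_vaddr_t example_seed_guest_page(struct kvm_vm *vm)
{
	vm_vaddr_t gva = vm_vaddr_alloc_page(vm);

	memset(addr_gva2hva(vm, gva), 0xaa, vm->page_size);
	return gva;
}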
2008  
2009  unsigned long __weak vm_compute_max_gfn(struct kvm_vm *vm)
2010  {
2011  	return ((1ULL << vm->pa_bits) >> vm->page_shift) - 1;
2012  }
2013  
2014  static unsigned int vm_calc_num_pages(unsigned int num_pages,
2015  				      unsigned int page_shift,
2016  				      unsigned int new_page_shift,
2017  				      bool ceil)
2018  {
2019  	unsigned int n;
2020  
2021  	if (page_shift >= new_page_shift)
2022  		return num_pages * (1 << (page_shift - new_page_shift));
2023  	n = 1 << (new_page_shift - page_shift);
2024  	return num_pages / n + !!(ceil && num_pages % n);
2025  }
2026  
2027  static inline int getpageshift(void)
2028  {
2029  	return __builtin_ffs(getpagesize()) - 1;
2030  }
2031  
2032  unsigned int
2033  vm_num_host_pages(enum vm_guest_mode mode, unsigned int num_guest_pages)
2034  {
2035  	return vm_calc_num_pages(num_guest_pages,
2036  				 vm_guest_mode_params[mode].page_shift,
2037  				 getpageshift(), true);
2038  }
2039  
2040  unsigned int
2041  vm_num_guest_pages(enum vm_guest_mode mode, unsigned int num_host_pages)
2042  {
2043  	return vm_calc_num_pages(num_host_pages, getpageshift(),
2044  				 vm_guest_mode_params[mode].page_shift, false);
2045  }
2046  
2047  unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size)
2048  {
2049  	unsigned int n;
2050  	n = DIV_ROUND_UP(size, vm_guest_mode_params[mode].page_size);
2051  	return vm_adjust_num_guest_pages(mode, n);
2052  }
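
/*
 * Worked example (illustrative): with 64 KiB guest pages (page_shift 16) on a
 * 4 KiB host (page_shift 12):
 *
 *   vm_num_host_pages(mode, 3)   -> 3 * (1 << (16 - 12)) = 48 host pages
 *   vm_num_guest_pages(mode, 33) -> 33 / (1 << (16 - 12)) = 2 guest pages
 *                                   (rounded down; ceil is false for this
 *                                   direction)
 *   vm_calc_num_guest_pages(mode, 0x21000) -> DIV_ROUND_UP(0x21000, 64 KiB)
 *                                   = 3, then passed through
 *                                   vm_adjust_num_guest_pages()
 */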
2053  
2054  /*
2055   * Read binary stats descriptors
2056   *
2057   * Input Args:
2058   *   stats_fd - the file descriptor for the binary stats file from which to read
2059   *   header - the binary stats metadata header corresponding to the given FD
2060   *
2061   * Output Args: None
2062   *
2063   * Return:
2064   *   A pointer to a newly allocated series of stat descriptors.
2065   *   Caller is responsible for freeing the returned kvm_stats_desc.
2066   *
2067   * Read the stats descriptors from the binary stats interface.
2068   */
2069  struct kvm_stats_desc *read_stats_descriptors(int stats_fd,
2070  					      struct kvm_stats_header *header)
2071  {
2072  	struct kvm_stats_desc *stats_desc;
2073  	ssize_t desc_size, total_size, ret;
2074  
2075  	desc_size = get_stats_descriptor_size(header);
2076  	total_size = header->num_desc * desc_size;
2077  
2078  	stats_desc = calloc(header->num_desc, desc_size);
2079  	TEST_ASSERT(stats_desc, "Allocate memory for stats descriptors");
2080  
2081  	ret = pread(stats_fd, stats_desc, total_size, header->desc_offset);
2082  	TEST_ASSERT(ret == total_size, "Read KVM stats descriptors");
2083  
2084  	return stats_desc;
2085  }
2086  
2087  /*
2088   * Read stat data for a particular stat
2089   *
2090   * Input Args:
2091   *   stats_fd - the file descriptor for the binary stats file from which to read
2092   *   header - the binary stats metadata header corresponding to the given FD
2093   *   desc - the binary stat metadata for the particular stat to be read
2094   *   max_elements - the maximum number of 8-byte values to read into data
2095   *
2096   * Output Args:
2097   *   data - the buffer into which stat data should be read
2098   *
2099   * Read the data values of a specified stat from the binary stats interface.
2100   */
2101  void read_stat_data(int stats_fd, struct kvm_stats_header *header,
2102  		    struct kvm_stats_desc *desc, uint64_t *data,
2103  		    size_t max_elements)
2104  {
2105  	size_t nr_elements = min_t(ssize_t, desc->size, max_elements);
2106  	size_t size = nr_elements * sizeof(*data);
2107  	ssize_t ret;
2108  
2109  	TEST_ASSERT(desc->size, "No elements in stat '%s'", desc->name);
2110  	TEST_ASSERT(max_elements, "Zero elements requested for stat '%s'", desc->name);
2111  
2112  	ret = pread(stats_fd, data, size,
2113  		    header->data_offset + desc->offset);
2114  
2115  	TEST_ASSERT(ret >= 0, "pread() failed on stat '%s', errno: %i (%s)",
2116  		    desc->name, errno, strerror(errno));
2117  	TEST_ASSERT(ret == size,
2118  		    "pread() on stat '%s' read %ld bytes, wanted %lu bytes",
2119  		    desc->name, ret, size);
2120  }
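
/*
 * Illustrative sketch (not part of this library): the manual flow behind the
 * binary stats API -- open the VM stats fd, read the header and descriptors,
 * then read the first stat's value.  vm_get_stats_fd() and
 * read_stats_header() are the existing selftest helpers for the steps not
 * shown in this file.
 */
static __attribute__((unused)) void example_read_first_stat(struct kvm_vm *vm)
{
	struct kvm_stats_header header;
	struct kvm_stats_desc *desc;
	uint64_t val;
	int stats_fd = vm_get_stats_fd(vm);

	read_stats_header(stats_fd, &header);
	desc = read_stats_descriptors(stats_fd, &header);
	read_stat_data(stats_fd, &header, &desc[0], &val, 1);
	pr_info("%s = %llu\n", desc[0].name, (unsigned long long)val);
	free(desc);
	close(stats_fd);
}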
2121  
2122  /*
2123   * Read the data of the named stat
2124   *
2125   * Input Args:
2126   *   vm - the VM for which the stat should be read
2127   *   stat_name - the name of the stat to read
2128   *   max_elements - the maximum number of 8-byte values to read into data
2129   *
2130   * Output Args:
2131   *   data - the buffer into which stat data should be read
2132   *
2133   * Read the data values of a specified stat from the binary stats interface.
2134   */
2135  void __vm_get_stat(struct kvm_vm *vm, const char *stat_name, uint64_t *data,
2136  		   size_t max_elements)
2137  {
2138  	struct kvm_stats_desc *desc;
2139  	size_t size_desc;
2140  	int i;
2141  
2142  	if (!vm->stats_fd) {
2143  		vm->stats_fd = vm_get_stats_fd(vm);
2144  		read_stats_header(vm->stats_fd, &vm->stats_header);
2145  		vm->stats_desc = read_stats_descriptors(vm->stats_fd,
2146  							&vm->stats_header);
2147  	}
2148  
2149  	size_desc = get_stats_descriptor_size(&vm->stats_header);
2150  
2151  	for (i = 0; i < vm->stats_header.num_desc; ++i) {
2152  		desc = (void *)vm->stats_desc + (i * size_desc);
2153  
2154  		if (strcmp(desc->name, stat_name))
2155  			continue;
2156  
2157  		read_stat_data(vm->stats_fd, &vm->stats_header, desc,
2158  			       data, max_elements);
2159  
2160  		break;
2161  	}
2162  }
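
/*
 * Illustrative sketch (not part of this library): pulling a single VM-wide
 * stat by name.  The stat name is a placeholder -- available names vary by
 * architecture and kernel version.
 */
static __attribute__((unused)) uint64_t example_read_stat_by_name(struct kvm_vm *vm)
{
	uint64_t val = 0;

	__vm_get_stat(vm, "remote_tlb_flush", &val, 1);
	return val;
}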
2163  
2164  __weak void kvm_arch_vm_post_create(struct kvm_vm *vm)
2165  {
2166  }
2167  
2168  __weak void kvm_selftest_arch_init(void)
2169  {
2170  }
2171  
2172  void __attribute__((constructor)) kvm_selftest_init(void)
2173  {
2174  	/* Tell stdout not to buffer its content. */
2175  	setbuf(stdout, NULL);
2176  
2177  	kvm_selftest_arch_init();
2178  }
2179