// SPDX-License-Identifier: GPL-2.0
/*
 * access_tracking_perf_test
 *
 * Copyright (C) 2021, Google, Inc.
 *
 * This test measures the performance effects of KVM's access tracking.
 * Access tracking is driven by the MMU notifiers test_young, clear_young, and
 * clear_flush_young. These notifiers do not have a direct userspace API,
 * however the clear_young notifier can be triggered by marking pages as idle
 * in /sys/kernel/mm/page_idle/bitmap. This test leverages that mechanism to
 * enable access tracking on guest memory.
 *
 * To measure performance this test runs a VM with a configurable number of
 * vCPUs that each touch every page in disjoint regions of memory. Performance
 * is measured in the time it takes all vCPUs to finish touching their
 * predefined region.
 *
 * Note that a deterministic correctness test of access tracking is not
 * possible by using page_idle as it exists today. This is for a few reasons:
 *
 * 1. page_idle only issues clear_young notifiers, which lack a TLB flush. This
 *    means subsequent guest accesses are not guaranteed to see page table
 *    updates made by KVM until some time in the future.
 *
 * 2. page_idle only operates on LRU pages. Newly allocated pages are not
 *    immediately added to LRU lists. Instead they are held in a "pagevec",
 *    which is drained to LRU lists some time in the future. There is no
 *    userspace API to force this drain to occur.
 *
 * These limitations are worked around in this test by using a large enough
 * region of memory for each vCPU such that the number of translations cached
 * in the TLB and the number of pages held in pagevecs are a small fraction of
 * the overall workload. If either of those conditions is not true (for example
 * in nested virtualization, where TLB size is unlimited) this test will print
 * a warning rather than silently passing.
 */
#include <inttypes.h>
#include <limits.h>
#include <pthread.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>

#include "kvm_util.h"
#include "test_util.h"
#include "memstress.h"
#include "guest_modes.h"

/* Global variable used to synchronize all of the vCPU threads. */
static int iteration;

/* Defines what vCPU threads should do during a given iteration. */
static enum {
        /* Run the vCPU to access all its memory. */
        ITERATION_ACCESS_MEMORY,
        /* Mark the vCPU's memory idle in page_idle. */
        ITERATION_MARK_IDLE,
} iteration_work;

/* The iteration that was last completed by each vCPU. */
static int vcpu_last_completed_iteration[KVM_MAX_VCPUS];

/* Whether to overlap the regions of memory vCPUs access. */
static bool overlap_memory_access;

struct test_params {
        /* The backing source for the region of memory. */
        enum vm_mem_backing_src_type backing_src;

        /* The amount of memory to allocate for each vCPU. */
        uint64_t vcpu_memory_bytes;

        /* The number of vCPUs to create in the VM. */
        int nr_vcpus;
};
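
/*
 * Both /proc/self/pagemap and /sys/kernel/mm/page_idle/bitmap are read as
 * arrays of 64-bit words: pagemap has one 64-bit entry per virtual page, and
 * the page_idle bitmap packs one idle bit per PFN, 64 to a word. A single
 * helper that reads the word at a given index therefore serves both files.
 */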
static uint64_t pread_uint64(int fd, const char *filename, uint64_t index)
{
        uint64_t value;
        off_t offset = index * sizeof(value);

        TEST_ASSERT(pread(fd, &value, sizeof(value), offset) == sizeof(value),
                    "pread from %s offset 0x%" PRIx64 " failed!",
                    filename, offset);

        return value;
}

#define PAGEMAP_PRESENT (1ULL << 63)
#define PAGEMAP_PFN_MASK ((1ULL << 55) - 1)

static uint64_t lookup_pfn(int pagemap_fd, struct kvm_vm *vm, uint64_t gva)
{
        uint64_t hva = (uint64_t) addr_gva2hva(vm, gva);
        uint64_t entry;
        uint64_t pfn;

        entry = pread_uint64(pagemap_fd, "pagemap", hva / getpagesize());
        if (!(entry & PAGEMAP_PRESENT))
                return 0;

        pfn = entry & PAGEMAP_PFN_MASK;
        __TEST_REQUIRE(pfn, "Looking up PFNs requires CAP_SYS_ADMIN");

        return pfn;
}

static bool is_page_idle(int page_idle_fd, uint64_t pfn)
{
        uint64_t bits = pread_uint64(page_idle_fd, "page_idle", pfn / 64);

        return !!((bits >> (pfn % 64)) & 1);
}

static void mark_page_idle(int page_idle_fd, uint64_t pfn)
{
        uint64_t bits = 1ULL << (pfn % 64);

        TEST_ASSERT(pwrite(page_idle_fd, &bits, 8, 8 * (pfn / 64)) == 8,
                    "Set page_idle bits for PFN 0x%" PRIx64, pfn);
}

static void mark_vcpu_memory_idle(struct kvm_vm *vm,
                                  struct memstress_vcpu_args *vcpu_args)
{
        int vcpu_idx = vcpu_args->vcpu_idx;
        uint64_t base_gva = vcpu_args->gva;
        uint64_t pages = vcpu_args->pages;
        uint64_t page;
        uint64_t still_idle = 0;
        uint64_t no_pfn = 0;
        int page_idle_fd;
        int pagemap_fd;

        /* If vCPUs are using an overlapping region, let vCPU 0 mark it idle. */
        if (overlap_memory_access && vcpu_idx)
                return;

        page_idle_fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR);
        TEST_ASSERT(page_idle_fd > 0, "Failed to open page_idle.");

        pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
        TEST_ASSERT(pagemap_fd > 0, "Failed to open pagemap.");

        for (page = 0; page < pages; page++) {
                uint64_t gva = base_gva + page * memstress_args.guest_page_size;
                uint64_t pfn = lookup_pfn(pagemap_fd, vm, gva);

                if (!pfn) {
                        no_pfn++;
                        continue;
                }

                if (is_page_idle(page_idle_fd, pfn)) {
                        still_idle++;
                        continue;
                }

                mark_page_idle(page_idle_fd, pfn);
        }

        /*
         * Assumption: Less than 1% of pages are going to be swapped out from
         * under us during this test.
         */
        TEST_ASSERT(no_pfn < pages / 100,
                    "vCPU %d: No PFN for %" PRIu64 " out of %" PRIu64 " pages.",
                    vcpu_idx, no_pfn, pages);

        /*
         * Check that at least 90% of memory has been marked idle (the rest
         * might not be marked idle because the pages have not yet made it to
         * an LRU list or the translations are still cached in the TLB). 90% is
         * arbitrary; high enough that we ensure most memory accesses went
         * through access tracking but low enough so as not to make the test
         * too brittle over time and across architectures.
         *
         * Note that when run in nested virtualization, this check will trigger
         * much more frequently because TLB size is unlimited and since no
         * flush happens, many more pages are cached there and the guest won't
         * see the "idle" bit cleared.
         */
        if (still_idle >= pages / 10)
                printf("WARNING: vCPU%d: Too many pages still idle (%" PRIu64
                       " out of %" PRIu64 "), this will affect performance results.\n",
                       vcpu_idx, still_idle, pages);

        close(page_idle_fd);
        close(pagemap_fd);
}

static void assert_ucall(struct kvm_vcpu *vcpu, uint64_t expected_ucall)
{
        struct ucall uc;
        uint64_t actual_ucall = get_ucall(vcpu, &uc);

        TEST_ASSERT(expected_ucall == actual_ucall,
                    "Guest exited unexpectedly (expected ucall %" PRIu64
                    ", got %" PRIu64 ")",
                    expected_ucall, actual_ucall);
}
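
/*
 * vCPU threads and the main thread synchronize through the global "iteration"
 * counter: the main thread publishes the work type in iteration_work and then
 * increments iteration; each vCPU thread spins until it observes the new
 * value, performs the work, and reports completion via
 * vcpu_last_completed_iteration. Setting memstress_args.stop_vcpus ends the
 * spin loop so the threads can be joined.
 */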
static bool spin_wait_for_next_iteration(int *current_iteration)
{
        int last_iteration = *current_iteration;

        do {
                if (READ_ONCE(memstress_args.stop_vcpus))
                        return false;

                *current_iteration = READ_ONCE(iteration);
        } while (last_iteration == *current_iteration);

        return true;
}

static void vcpu_thread_main(struct memstress_vcpu_args *vcpu_args)
{
        struct kvm_vcpu *vcpu = vcpu_args->vcpu;
        struct kvm_vm *vm = memstress_args.vm;
        int vcpu_idx = vcpu_args->vcpu_idx;
        int current_iteration = 0;

        while (spin_wait_for_next_iteration(&current_iteration)) {
                switch (READ_ONCE(iteration_work)) {
                case ITERATION_ACCESS_MEMORY:
                        vcpu_run(vcpu);
                        assert_ucall(vcpu, UCALL_SYNC);
                        break;
                case ITERATION_MARK_IDLE:
                        mark_vcpu_memory_idle(vm, vcpu_args);
                        break;
                };

                vcpu_last_completed_iteration[vcpu_idx] = current_iteration;
        }
}

static void spin_wait_for_vcpu(int vcpu_idx, int target_iteration)
{
        while (READ_ONCE(vcpu_last_completed_iteration[vcpu_idx]) !=
               target_iteration) {
                continue;
        }
}

/* The type of memory accesses to perform in the VM. */
enum access_type {
        ACCESS_READ,
        ACCESS_WRITE,
};

static void run_iteration(struct kvm_vm *vm, int nr_vcpus, const char *description)
{
        struct timespec ts_start;
        struct timespec ts_elapsed;
        int next_iteration, i;

        /* Kick off the vCPUs by incrementing iteration. */
        next_iteration = ++iteration;

        clock_gettime(CLOCK_MONOTONIC, &ts_start);

        /* Wait for all vCPUs to finish the iteration. */
        for (i = 0; i < nr_vcpus; i++)
                spin_wait_for_vcpu(i, next_iteration);

        ts_elapsed = timespec_elapsed(ts_start);
        pr_info("%-30s: %ld.%09lds\n",
                description, ts_elapsed.tv_sec, ts_elapsed.tv_nsec);
}

static void access_memory(struct kvm_vm *vm, int nr_vcpus,
                          enum access_type access, const char *description)
{
        memstress_set_write_percent(vm, (access == ACCESS_READ) ? 0 : 100);
        iteration_work = ITERATION_ACCESS_MEMORY;
        run_iteration(vm, nr_vcpus, description);
}

static void mark_memory_idle(struct kvm_vm *vm, int nr_vcpus)
{
        /*
         * Even though this parallelizes the work across vCPUs, this is still a
         * very slow operation because page_idle forces the test to mark one pfn
         * at a time and the clear_young notifier serializes on the KVM MMU
         * lock.
         */
        pr_debug("Marking VM memory idle (slow)...\n");
        iteration_work = ITERATION_MARK_IDLE;
        run_iteration(vm, nr_vcpus, "Mark memory idle");
}
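
/*
 * Run the full measurement sequence for one guest mode: populate memory, take
 * control measurements against hot (recently accessed) memory, then mark the
 * memory idle and repeat the accesses so that they are subject to access
 * tracking.
 */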
static void run_test(enum vm_guest_mode mode, void *arg)
{
        struct test_params *params = arg;
        struct kvm_vm *vm;
        int nr_vcpus = params->nr_vcpus;

        vm = memstress_create_vm(mode, nr_vcpus, params->vcpu_memory_bytes, 1,
                                 params->backing_src, !overlap_memory_access);

        memstress_start_vcpu_threads(nr_vcpus, vcpu_thread_main);

        pr_info("\n");
        access_memory(vm, nr_vcpus, ACCESS_WRITE, "Populating memory");

        /* As a control, read and write to the populated memory first. */
        access_memory(vm, nr_vcpus, ACCESS_WRITE, "Writing to populated memory");
        access_memory(vm, nr_vcpus, ACCESS_READ, "Reading from populated memory");

        /* Repeat on memory that has been marked as idle. */
        mark_memory_idle(vm, nr_vcpus);
        access_memory(vm, nr_vcpus, ACCESS_WRITE, "Writing to idle memory");
        mark_memory_idle(vm, nr_vcpus);
        access_memory(vm, nr_vcpus, ACCESS_READ, "Reading from idle memory");

        memstress_join_vcpu_threads(nr_vcpus);
        memstress_destroy_vm(vm);
}

static void help(char *name)
{
        puts("");
        printf("usage: %s [-h] [-m mode] [-b vcpu_bytes] [-v vcpus] [-o] [-s mem_type]\n",
               name);
        puts("");
        printf(" -h: Display this help message.\n");
        guest_modes_help();
        printf(" -b: specify the size of the memory region which should be\n"
               "     dirtied by each vCPU. e.g. 10M or 3G.\n"
               "     (default: 1G)\n");
        printf(" -v: specify the number of vCPUs to run.\n");
        printf(" -o: Overlap guest memory accesses instead of partitioning\n"
               "     them into a separate region of memory for each vCPU.\n");
        backing_src_help("-s");
        puts("");
        exit(0);
}

int main(int argc, char *argv[])
{
        struct test_params params = {
                .backing_src = DEFAULT_VM_MEM_SRC,
                .vcpu_memory_bytes = DEFAULT_PER_VCPU_MEM_SIZE,
                .nr_vcpus = 1,
        };
        int page_idle_fd;
        int opt;

        guest_modes_append_default();

        while ((opt = getopt(argc, argv, "hm:b:v:os:")) != -1) {
                switch (opt) {
                case 'm':
                        guest_modes_cmdline(optarg);
                        break;
                case 'b':
                        params.vcpu_memory_bytes = parse_size(optarg);
                        break;
                case 'v':
                        params.nr_vcpus = atoi_positive("Number of vCPUs", optarg);
                        break;
                case 'o':
                        overlap_memory_access = true;
                        break;
                case 's':
                        params.backing_src = parse_backing_src_type(optarg);
                        break;
                case 'h':
                default:
                        help(argv[0]);
                        break;
                }
        }

        page_idle_fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR);
        __TEST_REQUIRE(page_idle_fd >= 0,
                       "CONFIG_IDLE_PAGE_TRACKING is not enabled");
        close(page_idle_fd);

        for_each_guest_mode(run_test, &params);

        return 0;
}
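
/*
 * Example invocation (values purely illustrative): run 4 vCPUs, each touching
 * its own 2G region:
 *
 *      ./access_tracking_perf_test -v 4 -b 2G
 */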