1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * KVM dirty page logging test 4 * 5 * Copyright (C) 2018, Red Hat, Inc. 6 */ 7 8 #define _GNU_SOURCE /* for program_invocation_name */ 9 10 #include <stdio.h> 11 #include <stdlib.h> 12 #include <unistd.h> 13 #include <time.h> 14 #include <pthread.h> 15 #include <linux/bitmap.h> 16 #include <linux/bitops.h> 17 18 #include "test_util.h" 19 #include "kvm_util.h" 20 #include "processor.h" 21 22 #define VCPU_ID 1 23 24 /* The memory slot index to track dirty pages */ 25 #define TEST_MEM_SLOT_INDEX 1 26 27 /* Default guest test virtual memory offset */ 28 #define DEFAULT_GUEST_TEST_MEM 0xc0000000 29 30 /* How many pages to dirty for each guest loop */ 31 #define TEST_PAGES_PER_LOOP 1024 32 33 /* How many host loops to run (one KVM_GET_DIRTY_LOG for each loop) */ 34 #define TEST_HOST_LOOP_N 32UL 35 36 /* Interval for each host loop (ms) */ 37 #define TEST_HOST_LOOP_INTERVAL 10UL 38 39 /* Dirty bitmaps are always little endian, so we need to swap on big endian */ 40 #if defined(__s390x__) 41 # define BITOP_LE_SWIZZLE ((BITS_PER_LONG-1) & ~0x7) 42 # define test_bit_le(nr, addr) \ 43 test_bit((nr) ^ BITOP_LE_SWIZZLE, addr) 44 # define set_bit_le(nr, addr) \ 45 set_bit((nr) ^ BITOP_LE_SWIZZLE, addr) 46 # define clear_bit_le(nr, addr) \ 47 clear_bit((nr) ^ BITOP_LE_SWIZZLE, addr) 48 # define test_and_set_bit_le(nr, addr) \ 49 test_and_set_bit((nr) ^ BITOP_LE_SWIZZLE, addr) 50 # define test_and_clear_bit_le(nr, addr) \ 51 test_and_clear_bit((nr) ^ BITOP_LE_SWIZZLE, addr) 52 #else 53 # define test_bit_le test_bit 54 # define set_bit_le set_bit 55 # define clear_bit_le clear_bit 56 # define test_and_set_bit_le test_and_set_bit 57 # define test_and_clear_bit_le test_and_clear_bit 58 #endif 59 60 /* 61 * Guest/Host shared variables. Ensure addr_gva2hva() and/or 62 * sync_global_to/from_guest() are used when accessing from 63 * the host. READ/WRITE_ONCE() should also be used with anything 64 * that may change. 65 */ 66 static uint64_t host_page_size; 67 static uint64_t guest_page_size; 68 static uint64_t guest_num_pages; 69 static uint64_t random_array[TEST_PAGES_PER_LOOP]; 70 static uint64_t iteration; 71 72 /* 73 * Guest physical memory offset of the testing memory slot. 74 * This will be set to the topmost valid physical address minus 75 * the test memory size. 76 */ 77 static uint64_t guest_test_phys_mem; 78 79 /* 80 * Guest virtual memory offset of the testing memory slot. 81 * Must not conflict with identity mapped test code. 82 */ 83 static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM; 84 85 /* 86 * Continuously write to the first 8 bytes of a random pages within 87 * the testing memory region. 88 */ 89 static void guest_code(void) 90 { 91 uint64_t addr; 92 int i; 93 94 /* 95 * On s390x, all pages of a 1M segment are initially marked as dirty 96 * when a page of the segment is written to for the very first time. 97 * To compensate this specialty in this test, we need to touch all 98 * pages during the first iteration. 99 */ 100 for (i = 0; i < guest_num_pages; i++) { 101 addr = guest_test_virt_mem + i * guest_page_size; 102 *(uint64_t *)addr = READ_ONCE(iteration); 103 } 104 105 while (true) { 106 for (i = 0; i < TEST_PAGES_PER_LOOP; i++) { 107 addr = guest_test_virt_mem; 108 addr += (READ_ONCE(random_array[i]) % guest_num_pages) 109 * guest_page_size; 110 addr &= ~(host_page_size - 1); 111 *(uint64_t *)addr = READ_ONCE(iteration); 112 } 113 114 /* Tell the host that we need more random numbers */ 115 GUEST_SYNC(1); 116 } 117 } 118 119 /* Host variables */ 120 static bool host_quit; 121 122 /* Points to the test VM memory region on which we track dirty logs */ 123 static void *host_test_mem; 124 static uint64_t host_num_pages; 125 126 /* For statistics only */ 127 static uint64_t host_dirty_count; 128 static uint64_t host_clear_count; 129 static uint64_t host_track_next_count; 130 131 /* 132 * We use this bitmap to track some pages that should have its dirty 133 * bit set in the _next_ iteration. For example, if we detected the 134 * page value changed to current iteration but at the same time the 135 * page bit is cleared in the latest bitmap, then the system must 136 * report that write in the next get dirty log call. 137 */ 138 static unsigned long *host_bmap_track; 139 140 static void generate_random_array(uint64_t *guest_array, uint64_t size) 141 { 142 uint64_t i; 143 144 for (i = 0; i < size; i++) 145 guest_array[i] = random(); 146 } 147 148 static void *vcpu_worker(void *data) 149 { 150 int ret; 151 struct kvm_vm *vm = data; 152 uint64_t *guest_array; 153 uint64_t pages_count = 0; 154 struct kvm_run *run; 155 156 run = vcpu_state(vm, VCPU_ID); 157 158 guest_array = addr_gva2hva(vm, (vm_vaddr_t)random_array); 159 generate_random_array(guest_array, TEST_PAGES_PER_LOOP); 160 161 while (!READ_ONCE(host_quit)) { 162 /* Let the guest dirty the random pages */ 163 ret = _vcpu_run(vm, VCPU_ID); 164 TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret); 165 if (get_ucall(vm, VCPU_ID, NULL) == UCALL_SYNC) { 166 pages_count += TEST_PAGES_PER_LOOP; 167 generate_random_array(guest_array, TEST_PAGES_PER_LOOP); 168 } else { 169 TEST_FAIL("Invalid guest sync status: " 170 "exit_reason=%s\n", 171 exit_reason_str(run->exit_reason)); 172 } 173 } 174 175 pr_info("Dirtied %"PRIu64" pages\n", pages_count); 176 177 return NULL; 178 } 179 180 static void vm_dirty_log_verify(enum vm_guest_mode mode, unsigned long *bmap) 181 { 182 uint64_t step = vm_num_host_pages(mode, 1); 183 uint64_t page; 184 uint64_t *value_ptr; 185 186 for (page = 0; page < host_num_pages; page += step) { 187 value_ptr = host_test_mem + page * host_page_size; 188 189 /* If this is a special page that we were tracking... */ 190 if (test_and_clear_bit_le(page, host_bmap_track)) { 191 host_track_next_count++; 192 TEST_ASSERT(test_bit_le(page, bmap), 193 "Page %"PRIu64" should have its dirty bit " 194 "set in this iteration but it is missing", 195 page); 196 } 197 198 if (test_bit_le(page, bmap)) { 199 host_dirty_count++; 200 /* 201 * If the bit is set, the value written onto 202 * the corresponding page should be either the 203 * previous iteration number or the current one. 204 */ 205 TEST_ASSERT(*value_ptr == iteration || 206 *value_ptr == iteration - 1, 207 "Set page %"PRIu64" value %"PRIu64 208 " incorrect (iteration=%"PRIu64")", 209 page, *value_ptr, iteration); 210 } else { 211 host_clear_count++; 212 /* 213 * If cleared, the value written can be any 214 * value smaller or equals to the iteration 215 * number. Note that the value can be exactly 216 * (iteration-1) if that write can happen 217 * like this: 218 * 219 * (1) increase loop count to "iteration-1" 220 * (2) write to page P happens (with value 221 * "iteration-1") 222 * (3) get dirty log for "iteration-1"; we'll 223 * see that page P bit is set (dirtied), 224 * and not set the bit in host_bmap_track 225 * (4) increase loop count to "iteration" 226 * (which is current iteration) 227 * (5) get dirty log for current iteration, 228 * we'll see that page P is cleared, with 229 * value "iteration-1". 230 */ 231 TEST_ASSERT(*value_ptr <= iteration, 232 "Clear page %"PRIu64" value %"PRIu64 233 " incorrect (iteration=%"PRIu64")", 234 page, *value_ptr, iteration); 235 if (*value_ptr == iteration) { 236 /* 237 * This page is _just_ modified; it 238 * should report its dirtyness in the 239 * next run 240 */ 241 set_bit_le(page, host_bmap_track); 242 } 243 } 244 } 245 } 246 247 static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid, 248 uint64_t extra_mem_pages, void *guest_code) 249 { 250 struct kvm_vm *vm; 251 uint64_t extra_pg_pages = extra_mem_pages / 512 * 2; 252 253 pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); 254 255 vm = _vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR); 256 kvm_vm_elf_load(vm, program_invocation_name, 0, 0); 257 #ifdef __x86_64__ 258 vm_create_irqchip(vm); 259 #endif 260 vm_vcpu_add_default(vm, vcpuid, guest_code); 261 return vm; 262 } 263 264 #define DIRTY_MEM_BITS 30 /* 1G */ 265 #define PAGE_SHIFT_4K 12 266 267 #ifdef USE_CLEAR_DIRTY_LOG 268 static u64 dirty_log_manual_caps; 269 #endif 270 271 static void run_test(enum vm_guest_mode mode, unsigned long iterations, 272 unsigned long interval, uint64_t phys_offset) 273 { 274 pthread_t vcpu_thread; 275 struct kvm_vm *vm; 276 unsigned long *bmap; 277 278 /* 279 * We reserve page table for 2 times of extra dirty mem which 280 * will definitely cover the original (1G+) test range. Here 281 * we do the calculation with 4K page size which is the 282 * smallest so the page number will be enough for all archs 283 * (e.g., 64K page size guest will need even less memory for 284 * page tables). 285 */ 286 vm = create_vm(mode, VCPU_ID, 287 2ul << (DIRTY_MEM_BITS - PAGE_SHIFT_4K), 288 guest_code); 289 290 guest_page_size = vm_get_page_size(vm); 291 /* 292 * A little more than 1G of guest page sized pages. Cover the 293 * case where the size is not aligned to 64 pages. 294 */ 295 guest_num_pages = (1ul << (DIRTY_MEM_BITS - 296 vm_get_page_shift(vm))) + 3; 297 guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages); 298 299 host_page_size = getpagesize(); 300 host_num_pages = vm_num_host_pages(mode, guest_num_pages); 301 302 if (!phys_offset) { 303 guest_test_phys_mem = (vm_get_max_gfn(vm) - 304 guest_num_pages) * guest_page_size; 305 guest_test_phys_mem &= ~(host_page_size - 1); 306 } else { 307 guest_test_phys_mem = phys_offset; 308 } 309 310 #ifdef __s390x__ 311 /* Align to 1M (segment size) */ 312 guest_test_phys_mem &= ~((1 << 20) - 1); 313 #endif 314 315 pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem); 316 317 bmap = bitmap_alloc(host_num_pages); 318 host_bmap_track = bitmap_alloc(host_num_pages); 319 320 #ifdef USE_CLEAR_DIRTY_LOG 321 struct kvm_enable_cap cap = {}; 322 323 cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2; 324 cap.args[0] = dirty_log_manual_caps; 325 vm_enable_cap(vm, &cap); 326 #endif 327 328 /* Add an extra memory slot for testing dirty logging */ 329 vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 330 guest_test_phys_mem, 331 TEST_MEM_SLOT_INDEX, 332 guest_num_pages, 333 KVM_MEM_LOG_DIRTY_PAGES); 334 335 /* Do mapping for the dirty track memory slot */ 336 virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages, 0); 337 338 /* Cache the HVA pointer of the region */ 339 host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)guest_test_phys_mem); 340 341 #ifdef __x86_64__ 342 vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); 343 #endif 344 ucall_init(vm, NULL); 345 346 /* Export the shared variables to the guest */ 347 sync_global_to_guest(vm, host_page_size); 348 sync_global_to_guest(vm, guest_page_size); 349 sync_global_to_guest(vm, guest_test_virt_mem); 350 sync_global_to_guest(vm, guest_num_pages); 351 352 /* Start the iterations */ 353 iteration = 1; 354 sync_global_to_guest(vm, iteration); 355 host_quit = false; 356 host_dirty_count = 0; 357 host_clear_count = 0; 358 host_track_next_count = 0; 359 360 pthread_create(&vcpu_thread, NULL, vcpu_worker, vm); 361 362 while (iteration < iterations) { 363 /* Give the vcpu thread some time to dirty some pages */ 364 usleep(interval * 1000); 365 kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap); 366 #ifdef USE_CLEAR_DIRTY_LOG 367 kvm_vm_clear_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap, 0, 368 host_num_pages); 369 #endif 370 vm_dirty_log_verify(mode, bmap); 371 iteration++; 372 sync_global_to_guest(vm, iteration); 373 } 374 375 /* Tell the vcpu thread to quit */ 376 host_quit = true; 377 pthread_join(vcpu_thread, NULL); 378 379 pr_info("Total bits checked: dirty (%"PRIu64"), clear (%"PRIu64"), " 380 "track_next (%"PRIu64")\n", host_dirty_count, host_clear_count, 381 host_track_next_count); 382 383 free(bmap); 384 free(host_bmap_track); 385 ucall_uninit(vm); 386 kvm_vm_free(vm); 387 } 388 389 struct guest_mode { 390 bool supported; 391 bool enabled; 392 }; 393 static struct guest_mode guest_modes[NUM_VM_MODES]; 394 395 #define guest_mode_init(mode, supported, enabled) ({ \ 396 guest_modes[mode] = (struct guest_mode){ supported, enabled }; \ 397 }) 398 399 static void help(char *name) 400 { 401 int i; 402 403 puts(""); 404 printf("usage: %s [-h] [-i iterations] [-I interval] " 405 "[-p offset] [-m mode]\n", name); 406 puts(""); 407 printf(" -i: specify iteration counts (default: %"PRIu64")\n", 408 TEST_HOST_LOOP_N); 409 printf(" -I: specify interval in ms (default: %"PRIu64" ms)\n", 410 TEST_HOST_LOOP_INTERVAL); 411 printf(" -p: specify guest physical test memory offset\n" 412 " Warning: a low offset can conflict with the loaded test code.\n"); 413 printf(" -m: specify the guest mode ID to test " 414 "(default: test all supported modes)\n" 415 " This option may be used multiple times.\n" 416 " Guest mode IDs:\n"); 417 for (i = 0; i < NUM_VM_MODES; ++i) { 418 printf(" %d: %s%s\n", i, vm_guest_mode_string(i), 419 guest_modes[i].supported ? " (supported)" : ""); 420 } 421 puts(""); 422 exit(0); 423 } 424 425 int main(int argc, char *argv[]) 426 { 427 unsigned long iterations = TEST_HOST_LOOP_N; 428 unsigned long interval = TEST_HOST_LOOP_INTERVAL; 429 bool mode_selected = false; 430 uint64_t phys_offset = 0; 431 unsigned int mode; 432 int opt, i; 433 434 #ifdef USE_CLEAR_DIRTY_LOG 435 dirty_log_manual_caps = 436 kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); 437 if (!dirty_log_manual_caps) { 438 print_skip("KVM_CLEAR_DIRTY_LOG not available"); 439 exit(KSFT_SKIP); 440 } 441 dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | 442 KVM_DIRTY_LOG_INITIALLY_SET); 443 #endif 444 445 #ifdef __x86_64__ 446 guest_mode_init(VM_MODE_PXXV48_4K, true, true); 447 #endif 448 #ifdef __aarch64__ 449 guest_mode_init(VM_MODE_P40V48_4K, true, true); 450 guest_mode_init(VM_MODE_P40V48_64K, true, true); 451 452 { 453 unsigned int limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE); 454 455 if (limit >= 52) 456 guest_mode_init(VM_MODE_P52V48_64K, true, true); 457 if (limit >= 48) { 458 guest_mode_init(VM_MODE_P48V48_4K, true, true); 459 guest_mode_init(VM_MODE_P48V48_64K, true, true); 460 } 461 } 462 #endif 463 #ifdef __s390x__ 464 guest_mode_init(VM_MODE_P40V48_4K, true, true); 465 #endif 466 467 while ((opt = getopt(argc, argv, "hi:I:p:m:")) != -1) { 468 switch (opt) { 469 case 'i': 470 iterations = strtol(optarg, NULL, 10); 471 break; 472 case 'I': 473 interval = strtol(optarg, NULL, 10); 474 break; 475 case 'p': 476 phys_offset = strtoull(optarg, NULL, 0); 477 break; 478 case 'm': 479 if (!mode_selected) { 480 for (i = 0; i < NUM_VM_MODES; ++i) 481 guest_modes[i].enabled = false; 482 mode_selected = true; 483 } 484 mode = strtoul(optarg, NULL, 10); 485 TEST_ASSERT(mode < NUM_VM_MODES, 486 "Guest mode ID %d too big", mode); 487 guest_modes[mode].enabled = true; 488 break; 489 case 'h': 490 default: 491 help(argv[0]); 492 break; 493 } 494 } 495 496 TEST_ASSERT(iterations > 2, "Iterations must be greater than two"); 497 TEST_ASSERT(interval > 0, "Interval must be greater than zero"); 498 499 pr_info("Test iterations: %"PRIu64", interval: %"PRIu64" (ms)\n", 500 iterations, interval); 501 502 srandom(time(0)); 503 504 for (i = 0; i < NUM_VM_MODES; ++i) { 505 if (!guest_modes[i].enabled) 506 continue; 507 TEST_ASSERT(guest_modes[i].supported, 508 "Guest mode ID %d (%s) not supported.", 509 i, vm_guest_mode_string(i)); 510 run_test(i, iterations, interval, phys_offset); 511 } 512 513 return 0; 514 } 515