// SPDX-License-Identifier: GPL-2.0-only
#define _GNU_SOURCE /* for program_invocation_short_name */
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
#include <syscall.h>
#include <sys/ioctl.h>
#include <sys/sysinfo.h>
#include <asm/barrier.h>
#include <linux/atomic.h>
#include <linux/rseq.h>
#include <linux/unistd.h>

#include "kvm_util.h"
#include "processor.h"
#include "test_util.h"

#include "../rseq/rseq.c"

/*
 * Any bug related to task migration is likely to be timing-dependent; perform
 * a large number of migrations to reduce the odds of a false negative.
 */
#define NR_TASK_MIGRATIONS 100000

static pthread_t migration_thread;
static cpu_set_t possible_mask;
static int min_cpu, max_cpu;
static bool done;

static atomic_t seq_cnt;

static void guest_code(void)
{
	for (;;)
		GUEST_SYNC(0);
}

static int next_cpu(int cpu)
{
	/*
	 * Advance to the next CPU, skipping those that weren't in the original
	 * affinity set.  Sadly, there is no CPU_SET_FOR_EACH, and cpu_set_t's
	 * data storage is considered opaque.  Note, if this task is pinned to
	 * a small set of discontiguous CPUs, e.g. 2 and 1023, this loop will
	 * burn a lot of cycles and the test will take longer than normal to
	 * complete.
	 */
	do {
		cpu++;
		if (cpu > max_cpu) {
			cpu = min_cpu;
			TEST_ASSERT(CPU_ISSET(cpu, &possible_mask),
				    "Min CPU = %d must always be usable", cpu);
			break;
		}
	} while (!CPU_ISSET(cpu, &possible_mask));

	return cpu;
}

static void *migration_worker(void *__rseq_tid)
{
	pid_t rseq_tid = (pid_t)(unsigned long)__rseq_tid;
	cpu_set_t allowed_mask;
	int r, i, cpu;

	CPU_ZERO(&allowed_mask);

	for (i = 0, cpu = min_cpu; i < NR_TASK_MIGRATIONS; i++, cpu = next_cpu(cpu)) {
		CPU_SET(cpu, &allowed_mask);

		/*
		 * Bump the sequence count twice to allow the reader to detect
		 * that a migration may have occurred in between rseq and sched
		 * CPU ID reads.  An odd sequence count indicates a migration
		 * is in-progress, while a completely different count indicates
		 * a migration occurred since the count was last read.
		 */
		atomic_inc(&seq_cnt);

		/*
		 * Ensure the odd count is visible while getcpu() isn't
		 * stable, i.e. while changing affinity is in-progress.
		 */
		smp_wmb();
		r = sched_setaffinity(rseq_tid, sizeof(allowed_mask), &allowed_mask);
		TEST_ASSERT(!r, "sched_setaffinity failed, errno = %d (%s)",
			    errno, strerror(errno));
		smp_wmb();
		atomic_inc(&seq_cnt);

		CPU_CLR(cpu, &allowed_mask);

		/*
		 * Wait 1-10us before proceeding to the next iteration and,
		 * more specifically, before bumping seq_cnt again.  A delay
		 * is needed on three fronts:
		 *
		 *   1. To allow sched_setaffinity() to prompt migration before
		 *      ioctl(KVM_RUN) enters the guest so that TIF_NOTIFY_RESUME
		 *      (or TIF_NEED_RESCHED, which indirectly leads to handling
		 *      NOTIFY_RESUME) is handled in KVM context.
	 *
		 *      If NOTIFY_RESUME/NEED_RESCHED is set after KVM enters
		 *      the guest, the guest will trigger an IO/MMIO exit all
		 *      the way to userspace and the TIF flags will be handled
		 *      by the generic "exit to userspace" logic, not by KVM.
		 *      The exit to userspace is necessary to give the test a
		 *      chance to check the rseq CPU ID (see #2).
		 *
		 *      Alternatively, guest_code() could include an instruction
		 *      to trigger an exit that is handled by KVM, but any such
		 *      exit requires architecture-specific code.
		 *
		 *   2. To let ioctl(KVM_RUN) make its way back to the test
		 *      before the next round of migration.  The test's check on
		 *      the rseq CPU ID must wait for migration to complete in
		 *      order to avoid false positives, thus any kernel rseq bug
		 *      will be missed if the next migration starts before the
		 *      check completes.
		 *
		 *   3. To ensure the read-side makes efficient forward progress,
		 *      e.g. if getcpu() involves a syscall.  Stalling the
		 *      read-side means the test will spend more time waiting
		 *      for getcpu() to stabilize and less time trying to hit
		 *      the timing-dependent bug.
		 *
		 * Because any bug in this area is likely to be timing-dependent,
		 * run with a range of delays at 1us intervals from 1us to 10us
		 * as a best effort to avoid tuning the test to the point where
		 * it can hit _only_ the original bug and not detect future
		 * regressions.
		 *
		 * The original bug can reproduce with a delay up to ~500us on
		 * x86-64, but starts to require more iterations to reproduce
		 * as the delay creeps above ~10us, and the average runtime of
		 * each iteration obviously increases as well.  Cap the delay
		 * at 10us to keep test runtime reasonable while minimizing
		 * potential coverage loss.
		 *
		 * The lower bound for reproducing the bug is likely below 1us,
		 * e.g. failures occur on x86-64 with nanosleep(0), but at that
		 * point the overhead of the syscall likely dominates the delay.
		 * Use usleep() for simplicity and to avoid unnecessary kernel
		 * dependencies.
	 */
		usleep((i % 10) + 1);
	}
	done = true;
	return NULL;
}

static void calc_min_max_cpu(void)
{
	int i, cnt, nproc;

	TEST_REQUIRE(CPU_COUNT(&possible_mask) >= 2);

	/*
	 * CPU_SET doesn't provide a FOR_EACH helper; get the min/max CPU that
	 * this task is affined to in order to reduce the time spent querying
	 * unusable CPUs, e.g. if this task is pinned to a small percentage of
	 * total CPUs.
	 */
	nproc = get_nprocs_conf();
	min_cpu = -1;
	max_cpu = -1;
	cnt = 0;

	for (i = 0; i < nproc; i++) {
		if (!CPU_ISSET(i, &possible_mask))
			continue;
		if (min_cpu == -1)
			min_cpu = i;
		max_cpu = i;
		cnt++;
	}

	__TEST_REQUIRE(cnt >= 2,
		       "Only one usable CPU, task migration not possible");
}

int main(int argc, char *argv[])
{
	int r, i, snapshot;
	struct kvm_vm *vm;
	struct kvm_vcpu *vcpu;
	u32 cpu, rseq_cpu;

	r = sched_getaffinity(0, sizeof(possible_mask), &possible_mask);
	TEST_ASSERT(!r, "sched_getaffinity failed, errno = %d (%s)", errno,
		    strerror(errno));

	calc_min_max_cpu();

	r = rseq_register_current_thread();
	TEST_ASSERT(!r, "rseq_register_current_thread failed, errno = %d (%s)",
		    errno, strerror(errno));

	/*
	 * Create and run a dummy VM that immediately exits to userspace via
	 * GUEST_SYNC, while concurrently migrating the process by setting its
	 * CPU affinity.
	 */
	vm = vm_create_with_one_vcpu(&vcpu, guest_code);

	pthread_create(&migration_thread, NULL, migration_worker,
		       (void *)(unsigned long)syscall(SYS_gettid));

	for (i = 0; !done; i++) {
		vcpu_run(vcpu);
		TEST_ASSERT(get_ucall(vcpu, NULL) == UCALL_SYNC,
			    "Guest failed?");

		/*
		 * Verify rseq's CPU matches sched's CPU.  Ensure migration
		 * doesn't occur between getcpu() and reading the rseq cpu_id
		 * by rereading both if the sequence count changes, or if the
		 * count is odd (migration in-progress).
		 */
		do {
			/*
			 * Drop bit 0 to force a mismatch if the count is odd,
			 * i.e. if a migration is in-progress.
			 */
			snapshot = atomic_read(&seq_cnt) & ~1;

			/*
			 * Ensure calling getcpu() and reading rseq.cpu_id
			 * complete in a single "no migration" window, i.e. are
			 * not reordered across the seq_cnt reads.
			 */
			smp_rmb();
			r = sys_getcpu(&cpu, NULL);
			TEST_ASSERT(!r, "getcpu failed, errno = %d (%s)",
				    errno, strerror(errno));
			rseq_cpu = rseq_current_cpu_raw();
			smp_rmb();
		} while (snapshot != atomic_read(&seq_cnt));

		TEST_ASSERT(rseq_cpu == cpu,
			    "rseq CPU = %d, sched CPU = %d\n", rseq_cpu, cpu);
	}

	/*
	 * Sanity check that the test was able to enter the guest a reasonable
	 * number of times, e.g. didn't get stalled too often/long waiting for
	 * getcpu() to stabilize.  A 2:1 migration:KVM_RUN ratio is a fairly
	 * conservative ratio on x86-64, which can do _more_ KVM_RUNs than
	 * migrations given the 1us+ delay in the migration task.
	 */
	TEST_ASSERT(i > (NR_TASK_MIGRATIONS / 2),
		    "Only performed %d KVM_RUNs, task stalled too much?\n", i);

	pthread_join(migration_thread, NULL);

	kvm_vm_free(vm);

	rseq_unregister_current_thread();

	return 0;
}