// SPDX-License-Identifier: GPL-2.0-only
#define _GNU_SOURCE /* for program_invocation_short_name */
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
#include <syscall.h>
#include <sys/ioctl.h>
#include <asm/barrier.h>
#include <linux/atomic.h>
#include <linux/rseq.h>
#include <linux/unistd.h>

#include "kvm_util.h"
#include "processor.h"
#include "test_util.h"

#define VCPU_ID 0

static __thread volatile struct rseq __rseq = {
	.cpu_id = RSEQ_CPU_ID_UNINITIALIZED,
};

/*
 * Use an arbitrary, bogus signature for configuring rseq, this test does not
 * actually enter an rseq critical section.
 */
#define RSEQ_SIG 0xdeadbeef

/*
 * Any bug related to task migration is likely to be timing-dependent; perform
 * a large number of migrations to reduce the odds of a false negative.
 */
#define NR_TASK_MIGRATIONS 100000

static pthread_t migration_thread;
static cpu_set_t possible_mask;
static bool done;

static atomic_t seq_cnt;

static void guest_code(void)
{
	for (;;)
		GUEST_SYNC(0);
}

static void sys_rseq(int flags)
{
	int r;

	r = syscall(__NR_rseq, &__rseq, sizeof(__rseq), flags, RSEQ_SIG);
	TEST_ASSERT(!r, "rseq failed, errno = %d (%s)", errno, strerror(errno));
}

static void *migration_worker(void *ign)
{
	cpu_set_t allowed_mask;
	int r, i, nr_cpus, cpu;

	CPU_ZERO(&allowed_mask);

	nr_cpus = CPU_COUNT(&possible_mask);

	for (i = 0; i < NR_TASK_MIGRATIONS; i++) {
		cpu = i % nr_cpus;
		if (!CPU_ISSET(cpu, &possible_mask))
			continue;

		CPU_SET(cpu, &allowed_mask);

		/*
		 * Bump the sequence count twice to allow the reader to detect
		 * that a migration may have occurred in between rseq and sched
		 * CPU ID reads.  An odd sequence count indicates a migration
		 * is in-progress, while a completely different count indicates
		 * a migration occurred since the count was last read.
		 */
		atomic_inc(&seq_cnt);

		/*
		 * Ensure the odd count is visible while sched_getcpu() isn't
		 * stable, i.e. while changing affinity is in-progress.
		 */
		smp_wmb();
		r = sched_setaffinity(0, sizeof(allowed_mask), &allowed_mask);
		TEST_ASSERT(!r, "sched_setaffinity failed, errno = %d (%s)",
			    errno, strerror(errno));
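
		/*
		 * Pairs with the reader's smp_rmb()s in main(): ensure the
		 * affinity change is ordered before the count is bumped back
		 * to an even value, i.e. before telling the reader that the
		 * migration (if any) has completed.
		 */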
		smp_wmb();
		atomic_inc(&seq_cnt);

		CPU_CLR(cpu, &allowed_mask);

		/*
		 * Wait 1-10us before proceeding to the next iteration and more
		 * specifically, before bumping seq_cnt again.  A delay is
		 * needed on three fronts:
		 *
		 * 1. To allow sched_setaffinity() to prompt migration before
		 *    ioctl(KVM_RUN) enters the guest so that TIF_NOTIFY_RESUME
		 *    (or TIF_NEED_RESCHED, which indirectly leads to handling
		 *    NOTIFY_RESUME) is handled in KVM context.
		 *
		 *    If NOTIFY_RESUME/NEED_RESCHED is set after KVM enters
		 *    the guest, the guest will trigger an IO/MMIO exit all the
		 *    way to userspace and the TIF flags will be handled by
		 *    the generic "exit to userspace" logic, not by KVM.  The
		 *    exit to userspace is necessary to give the test a chance
		 *    to check the rseq CPU ID (see #2).
		 *
		 *    Alternatively, guest_code() could include an instruction
		 *    to trigger an exit that is handled by KVM, but any such
		 *    exit requires architecture specific code.
		 *
		 * 2. To let ioctl(KVM_RUN) make its way back to the test
		 *    before the next round of migration.  The test's check on
		 *    the rseq CPU ID must wait for migration to complete in
		 *    order to avoid a false positive, thus any kernel rseq bug
		 *    will be missed if the next migration starts before the
		 *    check completes.
		 *
		 * 3. To ensure the read-side makes efficient forward progress,
		 *    e.g. if sched_getcpu() involves a syscall.  Stalling the
		 *    read-side means the test will spend more time waiting for
		 *    sched_getcpu() to stabilize and less time trying to hit
		 *    the timing-dependent bug.
		 *
		 * Because any bug in this area is likely to be timing-dependent,
		 * run with a range of delays at 1us intervals from 1us to 10us
		 * as a best effort to avoid tuning the test to the point where
		 * it can hit _only_ the original bug and not detect future
		 * regressions.
		 *
		 * The original bug can reproduce with a delay up to ~500us on
		 * x86-64, but starts to require more iterations to reproduce
		 * as the delay creeps above ~10us, and the average runtime of
		 * each iteration obviously increases as well.  Cap the delay
		 * at 10us to keep test runtime reasonable while minimizing
		 * potential coverage loss.
		 *
		 * The lower bound for reproducing the bug is likely below 1us,
		 * e.g. failures occur on x86-64 with nanosleep(0), but at that
		 * point the overhead of the syscall likely dominates the delay.
		 * Use usleep() for simplicity and to avoid unnecessary kernel
		 * dependencies.
		 */
		usleep((i % 10) + 1);
	}
	done = true;
	return NULL;
}
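
/*
 * Roughly, the seq_cnt handshake between migration_worker() and main() is an
 * open-coded sequence counter.  The writer (migration_worker) brackets the
 * affinity change with two increments:
 *
 *	atomic_inc(&seq_cnt);		<- count becomes odd
 *	smp_wmb();
 *	sched_setaffinity(...);
 *	smp_wmb();
 *	atomic_inc(&seq_cnt);		<- count becomes even again
 *
 * and the reader (main) only trusts its sched_getcpu()/rseq.cpu_id pair if
 * the count was even and unchanged across both reads:
 *
 *	snapshot = atomic_read(&seq_cnt) & ~1;
 *	smp_rmb();
 *	cpu = sched_getcpu();
 *	rseq_cpu = READ_ONCE(__rseq.cpu_id);
 *	smp_rmb();
 *	retry unless snapshot == atomic_read(&seq_cnt);
 */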

int main(int argc, char *argv[])
{
	int r, i, snapshot;
	struct kvm_vm *vm;
	u32 cpu, rseq_cpu;

	/* Tell stdout not to buffer its content */
	setbuf(stdout, NULL);

	r = sched_getaffinity(0, sizeof(possible_mask), &possible_mask);
	TEST_ASSERT(!r, "sched_getaffinity failed, errno = %d (%s)", errno,
		    strerror(errno));

	if (CPU_COUNT(&possible_mask) < 2) {
		print_skip("Only one CPU, task migration not possible");
		exit(KSFT_SKIP);
	}

	sys_rseq(0);

	/*
	 * Create and run a dummy VM that immediately exits to userspace via
	 * GUEST_SYNC, while concurrently migrating the process by setting its
	 * CPU affinity.
	 */
	vm = vm_create_default(VCPU_ID, 0, guest_code);

	pthread_create(&migration_thread, NULL, migration_worker, NULL);

	for (i = 0; !done; i++) {
		vcpu_run(vm, VCPU_ID);
		TEST_ASSERT(get_ucall(vm, VCPU_ID, NULL) == UCALL_SYNC,
			    "Guest failed?");

		/*
		 * Verify rseq's CPU matches sched's CPU.  Ensure migration
		 * doesn't occur between sched_getcpu() and reading the rseq
		 * cpu_id by rereading both if the sequence count changes, or
		 * if the count is odd (migration in-progress).
		 */
		do {
			/*
			 * Drop bit 0 to force a mismatch if the count is odd,
			 * i.e. if a migration is in-progress.
			 */
			snapshot = atomic_read(&seq_cnt) & ~1;

			/*
			 * Ensure reading sched_getcpu() and rseq.cpu_id
			 * complete in a single "no migration" window, i.e. are
			 * not reordered across the seq_cnt reads.
			 */
			smp_rmb();
			cpu = sched_getcpu();
			rseq_cpu = READ_ONCE(__rseq.cpu_id);
			smp_rmb();
		} while (snapshot != atomic_read(&seq_cnt));

		TEST_ASSERT(rseq_cpu == cpu,
			    "rseq CPU = %d, sched CPU = %d\n", rseq_cpu, cpu);
	}

	/*
	 * Sanity check that the test was able to enter the guest a reasonable
	 * number of times, e.g. didn't get stalled too often/long waiting for
	 * sched_getcpu() to stabilize.  A 2:1 migration:KVM_RUN ratio is a
	 * fairly conservative ratio on x86-64, which can do _more_ KVM_RUNs
	 * than migrations given the 1us+ delay in the migration task.
	 */
	TEST_ASSERT(i > (NR_TASK_MIGRATIONS / 2),
		    "Only performed %d KVM_RUNs, task stalled too much?\n", i);

	pthread_join(migration_thread, NULL);

	kvm_vm_free(vm);

	sys_rseq(RSEQ_FLAG_UNREGISTER);

	return 0;
}