// SPDX-License-Identifier: GPL-2.0-only
#define _GNU_SOURCE /* for program_invocation_short_name */
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
#include <syscall.h>
#include <sys/ioctl.h>
#include <sys/sysinfo.h>
#include <asm/barrier.h>
#include <linux/atomic.h>
#include <linux/rseq.h>
#include <linux/unistd.h>

#include "kvm_util.h"
#include "processor.h"
#include "test_util.h"

#define VCPU_ID 0

static __thread volatile struct rseq __rseq = {
        .cpu_id = RSEQ_CPU_ID_UNINITIALIZED,
};

/*
 * Use an arbitrary, bogus signature for configuring rseq, this test does not
 * actually enter an rseq critical section.
 */
#define RSEQ_SIG 0xdeadbeef

/*
 * Any bug related to task migration is likely to be timing-dependent; perform
 * a large number of migrations to reduce the odds of a false negative.
 */
#define NR_TASK_MIGRATIONS 100000

static pthread_t migration_thread;
static cpu_set_t possible_mask;
static int min_cpu, max_cpu;
static bool done;

static atomic_t seq_cnt;

/* Loop forever in the guest, bouncing back to userspace via GUEST_SYNC. */
static void guest_code(void)
{
        for (;;)
                GUEST_SYNC(0);
}

/* Register (flags = 0) or unregister (RSEQ_FLAG_UNREGISTER) this thread's rseq area. */
static void sys_rseq(int flags)
{
        int r;

        r = syscall(__NR_rseq, &__rseq, sizeof(__rseq), flags, RSEQ_SIG);
        TEST_ASSERT(!r, "rseq failed, errno = %d (%s)", errno, strerror(errno));
}

static int next_cpu(int cpu)
{
        /*
         * Advance to the next CPU, skipping those that weren't in the original
         * affinity set.  Sadly, there is no CPU_SET_FOR_EACH, and cpu_set_t's
         * data storage is considered opaque.  Note, if this task is pinned to
         * a small set of discontiguous CPUs, e.g. 2 and 1023, this loop will
         * burn a lot of cycles and the test will take longer than normal to
         * complete.
         */
        do {
                cpu++;
                if (cpu > max_cpu) {
                        cpu = min_cpu;
                        TEST_ASSERT(CPU_ISSET(cpu, &possible_mask),
                                    "Min CPU = %d must always be usable", cpu);
                        break;
                }
        } while (!CPU_ISSET(cpu, &possible_mask));

        return cpu;
}

static void *migration_worker(void *__rseq_tid)
{
        pid_t rseq_tid = (pid_t)(unsigned long)__rseq_tid;
        cpu_set_t allowed_mask;
        int r, i, cpu;

        CPU_ZERO(&allowed_mask);

        for (i = 0, cpu = min_cpu; i < NR_TASK_MIGRATIONS; i++, cpu = next_cpu(cpu)) {
                CPU_SET(cpu, &allowed_mask);

                /*
                 * Bump the sequence count twice to allow the reader to detect
                 * that a migration may have occurred in between the rseq and
                 * sched CPU ID reads.  An odd sequence count indicates a
                 * migration is in-progress, while a completely different count
                 * indicates a migration occurred since the count was last read.
                 */
                atomic_inc(&seq_cnt);

                /*
                 * Ensure the odd count is visible while sched_getcpu() isn't
                 * stable, i.e. while changing affinity is in-progress.
                 */
                smp_wmb();
                r = sched_setaffinity(rseq_tid, sizeof(allowed_mask), &allowed_mask);
                TEST_ASSERT(!r, "sched_setaffinity failed, errno = %d (%s)",
                            errno, strerror(errno));
                smp_wmb();
                atomic_inc(&seq_cnt);

                CPU_CLR(cpu, &allowed_mask);

                /*
                 * Wait 1-10us before proceeding to the next iteration and,
                 * more specifically, before bumping seq_cnt again.  A delay
                 * is needed on three fronts:
                 *
                 *  1. To allow sched_setaffinity() to prompt migration before
                 *     ioctl(KVM_RUN) enters the guest so that TIF_NOTIFY_RESUME
                 *     (or TIF_NEED_RESCHED, which indirectly leads to handling
                 *     NOTIFY_RESUME) is handled in KVM context.
                 *
                 *     If NOTIFY_RESUME/NEED_RESCHED is set after KVM enters
                 *     the guest, the guest will trigger an IO/MMIO exit all the
                 *     way to userspace and the TIF flags will be handled by
                 *     the generic "exit to userspace" logic, not by KVM.  The
                 *     exit to userspace is necessary to give the test a chance
                 *     to check the rseq CPU ID (see #2).
                 *
                 *     Alternatively, guest_code() could include an instruction
                 *     to trigger an exit that is handled by KVM, but any such
                 *     exit requires architecture specific code.
                 *
                 *  2. To let ioctl(KVM_RUN) make its way back to the test
                 *     before the next round of migration.  The test's check on
                 *     the rseq CPU ID must wait for migration to complete in
                 *     order to avoid a false positive, thus any kernel rseq bug
                 *     will be missed if the next migration starts before the
                 *     check completes.
                 *
                 *  3. To ensure the read-side makes efficient forward progress,
                 *     e.g. if sched_getcpu() involves a syscall.  Stalling the
                 *     read-side means the test will spend more time waiting for
                 *     sched_getcpu() to stabilize and less time trying to hit
                 *     the timing-dependent bug.
                 *
                 * Because any bug in this area is likely to be timing-dependent,
                 * run with a range of delays at 1us intervals from 1us to 10us
                 * as a best effort to avoid tuning the test to the point where
                 * it can hit _only_ the original bug and not detect future
                 * regressions.
                 *
                 * The original bug can reproduce with a delay up to ~500us on
                 * x86-64, but starts to require more iterations to reproduce
                 * as the delay creeps above ~10us, and the average runtime of
                 * each iteration obviously increases as well.  Cap the delay
                 * at 10us to keep test runtime reasonable while minimizing
                 * potential coverage loss.
                 *
                 * The lower bound for reproducing the bug is likely below 1us,
                 * e.g. failures occur on x86-64 with nanosleep(0), but at that
                 * point the overhead of the syscall likely dominates the delay.
                 * Use usleep() for simplicity and to avoid unnecessary kernel
                 * dependencies.
                 */
                usleep((i % 10) + 1);
        }
        done = true;
        return NULL;
}

static int calc_min_max_cpu(void)
{
        int i, cnt, nproc;

        if (CPU_COUNT(&possible_mask) < 2)
                return -EINVAL;

        /*
         * CPU_SET doesn't provide a FOR_EACH helper, get the min/max CPU that
         * this task is affined to in order to reduce the time spent querying
         * unusable CPUs, e.g. if this task is pinned to a small percentage of
         * total CPUs.
         */
        nproc = get_nprocs_conf();
        min_cpu = -1;
        max_cpu = -1;
        cnt = 0;

        for (i = 0; i < nproc; i++) {
                if (!CPU_ISSET(i, &possible_mask))
                        continue;
                if (min_cpu == -1)
                        min_cpu = i;
                max_cpu = i;
                cnt++;
        }

        return (cnt < 2) ? -EINVAL : 0;
}

int main(int argc, char *argv[])
{
        int r, i, snapshot;
        struct kvm_vm *vm;
        u32 cpu, rseq_cpu;

        /* Tell stdout not to buffer its content. */
        setbuf(stdout, NULL);

        r = sched_getaffinity(0, sizeof(possible_mask), &possible_mask);
        TEST_ASSERT(!r, "sched_getaffinity failed, errno = %d (%s)", errno,
                    strerror(errno));

        if (calc_min_max_cpu()) {
                print_skip("Only one usable CPU, task migration not possible");
                exit(KSFT_SKIP);
        }

        sys_rseq(0);

        /*
         * Create and run a dummy VM that immediately exits to userspace via
         * GUEST_SYNC, while concurrently migrating the process by setting its
         * CPU affinity.
         */
        vm = vm_create_default(VCPU_ID, 0, guest_code);
        ucall_init(vm, NULL);

        /*
         * Pass this thread's TID so the worker migrates the vCPU task, not
         * itself.  Use the raw syscall, as gettid() requires glibc 2.30+.
         */
        pthread_create(&migration_thread, NULL, migration_worker,
                       (void *)(unsigned long)syscall(SYS_gettid));

        for (i = 0; !done; i++) {
                vcpu_run(vm, VCPU_ID);
                TEST_ASSERT(get_ucall(vm, VCPU_ID, NULL) == UCALL_SYNC,
                            "Guest failed?");

                /*
                 * Verify rseq's CPU matches sched's CPU.  Ensure migration
                 * doesn't occur between sched_getcpu() and reading the rseq
                 * cpu_id by rereading both if the sequence count changes, or
                 * if the count is odd (migration in-progress).
                 */
                do {
                        /*
                         * Drop bit 0 to force a mismatch if the count is odd,
                         * i.e. if a migration is in-progress.
                         */
                        snapshot = atomic_read(&seq_cnt) & ~1;

                        /*
                         * Ensure calling sched_getcpu() and reading rseq.cpu_id
                         * complete in a single "no migration" window, i.e. are
                         * not reordered across the seq_cnt reads.
                         */
                        smp_rmb();
                        cpu = sched_getcpu();
                        rseq_cpu = READ_ONCE(__rseq.cpu_id);
                        smp_rmb();
                } while (snapshot != atomic_read(&seq_cnt));

                TEST_ASSERT(rseq_cpu == cpu,
                            "rseq CPU = %d, sched CPU = %d\n", rseq_cpu, cpu);
        }

        /*
         * Sanity check that the test was able to enter the guest a reasonable
         * number of times, e.g. didn't get stalled too often/long waiting for
         * sched_getcpu() to stabilize.  A 2:1 migration:KVM_RUN ratio is a
         * fairly conservative ratio on x86-64, which can do _more_ KVM_RUNs
         * than migrations given the 1us+ delay in the migration task.
         */
        TEST_ASSERT(i > (NR_TASK_MIGRATIONS / 2),
                    "Only performed %d KVM_RUNs, task stalled too much?\n", i);

        pthread_join(migration_thread, NULL);

        kvm_vm_free(vm);

        sys_rseq(RSEQ_FLAG_UNREGISTER);

        return 0;
}