// SPDX-License-Identifier: GPL-2.0-only
#define _GNU_SOURCE /* for program_invocation_short_name */
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
#include <syscall.h>
#include <sys/ioctl.h>
#include <sys/sysinfo.h>
#include <asm/barrier.h>
#include <linux/atomic.h>
#include <linux/rseq.h>
#include <linux/unistd.h>

#include "kvm_util.h"
#include "processor.h"
#include "test_util.h"

#include "../rseq/rseq.c"

/*
 * Any bug related to task migration is likely to be timing-dependent; perform
 * a large number of migrations to reduce the odds of a false negative.
 */
#define NR_TASK_MIGRATIONS 100000

static pthread_t migration_thread;
static cpu_set_t possible_mask;
static int min_cpu, max_cpu;
static bool done;

static atomic_t seq_cnt;

static void guest_code(void)
{
	for (;;)
		GUEST_SYNC(0);
}

/*
 * We have to perform a direct syscall for getcpu() because it's not
 * available in glibc until 2.29.
 */
static void sys_getcpu(unsigned *cpu)
{
	int r;

	r = syscall(__NR_getcpu, cpu, NULL, NULL);
	TEST_ASSERT(!r, "getcpu failed, errno = %d (%s)", errno, strerror(errno));
}

static int next_cpu(int cpu)
{
	/*
	 * Advance to the next CPU, skipping those that weren't in the original
	 * affinity set.  Sadly, there is no CPU_SET_FOR_EACH, and cpu_set_t's
	 * data storage is considered opaque.  Note, if this task is pinned to
	 * a small set of discontiguous CPUs, e.g. 2 and 1023, this loop will
	 * burn a lot of cycles and the test will take longer than normal to
	 * complete.
	 */
	do {
		cpu++;
		if (cpu > max_cpu) {
			cpu = min_cpu;
			TEST_ASSERT(CPU_ISSET(cpu, &possible_mask),
				    "Min CPU = %d must always be usable", cpu);
			break;
		}
	} while (!CPU_ISSET(cpu, &possible_mask));

	return cpu;
}

static void *migration_worker(void *__rseq_tid)
{
	pid_t rseq_tid = (pid_t)(unsigned long)__rseq_tid;
	cpu_set_t allowed_mask;
	int r, i, cpu;

	CPU_ZERO(&allowed_mask);

	for (i = 0, cpu = min_cpu; i < NR_TASK_MIGRATIONS; i++, cpu = next_cpu(cpu)) {
		CPU_SET(cpu, &allowed_mask);

		/*
		 * Bump the sequence count twice to allow the reader to detect
		 * that a migration may have occurred in between rseq and sched
		 * CPU ID reads.  An odd sequence count indicates a migration
		 * is in-progress, while a completely different count indicates
		 * a migration occurred since the count was last read.
		 */
		atomic_inc(&seq_cnt);

		/*
		 * Ensure the odd count is visible while getcpu() isn't
		 * stable, i.e. while changing affinity is in-progress.
		 */
		smp_wmb();
		r = sched_setaffinity(rseq_tid, sizeof(allowed_mask), &allowed_mask);
		TEST_ASSERT(!r, "sched_setaffinity failed, errno = %d (%s)",
			    errno, strerror(errno));
		smp_wmb();
		atomic_inc(&seq_cnt);

		CPU_CLR(cpu, &allowed_mask);

		/*
		 * Wait 1-10us before proceeding to the next iteration and more
		 * specifically, before bumping seq_cnt again.  A delay is
		 * needed on three fronts:
		 *
		 * 1. To allow sched_setaffinity() to prompt migration before
		 *    ioctl(KVM_RUN) enters the guest so that TIF_NOTIFY_RESUME
		 *    (or TIF_NEED_RESCHED, which indirectly leads to handling
		 *    NOTIFY_RESUME) is handled in KVM context.
		 *
		 *    If NOTIFY_RESUME/NEED_RESCHED is set after KVM enters
		 *    the guest, the guest will trigger an IO/MMIO exit all the
		 *    way to userspace and the TIF flags will be handled by
		 *    the generic "exit to userspace" logic, not by KVM.  The
		 *    exit to userspace is necessary to give the test a chance
		 *    to check the rseq CPU ID (see #2).
		 *
		 *    Alternatively, guest_code() could include an instruction
		 *    to trigger an exit that is handled by KVM, but any such
		 *    exit requires architecture specific code.
		 *
		 * 2. To let ioctl(KVM_RUN) make its way back to the test
		 *    before the next round of migration.  The test's check on
		 *    the rseq CPU ID must wait for migration to complete in
		 *    order to avoid false positives, thus any kernel rseq bug
		 *    will be missed if the next migration starts before the
		 *    check completes.
		 *
		 * 3. To ensure the read-side makes efficient forward progress,
		 *    e.g. if getcpu() involves a syscall.  Stalling the
		 *    read-side means the test will spend more time waiting for
		 *    getcpu() to stabilize and less time trying to hit the
		 *    timing-dependent bug.
		 *
		 * Because any bug in this area is likely to be timing-dependent,
		 * run with a range of delays at 1us intervals from 1us to 10us
		 * as a best effort to avoid tuning the test to the point where
		 * it can hit _only_ the original bug and not detect future
		 * regressions.
		 *
		 * The original bug can reproduce with a delay up to ~500us on
		 * x86-64, but starts to require more iterations to reproduce
		 * as the delay creeps above ~10us, and the average runtime of
		 * each iteration obviously increases as well.  Cap the delay
		 * at 10us to keep test runtime reasonable while minimizing
		 * potential coverage loss.
		 *
		 * The lower bound for reproducing the bug is likely below 1us,
		 * e.g. failures occur on x86-64 with nanosleep(0), but at that
		 * point the overhead of the syscall likely dominates the delay.
		 * Use usleep() for simplicity and to avoid unnecessary kernel
		 * dependencies.
		 */
		usleep((i % 10) + 1);
	}
	done = true;
	return NULL;
}

static void calc_min_max_cpu(void)
{
	int i, cnt, nproc;

	TEST_REQUIRE(CPU_COUNT(&possible_mask) >= 2);

	/*
	 * CPU_SET doesn't provide a FOR_EACH helper; get the min/max CPU that
	 * this task is affined to in order to reduce the time spent querying
	 * unusable CPUs, e.g. if this task is pinned to a small percentage of
	 * total CPUs.
	 */
	nproc = get_nprocs_conf();
	min_cpu = -1;
	max_cpu = -1;
	cnt = 0;

	for (i = 0; i < nproc; i++) {
		if (!CPU_ISSET(i, &possible_mask))
			continue;
		if (min_cpu == -1)
			min_cpu = i;
		max_cpu = i;
		cnt++;
	}

	__TEST_REQUIRE(cnt >= 2,
		       "Only one usable CPU, task migration not possible");
}

int main(int argc, char *argv[])
{
	int r, i, snapshot;
	struct kvm_vm *vm;
	struct kvm_vcpu *vcpu;
	u32 cpu, rseq_cpu;

	/* Tell stdout not to buffer its content */
	setbuf(stdout, NULL);

	r = sched_getaffinity(0, sizeof(possible_mask), &possible_mask);
	TEST_ASSERT(!r, "sched_getaffinity failed, errno = %d (%s)", errno,
		    strerror(errno));

	calc_min_max_cpu();

	r = rseq_register_current_thread();
	TEST_ASSERT(!r, "rseq_register_current_thread failed, errno = %d (%s)",
		    errno, strerror(errno));

	/*
	 * Create and run a dummy VM that immediately exits to userspace via
	 * GUEST_SYNC, while concurrently migrating the process by setting its
	 * CPU affinity.
	 */
	vm = vm_create_with_one_vcpu(&vcpu, guest_code);
	ucall_init(vm, NULL);

	pthread_create(&migration_thread, NULL, migration_worker,
		       (void *)(unsigned long)syscall(SYS_gettid));

	for (i = 0; !done; i++) {
		vcpu_run(vcpu);
		TEST_ASSERT(get_ucall(vcpu, NULL) == UCALL_SYNC,
			    "Guest failed?");

		/*
		 * Verify rseq's CPU matches sched's CPU.  Ensure migration
		 * doesn't occur between getcpu() and reading the rseq cpu_id
		 * by rereading both if the sequence count changes, or if the
		 * count is odd (migration in-progress).
		 */
		do {
			/*
			 * Drop bit 0 to force a mismatch if the count is odd,
			 * i.e. if a migration is in-progress.
			 */
			snapshot = atomic_read(&seq_cnt) & ~1;

			/*
			 * Ensure calling getcpu() and reading rseq.cpu_id
			 * complete in a single "no migration" window, i.e. are
			 * not reordered across the seq_cnt reads.
			 */
			smp_rmb();
			sys_getcpu(&cpu);
			rseq_cpu = rseq_current_cpu_raw();
			smp_rmb();
		} while (snapshot != atomic_read(&seq_cnt));

		TEST_ASSERT(rseq_cpu == cpu,
			    "rseq CPU = %d, sched CPU = %d\n", rseq_cpu, cpu);
	}

	/*
	 * Sanity check that the test was able to enter the guest a reasonable
	 * number of times, e.g. didn't get stalled too often/long waiting for
	 * getcpu() to stabilize.  A 2:1 migration:KVM_RUN ratio is a fairly
	 * conservative ratio on x86-64, which can do _more_ KVM_RUNs than
	 * migrations given the 1us+ delay in the migration task.
	 */
	TEST_ASSERT(i > (NR_TASK_MIGRATIONS / 2),
		    "Only performed %d KVM_RUNs, task stalled too much?\n", i);

	pthread_join(migration_thread, NULL);

	kvm_vm_free(vm);

	rseq_unregister_current_thread();

	return 0;
}