1 // SPDX-License-Identifier: GPL-2.0-only 2 /* Copyright(c) 2022 Intel Corporation. */ 3 4 #include <linux/cpu.h> 5 #include <linux/delay.h> 6 #include <linux/fs.h> 7 #include <linux/nmi.h> 8 #include <linux/slab.h> 9 #include <linux/stop_machine.h> 10 11 #include "ifs.h" 12 13 /* 14 * Note all code and data in this file is protected by 15 * ifs_sem. On HT systems all threads on a core will 16 * execute together, but only the first thread on the 17 * core will update results of the test. 18 */ 19 20 #define CREATE_TRACE_POINTS 21 #include <trace/events/intel_ifs.h> 22 23 /* Max retries on the same chunk */ 24 #define MAX_IFS_RETRIES 5 25 26 /* 27 * Number of TSC cycles that a logical CPU will wait for the other 28 * logical CPU on the core in the WRMSR(ACTIVATE_SCAN). 29 */ 30 #define IFS_THREAD_WAIT 100000 31 32 enum ifs_status_err_code { 33 IFS_NO_ERROR = 0, 34 IFS_OTHER_THREAD_COULD_NOT_JOIN = 1, 35 IFS_INTERRUPTED_BEFORE_RENDEZVOUS = 2, 36 IFS_POWER_MGMT_INADEQUATE_FOR_SCAN = 3, 37 IFS_INVALID_CHUNK_RANGE = 4, 38 IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS = 5, 39 IFS_CORE_NOT_CAPABLE_CURRENTLY = 6, 40 IFS_UNASSIGNED_ERROR_CODE = 7, 41 IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT = 8, 42 IFS_INTERRUPTED_DURING_EXECUTION = 9, 43 }; 44 45 static const char * const scan_test_status[] = { 46 [IFS_NO_ERROR] = "SCAN no error", 47 [IFS_OTHER_THREAD_COULD_NOT_JOIN] = "Other thread could not join.", 48 [IFS_INTERRUPTED_BEFORE_RENDEZVOUS] = "Interrupt occurred prior to SCAN coordination.", 49 [IFS_POWER_MGMT_INADEQUATE_FOR_SCAN] = 50 "Core Abort SCAN Response due to power management condition.", 51 [IFS_INVALID_CHUNK_RANGE] = "Non valid chunks in the range", 52 [IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS] = "Mismatch in arguments between threads T0/T1.", 53 [IFS_CORE_NOT_CAPABLE_CURRENTLY] = "Core not capable of performing SCAN currently", 54 [IFS_UNASSIGNED_ERROR_CODE] = "Unassigned error code 0x7", 55 [IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT] = 56 "Exceeded number of Logical Processors (LP) allowed to run Scan-At-Field concurrently", 57 [IFS_INTERRUPTED_DURING_EXECUTION] = "Interrupt occurred prior to SCAN start", 58 }; 59 60 static void message_not_tested(struct device *dev, int cpu, union ifs_status status) 61 { 62 if (status.error_code < ARRAY_SIZE(scan_test_status)) { 63 dev_info(dev, "CPU(s) %*pbl: SCAN operation did not start. %s\n", 64 cpumask_pr_args(cpu_smt_mask(cpu)), 65 scan_test_status[status.error_code]); 66 } else if (status.error_code == IFS_SW_TIMEOUT) { 67 dev_info(dev, "CPU(s) %*pbl: software timeout during scan\n", 68 cpumask_pr_args(cpu_smt_mask(cpu))); 69 } else if (status.error_code == IFS_SW_PARTIAL_COMPLETION) { 70 dev_info(dev, "CPU(s) %*pbl: %s\n", 71 cpumask_pr_args(cpu_smt_mask(cpu)), 72 "Not all scan chunks were executed. Maximum forward progress retries exceeded"); 73 } else { 74 dev_info(dev, "CPU(s) %*pbl: SCAN unknown status %llx\n", 75 cpumask_pr_args(cpu_smt_mask(cpu)), status.data); 76 } 77 } 78 79 static void message_fail(struct device *dev, int cpu, union ifs_status status) 80 { 81 struct ifs_data *ifsd = ifs_get_data(dev); 82 83 /* 84 * control_error is set when the microcode runs into a problem 85 * loading the image from the reserved BIOS memory, or it has 86 * been corrupted. Reloading the image may fix this issue. 87 */ 88 if (status.control_error) { 89 dev_err(dev, "CPU(s) %*pbl: could not execute from loaded scan image. Batch: %02x version: 0x%x\n", 90 cpumask_pr_args(cpu_smt_mask(cpu)), ifsd->cur_batch, ifsd->loaded_version); 91 } 92 93 /* 94 * signature_error is set when the output from the scan chains does not 95 * match the expected signature. This might be a transient problem (e.g. 96 * due to a bit flip from an alpha particle or neutron). If the problem 97 * repeats on a subsequent test, then it indicates an actual problem in 98 * the core being tested. 99 */ 100 if (status.signature_error) { 101 dev_err(dev, "CPU(s) %*pbl: test signature incorrect. Batch: %02x version: 0x%x\n", 102 cpumask_pr_args(cpu_smt_mask(cpu)), ifsd->cur_batch, ifsd->loaded_version); 103 } 104 } 105 106 static bool can_restart(union ifs_status status) 107 { 108 enum ifs_status_err_code err_code = status.error_code; 109 110 /* Signature for chunk is bad, or scan test failed */ 111 if (status.signature_error || status.control_error) 112 return false; 113 114 switch (err_code) { 115 case IFS_NO_ERROR: 116 case IFS_OTHER_THREAD_COULD_NOT_JOIN: 117 case IFS_INTERRUPTED_BEFORE_RENDEZVOUS: 118 case IFS_POWER_MGMT_INADEQUATE_FOR_SCAN: 119 case IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT: 120 case IFS_INTERRUPTED_DURING_EXECUTION: 121 return true; 122 case IFS_INVALID_CHUNK_RANGE: 123 case IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS: 124 case IFS_CORE_NOT_CAPABLE_CURRENTLY: 125 case IFS_UNASSIGNED_ERROR_CODE: 126 break; 127 } 128 return false; 129 } 130 131 /* 132 * Execute the scan. Called "simultaneously" on all threads of a core 133 * at high priority using the stop_cpus mechanism. 134 */ 135 static int doscan(void *data) 136 { 137 int cpu = smp_processor_id(); 138 u64 *msrs = data; 139 int first; 140 141 /* Only the first logical CPU on a core reports result */ 142 first = cpumask_first(cpu_smt_mask(cpu)); 143 144 /* 145 * This WRMSR will wait for other HT threads to also write 146 * to this MSR (at most for activate.delay cycles). Then it 147 * starts scan of each requested chunk. The core scan happens 148 * during the "execution" of the WRMSR. This instruction can 149 * take up to 200 milliseconds (in the case where all chunks 150 * are processed in a single pass) before it retires. 151 */ 152 wrmsrl(MSR_ACTIVATE_SCAN, msrs[0]); 153 154 if (cpu == first) { 155 /* Pass back the result of the scan */ 156 rdmsrl(MSR_SCAN_STATUS, msrs[1]); 157 } 158 159 return 0; 160 } 161 162 /* 163 * Use stop_core_cpuslocked() to synchronize writing to MSR_ACTIVATE_SCAN 164 * on all threads of the core to be tested. Loop if necessary to complete 165 * run of all chunks. Include some defensive tests to make sure forward 166 * progress is made, and that the whole test completes in a reasonable time. 167 */ 168 static void ifs_test_core(int cpu, struct device *dev) 169 { 170 union ifs_status status = {}; 171 union ifs_scan activate; 172 unsigned long timeout; 173 struct ifs_data *ifsd; 174 int to_start, to_stop; 175 int status_chunk; 176 u64 msrvals[2]; 177 int retries; 178 179 ifsd = ifs_get_data(dev); 180 181 activate.gen0.rsvd = 0; 182 activate.delay = IFS_THREAD_WAIT; 183 activate.sigmce = 0; 184 to_start = 0; 185 to_stop = ifsd->valid_chunks - 1; 186 187 if (ifsd->generation) { 188 activate.gen2.start = to_start; 189 activate.gen2.stop = to_stop; 190 } else { 191 activate.gen0.start = to_start; 192 activate.gen0.stop = to_stop; 193 } 194 195 timeout = jiffies + HZ / 2; 196 retries = MAX_IFS_RETRIES; 197 198 while (to_start <= to_stop) { 199 if (time_after(jiffies, timeout)) { 200 status.error_code = IFS_SW_TIMEOUT; 201 break; 202 } 203 204 msrvals[0] = activate.data; 205 stop_core_cpuslocked(cpu, doscan, msrvals); 206 207 status.data = msrvals[1]; 208 209 trace_ifs_status(cpu, to_start, to_stop, status.data); 210 211 /* Some cases can be retried, give up for others */ 212 if (!can_restart(status)) 213 break; 214 215 status_chunk = ifsd->generation ? status.gen2.chunk_num : status.gen0.chunk_num; 216 if (status_chunk == to_start) { 217 /* Check for forward progress */ 218 if (--retries == 0) { 219 if (status.error_code == IFS_NO_ERROR) 220 status.error_code = IFS_SW_PARTIAL_COMPLETION; 221 break; 222 } 223 } else { 224 retries = MAX_IFS_RETRIES; 225 if (ifsd->generation) 226 activate.gen2.start = status_chunk; 227 else 228 activate.gen0.start = status_chunk; 229 to_start = status_chunk; 230 } 231 } 232 233 /* Update status for this core */ 234 ifsd->scan_details = status.data; 235 236 if (status.control_error || status.signature_error) { 237 ifsd->status = SCAN_TEST_FAIL; 238 message_fail(dev, cpu, status); 239 } else if (status.error_code) { 240 ifsd->status = SCAN_NOT_TESTED; 241 message_not_tested(dev, cpu, status); 242 } else { 243 ifsd->status = SCAN_TEST_PASS; 244 } 245 } 246 247 #define SPINUNIT 100 /* 100 nsec */ 248 static atomic_t array_cpus_out; 249 250 /* 251 * Simplified cpu sibling rendezvous loop based on microcode loader __wait_for_cpus() 252 */ 253 static void wait_for_sibling_cpu(atomic_t *t, long long timeout) 254 { 255 int cpu = smp_processor_id(); 256 const struct cpumask *smt_mask = cpu_smt_mask(cpu); 257 int all_cpus = cpumask_weight(smt_mask); 258 259 atomic_inc(t); 260 while (atomic_read(t) < all_cpus) { 261 if (timeout < SPINUNIT) 262 return; 263 ndelay(SPINUNIT); 264 timeout -= SPINUNIT; 265 touch_nmi_watchdog(); 266 } 267 } 268 269 static int do_array_test(void *data) 270 { 271 union ifs_array *command = data; 272 int cpu = smp_processor_id(); 273 int first; 274 275 /* 276 * Only one logical CPU on a core needs to trigger the Array test via MSR write. 277 */ 278 first = cpumask_first(cpu_smt_mask(cpu)); 279 280 if (cpu == first) { 281 wrmsrl(MSR_ARRAY_BIST, command->data); 282 /* Pass back the result of the test */ 283 rdmsrl(MSR_ARRAY_BIST, command->data); 284 } 285 286 /* Tests complete faster if the sibling is spinning here */ 287 wait_for_sibling_cpu(&array_cpus_out, NSEC_PER_SEC); 288 289 return 0; 290 } 291 292 static void ifs_array_test_core(int cpu, struct device *dev) 293 { 294 union ifs_array command = {}; 295 bool timed_out = false; 296 struct ifs_data *ifsd; 297 unsigned long timeout; 298 299 ifsd = ifs_get_data(dev); 300 301 command.array_bitmask = ~0U; 302 timeout = jiffies + HZ / 2; 303 304 do { 305 if (time_after(jiffies, timeout)) { 306 timed_out = true; 307 break; 308 } 309 atomic_set(&array_cpus_out, 0); 310 stop_core_cpuslocked(cpu, do_array_test, &command); 311 312 if (command.ctrl_result) 313 break; 314 } while (command.array_bitmask); 315 316 ifsd->scan_details = command.data; 317 318 if (command.ctrl_result) 319 ifsd->status = SCAN_TEST_FAIL; 320 else if (timed_out || command.array_bitmask) 321 ifsd->status = SCAN_NOT_TESTED; 322 else 323 ifsd->status = SCAN_TEST_PASS; 324 } 325 326 /* 327 * Initiate per core test. It wakes up work queue threads on the target cpu and 328 * its sibling cpu. Once all sibling threads wake up, the scan test gets executed and 329 * wait for all sibling threads to finish the scan test. 330 */ 331 int do_core_test(int cpu, struct device *dev) 332 { 333 const struct ifs_test_caps *test = ifs_get_test_caps(dev); 334 struct ifs_data *ifsd = ifs_get_data(dev); 335 int ret = 0; 336 337 /* Prevent CPUs from being taken offline during the scan test */ 338 cpus_read_lock(); 339 340 if (!cpu_online(cpu)) { 341 dev_info(dev, "cannot test on the offline cpu %d\n", cpu); 342 ret = -EINVAL; 343 goto out; 344 } 345 346 switch (test->test_num) { 347 case IFS_TYPE_SAF: 348 if (!ifsd->loaded) 349 ret = -EPERM; 350 else 351 ifs_test_core(cpu, dev); 352 break; 353 case IFS_TYPE_ARRAY_BIST: 354 ifs_array_test_core(cpu, dev); 355 break; 356 default: 357 ret = -EINVAL; 358 } 359 out: 360 cpus_read_unlock(); 361 return ret; 362 } 363