1 // SPDX-License-Identifier: GPL-2.0-only 2 /* Copyright(c) 2022 Intel Corporation. */ 3 4 #include <linux/cpu.h> 5 #include <linux/delay.h> 6 #include <linux/fs.h> 7 #include <linux/nmi.h> 8 #include <linux/slab.h> 9 #include <linux/stop_machine.h> 10 11 #include "ifs.h" 12 13 /* 14 * Note all code and data in this file is protected by 15 * ifs_sem. On HT systems all threads on a core will 16 * execute together, but only the first thread on the 17 * core will update results of the test. 18 */ 19 20 #define CREATE_TRACE_POINTS 21 #include <trace/events/intel_ifs.h> 22 23 /* Max retries on the same chunk */ 24 #define MAX_IFS_RETRIES 5 25 26 /* 27 * Number of TSC cycles that a logical CPU will wait for the other 28 * logical CPU on the core in the WRMSR(ACTIVATE_SCAN). 29 */ 30 #define IFS_THREAD_WAIT 100000 31 32 enum ifs_status_err_code { 33 IFS_NO_ERROR = 0, 34 IFS_OTHER_THREAD_COULD_NOT_JOIN = 1, 35 IFS_INTERRUPTED_BEFORE_RENDEZVOUS = 2, 36 IFS_POWER_MGMT_INADEQUATE_FOR_SCAN = 3, 37 IFS_INVALID_CHUNK_RANGE = 4, 38 IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS = 5, 39 IFS_CORE_NOT_CAPABLE_CURRENTLY = 6, 40 IFS_UNASSIGNED_ERROR_CODE = 7, 41 IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT = 8, 42 IFS_INTERRUPTED_DURING_EXECUTION = 9, 43 }; 44 45 static const char * const scan_test_status[] = { 46 [IFS_NO_ERROR] = "SCAN no error", 47 [IFS_OTHER_THREAD_COULD_NOT_JOIN] = "Other thread could not join.", 48 [IFS_INTERRUPTED_BEFORE_RENDEZVOUS] = "Interrupt occurred prior to SCAN coordination.", 49 [IFS_POWER_MGMT_INADEQUATE_FOR_SCAN] = 50 "Core Abort SCAN Response due to power management condition.", 51 [IFS_INVALID_CHUNK_RANGE] = "Non valid chunks in the range", 52 [IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS] = "Mismatch in arguments between threads T0/T1.", 53 [IFS_CORE_NOT_CAPABLE_CURRENTLY] = "Core not capable of performing SCAN currently", 54 [IFS_UNASSIGNED_ERROR_CODE] = "Unassigned error code 0x7", 55 [IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT] = 56 "Exceeded number of Logical Processors (LP) allowed to run Scan-At-Field concurrently", 57 [IFS_INTERRUPTED_DURING_EXECUTION] = "Interrupt occurred prior to SCAN start", 58 }; 59 60 static void message_not_tested(struct device *dev, int cpu, union ifs_status status) 61 { 62 if (status.error_code < ARRAY_SIZE(scan_test_status)) { 63 dev_info(dev, "CPU(s) %*pbl: SCAN operation did not start. %s\n", 64 cpumask_pr_args(cpu_smt_mask(cpu)), 65 scan_test_status[status.error_code]); 66 } else if (status.error_code == IFS_SW_TIMEOUT) { 67 dev_info(dev, "CPU(s) %*pbl: software timeout during scan\n", 68 cpumask_pr_args(cpu_smt_mask(cpu))); 69 } else if (status.error_code == IFS_SW_PARTIAL_COMPLETION) { 70 dev_info(dev, "CPU(s) %*pbl: %s\n", 71 cpumask_pr_args(cpu_smt_mask(cpu)), 72 "Not all scan chunks were executed. Maximum forward progress retries exceeded"); 73 } else { 74 dev_info(dev, "CPU(s) %*pbl: SCAN unknown status %llx\n", 75 cpumask_pr_args(cpu_smt_mask(cpu)), status.data); 76 } 77 } 78 79 static void message_fail(struct device *dev, int cpu, union ifs_status status) 80 { 81 struct ifs_data *ifsd = ifs_get_data(dev); 82 83 /* 84 * control_error is set when the microcode runs into a problem 85 * loading the image from the reserved BIOS memory, or it has 86 * been corrupted. Reloading the image may fix this issue. 87 */ 88 if (status.control_error) { 89 dev_err(dev, "CPU(s) %*pbl: could not execute from loaded scan image. Batch: %02x version: 0x%x\n", 90 cpumask_pr_args(cpu_smt_mask(cpu)), ifsd->cur_batch, ifsd->loaded_version); 91 } 92 93 /* 94 * signature_error is set when the output from the scan chains does not 95 * match the expected signature. This might be a transient problem (e.g. 96 * due to a bit flip from an alpha particle or neutron). If the problem 97 * repeats on a subsequent test, then it indicates an actual problem in 98 * the core being tested. 99 */ 100 if (status.signature_error) { 101 dev_err(dev, "CPU(s) %*pbl: test signature incorrect. Batch: %02x version: 0x%x\n", 102 cpumask_pr_args(cpu_smt_mask(cpu)), ifsd->cur_batch, ifsd->loaded_version); 103 } 104 } 105 106 static bool can_restart(union ifs_status status) 107 { 108 enum ifs_status_err_code err_code = status.error_code; 109 110 /* Signature for chunk is bad, or scan test failed */ 111 if (status.signature_error || status.control_error) 112 return false; 113 114 switch (err_code) { 115 case IFS_NO_ERROR: 116 case IFS_OTHER_THREAD_COULD_NOT_JOIN: 117 case IFS_INTERRUPTED_BEFORE_RENDEZVOUS: 118 case IFS_POWER_MGMT_INADEQUATE_FOR_SCAN: 119 case IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT: 120 case IFS_INTERRUPTED_DURING_EXECUTION: 121 return true; 122 case IFS_INVALID_CHUNK_RANGE: 123 case IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS: 124 case IFS_CORE_NOT_CAPABLE_CURRENTLY: 125 case IFS_UNASSIGNED_ERROR_CODE: 126 break; 127 } 128 return false; 129 } 130 131 /* 132 * Execute the scan. Called "simultaneously" on all threads of a core 133 * at high priority using the stop_cpus mechanism. 134 */ 135 static int doscan(void *data) 136 { 137 int cpu = smp_processor_id(); 138 u64 *msrs = data; 139 int first; 140 141 /* Only the first logical CPU on a core reports result */ 142 first = cpumask_first(cpu_smt_mask(cpu)); 143 144 /* 145 * This WRMSR will wait for other HT threads to also write 146 * to this MSR (at most for activate.delay cycles). Then it 147 * starts scan of each requested chunk. The core scan happens 148 * during the "execution" of the WRMSR. This instruction can 149 * take up to 200 milliseconds (in the case where all chunks 150 * are processed in a single pass) before it retires. 151 */ 152 wrmsrl(MSR_ACTIVATE_SCAN, msrs[0]); 153 154 if (cpu == first) { 155 /* Pass back the result of the scan */ 156 rdmsrl(MSR_SCAN_STATUS, msrs[1]); 157 } 158 159 return 0; 160 } 161 162 /* 163 * Use stop_core_cpuslocked() to synchronize writing to MSR_ACTIVATE_SCAN 164 * on all threads of the core to be tested. Loop if necessary to complete 165 * run of all chunks. Include some defensive tests to make sure forward 166 * progress is made, and that the whole test completes in a reasonable time. 167 */ 168 static void ifs_test_core(int cpu, struct device *dev) 169 { 170 union ifs_scan activate; 171 union ifs_status status; 172 unsigned long timeout; 173 struct ifs_data *ifsd; 174 u64 msrvals[2]; 175 int retries; 176 177 ifsd = ifs_get_data(dev); 178 179 activate.rsvd = 0; 180 activate.delay = IFS_THREAD_WAIT; 181 activate.sigmce = 0; 182 activate.start = 0; 183 activate.stop = ifsd->valid_chunks - 1; 184 185 timeout = jiffies + HZ / 2; 186 retries = MAX_IFS_RETRIES; 187 188 while (activate.start <= activate.stop) { 189 if (time_after(jiffies, timeout)) { 190 status.error_code = IFS_SW_TIMEOUT; 191 break; 192 } 193 194 msrvals[0] = activate.data; 195 stop_core_cpuslocked(cpu, doscan, msrvals); 196 197 status.data = msrvals[1]; 198 199 trace_ifs_status(cpu, activate, status); 200 201 /* Some cases can be retried, give up for others */ 202 if (!can_restart(status)) 203 break; 204 205 if (status.chunk_num == activate.start) { 206 /* Check for forward progress */ 207 if (--retries == 0) { 208 if (status.error_code == IFS_NO_ERROR) 209 status.error_code = IFS_SW_PARTIAL_COMPLETION; 210 break; 211 } 212 } else { 213 retries = MAX_IFS_RETRIES; 214 activate.start = status.chunk_num; 215 } 216 } 217 218 /* Update status for this core */ 219 ifsd->scan_details = status.data; 220 221 if (status.control_error || status.signature_error) { 222 ifsd->status = SCAN_TEST_FAIL; 223 message_fail(dev, cpu, status); 224 } else if (status.error_code) { 225 ifsd->status = SCAN_NOT_TESTED; 226 message_not_tested(dev, cpu, status); 227 } else { 228 ifsd->status = SCAN_TEST_PASS; 229 } 230 } 231 232 #define SPINUNIT 100 /* 100 nsec */ 233 static atomic_t array_cpus_out; 234 235 /* 236 * Simplified cpu sibling rendezvous loop based on microcode loader __wait_for_cpus() 237 */ 238 static void wait_for_sibling_cpu(atomic_t *t, long long timeout) 239 { 240 int cpu = smp_processor_id(); 241 const struct cpumask *smt_mask = cpu_smt_mask(cpu); 242 int all_cpus = cpumask_weight(smt_mask); 243 244 atomic_inc(t); 245 while (atomic_read(t) < all_cpus) { 246 if (timeout < SPINUNIT) 247 return; 248 ndelay(SPINUNIT); 249 timeout -= SPINUNIT; 250 touch_nmi_watchdog(); 251 } 252 } 253 254 static int do_array_test(void *data) 255 { 256 union ifs_array *command = data; 257 int cpu = smp_processor_id(); 258 int first; 259 260 /* 261 * Only one logical CPU on a core needs to trigger the Array test via MSR write. 262 */ 263 first = cpumask_first(cpu_smt_mask(cpu)); 264 265 if (cpu == first) { 266 wrmsrl(MSR_ARRAY_BIST, command->data); 267 /* Pass back the result of the test */ 268 rdmsrl(MSR_ARRAY_BIST, command->data); 269 } 270 271 /* Tests complete faster if the sibling is spinning here */ 272 wait_for_sibling_cpu(&array_cpus_out, NSEC_PER_SEC); 273 274 return 0; 275 } 276 277 static void ifs_array_test_core(int cpu, struct device *dev) 278 { 279 union ifs_array command = {}; 280 bool timed_out = false; 281 struct ifs_data *ifsd; 282 unsigned long timeout; 283 284 ifsd = ifs_get_data(dev); 285 286 command.array_bitmask = ~0U; 287 timeout = jiffies + HZ / 2; 288 289 do { 290 if (time_after(jiffies, timeout)) { 291 timed_out = true; 292 break; 293 } 294 atomic_set(&array_cpus_out, 0); 295 stop_core_cpuslocked(cpu, do_array_test, &command); 296 297 if (command.ctrl_result) 298 break; 299 } while (command.array_bitmask); 300 301 ifsd->scan_details = command.data; 302 303 if (command.ctrl_result) 304 ifsd->status = SCAN_TEST_FAIL; 305 else if (timed_out || command.array_bitmask) 306 ifsd->status = SCAN_NOT_TESTED; 307 else 308 ifsd->status = SCAN_TEST_PASS; 309 } 310 311 /* 312 * Initiate per core test. It wakes up work queue threads on the target cpu and 313 * its sibling cpu. Once all sibling threads wake up, the scan test gets executed and 314 * wait for all sibling threads to finish the scan test. 315 */ 316 int do_core_test(int cpu, struct device *dev) 317 { 318 const struct ifs_test_caps *test = ifs_get_test_caps(dev); 319 struct ifs_data *ifsd = ifs_get_data(dev); 320 int ret = 0; 321 322 /* Prevent CPUs from being taken offline during the scan test */ 323 cpus_read_lock(); 324 325 if (!cpu_online(cpu)) { 326 dev_info(dev, "cannot test on the offline cpu %d\n", cpu); 327 ret = -EINVAL; 328 goto out; 329 } 330 331 switch (test->test_num) { 332 case IFS_TYPE_SAF: 333 if (!ifsd->loaded) 334 ret = -EPERM; 335 else 336 ifs_test_core(cpu, dev); 337 break; 338 case IFS_TYPE_ARRAY_BIST: 339 ifs_array_test_core(cpu, dev); 340 break; 341 default: 342 ret = -EINVAL; 343 } 344 out: 345 cpus_read_unlock(); 346 return ret; 347 } 348