// SPDX-License-Identifier: GPL-2.0-only
/* Copyright(c) 2022 Intel Corporation. */

#include <linux/cpu.h>
#include <linux/delay.h>
#include <linux/fs.h>
#include <linux/nmi.h>
#include <linux/slab.h>
#include <linux/stop_machine.h>

#include "ifs.h"
/*
 * Note: all code and data in this file are protected by
 * ifs_sem. On HT systems all threads on a core execute the
 * test together, but only the first thread on the core
 * updates the results of the test.
 */

#define CREATE_TRACE_POINTS
#include <trace/events/intel_ifs.h>

/* Max retries on the same chunk */
#define MAX_IFS_RETRIES 5

/*
 * Number of TSC cycles that a logical CPU will wait for the other
 * logical CPU on the core during the WRMSR(ACTIVATE_SCAN) rendezvous.
 */
#define IFS_THREAD_WAIT 100000

enum ifs_status_err_code {
	IFS_NO_ERROR = 0,
	IFS_OTHER_THREAD_COULD_NOT_JOIN = 1,
	IFS_INTERRUPTED_BEFORE_RENDEZVOUS = 2,
	IFS_POWER_MGMT_INADEQUATE_FOR_SCAN = 3,
	IFS_INVALID_CHUNK_RANGE = 4,
	IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS = 5,
	IFS_CORE_NOT_CAPABLE_CURRENTLY = 6,
	IFS_UNASSIGNED_ERROR_CODE = 7,
	IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT = 8,
	IFS_INTERRUPTED_DURING_EXECUTION = 9,
};
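
/*
 * The codes above are the values of the error_code field reported in the
 * SCAN_STATUS MSR (union ifs_status). The driver also uses software-defined
 * codes outside this range (IFS_SW_TIMEOUT and IFS_SW_PARTIAL_COMPLETION,
 * presumably defined in ifs.h) to report conditions detected by the kernel
 * rather than by the hardware; see message_not_tested() below.
 */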

static const char * const scan_test_status[] = {
	[IFS_NO_ERROR] = "SCAN no error",
	[IFS_OTHER_THREAD_COULD_NOT_JOIN] = "Other thread could not join.",
	[IFS_INTERRUPTED_BEFORE_RENDEZVOUS] = "Interrupt occurred prior to SCAN coordination.",
	[IFS_POWER_MGMT_INADEQUATE_FOR_SCAN] =
		"Core Abort SCAN Response due to power management condition.",
	[IFS_INVALID_CHUNK_RANGE] = "Non valid chunks in the range",
	[IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS] = "Mismatch in arguments between threads T0/T1.",
	[IFS_CORE_NOT_CAPABLE_CURRENTLY] = "Core not capable of performing SCAN currently",
	[IFS_UNASSIGNED_ERROR_CODE] = "Unassigned error code 0x7",
	[IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT] =
		"Exceeded number of Logical Processors (LP) allowed to run Scan-At-Field concurrently",
	[IFS_INTERRUPTED_DURING_EXECUTION] = "Interrupt occurred prior to SCAN start",
};

static void message_not_tested(struct device *dev, int cpu, union ifs_status status)
{
	if (status.error_code < ARRAY_SIZE(scan_test_status)) {
		dev_info(dev, "CPU(s) %*pbl: SCAN operation did not start. %s\n",
			 cpumask_pr_args(cpu_smt_mask(cpu)),
			 scan_test_status[status.error_code]);
	} else if (status.error_code == IFS_SW_TIMEOUT) {
		dev_info(dev, "CPU(s) %*pbl: software timeout during scan\n",
			 cpumask_pr_args(cpu_smt_mask(cpu)));
	} else if (status.error_code == IFS_SW_PARTIAL_COMPLETION) {
		dev_info(dev, "CPU(s) %*pbl: %s\n",
			 cpumask_pr_args(cpu_smt_mask(cpu)),
			 "Not all scan chunks were executed. Maximum forward progress retries exceeded");
	} else {
		dev_info(dev, "CPU(s) %*pbl: SCAN unknown status %llx\n",
			 cpumask_pr_args(cpu_smt_mask(cpu)), status.data);
	}
}

static void message_fail(struct device *dev, int cpu, union ifs_status status)
{
	struct ifs_data *ifsd = ifs_get_data(dev);

	/*
	 * control_error is set when the microcode runs into a problem
	 * loading the image from the reserved BIOS memory, or when the
	 * image has been corrupted. Reloading the image may fix this issue.
	 */
	if (status.control_error) {
		dev_err(dev, "CPU(s) %*pbl: could not execute from loaded scan image. Batch: %02x version: 0x%x\n",
			cpumask_pr_args(cpu_smt_mask(cpu)), ifsd->cur_batch, ifsd->loaded_version);
	}

	/*
	 * signature_error is set when the output from the scan chains does not
	 * match the expected signature. This might be a transient problem (e.g.
	 * due to a bit flip from an alpha particle or neutron). If the problem
	 * repeats on a subsequent test, then it indicates an actual problem in
	 * the core being tested.
	 */
	if (status.signature_error) {
		dev_err(dev, "CPU(s) %*pbl: test signature incorrect. Batch: %02x version: 0x%x\n",
			cpumask_pr_args(cpu_smt_mask(cpu)), ifsd->cur_batch, ifsd->loaded_version);
	}
}

static bool can_restart(union ifs_status status)
{
	enum ifs_status_err_code err_code = status.error_code;

	/* Signature for chunk is bad, or scan test failed */
	if (status.signature_error || status.control_error)
		return false;

	switch (err_code) {
	case IFS_NO_ERROR:
	case IFS_OTHER_THREAD_COULD_NOT_JOIN:
	case IFS_INTERRUPTED_BEFORE_RENDEZVOUS:
	case IFS_POWER_MGMT_INADEQUATE_FOR_SCAN:
	case IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT:
	case IFS_INTERRUPTED_DURING_EXECUTION:
		return true;
	case IFS_INVALID_CHUNK_RANGE:
	case IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS:
	case IFS_CORE_NOT_CAPABLE_CURRENTLY:
	case IFS_UNASSIGNED_ERROR_CODE:
		break;
	}
	return false;
}
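
/*
 * Descriptive note (inferred from the error-code names above): the cases
 * that return true are transient conditions - an interrupt, a sibling
 * thread failing to join, a power-management constraint, or too many LPs
 * testing concurrently - where re-issuing the same scan command can
 * reasonably be expected to succeed. The remaining codes indicate a bad
 * request or an incapable core, so retrying the identical command would
 * fail again.
 */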

/*
 * Execute the scan. Called "simultaneously" on all threads of a core
 * at high priority using the stop_cpus mechanism.
 */
static int doscan(void *data)
{
	int cpu = smp_processor_id();
	u64 *msrs = data;
	int first;

	/* Only the first logical CPU on a core reports the result */
	first = cpumask_first(cpu_smt_mask(cpu));

	/*
	 * This WRMSR will wait for other HT threads to also write
	 * to this MSR (at most for activate.delay cycles). Then it
	 * starts scan of each requested chunk. The core scan happens
	 * during the "execution" of the WRMSR. This instruction can
	 * take up to 200 milliseconds (in the case where all chunks
	 * are processed in a single pass) before it retires.
	 */
	wrmsrl(MSR_ACTIVATE_SCAN, msrs[0]);

	if (cpu == first) {
		/* Pass back the result of the scan */
		rdmsrl(MSR_SCAN_STATUS, msrs[1]);
	}

	return 0;
}

/*
 * Use stop_core_cpuslocked() to synchronize writing to MSR_ACTIVATE_SCAN
 * on all threads of the core to be tested. Loop if necessary to complete
 * run of all chunks. Include some defensive tests to make sure forward
 * progress is made, and that the whole test completes in a reasonable time.
 */
static void ifs_test_core(int cpu, struct device *dev)
{
	union ifs_status status = {};
	union ifs_scan activate;
	unsigned long timeout;
	struct ifs_data *ifsd;
	int to_start, to_stop;
	int status_chunk;
	u64 msrvals[2];
	int retries;

	ifsd = ifs_get_data(dev);

	activate.gen0.rsvd = 0;
	activate.delay = IFS_THREAD_WAIT;
	activate.sigmce = 0;
	to_start = 0;
	to_stop = ifsd->valid_chunks - 1;

	if (ifsd->generation) {
		activate.gen2.start = to_start;
		activate.gen2.stop = to_stop;
	} else {
		activate.gen0.start = to_start;
		activate.gen0.stop = to_stop;
	}

	timeout = jiffies + HZ / 2;
	retries = MAX_IFS_RETRIES;

	while (to_start <= to_stop) {
		if (time_after(jiffies, timeout)) {
			status.error_code = IFS_SW_TIMEOUT;
			break;
		}

		msrvals[0] = activate.data;
		stop_core_cpuslocked(cpu, doscan, msrvals);

		status.data = msrvals[1];

		trace_ifs_status(cpu, to_start, to_stop, status.data);

		/* Some cases can be retried, give up for others */
		if (!can_restart(status))
			break;

		status_chunk = ifsd->generation ? status.gen2.chunk_num : status.gen0.chunk_num;
		if (status_chunk == to_start) {
			/* Check for forward progress */
			if (--retries == 0) {
				if (status.error_code == IFS_NO_ERROR)
					status.error_code = IFS_SW_PARTIAL_COMPLETION;
				break;
			}
		} else {
			retries = MAX_IFS_RETRIES;
			if (ifsd->generation)
				activate.gen2.start = status_chunk;
			else
				activate.gen0.start = status_chunk;
			to_start = status_chunk;
		}
	}

	/* Update status for this core */
	ifsd->scan_details = status.data;

	if (status.control_error || status.signature_error) {
		ifsd->status = SCAN_TEST_FAIL;
		message_fail(dev, cpu, status);
	} else if (status.error_code) {
		ifsd->status = SCAN_NOT_TESTED;
		message_not_tested(dev, cpu, status);
	} else {
		ifsd->status = SCAN_TEST_PASS;
	}
}
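
/*
 * Illustrative walk-through of the retry loop above (hypothetical numbers,
 * not from any real run): with valid_chunks = 16, the first command asks
 * for chunks 0-15. If the scan is interrupted and the returned status
 * reports error_code = IFS_INTERRUPTED_DURING_EXECUTION with chunk_num = 5,
 * the loop re-issues the command starting at chunk 5 and resets the retry
 * budget to MAX_IFS_RETRIES. If five consecutive attempts stall on the
 * same chunk, the result is reported as IFS_SW_PARTIAL_COMPLETION; if the
 * whole sequence takes longer than half a second (HZ/2 jiffies), it is
 * reported as IFS_SW_TIMEOUT.
 */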

#define SPINUNIT 100 /* 100 nsec */
static atomic_t array_cpus_out;

/*
 * Simplified cpu sibling rendezvous loop based on microcode loader __wait_for_cpus()
 */
static void wait_for_sibling_cpu(atomic_t *t, long long timeout)
{
	int cpu = smp_processor_id();
	const struct cpumask *smt_mask = cpu_smt_mask(cpu);
	int all_cpus = cpumask_weight(smt_mask);

	atomic_inc(t);
	while (atomic_read(t) < all_cpus) {
		if (timeout < SPINUNIT)
			return;
		ndelay(SPINUNIT);
		timeout -= SPINUNIT;
		touch_nmi_watchdog();
	}
}
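
/*
 * Note: the loop above busy-waits in SPINUNIT (100 ns) steps until every
 * sibling in the SMT mask has incremented the counter or the timeout (in
 * nanoseconds) expires. touch_nmi_watchdog() is called on each step so
 * that the tight spin is not misreported as a hard lockup.
 */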

static int do_array_test(void *data)
{
	union ifs_array *command = data;
	int cpu = smp_processor_id();
	int first;

	/*
	 * Only one logical CPU on a core needs to trigger the Array test via MSR write.
	 */
	first = cpumask_first(cpu_smt_mask(cpu));

	if (cpu == first) {
		wrmsrl(MSR_ARRAY_BIST, command->data);
		/* Pass back the result of the test */
		rdmsrl(MSR_ARRAY_BIST, command->data);
	}

	/* Tests complete faster if the sibling is spinning here */
	wait_for_sibling_cpu(&array_cpus_out, NSEC_PER_SEC);

	return 0;
}

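/*
 * Drive the array BIST for one core. The first command requests all arrays
 * (array_bitmask = ~0U). After each stop_core_cpuslocked() pass the value
 * read back from MSR_ARRAY_BIST is reused as the next command; judging by
 * the loop below, its array_bitmask presumably carries the arrays still
 * left to test, so the command is re-issued until the bitmask is clear,
 * ctrl_result flags a failure, or the HZ/2 software timeout expires.
 */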
static void ifs_array_test_core(int cpu, struct device *dev)
{
	union ifs_array command = {};
	bool timed_out = false;
	struct ifs_data *ifsd;
	unsigned long timeout;

	ifsd = ifs_get_data(dev);

	command.array_bitmask = ~0U;
	timeout = jiffies + HZ / 2;

	do {
		if (time_after(jiffies, timeout)) {
			timed_out = true;
			break;
		}
		atomic_set(&array_cpus_out, 0);
		stop_core_cpuslocked(cpu, do_array_test, &command);

		if (command.ctrl_result)
			break;
	} while (command.array_bitmask);

	ifsd->scan_details = command.data;

	if (command.ctrl_result)
		ifsd->status = SCAN_TEST_FAIL;
	else if (timed_out || command.array_bitmask)
		ifsd->status = SCAN_NOT_TESTED;
	else
		ifsd->status = SCAN_TEST_PASS;
}

/*
 * Initiate a per-core test. The target cpu and its HT sibling(s) are
 * brought into a rendezvous (via stop_core_cpuslocked() in the helpers
 * above); once all sibling threads have joined, the requested test
 * executes and the caller waits for all siblings to finish.
 */
int do_core_test(int cpu, struct device *dev)
{
	const struct ifs_test_caps *test = ifs_get_test_caps(dev);
	struct ifs_data *ifsd = ifs_get_data(dev);
	int ret = 0;

	/* Prevent CPUs from being taken offline during the test */
	cpus_read_lock();

	if (!cpu_online(cpu)) {
		dev_info(dev, "cannot test on the offline cpu %d\n", cpu);
		ret = -EINVAL;
		goto out;
	}

	switch (test->test_num) {
	case IFS_TYPE_SAF:
		if (!ifsd->loaded)
			ret = -EPERM;
		else
			ifs_test_core(cpu, dev);
		break;
	case IFS_TYPE_ARRAY_BIST:
		ifs_array_test_core(cpu, dev);
		break;
	default:
		ret = -EINVAL;
	}
out:
	cpus_read_unlock();
	return ret;
}
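
/*
 * Usage note (based on the intel_ifs sysfs ABI, implemented outside this
 * file): do_core_test() is the entry point invoked when a CPU number is
 * written to the per-device "run_test" attribute, e.g. (illustrative path)
 *
 *	echo 3 > /sys/devices/virtual/misc/intel_ifs_0/run_test
 *
 * The outcome is then reported through the "status" and "details"
 * attributes, which reflect the ifsd->status and ifsd->scan_details values
 * set by the helpers above.
 */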