12b40e654SJithu Joseph // SPDX-License-Identifier: GPL-2.0-only
22b40e654SJithu Joseph /* Copyright(c) 2022 Intel Corporation. */
32b40e654SJithu Joseph
42b40e654SJithu Joseph #include <linux/cpu.h>
52b40e654SJithu Joseph #include <linux/delay.h>
62b40e654SJithu Joseph #include <linux/fs.h>
72b40e654SJithu Joseph #include <linux/nmi.h>
82b40e654SJithu Joseph #include <linux/slab.h>
92b40e654SJithu Joseph #include <linux/stop_machine.h>
102b40e654SJithu Joseph
112b40e654SJithu Joseph #include "ifs.h"
122b40e654SJithu Joseph
/*
 * Note that all code and data in this file are protected by
 * ifs_sem. On HT systems all threads on a core will
 * execute together, but only the first thread on the
 * core will update the results of the test.
 */
192b40e654SJithu Joseph
2051af802fSTony Luck #define CREATE_TRACE_POINTS
2151af802fSTony Luck #include <trace/events/intel_ifs.h>
2251af802fSTony Luck
/* Upper bound on consecutive scan attempts that stay stuck on the same chunk */
#define MAX_IFS_RETRIES 5

/*
 * How long, in TSC cycles, a logical CPU waits inside
 * WRMSR(ACTIVATE_SCAN) for the other logical CPU on its core.
 */
#define IFS_THREAD_WAIT 100000
312b40e654SJithu Joseph
/*
 * Values of the error_code field of union ifs_status. They are decoded
 * by can_restart() and used as indices into scan_test_status[].
 */
enum ifs_status_err_code {
	IFS_NO_ERROR				= 0,
	IFS_OTHER_THREAD_COULD_NOT_JOIN		= 1,
	IFS_INTERRUPTED_BEFORE_RENDEZVOUS	= 2,
	IFS_POWER_MGMT_INADEQUATE_FOR_SCAN	= 3,
	IFS_INVALID_CHUNK_RANGE			= 4,
	IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS	= 5,
	IFS_CORE_NOT_CAPABLE_CURRENTLY		= 6,
	IFS_UNASSIGNED_ERROR_CODE		= 7,
	IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT	= 8,
	IFS_INTERRUPTED_DURING_EXECUTION	= 9,
};
442b40e654SJithu Joseph
452b40e654SJithu Joseph static const char * const scan_test_status[] = {
462b40e654SJithu Joseph [IFS_NO_ERROR] = "SCAN no error",
472b40e654SJithu Joseph [IFS_OTHER_THREAD_COULD_NOT_JOIN] = "Other thread could not join.",
482b40e654SJithu Joseph [IFS_INTERRUPTED_BEFORE_RENDEZVOUS] = "Interrupt occurred prior to SCAN coordination.",
492b40e654SJithu Joseph [IFS_POWER_MGMT_INADEQUATE_FOR_SCAN] =
502b40e654SJithu Joseph "Core Abort SCAN Response due to power management condition.",
512b40e654SJithu Joseph [IFS_INVALID_CHUNK_RANGE] = "Non valid chunks in the range",
522b40e654SJithu Joseph [IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS] = "Mismatch in arguments between threads T0/T1.",
532b40e654SJithu Joseph [IFS_CORE_NOT_CAPABLE_CURRENTLY] = "Core not capable of performing SCAN currently",
542b40e654SJithu Joseph [IFS_UNASSIGNED_ERROR_CODE] = "Unassigned error code 0x7",
552b40e654SJithu Joseph [IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT] =
562b40e654SJithu Joseph "Exceeded number of Logical Processors (LP) allowed to run Scan-At-Field concurrently",
572b40e654SJithu Joseph [IFS_INTERRUPTED_DURING_EXECUTION] = "Interrupt occurred prior to SCAN start",
582b40e654SJithu Joseph };
592b40e654SJithu Joseph
message_not_tested(struct device * dev,int cpu,union ifs_status status)602b40e654SJithu Joseph static void message_not_tested(struct device *dev, int cpu, union ifs_status status)
612b40e654SJithu Joseph {
622b40e654SJithu Joseph if (status.error_code < ARRAY_SIZE(scan_test_status)) {
632b40e654SJithu Joseph dev_info(dev, "CPU(s) %*pbl: SCAN operation did not start. %s\n",
642b40e654SJithu Joseph cpumask_pr_args(cpu_smt_mask(cpu)),
652b40e654SJithu Joseph scan_test_status[status.error_code]);
662b40e654SJithu Joseph } else if (status.error_code == IFS_SW_TIMEOUT) {
672b40e654SJithu Joseph dev_info(dev, "CPU(s) %*pbl: software timeout during scan\n",
682b40e654SJithu Joseph cpumask_pr_args(cpu_smt_mask(cpu)));
692b40e654SJithu Joseph } else if (status.error_code == IFS_SW_PARTIAL_COMPLETION) {
702b40e654SJithu Joseph dev_info(dev, "CPU(s) %*pbl: %s\n",
712b40e654SJithu Joseph cpumask_pr_args(cpu_smt_mask(cpu)),
722b40e654SJithu Joseph "Not all scan chunks were executed. Maximum forward progress retries exceeded");
732b40e654SJithu Joseph } else {
742b40e654SJithu Joseph dev_info(dev, "CPU(s) %*pbl: SCAN unknown status %llx\n",
752b40e654SJithu Joseph cpumask_pr_args(cpu_smt_mask(cpu)), status.data);
762b40e654SJithu Joseph }
772b40e654SJithu Joseph }
782b40e654SJithu Joseph
message_fail(struct device * dev,int cpu,union ifs_status status)792b40e654SJithu Joseph static void message_fail(struct device *dev, int cpu, union ifs_status status)
802b40e654SJithu Joseph {
814fb858f3SJithu Joseph struct ifs_data *ifsd = ifs_get_data(dev);
824fb858f3SJithu Joseph
832b40e654SJithu Joseph /*
842b40e654SJithu Joseph * control_error is set when the microcode runs into a problem
852b40e654SJithu Joseph * loading the image from the reserved BIOS memory, or it has
862b40e654SJithu Joseph * been corrupted. Reloading the image may fix this issue.
872b40e654SJithu Joseph */
882b40e654SJithu Joseph if (status.control_error) {
894fb858f3SJithu Joseph dev_err(dev, "CPU(s) %*pbl: could not execute from loaded scan image. Batch: %02x version: 0x%x\n",
904fb858f3SJithu Joseph cpumask_pr_args(cpu_smt_mask(cpu)), ifsd->cur_batch, ifsd->loaded_version);
912b40e654SJithu Joseph }
922b40e654SJithu Joseph
932b40e654SJithu Joseph /*
942b40e654SJithu Joseph * signature_error is set when the output from the scan chains does not
952b40e654SJithu Joseph * match the expected signature. This might be a transient problem (e.g.
962b40e654SJithu Joseph * due to a bit flip from an alpha particle or neutron). If the problem
972b40e654SJithu Joseph * repeats on a subsequent test, then it indicates an actual problem in
982b40e654SJithu Joseph * the core being tested.
992b40e654SJithu Joseph */
1002b40e654SJithu Joseph if (status.signature_error) {
1014fb858f3SJithu Joseph dev_err(dev, "CPU(s) %*pbl: test signature incorrect. Batch: %02x version: 0x%x\n",
1024fb858f3SJithu Joseph cpumask_pr_args(cpu_smt_mask(cpu)), ifsd->cur_batch, ifsd->loaded_version);
1032b40e654SJithu Joseph }
1042b40e654SJithu Joseph }
1052b40e654SJithu Joseph
can_restart(union ifs_status status)1062b40e654SJithu Joseph static bool can_restart(union ifs_status status)
1072b40e654SJithu Joseph {
1082b40e654SJithu Joseph enum ifs_status_err_code err_code = status.error_code;
1092b40e654SJithu Joseph
1102b40e654SJithu Joseph /* Signature for chunk is bad, or scan test failed */
1112b40e654SJithu Joseph if (status.signature_error || status.control_error)
1122b40e654SJithu Joseph return false;
1132b40e654SJithu Joseph
1142b40e654SJithu Joseph switch (err_code) {
1152b40e654SJithu Joseph case IFS_NO_ERROR:
1162b40e654SJithu Joseph case IFS_OTHER_THREAD_COULD_NOT_JOIN:
1172b40e654SJithu Joseph case IFS_INTERRUPTED_BEFORE_RENDEZVOUS:
1182b40e654SJithu Joseph case IFS_POWER_MGMT_INADEQUATE_FOR_SCAN:
1192b40e654SJithu Joseph case IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT:
1202b40e654SJithu Joseph case IFS_INTERRUPTED_DURING_EXECUTION:
1212b40e654SJithu Joseph return true;
1222b40e654SJithu Joseph case IFS_INVALID_CHUNK_RANGE:
1232b40e654SJithu Joseph case IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS:
1242b40e654SJithu Joseph case IFS_CORE_NOT_CAPABLE_CURRENTLY:
1252b40e654SJithu Joseph case IFS_UNASSIGNED_ERROR_CODE:
1262b40e654SJithu Joseph break;
1272b40e654SJithu Joseph }
1282b40e654SJithu Joseph return false;
1292b40e654SJithu Joseph }
1302b40e654SJithu Joseph
1312b40e654SJithu Joseph /*
1322b40e654SJithu Joseph * Execute the scan. Called "simultaneously" on all threads of a core
1332b40e654SJithu Joseph * at high priority using the stop_cpus mechanism.
1342b40e654SJithu Joseph */
doscan(void * data)1352b40e654SJithu Joseph static int doscan(void *data)
1362b40e654SJithu Joseph {
1372b40e654SJithu Joseph int cpu = smp_processor_id();
1382b40e654SJithu Joseph u64 *msrs = data;
1392b40e654SJithu Joseph int first;
1402b40e654SJithu Joseph
1412b40e654SJithu Joseph /* Only the first logical CPU on a core reports result */
1422b40e654SJithu Joseph first = cpumask_first(cpu_smt_mask(cpu));
1432b40e654SJithu Joseph
1442b40e654SJithu Joseph /*
1452b40e654SJithu Joseph * This WRMSR will wait for other HT threads to also write
1462b40e654SJithu Joseph * to this MSR (at most for activate.delay cycles). Then it
1472b40e654SJithu Joseph * starts scan of each requested chunk. The core scan happens
1482b40e654SJithu Joseph * during the "execution" of the WRMSR. This instruction can
1492b40e654SJithu Joseph * take up to 200 milliseconds (in the case where all chunks
1502b40e654SJithu Joseph * are processed in a single pass) before it retires.
1512b40e654SJithu Joseph */
1522b40e654SJithu Joseph wrmsrl(MSR_ACTIVATE_SCAN, msrs[0]);
1532b40e654SJithu Joseph
1542b40e654SJithu Joseph if (cpu == first) {
1552b40e654SJithu Joseph /* Pass back the result of the scan */
1562b40e654SJithu Joseph rdmsrl(MSR_SCAN_STATUS, msrs[1]);
1572b40e654SJithu Joseph }
1582b40e654SJithu Joseph
1592b40e654SJithu Joseph return 0;
1602b40e654SJithu Joseph }
1612b40e654SJithu Joseph
1622b40e654SJithu Joseph /*
1632b40e654SJithu Joseph * Use stop_core_cpuslocked() to synchronize writing to MSR_ACTIVATE_SCAN
1642b40e654SJithu Joseph * on all threads of the core to be tested. Loop if necessary to complete
1652b40e654SJithu Joseph * run of all chunks. Include some defensive tests to make sure forward
1662b40e654SJithu Joseph * progress is made, and that the whole test completes in a reasonable time.
1672b40e654SJithu Joseph */
ifs_test_core(int cpu,struct device * dev)1682b40e654SJithu Joseph static void ifs_test_core(int cpu, struct device *dev)
1692b40e654SJithu Joseph {
170*79b31626SKuppuswamy Sathyanarayanan union ifs_status status = {};
1712b40e654SJithu Joseph union ifs_scan activate;
1722b40e654SJithu Joseph unsigned long timeout;
1732b40e654SJithu Joseph struct ifs_data *ifsd;
1743d0d7713SJithu Joseph int to_start, to_stop;
1753d0d7713SJithu Joseph int status_chunk;
1762b40e654SJithu Joseph u64 msrvals[2];
1772b40e654SJithu Joseph int retries;
1782b40e654SJithu Joseph
1792b40e654SJithu Joseph ifsd = ifs_get_data(dev);
1802b40e654SJithu Joseph
1813d0d7713SJithu Joseph activate.gen0.rsvd = 0;
1822b40e654SJithu Joseph activate.delay = IFS_THREAD_WAIT;
1832b40e654SJithu Joseph activate.sigmce = 0;
1843d0d7713SJithu Joseph to_start = 0;
1853d0d7713SJithu Joseph to_stop = ifsd->valid_chunks - 1;
1863d0d7713SJithu Joseph
1873d0d7713SJithu Joseph if (ifsd->generation) {
1883d0d7713SJithu Joseph activate.gen2.start = to_start;
1893d0d7713SJithu Joseph activate.gen2.stop = to_stop;
1903d0d7713SJithu Joseph } else {
1913d0d7713SJithu Joseph activate.gen0.start = to_start;
1923d0d7713SJithu Joseph activate.gen0.stop = to_stop;
1933d0d7713SJithu Joseph }
1942b40e654SJithu Joseph
1952b40e654SJithu Joseph timeout = jiffies + HZ / 2;
1962b40e654SJithu Joseph retries = MAX_IFS_RETRIES;
1972b40e654SJithu Joseph
1983d0d7713SJithu Joseph while (to_start <= to_stop) {
1992b40e654SJithu Joseph if (time_after(jiffies, timeout)) {
2002b40e654SJithu Joseph status.error_code = IFS_SW_TIMEOUT;
2012b40e654SJithu Joseph break;
2022b40e654SJithu Joseph }
2032b40e654SJithu Joseph
2042b40e654SJithu Joseph msrvals[0] = activate.data;
2052b40e654SJithu Joseph stop_core_cpuslocked(cpu, doscan, msrvals);
2062b40e654SJithu Joseph
2072b40e654SJithu Joseph status.data = msrvals[1];
2082b40e654SJithu Joseph
2093d0d7713SJithu Joseph trace_ifs_status(cpu, to_start, to_stop, status.data);
21051af802fSTony Luck
2112b40e654SJithu Joseph /* Some cases can be retried, give up for others */
2122b40e654SJithu Joseph if (!can_restart(status))
2132b40e654SJithu Joseph break;
2142b40e654SJithu Joseph
2153d0d7713SJithu Joseph status_chunk = ifsd->generation ? status.gen2.chunk_num : status.gen0.chunk_num;
2163d0d7713SJithu Joseph if (status_chunk == to_start) {
2172b40e654SJithu Joseph /* Check for forward progress */
2182b40e654SJithu Joseph if (--retries == 0) {
2192b40e654SJithu Joseph if (status.error_code == IFS_NO_ERROR)
2202b40e654SJithu Joseph status.error_code = IFS_SW_PARTIAL_COMPLETION;
2212b40e654SJithu Joseph break;
2222b40e654SJithu Joseph }
2232b40e654SJithu Joseph } else {
2242b40e654SJithu Joseph retries = MAX_IFS_RETRIES;
2253d0d7713SJithu Joseph if (ifsd->generation)
2263d0d7713SJithu Joseph activate.gen2.start = status_chunk;
2273d0d7713SJithu Joseph else
2283d0d7713SJithu Joseph activate.gen0.start = status_chunk;
2293d0d7713SJithu Joseph to_start = status_chunk;
2302b40e654SJithu Joseph }
2312b40e654SJithu Joseph }
2322b40e654SJithu Joseph
2332b40e654SJithu Joseph /* Update status for this core */
2342b40e654SJithu Joseph ifsd->scan_details = status.data;
2352b40e654SJithu Joseph
2362b40e654SJithu Joseph if (status.control_error || status.signature_error) {
2372b40e654SJithu Joseph ifsd->status = SCAN_TEST_FAIL;
2382b40e654SJithu Joseph message_fail(dev, cpu, status);
2392b40e654SJithu Joseph } else if (status.error_code) {
2402b40e654SJithu Joseph ifsd->status = SCAN_NOT_TESTED;
2412b40e654SJithu Joseph message_not_tested(dev, cpu, status);
2422b40e654SJithu Joseph } else {
2432b40e654SJithu Joseph ifsd->status = SCAN_TEST_PASS;
2442b40e654SJithu Joseph }
2452b40e654SJithu Joseph }
2462b40e654SJithu Joseph
247fed696ceSJithu Joseph #define SPINUNIT 100 /* 100 nsec */
248fed696ceSJithu Joseph static atomic_t array_cpus_out;
249fed696ceSJithu Joseph
250fed696ceSJithu Joseph /*
251fed696ceSJithu Joseph * Simplified cpu sibling rendezvous loop based on microcode loader __wait_for_cpus()
252fed696ceSJithu Joseph */
wait_for_sibling_cpu(atomic_t * t,long long timeout)253fed696ceSJithu Joseph static void wait_for_sibling_cpu(atomic_t *t, long long timeout)
254fed696ceSJithu Joseph {
255fed696ceSJithu Joseph int cpu = smp_processor_id();
256fed696ceSJithu Joseph const struct cpumask *smt_mask = cpu_smt_mask(cpu);
257fed696ceSJithu Joseph int all_cpus = cpumask_weight(smt_mask);
258fed696ceSJithu Joseph
259fed696ceSJithu Joseph atomic_inc(t);
260fed696ceSJithu Joseph while (atomic_read(t) < all_cpus) {
261fed696ceSJithu Joseph if (timeout < SPINUNIT)
262fed696ceSJithu Joseph return;
263fed696ceSJithu Joseph ndelay(SPINUNIT);
264fed696ceSJithu Joseph timeout -= SPINUNIT;
265fed696ceSJithu Joseph touch_nmi_watchdog();
266fed696ceSJithu Joseph }
267fed696ceSJithu Joseph }
268fed696ceSJithu Joseph
do_array_test(void * data)269fed696ceSJithu Joseph static int do_array_test(void *data)
270fed696ceSJithu Joseph {
271fed696ceSJithu Joseph union ifs_array *command = data;
272fed696ceSJithu Joseph int cpu = smp_processor_id();
273fed696ceSJithu Joseph int first;
274fed696ceSJithu Joseph
275fed696ceSJithu Joseph /*
276fed696ceSJithu Joseph * Only one logical CPU on a core needs to trigger the Array test via MSR write.
277fed696ceSJithu Joseph */
278fed696ceSJithu Joseph first = cpumask_first(cpu_smt_mask(cpu));
279fed696ceSJithu Joseph
280fed696ceSJithu Joseph if (cpu == first) {
281fed696ceSJithu Joseph wrmsrl(MSR_ARRAY_BIST, command->data);
282fed696ceSJithu Joseph /* Pass back the result of the test */
283fed696ceSJithu Joseph rdmsrl(MSR_ARRAY_BIST, command->data);
284fed696ceSJithu Joseph }
285fed696ceSJithu Joseph
286fed696ceSJithu Joseph /* Tests complete faster if the sibling is spinning here */
287fed696ceSJithu Joseph wait_for_sibling_cpu(&array_cpus_out, NSEC_PER_SEC);
288fed696ceSJithu Joseph
289fed696ceSJithu Joseph return 0;
290fed696ceSJithu Joseph }
291fed696ceSJithu Joseph
ifs_array_test_core(int cpu,struct device * dev)292fed696ceSJithu Joseph static void ifs_array_test_core(int cpu, struct device *dev)
293fed696ceSJithu Joseph {
294fed696ceSJithu Joseph union ifs_array command = {};
295fed696ceSJithu Joseph bool timed_out = false;
296fed696ceSJithu Joseph struct ifs_data *ifsd;
297fed696ceSJithu Joseph unsigned long timeout;
298fed696ceSJithu Joseph
299fed696ceSJithu Joseph ifsd = ifs_get_data(dev);
300fed696ceSJithu Joseph
301fed696ceSJithu Joseph command.array_bitmask = ~0U;
302fed696ceSJithu Joseph timeout = jiffies + HZ / 2;
303fed696ceSJithu Joseph
304fed696ceSJithu Joseph do {
305fed696ceSJithu Joseph if (time_after(jiffies, timeout)) {
306fed696ceSJithu Joseph timed_out = true;
307fed696ceSJithu Joseph break;
308fed696ceSJithu Joseph }
309fed696ceSJithu Joseph atomic_set(&array_cpus_out, 0);
310fed696ceSJithu Joseph stop_core_cpuslocked(cpu, do_array_test, &command);
311fed696ceSJithu Joseph
312fed696ceSJithu Joseph if (command.ctrl_result)
313fed696ceSJithu Joseph break;
314fed696ceSJithu Joseph } while (command.array_bitmask);
315fed696ceSJithu Joseph
316fed696ceSJithu Joseph ifsd->scan_details = command.data;
317fed696ceSJithu Joseph
318fed696ceSJithu Joseph if (command.ctrl_result)
319fed696ceSJithu Joseph ifsd->status = SCAN_TEST_FAIL;
320fed696ceSJithu Joseph else if (timed_out || command.array_bitmask)
321fed696ceSJithu Joseph ifsd->status = SCAN_NOT_TESTED;
322fed696ceSJithu Joseph else
323fed696ceSJithu Joseph ifsd->status = SCAN_TEST_PASS;
324fed696ceSJithu Joseph }
325fed696ceSJithu Joseph
3262b40e654SJithu Joseph /*
3272b40e654SJithu Joseph * Initiate per core test. It wakes up work queue threads on the target cpu and
3282b40e654SJithu Joseph * its sibling cpu. Once all sibling threads wake up, the scan test gets executed and
3292b40e654SJithu Joseph * wait for all sibling threads to finish the scan test.
3302b40e654SJithu Joseph */
do_core_test(int cpu,struct device * dev)3312b40e654SJithu Joseph int do_core_test(int cpu, struct device *dev)
3322b40e654SJithu Joseph {
3335210fb4eSJithu Joseph const struct ifs_test_caps *test = ifs_get_test_caps(dev);
3345210fb4eSJithu Joseph struct ifs_data *ifsd = ifs_get_data(dev);
3352b40e654SJithu Joseph int ret = 0;
3362b40e654SJithu Joseph
3372b40e654SJithu Joseph /* Prevent CPUs from being taken offline during the scan test */
3382b40e654SJithu Joseph cpus_read_lock();
3392b40e654SJithu Joseph
3402b40e654SJithu Joseph if (!cpu_online(cpu)) {
3412b40e654SJithu Joseph dev_info(dev, "cannot test on the offline cpu %d\n", cpu);
3422b40e654SJithu Joseph ret = -EINVAL;
3432b40e654SJithu Joseph goto out;
3442b40e654SJithu Joseph }
3452b40e654SJithu Joseph
3465210fb4eSJithu Joseph switch (test->test_num) {
3475210fb4eSJithu Joseph case IFS_TYPE_SAF:
3485210fb4eSJithu Joseph if (!ifsd->loaded)
3492545debaSJithu Joseph ret = -EPERM;
3502545debaSJithu Joseph else
3512b40e654SJithu Joseph ifs_test_core(cpu, dev);
3525210fb4eSJithu Joseph break;
3535210fb4eSJithu Joseph case IFS_TYPE_ARRAY_BIST:
354fed696ceSJithu Joseph ifs_array_test_core(cpu, dev);
355fed696ceSJithu Joseph break;
3565210fb4eSJithu Joseph default:
3572545debaSJithu Joseph ret = -EINVAL;
3585210fb4eSJithu Joseph }
3592b40e654SJithu Joseph out:
3602b40e654SJithu Joseph cpus_read_unlock();
3612b40e654SJithu Joseph return ret;
3622b40e654SJithu Joseph }
363