xref: /openbmc/linux/drivers/misc/smpro-errmon.c (revision ef4290e6)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Ampere Computing SoC's SMpro Error Monitoring Driver
4  *
5  * Copyright (c) 2022, Ampere Computing LLC
6  *
7  */
8 
9 #include <linux/i2c.h>
10 #include <linux/mod_devicetable.h>
11 #include <linux/module.h>
12 #include <linux/platform_device.h>
13 #include <linux/regmap.h>
14 
/*
 * SMpro register addresses used by this driver, listed in ascending
 * address order.
 */

/* Event Data Registers */
#define VRD_WARN_FAULT_EVENT_DATA	0x78
#define VRD_HOT_EVENT_DATA		0x79
#define DIMM_HOT_EVENT_DATA		0x7A

/* GPI RAS Error Registers */
#define GPI_RAS_ERR		0x7E

/* Core and L2C Error Registers: count, length and data per CE/UE */
#define CORE_CE_ERR_CNT		0x80
#define CORE_CE_ERR_LEN		0x81
#define CORE_CE_ERR_DATA	0x82
#define CORE_UE_ERR_CNT		0x83
#define CORE_UE_ERR_LEN		0x84
#define CORE_UE_ERR_DATA	0x85

/* Memory Error Registers: count, length and data per CE/UE */
#define MEM_CE_ERR_CNT		0x90
#define MEM_CE_ERR_LEN		0x91
#define MEM_CE_ERR_DATA		0x92
#define MEM_UE_ERR_CNT		0x93
#define MEM_UE_ERR_LEN		0x94
#define MEM_UE_ERR_DATA		0x95

/* RAS Error/Warning Registers: type */
#define ERR_SMPRO_TYPE		0xA0
#define ERR_PMPRO_TYPE		0xA1
/* RAS Error/Warning Registers: SMpro error info and data */
#define ERR_SMPRO_INFO_LO	0xA2
#define ERR_SMPRO_INFO_HI	0xA3
#define ERR_SMPRO_DATA_LO	0xA4
#define ERR_SMPRO_DATA_HI	0xA5
/* RAS Error/Warning Registers: PMpro error info and data */
#define ERR_PMPRO_INFO_LO	0xA6
#define ERR_PMPRO_INFO_HI	0xA7
#define ERR_PMPRO_DATA_LO	0xA8
#define ERR_PMPRO_DATA_HI	0xA9
/* RAS Error/Warning Registers: warning info */
#define WARN_SMPRO_INFO_LO	0xAA
#define WARN_SMPRO_INFO_HI	0xAB
#define WARN_PMPRO_INFO_LO	0xAC
#define WARN_PMPRO_INFO_HI	0xAD

/* PCIE Error Registers: count, length and data per CE/UE */
#define PCIE_CE_ERR_CNT		0xC0
#define PCIE_CE_ERR_LEN		0xC1
#define PCIE_CE_ERR_DATA	0xC2
#define PCIE_UE_ERR_CNT		0xC3
#define PCIE_UE_ERR_LEN		0xC4
#define PCIE_UE_ERR_DATA	0xC5

/* Other Error Registers: count, length and data per CE/UE */
#define OTHER_CE_ERR_CNT	0xD0
#define OTHER_CE_ERR_LEN	0xD1
#define OTHER_CE_ERR_DATA	0xD2
#define OTHER_UE_ERR_CNT	0xD8
#define OTHER_UE_ERR_LEN	0xD9
#define OTHER_UE_ERR_DATA	0xDA
70 
/* Size in bytes of one CE/UE error record as read out and reported */
#define MAX_READ_BLOCK_LENGTH	48

/* Channel indices into list_smpro_int_error_hdr[] */
#define RAS_SMPRO_ERR		0
#define RAS_PMPRO_ERR		1
75 
/*
 * Error types reported as 48-byte records. The values are used as
 * indices into smpro_error_table[].
 */
enum RAS_48BYTES_ERR_TYPES {
	CORE_CE_ERR = 0,
	CORE_UE_ERR,
	MEM_CE_ERR,
	MEM_UE_ERR,
	PCIE_CE_ERR,
	PCIE_UE_ERR,
	OTHER_CE_ERR,
	OTHER_UE_ERR,
	/* Number of error types above; keep last */
	NUM_48BYTES_ERR_TYPE,
};
87 
88 struct smpro_error_hdr {
89 	u8 count;	/* Number of the RAS errors */
90 	u8 len;		/* Number of data bytes */
91 	u8 data;	/* Start of 48-byte data */
92 	u8 max_cnt;	/* Max num of errors */
93 };
94 
95 /*
96  * Included Address of registers to get Count, Length of data and Data
97  * of the 48 bytes error data
98  */
99 static struct smpro_error_hdr smpro_error_table[] = {
100 	[CORE_CE_ERR] = {
101 		.count = CORE_CE_ERR_CNT,
102 		.len = CORE_CE_ERR_LEN,
103 		.data = CORE_CE_ERR_DATA,
104 		.max_cnt = 32
105 	},
106 	[CORE_UE_ERR] = {
107 		.count = CORE_UE_ERR_CNT,
108 		.len = CORE_UE_ERR_LEN,
109 		.data = CORE_UE_ERR_DATA,
110 		.max_cnt = 32
111 	},
112 	[MEM_CE_ERR] = {
113 		.count = MEM_CE_ERR_CNT,
114 		.len = MEM_CE_ERR_LEN,
115 		.data = MEM_CE_ERR_DATA,
116 		.max_cnt = 16
117 	},
118 	[MEM_UE_ERR] = {
119 		.count = MEM_UE_ERR_CNT,
120 		.len = MEM_UE_ERR_LEN,
121 		.data = MEM_UE_ERR_DATA,
122 		.max_cnt = 16
123 	},
124 	[PCIE_CE_ERR] = {
125 		.count = PCIE_CE_ERR_CNT,
126 		.len = PCIE_CE_ERR_LEN,
127 		.data = PCIE_CE_ERR_DATA,
128 		.max_cnt = 96
129 	},
130 	[PCIE_UE_ERR] = {
131 		.count = PCIE_UE_ERR_CNT,
132 		.len = PCIE_UE_ERR_LEN,
133 		.data = PCIE_UE_ERR_DATA,
134 		.max_cnt = 96
135 	},
136 	[OTHER_CE_ERR] = {
137 		.count = OTHER_CE_ERR_CNT,
138 		.len = OTHER_CE_ERR_LEN,
139 		.data = OTHER_CE_ERR_DATA,
140 		.max_cnt = 8
141 	},
142 	[OTHER_UE_ERR] = {
143 		.count = OTHER_UE_ERR_CNT,
144 		.len = OTHER_UE_ERR_LEN,
145 		.data = OTHER_UE_ERR_DATA,
146 		.max_cnt = 8
147 	},
148 };
149 
150 /*
151  * List of SCP registers which are used to get
152  * one type of RAS Internal errors.
153  */
154 struct smpro_int_error_hdr {
155 	u8 type;
156 	u8 info_l;
157 	u8 info_h;
158 	u8 data_l;
159 	u8 data_h;
160 	u8 warn_l;
161 	u8 warn_h;
162 };
163 
164 static struct smpro_int_error_hdr list_smpro_int_error_hdr[] = {
165 	[RAS_SMPRO_ERR] = {
166 		.type = ERR_SMPRO_TYPE,
167 		.info_l = ERR_SMPRO_INFO_LO,
168 		.info_h = ERR_SMPRO_INFO_HI,
169 		.data_l = ERR_SMPRO_DATA_LO,
170 		.data_h = ERR_SMPRO_DATA_HI,
171 		.warn_l = WARN_SMPRO_INFO_LO,
172 		.warn_h = WARN_SMPRO_INFO_HI,
173 	},
174 	[RAS_PMPRO_ERR] = {
175 		.type = ERR_PMPRO_TYPE,
176 		.info_l = ERR_PMPRO_INFO_LO,
177 		.info_h = ERR_PMPRO_INFO_HI,
178 		.data_l = ERR_PMPRO_DATA_LO,
179 		.data_h = ERR_PMPRO_DATA_HI,
180 		.warn_l = WARN_PMPRO_INFO_LO,
181 		.warn_h = WARN_PMPRO_INFO_HI,
182 	},
183 };
184 
/**
 * struct smpro_errmon - driver data of one smpro-errmon platform device
 * @regmap: register map obtained from the parent device at probe time
 */
struct smpro_errmon {
	struct regmap *regmap;
};
188 
/* Event kinds; the values are used as indices into smpro_event_table[] */
enum EVENT_TYPES {
	VRD_WARN_FAULT_EVENT = 0,
	VRD_HOT_EVENT,
	DIMM_HOT_EVENT,
	/* Number of event kinds above; keep last */
	NUM_EVENTS_TYPE,
};
195 
196 /* Included Address of event source and data registers */
197 static u8 smpro_event_table[NUM_EVENTS_TYPE] = {
198 	VRD_WARN_FAULT_EVENT_DATA,
199 	VRD_HOT_EVENT_DATA,
200 	DIMM_HOT_EVENT_DATA,
201 };
202 
203 static ssize_t smpro_event_data_read(struct device *dev,
204 				     struct device_attribute *da, char *buf,
205 				     int channel)
206 {
207 	struct smpro_errmon *errmon = dev_get_drvdata(dev);
208 	s32 event_data;
209 	int ret;
210 
211 	ret = regmap_read(errmon->regmap, smpro_event_table[channel], &event_data);
212 	if (ret)
213 		return ret;
214 	/* Clear event after read */
215 	if (event_data != 0)
216 		regmap_write(errmon->regmap, smpro_event_table[channel], event_data);
217 
218 	return sysfs_emit(buf, "%04x\n", event_data);
219 }
220 
221 static ssize_t smpro_overflow_data_read(struct device *dev, struct device_attribute *da,
222 					char *buf, int channel)
223 {
224 	struct smpro_errmon *errmon = dev_get_drvdata(dev);
225 	struct smpro_error_hdr *err_info;
226 	s32 err_count;
227 	int ret;
228 
229 	err_info = &smpro_error_table[channel];
230 
231 	ret = regmap_read(errmon->regmap, err_info->count, &err_count);
232 	if (ret)
233 		return ret;
234 
235 	/* Bit 8 indicates the overflow status */
236 	return sysfs_emit(buf, "%d\n", (err_count & BIT(8)) ? 1 : 0);
237 }
238 
239 static ssize_t smpro_error_data_read(struct device *dev, struct device_attribute *da,
240 				     char *buf, int channel)
241 {
242 	struct smpro_errmon *errmon = dev_get_drvdata(dev);
243 	unsigned char err_data[MAX_READ_BLOCK_LENGTH];
244 	struct smpro_error_hdr *err_info;
245 	s32 err_count, err_length;
246 	int ret;
247 
248 	err_info = &smpro_error_table[channel];
249 
250 	ret = regmap_read(errmon->regmap, err_info->count, &err_count);
251 	/* Error count is the low byte */
252 	err_count &= 0xff;
253 	if (ret || !err_count || err_count > err_info->max_cnt)
254 		return ret;
255 
256 	ret = regmap_read(errmon->regmap, err_info->len, &err_length);
257 	if (ret || err_length <= 0)
258 		return ret;
259 
260 	if (err_length > MAX_READ_BLOCK_LENGTH)
261 		err_length = MAX_READ_BLOCK_LENGTH;
262 
263 	memset(err_data, 0x00, MAX_READ_BLOCK_LENGTH);
264 	ret = regmap_noinc_read(errmon->regmap, err_info->data, err_data, err_length);
265 	if (ret < 0)
266 		return ret;
267 
268 	/* clear the error */
269 	ret = regmap_write(errmon->regmap, err_info->count, 0x100);
270 	if (ret)
271 		return ret;
272 	/*
273 	 * The output of Core/Memory/PCIe/Others UE/CE errors follows the format
274 	 * specified in section 5.8.1 CE/UE Error Data record in
275 	 * Altra SOC BMC Interface specification.
276 	 */
277 	return sysfs_emit(buf, "%*phN\n", MAX_READ_BLOCK_LENGTH, err_data);
278 }
279 
280 /*
281  * Output format:
282  * <4-byte hex value of error info><4-byte hex value of error extensive data>
283  * Where:
284  *   + error info : The error information
285  *   + error data : Extensive data (32 bits)
286  * Reference to section 5.10 RAS Internal Error Register Definition in
287  * Altra SOC BMC Interface specification
288  */
289 static ssize_t smpro_internal_err_read(struct device *dev, struct device_attribute *da,
290 				       char *buf, int channel)
291 {
292 	struct smpro_errmon *errmon = dev_get_drvdata(dev);
293 	struct smpro_int_error_hdr *err_info;
294 	unsigned int err[4] = { 0 };
295 	unsigned int err_type;
296 	unsigned int val;
297 	int ret;
298 
299 	/* read error status */
300 	ret = regmap_read(errmon->regmap, GPI_RAS_ERR, &val);
301 	if (ret)
302 		return ret;
303 
304 	if ((channel == RAS_SMPRO_ERR && !(val & BIT(0))) ||
305 	    (channel == RAS_PMPRO_ERR && !(val & BIT(1))))
306 		return 0;
307 
308 	err_info = &list_smpro_int_error_hdr[channel];
309 	ret = regmap_read(errmon->regmap, err_info->type, &val);
310 	if (ret)
311 		return ret;
312 
313 	err_type = (val & BIT(1)) ? BIT(1) :
314 		   (val & BIT(2)) ? BIT(2) : 0;
315 
316 	if (!err_type)
317 		return 0;
318 
319 	ret = regmap_read(errmon->regmap, err_info->info_l, err + 1);
320 	if (ret)
321 		return ret;
322 
323 	ret = regmap_read(errmon->regmap, err_info->info_h, err);
324 	if (ret)
325 		return ret;
326 
327 	if (err_type & BIT(2)) {
328 		/* Error with data type */
329 		ret = regmap_read(errmon->regmap, err_info->data_l, err + 3);
330 		if (ret)
331 			return ret;
332 
333 		ret = regmap_read(errmon->regmap, err_info->data_h, err + 2);
334 		if (ret)
335 			return ret;
336 	}
337 
338 	/* clear the read errors */
339 	ret = regmap_write(errmon->regmap, err_info->type, err_type);
340 	if (ret)
341 		return ret;
342 
343 	return sysfs_emit(buf, "%*phN\n", (int)sizeof(err), err);
344 }
345 
346 /*
347  * Output format:
348  * <4-byte hex value of warining info>
349  * Reference to section 5.10 RAS Internal Error Register Definition in
350  * Altra SOC BMC Interface specification
351  */
352 static ssize_t smpro_internal_warn_read(struct device *dev, struct device_attribute *da,
353 					char *buf, int channel)
354 {
355 	struct smpro_errmon *errmon = dev_get_drvdata(dev);
356 	struct smpro_int_error_hdr *err_info;
357 	unsigned int warn[2] = { 0 };
358 	unsigned int val;
359 	int ret;
360 
361 	/* read error status */
362 	ret = regmap_read(errmon->regmap, GPI_RAS_ERR, &val);
363 	if (ret)
364 		return ret;
365 
366 	if ((channel == RAS_SMPRO_ERR && !(val & BIT(0))) ||
367 	    (channel == RAS_PMPRO_ERR && !(val & BIT(1))))
368 		return 0;
369 
370 	err_info = &list_smpro_int_error_hdr[channel];
371 	ret = regmap_read(errmon->regmap, err_info->type, &val);
372 	if (ret)
373 		return ret;
374 
375 	if (!(val & BIT(0)))
376 		return 0;
377 
378 	ret = regmap_read(errmon->regmap, err_info->warn_l, warn + 1);
379 	if (ret)
380 		return ret;
381 
382 	ret = regmap_read(errmon->regmap, err_info->warn_h, warn);
383 	if (ret)
384 		return ret;
385 
386 	/* clear the warning */
387 	ret = regmap_write(errmon->regmap, err_info->type, BIT(0));
388 	if (ret)
389 		return ret;
390 
391 	return sysfs_emit(buf, "%*phN\n", (int)sizeof(warn), warn);
392 }
393 
394 #define ERROR_OVERFLOW_RO(_error, _index) \
395 	static ssize_t overflow_##_error##_show(struct device *dev,            \
396 						struct device_attribute *da,   \
397 						char *buf)                     \
398 	{                                                                      \
399 		return smpro_overflow_data_read(dev, da, buf, _index);         \
400 	}                                                                      \
401 	static DEVICE_ATTR_RO(overflow_##_error)
402 
403 ERROR_OVERFLOW_RO(core_ce, CORE_CE_ERR);
404 ERROR_OVERFLOW_RO(core_ue, CORE_UE_ERR);
405 ERROR_OVERFLOW_RO(mem_ce, MEM_CE_ERR);
406 ERROR_OVERFLOW_RO(mem_ue, MEM_UE_ERR);
407 ERROR_OVERFLOW_RO(pcie_ce, PCIE_CE_ERR);
408 ERROR_OVERFLOW_RO(pcie_ue, PCIE_UE_ERR);
409 ERROR_OVERFLOW_RO(other_ce, OTHER_CE_ERR);
410 ERROR_OVERFLOW_RO(other_ue, OTHER_UE_ERR);
411 
412 #define ERROR_RO(_error, _index) \
413 	static ssize_t error_##_error##_show(struct device *dev,            \
414 					     struct device_attribute *da,   \
415 					     char *buf)                     \
416 	{                                                                   \
417 		return smpro_error_data_read(dev, da, buf, _index);         \
418 	}                                                                   \
419 	static DEVICE_ATTR_RO(error_##_error)
420 
421 ERROR_RO(core_ce, CORE_CE_ERR);
422 ERROR_RO(core_ue, CORE_UE_ERR);
423 ERROR_RO(mem_ce, MEM_CE_ERR);
424 ERROR_RO(mem_ue, MEM_UE_ERR);
425 ERROR_RO(pcie_ce, PCIE_CE_ERR);
426 ERROR_RO(pcie_ue, PCIE_UE_ERR);
427 ERROR_RO(other_ce, OTHER_CE_ERR);
428 ERROR_RO(other_ue, OTHER_UE_ERR);
429 
430 static ssize_t error_smpro_show(struct device *dev, struct device_attribute *da, char *buf)
431 {
432 	return smpro_internal_err_read(dev, da, buf, RAS_SMPRO_ERR);
433 }
434 static DEVICE_ATTR_RO(error_smpro);
435 
436 static ssize_t error_pmpro_show(struct device *dev, struct device_attribute *da, char *buf)
437 {
438 	return smpro_internal_err_read(dev, da, buf, RAS_PMPRO_ERR);
439 }
440 static DEVICE_ATTR_RO(error_pmpro);
441 
442 static ssize_t warn_smpro_show(struct device *dev, struct device_attribute *da, char *buf)
443 {
444 	return smpro_internal_warn_read(dev, da, buf, RAS_SMPRO_ERR);
445 }
446 static DEVICE_ATTR_RO(warn_smpro);
447 
448 static ssize_t warn_pmpro_show(struct device *dev, struct device_attribute *da, char *buf)
449 {
450 	return smpro_internal_warn_read(dev, da, buf, RAS_PMPRO_ERR);
451 }
452 static DEVICE_ATTR_RO(warn_pmpro);
453 
454 #define EVENT_RO(_event, _index) \
455 	static ssize_t event_##_event##_show(struct device *dev,            \
456 					     struct device_attribute *da,   \
457 					     char *buf)                     \
458 	{                                                                   \
459 		return smpro_event_data_read(dev, da, buf, _index);         \
460 	}                                                                   \
461 	static DEVICE_ATTR_RO(event_##_event)
462 
463 EVENT_RO(vrd_warn_fault, VRD_WARN_FAULT_EVENT);
464 EVENT_RO(vrd_hot, VRD_HOT_EVENT);
465 EVENT_RO(dimm_hot, DIMM_HOT_EVENT);
466 
467 static struct attribute *smpro_errmon_attrs[] = {
468 	&dev_attr_overflow_core_ce.attr,
469 	&dev_attr_overflow_core_ue.attr,
470 	&dev_attr_overflow_mem_ce.attr,
471 	&dev_attr_overflow_mem_ue.attr,
472 	&dev_attr_overflow_pcie_ce.attr,
473 	&dev_attr_overflow_pcie_ue.attr,
474 	&dev_attr_overflow_other_ce.attr,
475 	&dev_attr_overflow_other_ue.attr,
476 	&dev_attr_error_core_ce.attr,
477 	&dev_attr_error_core_ue.attr,
478 	&dev_attr_error_mem_ce.attr,
479 	&dev_attr_error_mem_ue.attr,
480 	&dev_attr_error_pcie_ce.attr,
481 	&dev_attr_error_pcie_ue.attr,
482 	&dev_attr_error_other_ce.attr,
483 	&dev_attr_error_other_ue.attr,
484 	&dev_attr_error_smpro.attr,
485 	&dev_attr_error_pmpro.attr,
486 	&dev_attr_warn_smpro.attr,
487 	&dev_attr_warn_pmpro.attr,
488 	&dev_attr_event_vrd_warn_fault.attr,
489 	&dev_attr_event_vrd_hot.attr,
490 	&dev_attr_event_dimm_hot.attr,
491 	NULL
492 };
493 
494 ATTRIBUTE_GROUPS(smpro_errmon);
495 
496 static int smpro_errmon_probe(struct platform_device *pdev)
497 {
498 	struct smpro_errmon *errmon;
499 
500 	errmon = devm_kzalloc(&pdev->dev, sizeof(struct smpro_errmon), GFP_KERNEL);
501 	if (!errmon)
502 		return -ENOMEM;
503 
504 	platform_set_drvdata(pdev, errmon);
505 
506 	errmon->regmap = dev_get_regmap(pdev->dev.parent, NULL);
507 	if (!errmon->regmap)
508 		return -ENODEV;
509 
510 	return 0;
511 }
512 
513 static struct platform_driver smpro_errmon_driver = {
514 	.probe          = smpro_errmon_probe,
515 	.driver = {
516 		.name   = "smpro-errmon",
517 		.dev_groups = smpro_errmon_groups,
518 	},
519 };
520 
521 module_platform_driver(smpro_errmon_driver);
522 
523 MODULE_AUTHOR("Tung Nguyen <tung.nguyen@amperecomputing.com>");
524 MODULE_AUTHOR("Thinh Pham <thinh.pham@amperecomputing.com>");
525 MODULE_AUTHOR("Hoang Nguyen <hnguyen@amperecomputing.com>");
526 MODULE_AUTHOR("Thu Nguyen <thu@os.amperecomputing.com>");
527 MODULE_AUTHOR("Quan Nguyen <quan@os.amperecomputing.com>");
528 MODULE_DESCRIPTION("Ampere Altra SMpro driver");
529 MODULE_LICENSE("GPL");
530