xref: /openbmc/linux/drivers/gpu/drm/amd/amdgpu/umc_v8_7.c (revision cdbb816b)
1 /*
2  * Copyright 2020 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  *
22  */
23 #include "umc_v8_7.h"
24 #include "amdgpu_ras.h"
25 #include "amdgpu_umc.h"
26 #include "amdgpu.h"
27 
28 #include "rsmu/rsmu_0_0_2_offset.h"
29 #include "rsmu/rsmu_0_0_2_sh_mask.h"
30 #include "umc/umc_8_7_0_offset.h"
31 #include "umc/umc_8_7_0_sh_mask.h"
32 
/*
 * Register-offset stride between two consecutive UMC instances; it is
 * multiplied by the UMC instance number when a per-channel register
 * offset is computed in this file.
 */
#define UMC_8_INST_DIST			0x40000
34 
35 const uint32_t
36 	umc_v8_7_channel_idx_tbl[UMC_V8_7_UMC_INSTANCE_NUM][UMC_V8_7_CHANNEL_INSTANCE_NUM] = {
37 		{2, 11},  {4, 13},
38 		{1, 8},   {7, 14},
39 		{10, 3},  {12, 5},
40 		{9, 0},   {15, 6}
41 };
42 
43 static inline uint32_t get_umc_v8_7_reg_offset(struct amdgpu_device *adev,
44 					    uint32_t umc_inst,
45 					    uint32_t ch_inst)
46 {
47 	return adev->umc.channel_offs*ch_inst + UMC_8_INST_DIST*umc_inst;
48 }
49 
50 static void umc_v8_7_ecc_info_query_correctable_error_count(struct amdgpu_device *adev,
51 						uint32_t umc_inst, uint32_t ch_inst,
52 						unsigned long *error_count)
53 {
54 	uint64_t mc_umc_status;
55 	uint32_t eccinfo_table_idx;
56 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
57 
58 	eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;
59 
60 	/* check for SRAM correctable error
61 	 * MCUMC_STATUS is a 64 bit register
62 	 */
63 	mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
64 	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
65 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
66 		*error_count += 1;
67 }
68 
69 static void umc_v8_7_ecc_info_querry_uncorrectable_error_count(struct amdgpu_device *adev,
70 							uint32_t umc_inst, uint32_t ch_inst,
71 							unsigned long *error_count)
72 {
73 	uint64_t mc_umc_status;
74 	uint32_t eccinfo_table_idx;
75 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
76 
77 	eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;
78 
79 	/* check the MCUMC_STATUS */
80 	mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
81 	if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
82 	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
83 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
84 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
85 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
86 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1))
87 		*error_count += 1;
88 }
89 
90 static void umc_v8_7_ecc_info_query_ras_error_count(struct amdgpu_device *adev,
91 					void *ras_error_status)
92 {
93 	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
94 
95 	uint32_t umc_inst        = 0;
96 	uint32_t ch_inst         = 0;
97 
98 	/* TODO: driver needs to toggle DF Cstate to ensure
99 	 * safe access of UMC registers. Will add the protection
100 	 */
101 	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
102 		umc_v8_7_ecc_info_query_correctable_error_count(adev,
103 							umc_inst, ch_inst,
104 							&(err_data->ce_count));
105 		umc_v8_7_ecc_info_querry_uncorrectable_error_count(adev,
106 							umc_inst, ch_inst,
107 							&(err_data->ue_count));
108 	}
109 }
110 
111 static void umc_v8_7_ecc_info_query_error_address(struct amdgpu_device *adev,
112 					struct ras_err_data *err_data,
113 					uint32_t ch_inst,
114 					uint32_t umc_inst)
115 {
116 	uint64_t mc_umc_status, err_addr, retired_page;
117 	uint32_t channel_index;
118 	uint32_t eccinfo_table_idx;
119 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
120 
121 	eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;
122 	channel_index =
123 		adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];
124 
125 	mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
126 
127 	if (mc_umc_status == 0)
128 		return;
129 
130 	if (!err_data->err_addr)
131 		return;
132 
133 	/* calculate error address if ue error is detected */
134 	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
135 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1) {
136 
137 		err_addr = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_addr;
138 		err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
139 
140 		/* translate umc channel address to soc pa, 3 parts are included */
141 		retired_page = ADDR_OF_4KB_BLOCK(err_addr) |
142 				ADDR_OF_256B_BLOCK(channel_index) |
143 				OFFSET_IN_256B_BLOCK(err_addr);
144 
145 		amdgpu_umc_fill_error_record(err_data, err_addr,
146 					retired_page, channel_index, umc_inst);
147 	}
148 }
149 
150 static void umc_v8_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev,
151 					void *ras_error_status)
152 {
153 	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
154 
155 	uint32_t umc_inst        = 0;
156 	uint32_t ch_inst         = 0;
157 
158 	/* TODO: driver needs to toggle DF Cstate to ensure
159 	 * safe access of UMC resgisters. Will add the protection
160 	 * when firmware interface is ready
161 	 */
162 	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
163 		umc_v8_7_ecc_info_query_error_address(adev,
164 						err_data,
165 						ch_inst,
166 						umc_inst);
167 	}
168 }
169 
170 static void umc_v8_7_clear_error_count_per_channel(struct amdgpu_device *adev,
171 					uint32_t umc_reg_offset)
172 {
173 	uint32_t ecc_err_cnt_addr;
174 	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
175 
176 	ecc_err_cnt_sel_addr =
177 		SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCntSel);
178 	ecc_err_cnt_addr =
179 		SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCnt);
180 
181 	/* select the lower chip */
182 	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
183 					umc_reg_offset) * 4);
184 	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
185 					UMCCH0_0_GeccErrCntSel,
186 					GeccErrCntCsSel, 0);
187 	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
188 			ecc_err_cnt_sel);
189 
190 	/* clear lower chip error count */
191 	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
192 			UMC_V8_7_CE_CNT_INIT);
193 
194 	/* select the higher chip */
195 	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
196 					umc_reg_offset) * 4);
197 	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
198 					UMCCH0_0_GeccErrCntSel,
199 					GeccErrCntCsSel, 1);
200 	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
201 			ecc_err_cnt_sel);
202 
203 	/* clear higher chip error count */
204 	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
205 			UMC_V8_7_CE_CNT_INIT);
206 }
207 
208 static void umc_v8_7_clear_error_count(struct amdgpu_device *adev)
209 {
210 	uint32_t umc_inst        = 0;
211 	uint32_t ch_inst         = 0;
212 	uint32_t umc_reg_offset  = 0;
213 
214 	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
215 		umc_reg_offset = get_umc_v8_7_reg_offset(adev,
216 						umc_inst,
217 						ch_inst);
218 
219 		umc_v8_7_clear_error_count_per_channel(adev,
220 						umc_reg_offset);
221 	}
222 }
223 
224 static void umc_v8_7_query_correctable_error_count(struct amdgpu_device *adev,
225 						   uint32_t umc_reg_offset,
226 						   unsigned long *error_count)
227 {
228 	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
229 	uint32_t ecc_err_cnt, ecc_err_cnt_addr;
230 	uint64_t mc_umc_status;
231 	uint32_t mc_umc_status_addr;
232 
233 	/* UMC 8_7_2 registers */
234 	ecc_err_cnt_sel_addr =
235 		SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCntSel);
236 	ecc_err_cnt_addr =
237 		SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCnt);
238 	mc_umc_status_addr =
239 		SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
240 
241 	/* select the lower chip and check the error count */
242 	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
243 	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
244 					GeccErrCntCsSel, 0);
245 	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
246 
247 	ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
248 	*error_count +=
249 		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_GeccErrCnt, GeccErrCnt) -
250 		 UMC_V8_7_CE_CNT_INIT);
251 
252 	/* select the higher chip and check the err counter */
253 	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
254 					GeccErrCntCsSel, 1);
255 	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
256 
257 	ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
258 	*error_count +=
259 		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_GeccErrCnt, GeccErrCnt) -
260 		 UMC_V8_7_CE_CNT_INIT);
261 
262 	/* check for SRAM correctable error
263 	  MCUMC_STATUS is a 64 bit register */
264 	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
265 	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, ErrorCodeExt) == 6 &&
266 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
267 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
268 		*error_count += 1;
269 }
270 
271 static void umc_v8_7_querry_uncorrectable_error_count(struct amdgpu_device *adev,
272 						      uint32_t umc_reg_offset,
273 						      unsigned long *error_count)
274 {
275 	uint64_t mc_umc_status;
276 	uint32_t mc_umc_status_addr;
277 
278 	mc_umc_status_addr = SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
279 
280 	/* check the MCUMC_STATUS */
281 	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
282 	if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
283 	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
284 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
285 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
286 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
287 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1))
288 		*error_count += 1;
289 }
290 
291 static void umc_v8_7_query_ras_error_count(struct amdgpu_device *adev,
292 					   void *ras_error_status)
293 {
294 	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
295 
296 	uint32_t umc_inst        = 0;
297 	uint32_t ch_inst         = 0;
298 	uint32_t umc_reg_offset  = 0;
299 
300 	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
301 		umc_reg_offset = get_umc_v8_7_reg_offset(adev,
302 						      umc_inst,
303 						      ch_inst);
304 
305 		umc_v8_7_query_correctable_error_count(adev,
306 						       umc_reg_offset,
307 						       &(err_data->ce_count));
308 		umc_v8_7_querry_uncorrectable_error_count(adev,
309 							  umc_reg_offset,
310 							  &(err_data->ue_count));
311 	}
312 
313 	umc_v8_7_clear_error_count(adev);
314 }
315 
316 static void umc_v8_7_query_error_address(struct amdgpu_device *adev,
317 					 struct ras_err_data *err_data,
318 					 uint32_t umc_reg_offset,
319 					 uint32_t ch_inst,
320 					 uint32_t umc_inst)
321 {
322 	uint32_t lsb, mc_umc_status_addr;
323 	uint64_t mc_umc_status, err_addr, retired_page, mc_umc_addrt0;
324 	uint32_t channel_index = adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];
325 
326 	mc_umc_status_addr =
327 		SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
328 	mc_umc_addrt0 =
329 		SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_ADDRT0);
330 
331 	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
332 
333 	if (mc_umc_status == 0)
334 		return;
335 
336 	if (!err_data->err_addr) {
337 		/* clear umc status */
338 		WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
339 		return;
340 	}
341 
342 	/* calculate error address if ue error is detected */
343 	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
344 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1) {
345 
346 		err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4);
347 		/* the lowest lsb bits should be ignored */
348 		lsb = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, LSB);
349 		err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
350 		err_addr &= ~((0x1ULL << lsb) - 1);
351 
352 		/* translate umc channel address to soc pa, 3 parts are included */
353 		retired_page = ADDR_OF_4KB_BLOCK(err_addr) |
354 				ADDR_OF_256B_BLOCK(channel_index) |
355 				OFFSET_IN_256B_BLOCK(err_addr);
356 
357 		amdgpu_umc_fill_error_record(err_data, err_addr,
358 					retired_page, channel_index, umc_inst);
359 	}
360 
361 	/* clear umc status */
362 	WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
363 }
364 
365 static void umc_v8_7_query_ras_error_address(struct amdgpu_device *adev,
366 					     void *ras_error_status)
367 {
368 	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
369 
370 	uint32_t umc_inst        = 0;
371 	uint32_t ch_inst         = 0;
372 	uint32_t umc_reg_offset  = 0;
373 
374 	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
375 		umc_reg_offset = get_umc_v8_7_reg_offset(adev,
376 						      umc_inst,
377 						      ch_inst);
378 
379 		umc_v8_7_query_error_address(adev,
380 					     err_data,
381 					     umc_reg_offset,
382 					     ch_inst,
383 					     umc_inst);
384 	}
385 }
386 
387 static void umc_v8_7_err_cnt_init_per_channel(struct amdgpu_device *adev,
388 					      uint32_t umc_reg_offset)
389 {
390 	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
391 	uint32_t ecc_err_cnt_addr;
392 
393 	ecc_err_cnt_sel_addr =
394 		SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCntSel);
395 	ecc_err_cnt_addr =
396 		SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCnt);
397 
398 	/* select the lower chip and check the error count */
399 	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
400 	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
401 					GeccErrCntCsSel, 0);
402 	/* set ce error interrupt type to APIC based interrupt */
403 	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
404 					GeccErrInt, 0x1);
405 	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
406 	/* set error count to initial value */
407 	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V8_7_CE_CNT_INIT);
408 
409 	/* select the higher chip and check the err counter */
410 	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
411 					GeccErrCntCsSel, 1);
412 	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
413 	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V8_7_CE_CNT_INIT);
414 }
415 
416 static void umc_v8_7_err_cnt_init(struct amdgpu_device *adev)
417 {
418 	uint32_t umc_inst        = 0;
419 	uint32_t ch_inst         = 0;
420 	uint32_t umc_reg_offset  = 0;
421 
422 	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
423 		umc_reg_offset = get_umc_v8_7_reg_offset(adev,
424 						      umc_inst,
425 						      ch_inst);
426 
427 		umc_v8_7_err_cnt_init_per_channel(adev, umc_reg_offset);
428 	}
429 }
430 
431 const struct amdgpu_ras_block_hw_ops umc_v8_7_ras_hw_ops = {
432 	.query_ras_error_count = umc_v8_7_query_ras_error_count,
433 	.query_ras_error_address = umc_v8_7_query_ras_error_address,
434 };
435 
436 struct amdgpu_umc_ras umc_v8_7_ras = {
437 	.ras_block = {
438 		.hw_ops = &umc_v8_7_ras_hw_ops,
439 	},
440 	.err_cnt_init = umc_v8_7_err_cnt_init,
441 	.ecc_info_query_ras_error_count = umc_v8_7_ecc_info_query_ras_error_count,
442 	.ecc_info_query_ras_error_address = umc_v8_7_ecc_info_query_ras_error_address,
443 };
444