xref: /openbmc/linux/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c (revision 36926a7d)
1 /*
2  * Copyright 2021 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  *
22  */
23 #include "umc_v6_7.h"
24 #include "amdgpu_ras.h"
25 #include "amdgpu_umc.h"
26 #include "amdgpu.h"
27 
28 #include "umc/umc_6_7_0_offset.h"
29 #include "umc/umc_6_7_0_sh_mask.h"
30 
31 const uint32_t
32 	umc_v6_7_channel_idx_tbl_second[UMC_V6_7_UMC_INSTANCE_NUM][UMC_V6_7_CHANNEL_INSTANCE_NUM] = {
33 		{28, 20, 24, 16, 12, 4, 8, 0},
34 		{6, 30, 2, 26, 22, 14, 18, 10},
35 		{19, 11, 15, 7, 3, 27, 31, 23},
36 		{9, 1, 5, 29, 25, 17, 21, 13}
37 };
38 const uint32_t
39 	umc_v6_7_channel_idx_tbl_first[UMC_V6_7_UMC_INSTANCE_NUM][UMC_V6_7_CHANNEL_INSTANCE_NUM] = {
40 		{19, 11, 15, 7,	3, 27, 31, 23},
41 		{9, 1, 5, 29, 25, 17, 21, 13},
42 		{28, 20, 24, 16, 12, 4, 8, 0},
43 		{6, 30, 2, 26, 22, 14, 18, 10},
44 };
45 
46 static inline uint32_t get_umc_v6_7_reg_offset(struct amdgpu_device *adev,
47 					      uint32_t umc_inst,
48 					      uint32_t ch_inst)
49 {
50 	uint32_t index = umc_inst * adev->umc.channel_inst_num + ch_inst;
51 
52 	/* adjust umc and channel index offset,
53 	 * the register address is not linear on each umc instace */
54 	umc_inst = index / 4;
55 	ch_inst = index % 4;
56 
57 	return adev->umc.channel_offs * ch_inst + UMC_V6_7_INST_DIST * umc_inst;
58 }
59 
60 static inline uint32_t get_umc_v6_7_channel_index(struct amdgpu_device *adev,
61 					      uint32_t umc_inst,
62 					      uint32_t ch_inst)
63 {
64 	return adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];
65 }
66 
67 static void umc_v6_7_query_error_status_helper(struct amdgpu_device *adev,
68 						  uint64_t mc_umc_status, uint32_t umc_reg_offset)
69 {
70 	uint32_t mc_umc_addr;
71 	uint64_t reg_value;
72 
73 	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1)
74 		dev_info(adev->dev, "Deferred error, no user action is needed.\n");
75 
76 	if (mc_umc_status)
77 		dev_info(adev->dev, "MCA STATUS 0x%llx, umc_reg_offset 0x%x\n", mc_umc_status, umc_reg_offset);
78 
79 	/* print IPID registers value */
80 	mc_umc_addr =
81 		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_IPIDT0);
82 	reg_value = RREG64_PCIE((mc_umc_addr + umc_reg_offset) * 4);
83 	if (reg_value)
84 		dev_info(adev->dev, "MCA IPID 0x%llx, umc_reg_offset 0x%x\n", reg_value, umc_reg_offset);
85 
86 	/* print SYND registers value */
87 	mc_umc_addr =
88 		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_SYNDT0);
89 	reg_value = RREG64_PCIE((mc_umc_addr + umc_reg_offset) * 4);
90 	if (reg_value)
91 		dev_info(adev->dev, "MCA SYND 0x%llx, umc_reg_offset 0x%x\n", reg_value, umc_reg_offset);
92 
93 	/* print MISC0 registers value */
94 	mc_umc_addr =
95 		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_MISC0T0);
96 	reg_value = RREG64_PCIE((mc_umc_addr + umc_reg_offset) * 4);
97 	if (reg_value)
98 		dev_info(adev->dev, "MCA MISC0 0x%llx, umc_reg_offset 0x%x\n", reg_value, umc_reg_offset);
99 }
100 
101 static void umc_v6_7_ecc_info_query_correctable_error_count(struct amdgpu_device *adev,
102 						   uint32_t umc_inst, uint32_t ch_inst,
103 						   unsigned long *error_count)
104 {
105 	uint64_t mc_umc_status;
106 	uint32_t eccinfo_table_idx;
107 	uint32_t umc_reg_offset;
108 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
109 
110 	umc_reg_offset = get_umc_v6_7_reg_offset(adev,
111 						umc_inst, ch_inst);
112 
113 	eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;
114 	/* check for SRAM correctable error
115 	  MCUMC_STATUS is a 64 bit register */
116 	mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
117 	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
118 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1) {
119 		*error_count += 1;
120 
121 		umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset);
122 
123 		if (ras->umc_ecc.record_ce_addr_supported)	{
124 			uint64_t err_addr, soc_pa;
125 			uint32_t channel_index =
126 				adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];
127 
128 			err_addr = ras->umc_ecc.ecc[eccinfo_table_idx].mca_ceumc_addr;
129 			err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
130 			/* translate umc channel address to soc pa, 3 parts are included */
131 			soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
132 					ADDR_OF_256B_BLOCK(channel_index) |
133 					OFFSET_IN_256B_BLOCK(err_addr);
134 
135 			/* The umc channel bits are not original values, they are hashed */
136 			SET_CHANNEL_HASH(channel_index, soc_pa);
137 
138 			dev_info(adev->dev, "Error Address(PA): 0x%llx\n", soc_pa);
139 		}
140 	}
141 }
142 
143 static void umc_v6_7_ecc_info_querry_uncorrectable_error_count(struct amdgpu_device *adev,
144 							  uint32_t umc_inst, uint32_t ch_inst,
145 						      unsigned long *error_count)
146 {
147 	uint64_t mc_umc_status;
148 	uint32_t eccinfo_table_idx;
149 	uint32_t umc_reg_offset;
150 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
151 
152 	umc_reg_offset = get_umc_v6_7_reg_offset(adev,
153 						umc_inst, ch_inst);
154 
155 	eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;
156 	/* check the MCUMC_STATUS */
157 	mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
158 	if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
159 	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
160 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
161 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
162 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
163 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1)) {
164 		*error_count += 1;
165 
166 		umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset);
167 	}
168 }
169 
170 static void umc_v6_7_ecc_info_query_ras_error_count(struct amdgpu_device *adev,
171 					   void *ras_error_status)
172 {
173 	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
174 
175 	uint32_t umc_inst        = 0;
176 	uint32_t ch_inst         = 0;
177 
178 	/*TODO: driver needs to toggle DF Cstate to ensure
179 	 * safe access of UMC registers. Will add the protection */
180 	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
181 		umc_v6_7_ecc_info_query_correctable_error_count(adev,
182 						      umc_inst, ch_inst,
183 						      &(err_data->ce_count));
184 		umc_v6_7_ecc_info_querry_uncorrectable_error_count(adev,
185 						      umc_inst, ch_inst,
186 							  &(err_data->ue_count));
187 	}
188 }
189 
190 static void umc_v6_7_ecc_info_query_error_address(struct amdgpu_device *adev,
191 					 struct ras_err_data *err_data,
192 					 uint32_t ch_inst,
193 					 uint32_t umc_inst)
194 {
195 	uint64_t mc_umc_status, err_addr, soc_pa, retired_page, column;
196 	uint32_t channel_index;
197 	uint32_t eccinfo_table_idx;
198 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
199 
200 	eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;
201 	channel_index =
202 		adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];
203 
204 	mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
205 
206 	if (mc_umc_status == 0)
207 		return;
208 
209 	if (!err_data->err_addr)
210 		return;
211 
212 	/* calculate error address if ue/ce error is detected */
213 	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
214 	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
215 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) {
216 
217 		err_addr = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_addr;
218 		err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
219 
220 		/* translate umc channel address to soc pa, 3 parts are included */
221 		soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
222 				ADDR_OF_256B_BLOCK(channel_index) |
223 				OFFSET_IN_256B_BLOCK(err_addr);
224 
225 		/* The umc channel bits are not original values, they are hashed */
226 		SET_CHANNEL_HASH(channel_index, soc_pa);
227 
228 		/* clear [C4 C3 C2] in soc physical address */
229 		soc_pa &= ~(0x7ULL << UMC_V6_7_PA_C2_BIT);
230 
231 		/* we only save ue error information currently, ce is skipped */
232 		if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
233 				== 1) {
234 			/* loop for all possibilities of [C4 C3 C2] */
235 			for (column = 0; column < UMC_V6_7_NA_MAP_PA_NUM; column++) {
236 				retired_page = soc_pa | (column << UMC_V6_7_PA_C2_BIT);
237 				dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page);
238 				amdgpu_umc_fill_error_record(err_data, err_addr,
239 					retired_page, channel_index, umc_inst);
240 
241 				/* shift R14 bit */
242 				retired_page ^= (0x1ULL << UMC_V6_7_PA_R14_BIT);
243 				dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page);
244 				amdgpu_umc_fill_error_record(err_data, err_addr,
245 					retired_page, channel_index, umc_inst);
246 			}
247 		}
248 	}
249 }
250 
251 static void umc_v6_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev,
252 					     void *ras_error_status)
253 {
254 	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
255 
256 	uint32_t umc_inst        = 0;
257 	uint32_t ch_inst         = 0;
258 
259 	/*TODO: driver needs to toggle DF Cstate to ensure
260 	 * safe access of UMC resgisters. Will add the protection
261 	 * when firmware interface is ready */
262 	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
263 		umc_v6_7_ecc_info_query_error_address(adev,
264 					     err_data,
265 					     ch_inst,
266 					     umc_inst);
267 	}
268 }
269 
270 static void umc_v6_7_query_correctable_error_count(struct amdgpu_device *adev,
271 						   uint32_t umc_reg_offset,
272 						   unsigned long *error_count,
273 						   uint32_t ch_inst,
274 						   uint32_t umc_inst)
275 {
276 	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
277 	uint32_t ecc_err_cnt, ecc_err_cnt_addr;
278 	uint64_t mc_umc_status;
279 	uint32_t mc_umc_status_addr;
280 
281 	/* UMC 6_1_1 registers */
282 	ecc_err_cnt_sel_addr =
283 		SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccErrCntSel);
284 	ecc_err_cnt_addr =
285 		SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccErrCnt);
286 	mc_umc_status_addr =
287 		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
288 
289 	/* select the lower chip and check the error count */
290 	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
291 	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
292 					EccErrCntCsSel, 0);
293 	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
294 
295 	ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
296 	*error_count +=
297 		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
298 		 UMC_V6_7_CE_CNT_INIT);
299 
300 	/* select the higher chip and check the err counter */
301 	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
302 					EccErrCntCsSel, 1);
303 	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
304 
305 	ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
306 	*error_count +=
307 		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
308 		 UMC_V6_7_CE_CNT_INIT);
309 
310 	/* check for SRAM correctable error
311 	  MCUMC_STATUS is a 64 bit register */
312 	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
313 	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
314 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1) {
315 		*error_count += 1;
316 
317 		umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset);
318 
319 		{
320 			uint64_t err_addr, soc_pa;
321 			uint32_t mc_umc_addrt0;
322 			uint32_t channel_index;
323 
324 			mc_umc_addrt0 =
325 				SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0);
326 
327 			channel_index =
328 				adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];
329 
330 			err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4);
331 			err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
332 
333 			/* translate umc channel address to soc pa, 3 parts are included */
334 			soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
335 					ADDR_OF_256B_BLOCK(channel_index) |
336 					OFFSET_IN_256B_BLOCK(err_addr);
337 
338 			/* The umc channel bits are not original values, they are hashed */
339 			SET_CHANNEL_HASH(channel_index, soc_pa);
340 
341 			dev_info(adev->dev, "Error Address(PA): 0x%llx\n", soc_pa);
342 		}
343 	}
344 }
345 
346 static void umc_v6_7_querry_uncorrectable_error_count(struct amdgpu_device *adev,
347 						      uint32_t umc_reg_offset,
348 						      unsigned long *error_count)
349 {
350 	uint64_t mc_umc_status;
351 	uint32_t mc_umc_status_addr;
352 
353 	mc_umc_status_addr =
354 		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
355 
356 	/* check the MCUMC_STATUS */
357 	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
358 	if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
359 	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
360 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
361 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
362 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
363 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1)) {
364 		*error_count += 1;
365 
366 		umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset);
367 	}
368 }
369 
370 static void umc_v6_7_reset_error_count_per_channel(struct amdgpu_device *adev,
371 						   uint32_t umc_reg_offset)
372 {
373 	uint32_t ecc_err_cnt_addr;
374 	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
375 
376 	ecc_err_cnt_sel_addr =
377 		SOC15_REG_OFFSET(UMC, 0,
378 				regUMCCH0_0_EccErrCntSel);
379 	ecc_err_cnt_addr =
380 		SOC15_REG_OFFSET(UMC, 0,
381 				regUMCCH0_0_EccErrCnt);
382 
383 	/* select the lower chip */
384 	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
385 				       umc_reg_offset) * 4);
386 	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
387 					UMCCH0_0_EccErrCntSel,
388 					EccErrCntCsSel, 0);
389 	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
390 			ecc_err_cnt_sel);
391 
392 	/* clear lower chip error count */
393 	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
394 			UMC_V6_7_CE_CNT_INIT);
395 
396 	/* select the higher chip */
397 	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
398 					umc_reg_offset) * 4);
399 	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
400 					UMCCH0_0_EccErrCntSel,
401 					EccErrCntCsSel, 1);
402 	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
403 			ecc_err_cnt_sel);
404 
405 	/* clear higher chip error count */
406 	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
407 			UMC_V6_7_CE_CNT_INIT);
408 }
409 
410 static void umc_v6_7_reset_error_count(struct amdgpu_device *adev)
411 {
412 	uint32_t umc_inst        = 0;
413 	uint32_t ch_inst         = 0;
414 	uint32_t umc_reg_offset  = 0;
415 
416 	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
417 		umc_reg_offset = get_umc_v6_7_reg_offset(adev,
418 							 umc_inst,
419 							 ch_inst);
420 
421 		umc_v6_7_reset_error_count_per_channel(adev,
422 						       umc_reg_offset);
423 	}
424 }
425 
426 static void umc_v6_7_query_ras_error_count(struct amdgpu_device *adev,
427 					   void *ras_error_status)
428 {
429 	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
430 
431 	uint32_t umc_inst        = 0;
432 	uint32_t ch_inst         = 0;
433 	uint32_t umc_reg_offset  = 0;
434 
435 	/*TODO: driver needs to toggle DF Cstate to ensure
436 	 * safe access of UMC registers. Will add the protection */
437 	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
438 		umc_reg_offset = get_umc_v6_7_reg_offset(adev,
439 							 umc_inst,
440 							 ch_inst);
441 		umc_v6_7_query_correctable_error_count(adev,
442 						       umc_reg_offset,
443 						       &(err_data->ce_count),
444 						       ch_inst, umc_inst);
445 		umc_v6_7_querry_uncorrectable_error_count(adev,
446 							  umc_reg_offset,
447 							  &(err_data->ue_count));
448 	}
449 
450 	umc_v6_7_reset_error_count(adev);
451 }
452 
453 static void umc_v6_7_query_error_address(struct amdgpu_device *adev,
454 					 struct ras_err_data *err_data,
455 					 uint32_t umc_reg_offset, uint32_t ch_inst,
456 					 uint32_t umc_inst, uint64_t mca_addr)
457 {
458 	uint32_t mc_umc_status_addr;
459 	uint32_t channel_index;
460 	uint64_t mc_umc_status = 0, mc_umc_addrt0;
461 	uint64_t err_addr, soc_pa, retired_page, column;
462 
463 	if (mca_addr == UMC_INVALID_ADDR) {
464 		mc_umc_status_addr =
465 			SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
466 		mc_umc_addrt0 =
467 			SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0);
468 
469 		mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
470 
471 		if (mc_umc_status == 0)
472 			return;
473 
474 		if (!err_data->err_addr) {
475 			/* clear umc status */
476 			WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
477 			return;
478 		}
479 	}
480 
481 	channel_index =
482 		adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];
483 
484 	/* calculate error address if ue/ce error is detected */
485 	if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
486 	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
487 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) ||
488 	    mca_addr != UMC_INVALID_ADDR) {
489 		if (mca_addr == UMC_INVALID_ADDR) {
490 			err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4);
491 			err_addr =
492 				REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
493 		} else {
494 			err_addr = mca_addr;
495 		}
496 
497 		/* translate umc channel address to soc pa, 3 parts are included */
498 		soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
499 				ADDR_OF_256B_BLOCK(channel_index) |
500 				OFFSET_IN_256B_BLOCK(err_addr);
501 
502 		/* The umc channel bits are not original values, they are hashed */
503 		SET_CHANNEL_HASH(channel_index, soc_pa);
504 
505 		/* clear [C4 C3 C2] in soc physical address */
506 		soc_pa &= ~(0x7ULL << UMC_V6_7_PA_C2_BIT);
507 
508 		/* we only save ue error information currently, ce is skipped */
509 		if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
510 				== 1 ||
511 		    mca_addr != UMC_INVALID_ADDR) {
512 			/* loop for all possibilities of [C4 C3 C2] */
513 			for (column = 0; column < UMC_V6_7_NA_MAP_PA_NUM; column++) {
514 				retired_page = soc_pa | (column << UMC_V6_7_PA_C2_BIT);
515 				dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page);
516 				amdgpu_umc_fill_error_record(err_data, err_addr,
517 					retired_page, channel_index, umc_inst);
518 
519 				/* shift R14 bit */
520 				retired_page ^= (0x1ULL << UMC_V6_7_PA_R14_BIT);
521 				dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page);
522 				amdgpu_umc_fill_error_record(err_data, err_addr,
523 					retired_page, channel_index, umc_inst);
524 			}
525 		}
526 	}
527 
528 	/* clear umc status */
529 	if (mca_addr == UMC_INVALID_ADDR)
530 		WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
531 }
532 
533 static void umc_v6_7_query_ras_error_address(struct amdgpu_device *adev,
534 					     void *ras_error_status)
535 {
536 	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
537 
538 	uint32_t umc_inst        = 0;
539 	uint32_t ch_inst         = 0;
540 	uint32_t umc_reg_offset  = 0;
541 
542 	/*TODO: driver needs to toggle DF Cstate to ensure
543 	 * safe access of UMC resgisters. Will add the protection
544 	 * when firmware interface is ready */
545 	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
546 		umc_reg_offset = get_umc_v6_7_reg_offset(adev,
547 							 umc_inst,
548 							 ch_inst);
549 		umc_v6_7_query_error_address(adev,
550 					     err_data,
551 					     umc_reg_offset, ch_inst,
552 					     umc_inst, UMC_INVALID_ADDR);
553 	}
554 }
555 
556 static uint32_t umc_v6_7_query_ras_poison_mode_per_channel(
557 						struct amdgpu_device *adev,
558 						uint32_t umc_reg_offset)
559 {
560 	uint32_t ecc_ctrl_addr, ecc_ctrl;
561 
562 	ecc_ctrl_addr =
563 		SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccCtrl);
564 	ecc_ctrl = RREG32_PCIE((ecc_ctrl_addr +
565 					umc_reg_offset) * 4);
566 
567 	return REG_GET_FIELD(ecc_ctrl, UMCCH0_0_EccCtrl, UCFatalEn);
568 }
569 
/*
 * Report whether poison mode is in effect.
 *
 * Only umc instance0 channel0 is inspected: if fatal error is enabled
 * there (UCFatalEn != 0) the device is considered to be in fatal error
 * mode and false is returned; otherwise true is returned.
 */
static bool umc_v6_7_query_ras_poison_mode(struct amdgpu_device *adev)
{
	const uint32_t offs = get_umc_v6_7_reg_offset(adev, 0, 0);

	return umc_v6_7_query_ras_poison_mode_per_channel(adev, offs) == 0;
}
580 
581 const struct amdgpu_ras_block_hw_ops umc_v6_7_ras_hw_ops = {
582 	.query_ras_error_count = umc_v6_7_query_ras_error_count,
583 	.query_ras_error_address = umc_v6_7_query_ras_error_address,
584 };
585 
586 struct amdgpu_umc_ras umc_v6_7_ras = {
587 	.ras_block = {
588 		.hw_ops = &umc_v6_7_ras_hw_ops,
589 	},
590 	.query_ras_poison_mode = umc_v6_7_query_ras_poison_mode,
591 	.ecc_info_query_ras_error_count = umc_v6_7_ecc_info_query_ras_error_count,
592 	.ecc_info_query_ras_error_address = umc_v6_7_ecc_info_query_ras_error_address,
593 	.convert_ras_error_address = umc_v6_7_query_error_address,
594 };
595