/*
 * Copyright 2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include "umc_v6_7.h"
#include "amdgpu_ras.h"
#include "amdgpu_umc.h"
#include "amdgpu.h"

#include "umc/umc_6_7_0_offset.h"
#include "umc/umc_6_7_0_sh_mask.h"

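/*
 * Lookup tables mapping a (umc instance, channel instance) pair to the
 * physical memory channel index used for error address translation.
 * Which of the two layouts ends up in adev->umc.channel_idx_tbl is decided
 * during UMC setup outside this file, depending on the board configuration.
 */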
const uint32_t
	umc_v6_7_channel_idx_tbl_second[UMC_V6_7_UMC_INSTANCE_NUM][UMC_V6_7_CHANNEL_INSTANCE_NUM] = {
		{28, 20, 24, 16, 12, 4, 8, 0},
		{6, 30, 2, 26, 22, 14, 18, 10},
		{19, 11, 15, 7, 3, 27, 31, 23},
		{9, 1, 5, 29, 25, 17, 21, 13}
};
const uint32_t
	umc_v6_7_channel_idx_tbl_first[UMC_V6_7_UMC_INSTANCE_NUM][UMC_V6_7_CHANNEL_INSTANCE_NUM] = {
		{19, 11, 15, 7, 3, 27, 31, 23},
		{9, 1, 5, 29, 25, 17, 21, 13},
		{28, 20, 24, 16, 12, 4, 8, 0},
		{6, 30, 2, 26, 22, 14, 18, 10},
};

static inline uint32_t get_umc_v6_7_reg_offset(struct amdgpu_device *adev,
					      uint32_t umc_inst,
					      uint32_t ch_inst)
{
	uint32_t index = umc_inst * adev->umc.channel_inst_num + ch_inst;

	/* adjust umc and channel index offset,
	 * the register address is not linear on each umc instance */
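	/*
	 * Illustrative example, assuming channel_inst_num == 8: umc_inst 1,
	 * ch_inst 2 yields index 10, which is remapped to umc_inst 2 (10 / 4)
	 * and ch_inst 2 (10 % 4) before the offset is computed.
	 */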
	umc_inst = index / 4;
	ch_inst = index % 4;

	return adev->umc.channel_offs * ch_inst + UMC_V6_7_INST_DIST * umc_inst;
}

static inline uint32_t get_umc_v6_7_channel_index(struct amdgpu_device *adev,
					      uint32_t umc_inst,
					      uint32_t ch_inst)
{
	return adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];
}

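/*
 * Dump the raw MCA STATUS/IPID/SYND/MISC0 register values for one UMC
 * channel so that the error can be decoded from the kernel log.
 */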
static void umc_v6_7_query_error_status_helper(struct amdgpu_device *adev,
						  uint64_t mc_umc_status, uint32_t umc_reg_offset)
{
	uint32_t mc_umc_addr;
	uint64_t reg_value;

	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1)
		dev_info(adev->dev, "Deferred error, no user action is needed.\n");

	if (mc_umc_status)
		dev_info(adev->dev, "MCA STATUS 0x%llx, umc_reg_offset 0x%x\n", mc_umc_status, umc_reg_offset);

	/* print IPID register value */
	mc_umc_addr =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_IPIDT0);
	reg_value = RREG64_PCIE((mc_umc_addr + umc_reg_offset) * 4);
	if (reg_value)
		dev_info(adev->dev, "MCA IPID 0x%llx, umc_reg_offset 0x%x\n", reg_value, umc_reg_offset);

	/* print SYND register value */
	mc_umc_addr =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_SYNDT0);
	reg_value = RREG64_PCIE((mc_umc_addr + umc_reg_offset) * 4);
	if (reg_value)
		dev_info(adev->dev, "MCA SYND 0x%llx, umc_reg_offset 0x%x\n", reg_value, umc_reg_offset);

	/* print MISC0 register value */
	mc_umc_addr =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_MISC0T0);
	reg_value = RREG64_PCIE((mc_umc_addr + umc_reg_offset) * 4);
	if (reg_value)
		dev_info(adev->dev, "MCA MISC0 0x%llx, umc_reg_offset 0x%x\n", reg_value, umc_reg_offset);
}

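/*
 * The ecc_info_* variants below do not read UMC registers directly; they
 * use the MCA status/address values cached in the RAS context
 * (ras->umc_ecc), which is expected to be populated before these queries run.
 */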
static void umc_v6_7_ecc_info_query_correctable_error_count(struct amdgpu_device *adev,
						   uint32_t umc_inst, uint32_t ch_inst,
						   unsigned long *error_count)
{
	uint64_t mc_umc_status;
	uint32_t eccinfo_table_idx;
	uint32_t umc_reg_offset;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	umc_reg_offset = get_umc_v6_7_reg_offset(adev,
						umc_inst, ch_inst);

	eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;
	/* check for SRAM correctable error,
	 * MCUMC_STATUS is a 64 bit register */
	mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1) {
		*error_count += 1;

		umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset);
	}
}

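/*
 * An error is counted as uncorrectable when the status is valid and any of
 * the Deferred, UECC, PCC, UC or TCC bits is set.
 */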
static void umc_v6_7_ecc_info_querry_uncorrectable_error_count(struct amdgpu_device *adev,
							  uint32_t umc_inst, uint32_t ch_inst,
							  unsigned long *error_count)
{
	uint64_t mc_umc_status;
	uint32_t eccinfo_table_idx;
	uint32_t umc_reg_offset;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	umc_reg_offset = get_umc_v6_7_reg_offset(adev,
						umc_inst, ch_inst);

	eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;
	/* check the MCUMC_STATUS */
	mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
	if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1)) {
		*error_count += 1;

		umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset);
	}
}

static void umc_v6_7_ecc_info_query_ras_error_count(struct amdgpu_device *adev,
					   void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

	uint32_t umc_inst        = 0;
	uint32_t ch_inst         = 0;

	/* TODO: driver needs to toggle DF Cstate to ensure
	 * safe access of UMC registers. Will add the protection */
	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_v6_7_ecc_info_query_correctable_error_count(adev,
						      umc_inst, ch_inst,
						      &(err_data->ce_count));
		umc_v6_7_ecc_info_querry_uncorrectable_error_count(adev,
						      umc_inst, ch_inst,
						      &(err_data->ue_count));
	}
}

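/*
 * Translate a reported UMC channel address into retired SoC physical pages:
 * combine the 8KB block address, the 256B channel block and the in-block
 * offset, apply the channel hash, then iterate over the [C4 C3 C2] column
 * bits and the R14 bit so that each physical page that may map to the
 * faulty location is recorded.
 */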
static void umc_v6_7_ecc_info_query_error_address(struct amdgpu_device *adev,
					 struct ras_err_data *err_data,
					 uint32_t ch_inst,
					 uint32_t umc_inst)
{
	uint64_t mc_umc_status, err_addr, soc_pa, retired_page, column;
	uint32_t channel_index;
	uint32_t eccinfo_table_idx;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;
	channel_index =
		adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];

	mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;

	if (mc_umc_status == 0)
		return;

	if (!err_data->err_addr)
		return;

	/* calculate error address if ue/ce error is detected */
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) {

		err_addr = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_addr;
		err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);

		/* translate umc channel address to soc pa, 3 parts are included */
		soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
				ADDR_OF_256B_BLOCK(channel_index) |
				OFFSET_IN_256B_BLOCK(err_addr);

		/* The umc channel bits are not original values, they are hashed */
		SET_CHANNEL_HASH(channel_index, soc_pa);

		/* clear [C4 C3 C2] in soc physical address */
		soc_pa &= ~(0x7ULL << UMC_V6_7_PA_C2_BIT);

		/* we only save ue error information currently, ce is skipped */
		if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
				== 1) {
			/* loop for all possibilities of [C4 C3 C2] */
			for (column = 0; column < UMC_V6_7_NA_MAP_PA_NUM; column++) {
				retired_page = soc_pa | (column << UMC_V6_7_PA_C2_BIT);
				dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page);
				amdgpu_umc_fill_error_record(err_data, err_addr,
					retired_page, channel_index, umc_inst);

				/* flip R14 bit */
				retired_page ^= (0x1ULL << UMC_V6_7_PA_R14_BIT);
				dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page);
				amdgpu_umc_fill_error_record(err_data, err_addr,
					retired_page, channel_index, umc_inst);
			}
		}
	}
}

static void umc_v6_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev,
					     void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

	uint32_t umc_inst        = 0;
	uint32_t ch_inst         = 0;

	/* TODO: driver needs to toggle DF Cstate to ensure
	 * safe access of UMC registers. Will add the protection
	 * when firmware interface is ready */
	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_v6_7_ecc_info_query_error_address(adev,
					     err_data,
					     ch_inst,
					     umc_inst);
	}
}

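/*
 * Correctable errors are gathered from two sources per channel: the
 * EccErrCnt counters for the lower and higher chip selects (each biased by
 * UMC_V6_7_CE_CNT_INIT) and a CECC indication in the MCA status register.
 */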
static void umc_v6_7_query_correctable_error_count(struct amdgpu_device *adev,
						   uint32_t umc_reg_offset,
						   unsigned long *error_count)
{
	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
	uint32_t ecc_err_cnt, ecc_err_cnt_addr;
	uint64_t mc_umc_status;
	uint32_t mc_umc_status_addr;

	/* UMC 6_7 registers */
	ecc_err_cnt_sel_addr =
		SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccErrCntSel);
	ecc_err_cnt_addr =
		SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccErrCnt);
	mc_umc_status_addr =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);

	/* select the lower chip and check the error count */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 0);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);

	ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
	*error_count +=
		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
		 UMC_V6_7_CE_CNT_INIT);

	/* select the higher chip and check the error counter */
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);

	ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
	*error_count +=
		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
		 UMC_V6_7_CE_CNT_INIT);

	/* check for SRAM correctable error,
	 * MCUMC_STATUS is a 64 bit register */
	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1) {
		*error_count += 1;

		umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset);
	}
}

static void umc_v6_7_querry_uncorrectable_error_count(struct amdgpu_device *adev,
						      uint32_t umc_reg_offset,
						      unsigned long *error_count)
{
	uint64_t mc_umc_status;
	uint32_t mc_umc_status_addr;

	mc_umc_status_addr =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);

	/* check the MCUMC_STATUS */
	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
	if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1)) {
		*error_count += 1;

		umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset);
	}
}

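/*
 * Re-arm both chip select error counters by writing UMC_V6_7_CE_CNT_INIT,
 * the same bias that umc_v6_7_query_correctable_error_count() subtracts on
 * the next read.
 */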
static void umc_v6_7_reset_error_count_per_channel(struct amdgpu_device *adev,
						   uint32_t umc_reg_offset)
{
	uint32_t ecc_err_cnt_addr;
	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;

	ecc_err_cnt_sel_addr =
		SOC15_REG_OFFSET(UMC, 0,
				regUMCCH0_0_EccErrCntSel);
	ecc_err_cnt_addr =
		SOC15_REG_OFFSET(UMC, 0,
				regUMCCH0_0_EccErrCnt);

	/* select the lower chip */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
				       umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
					UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 0);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
			ecc_err_cnt_sel);

	/* clear lower chip error count */
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
			UMC_V6_7_CE_CNT_INIT);

	/* select the higher chip */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
					umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
					UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
			ecc_err_cnt_sel);

	/* clear higher chip error count */
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
			UMC_V6_7_CE_CNT_INIT);
}

static void umc_v6_7_reset_error_count(struct amdgpu_device *adev)
{
	uint32_t umc_inst        = 0;
	uint32_t ch_inst         = 0;
	uint32_t umc_reg_offset  = 0;

	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_v6_7_reg_offset(adev,
							 umc_inst,
							 ch_inst);

		umc_v6_7_reset_error_count_per_channel(adev,
						       umc_reg_offset);
	}
}

static void umc_v6_7_query_ras_error_count(struct amdgpu_device *adev,
					   void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

	uint32_t umc_inst        = 0;
	uint32_t ch_inst         = 0;
	uint32_t umc_reg_offset  = 0;

	/* TODO: driver needs to toggle DF Cstate to ensure
	 * safe access of UMC registers. Will add the protection */
	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_v6_7_reg_offset(adev,
							 umc_inst,
							 ch_inst);
		umc_v6_7_query_correctable_error_count(adev,
						       umc_reg_offset,
						       &(err_data->ce_count));
		umc_v6_7_querry_uncorrectable_error_count(adev,
							  umc_reg_offset,
							  &(err_data->ue_count));
	}

	umc_v6_7_reset_error_count(adev);
}

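/*
 * Register based variant of the address query: read MCA STATUS/ADDR through
 * the PCIe register path, perform the same channel address to SoC physical
 * address translation as the ecc_info path above, then clear the status
 * register.
 */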
static void umc_v6_7_query_error_address(struct amdgpu_device *adev,
					 struct ras_err_data *err_data,
					 uint32_t umc_reg_offset,
					 uint32_t ch_inst,
					 uint32_t umc_inst)
{
	uint32_t mc_umc_status_addr;
	uint32_t channel_index;
	uint64_t mc_umc_status, mc_umc_addrt0;
	uint64_t err_addr, soc_pa, retired_page, column;

	mc_umc_status_addr =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
	mc_umc_addrt0 =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0);

	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);

	if (mc_umc_status == 0)
		return;

	if (!err_data->err_addr) {
		/* clear umc status */
		WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
		return;
	}

	channel_index =
		adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];

	/* calculate error address if ue/ce error is detected */
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) {

		err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4);
		err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);

		/* translate umc channel address to soc pa, 3 parts are included */
		soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
				ADDR_OF_256B_BLOCK(channel_index) |
				OFFSET_IN_256B_BLOCK(err_addr);

		/* The umc channel bits are not original values, they are hashed */
		SET_CHANNEL_HASH(channel_index, soc_pa);

		/* clear [C4 C3 C2] in soc physical address */
		soc_pa &= ~(0x7ULL << UMC_V6_7_PA_C2_BIT);

		/* we only save ue error information currently, ce is skipped */
		if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
				== 1) {
			/* loop for all possibilities of [C4 C3 C2] */
			for (column = 0; column < UMC_V6_7_NA_MAP_PA_NUM; column++) {
				retired_page = soc_pa | (column << UMC_V6_7_PA_C2_BIT);
				dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page);
				amdgpu_umc_fill_error_record(err_data, err_addr,
					retired_page, channel_index, umc_inst);

				/* flip R14 bit */
				retired_page ^= (0x1ULL << UMC_V6_7_PA_R14_BIT);
				dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page);
				amdgpu_umc_fill_error_record(err_data, err_addr,
					retired_page, channel_index, umc_inst);
			}
		}
	}

	/* clear umc status */
	WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
}

static void umc_v6_7_query_ras_error_address(struct amdgpu_device *adev,
					     void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

	uint32_t umc_inst        = 0;
	uint32_t ch_inst         = 0;
	uint32_t umc_reg_offset  = 0;

	/* TODO: driver needs to toggle DF Cstate to ensure
	 * safe access of UMC registers. Will add the protection
	 * when firmware interface is ready */
	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_v6_7_reg_offset(adev,
							 umc_inst,
							 ch_inst);
		umc_v6_7_query_error_address(adev,
					     err_data,
					     umc_reg_offset,
					     ch_inst,
					     umc_inst);
	}
}

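/*
 * Read back UCFatalEn for one channel; when it is clear the UMC is treated
 * as being in poison (deferred error) mode rather than fatal error mode.
 */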
static uint32_t umc_v6_7_query_ras_poison_mode_per_channel(
						struct amdgpu_device *adev,
						uint32_t umc_reg_offset)
{
	uint32_t ecc_ctrl_addr, ecc_ctrl;

	ecc_ctrl_addr =
		SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccCtrl);
	ecc_ctrl = RREG32_PCIE((ecc_ctrl_addr +
					umc_reg_offset) * 4);

	return REG_GET_FIELD(ecc_ctrl, UMCCH0_0_EccCtrl, UCFatalEn);
}

static bool umc_v6_7_query_ras_poison_mode(struct amdgpu_device *adev)
{
	uint32_t umc_reg_offset  = 0;

	/* Fatal error reporting enabled on umc instance 0, channel 0 is
	 * taken to mean the whole device is in fatal error mode rather
	 * than poison mode.
	 */
	umc_reg_offset = get_umc_v6_7_reg_offset(adev, 0, 0);
	return !umc_v6_7_query_ras_poison_mode_per_channel(adev, umc_reg_offset);
}

const struct amdgpu_ras_block_hw_ops umc_v6_7_ras_hw_ops = {
	.query_ras_error_count = umc_v6_7_query_ras_error_count,
	.query_ras_error_address = umc_v6_7_query_ras_error_address,
};

struct amdgpu_umc_ras umc_v6_7_ras = {
	.ras_block = {
		.hw_ops = &umc_v6_7_ras_hw_ops,
	},
	.query_ras_poison_mode = umc_v6_7_query_ras_poison_mode,
	.ecc_info_query_ras_error_count = umc_v6_7_ecc_info_query_ras_error_count,
	.ecc_info_query_ras_error_address = umc_v6_7_ecc_info_query_ras_error_address,
};