xref: /openbmc/linux/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c (revision cbabf03c)
1 /*
2  * Copyright 2021 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  *
22  */
23 #include "umc_v6_7.h"
24 #include "amdgpu_ras.h"
25 #include "amdgpu_umc.h"
26 #include "amdgpu.h"
27 
28 #include "umc/umc_6_7_0_offset.h"
29 #include "umc/umc_6_7_0_sh_mask.h"
30 
/* Logical (umc instance, channel instance) -> hardware channel index map.
 * NOTE(review): "second" presumably selects an alternative board/die channel
 * wiring; which ASIC revisions use it is decided by the caller — confirm
 * against the code that installs adev->umc.channel_idx_tbl.
 */
const uint32_t
	umc_v6_7_channel_idx_tbl_second[UMC_V6_7_UMC_INSTANCE_NUM][UMC_V6_7_CHANNEL_INSTANCE_NUM] = {
		{28, 20, 24, 16, 12, 4, 8, 0},
		{6, 30, 2, 26, 22, 14, 18, 10},
		{19, 11, 15, 7, 3, 27, 31, 23},
		{9, 1, 5, 29, 25, 17, 21, 13}
};
/* Logical (umc instance, channel instance) -> hardware channel index map.
 * Same entries as the "second" table but with the instance rows rotated;
 * the caller picks which table to install in adev->umc.channel_idx_tbl.
 */
const uint32_t
	umc_v6_7_channel_idx_tbl_first[UMC_V6_7_UMC_INSTANCE_NUM][UMC_V6_7_CHANNEL_INSTANCE_NUM] = {
		{19, 11, 15, 7,	3, 27, 31, 23},
		{9, 1, 5, 29, 25, 17, 21, 13},
		{28, 20, 24, 16, 12, 4, 8, 0},
		{6, 30, 2, 26, 22, 14, 18, 10},
};
45 
46 static inline uint32_t get_umc_v6_7_reg_offset(struct amdgpu_device *adev,
47 					      uint32_t umc_inst,
48 					      uint32_t ch_inst)
49 {
50 	uint32_t index = umc_inst * adev->umc.channel_inst_num + ch_inst;
51 
52 	/* adjust umc and channel index offset,
53 	 * the register address is not linear on each umc instace */
54 	umc_inst = index / 4;
55 	ch_inst = index % 4;
56 
57 	return adev->umc.channel_offs * ch_inst + UMC_V6_7_INST_DIST * umc_inst;
58 }
59 
60 static inline uint32_t get_umc_v6_7_channel_index(struct amdgpu_device *adev,
61 					      uint32_t umc_inst,
62 					      uint32_t ch_inst)
63 {
64 	return adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];
65 }
66 
67 static void umc_v6_7_ecc_info_query_correctable_error_count(struct amdgpu_device *adev,
68 						   uint32_t umc_inst, uint32_t ch_inst,
69 						   unsigned long *error_count)
70 {
71 	uint64_t mc_umc_status;
72 	uint32_t eccinfo_table_idx;
73 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
74 
75 	eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;
76 	/* check for SRAM correctable error
77 	  MCUMC_STATUS is a 64 bit register */
78 	mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
79 	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
80 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
81 		*error_count += 1;
82 }
83 
/* Count one uncorrectable error for this channel from the firmware-reported
 * ECC info table, and dump the related MCA registers (read via PCIe MMIO)
 * when an error is found.  Note: the function name's "querry" spelling is
 * kept for compatibility with its caller.
 */
static void umc_v6_7_ecc_info_querry_uncorrectable_error_count(struct amdgpu_device *adev,
							  uint32_t umc_inst, uint32_t ch_inst,
						      unsigned long *error_count)
{
	uint64_t mc_umc_status;
	uint32_t eccinfo_table_idx;
	uint32_t umc_reg_offset;
	uint32_t mc_umc_addr;
	uint64_t reg_value;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	/* only needed for the diagnostic register dumps below */
	umc_reg_offset = get_umc_v6_7_reg_offset(adev,
						umc_inst, ch_inst);

	eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;
	/* check the MCUMC_STATUS */
	mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
	/* a valid status with any of Deferred/UECC/PCC/UC/TCC set counts as
	 * one uncorrectable error */
	if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1)) {
		*error_count += 1;

		if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1)
			dev_info(adev->dev, "Deferred error, no user action is needed.\n");

		if (mc_umc_status)
			dev_info(adev->dev, "MCA STATUS 0x%llx, umc_reg_offset 0x%x\n", mc_umc_status, umc_reg_offset);

		/* print IPID registers value */
		mc_umc_addr =
			SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_IPIDT0);
		reg_value = RREG64_PCIE((mc_umc_addr + umc_reg_offset) * 4);
		if (reg_value)
			dev_info(adev->dev, "MCA IPID 0x%llx, umc_reg_offset 0x%x\n", reg_value, umc_reg_offset);

		/* print SYND registers value */
		mc_umc_addr =
			SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_SYNDT0);
		reg_value = RREG64_PCIE((mc_umc_addr + umc_reg_offset) * 4);
		if (reg_value)
			dev_info(adev->dev, "MCA SYND 0x%llx, umc_reg_offset 0x%x\n", reg_value, umc_reg_offset);

		/* print MISC0 registers value */
		mc_umc_addr =
			SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_MISC0T0);
		reg_value = RREG64_PCIE((mc_umc_addr + umc_reg_offset) * 4);
		if (reg_value)
			dev_info(adev->dev, "MCA MISC0 0x%llx, umc_reg_offset 0x%x\n", reg_value, umc_reg_offset);
	}
}
137 
138 static void umc_v6_7_ecc_info_query_ras_error_count(struct amdgpu_device *adev,
139 					   void *ras_error_status)
140 {
141 	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
142 
143 	uint32_t umc_inst        = 0;
144 	uint32_t ch_inst         = 0;
145 
146 	/*TODO: driver needs to toggle DF Cstate to ensure
147 	 * safe access of UMC registers. Will add the protection */
148 	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
149 		umc_v6_7_ecc_info_query_correctable_error_count(adev,
150 						      umc_inst, ch_inst,
151 						      &(err_data->ce_count));
152 		umc_v6_7_ecc_info_querry_uncorrectable_error_count(adev,
153 						      umc_inst, ch_inst,
154 							  &(err_data->ue_count));
155 	}
156 }
157 
158 static void umc_v6_7_ecc_info_query_error_address(struct amdgpu_device *adev,
159 					 struct ras_err_data *err_data,
160 					 uint32_t ch_inst,
161 					 uint32_t umc_inst)
162 {
163 	uint64_t mc_umc_status, err_addr, soc_pa, retired_page, column;
164 	uint32_t channel_index;
165 	uint32_t eccinfo_table_idx;
166 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
167 
168 	eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;
169 	channel_index =
170 		adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];
171 
172 	mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
173 
174 	if (mc_umc_status == 0)
175 		return;
176 
177 	if (!err_data->err_addr)
178 		return;
179 
180 	/* calculate error address if ue/ce error is detected */
181 	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
182 	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
183 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) {
184 
185 		err_addr = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_addr;
186 		err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
187 
188 		/* translate umc channel address to soc pa, 3 parts are included */
189 		soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
190 				ADDR_OF_256B_BLOCK(channel_index) |
191 				OFFSET_IN_256B_BLOCK(err_addr);
192 
193 		/* The umc channel bits are not original values, they are hashed */
194 		SET_CHANNEL_HASH(channel_index, soc_pa);
195 
196 		/* clear [C4 C3 C2] in soc physical address */
197 		soc_pa &= ~(0x7ULL << UMC_V6_7_PA_C2_BIT);
198 
199 		/* we only save ue error information currently, ce is skipped */
200 		if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
201 				== 1) {
202 			/* loop for all possibilities of [C4 C3 C2] */
203 			for (column = 0; column < UMC_V6_7_NA_MAP_PA_NUM; column++) {
204 				retired_page = soc_pa | (column << UMC_V6_7_PA_C2_BIT);
205 				dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page);
206 				amdgpu_umc_fill_error_record(err_data, err_addr,
207 					retired_page, channel_index, umc_inst);
208 
209 				/* shift R14 bit */
210 				retired_page ^= (0x1ULL << UMC_V6_7_PA_R14_BIT);
211 				dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page);
212 				amdgpu_umc_fill_error_record(err_data, err_addr,
213 					retired_page, channel_index, umc_inst);
214 			}
215 		}
216 	}
217 }
218 
219 static void umc_v6_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev,
220 					     void *ras_error_status)
221 {
222 	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
223 
224 	uint32_t umc_inst        = 0;
225 	uint32_t ch_inst         = 0;
226 
227 	/*TODO: driver needs to toggle DF Cstate to ensure
228 	 * safe access of UMC resgisters. Will add the protection
229 	 * when firmware interface is ready */
230 	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
231 		umc_v6_7_ecc_info_query_error_address(adev,
232 					     err_data,
233 					     ch_inst,
234 					     umc_inst);
235 	}
236 }
237 
/* Read the correctable error counters of one channel directly from
 * hardware and add them to *error_count.  Both chip selects are sampled,
 * plus one extra error if the MCA status latched a valid CECC.
 * Counters are relative to UMC_V6_7_CE_CNT_INIT, which the reset path
 * writes back as the baseline.
 */
static void umc_v6_7_query_correctable_error_count(struct amdgpu_device *adev,
						   uint32_t umc_reg_offset,
						   unsigned long *error_count)
{
	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
	uint32_t ecc_err_cnt, ecc_err_cnt_addr;
	uint64_t mc_umc_status;
	uint32_t mc_umc_status_addr;

	/* UMC 6_7 registers (comment previously said 6_1_1 — copy/paste) */
	ecc_err_cnt_sel_addr =
		SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccErrCntSel);
	ecc_err_cnt_addr =
		SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccErrCnt);
	mc_umc_status_addr =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);

	/* select the lower chip and check the error count */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 0);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);

	ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
	*error_count +=
		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
		 UMC_V6_7_CE_CNT_INIT);

	/* select the higher chip and check the err counter */
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);

	ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
	*error_count +=
		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
		 UMC_V6_7_CE_CNT_INIT);

	/* check for SRAM correctable error
	  MCUMC_STATUS is a 64 bit register */
	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
		*error_count += 1;
}
283 
284 static void umc_v6_7_querry_uncorrectable_error_count(struct amdgpu_device *adev,
285 						      uint32_t umc_reg_offset,
286 						      unsigned long *error_count)
287 {
288 	uint64_t mc_umc_status;
289 	uint32_t mc_umc_status_addr;
290 	uint32_t mc_umc_addr;
291 	uint64_t reg_value;
292 
293 	mc_umc_status_addr =
294 		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
295 
296 	/* check the MCUMC_STATUS */
297 	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
298 	if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
299 	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
300 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
301 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
302 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
303 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1)) {
304 		*error_count += 1;
305 
306 		if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1)
307 			dev_info(adev->dev, "Deferred error, no user action is needed.\n");
308 
309 		if (mc_umc_status)
310 			dev_info(adev->dev, "MCA STATUS 0x%llx, umc_reg_offset 0x%x\n", mc_umc_status, umc_reg_offset);
311 
312 		/* print IPID registers value */
313 		mc_umc_addr =
314 			SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_IPIDT0);
315 		reg_value = RREG64_PCIE((mc_umc_addr + umc_reg_offset) * 4);
316 		if (reg_value)
317 			dev_info(adev->dev, "MCA IPID 0x%llx, umc_reg_offset 0x%x\n", reg_value, umc_reg_offset);
318 
319 		/* print SYND registers value */
320 		mc_umc_addr =
321 			SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_SYNDT0);
322 		reg_value = RREG64_PCIE((mc_umc_addr + umc_reg_offset) * 4);
323 		if (reg_value)
324 			dev_info(adev->dev, "MCA SYND 0x%llx, umc_reg_offset 0x%x\n", reg_value, umc_reg_offset);
325 
326 		/* print MISC0 registers value */
327 		mc_umc_addr =
328 			SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_MISC0T0);
329 		reg_value = RREG64_PCIE((mc_umc_addr + umc_reg_offset) * 4);
330 		if (reg_value)
331 			dev_info(adev->dev, "MCA MISC0 0x%llx, umc_reg_offset 0x%x\n", reg_value, umc_reg_offset);
332 	}
333 }
334 
/* Rearm the ECC error counters of one channel: for each chip select
 * (lower then higher), point the counter mux at it and write the
 * UMC_V6_7_CE_CNT_INIT baseline back, so future reads can subtract it.
 */
static void umc_v6_7_reset_error_count_per_channel(struct amdgpu_device *adev,
						   uint32_t umc_reg_offset)
{
	uint32_t ecc_err_cnt_addr;
	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;

	ecc_err_cnt_sel_addr =
		SOC15_REG_OFFSET(UMC, 0,
				regUMCCH0_0_EccErrCntSel);
	ecc_err_cnt_addr =
		SOC15_REG_OFFSET(UMC, 0,
				regUMCCH0_0_EccErrCnt);

	/* select the lower chip */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
				       umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
					UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 0);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
			ecc_err_cnt_sel);

	/* clear lower chip error count */
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
			UMC_V6_7_CE_CNT_INIT);

	/* select the higher chip */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
					umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
					UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
			ecc_err_cnt_sel);

	/* clear higher chip error count */
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
			UMC_V6_7_CE_CNT_INIT);
}
374 
375 static void umc_v6_7_reset_error_count(struct amdgpu_device *adev)
376 {
377 	uint32_t umc_inst        = 0;
378 	uint32_t ch_inst         = 0;
379 	uint32_t umc_reg_offset  = 0;
380 
381 	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
382 		umc_reg_offset = get_umc_v6_7_reg_offset(adev,
383 							 umc_inst,
384 							 ch_inst);
385 
386 		umc_v6_7_reset_error_count_per_channel(adev,
387 						       umc_reg_offset);
388 	}
389 }
390 
391 static void umc_v6_7_query_ras_error_count(struct amdgpu_device *adev,
392 					   void *ras_error_status)
393 {
394 	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
395 
396 	uint32_t umc_inst        = 0;
397 	uint32_t ch_inst         = 0;
398 	uint32_t umc_reg_offset  = 0;
399 
400 	/*TODO: driver needs to toggle DF Cstate to ensure
401 	 * safe access of UMC registers. Will add the protection */
402 	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
403 		umc_reg_offset = get_umc_v6_7_reg_offset(adev,
404 							 umc_inst,
405 							 ch_inst);
406 		umc_v6_7_query_correctable_error_count(adev,
407 						       umc_reg_offset,
408 						       &(err_data->ce_count));
409 		umc_v6_7_querry_uncorrectable_error_count(adev,
410 							  umc_reg_offset,
411 							  &(err_data->ue_count));
412 	}
413 
414 	umc_v6_7_reset_error_count(adev);
415 }
416 
/* Translate the hardware-latched error address of one channel into soc
 * physical addresses and record the pages to retire (UE only; CE skipped).
 * The MCA status register is cleared on every exit path except the
 * "status already zero" early return, so the latch is rearmed even when
 * no address buffer was supplied.
 */
static void umc_v6_7_query_error_address(struct amdgpu_device *adev,
					 struct ras_err_data *err_data,
					 uint32_t umc_reg_offset,
					 uint32_t ch_inst,
					 uint32_t umc_inst)
{
	uint32_t mc_umc_status_addr;
	uint32_t channel_index;
	uint64_t mc_umc_status, mc_umc_addrt0;
	uint64_t err_addr, soc_pa, retired_page, column;

	mc_umc_status_addr =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
	mc_umc_addrt0 =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0);

	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);

	/* nothing latched: leave the register untouched */
	if (mc_umc_status == 0)
		return;

	if (!err_data->err_addr) {
		/* clear umc status */
		WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
		return;
	}

	channel_index =
		adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];

	/* calculate error address if ue/ce error is detected */
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) {

		err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4);
		err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);

		/* translate umc channel address to soc pa, 3 parts are included */
		soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
				ADDR_OF_256B_BLOCK(channel_index) |
				OFFSET_IN_256B_BLOCK(err_addr);

		/* The umc channel bits are not original values, they are hashed */
		SET_CHANNEL_HASH(channel_index, soc_pa);

		/* clear [C4 C3 C2] in soc physical address */
		soc_pa &= ~(0x7ULL << UMC_V6_7_PA_C2_BIT);

		/* we only save ue error information currently, ce is skipped */
		if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
				== 1) {
			/* loop for all possibilities of [C4 C3 C2] */
			for (column = 0; column < UMC_V6_7_NA_MAP_PA_NUM; column++) {
				retired_page = soc_pa | (column << UMC_V6_7_PA_C2_BIT);
				dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page);
				amdgpu_umc_fill_error_record(err_data, err_addr,
					retired_page, channel_index, umc_inst);

				/* shift R14 bit */
				retired_page ^= (0x1ULL << UMC_V6_7_PA_R14_BIT);
				dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page);
				amdgpu_umc_fill_error_record(err_data, err_addr,
					retired_page, channel_index, umc_inst);
			}
		}
	}

	/* clear umc status */
	WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
}
488 
489 static void umc_v6_7_query_ras_error_address(struct amdgpu_device *adev,
490 					     void *ras_error_status)
491 {
492 	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
493 
494 	uint32_t umc_inst        = 0;
495 	uint32_t ch_inst         = 0;
496 	uint32_t umc_reg_offset  = 0;
497 
498 	/*TODO: driver needs to toggle DF Cstate to ensure
499 	 * safe access of UMC resgisters. Will add the protection
500 	 * when firmware interface is ready */
501 	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
502 		umc_reg_offset = get_umc_v6_7_reg_offset(adev,
503 							 umc_inst,
504 							 ch_inst);
505 		umc_v6_7_query_error_address(adev,
506 					     err_data,
507 					     umc_reg_offset,
508 					     ch_inst,
509 					     umc_inst);
510 	}
511 }
512 
513 static uint32_t umc_v6_7_query_ras_poison_mode_per_channel(
514 						struct amdgpu_device *adev,
515 						uint32_t umc_reg_offset)
516 {
517 	uint32_t ecc_ctrl_addr, ecc_ctrl;
518 
519 	ecc_ctrl_addr =
520 		SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccCtrl);
521 	ecc_ctrl = RREG32_PCIE((ecc_ctrl_addr +
522 					umc_reg_offset) * 4);
523 
524 	return REG_GET_FIELD(ecc_ctrl, UMCCH0_0_EccCtrl, UCFatalEn);
525 }
526 
527 static bool umc_v6_7_query_ras_poison_mode(struct amdgpu_device *adev)
528 {
529 	uint32_t umc_reg_offset  = 0;
530 
531 	/* Enabling fatal error in umc instance0 channel0 will be
532 	 * considered as fatal error mode
533 	 */
534 	umc_reg_offset = get_umc_v6_7_reg_offset(adev, 0, 0);
535 	return !umc_v6_7_query_ras_poison_mode_per_channel(adev, umc_reg_offset);
536 }
537 
/* Hardware-access RAS callbacks: used when the driver reads the UMC
 * error registers directly (as opposed to the ecc_info firmware path). */
const struct amdgpu_ras_block_hw_ops umc_v6_7_ras_hw_ops = {
	.query_ras_error_count = umc_v6_7_query_ras_error_count,
	.query_ras_error_address = umc_v6_7_query_ras_error_address,
};
542 
/* UMC v6.7 RAS block descriptor: wires up both the direct-hardware ops
 * and the firmware ecc_info query paths, plus the poison-mode probe. */
struct amdgpu_umc_ras umc_v6_7_ras = {
	.ras_block = {
		.hw_ops = &umc_v6_7_ras_hw_ops,
	},
	.query_ras_poison_mode = umc_v6_7_query_ras_poison_mode,
	.ecc_info_query_ras_error_count = umc_v6_7_ecc_info_query_ras_error_count,
	.ecc_info_query_ras_error_address = umc_v6_7_ecc_info_query_ras_error_address,
};
551