xref: /openbmc/linux/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c (revision 9257bd80)
/*
 * Copyright 2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include "umc_v6_7.h"
#include "amdgpu_ras.h"
#include "amdgpu_umc.h"
#include "amdgpu.h"

#include "umc/umc_6_7_0_offset.h"
#include "umc/umc_6_7_0_sh_mask.h"

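/* compute the register offset of one channel: UMC instances are
 * UMC_V6_7_INST_DIST dwords apart and channels within an instance are
 * adev->umc.channel_offs dwords apart
 */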
static inline uint32_t get_umc_v6_7_reg_offset(struct amdgpu_device *adev,
					      uint32_t umc_inst,
					      uint32_t ch_inst)
{
	return adev->umc.channel_offs * ch_inst + UMC_V6_7_INST_DIST * umc_inst;
}

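/* accumulate the correctable error count of one channel: the counters of
 * both chip selects are read through EccErrCntSel/EccErrCnt and their
 * preset value UMC_V6_7_CE_CNT_INIT is subtracted; an SRAM correctable
 * error flagged in the MCA status register adds one more. Register dword
 * offsets are multiplied by 4 to form the byte addresses the PCIE
 * accessors expect.
 */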
static void umc_v6_7_query_correctable_error_count(struct amdgpu_device *adev,
						   uint32_t umc_reg_offset,
						   unsigned long *error_count)
{
	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
	uint32_t ecc_err_cnt, ecc_err_cnt_addr;
	uint64_t mc_umc_status;
	uint32_t mc_umc_status_addr;
	/* UMC 6_7_0 registers */
	ecc_err_cnt_sel_addr =
		SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccErrCntSel);
	ecc_err_cnt_addr =
		SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccErrCnt);
	mc_umc_status_addr =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);

	/* select the lower chip and check the error count */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 0);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);

	ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
	*error_count +=
		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
		 UMC_V6_7_CE_CNT_INIT);

	/* select the higher chip and check the error count */
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);

	ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
	*error_count +=
		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
		 UMC_V6_7_CE_CNT_INIT);
	/* check for SRAM correctable error
	 * MCUMC_STATUS is a 64 bit register */
	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
		*error_count += 1;
}

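/* count one uncorrectable error if the MCA status register is valid and
 * flags a deferred, UECC, PCC, UC or TCC error
 */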
static void umc_v6_7_query_uncorrectable_error_count(struct amdgpu_device *adev,
						     uint32_t umc_reg_offset,
						     unsigned long *error_count)
{
	uint64_t mc_umc_status;
	uint32_t mc_umc_status_addr;

	mc_umc_status_addr =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);

	/* check the MCUMC_STATUS */
	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
	if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1))
		*error_count += 1;
}

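/* rearm the error counters of one channel by writing the preset value
 * UMC_V6_7_CE_CNT_INIT back to both chip selects
 */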
static void umc_v6_7_reset_error_count_per_channel(struct amdgpu_device *adev,
						   uint32_t umc_reg_offset)
{
	uint32_t ecc_err_cnt_addr;
	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;

	ecc_err_cnt_sel_addr =
		SOC15_REG_OFFSET(UMC, 0,
				regUMCCH0_0_EccErrCntSel);
	ecc_err_cnt_addr =
		SOC15_REG_OFFSET(UMC, 0,
				regUMCCH0_0_EccErrCnt);

	/* select the lower chip */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
				       umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
					UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 0);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
			ecc_err_cnt_sel);

	/* clear lower chip error count */
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
			UMC_V6_7_CE_CNT_INIT);

	/* select the higher chip */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
					umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
					UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
			ecc_err_cnt_sel);

	/* clear higher chip error count */
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
			UMC_V6_7_CE_CNT_INIT);
}

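/* walk all UMC instances and channels and rearm their error counters */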
static void umc_v6_7_reset_error_count(struct amdgpu_device *adev)
{
	uint32_t umc_inst        = 0;
	uint32_t ch_inst         = 0;
	uint32_t umc_reg_offset  = 0;

	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_v6_7_reg_offset(adev,
							 umc_inst,
							 ch_inst);

		umc_v6_7_reset_error_count_per_channel(adev,
						       umc_reg_offset);
	}
}

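/* fill err_data with the correctable and uncorrectable error counts of
 * all channels, then rearm the counters for the next query
 */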
static void umc_v6_7_query_ras_error_count(struct amdgpu_device *adev,
					   void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

	uint32_t umc_inst        = 0;
	uint32_t ch_inst         = 0;
	uint32_t umc_reg_offset  = 0;

	/* TODO: driver needs to toggle DF Cstate to ensure
	 * safe access of UMC registers. Will add the protection */
	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_v6_7_reg_offset(adev,
							 umc_inst,
							 ch_inst);
		umc_v6_7_query_correctable_error_count(adev,
						       umc_reg_offset,
						       &(err_data->ce_count));
		umc_v6_7_query_uncorrectable_error_count(adev,
							 umc_reg_offset,
							 &(err_data->ue_count));
	}

	umc_v6_7_reset_error_count(adev);
}

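/* retrieve the normalized error address of one channel from the MCA
 * address register, translate it to a SoC physical address and, for
 * uncorrectable errors, record the page in err_data for retirement
 */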
static void umc_v6_7_query_error_address(struct amdgpu_device *adev,
					 struct ras_err_data *err_data,
					 uint32_t umc_reg_offset,
					 uint32_t ch_inst,
					 uint32_t umc_inst)
{
	uint32_t mc_umc_status_addr;
	uint64_t mc_umc_status, err_addr, retired_page, mc_umc_addrt0;
	struct eeprom_table_record *err_rec;
	uint32_t channel_index;

	mc_umc_status_addr =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
	mc_umc_addrt0 =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0);

	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);

	if (mc_umc_status == 0)
		return;

	if (!err_data->err_addr) {
		/* clear umc status */
		WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
		return;
	}

	err_rec = &err_data->err_addr[err_data->err_addr_cnt];

	channel_index =
		adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];

	/* calculate error address if ue/ce error is detected */
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) {

		err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4);
		err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);

		/* translate umc channel address to soc pa, 3 parts are included */
		retired_page = ADDR_OF_8KB_BLOCK(err_addr) |
				ADDR_OF_256B_BLOCK(channel_index) |
				OFFSET_IN_256B_BLOCK(err_addr);

		/* we only save ue error information currently, ce is skipped */
		if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
				== 1) {
			err_rec->address = err_addr;
			/* page frame address is saved */
			err_rec->retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
			err_rec->ts = (uint64_t)ktime_get_real_seconds();
			err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
			err_rec->cu = 0;
			err_rec->mem_channel = channel_index;
			err_rec->mcumc_id = umc_inst;

			err_data->err_addr_cnt++;
		}
	}

	/* clear umc status */
	WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
}

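/* collect the error addresses of all channels into err_data */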
static void umc_v6_7_query_ras_error_address(struct amdgpu_device *adev,
					     void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

	uint32_t umc_inst        = 0;
	uint32_t ch_inst         = 0;
	uint32_t umc_reg_offset  = 0;

	/* TODO: driver needs to toggle DF Cstate to ensure
	 * safe access of UMC registers. Will add the protection
	 * when firmware interface is ready */
	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_v6_7_reg_offset(adev,
							 umc_inst,
							 ch_inst);
		umc_v6_7_query_error_address(adev,
					     err_data,
					     umc_reg_offset,
					     ch_inst,
					     umc_inst);
	}
}

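/* RAS callbacks of UMC 6.7 */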
const struct amdgpu_umc_ras_funcs umc_v6_7_ras_funcs = {
	.ras_late_init = amdgpu_umc_ras_late_init,
	.ras_fini = amdgpu_umc_ras_fini,
	.query_ras_error_count = umc_v6_7_query_ras_error_count,
	.query_ras_error_address = umc_v6_7_query_ras_error_address,
};