/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include "umc_v8_7.h"
#include "amdgpu_ras.h"
#include "amdgpu_umc.h"
#include "amdgpu.h"

#include "rsmu/rsmu_0_0_2_offset.h"
#include "rsmu/rsmu_0_0_2_sh_mask.h"
#include "umc/umc_8_7_0_offset.h"
#include "umc/umc_8_7_0_sh_mask.h"

#define UMC_8_INST_DIST		0x40000

const uint32_t
umc_v8_7_channel_idx_tbl[UMC_V8_7_UMC_INSTANCE_NUM][UMC_V8_7_CHANNEL_INSTANCE_NUM] = {
	{2, 11},  {4, 13},
	{1, 8},   {7, 14},
	{10, 3},  {12, 5},
	{9, 0},   {15, 6}
};

static inline uint32_t get_umc_v8_7_reg_offset(struct amdgpu_device *adev,
					       uint32_t umc_inst,
					       uint32_t ch_inst)
{
	return adev->umc.channel_offs * ch_inst + UMC_8_INST_DIST * umc_inst;
}

static inline uint32_t get_umc_v8_7_channel_index(struct amdgpu_device *adev,
						  uint32_t umc_inst,
						  uint32_t ch_inst)
{
	return adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];
}
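/*
 * The ecc_info_* helpers below never touch UMC registers: they read the
 * MCA status/address words already cached in the RAS context
 * (ras->umc_ecc), presumably filled in by the host/firmware ECC query
 * path, so they can run even when direct register access is unsafe.
 */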
static void umc_v8_7_ecc_info_query_correctable_error_count(struct amdgpu_device *adev,
					uint32_t umc_inst, uint32_t ch_inst,
					unsigned long *error_count)
{
	uint64_t mc_umc_status;
	uint32_t eccinfo_table_idx;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;

	/* check for SRAM correctable error
	 * MCUMC_STATUS is a 64 bit register
	 */
	mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
		*error_count += 1;
}

static void umc_v8_7_ecc_info_query_uncorrectable_error_count(struct amdgpu_device *adev,
					uint32_t umc_inst, uint32_t ch_inst,
					unsigned long *error_count)
{
	uint64_t mc_umc_status;
	uint32_t eccinfo_table_idx;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;

	/* check the MCUMC_STATUS */
	mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
	if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
	     REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	     REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
	     REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
	     REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1))
		*error_count += 1;
}

static void umc_v8_7_ecc_info_query_ras_error_count(struct amdgpu_device *adev,
					void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

	uint32_t umc_inst = 0;
	uint32_t ch_inst = 0;

	/* TODO: driver needs to toggle DF Cstate to ensure
	 * safe access of UMC registers. Will add the protection
	 * when firmware interface is ready
	 */
	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_v8_7_ecc_info_query_correctable_error_count(adev,
					umc_inst, ch_inst,
					&(err_data->ce_count));
		umc_v8_7_ecc_info_query_uncorrectable_error_count(adev,
					umc_inst, ch_inst,
					&(err_data->ue_count));
	}
}

static void umc_v8_7_ecc_info_query_error_address(struct amdgpu_device *adev,
					struct ras_err_data *err_data,
					uint32_t ch_inst,
					uint32_t umc_inst)
{
	uint64_t mc_umc_status, err_addr, retired_page;
	uint32_t channel_index;
	uint32_t eccinfo_table_idx;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;
	channel_index = get_umc_v8_7_channel_index(adev, umc_inst, ch_inst);

	mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;

	if (mc_umc_status == 0)
		return;

	if (!err_data->err_addr)
		return;

	/* calculate error address if ue/ce error is detected */
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	     REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) {

		err_addr = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_addr;
		err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);

		/* translate umc channel address to soc pa, 3 parts are included */
		retired_page = ADDR_OF_4KB_BLOCK(err_addr) |
				ADDR_OF_256B_BLOCK(channel_index) |
				OFFSET_IN_256B_BLOCK(err_addr);

		/* we only save ue error information currently, ce is skipped */
		if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1)
			amdgpu_umc_fill_error_record(err_data, err_addr,
					retired_page, channel_index, umc_inst);
	}
}

static void umc_v8_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev,
					void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

	uint32_t umc_inst = 0;
	uint32_t ch_inst = 0;

	/* TODO: driver needs to toggle DF Cstate to ensure
	 * safe access of UMC registers. Will add the protection
	 * when firmware interface is ready
	 */
	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_v8_7_ecc_info_query_error_address(adev,
					err_data,
					ch_inst,
					umc_inst);
	}
}
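/*
 * Everything below accesses the UMC registers directly through the PCIE
 * aperture.  Each channel's GECC error counter serves two chips: the
 * lower (0) or higher (1) chip is selected via GeccErrCntSel.GeccErrCntCsSel
 * before GeccErrCnt is read, written, or cleared.
 */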
static void umc_v8_7_clear_error_count_per_channel(struct amdgpu_device *adev,
					uint32_t umc_reg_offset)
{
	uint32_t ecc_err_cnt_addr;
	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;

	ecc_err_cnt_sel_addr =
		SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCntSel);
	ecc_err_cnt_addr =
		SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCnt);

	/* select the lower chip */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
					umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
					UMCCH0_0_GeccErrCntSel,
					GeccErrCntCsSel, 0);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
			ecc_err_cnt_sel);

	/* clear lower chip error count */
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
			UMC_V8_7_CE_CNT_INIT);

	/* select the higher chip */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
					umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
					UMCCH0_0_GeccErrCntSel,
					GeccErrCntCsSel, 1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
			ecc_err_cnt_sel);

	/* clear higher chip error count */
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
			UMC_V8_7_CE_CNT_INIT);
}

static void umc_v8_7_clear_error_count(struct amdgpu_device *adev)
{
	uint32_t umc_inst = 0;
	uint32_t ch_inst = 0;
	uint32_t umc_reg_offset = 0;

	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_v8_7_reg_offset(adev,
						umc_inst,
						ch_inst);

		umc_v8_7_clear_error_count_per_channel(adev,
						umc_reg_offset);
	}
}

static void umc_v8_7_query_correctable_error_count(struct amdgpu_device *adev,
						   uint32_t umc_reg_offset,
						   unsigned long *error_count)
{
	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
	uint32_t ecc_err_cnt, ecc_err_cnt_addr;
	uint64_t mc_umc_status;
	uint32_t mc_umc_status_addr;

	/* UMC 8_7_2 registers */
	ecc_err_cnt_sel_addr =
		SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCntSel);
	ecc_err_cnt_addr =
		SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCnt);
	mc_umc_status_addr =
		SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);

	/* select the lower chip and check the error count */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
					GeccErrCntCsSel, 0);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);

	ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
	*error_count +=
		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_GeccErrCnt, GeccErrCnt) -
		 UMC_V8_7_CE_CNT_INIT);

	/* select the higher chip and check the err counter */
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
					GeccErrCntCsSel, 1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);

	ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
	*error_count +=
		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_GeccErrCnt, GeccErrCnt) -
		 UMC_V8_7_CE_CNT_INIT);

	/* check for SRAM correctable error
	 * MCUMC_STATUS is a 64 bit register
	 */
	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, ErrorCodeExt) == 6 &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
		*error_count += 1;
}
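/*
 * A channel is counted as having an uncorrectable error when its MCA
 * status word is valid and any of the Deferred, UECC, PCC, UC or TCC
 * bits is set.
 */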
static void umc_v8_7_query_uncorrectable_error_count(struct amdgpu_device *adev,
						     uint32_t umc_reg_offset,
						     unsigned long *error_count)
{
	uint64_t mc_umc_status;
	uint32_t mc_umc_status_addr;

	mc_umc_status_addr = SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);

	/* check the MCUMC_STATUS */
	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
	if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
	     REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	     REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
	     REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
	     REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1))
		*error_count += 1;
}

static void umc_v8_7_query_ras_error_count(struct amdgpu_device *adev,
					   void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

	uint32_t umc_inst = 0;
	uint32_t ch_inst = 0;
	uint32_t umc_reg_offset = 0;

	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_v8_7_reg_offset(adev,
							 umc_inst,
							 ch_inst);

		umc_v8_7_query_correctable_error_count(adev,
						       umc_reg_offset,
						       &(err_data->ce_count));
		umc_v8_7_query_uncorrectable_error_count(adev,
							 umc_reg_offset,
							 &(err_data->ue_count));
	}

	umc_v8_7_clear_error_count(adev);
}

static void umc_v8_7_query_error_address(struct amdgpu_device *adev,
					 struct ras_err_data *err_data,
					 uint32_t umc_reg_offset,
					 uint32_t ch_inst,
					 uint32_t umc_inst)
{
	uint32_t lsb, mc_umc_status_addr;
	uint64_t mc_umc_status, err_addr, retired_page, mc_umc_addrt0;
	uint32_t channel_index = get_umc_v8_7_channel_index(adev, umc_inst, ch_inst);

	mc_umc_status_addr =
		SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
	mc_umc_addrt0 =
		SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_ADDRT0);

	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);

	if (mc_umc_status == 0)
		return;

	if (!err_data->err_addr) {
		/* clear umc status */
		WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
		return;
	}

	/* calculate error address if ue/ce error is detected */
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	     REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) {

		err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4);
		/* the lowest lsb bits should be ignored */
		lsb = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, LSB);
		err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
		err_addr &= ~((0x1ULL << lsb) - 1);

		/* translate umc channel address to soc pa, 3 parts are included */
		retired_page = ADDR_OF_4KB_BLOCK(err_addr) |
				ADDR_OF_256B_BLOCK(channel_index) |
				OFFSET_IN_256B_BLOCK(err_addr);

		/* we only save ue error information currently, ce is skipped */
		if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1)
			amdgpu_umc_fill_error_record(err_data, err_addr,
					retired_page, channel_index, umc_inst);
	}

	/* clear umc status */
	WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
}

static void umc_v8_7_query_ras_error_address(struct amdgpu_device *adev,
					     void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

	uint32_t umc_inst = 0;
	uint32_t ch_inst = 0;
	uint32_t umc_reg_offset = 0;

	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_v8_7_reg_offset(adev,
							 umc_inst,
							 ch_inst);

		umc_v8_7_query_error_address(adev,
					     err_data,
					     umc_reg_offset,
					     ch_inst,
					     umc_inst);
	}
}
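/*
 * Counter init: the GECC error counter is preloaded with
 * UMC_V8_7_CE_CNT_INIT rather than zero (the query path above subtracts
 * the same value when reading it back), and the CE error interrupt is
 * routed to an APIC based interrupt via GeccErrInt.
 */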
static void umc_v8_7_err_cnt_init_per_channel(struct amdgpu_device *adev,
					      uint32_t umc_reg_offset)
{
	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
	uint32_t ecc_err_cnt_addr;

	ecc_err_cnt_sel_addr =
		SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCntSel);
	ecc_err_cnt_addr =
		SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCnt);

	/* select the lower chip */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
					GeccErrCntCsSel, 0);
	/* set ce error interrupt type to APIC based interrupt */
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
					GeccErrInt, 0x1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
	/* set error count to initial value */
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V8_7_CE_CNT_INIT);

	/* select the higher chip */
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
					GeccErrCntCsSel, 1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V8_7_CE_CNT_INIT);
}

static void umc_v8_7_err_cnt_init(struct amdgpu_device *adev)
{
	uint32_t umc_inst = 0;
	uint32_t ch_inst = 0;
	uint32_t umc_reg_offset = 0;

	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_v8_7_reg_offset(adev,
							 umc_inst,
							 ch_inst);

		umc_v8_7_err_cnt_init_per_channel(adev, umc_reg_offset);
	}
}

const struct amdgpu_ras_block_hw_ops umc_v8_7_ras_hw_ops = {
	.query_ras_error_count = umc_v8_7_query_ras_error_count,
	.query_ras_error_address = umc_v8_7_query_ras_error_address,
};

struct amdgpu_umc_ras umc_v8_7_ras = {
	.ras_block = {
		.hw_ops = &umc_v8_7_ras_hw_ops,
	},
	.err_cnt_init = umc_v8_7_err_cnt_init,
	.ecc_info_query_ras_error_count = umc_v8_7_ecc_info_query_ras_error_count,
	.ecc_info_query_ras_error_address = umc_v8_7_ecc_info_query_ras_error_address,
};