/*
 * Copyright 2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include "umc_v6_7.h"
#include "amdgpu_ras.h"
#include "amdgpu_umc.h"
#include "amdgpu.h"

#include "umc/umc_6_7_0_offset.h"
#include "umc/umc_6_7_0_sh_mask.h"

const uint32_t
umc_v6_7_channel_idx_tbl_second[UMC_V6_7_UMC_INSTANCE_NUM][UMC_V6_7_CHANNEL_INSTANCE_NUM] = {
	{28, 20, 24, 16, 12, 4, 8, 0},
	{6, 30, 2, 26, 22, 14, 18, 10},
	{19, 11, 15, 7, 3, 27, 31, 23},
	{9, 1, 5, 29, 25, 17, 21, 13}
};
const uint32_t
umc_v6_7_channel_idx_tbl_first[UMC_V6_7_UMC_INSTANCE_NUM][UMC_V6_7_CHANNEL_INSTANCE_NUM] = {
	{19, 11, 15, 7, 3, 27, 31, 23},
	{9, 1, 5, 29, 25, 17, 21, 13},
	{28, 20, 24, 16, 12, 4, 8, 0},
	{6, 30, 2, 26, 22, 14, 18, 10},
};

static inline uint32_t get_umc_v6_7_reg_offset(struct amdgpu_device *adev,
					      uint32_t umc_inst,
					      uint32_t ch_inst)
{
	return adev->umc.channel_offs * ch_inst + UMC_V6_7_INST_DIST * umc_inst;
}

static inline uint32_t get_umc_v6_7_channel_index(struct amdgpu_device *adev,
					      uint32_t umc_inst,
					      uint32_t ch_inst)
{
	return adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];
}

static void umc_v6_7_ecc_info_query_correctable_error_count(struct amdgpu_device *adev,
						   uint32_t channel_index,
						   unsigned long *error_count)
{
	uint32_t ecc_err_cnt;
	uint64_t mc_umc_status;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	/*
	 * select the lower chip and check the error count
	 * skip add error count, calc error counter only from mca_umc_status
	 */
	ecc_err_cnt = ras->umc_ecc.ecc[channel_index].ce_count_lo_chip;

	/*
	 * select the higher chip and check the err counter
	 * skip add error count, calc error counter only from mca_umc_status
	 */
	ecc_err_cnt = ras->umc_ecc.ecc[channel_index].ce_count_hi_chip;

	/* check for SRAM correctable error
	   MCUMC_STATUS is a 64 bit register */
	mc_umc_status = ras->umc_ecc.ecc[channel_index].mca_umc_status;
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
		*error_count += 1;
}
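/*
 * Count one uncorrectable error for this channel when the cached
 * MCA_UMC_UMC0_MCUMC_STATUST0 value is valid (Val == 1) and any of the
 * Deferred, UECC, PCC, UC or TCC bits is set. The status value is taken
 * from ras->umc_ecc, which is presumably populated by the firmware/RAS TA
 * interface rather than read back from the registers here.
 */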
static void umc_v6_7_ecc_info_querry_uncorrectable_error_count(struct amdgpu_device *adev,
						      uint32_t channel_index,
						      unsigned long *error_count)
{
	uint64_t mc_umc_status;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	/* check the MCUMC_STATUS */
	mc_umc_status = ras->umc_ecc.ecc[channel_index].mca_umc_status;
	if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1))
		*error_count += 1;
}

static void umc_v6_7_ecc_info_query_ras_error_count(struct amdgpu_device *adev,
					   void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

	uint32_t umc_inst = 0;
	uint32_t ch_inst = 0;
	uint32_t umc_reg_offset = 0;
	uint32_t channel_index = 0;

	/*TODO: driver needs to toggle DF Cstate to ensure
	 * safe access of UMC registers. Will add the protection */
	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_v6_7_reg_offset(adev,
							 umc_inst,
							 ch_inst);
		channel_index = get_umc_v6_7_channel_index(adev,
							 umc_inst,
							 ch_inst);
		umc_v6_7_ecc_info_query_correctable_error_count(adev,
							 channel_index,
							 &(err_data->ce_count));
		umc_v6_7_ecc_info_querry_uncorrectable_error_count(adev,
							 channel_index,
							 &(err_data->ue_count));
	}
}

static void umc_v6_7_ecc_info_query_error_address(struct amdgpu_device *adev,
					 struct ras_err_data *err_data,
					 uint32_t umc_reg_offset,
					 uint32_t ch_inst,
					 uint32_t umc_inst)
{
	uint64_t mc_umc_status, err_addr, retired_page;
	struct eeprom_table_record *err_rec;
	uint32_t channel_index;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	channel_index =
		adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];

	mc_umc_status = ras->umc_ecc.ecc[channel_index].mca_umc_status;

	if (mc_umc_status == 0)
		return;

	if (!err_data->err_addr)
		return;

	err_rec = &err_data->err_addr[err_data->err_addr_cnt];

	/* calculate error address if ue/ce error is detected */
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) {

		err_addr = ras->umc_ecc.ecc[channel_index].mca_umc_addr;
		err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);

		/* translate umc channel address to soc pa, 3 parts are included */
		retired_page = ADDR_OF_8KB_BLOCK(err_addr) |
				ADDR_OF_256B_BLOCK(channel_index) |
				OFFSET_IN_256B_BLOCK(err_addr);

		/* we only save ue error information currently, ce is skipped */
		if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
				== 1) {
			err_rec->address = err_addr;
			/* page frame address is saved */
			err_rec->retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
			err_rec->ts = (uint64_t)ktime_get_real_seconds();
			err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
			err_rec->cu = 0;
			err_rec->mem_channel = channel_index;
			err_rec->mcumc_id = umc_inst;

			err_data->err_addr_cnt++;
		}
	}
}
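/*
 * Walk every UMC instance/channel pair and record a retired page for each
 * detected UE, using the status and address values cached in ras->umc_ecc.
 * The SoC physical address is assembled from three parts: the 8KB-block
 * portion of the UMC channel address, the channel index placed into the
 * 256B-block field, and the byte offset within the 256B block. The exact
 * bit positions are defined by the ADDR_OF_8KB_BLOCK, ADDR_OF_256B_BLOCK
 * and OFFSET_IN_256B_BLOCK macros, so this is only a sketch of the layout.
 */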
static void umc_v6_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev,
					void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

	uint32_t umc_inst = 0;
	uint32_t ch_inst = 0;
	uint32_t umc_reg_offset = 0;

	/*TODO: driver needs to toggle DF Cstate to ensure
	 * safe access of UMC registers. Will add the protection
	 * when firmware interface is ready */
	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_v6_7_reg_offset(adev,
							 umc_inst,
							 ch_inst);
		umc_v6_7_ecc_info_query_error_address(adev,
					     err_data,
					     umc_reg_offset,
					     ch_inst,
					     umc_inst);
	}
}

static void umc_v6_7_query_correctable_error_count(struct amdgpu_device *adev,
						   uint32_t umc_reg_offset,
						   unsigned long *error_count)
{
	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
	uint32_t ecc_err_cnt, ecc_err_cnt_addr;
	uint64_t mc_umc_status;
	uint32_t mc_umc_status_addr;

	/* UMC 6_7 registers */
	ecc_err_cnt_sel_addr =
		SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccErrCntSel);
	ecc_err_cnt_addr =
		SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccErrCnt);
	mc_umc_status_addr =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);

	/* select the lower chip and check the error count */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 0);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);

	ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
	*error_count +=
		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
		 UMC_V6_7_CE_CNT_INIT);

	/* select the higher chip and check the err counter */
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);

	ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
	*error_count +=
		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
		 UMC_V6_7_CE_CNT_INIT);

	/* check for SRAM correctable error
	   MCUMC_STATUS is a 64 bit register */
	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
		*error_count += 1;
}

static void umc_v6_7_querry_uncorrectable_error_count(struct amdgpu_device *adev,
						      uint32_t umc_reg_offset,
						      unsigned long *error_count)
{
	uint64_t mc_umc_status;
	uint32_t mc_umc_status_addr;

	mc_umc_status_addr =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);

	/* check the MCUMC_STATUS */
	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
	if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1))
		*error_count += 1;
}
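/*
 * Re-arm the per-channel correctable error counters: select each chip in
 * turn via EccErrCntCsSel and write UMC_V6_7_CE_CNT_INIT back into
 * UMCCH0_0_EccErrCnt. This is the same baseline that
 * umc_v6_7_query_correctable_error_count() subtracts when it reads the
 * counter, so counts reported after a reset start from zero.
 */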
static void umc_v6_7_reset_error_count_per_channel(struct amdgpu_device *adev,
					uint32_t umc_reg_offset)
{
	uint32_t ecc_err_cnt_addr;
	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;

	ecc_err_cnt_sel_addr =
		SOC15_REG_OFFSET(UMC, 0,
				regUMCCH0_0_EccErrCntSel);
	ecc_err_cnt_addr =
		SOC15_REG_OFFSET(UMC, 0,
				regUMCCH0_0_EccErrCnt);

	/* select the lower chip */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
				      umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
					UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 0);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
			ecc_err_cnt_sel);

	/* clear lower chip error count */
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
			UMC_V6_7_CE_CNT_INIT);

	/* select the higher chip */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
				      umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
					UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
			ecc_err_cnt_sel);

	/* clear higher chip error count */
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
			UMC_V6_7_CE_CNT_INIT);
}

static void umc_v6_7_reset_error_count(struct amdgpu_device *adev)
{
	uint32_t umc_inst = 0;
	uint32_t ch_inst = 0;
	uint32_t umc_reg_offset = 0;

	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_v6_7_reg_offset(adev,
							 umc_inst,
							 ch_inst);

		umc_v6_7_reset_error_count_per_channel(adev,
						       umc_reg_offset);
	}
}
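/*
 * Register-based counterpart of umc_v6_7_ecc_info_query_ras_error_count():
 * instead of consuming values cached in ras->umc_ecc, it reads the ECC error
 * counters and MCA status through the PCIE indirect register interface for
 * every UMC instance/channel pair, then resets the counters so the next
 * query starts from a clean baseline.
 */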
static void umc_v6_7_query_ras_error_count(struct amdgpu_device *adev,
					   void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

	uint32_t umc_inst = 0;
	uint32_t ch_inst = 0;
	uint32_t umc_reg_offset = 0;

	/*TODO: driver needs to toggle DF Cstate to ensure
	 * safe access of UMC registers. Will add the protection */
	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_v6_7_reg_offset(adev,
							 umc_inst,
							 ch_inst);
		umc_v6_7_query_correctable_error_count(adev,
						       umc_reg_offset,
						       &(err_data->ce_count));
		umc_v6_7_querry_uncorrectable_error_count(adev,
							  umc_reg_offset,
							  &(err_data->ue_count));
	}

	umc_v6_7_reset_error_count(adev);
}

static void umc_v6_7_query_error_address(struct amdgpu_device *adev,
					 struct ras_err_data *err_data,
					 uint32_t umc_reg_offset,
					 uint32_t ch_inst,
					 uint32_t umc_inst)
{
	uint32_t mc_umc_status_addr;
	uint64_t mc_umc_status, err_addr, retired_page, mc_umc_addrt0;
	struct eeprom_table_record *err_rec;
	uint32_t channel_index;

	mc_umc_status_addr =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
	mc_umc_addrt0 =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0);

	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);

	if (mc_umc_status == 0)
		return;

	if (!err_data->err_addr) {
		/* clear umc status */
		WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
		return;
	}

	err_rec = &err_data->err_addr[err_data->err_addr_cnt];

	channel_index =
		adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];

	/* calculate error address if ue/ce error is detected */
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) {

		err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4);
		err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);

		/* translate umc channel address to soc pa, 3 parts are included */
		retired_page = ADDR_OF_8KB_BLOCK(err_addr) |
				ADDR_OF_256B_BLOCK(channel_index) |
				OFFSET_IN_256B_BLOCK(err_addr);

		/* we only save ue error information currently, ce is skipped */
		if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
				== 1) {
			err_rec->address = err_addr;
			/* page frame address is saved */
			err_rec->retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
			err_rec->ts = (uint64_t)ktime_get_real_seconds();
			err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
			err_rec->cu = 0;
			err_rec->mem_channel = channel_index;
			err_rec->mcumc_id = umc_inst;

			err_data->err_addr_cnt++;
		}
	}

	/* clear umc status */
	WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
}
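/*
 * Register-based counterpart of umc_v6_7_ecc_info_query_ras_error_address():
 * harvests the error address directly from MCA_UMC_UMC0_MCUMC_ADDRT0 for
 * every UMC instance/channel pair and clears MCUMC_STATUST0 afterwards.
 */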
static void umc_v6_7_query_ras_error_address(struct amdgpu_device *adev,
					     void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

	uint32_t umc_inst = 0;
	uint32_t ch_inst = 0;
	uint32_t umc_reg_offset = 0;

	/*TODO: driver needs to toggle DF Cstate to ensure
	 * safe access of UMC registers. Will add the protection
	 * when firmware interface is ready */
	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_v6_7_reg_offset(adev,
							 umc_inst,
							 ch_inst);
		umc_v6_7_query_error_address(adev,
					     err_data,
					     umc_reg_offset,
					     ch_inst,
					     umc_inst);
	}
}

static uint32_t umc_v6_7_query_ras_poison_mode_per_channel(
						struct amdgpu_device *adev,
						uint32_t umc_reg_offset)
{
	uint32_t ecc_ctrl_addr, ecc_ctrl;

	ecc_ctrl_addr =
		SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccCtrl);
	ecc_ctrl = RREG32_PCIE((ecc_ctrl_addr +
					umc_reg_offset) * 4);

	return REG_GET_FIELD(ecc_ctrl, UMCCH0_0_EccCtrl, UCFatalEn);
}

static bool umc_v6_7_query_ras_poison_mode(struct amdgpu_device *adev)
{
	uint32_t umc_inst = 0;
	uint32_t ch_inst = 0;
	uint32_t umc_reg_offset = 0;

	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_v6_7_reg_offset(adev,
							umc_inst,
							ch_inst);
		/* Enabling fatal error in one channel will be considered
		   as fatal error mode */
		if (umc_v6_7_query_ras_poison_mode_per_channel(adev, umc_reg_offset))
			return false;
	}

	return true;
}

const struct amdgpu_umc_ras_funcs umc_v6_7_ras_funcs = {
	.ras_late_init = amdgpu_umc_ras_late_init,
	.ras_fini = amdgpu_umc_ras_fini,
	.query_ras_error_count = umc_v6_7_query_ras_error_count,
	.query_ras_error_address = umc_v6_7_query_ras_error_address,
	.query_ras_poison_mode = umc_v6_7_query_ras_poison_mode,
	.ecc_info_query_ras_error_count = umc_v6_7_ecc_info_query_ras_error_count,
	.ecc_info_query_ras_error_address = umc_v6_7_ecc_info_query_ras_error_address,
};
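/*
 * A minimal usage sketch (an assumption about the surrounding driver, not
 * part of this file): the GMC IP block for ASICs with UMC 6.7 is expected to
 * hook this table up during early init, roughly as
 *
 *	adev->umc.ras_funcs = &umc_v6_7_ras_funcs;
 *
 * after which the common amdgpu RAS code drives these callbacks for late
 * init, error counting, error address harvesting and poison-mode queries.
 */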