19884c2b1SHawking Zhang /* 29884c2b1SHawking Zhang * Copyright 2019 Advanced Micro Devices, Inc. 39884c2b1SHawking Zhang * 49884c2b1SHawking Zhang * Permission is hereby granted, free of charge, to any person obtaining a 59884c2b1SHawking Zhang * copy of this software and associated documentation files (the "Software"), 69884c2b1SHawking Zhang * to deal in the Software without restriction, including without limitation 79884c2b1SHawking Zhang * the rights to use, copy, modify, merge, publish, distribute, sublicense, 89884c2b1SHawking Zhang * and/or sell copies of the Software, and to permit persons to whom the 99884c2b1SHawking Zhang * Software is furnished to do so, subject to the following conditions: 109884c2b1SHawking Zhang * 119884c2b1SHawking Zhang * The above copyright notice and this permission notice shall be included in 129884c2b1SHawking Zhang * all copies or substantial portions of the Software. 139884c2b1SHawking Zhang * 149884c2b1SHawking Zhang * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 159884c2b1SHawking Zhang * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 169884c2b1SHawking Zhang * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 179884c2b1SHawking Zhang * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 189884c2b1SHawking Zhang * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 199884c2b1SHawking Zhang * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 209884c2b1SHawking Zhang * OTHER DEALINGS IN THE SOFTWARE. 219884c2b1SHawking Zhang * 229884c2b1SHawking Zhang */ 239884c2b1SHawking Zhang #include "umc_v6_1.h" 249884c2b1SHawking Zhang #include "amdgpu_ras.h" 259884c2b1SHawking Zhang #include "amdgpu.h" 269884c2b1SHawking Zhang 279884c2b1SHawking Zhang #include "rsmu/rsmu_0_0_2_offset.h" 289884c2b1SHawking Zhang #include "rsmu/rsmu_0_0_2_sh_mask.h" 299884c2b1SHawking Zhang #include "umc/umc_6_1_1_offset.h" 309884c2b1SHawking Zhang #include "umc/umc_6_1_1_sh_mask.h" 319884c2b1SHawking Zhang 32c2742aefSTao Zhou #define smnMCA_UMC0_MCUMC_ADDRT0 0x50f10 33c2742aefSTao Zhou 348c948103STao Zhou /* 358c948103STao Zhou * (addr / 256) * 8192, the higher 26 bits in ErrorAddr 368c948103STao Zhou * is the index of 8KB block 378c948103STao Zhou */ 388c948103STao Zhou #define ADDR_OF_8KB_BLOCK(addr) (((addr) & ~0xffULL) << 5) 398c948103STao Zhou /* channel index is the index of 256B block */ 408c948103STao Zhou #define ADDR_OF_256B_BLOCK(channel_index) ((channel_index) << 8) 418c948103STao Zhou /* offset in 256B block */ 428c948103STao Zhou #define OFFSET_IN_256B_BLOCK(addr) ((addr) & 0xffULL) 438c948103STao Zhou 443aacf4eaSTao Zhou const uint32_t 45c2742aefSTao Zhou umc_v6_1_channel_idx_tbl[UMC_V6_1_UMC_INSTANCE_NUM][UMC_V6_1_CHANNEL_INSTANCE_NUM] = { 46c2742aefSTao Zhou {2, 18, 11, 27}, {4, 20, 13, 29}, 47c2742aefSTao Zhou {1, 17, 8, 24}, {7, 23, 14, 30}, 48c2742aefSTao Zhou {10, 26, 3, 19}, {12, 28, 5, 21}, 49c2742aefSTao Zhou {9, 25, 0, 16}, {15, 31, 6, 22} 50c2742aefSTao Zhou }; 51c2742aefSTao Zhou 529884c2b1SHawking Zhang static void umc_v6_1_enable_umc_index_mode(struct amdgpu_device *adev, 539884c2b1SHawking Zhang uint32_t umc_instance) 549884c2b1SHawking Zhang { 559884c2b1SHawking Zhang uint32_t rsmu_umc_index; 569884c2b1SHawking Zhang 579884c2b1SHawking Zhang rsmu_umc_index = RREG32_SOC15(RSMU, 0, 589884c2b1SHawking Zhang mmRSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU); 599884c2b1SHawking Zhang rsmu_umc_index = REG_SET_FIELD(rsmu_umc_index, 609884c2b1SHawking Zhang RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU, 619884c2b1SHawking Zhang RSMU_UMC_INDEX_MODE_EN, 1); 629884c2b1SHawking Zhang rsmu_umc_index = REG_SET_FIELD(rsmu_umc_index, 639884c2b1SHawking Zhang RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU, 649884c2b1SHawking Zhang RSMU_UMC_INDEX_INSTANCE, umc_instance); 659884c2b1SHawking Zhang rsmu_umc_index = REG_SET_FIELD(rsmu_umc_index, 669884c2b1SHawking Zhang RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU, 679884c2b1SHawking Zhang RSMU_UMC_INDEX_WREN, 1 << umc_instance); 689884c2b1SHawking Zhang WREG32_SOC15(RSMU, 0, mmRSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU, 699884c2b1SHawking Zhang rsmu_umc_index); 709884c2b1SHawking Zhang } 719884c2b1SHawking Zhang 729884c2b1SHawking Zhang static void umc_v6_1_disable_umc_index_mode(struct amdgpu_device *adev) 739884c2b1SHawking Zhang { 749884c2b1SHawking Zhang WREG32_FIELD15(RSMU, 0, RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU, 759884c2b1SHawking Zhang RSMU_UMC_INDEX_MODE_EN, 0); 769884c2b1SHawking Zhang } 779884c2b1SHawking Zhang 7887d2b92fSTao Zhou static uint32_t umc_v6_1_get_umc_inst(struct amdgpu_device *adev) 7987d2b92fSTao Zhou { 8087d2b92fSTao Zhou uint32_t rsmu_umc_index; 8187d2b92fSTao Zhou 8287d2b92fSTao Zhou rsmu_umc_index = RREG32_SOC15(RSMU, 0, 8387d2b92fSTao Zhou mmRSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU); 8487d2b92fSTao Zhou return REG_GET_FIELD(rsmu_umc_index, 8587d2b92fSTao Zhou RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU, 8687d2b92fSTao Zhou RSMU_UMC_INDEX_INSTANCE); 8787d2b92fSTao Zhou } 8887d2b92fSTao Zhou 899884c2b1SHawking Zhang static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev, 909884c2b1SHawking Zhang uint32_t umc_reg_offset, 919884c2b1SHawking Zhang unsigned long *error_count) 929884c2b1SHawking Zhang { 939884c2b1SHawking Zhang uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr; 949884c2b1SHawking Zhang uint32_t ecc_err_cnt, ecc_err_cnt_addr; 959884c2b1SHawking Zhang uint64_t mc_umc_status; 969884c2b1SHawking Zhang uint32_t mc_umc_status_addr; 979884c2b1SHawking Zhang 989884c2b1SHawking Zhang ecc_err_cnt_sel_addr = 999884c2b1SHawking Zhang SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel); 1009884c2b1SHawking Zhang ecc_err_cnt_addr = 1019884c2b1SHawking Zhang SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt); 1029884c2b1SHawking Zhang mc_umc_status_addr = 1039884c2b1SHawking Zhang SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0); 1049884c2b1SHawking Zhang 1059884c2b1SHawking Zhang /* select the lower chip and check the error count */ 1069884c2b1SHawking Zhang ecc_err_cnt_sel = RREG32(ecc_err_cnt_sel_addr + umc_reg_offset); 1079884c2b1SHawking Zhang ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel, 1089884c2b1SHawking Zhang EccErrCntCsSel, 0); 1099884c2b1SHawking Zhang WREG32(ecc_err_cnt_sel_addr + umc_reg_offset, ecc_err_cnt_sel); 1109884c2b1SHawking Zhang ecc_err_cnt = RREG32(ecc_err_cnt_addr + umc_reg_offset); 1119884c2b1SHawking Zhang *error_count += 112b1a58953STao Zhou (REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) - 113b1a58953STao Zhou UMC_V6_1_CE_CNT_INIT); 1149884c2b1SHawking Zhang /* clear the lower chip err count */ 115b1a58953STao Zhou WREG32(ecc_err_cnt_addr + umc_reg_offset, UMC_V6_1_CE_CNT_INIT); 1169884c2b1SHawking Zhang 1179884c2b1SHawking Zhang /* select the higher chip and check the err counter */ 1189884c2b1SHawking Zhang ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel, 1199884c2b1SHawking Zhang EccErrCntCsSel, 1); 1209884c2b1SHawking Zhang WREG32(ecc_err_cnt_sel_addr + umc_reg_offset, ecc_err_cnt_sel); 1219884c2b1SHawking Zhang ecc_err_cnt = RREG32(ecc_err_cnt_addr + umc_reg_offset); 1229884c2b1SHawking Zhang *error_count += 123b1a58953STao Zhou (REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) - 124b1a58953STao Zhou UMC_V6_1_CE_CNT_INIT); 1259884c2b1SHawking Zhang /* clear the higher chip err count */ 126b1a58953STao Zhou WREG32(ecc_err_cnt_addr + umc_reg_offset, UMC_V6_1_CE_CNT_INIT); 1279884c2b1SHawking Zhang 1289884c2b1SHawking Zhang /* check for SRAM correctable error 1299884c2b1SHawking Zhang MCUMC_STATUS is a 64 bit register */ 130dd21a572STao Zhou mc_umc_status = RREG64_UMC(mc_umc_status_addr + umc_reg_offset); 1319884c2b1SHawking Zhang if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, ErrorCodeExt) == 6 && 1329884c2b1SHawking Zhang REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 && 1339884c2b1SHawking Zhang REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1) 1349884c2b1SHawking Zhang *error_count += 1; 1359884c2b1SHawking Zhang } 1369884c2b1SHawking Zhang 1379884c2b1SHawking Zhang static void umc_v6_1_querry_uncorrectable_error_count(struct amdgpu_device *adev, 1389884c2b1SHawking Zhang uint32_t umc_reg_offset, 1399884c2b1SHawking Zhang unsigned long *error_count) 1409884c2b1SHawking Zhang { 1419884c2b1SHawking Zhang uint64_t mc_umc_status; 1429884c2b1SHawking Zhang uint32_t mc_umc_status_addr; 1439884c2b1SHawking Zhang 1449884c2b1SHawking Zhang mc_umc_status_addr = 1459884c2b1SHawking Zhang SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0); 1469884c2b1SHawking Zhang 1479884c2b1SHawking Zhang /* check the MCUMC_STATUS */ 148dd21a572STao Zhou mc_umc_status = RREG64_UMC(mc_umc_status_addr + umc_reg_offset); 149f1ed4afaSTao Zhou if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) && 150f1ed4afaSTao Zhou (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 || 151f1ed4afaSTao Zhou REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 || 1529884c2b1SHawking Zhang REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 || 1539884c2b1SHawking Zhang REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 || 1549884c2b1SHawking Zhang REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1)) 1559884c2b1SHawking Zhang *error_count += 1; 1569884c2b1SHawking Zhang } 1579884c2b1SHawking Zhang 1582b671b60STao Zhou static void umc_v6_1_query_error_count(struct amdgpu_device *adev, 1592b671b60STao Zhou struct ras_err_data *err_data, uint32_t umc_reg_offset, 1602b671b60STao Zhou uint32_t channel_index) 1619884c2b1SHawking Zhang { 1629884c2b1SHawking Zhang umc_v6_1_query_correctable_error_count(adev, umc_reg_offset, 1639884c2b1SHawking Zhang &(err_data->ce_count)); 1649884c2b1SHawking Zhang umc_v6_1_querry_uncorrectable_error_count(adev, umc_reg_offset, 1659884c2b1SHawking Zhang &(err_data->ue_count)); 1669884c2b1SHawking Zhang } 1672b671b60STao Zhou 1682b671b60STao Zhou static void umc_v6_1_query_ras_error_count(struct amdgpu_device *adev, 1692b671b60STao Zhou void *ras_error_status) 1702b671b60STao Zhou { 1712b671b60STao Zhou amdgpu_umc_for_each_channel(umc_v6_1_query_error_count); 1729884c2b1SHawking Zhang } 1739884c2b1SHawking Zhang 1748c948103STao Zhou static void umc_v6_1_query_error_address(struct amdgpu_device *adev, 1752b671b60STao Zhou struct ras_err_data *err_data, 1762b671b60STao Zhou uint32_t umc_reg_offset, uint32_t channel_index) 1778c948103STao Zhou { 1782b671b60STao Zhou uint32_t lsb, mc_umc_status_addr; 17987d2b92fSTao Zhou uint64_t mc_umc_status, err_addr, retired_page; 18087d2b92fSTao Zhou struct eeprom_table_record *err_rec; 1818c948103STao Zhou 1828c948103STao Zhou mc_umc_status_addr = 1838c948103STao Zhou SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0); 1842b671b60STao Zhou 1852b671b60STao Zhou /* skip error address process if -ENOMEM */ 1862b671b60STao Zhou if (!err_data->err_addr) { 1872b671b60STao Zhou /* clear umc status */ 188dd21a572STao Zhou WREG64_UMC(mc_umc_status_addr + umc_reg_offset, 0x0ULL); 1892b671b60STao Zhou return; 1902b671b60STao Zhou } 1912b671b60STao Zhou 19287d2b92fSTao Zhou err_rec = &err_data->err_addr[err_data->err_addr_cnt]; 193dd21a572STao Zhou mc_umc_status = RREG64_UMC(mc_umc_status_addr + umc_reg_offset); 1948c948103STao Zhou 1958c948103STao Zhou /* calculate error address if ue/ce error is detected */ 1968c948103STao Zhou if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 && 1978c948103STao Zhou (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 || 1988c948103STao Zhou REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) { 1998c948103STao Zhou err_addr = RREG64_PCIE(smnMCA_UMC0_MCUMC_ADDRT0 + umc_reg_offset * 4); 2008c948103STao Zhou 2018c948103STao Zhou /* the lowest lsb bits should be ignored */ 2028c948103STao Zhou lsb = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, LSB); 2038c948103STao Zhou err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); 2048c948103STao Zhou err_addr &= ~((0x1ULL << lsb) - 1); 2058c948103STao Zhou 2068c948103STao Zhou /* translate umc channel address to soc pa, 3 parts are included */ 20787d2b92fSTao Zhou retired_page = ADDR_OF_8KB_BLOCK(err_addr) | 2082b671b60STao Zhou ADDR_OF_256B_BLOCK(channel_index) | 2092b671b60STao Zhou OFFSET_IN_256B_BLOCK(err_addr); 2108c948103STao Zhou 21187d2b92fSTao Zhou /* we only save ue error information currently, ce is skipped */ 21287d2b92fSTao Zhou if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) 21387d2b92fSTao Zhou == 1) { 21487d2b92fSTao Zhou err_rec->address = err_addr; 21587d2b92fSTao Zhou /* page frame address is saved */ 21687d2b92fSTao Zhou err_rec->retired_page = retired_page >> PAGE_SHIFT; 21787d2b92fSTao Zhou err_rec->ts = (uint64_t)ktime_get_real_seconds(); 21887d2b92fSTao Zhou err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE; 21987d2b92fSTao Zhou err_rec->cu = 0; 22087d2b92fSTao Zhou err_rec->mem_channel = channel_index; 22187d2b92fSTao Zhou err_rec->mcumc_id = umc_v6_1_get_umc_inst(adev); 22287d2b92fSTao Zhou 2238c948103STao Zhou err_data->err_addr_cnt++; 2248c948103STao Zhou } 22587d2b92fSTao Zhou } 2262b671b60STao Zhou 2272b671b60STao Zhou /* clear umc status */ 228dd21a572STao Zhou WREG64_UMC(mc_umc_status_addr + umc_reg_offset, 0x0ULL); 2298c948103STao Zhou } 2308c948103STao Zhou 2318c948103STao Zhou static void umc_v6_1_query_ras_error_address(struct amdgpu_device *adev, 2328c948103STao Zhou void *ras_error_status) 2338c948103STao Zhou { 2342b671b60STao Zhou amdgpu_umc_for_each_channel(umc_v6_1_query_error_address); 2358c948103STao Zhou } 2368c948103STao Zhou 237b7f92097STao Zhou static void umc_v6_1_ras_init_per_channel(struct amdgpu_device *adev, 238b7f92097STao Zhou struct ras_err_data *err_data, 239b7f92097STao Zhou uint32_t umc_reg_offset, uint32_t channel_index) 240b7f92097STao Zhou { 241b7f92097STao Zhou uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr; 242b7f92097STao Zhou uint32_t ecc_err_cnt_addr; 243b7f92097STao Zhou 244b7f92097STao Zhou ecc_err_cnt_sel_addr = 245b7f92097STao Zhou SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel); 246b7f92097STao Zhou ecc_err_cnt_addr = 247b7f92097STao Zhou SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt); 248b7f92097STao Zhou 249b7f92097STao Zhou /* select the lower chip and check the error count */ 250b7f92097STao Zhou ecc_err_cnt_sel = RREG32(ecc_err_cnt_sel_addr + umc_reg_offset); 251b7f92097STao Zhou ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel, 252b7f92097STao Zhou EccErrCntCsSel, 0); 253b7f92097STao Zhou /* set ce error interrupt type to APIC based interrupt */ 254b7f92097STao Zhou ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel, 255b7f92097STao Zhou EccErrInt, 0x1); 256b7f92097STao Zhou WREG32(ecc_err_cnt_sel_addr + umc_reg_offset, ecc_err_cnt_sel); 257b7f92097STao Zhou /* set error count to initial value */ 258b7f92097STao Zhou WREG32(ecc_err_cnt_addr + umc_reg_offset, UMC_V6_1_CE_CNT_INIT); 259b7f92097STao Zhou 260b7f92097STao Zhou /* select the higher chip and check the err counter */ 261b7f92097STao Zhou ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel, 262b7f92097STao Zhou EccErrCntCsSel, 1); 263b7f92097STao Zhou WREG32(ecc_err_cnt_sel_addr + umc_reg_offset, ecc_err_cnt_sel); 264b7f92097STao Zhou WREG32(ecc_err_cnt_addr + umc_reg_offset, UMC_V6_1_CE_CNT_INIT); 265b7f92097STao Zhou } 266b7f92097STao Zhou 2673aacf4eaSTao Zhou static void umc_v6_1_ras_init(struct amdgpu_device *adev) 2683aacf4eaSTao Zhou { 269b7f92097STao Zhou void *ras_error_status = NULL; 2703aacf4eaSTao Zhou 271b7f92097STao Zhou amdgpu_umc_for_each_channel(umc_v6_1_ras_init_per_channel); 2723aacf4eaSTao Zhou } 2733aacf4eaSTao Zhou 2749884c2b1SHawking Zhang const struct amdgpu_umc_funcs umc_v6_1_funcs = { 2753aacf4eaSTao Zhou .ras_init = umc_v6_1_ras_init, 2769884c2b1SHawking Zhang .query_ras_error_count = umc_v6_1_query_ras_error_count, 2778c948103STao Zhou .query_ras_error_address = umc_v6_1_query_ras_error_address, 2783aacf4eaSTao Zhou .enable_umc_index_mode = umc_v6_1_enable_umc_index_mode, 2793aacf4eaSTao Zhou .disable_umc_index_mode = umc_v6_1_disable_umc_index_mode, 2809884c2b1SHawking Zhang }; 281