19884c2b1SHawking Zhang /* 29884c2b1SHawking Zhang * Copyright 2019 Advanced Micro Devices, Inc. 39884c2b1SHawking Zhang * 49884c2b1SHawking Zhang * Permission is hereby granted, free of charge, to any person obtaining a 59884c2b1SHawking Zhang * copy of this software and associated documentation files (the "Software"), 69884c2b1SHawking Zhang * to deal in the Software without restriction, including without limitation 79884c2b1SHawking Zhang * the rights to use, copy, modify, merge, publish, distribute, sublicense, 89884c2b1SHawking Zhang * and/or sell copies of the Software, and to permit persons to whom the 99884c2b1SHawking Zhang * Software is furnished to do so, subject to the following conditions: 109884c2b1SHawking Zhang * 119884c2b1SHawking Zhang * The above copyright notice and this permission notice shall be included in 129884c2b1SHawking Zhang * all copies or substantial portions of the Software. 139884c2b1SHawking Zhang * 149884c2b1SHawking Zhang * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 159884c2b1SHawking Zhang * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 169884c2b1SHawking Zhang * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 179884c2b1SHawking Zhang * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 189884c2b1SHawking Zhang * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 199884c2b1SHawking Zhang * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 209884c2b1SHawking Zhang * OTHER DEALINGS IN THE SOFTWARE. 219884c2b1SHawking Zhang * 229884c2b1SHawking Zhang */ 239884c2b1SHawking Zhang #include "umc_v6_1.h" 249884c2b1SHawking Zhang #include "amdgpu_ras.h" 259884c2b1SHawking Zhang #include "amdgpu.h" 269884c2b1SHawking Zhang 279884c2b1SHawking Zhang #include "rsmu/rsmu_0_0_2_offset.h" 289884c2b1SHawking Zhang #include "rsmu/rsmu_0_0_2_sh_mask.h" 299884c2b1SHawking Zhang #include "umc/umc_6_1_1_offset.h" 309884c2b1SHawking Zhang #include "umc/umc_6_1_1_sh_mask.h" 319884c2b1SHawking Zhang 32c2742aefSTao Zhou #define smnMCA_UMC0_MCUMC_ADDRT0 0x50f10 33c2742aefSTao Zhou 348c948103STao Zhou /* 358c948103STao Zhou * (addr / 256) * 8192, the higher 26 bits in ErrorAddr 368c948103STao Zhou * is the index of 8KB block 378c948103STao Zhou */ 388c948103STao Zhou #define ADDR_OF_8KB_BLOCK(addr) (((addr) & ~0xffULL) << 5) 398c948103STao Zhou /* channel index is the index of 256B block */ 408c948103STao Zhou #define ADDR_OF_256B_BLOCK(channel_index) ((channel_index) << 8) 418c948103STao Zhou /* offset in 256B block */ 428c948103STao Zhou #define OFFSET_IN_256B_BLOCK(addr) ((addr) & 0xffULL) 438c948103STao Zhou 44c2742aefSTao Zhou static uint32_t 45c2742aefSTao Zhou umc_v6_1_channel_idx_tbl[UMC_V6_1_UMC_INSTANCE_NUM][UMC_V6_1_CHANNEL_INSTANCE_NUM] = { 46c2742aefSTao Zhou {2, 18, 11, 27}, {4, 20, 13, 29}, 47c2742aefSTao Zhou {1, 17, 8, 24}, {7, 23, 14, 30}, 48c2742aefSTao Zhou {10, 26, 3, 19}, {12, 28, 5, 21}, 49c2742aefSTao Zhou {9, 25, 0, 16}, {15, 31, 6, 22} 50c2742aefSTao Zhou }; 51c2742aefSTao Zhou 529884c2b1SHawking Zhang static void umc_v6_1_enable_umc_index_mode(struct amdgpu_device *adev, 539884c2b1SHawking Zhang uint32_t umc_instance) 549884c2b1SHawking Zhang { 559884c2b1SHawking Zhang uint32_t rsmu_umc_index; 569884c2b1SHawking Zhang 579884c2b1SHawking Zhang rsmu_umc_index = RREG32_SOC15(RSMU, 0, 589884c2b1SHawking Zhang mmRSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU); 599884c2b1SHawking Zhang rsmu_umc_index = REG_SET_FIELD(rsmu_umc_index, 609884c2b1SHawking Zhang RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU, 619884c2b1SHawking Zhang RSMU_UMC_INDEX_MODE_EN, 1); 629884c2b1SHawking Zhang rsmu_umc_index = REG_SET_FIELD(rsmu_umc_index, 639884c2b1SHawking Zhang RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU, 649884c2b1SHawking Zhang RSMU_UMC_INDEX_INSTANCE, umc_instance); 659884c2b1SHawking Zhang rsmu_umc_index = REG_SET_FIELD(rsmu_umc_index, 669884c2b1SHawking Zhang RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU, 679884c2b1SHawking Zhang RSMU_UMC_INDEX_WREN, 1 << umc_instance); 689884c2b1SHawking Zhang WREG32_SOC15(RSMU, 0, mmRSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU, 699884c2b1SHawking Zhang rsmu_umc_index); 709884c2b1SHawking Zhang } 719884c2b1SHawking Zhang 729884c2b1SHawking Zhang static void umc_v6_1_disable_umc_index_mode(struct amdgpu_device *adev) 739884c2b1SHawking Zhang { 749884c2b1SHawking Zhang WREG32_FIELD15(RSMU, 0, RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU, 759884c2b1SHawking Zhang RSMU_UMC_INDEX_MODE_EN, 0); 769884c2b1SHawking Zhang } 779884c2b1SHawking Zhang 789884c2b1SHawking Zhang static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev, 799884c2b1SHawking Zhang uint32_t umc_reg_offset, 809884c2b1SHawking Zhang unsigned long *error_count) 819884c2b1SHawking Zhang { 829884c2b1SHawking Zhang uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr; 839884c2b1SHawking Zhang uint32_t ecc_err_cnt, ecc_err_cnt_addr; 849884c2b1SHawking Zhang uint64_t mc_umc_status; 859884c2b1SHawking Zhang uint32_t mc_umc_status_addr; 869884c2b1SHawking Zhang 879884c2b1SHawking Zhang ecc_err_cnt_sel_addr = 889884c2b1SHawking Zhang SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel); 899884c2b1SHawking Zhang ecc_err_cnt_addr = 909884c2b1SHawking Zhang SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt); 919884c2b1SHawking Zhang mc_umc_status_addr = 929884c2b1SHawking Zhang SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0); 939884c2b1SHawking Zhang 949884c2b1SHawking Zhang /* select the lower chip and check the error count */ 959884c2b1SHawking Zhang ecc_err_cnt_sel = RREG32(ecc_err_cnt_sel_addr + umc_reg_offset); 969884c2b1SHawking Zhang ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel, 979884c2b1SHawking Zhang EccErrCntCsSel, 0); 989884c2b1SHawking Zhang WREG32(ecc_err_cnt_sel_addr + umc_reg_offset, ecc_err_cnt_sel); 999884c2b1SHawking Zhang ecc_err_cnt = RREG32(ecc_err_cnt_addr + umc_reg_offset); 1009884c2b1SHawking Zhang *error_count += 1019884c2b1SHawking Zhang REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt); 1029884c2b1SHawking Zhang /* clear the lower chip err count */ 1039884c2b1SHawking Zhang WREG32(ecc_err_cnt_addr + umc_reg_offset, 0); 1049884c2b1SHawking Zhang 1059884c2b1SHawking Zhang /* select the higher chip and check the err counter */ 1069884c2b1SHawking Zhang ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel, 1079884c2b1SHawking Zhang EccErrCntCsSel, 1); 1089884c2b1SHawking Zhang WREG32(ecc_err_cnt_sel_addr + umc_reg_offset, ecc_err_cnt_sel); 1099884c2b1SHawking Zhang ecc_err_cnt = RREG32(ecc_err_cnt_addr + umc_reg_offset); 1109884c2b1SHawking Zhang *error_count += 1119884c2b1SHawking Zhang REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt); 1129884c2b1SHawking Zhang /* clear the higher chip err count */ 1139884c2b1SHawking Zhang WREG32(ecc_err_cnt_addr + umc_reg_offset, 0); 1149884c2b1SHawking Zhang 1159884c2b1SHawking Zhang /* check for SRAM correctable error 1169884c2b1SHawking Zhang MCUMC_STATUS is a 64 bit register */ 1175bbfb64aSTao Zhou mc_umc_status = RREG64(mc_umc_status_addr + umc_reg_offset); 1189884c2b1SHawking Zhang if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, ErrorCodeExt) == 6 && 1199884c2b1SHawking Zhang REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 && 1209884c2b1SHawking Zhang REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1) 1219884c2b1SHawking Zhang *error_count += 1; 1229884c2b1SHawking Zhang } 1239884c2b1SHawking Zhang 1249884c2b1SHawking Zhang static void umc_v6_1_querry_uncorrectable_error_count(struct amdgpu_device *adev, 1259884c2b1SHawking Zhang uint32_t umc_reg_offset, 1269884c2b1SHawking Zhang unsigned long *error_count) 1279884c2b1SHawking Zhang { 1289884c2b1SHawking Zhang uint64_t mc_umc_status; 1299884c2b1SHawking Zhang uint32_t mc_umc_status_addr; 1309884c2b1SHawking Zhang 1319884c2b1SHawking Zhang mc_umc_status_addr = 1329884c2b1SHawking Zhang SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0); 1339884c2b1SHawking Zhang 1349884c2b1SHawking Zhang /* check the MCUMC_STATUS */ 1355bbfb64aSTao Zhou mc_umc_status = RREG64(mc_umc_status_addr + umc_reg_offset); 136f1ed4afaSTao Zhou if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) && 137f1ed4afaSTao Zhou (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 || 138f1ed4afaSTao Zhou REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 || 1399884c2b1SHawking Zhang REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 || 1409884c2b1SHawking Zhang REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 || 1419884c2b1SHawking Zhang REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1)) 1429884c2b1SHawking Zhang *error_count += 1; 1439884c2b1SHawking Zhang } 1449884c2b1SHawking Zhang 1459884c2b1SHawking Zhang static void umc_v6_1_query_ras_error_count(struct amdgpu_device *adev, 1469884c2b1SHawking Zhang void *ras_error_status) 1479884c2b1SHawking Zhang { 1489884c2b1SHawking Zhang struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; 1495bbfb64aSTao Zhou uint32_t umc_inst, channel_inst, umc_reg_offset, mc_umc_status_addr; 1505bbfb64aSTao Zhou 1515bbfb64aSTao Zhou mc_umc_status_addr = 1525bbfb64aSTao Zhou SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0); 1539884c2b1SHawking Zhang 1549884c2b1SHawking Zhang for (umc_inst = 0; umc_inst < UMC_V6_1_UMC_INSTANCE_NUM; umc_inst++) { 1559884c2b1SHawking Zhang /* enable the index mode to query eror count per channel */ 1569884c2b1SHawking Zhang umc_v6_1_enable_umc_index_mode(adev, umc_inst); 1579884c2b1SHawking Zhang for (channel_inst = 0; channel_inst < UMC_V6_1_CHANNEL_INSTANCE_NUM; channel_inst++) { 1589884c2b1SHawking Zhang /* calc the register offset according to channel instance */ 1599884c2b1SHawking Zhang umc_reg_offset = UMC_V6_1_PER_CHANNEL_OFFSET * channel_inst; 1609884c2b1SHawking Zhang umc_v6_1_query_correctable_error_count(adev, umc_reg_offset, 1619884c2b1SHawking Zhang &(err_data->ce_count)); 1629884c2b1SHawking Zhang umc_v6_1_querry_uncorrectable_error_count(adev, umc_reg_offset, 1639884c2b1SHawking Zhang &(err_data->ue_count)); 1645bbfb64aSTao Zhou /* clear umc status */ 1655bbfb64aSTao Zhou WREG64(mc_umc_status_addr + umc_reg_offset, 0x0ULL); 1669884c2b1SHawking Zhang } 1679884c2b1SHawking Zhang } 1689884c2b1SHawking Zhang umc_v6_1_disable_umc_index_mode(adev); 1699884c2b1SHawking Zhang } 1709884c2b1SHawking Zhang 1718c948103STao Zhou static void umc_v6_1_query_error_address(struct amdgpu_device *adev, 1728c948103STao Zhou uint32_t umc_reg_offset, uint32_t channel_index, 1738c948103STao Zhou struct ras_err_data *err_data) 1748c948103STao Zhou { 1758c948103STao Zhou uint32_t lsb; 1768c948103STao Zhou uint64_t mc_umc_status, err_addr; 1778c948103STao Zhou uint32_t mc_umc_status_addr; 1788c948103STao Zhou 1798c948103STao Zhou /* skip error address process if -ENOMEM */ 1808c948103STao Zhou if (!err_data->err_addr) 1818c948103STao Zhou return; 1828c948103STao Zhou 1838c948103STao Zhou mc_umc_status_addr = 1848c948103STao Zhou SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0); 1858c948103STao Zhou mc_umc_status = RREG64(mc_umc_status_addr + umc_reg_offset); 1868c948103STao Zhou 1878c948103STao Zhou /* calculate error address if ue/ce error is detected */ 1888c948103STao Zhou if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 && 1898c948103STao Zhou (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 || 1908c948103STao Zhou REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) { 1918c948103STao Zhou err_addr = RREG64_PCIE(smnMCA_UMC0_MCUMC_ADDRT0 + umc_reg_offset * 4); 1928c948103STao Zhou 1938c948103STao Zhou /* the lowest lsb bits should be ignored */ 1948c948103STao Zhou lsb = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, LSB); 1958c948103STao Zhou err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); 1968c948103STao Zhou err_addr &= ~((0x1ULL << lsb) - 1); 1978c948103STao Zhou 1988c948103STao Zhou /* translate umc channel address to soc pa, 3 parts are included */ 1998c948103STao Zhou err_data->err_addr[err_data->err_addr_cnt] = 2008c948103STao Zhou ADDR_OF_8KB_BLOCK(err_addr) 2018c948103STao Zhou | ADDR_OF_256B_BLOCK(channel_index) 2028c948103STao Zhou | OFFSET_IN_256B_BLOCK(err_addr); 2038c948103STao Zhou 2048c948103STao Zhou err_data->err_addr_cnt++; 2058c948103STao Zhou } 2068c948103STao Zhou } 2078c948103STao Zhou 2088c948103STao Zhou static void umc_v6_1_query_ras_error_address(struct amdgpu_device *adev, 2098c948103STao Zhou void *ras_error_status) 2108c948103STao Zhou { 2118c948103STao Zhou struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; 2128c948103STao Zhou uint32_t umc_inst, channel_inst, umc_reg_offset; 2138c948103STao Zhou uint32_t channel_index, mc_umc_status_addr; 2148c948103STao Zhou 2158c948103STao Zhou mc_umc_status_addr = 2168c948103STao Zhou SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0); 2178c948103STao Zhou 2188c948103STao Zhou for (umc_inst = 0; umc_inst < UMC_V6_1_UMC_INSTANCE_NUM; umc_inst++) { 2198c948103STao Zhou /* enable the index mode to query eror count per channel */ 2208c948103STao Zhou umc_v6_1_enable_umc_index_mode(adev, umc_inst); 2218c948103STao Zhou for (channel_inst = 0; channel_inst < UMC_V6_1_CHANNEL_INSTANCE_NUM; channel_inst++) { 2228c948103STao Zhou /* calc the register offset according to channel instance */ 2238c948103STao Zhou umc_reg_offset = UMC_V6_1_PER_CHANNEL_OFFSET * channel_inst; 2248c948103STao Zhou /* get channel index of interleaved memory */ 2258c948103STao Zhou channel_index = umc_v6_1_channel_idx_tbl[umc_inst][channel_inst]; 2268c948103STao Zhou 2278c948103STao Zhou umc_v6_1_query_error_address(adev, umc_reg_offset, 2288c948103STao Zhou channel_index, err_data); 2298c948103STao Zhou 2308c948103STao Zhou /* clear umc status */ 2318c948103STao Zhou WREG64(mc_umc_status_addr + umc_reg_offset, 0x0ULL); 2328c948103STao Zhou /* clear error address register */ 2338c948103STao Zhou WREG64_PCIE(smnMCA_UMC0_MCUMC_ADDRT0 + umc_reg_offset * 4, 0x0ULL); 2348c948103STao Zhou } 2358c948103STao Zhou } 2368c948103STao Zhou 2378c948103STao Zhou umc_v6_1_disable_umc_index_mode(adev); 2388c948103STao Zhou } 2398c948103STao Zhou 2409884c2b1SHawking Zhang const struct amdgpu_umc_funcs umc_v6_1_funcs = { 2419884c2b1SHawking Zhang .query_ras_error_count = umc_v6_1_query_ras_error_count, 2428c948103STao Zhou .query_ras_error_address = umc_v6_1_query_ras_error_address, 2439884c2b1SHawking Zhang }; 244