11696bf35SHawking Zhang /*
21696bf35SHawking Zhang * Copyright 2021 Advanced Micro Devices, Inc.
31696bf35SHawking Zhang *
41696bf35SHawking Zhang * Permission is hereby granted, free of charge, to any person obtaining a
51696bf35SHawking Zhang * copy of this software and associated documentation files (the "Software"),
61696bf35SHawking Zhang * to deal in the Software without restriction, including without limitation
71696bf35SHawking Zhang * the rights to use, copy, modify, merge, publish, distribute, sublicense,
81696bf35SHawking Zhang * and/or sell copies of the Software, and to permit persons to whom the
91696bf35SHawking Zhang * Software is furnished to do so, subject to the following conditions:
101696bf35SHawking Zhang *
111696bf35SHawking Zhang * The above copyright notice and this permission notice shall be included in
121696bf35SHawking Zhang * all copies or substantial portions of the Software.
131696bf35SHawking Zhang *
141696bf35SHawking Zhang * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
151696bf35SHawking Zhang * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
161696bf35SHawking Zhang * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
171696bf35SHawking Zhang * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
181696bf35SHawking Zhang * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
191696bf35SHawking Zhang * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
201696bf35SHawking Zhang * OTHER DEALINGS IN THE SOFTWARE.
211696bf35SHawking Zhang *
221696bf35SHawking Zhang */
231696bf35SHawking Zhang #include "umc_v6_7.h"
241696bf35SHawking Zhang #include "amdgpu_ras.h"
2549070c4eSHawking Zhang #include "amdgpu_umc.h"
261696bf35SHawking Zhang #include "amdgpu.h"
271696bf35SHawking Zhang
283f903560SHawking Zhang #include "umc/umc_6_7_0_offset.h"
293f903560SHawking Zhang #include "umc/umc_6_7_0_sh_mask.h"
303f903560SHawking Zhang
/*
 * Logical-to-physical channel index remap table. Each row covers one UMC
 * instance; each entry is the physical channel index used when composing
 * soc physical addresses (see umc_v6_7_convert_error_address()).
 * NOTE(review): "second" vs "first" presumably selects between two
 * board/harvest configurations -- confirm against the code that installs
 * adev->umc.channel_idx_tbl.
 */
const uint32_t
	umc_v6_7_channel_idx_tbl_second[UMC_V6_7_UMC_INSTANCE_NUM][UMC_V6_7_CHANNEL_INSTANCE_NUM] = {
		{28, 20, 24, 16, 12, 4, 8, 0},
		{6, 30, 2, 26, 22, 14, 18, 10},
		{19, 11, 15, 7, 3, 27, 31, 23},
		{9, 1, 5, 29, 25, 17, 21, 13}
};
/*
 * Alternate channel index remap table: the same four rows as
 * umc_v6_7_channel_idx_tbl_second, rotated by two UMC instances.
 * NOTE(review): which table applies is decided elsewhere -- confirm
 * against the code that installs adev->umc.channel_idx_tbl.
 */
const uint32_t
	umc_v6_7_channel_idx_tbl_first[UMC_V6_7_UMC_INSTANCE_NUM][UMC_V6_7_CHANNEL_INSTANCE_NUM] = {
		{19, 11, 15, 7, 3, 27, 31, 23},
		{9, 1, 5, 29, 25, 17, 21, 13},
		{28, 20, 24, 16, 12, 4, 8, 0},
		{6, 30, 2, 26, 22, 14, 18, 10},
};
45186c8a85SJohn Clements
/*
 * Compute the register offset for one (umc_inst, ch_inst) pair.
 *
 * The register address space is not linear per UMC instance: the pair is
 * first flattened into a global channel index and then re-split into
 * groups of four channels before the per-channel and per-instance
 * strides are applied.
 */
static inline uint32_t get_umc_v6_7_reg_offset(struct amdgpu_device *adev,
					       uint32_t umc_inst,
					       uint32_t ch_inst)
{
	uint32_t global_ch = umc_inst * adev->umc.channel_inst_num + ch_inst;

	return adev->umc.channel_offs * (global_ch % 4) +
	       UMC_V6_7_INST_DIST * (global_ch / 4);
}
59878b9e94SHawking Zhang
/*
 * Log the MCA diagnostic registers (IPID/SYND/MISC0) for one UMC channel
 * alongside the caller-supplied STATUS value. Registers that read back
 * zero are skipped to keep the kernel log clean.
 */
static void umc_v6_7_query_error_status_helper(struct amdgpu_device *adev,
					  uint64_t mc_umc_status, uint32_t umc_reg_offset)
{
	uint32_t mc_umc_addr;
	uint64_t reg_value;

	/* deferred errors need no user action (see message below) */
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1)
		dev_info(adev->dev, "Deferred error, no user action is needed.\n");

	if (mc_umc_status)
		dev_info(adev->dev, "MCA STATUS 0x%llx, umc_reg_offset 0x%x\n", mc_umc_status, umc_reg_offset);

	/* print IPID registers value */
	mc_umc_addr =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_IPIDT0);
	reg_value = RREG64_PCIE((mc_umc_addr + umc_reg_offset) * 4);
	if (reg_value)
		dev_info(adev->dev, "MCA IPID 0x%llx, umc_reg_offset 0x%x\n", reg_value, umc_reg_offset);

	/* print SYND registers value */
	mc_umc_addr =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_SYNDT0);
	reg_value = RREG64_PCIE((mc_umc_addr + umc_reg_offset) * 4);
	if (reg_value)
		dev_info(adev->dev, "MCA SYND 0x%llx, umc_reg_offset 0x%x\n", reg_value, umc_reg_offset);

	/* print MISC0 registers value */
	mc_umc_addr =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_MISC0T0);
	reg_value = RREG64_PCIE((mc_umc_addr + umc_reg_offset) * 4);
	if (reg_value)
		dev_info(adev->dev, "MCA MISC0 0x%llx, umc_reg_offset 0x%x\n", reg_value, umc_reg_offset);
}
9305eee31cSStanley.Yang
/*
 * Count SRAM correctable errors for one UMC channel using the ECC status
 * cached in the RAS context (ras->umc_ecc), i.e. without touching the
 * hardware registers directly. Increments *error_count on a valid CECC
 * and, when supported, also logs the retired soc physical address.
 */
static void umc_v6_7_ecc_info_query_correctable_error_count(struct amdgpu_device *adev,
						   uint32_t umc_inst, uint32_t ch_inst,
						   unsigned long *error_count)
{
	uint64_t mc_umc_status;
	uint32_t eccinfo_table_idx;
	uint32_t umc_reg_offset;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	/* offset only used for diagnostic logging in the helper below */
	umc_reg_offset = get_umc_v6_7_reg_offset(adev,
						umc_inst, ch_inst);

	eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;
	/* check for SRAM correctable error
	  MCUMC_STATUS is a 64 bit register */
	mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1) {
		*error_count += 1;

		umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset);

		if (ras->umc_ecc.record_ce_addr_supported) {
			uint64_t err_addr, soc_pa;
			uint32_t channel_index =
				adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];

			err_addr = ras->umc_ecc.ecc[eccinfo_table_idx].mca_ceumc_addr;
			err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
			/* translate umc channel address to soc pa, 3 parts are included */
			soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
					ADDR_OF_256B_BLOCK(channel_index) |
					OFFSET_IN_256B_BLOCK(err_addr);

			/* The umc channel bits are not original values, they are hashed */
			SET_CHANNEL_HASH(channel_index, soc_pa);

			dev_info(adev->dev, "Error Address(PA): 0x%llx\n", soc_pa);
		}
	}
}
13505eee31cSStanley.Yang
/*
 * Count uncorrectable errors for one UMC channel from the ECC status
 * cached in the RAS context. Any of Deferred/UECC/PCC/UC/TCC set on a
 * valid status counts as one uncorrectable error.
 */
static void umc_v6_7_ecc_info_querry_uncorrectable_error_count(struct amdgpu_device *adev,
						      uint32_t umc_inst, uint32_t ch_inst,
						      unsigned long *error_count)
{
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	uint32_t table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;
	uint32_t reg_offset = get_umc_v6_7_reg_offset(adev, umc_inst, ch_inst);
	uint64_t status;

	/* check the MCUMC_STATUS */
	status = ras->umc_ecc.ecc[table_idx].mca_umc_status;
	if (REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    (REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
	     REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	     REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
	     REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
	     REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1)) {
		(*error_count)++;

		umc_v6_7_query_error_status_helper(adev, status, reg_offset);
	}
}
1628882f90aSStanley.Yang
/*
 * Per-channel callback for amdgpu_umc_loop_channels(): accumulate CE and
 * UE counts for this channel into the ras_err_data passed via @data.
 * Always returns 0 so iteration continues over all channels.
 */
static int umc_v6_7_ecc_info_querry_ecc_error_count(struct amdgpu_device *adev,
					uint32_t node_inst, uint32_t umc_inst,
					uint32_t ch_inst, void *data)
{
	struct ras_err_data *err_data = data;

	umc_v6_7_ecc_info_query_correctable_error_count(adev, umc_inst, ch_inst,
							&err_data->ce_count);
	umc_v6_7_ecc_info_querry_uncorrectable_error_count(adev, umc_inst, ch_inst,
							   &err_data->ue_count);

	return 0;
}
179*fc926faeSYiPeng Chai
umc_v6_7_ecc_info_query_ras_error_count(struct amdgpu_device * adev,void * ras_error_status)180*fc926faeSYiPeng Chai static void umc_v6_7_ecc_info_query_ras_error_count(struct amdgpu_device *adev,
181*fc926faeSYiPeng Chai void *ras_error_status)
182*fc926faeSYiPeng Chai {
183*fc926faeSYiPeng Chai amdgpu_umc_loop_channels(adev,
184*fc926faeSYiPeng Chai umc_v6_7_ecc_info_querry_ecc_error_count, ras_error_status);
1858882f90aSStanley.Yang }
1868882f90aSStanley.Yang
/*
 * Translate a UMC-normalized error address into retired soc physical
 * pages and append them to @err_data.
 *
 * Bits [C4 C3 C2] and R14 of the resulting physical address cannot be
 * recovered uniquely from the normalized address, so every combination
 * (8 column values x 2 R14 values = 16 pages) is recorded.
 */
void umc_v6_7_convert_error_address(struct amdgpu_device *adev,
				    struct ras_err_data *err_data, uint64_t err_addr,
				    uint32_t ch_inst, uint32_t umc_inst)
{
	uint32_t channel_index;
	uint64_t soc_pa, retired_page, column;

	channel_index =
		adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];
	/* translate umc channel address to soc pa, 3 parts are included */
	soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
			ADDR_OF_256B_BLOCK(channel_index) |
			OFFSET_IN_256B_BLOCK(err_addr);

	/* The umc channel bits are not original values, they are hashed */
	SET_CHANNEL_HASH(channel_index, soc_pa);

	/* clear [C4 C3 C2] in soc physical address */
	soc_pa &= ~(0x7ULL << UMC_V6_7_PA_C2_BIT);

	/* loop for all possibilities of [C4 C3 C2] */
	for (column = 0; column < UMC_V6_7_NA_MAP_PA_NUM; column++) {
		retired_page = soc_pa | (column << UMC_V6_7_PA_C2_BIT);
		dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page);
		amdgpu_umc_fill_error_record(err_data, err_addr,
				retired_page, channel_index, umc_inst);

		/* shift R14 bit */
		retired_page ^= (0x1ULL << UMC_V6_7_PA_R14_BIT);
		dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page);
		amdgpu_umc_fill_error_record(err_data, err_addr,
				retired_page, channel_index, umc_inst);
	}
}
22144420ac5STao Zhou
/*
 * Per-channel callback for the ECC-info path: if an uncorrectable error
 * is present in the cached MCA status, convert its normalized address to
 * retired soc physical pages in @data (a struct ras_err_data *).
 * Always returns 0 so channel iteration continues.
 */
static int umc_v6_7_ecc_info_query_error_address(struct amdgpu_device *adev,
					uint32_t node_inst, uint32_t umc_inst,
					uint32_t ch_inst, void *data)
{
	struct ras_err_data *err_data = data;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	uint32_t table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;
	uint64_t status = ras->umc_ecc.ecc[table_idx].mca_umc_status;
	uint64_t err_addr;

	/* nothing latched, or no output buffer to record into */
	if (!status || !err_data->err_addr)
		return 0;

	/* calculate error address if ue error is detected */
	if (REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1) {
		err_addr = ras->umc_ecc.ecc[table_idx].mca_umc_addr;
		err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);

		umc_v6_7_convert_error_address(adev, err_data, err_addr,
					       ch_inst, umc_inst);
	}

	return 0;
}
2538882f90aSStanley.Yang
umc_v6_7_ecc_info_query_ras_error_address(struct amdgpu_device * adev,void * ras_error_status)2548882f90aSStanley.Yang static void umc_v6_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev,
2558882f90aSStanley.Yang void *ras_error_status)
2568882f90aSStanley.Yang {
257*fc926faeSYiPeng Chai amdgpu_umc_loop_channels(adev,
258*fc926faeSYiPeng Chai umc_v6_7_ecc_info_query_error_address, ras_error_status);
2598882f90aSStanley.Yang }
2608882f90aSStanley.Yang
/*
 * Read the correctable-error count for one UMC channel directly from
 * hardware: both chip selects of the EccErrCnt counter, plus any SRAM
 * CECC latched in MCUMC_STATUS (which also gets its error address
 * translated and logged).
 *
 * Counters are pre-loaded with UMC_V6_7_CE_CNT_INIT, so that offset is
 * subtracted from each raw read before accumulating into @error_count.
 * Note the write to EccErrCntSel must precede the counter read: it
 * selects which chip's counter appears in EccErrCnt.
 */
static void umc_v6_7_query_correctable_error_count(struct amdgpu_device *adev,
						   uint32_t umc_reg_offset,
						   unsigned long *error_count,
						   uint32_t ch_inst,
						   uint32_t umc_inst)
{
	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
	uint32_t ecc_err_cnt, ecc_err_cnt_addr;
	uint64_t mc_umc_status;
	uint32_t mc_umc_status_addr;

	/* UMC 6_1_1 registers */
	ecc_err_cnt_sel_addr =
		SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccErrCntSel);
	ecc_err_cnt_addr =
		SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccErrCnt);
	mc_umc_status_addr =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);

	/* select the lower chip and check the error count */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 0);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);

	ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
	*error_count +=
		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
		 UMC_V6_7_CE_CNT_INIT);

	/* select the higher chip and check the err counter */
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);

	ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
	*error_count +=
		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
		 UMC_V6_7_CE_CNT_INIT);

	/* check for SRAM correctable error
	  MCUMC_STATUS is a 64 bit register */
	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1) {
		*error_count += 1;

		umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset);

		{
			uint64_t err_addr, soc_pa;
			uint32_t mc_umc_addrt0;
			uint32_t channel_index;

			mc_umc_addrt0 =
				SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0);

			channel_index =
				adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];

			err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4);
			err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);

			/* translate umc channel address to soc pa, 3 parts are included */
			soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
					ADDR_OF_256B_BLOCK(channel_index) |
					OFFSET_IN_256B_BLOCK(err_addr);

			/* The umc channel bits are not original values, they are hashed */
			SET_CHANNEL_HASH(channel_index, soc_pa);

			dev_info(adev->dev, "Error Address(PA): 0x%llx\n", soc_pa);
		}
	}
}
3363f903560SHawking Zhang
/*
 * Read MCUMC_STATUS for one UMC channel and count an uncorrectable
 * error when the status is valid and any of Deferred/UECC/PCC/UC/TCC
 * is set; the diagnostic registers are logged in that case.
 */
static void umc_v6_7_querry_uncorrectable_error_count(struct amdgpu_device *adev,
						      uint32_t umc_reg_offset,
						      unsigned long *error_count)
{
	uint32_t status_addr =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
	uint64_t status;

	/* check the MCUMC_STATUS */
	status = RREG64_PCIE((status_addr + umc_reg_offset) * 4);
	if (REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    (REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
	     REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	     REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
	     REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
	     REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1)) {
		(*error_count)++;

		umc_v6_7_query_error_status_helper(adev, status, umc_reg_offset);
	}
}
3603f903560SHawking Zhang
/*
 * Per-channel callback: rearm the correctable-error counter for both
 * chip selects of one UMC channel by writing UMC_V6_7_CE_CNT_INIT.
 * Each select write must precede its counter write, since EccErrCntSel
 * chooses which chip's counter EccErrCnt addresses.
 * Always returns 0 so channel iteration continues.
 */
static int umc_v6_7_reset_error_count_per_channel(struct amdgpu_device *adev,
					uint32_t node_inst, uint32_t umc_inst,
					uint32_t ch_inst, void *data)
{
	uint32_t ecc_err_cnt_addr;
	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
	uint32_t umc_reg_offset =
		get_umc_v6_7_reg_offset(adev, umc_inst, ch_inst);

	ecc_err_cnt_sel_addr =
		SOC15_REG_OFFSET(UMC, 0,
				regUMCCH0_0_EccErrCntSel);
	ecc_err_cnt_addr =
		SOC15_REG_OFFSET(UMC, 0,
				regUMCCH0_0_EccErrCnt);

	/* select the lower chip */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
					umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
					UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 0);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
			ecc_err_cnt_sel);

	/* clear lower chip error count */
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
			UMC_V6_7_CE_CNT_INIT);

	/* select the higher chip */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
					umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
					UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
			ecc_err_cnt_sel);

	/* clear higher chip error count */
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
			UMC_V6_7_CE_CNT_INIT);

	return 0;
}
405878b9e94SHawking Zhang
umc_v6_7_reset_error_count(struct amdgpu_device * adev)406878b9e94SHawking Zhang static void umc_v6_7_reset_error_count(struct amdgpu_device *adev)
407878b9e94SHawking Zhang {
408*fc926faeSYiPeng Chai amdgpu_umc_loop_channels(adev,
409*fc926faeSYiPeng Chai umc_v6_7_reset_error_count_per_channel, NULL);
410878b9e94SHawking Zhang }
411*fc926faeSYiPeng Chai
/*
 * Per-channel callback for the hardware-register path: accumulate CE and
 * UE counts for this channel into the ras_err_data passed via @data.
 * Always returns 0 so channel iteration continues.
 */
static int umc_v6_7_query_ecc_error_count(struct amdgpu_device *adev,
					uint32_t node_inst, uint32_t umc_inst,
					uint32_t ch_inst, void *data)
{
	struct ras_err_data *err_data = data;
	uint32_t reg_offset = get_umc_v6_7_reg_offset(adev, umc_inst, ch_inst);

	umc_v6_7_query_correctable_error_count(adev, reg_offset,
					       &err_data->ce_count,
					       ch_inst, umc_inst);
	umc_v6_7_querry_uncorrectable_error_count(adev, reg_offset,
						  &err_data->ue_count);

	return 0;
}
431878b9e94SHawking Zhang
umc_v6_7_query_ras_error_count(struct amdgpu_device * adev,void * ras_error_status)432878b9e94SHawking Zhang static void umc_v6_7_query_ras_error_count(struct amdgpu_device *adev,
433878b9e94SHawking Zhang void *ras_error_status)
434878b9e94SHawking Zhang {
435*fc926faeSYiPeng Chai amdgpu_umc_loop_channels(adev,
436*fc926faeSYiPeng Chai umc_v6_7_query_ecc_error_count, ras_error_status);
437878b9e94SHawking Zhang
438878b9e94SHawking Zhang umc_v6_7_reset_error_count(adev);
439878b9e94SHawking Zhang }
440878b9e94SHawking Zhang
/*
 * Per-channel callback for the hardware-register path: if an
 * uncorrectable error is latched in MCUMC_STATUS, read the normalized
 * error address, convert it to retired soc physical pages (recorded into
 * @data, a struct ras_err_data *), then clear the status register so the
 * next error can latch. Always returns 0 so channel iteration continues.
 */
static int umc_v6_7_query_error_address(struct amdgpu_device *adev,
					uint32_t node_inst, uint32_t umc_inst,
					uint32_t ch_inst, void *data)
{
	uint32_t mc_umc_status_addr;
	uint64_t mc_umc_status = 0, mc_umc_addrt0, err_addr;
	struct ras_err_data *err_data = (struct ras_err_data *)data;
	uint32_t umc_reg_offset =
		get_umc_v6_7_reg_offset(adev, umc_inst, ch_inst);

	mc_umc_status_addr =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
	mc_umc_addrt0 =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0);

	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);

	if (mc_umc_status == 0)
		return 0;

	if (!err_data->err_addr) {
		/* clear umc status */
		WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
		return 0;
	}

	/* calculate error address if ue error is detected */
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1) {
		err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4);
		err_addr =
			REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);

		umc_v6_7_convert_error_address(adev, err_data, err_addr,
					ch_inst, umc_inst);
	}

	/* clear umc status */
	WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);

	return 0;
}
48387da0cc1SHawking Zhang
umc_v6_7_query_ras_error_address(struct amdgpu_device * adev,void * ras_error_status)48487da0cc1SHawking Zhang static void umc_v6_7_query_ras_error_address(struct amdgpu_device *adev,
48587da0cc1SHawking Zhang void *ras_error_status)
48687da0cc1SHawking Zhang {
487*fc926faeSYiPeng Chai amdgpu_umc_loop_channels(adev,
488*fc926faeSYiPeng Chai umc_v6_7_query_error_address, ras_error_status);
48987da0cc1SHawking Zhang }
49087da0cc1SHawking Zhang
/* Read the UCFatalEn bit of EccCtrl for one channel (non-zero = fatal mode) */
static uint32_t umc_v6_7_query_ras_poison_mode_per_channel(
				struct amdgpu_device *adev,
				uint32_t umc_reg_offset)
{
	uint32_t ctrl_addr = SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccCtrl);
	uint32_t ctrl = RREG32_PCIE((ctrl_addr + umc_reg_offset) * 4);

	return REG_GET_FIELD(ctrl, UMCCH0_0_EccCtrl, UCFatalEn);
}
504aaca8c38STao Zhou
umc_v6_7_query_ras_poison_mode(struct amdgpu_device * adev)505aaca8c38STao Zhou static bool umc_v6_7_query_ras_poison_mode(struct amdgpu_device *adev)
506aaca8c38STao Zhou {
507aaca8c38STao Zhou uint32_t umc_reg_offset = 0;
508aaca8c38STao Zhou
50969f915ccSTao Zhou /* Enabling fatal error in umc instance0 channel0 will be
51069f915ccSTao Zhou * considered as fatal error mode
51169f915ccSTao Zhou */
51269f915ccSTao Zhou umc_reg_offset = get_umc_v6_7_reg_offset(adev, 0, 0);
51369f915ccSTao Zhou return !umc_v6_7_query_ras_poison_mode_per_channel(adev, umc_reg_offset);
514aaca8c38STao Zhou }
515aaca8c38STao Zhou
/* Hardware-register op callbacks exported to the RAS framework */
const struct amdgpu_ras_block_hw_ops umc_v6_7_ras_hw_ops = {
	.query_ras_error_count = umc_v6_7_query_ras_error_count,
	.query_ras_error_address = umc_v6_7_query_ras_error_address,
};
520efe17d5aSyipechai
/*
 * UMC 6.7 RAS descriptor: direct-register ops via hw_ops, plus the
 * ecc_info_* variants that consume the cached ECC status instead of
 * touching the hardware registers.
 */
struct amdgpu_umc_ras umc_v6_7_ras = {
	.ras_block = {
		.hw_ops = &umc_v6_7_ras_hw_ops,
	},
	.query_ras_poison_mode = umc_v6_7_query_ras_poison_mode,
	.ecc_info_query_ras_error_count = umc_v6_7_ecc_info_query_ras_error_count,
	.ecc_info_query_ras_error_address = umc_v6_7_ecc_info_query_ras_error_address,
};
529