xref: /openbmc/linux/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c (revision fc926fae)
/*
 * Copyright 2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
231696bf35SHawking Zhang #include "umc_v6_7.h"
241696bf35SHawking Zhang #include "amdgpu_ras.h"
2549070c4eSHawking Zhang #include "amdgpu_umc.h"
261696bf35SHawking Zhang #include "amdgpu.h"
271696bf35SHawking Zhang 
283f903560SHawking Zhang #include "umc/umc_6_7_0_offset.h"
293f903560SHawking Zhang #include "umc/umc_6_7_0_sh_mask.h"
303f903560SHawking Zhang 
31186c8a85SJohn Clements const uint32_t
32186c8a85SJohn Clements 	umc_v6_7_channel_idx_tbl_second[UMC_V6_7_UMC_INSTANCE_NUM][UMC_V6_7_CHANNEL_INSTANCE_NUM] = {
33719e433eSMukul Joshi 		{28, 20, 24, 16, 12, 4, 8, 0},
34719e433eSMukul Joshi 		{6, 30, 2, 26, 22, 14, 18, 10},
35719e433eSMukul Joshi 		{19, 11, 15, 7, 3, 27, 31, 23},
36719e433eSMukul Joshi 		{9, 1, 5, 29, 25, 17, 21, 13}
37186c8a85SJohn Clements };
38186c8a85SJohn Clements const uint32_t
39186c8a85SJohn Clements 	umc_v6_7_channel_idx_tbl_first[UMC_V6_7_UMC_INSTANCE_NUM][UMC_V6_7_CHANNEL_INSTANCE_NUM] = {
40719e433eSMukul Joshi 		{19, 11, 15, 7,	3, 27, 31, 23},
41719e433eSMukul Joshi 		{9, 1, 5, 29, 25, 17, 21, 13},
42719e433eSMukul Joshi 		{28, 20, 24, 16, 12, 4, 8, 0},
43719e433eSMukul Joshi 		{6, 30, 2, 26, 22, 14, 18, 10},
44186c8a85SJohn Clements };
45186c8a85SJohn Clements 
get_umc_v6_7_reg_offset(struct amdgpu_device * adev,uint32_t umc_inst,uint32_t ch_inst)46878b9e94SHawking Zhang static inline uint32_t get_umc_v6_7_reg_offset(struct amdgpu_device *adev,
47878b9e94SHawking Zhang 					      uint32_t umc_inst,
48878b9e94SHawking Zhang 					      uint32_t ch_inst)
49878b9e94SHawking Zhang {
501915a433SStanley.Yang 	uint32_t index = umc_inst * adev->umc.channel_inst_num + ch_inst;
511915a433SStanley.Yang 
521915a433SStanley.Yang 	/* adjust umc and channel index offset,
531915a433SStanley.Yang 	 * the register address is not linear on each umc instace */
541915a433SStanley.Yang 	umc_inst = index / 4;
551915a433SStanley.Yang 	ch_inst = index % 4;
561915a433SStanley.Yang 
57878b9e94SHawking Zhang 	return adev->umc.channel_offs * ch_inst + UMC_V6_7_INST_DIST * umc_inst;
58878b9e94SHawking Zhang }
59878b9e94SHawking Zhang 
umc_v6_7_query_error_status_helper(struct amdgpu_device * adev,uint64_t mc_umc_status,uint32_t umc_reg_offset)6005eee31cSStanley.Yang static void umc_v6_7_query_error_status_helper(struct amdgpu_device *adev,
6105eee31cSStanley.Yang 						  uint64_t mc_umc_status, uint32_t umc_reg_offset)
628882f90aSStanley.Yang {
631ec1944eSStanley.Yang 	uint32_t mc_umc_addr;
641ec1944eSStanley.Yang 	uint64_t reg_value;
651ec1944eSStanley.Yang 
661ec1944eSStanley.Yang 	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1)
671ec1944eSStanley.Yang 		dev_info(adev->dev, "Deferred error, no user action is needed.\n");
681ec1944eSStanley.Yang 
691ec1944eSStanley.Yang 	if (mc_umc_status)
701ec1944eSStanley.Yang 		dev_info(adev->dev, "MCA STATUS 0x%llx, umc_reg_offset 0x%x\n", mc_umc_status, umc_reg_offset);
711ec1944eSStanley.Yang 
721ec1944eSStanley.Yang 	/* print IPID registers value */
731ec1944eSStanley.Yang 	mc_umc_addr =
741ec1944eSStanley.Yang 		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_IPIDT0);
751ec1944eSStanley.Yang 	reg_value = RREG64_PCIE((mc_umc_addr + umc_reg_offset) * 4);
761ec1944eSStanley.Yang 	if (reg_value)
771ec1944eSStanley.Yang 		dev_info(adev->dev, "MCA IPID 0x%llx, umc_reg_offset 0x%x\n", reg_value, umc_reg_offset);
781ec1944eSStanley.Yang 
791ec1944eSStanley.Yang 	/* print SYND registers value */
801ec1944eSStanley.Yang 	mc_umc_addr =
811ec1944eSStanley.Yang 		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_SYNDT0);
821ec1944eSStanley.Yang 	reg_value = RREG64_PCIE((mc_umc_addr + umc_reg_offset) * 4);
831ec1944eSStanley.Yang 	if (reg_value)
841ec1944eSStanley.Yang 		dev_info(adev->dev, "MCA SYND 0x%llx, umc_reg_offset 0x%x\n", reg_value, umc_reg_offset);
851ec1944eSStanley.Yang 
861ec1944eSStanley.Yang 	/* print MISC0 registers value */
871ec1944eSStanley.Yang 	mc_umc_addr =
881ec1944eSStanley.Yang 		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_MISC0T0);
891ec1944eSStanley.Yang 	reg_value = RREG64_PCIE((mc_umc_addr + umc_reg_offset) * 4);
901ec1944eSStanley.Yang 	if (reg_value)
911ec1944eSStanley.Yang 		dev_info(adev->dev, "MCA MISC0 0x%llx, umc_reg_offset 0x%x\n", reg_value, umc_reg_offset);
921ec1944eSStanley.Yang }
9305eee31cSStanley.Yang 
umc_v6_7_ecc_info_query_correctable_error_count(struct amdgpu_device * adev,uint32_t umc_inst,uint32_t ch_inst,unsigned long * error_count)9405eee31cSStanley.Yang static void umc_v6_7_ecc_info_query_correctable_error_count(struct amdgpu_device *adev,
9505eee31cSStanley.Yang 						   uint32_t umc_inst, uint32_t ch_inst,
9605eee31cSStanley.Yang 						   unsigned long *error_count)
9705eee31cSStanley.Yang {
9805eee31cSStanley.Yang 	uint64_t mc_umc_status;
9905eee31cSStanley.Yang 	uint32_t eccinfo_table_idx;
10005eee31cSStanley.Yang 	uint32_t umc_reg_offset;
10105eee31cSStanley.Yang 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
10205eee31cSStanley.Yang 
10305eee31cSStanley.Yang 	umc_reg_offset = get_umc_v6_7_reg_offset(adev,
10405eee31cSStanley.Yang 						umc_inst, ch_inst);
10505eee31cSStanley.Yang 
10605eee31cSStanley.Yang 	eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;
10705eee31cSStanley.Yang 	/* check for SRAM correctable error
10805eee31cSStanley.Yang 	  MCUMC_STATUS is a 64 bit register */
10905eee31cSStanley.Yang 	mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
11005eee31cSStanley.Yang 	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
11105eee31cSStanley.Yang 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1) {
11205eee31cSStanley.Yang 		*error_count += 1;
11305eee31cSStanley.Yang 
11405eee31cSStanley.Yang 		umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset);
115cbd3e844SStanley.Yang 
116cbd3e844SStanley.Yang 		if (ras->umc_ecc.record_ce_addr_supported)	{
117cbd3e844SStanley.Yang 			uint64_t err_addr, soc_pa;
118cbd3e844SStanley.Yang 			uint32_t channel_index =
119cbd3e844SStanley.Yang 				adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];
120cbd3e844SStanley.Yang 
121cbd3e844SStanley.Yang 			err_addr = ras->umc_ecc.ecc[eccinfo_table_idx].mca_ceumc_addr;
122cbd3e844SStanley.Yang 			err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
123cbd3e844SStanley.Yang 			/* translate umc channel address to soc pa, 3 parts are included */
124cbd3e844SStanley.Yang 			soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
125cbd3e844SStanley.Yang 					ADDR_OF_256B_BLOCK(channel_index) |
126cbd3e844SStanley.Yang 					OFFSET_IN_256B_BLOCK(err_addr);
127cbd3e844SStanley.Yang 
128cbd3e844SStanley.Yang 			/* The umc channel bits are not original values, they are hashed */
129cbd3e844SStanley.Yang 			SET_CHANNEL_HASH(channel_index, soc_pa);
130cbd3e844SStanley.Yang 
131cbd3e844SStanley.Yang 			dev_info(adev->dev, "Error Address(PA): 0x%llx\n", soc_pa);
132cbd3e844SStanley.Yang 		}
13305eee31cSStanley.Yang 	}
13405eee31cSStanley.Yang }
13505eee31cSStanley.Yang 
umc_v6_7_ecc_info_querry_uncorrectable_error_count(struct amdgpu_device * adev,uint32_t umc_inst,uint32_t ch_inst,unsigned long * error_count)13605eee31cSStanley.Yang static void umc_v6_7_ecc_info_querry_uncorrectable_error_count(struct amdgpu_device *adev,
13705eee31cSStanley.Yang 							  uint32_t umc_inst, uint32_t ch_inst,
13805eee31cSStanley.Yang 						      unsigned long *error_count)
13905eee31cSStanley.Yang {
14005eee31cSStanley.Yang 	uint64_t mc_umc_status;
14105eee31cSStanley.Yang 	uint32_t eccinfo_table_idx;
14205eee31cSStanley.Yang 	uint32_t umc_reg_offset;
14305eee31cSStanley.Yang 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
14405eee31cSStanley.Yang 
14505eee31cSStanley.Yang 	umc_reg_offset = get_umc_v6_7_reg_offset(adev,
14605eee31cSStanley.Yang 						umc_inst, ch_inst);
14705eee31cSStanley.Yang 
14805eee31cSStanley.Yang 	eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;
14905eee31cSStanley.Yang 	/* check the MCUMC_STATUS */
15005eee31cSStanley.Yang 	mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
15105eee31cSStanley.Yang 	if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
15205eee31cSStanley.Yang 	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
15305eee31cSStanley.Yang 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
15405eee31cSStanley.Yang 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
15505eee31cSStanley.Yang 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
15605eee31cSStanley.Yang 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1)) {
15705eee31cSStanley.Yang 		*error_count += 1;
15805eee31cSStanley.Yang 
15905eee31cSStanley.Yang 		umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset);
16005eee31cSStanley.Yang 	}
1618882f90aSStanley.Yang }
1628882f90aSStanley.Yang 
umc_v6_7_ecc_info_querry_ecc_error_count(struct amdgpu_device * adev,uint32_t node_inst,uint32_t umc_inst,uint32_t ch_inst,void * data)163*fc926faeSYiPeng Chai static int umc_v6_7_ecc_info_querry_ecc_error_count(struct amdgpu_device *adev,
164*fc926faeSYiPeng Chai 					uint32_t node_inst, uint32_t umc_inst,
165*fc926faeSYiPeng Chai 					uint32_t ch_inst, void *data)
1668882f90aSStanley.Yang {
167*fc926faeSYiPeng Chai 	struct ras_err_data *err_data = (struct ras_err_data *)data;
1688882f90aSStanley.Yang 
1698882f90aSStanley.Yang 	umc_v6_7_ecc_info_query_correctable_error_count(adev,
17037ff945fSStanley.Yang 		umc_inst, ch_inst,
1718882f90aSStanley.Yang 		&(err_data->ce_count));
172*fc926faeSYiPeng Chai 
1738882f90aSStanley.Yang 	umc_v6_7_ecc_info_querry_uncorrectable_error_count(adev,
17437ff945fSStanley.Yang 		umc_inst, ch_inst,
1758882f90aSStanley.Yang 		&(err_data->ue_count));
176*fc926faeSYiPeng Chai 
177*fc926faeSYiPeng Chai 	return 0;
1788882f90aSStanley.Yang }
179*fc926faeSYiPeng Chai 
umc_v6_7_ecc_info_query_ras_error_count(struct amdgpu_device * adev,void * ras_error_status)180*fc926faeSYiPeng Chai static void umc_v6_7_ecc_info_query_ras_error_count(struct amdgpu_device *adev,
181*fc926faeSYiPeng Chai 					   void *ras_error_status)
182*fc926faeSYiPeng Chai {
183*fc926faeSYiPeng Chai 	amdgpu_umc_loop_channels(adev,
184*fc926faeSYiPeng Chai 		umc_v6_7_ecc_info_querry_ecc_error_count, ras_error_status);
1858882f90aSStanley.Yang }
1868882f90aSStanley.Yang 
umc_v6_7_convert_error_address(struct amdgpu_device * adev,struct ras_err_data * err_data,uint64_t err_addr,uint32_t ch_inst,uint32_t umc_inst)1876c0ca748SHawking Zhang void umc_v6_7_convert_error_address(struct amdgpu_device *adev,
18844420ac5STao Zhou 				    struct ras_err_data *err_data, uint64_t err_addr,
18944420ac5STao Zhou 				    uint32_t ch_inst, uint32_t umc_inst)
1908882f90aSStanley.Yang {
1918882f90aSStanley.Yang 	uint32_t channel_index;
19244420ac5STao Zhou 	uint64_t soc_pa, retired_page, column;
1938882f90aSStanley.Yang 
1948882f90aSStanley.Yang 	channel_index =
1958882f90aSStanley.Yang 		adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];
1968882f90aSStanley.Yang 	/* translate umc channel address to soc pa, 3 parts are included */
197e63fa4dcSTao Zhou 	soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
1988882f90aSStanley.Yang 			ADDR_OF_256B_BLOCK(channel_index) |
1998882f90aSStanley.Yang 			OFFSET_IN_256B_BLOCK(err_addr);
200bee7f8d0STao Zhou 
201bee7f8d0STao Zhou 	/* The umc channel bits are not original values, they are hashed */
202bee7f8d0STao Zhou 	SET_CHANNEL_HASH(channel_index, soc_pa);
203bee7f8d0STao Zhou 
204e63fa4dcSTao Zhou 	/* clear [C4 C3 C2] in soc physical address */
205e63fa4dcSTao Zhou 	soc_pa &= ~(0x7ULL << UMC_V6_7_PA_C2_BIT);
2068882f90aSStanley.Yang 
207e63fa4dcSTao Zhou 	/* loop for all possibilities of [C4 C3 C2] */
208e63fa4dcSTao Zhou 	for (column = 0; column < UMC_V6_7_NA_MAP_PA_NUM; column++) {
209e63fa4dcSTao Zhou 		retired_page = soc_pa | (column << UMC_V6_7_PA_C2_BIT);
2101ec1944eSStanley.Yang 		dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page);
211400013b2STao Zhou 		amdgpu_umc_fill_error_record(err_data, err_addr,
212400013b2STao Zhou 			retired_page, channel_index, umc_inst);
213e63fa4dcSTao Zhou 
214e63fa4dcSTao Zhou 		/* shift R14 bit */
215e63fa4dcSTao Zhou 		retired_page ^= (0x1ULL << UMC_V6_7_PA_R14_BIT);
2161ec1944eSStanley.Yang 		dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page);
217e63fa4dcSTao Zhou 		amdgpu_umc_fill_error_record(err_data, err_addr,
218e63fa4dcSTao Zhou 			retired_page, channel_index, umc_inst);
219e63fa4dcSTao Zhou 	}
220e63fa4dcSTao Zhou }
22144420ac5STao Zhou 
umc_v6_7_ecc_info_query_error_address(struct amdgpu_device * adev,uint32_t node_inst,uint32_t umc_inst,uint32_t ch_inst,void * data)222*fc926faeSYiPeng Chai static int umc_v6_7_ecc_info_query_error_address(struct amdgpu_device *adev,
223*fc926faeSYiPeng Chai 					uint32_t node_inst, uint32_t umc_inst,
224*fc926faeSYiPeng Chai 					uint32_t ch_inst, void *data)
22544420ac5STao Zhou {
22644420ac5STao Zhou 	uint64_t mc_umc_status, err_addr;
22744420ac5STao Zhou 	uint32_t eccinfo_table_idx;
22844420ac5STao Zhou 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
229*fc926faeSYiPeng Chai 	struct ras_err_data *err_data = (struct ras_err_data *)data;
23044420ac5STao Zhou 
23144420ac5STao Zhou 	eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;
23244420ac5STao Zhou 	mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
23344420ac5STao Zhou 
23444420ac5STao Zhou 	if (mc_umc_status == 0)
235*fc926faeSYiPeng Chai 		return 0;
23644420ac5STao Zhou 
23744420ac5STao Zhou 	if (!err_data->err_addr)
238*fc926faeSYiPeng Chai 		return 0;
23944420ac5STao Zhou 
24044420ac5STao Zhou 	/* calculate error address if ue error is detected */
24144420ac5STao Zhou 	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
24244420ac5STao Zhou 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1) {
24344420ac5STao Zhou 
24444420ac5STao Zhou 		err_addr = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_addr;
24544420ac5STao Zhou 		err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
24644420ac5STao Zhou 
24744420ac5STao Zhou 		umc_v6_7_convert_error_address(adev, err_data, err_addr,
24844420ac5STao Zhou 					ch_inst, umc_inst);
24944420ac5STao Zhou 	}
250*fc926faeSYiPeng Chai 
251*fc926faeSYiPeng Chai 	return 0;
2528882f90aSStanley.Yang }
2538882f90aSStanley.Yang 
umc_v6_7_ecc_info_query_ras_error_address(struct amdgpu_device * adev,void * ras_error_status)2548882f90aSStanley.Yang static void umc_v6_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev,
2558882f90aSStanley.Yang 					     void *ras_error_status)
2568882f90aSStanley.Yang {
257*fc926faeSYiPeng Chai 	amdgpu_umc_loop_channels(adev,
258*fc926faeSYiPeng Chai 	    umc_v6_7_ecc_info_query_error_address, ras_error_status);
2598882f90aSStanley.Yang }
2608882f90aSStanley.Yang 
umc_v6_7_query_correctable_error_count(struct amdgpu_device * adev,uint32_t umc_reg_offset,unsigned long * error_count,uint32_t ch_inst,uint32_t umc_inst)2613f903560SHawking Zhang static void umc_v6_7_query_correctable_error_count(struct amdgpu_device *adev,
2623f903560SHawking Zhang 						   uint32_t umc_reg_offset,
263cbd3e844SStanley.Yang 						   unsigned long *error_count,
264cbd3e844SStanley.Yang 						   uint32_t ch_inst,
265cbd3e844SStanley.Yang 						   uint32_t umc_inst)
2663f903560SHawking Zhang {
2673f903560SHawking Zhang 	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
2683f903560SHawking Zhang 	uint32_t ecc_err_cnt, ecc_err_cnt_addr;
2693f903560SHawking Zhang 	uint64_t mc_umc_status;
2703f903560SHawking Zhang 	uint32_t mc_umc_status_addr;
2713f903560SHawking Zhang 
2723f903560SHawking Zhang 	/* UMC 6_1_1 registers */
2733f903560SHawking Zhang 	ecc_err_cnt_sel_addr =
2743f903560SHawking Zhang 		SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccErrCntSel);
2753f903560SHawking Zhang 	ecc_err_cnt_addr =
2763f903560SHawking Zhang 		SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccErrCnt);
2773f903560SHawking Zhang 	mc_umc_status_addr =
2783f903560SHawking Zhang 		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
2793f903560SHawking Zhang 
2803f903560SHawking Zhang 	/* select the lower chip and check the error count */
2813f903560SHawking Zhang 	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
2823f903560SHawking Zhang 	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
2833f903560SHawking Zhang 					EccErrCntCsSel, 0);
2843f903560SHawking Zhang 	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
2853f903560SHawking Zhang 
2863f903560SHawking Zhang 	ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
2873f903560SHawking Zhang 	*error_count +=
2883f903560SHawking Zhang 		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
2893f903560SHawking Zhang 		 UMC_V6_7_CE_CNT_INIT);
2903f903560SHawking Zhang 
2913f903560SHawking Zhang 	/* select the higher chip and check the err counter */
2923f903560SHawking Zhang 	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
2933f903560SHawking Zhang 					EccErrCntCsSel, 1);
2943f903560SHawking Zhang 	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
2953f903560SHawking Zhang 
2963f903560SHawking Zhang 	ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
2973f903560SHawking Zhang 	*error_count +=
2983f903560SHawking Zhang 		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
2993f903560SHawking Zhang 		 UMC_V6_7_CE_CNT_INIT);
3003f903560SHawking Zhang 
3013f903560SHawking Zhang 	/* check for SRAM correctable error
3023f903560SHawking Zhang 	  MCUMC_STATUS is a 64 bit register */
3033f903560SHawking Zhang 	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
3043f903560SHawking Zhang 	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
30505eee31cSStanley.Yang 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1) {
3063f903560SHawking Zhang 		*error_count += 1;
30705eee31cSStanley.Yang 
30805eee31cSStanley.Yang 		umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset);
309cbd3e844SStanley.Yang 
310cbd3e844SStanley.Yang 		{
311cbd3e844SStanley.Yang 			uint64_t err_addr, soc_pa;
312cbd3e844SStanley.Yang 			uint32_t mc_umc_addrt0;
313cbd3e844SStanley.Yang 			uint32_t channel_index;
314cbd3e844SStanley.Yang 
315cbd3e844SStanley.Yang 			mc_umc_addrt0 =
316cbd3e844SStanley.Yang 				SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0);
317cbd3e844SStanley.Yang 
318cbd3e844SStanley.Yang 			channel_index =
319cbd3e844SStanley.Yang 				adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];
320cbd3e844SStanley.Yang 
321cbd3e844SStanley.Yang 			err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4);
322cbd3e844SStanley.Yang 			err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
323cbd3e844SStanley.Yang 
324cbd3e844SStanley.Yang 			/* translate umc channel address to soc pa, 3 parts are included */
325cbd3e844SStanley.Yang 			soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
326cbd3e844SStanley.Yang 					ADDR_OF_256B_BLOCK(channel_index) |
327cbd3e844SStanley.Yang 					OFFSET_IN_256B_BLOCK(err_addr);
328cbd3e844SStanley.Yang 
329cbd3e844SStanley.Yang 			/* The umc channel bits are not original values, they are hashed */
330cbd3e844SStanley.Yang 			SET_CHANNEL_HASH(channel_index, soc_pa);
331cbd3e844SStanley.Yang 
332cbd3e844SStanley.Yang 			dev_info(adev->dev, "Error Address(PA): 0x%llx\n", soc_pa);
333cbd3e844SStanley.Yang 		}
33405eee31cSStanley.Yang 	}
3353f903560SHawking Zhang }
3363f903560SHawking Zhang 
umc_v6_7_querry_uncorrectable_error_count(struct amdgpu_device * adev,uint32_t umc_reg_offset,unsigned long * error_count)3373f903560SHawking Zhang static void umc_v6_7_querry_uncorrectable_error_count(struct amdgpu_device *adev,
3383f903560SHawking Zhang 						      uint32_t umc_reg_offset,
3393f903560SHawking Zhang 						      unsigned long *error_count)
3403f903560SHawking Zhang {
3413f903560SHawking Zhang 	uint64_t mc_umc_status;
3423f903560SHawking Zhang 	uint32_t mc_umc_status_addr;
3433f903560SHawking Zhang 
3443f903560SHawking Zhang 	mc_umc_status_addr =
3453f903560SHawking Zhang 		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
3463f903560SHawking Zhang 
3473f903560SHawking Zhang 	/* check the MCUMC_STATUS */
3483f903560SHawking Zhang 	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
3493f903560SHawking Zhang 	if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
3503f903560SHawking Zhang 	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
3513f903560SHawking Zhang 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
3523f903560SHawking Zhang 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
3533f903560SHawking Zhang 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
3541ec1944eSStanley.Yang 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1)) {
3553f903560SHawking Zhang 		*error_count += 1;
3561ec1944eSStanley.Yang 
35705eee31cSStanley.Yang 		umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset);
3581ec1944eSStanley.Yang 	}
3593f903560SHawking Zhang }
3603f903560SHawking Zhang 
umc_v6_7_reset_error_count_per_channel(struct amdgpu_device * adev,uint32_t node_inst,uint32_t umc_inst,uint32_t ch_inst,void * data)361*fc926faeSYiPeng Chai static int umc_v6_7_reset_error_count_per_channel(struct amdgpu_device *adev,
362*fc926faeSYiPeng Chai 					uint32_t node_inst, uint32_t umc_inst,
363*fc926faeSYiPeng Chai 					uint32_t ch_inst, void *data)
364878b9e94SHawking Zhang {
365878b9e94SHawking Zhang 	uint32_t ecc_err_cnt_addr;
366878b9e94SHawking Zhang 	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
367*fc926faeSYiPeng Chai 	uint32_t umc_reg_offset =
368*fc926faeSYiPeng Chai 		get_umc_v6_7_reg_offset(adev, umc_inst, ch_inst);
369878b9e94SHawking Zhang 
370878b9e94SHawking Zhang 	ecc_err_cnt_sel_addr =
371878b9e94SHawking Zhang 		SOC15_REG_OFFSET(UMC, 0,
372878b9e94SHawking Zhang 				regUMCCH0_0_EccErrCntSel);
373878b9e94SHawking Zhang 	ecc_err_cnt_addr =
374878b9e94SHawking Zhang 		SOC15_REG_OFFSET(UMC, 0,
375878b9e94SHawking Zhang 				regUMCCH0_0_EccErrCnt);
376878b9e94SHawking Zhang 
377878b9e94SHawking Zhang 	/* select the lower chip */
378878b9e94SHawking Zhang 	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
379878b9e94SHawking Zhang 				       umc_reg_offset) * 4);
380878b9e94SHawking Zhang 	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
381878b9e94SHawking Zhang 					UMCCH0_0_EccErrCntSel,
382878b9e94SHawking Zhang 					EccErrCntCsSel, 0);
383878b9e94SHawking Zhang 	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
384878b9e94SHawking Zhang 			ecc_err_cnt_sel);
385878b9e94SHawking Zhang 
386878b9e94SHawking Zhang 	/* clear lower chip error count */
387878b9e94SHawking Zhang 	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
388878b9e94SHawking Zhang 			UMC_V6_7_CE_CNT_INIT);
389878b9e94SHawking Zhang 
390878b9e94SHawking Zhang 	/* select the higher chip */
391878b9e94SHawking Zhang 	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
392878b9e94SHawking Zhang 					umc_reg_offset) * 4);
393878b9e94SHawking Zhang 	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
394878b9e94SHawking Zhang 					UMCCH0_0_EccErrCntSel,
395878b9e94SHawking Zhang 					EccErrCntCsSel, 1);
396878b9e94SHawking Zhang 	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
397878b9e94SHawking Zhang 			ecc_err_cnt_sel);
398878b9e94SHawking Zhang 
399878b9e94SHawking Zhang 	/* clear higher chip error count */
400878b9e94SHawking Zhang 	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
401878b9e94SHawking Zhang 			UMC_V6_7_CE_CNT_INIT);
402*fc926faeSYiPeng Chai 
403*fc926faeSYiPeng Chai 	return 0;
404878b9e94SHawking Zhang }
405878b9e94SHawking Zhang 
umc_v6_7_reset_error_count(struct amdgpu_device * adev)406878b9e94SHawking Zhang static void umc_v6_7_reset_error_count(struct amdgpu_device *adev)
407878b9e94SHawking Zhang {
408*fc926faeSYiPeng Chai 	amdgpu_umc_loop_channels(adev,
409*fc926faeSYiPeng Chai 		umc_v6_7_reset_error_count_per_channel, NULL);
410878b9e94SHawking Zhang }
411*fc926faeSYiPeng Chai 
umc_v6_7_query_ecc_error_count(struct amdgpu_device * adev,uint32_t node_inst,uint32_t umc_inst,uint32_t ch_inst,void * data)412*fc926faeSYiPeng Chai static int umc_v6_7_query_ecc_error_count(struct amdgpu_device *adev,
413*fc926faeSYiPeng Chai 					uint32_t node_inst, uint32_t umc_inst,
414*fc926faeSYiPeng Chai 					uint32_t ch_inst, void *data)
415*fc926faeSYiPeng Chai {
416*fc926faeSYiPeng Chai 	struct ras_err_data *err_data = (struct ras_err_data *)data;
417*fc926faeSYiPeng Chai 	uint32_t umc_reg_offset =
418*fc926faeSYiPeng Chai 		get_umc_v6_7_reg_offset(adev, umc_inst, ch_inst);
419*fc926faeSYiPeng Chai 
420*fc926faeSYiPeng Chai 	umc_v6_7_query_correctable_error_count(adev,
421*fc926faeSYiPeng Chai 					umc_reg_offset,
422*fc926faeSYiPeng Chai 					&(err_data->ce_count),
423*fc926faeSYiPeng Chai 					ch_inst, umc_inst);
424*fc926faeSYiPeng Chai 
425*fc926faeSYiPeng Chai 	umc_v6_7_querry_uncorrectable_error_count(adev,
426*fc926faeSYiPeng Chai 					umc_reg_offset,
427*fc926faeSYiPeng Chai 					&(err_data->ue_count));
428*fc926faeSYiPeng Chai 
429*fc926faeSYiPeng Chai 	return 0;
430878b9e94SHawking Zhang }
431878b9e94SHawking Zhang 
umc_v6_7_query_ras_error_count(struct amdgpu_device * adev,void * ras_error_status)432878b9e94SHawking Zhang static void umc_v6_7_query_ras_error_count(struct amdgpu_device *adev,
433878b9e94SHawking Zhang 					   void *ras_error_status)
434878b9e94SHawking Zhang {
435*fc926faeSYiPeng Chai 	amdgpu_umc_loop_channels(adev,
436*fc926faeSYiPeng Chai 		umc_v6_7_query_ecc_error_count, ras_error_status);
437878b9e94SHawking Zhang 
438878b9e94SHawking Zhang 	umc_v6_7_reset_error_count(adev);
439878b9e94SHawking Zhang }
440878b9e94SHawking Zhang 
umc_v6_7_query_error_address(struct amdgpu_device * adev,uint32_t node_inst,uint32_t umc_inst,uint32_t ch_inst,void * data)441*fc926faeSYiPeng Chai static int umc_v6_7_query_error_address(struct amdgpu_device *adev,
442*fc926faeSYiPeng Chai 					uint32_t node_inst, uint32_t umc_inst,
443*fc926faeSYiPeng Chai 					uint32_t ch_inst, void *data)
44487da0cc1SHawking Zhang {
44587da0cc1SHawking Zhang 	uint32_t mc_umc_status_addr;
44644420ac5STao Zhou 	uint64_t mc_umc_status = 0, mc_umc_addrt0, err_addr;
447*fc926faeSYiPeng Chai 	struct ras_err_data *err_data = (struct ras_err_data *)data;
448*fc926faeSYiPeng Chai 	uint32_t umc_reg_offset =
449*fc926faeSYiPeng Chai 		get_umc_v6_7_reg_offset(adev, umc_inst, ch_inst);
45087da0cc1SHawking Zhang 
45187da0cc1SHawking Zhang 	mc_umc_status_addr =
45287da0cc1SHawking Zhang 		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
45387da0cc1SHawking Zhang 	mc_umc_addrt0 =
45487da0cc1SHawking Zhang 		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0);
45587da0cc1SHawking Zhang 
45687da0cc1SHawking Zhang 	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
45787da0cc1SHawking Zhang 
45887da0cc1SHawking Zhang 	if (mc_umc_status == 0)
459*fc926faeSYiPeng Chai 		return 0;
46087da0cc1SHawking Zhang 
46187da0cc1SHawking Zhang 	if (!err_data->err_addr) {
46287da0cc1SHawking Zhang 		/* clear umc status */
46387da0cc1SHawking Zhang 		WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
464*fc926faeSYiPeng Chai 		return 0;
46587da0cc1SHawking Zhang 	}
46687da0cc1SHawking Zhang 
467cdbb816bSTao Zhou 	/* calculate error address if ue error is detected */
46844420ac5STao Zhou 	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
46944420ac5STao Zhou 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1) {
47087da0cc1SHawking Zhang 		err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4);
4711014bd1cSTao Zhou 		err_addr =
4721014bd1cSTao Zhou 			REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
47387da0cc1SHawking Zhang 
47444420ac5STao Zhou 		umc_v6_7_convert_error_address(adev, err_data, err_addr,
47544420ac5STao Zhou 					ch_inst, umc_inst);
476e63fa4dcSTao Zhou 	}
47787da0cc1SHawking Zhang 
47887da0cc1SHawking Zhang 	/* clear umc status */
47987da0cc1SHawking Zhang 	WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
480*fc926faeSYiPeng Chai 
481*fc926faeSYiPeng Chai 	return 0;
48287da0cc1SHawking Zhang }
48387da0cc1SHawking Zhang 
umc_v6_7_query_ras_error_address(struct amdgpu_device * adev,void * ras_error_status)48487da0cc1SHawking Zhang static void umc_v6_7_query_ras_error_address(struct amdgpu_device *adev,
48587da0cc1SHawking Zhang 					     void *ras_error_status)
48687da0cc1SHawking Zhang {
487*fc926faeSYiPeng Chai 	amdgpu_umc_loop_channels(adev,
488*fc926faeSYiPeng Chai 		umc_v6_7_query_error_address, ras_error_status);
48987da0cc1SHawking Zhang }
49087da0cc1SHawking Zhang 
umc_v6_7_query_ras_poison_mode_per_channel(struct amdgpu_device * adev,uint32_t umc_reg_offset)491aaca8c38STao Zhou static uint32_t umc_v6_7_query_ras_poison_mode_per_channel(
492aaca8c38STao Zhou 						struct amdgpu_device *adev,
493aaca8c38STao Zhou 						uint32_t umc_reg_offset)
494aaca8c38STao Zhou {
495aaca8c38STao Zhou 	uint32_t ecc_ctrl_addr, ecc_ctrl;
496aaca8c38STao Zhou 
497aaca8c38STao Zhou 	ecc_ctrl_addr =
498aaca8c38STao Zhou 		SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccCtrl);
499aaca8c38STao Zhou 	ecc_ctrl = RREG32_PCIE((ecc_ctrl_addr +
500aaca8c38STao Zhou 					umc_reg_offset) * 4);
501aaca8c38STao Zhou 
502aaca8c38STao Zhou 	return REG_GET_FIELD(ecc_ctrl, UMCCH0_0_EccCtrl, UCFatalEn);
503aaca8c38STao Zhou }
504aaca8c38STao Zhou 
/*
 * Report whether poison mode is in effect.
 *
 * Only umc instance0 channel0 is inspected: if fatal error is enabled there
 * (UCFatalEn set) the device is considered to be in fatal error mode and
 * false is returned; otherwise true is returned.
 */
static bool umc_v6_7_query_ras_poison_mode(struct amdgpu_device *adev)
{
	const uint32_t inst0_ch0_offset = get_umc_v6_7_reg_offset(adev, 0, 0);

	return !umc_v6_7_query_ras_poison_mode_per_channel(adev, inst0_ch0_offset);
}
515aaca8c38STao Zhou 
516efe17d5aSyipechai const struct amdgpu_ras_block_hw_ops umc_v6_7_ras_hw_ops = {
517878b9e94SHawking Zhang 	.query_ras_error_count = umc_v6_7_query_ras_error_count,
51887da0cc1SHawking Zhang 	.query_ras_error_address = umc_v6_7_query_ras_error_address,
519efe17d5aSyipechai };
520efe17d5aSyipechai 
521efe17d5aSyipechai struct amdgpu_umc_ras umc_v6_7_ras = {
522efe17d5aSyipechai 	.ras_block = {
523efe17d5aSyipechai 		.hw_ops = &umc_v6_7_ras_hw_ops,
524efe17d5aSyipechai 	},
525aaca8c38STao Zhou 	.query_ras_poison_mode = umc_v6_7_query_ras_poison_mode,
5268882f90aSStanley.Yang 	.ecc_info_query_ras_error_count = umc_v6_7_ecc_info_query_ras_error_count,
5278882f90aSStanley.Yang 	.ecc_info_query_ras_error_address = umc_v6_7_ecc_info_query_ras_error_address,
5281696bf35SHawking Zhang };
529