xref: /openbmc/linux/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c (revision 8c948103)
19884c2b1SHawking Zhang /*
29884c2b1SHawking Zhang  * Copyright 2019 Advanced Micro Devices, Inc.
39884c2b1SHawking Zhang  *
49884c2b1SHawking Zhang  * Permission is hereby granted, free of charge, to any person obtaining a
59884c2b1SHawking Zhang  * copy of this software and associated documentation files (the "Software"),
69884c2b1SHawking Zhang  * to deal in the Software without restriction, including without limitation
79884c2b1SHawking Zhang  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
89884c2b1SHawking Zhang  * and/or sell copies of the Software, and to permit persons to whom the
99884c2b1SHawking Zhang  * Software is furnished to do so, subject to the following conditions:
109884c2b1SHawking Zhang  *
119884c2b1SHawking Zhang  * The above copyright notice and this permission notice shall be included in
129884c2b1SHawking Zhang  * all copies or substantial portions of the Software.
139884c2b1SHawking Zhang  *
149884c2b1SHawking Zhang  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
159884c2b1SHawking Zhang  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
169884c2b1SHawking Zhang  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
179884c2b1SHawking Zhang  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
189884c2b1SHawking Zhang  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
199884c2b1SHawking Zhang  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
209884c2b1SHawking Zhang  * OTHER DEALINGS IN THE SOFTWARE.
219884c2b1SHawking Zhang  *
229884c2b1SHawking Zhang  */
239884c2b1SHawking Zhang #include "umc_v6_1.h"
249884c2b1SHawking Zhang #include "amdgpu_ras.h"
259884c2b1SHawking Zhang #include "amdgpu.h"
269884c2b1SHawking Zhang 
279884c2b1SHawking Zhang #include "rsmu/rsmu_0_0_2_offset.h"
289884c2b1SHawking Zhang #include "rsmu/rsmu_0_0_2_sh_mask.h"
299884c2b1SHawking Zhang #include "umc/umc_6_1_1_offset.h"
309884c2b1SHawking Zhang #include "umc/umc_6_1_1_sh_mask.h"
319884c2b1SHawking Zhang 
/* SMN (PCIe-indirect) base address of the MCA error-address register,
 * accessed via RREG64_PCIE/WREG64_PCIE below */
#define smnMCA_UMC0_MCUMC_ADDRT0	0x50f10

/*
 * (addr / 256) * 8192, the higher 26 bits in ErrorAddr
 * is the index of 8KB block
 * ((addr & ~0xff) << 5 == (addr / 256) * 256 * 32 == (addr / 256) * 8192)
 */
#define ADDR_OF_8KB_BLOCK(addr)		(((addr) & ~0xffULL) << 5)
/* channel index is the index of 256B block */
#define ADDR_OF_256B_BLOCK(channel_index)	((channel_index) << 8)
/* offset in 256B block */
#define OFFSET_IN_256B_BLOCK(addr)		((addr) & 0xffULL)
438c948103STao Zhou 
/*
 * Maps [umc instance][channel within instance] to the physical channel
 * index of the interleaved memory, used to compose the SoC physical
 * address in umc_v6_1_query_error_address().
 * NOTE(review): the concrete values presumably mirror the hardware's
 * channel interleave map — confirm against the memory-controller docs.
 */
static uint32_t
	umc_v6_1_channel_idx_tbl[UMC_V6_1_UMC_INSTANCE_NUM][UMC_V6_1_CHANNEL_INSTANCE_NUM] = {
		{2, 18, 11, 27},	{4, 20, 13, 29},
		{1, 17, 8, 24},		{7, 23, 14, 30},
		{10, 26, 3, 19},	{12, 28, 5, 21},
		{9, 25, 0, 16},		{15, 31, 6, 22}
};
51c2742aefSTao Zhou 
529884c2b1SHawking Zhang static void umc_v6_1_enable_umc_index_mode(struct amdgpu_device *adev,
539884c2b1SHawking Zhang 					   uint32_t umc_instance)
549884c2b1SHawking Zhang {
559884c2b1SHawking Zhang 	uint32_t rsmu_umc_index;
569884c2b1SHawking Zhang 
579884c2b1SHawking Zhang 	rsmu_umc_index = RREG32_SOC15(RSMU, 0,
589884c2b1SHawking Zhang 			mmRSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU);
599884c2b1SHawking Zhang 	rsmu_umc_index = REG_SET_FIELD(rsmu_umc_index,
609884c2b1SHawking Zhang 			RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU,
619884c2b1SHawking Zhang 			RSMU_UMC_INDEX_MODE_EN, 1);
629884c2b1SHawking Zhang 	rsmu_umc_index = REG_SET_FIELD(rsmu_umc_index,
639884c2b1SHawking Zhang 			RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU,
649884c2b1SHawking Zhang 			RSMU_UMC_INDEX_INSTANCE, umc_instance);
659884c2b1SHawking Zhang 	rsmu_umc_index = REG_SET_FIELD(rsmu_umc_index,
669884c2b1SHawking Zhang 			RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU,
679884c2b1SHawking Zhang 			RSMU_UMC_INDEX_WREN, 1 << umc_instance);
689884c2b1SHawking Zhang 	WREG32_SOC15(RSMU, 0, mmRSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU,
699884c2b1SHawking Zhang 				rsmu_umc_index);
709884c2b1SHawking Zhang }
719884c2b1SHawking Zhang 
/*
 * Clear RSMU_UMC_INDEX_MODE_EN so UMC register accesses are no longer
 * routed through the per-instance index window.
 */
static void umc_v6_1_disable_umc_index_mode(struct amdgpu_device *adev)
{
	WREG32_FIELD15(RSMU, 0, RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU,
			RSMU_UMC_INDEX_MODE_EN, 0);
}
779884c2b1SHawking Zhang 
/*
 * Accumulate correctable ECC errors for the channel selected by
 * @umc_reg_offset into @error_count.
 *
 * Two hardware counter banks exist per channel (selected via
 * EccErrCntCsSel); each is read and then reset to zero so errors are
 * not counted twice on the next query.  A correctable SRAM error
 * latched in the 64-bit MCA status register adds one more.
 * The register access sequence (select, read, clear) is order-sensitive.
 */
static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev,
						   uint32_t umc_reg_offset,
						   unsigned long *error_count)
{
	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
	uint32_t ecc_err_cnt, ecc_err_cnt_addr;
	uint64_t mc_umc_status;
	uint32_t mc_umc_status_addr;

	ecc_err_cnt_sel_addr =
		SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel);
	ecc_err_cnt_addr =
		SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt);
	mc_umc_status_addr =
		SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);

	/* select the lower chip and check the error count */
	ecc_err_cnt_sel = RREG32(ecc_err_cnt_sel_addr + umc_reg_offset);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 0);
	WREG32(ecc_err_cnt_sel_addr + umc_reg_offset, ecc_err_cnt_sel);
	ecc_err_cnt = RREG32(ecc_err_cnt_addr + umc_reg_offset);
	*error_count +=
		REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt);
	/* clear the lower chip err count */
	WREG32(ecc_err_cnt_addr + umc_reg_offset, 0);

	/* select the higher chip and check the err counter */
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 1);
	WREG32(ecc_err_cnt_sel_addr + umc_reg_offset, ecc_err_cnt_sel);
	ecc_err_cnt = RREG32(ecc_err_cnt_addr + umc_reg_offset);
	*error_count +=
		REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt);
	/* clear the higher chip err count */
	WREG32(ecc_err_cnt_addr + umc_reg_offset, 0);

	/* check for SRAM correctable error
	  MCUMC_STATUS is a 64 bit register */
	mc_umc_status = RREG64(mc_umc_status_addr + umc_reg_offset);
	/* NOTE(review): ErrorCodeExt == 6 is a magic value presumably meaning
	 * "SRAM ECC error" in the MCA encoding — confirm against the MCA spec */
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, ErrorCodeExt) == 6 &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
		*error_count += 1;
}
1239884c2b1SHawking Zhang 
1249884c2b1SHawking Zhang static void umc_v6_1_querry_uncorrectable_error_count(struct amdgpu_device *adev,
1259884c2b1SHawking Zhang 						      uint32_t umc_reg_offset,
1269884c2b1SHawking Zhang 						      unsigned long *error_count)
1279884c2b1SHawking Zhang {
1289884c2b1SHawking Zhang 	uint64_t mc_umc_status;
1299884c2b1SHawking Zhang 	uint32_t mc_umc_status_addr;
1309884c2b1SHawking Zhang 
1319884c2b1SHawking Zhang 	mc_umc_status_addr =
1329884c2b1SHawking Zhang                 SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
1339884c2b1SHawking Zhang 
1349884c2b1SHawking Zhang 	/* check the MCUMC_STATUS */
1355bbfb64aSTao Zhou 	mc_umc_status = RREG64(mc_umc_status_addr + umc_reg_offset);
136f1ed4afaSTao Zhou 	if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
137f1ed4afaSTao Zhou 	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
138f1ed4afaSTao Zhou 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
1399884c2b1SHawking Zhang 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
1409884c2b1SHawking Zhang 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
1419884c2b1SHawking Zhang 	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1))
1429884c2b1SHawking Zhang 		*error_count += 1;
1439884c2b1SHawking Zhang }
1449884c2b1SHawking Zhang 
1459884c2b1SHawking Zhang static void umc_v6_1_query_ras_error_count(struct amdgpu_device *adev,
1469884c2b1SHawking Zhang 					   void *ras_error_status)
1479884c2b1SHawking Zhang {
1489884c2b1SHawking Zhang 	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
1495bbfb64aSTao Zhou 	uint32_t umc_inst, channel_inst, umc_reg_offset, mc_umc_status_addr;
1505bbfb64aSTao Zhou 
1515bbfb64aSTao Zhou 	mc_umc_status_addr =
1525bbfb64aSTao Zhou 		SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
1539884c2b1SHawking Zhang 
1549884c2b1SHawking Zhang 	for (umc_inst = 0; umc_inst < UMC_V6_1_UMC_INSTANCE_NUM; umc_inst++) {
1559884c2b1SHawking Zhang 		/* enable the index mode to query eror count per channel */
1569884c2b1SHawking Zhang 		umc_v6_1_enable_umc_index_mode(adev, umc_inst);
1579884c2b1SHawking Zhang 		for (channel_inst = 0; channel_inst < UMC_V6_1_CHANNEL_INSTANCE_NUM; channel_inst++) {
1589884c2b1SHawking Zhang 			/* calc the register offset according to channel instance */
1599884c2b1SHawking Zhang 			umc_reg_offset = UMC_V6_1_PER_CHANNEL_OFFSET * channel_inst;
1609884c2b1SHawking Zhang 			umc_v6_1_query_correctable_error_count(adev, umc_reg_offset,
1619884c2b1SHawking Zhang 							       &(err_data->ce_count));
1629884c2b1SHawking Zhang 			umc_v6_1_querry_uncorrectable_error_count(adev, umc_reg_offset,
1639884c2b1SHawking Zhang 								  &(err_data->ue_count));
1645bbfb64aSTao Zhou 			/* clear umc status */
1655bbfb64aSTao Zhou 			WREG64(mc_umc_status_addr + umc_reg_offset, 0x0ULL);
1669884c2b1SHawking Zhang 		}
1679884c2b1SHawking Zhang 	}
1689884c2b1SHawking Zhang 	umc_v6_1_disable_umc_index_mode(adev);
1699884c2b1SHawking Zhang }
1709884c2b1SHawking Zhang 
/*
 * If the currently selected channel has a valid UE/CE latched in its MCA
 * status register, read the raw error address, strip the bits below LSB
 * (they are not meaningful), translate the UMC channel address to a SoC
 * physical address and append it to err_data->err_addr.
 */
static void umc_v6_1_query_error_address(struct amdgpu_device *adev,
					 uint32_t umc_reg_offset, uint32_t channel_index,
					 struct ras_err_data *err_data)
{
	uint32_t lsb;
	uint64_t mc_umc_status, err_addr;
	uint32_t mc_umc_status_addr;

	/* skip error address process if -ENOMEM */
	if (!err_data->err_addr)
		return;

	mc_umc_status_addr =
		SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
	mc_umc_status = RREG64(mc_umc_status_addr + umc_reg_offset);

	/* calculate error address if ue/ce error is detected */
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) {
		/* address register lives in SMN space, hence the PCIE access;
		 * NOTE(review): the "* 4" presumably converts the dword register
		 * offset to a byte offset — confirm against the SMN map */
		err_addr = RREG64_PCIE(smnMCA_UMC0_MCUMC_ADDRT0 + umc_reg_offset * 4);

		/* the lowest lsb bits should be ignored */
		lsb = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, LSB);
		err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
		err_addr &= ~((0x1ULL << lsb) - 1);

		/* translate umc channel address to soc pa, 3 parts are included */
		err_data->err_addr[err_data->err_addr_cnt] =
						ADDR_OF_8KB_BLOCK(err_addr)
						| ADDR_OF_256B_BLOCK(channel_index)
						| OFFSET_IN_256B_BLOCK(err_addr);

		err_data->err_addr_cnt++;
	}
}
2078c948103STao Zhou 
2088c948103STao Zhou static void umc_v6_1_query_ras_error_address(struct amdgpu_device *adev,
2098c948103STao Zhou 					     void *ras_error_status)
2108c948103STao Zhou {
2118c948103STao Zhou 	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
2128c948103STao Zhou 	uint32_t umc_inst, channel_inst, umc_reg_offset;
2138c948103STao Zhou 	uint32_t channel_index, mc_umc_status_addr;
2148c948103STao Zhou 
2158c948103STao Zhou 	mc_umc_status_addr =
2168c948103STao Zhou 		SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
2178c948103STao Zhou 
2188c948103STao Zhou 	for (umc_inst = 0; umc_inst < UMC_V6_1_UMC_INSTANCE_NUM; umc_inst++) {
2198c948103STao Zhou 		/* enable the index mode to query eror count per channel */
2208c948103STao Zhou 		umc_v6_1_enable_umc_index_mode(adev, umc_inst);
2218c948103STao Zhou 		for (channel_inst = 0; channel_inst < UMC_V6_1_CHANNEL_INSTANCE_NUM; channel_inst++) {
2228c948103STao Zhou 			/* calc the register offset according to channel instance */
2238c948103STao Zhou 			umc_reg_offset = UMC_V6_1_PER_CHANNEL_OFFSET * channel_inst;
2248c948103STao Zhou 			/* get channel index of interleaved memory */
2258c948103STao Zhou 			channel_index = umc_v6_1_channel_idx_tbl[umc_inst][channel_inst];
2268c948103STao Zhou 
2278c948103STao Zhou 			umc_v6_1_query_error_address(adev, umc_reg_offset,
2288c948103STao Zhou 						     channel_index, err_data);
2298c948103STao Zhou 
2308c948103STao Zhou 			/* clear umc status */
2318c948103STao Zhou 			WREG64(mc_umc_status_addr + umc_reg_offset, 0x0ULL);
2328c948103STao Zhou 			/* clear error address register */
2338c948103STao Zhou 			WREG64_PCIE(smnMCA_UMC0_MCUMC_ADDRT0 + umc_reg_offset * 4, 0x0ULL);
2348c948103STao Zhou 		}
2358c948103STao Zhou 	}
2368c948103STao Zhou 
2378c948103STao Zhou 	umc_v6_1_disable_umc_index_mode(adev);
2388c948103STao Zhou }
2398c948103STao Zhou 
/* RAS callbacks exported to the amdgpu UMC core for UMC 6.1 hardware */
const struct amdgpu_umc_funcs umc_v6_1_funcs = {
	.query_ras_error_count = umc_v6_1_query_ras_error_count,
	.query_ras_error_address = umc_v6_1_query_ras_error_address,
};
244