xref: /openbmc/linux/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c (revision 1ac731c529cd4d6adbce134754b51ff7d822b145)
13907c492SJohn Clements /*
23907c492SJohn Clements  * Copyright 2021 Advanced Micro Devices, Inc.
33907c492SJohn Clements  *
43907c492SJohn Clements  * Permission is hereby granted, free of charge, to any person obtaining a
53907c492SJohn Clements  * copy of this software and associated documentation files (the "Software"),
63907c492SJohn Clements  * to deal in the Software without restriction, including without limitation
73907c492SJohn Clements  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
83907c492SJohn Clements  * and/or sell copies of the Software, and to permit persons to whom the
93907c492SJohn Clements  * Software is furnished to do so, subject to the following conditions:
103907c492SJohn Clements  *
113907c492SJohn Clements  * The above copyright notice and this permission notice shall be included in
123907c492SJohn Clements  * all copies or substantial portions of the Software.
133907c492SJohn Clements  *
143907c492SJohn Clements  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
153907c492SJohn Clements  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
163907c492SJohn Clements  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
173907c492SJohn Clements  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
183907c492SJohn Clements  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
193907c492SJohn Clements  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
203907c492SJohn Clements  * OTHER DEALINGS IN THE SOFTWARE.
213907c492SJohn Clements  *
223907c492SJohn Clements  */
233907c492SJohn Clements #include "amdgpu_ras.h"
243907c492SJohn Clements #include "amdgpu.h"
253907c492SJohn Clements #include "amdgpu_mca.h"
263907c492SJohn Clements 
273907c492SJohn Clements #include "umc/umc_6_7_0_offset.h"
283907c492SJohn Clements #include "umc/umc_6_7_0_sh_mask.h"
293907c492SJohn Clements 
amdgpu_mca_query_correctable_error_count(struct amdgpu_device * adev,uint64_t mc_status_addr,unsigned long * error_count)303907c492SJohn Clements void amdgpu_mca_query_correctable_error_count(struct amdgpu_device *adev,
313907c492SJohn Clements 					      uint64_t mc_status_addr,
323907c492SJohn Clements 					      unsigned long *error_count)
333907c492SJohn Clements {
34640ae42eSJohn Clements 	uint64_t mc_status = RREG64_PCIE(mc_status_addr);
353907c492SJohn Clements 
363907c492SJohn Clements 	if (REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
373907c492SJohn Clements 	    REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
383907c492SJohn Clements 		*error_count += 1;
393907c492SJohn Clements }
403907c492SJohn Clements 
amdgpu_mca_query_uncorrectable_error_count(struct amdgpu_device * adev,uint64_t mc_status_addr,unsigned long * error_count)413907c492SJohn Clements void amdgpu_mca_query_uncorrectable_error_count(struct amdgpu_device *adev,
423907c492SJohn Clements 						uint64_t mc_status_addr,
433907c492SJohn Clements 						unsigned long *error_count)
443907c492SJohn Clements {
45640ae42eSJohn Clements 	uint64_t mc_status = RREG64_PCIE(mc_status_addr);
463907c492SJohn Clements 
473907c492SJohn Clements 	if ((REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
483907c492SJohn Clements 	    (REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
493907c492SJohn Clements 	    REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
503907c492SJohn Clements 	    REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
513907c492SJohn Clements 	    REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
523907c492SJohn Clements 	    REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1))
533907c492SJohn Clements 		*error_count += 1;
543907c492SJohn Clements }
553907c492SJohn Clements 
/*
 * Overwrite the 64-bit MCA status register at @mc_status_addr with zero.
 *
 * @adev:           device the register write is issued against (consumed by
 *                  the WREG64_PCIE accessor macro)
 * @mc_status_addr: address of the MCA status register to clear
 */
void amdgpu_mca_reset_error_count(struct amdgpu_device *adev,
				  uint64_t mc_status_addr)
{
	const uint64_t cleared_status = 0x0ULL;

	WREG64_PCIE(mc_status_addr, cleared_status);
}
613907c492SJohn Clements 
amdgpu_mca_query_ras_error_count(struct amdgpu_device * adev,uint64_t mc_status_addr,void * ras_error_status)623907c492SJohn Clements void amdgpu_mca_query_ras_error_count(struct amdgpu_device *adev,
633907c492SJohn Clements 				      uint64_t mc_status_addr,
643907c492SJohn Clements 				      void *ras_error_status)
653907c492SJohn Clements {
663907c492SJohn Clements 	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
673907c492SJohn Clements 
683907c492SJohn Clements 	amdgpu_mca_query_correctable_error_count(adev, mc_status_addr, &(err_data->ce_count));
693907c492SJohn Clements 	amdgpu_mca_query_uncorrectable_error_count(adev, mc_status_addr, &(err_data->ue_count));
703907c492SJohn Clements 
713907c492SJohn Clements 	amdgpu_mca_reset_error_count(adev, mc_status_addr);
723907c492SJohn Clements }
73*7f544c54SHawking Zhang 
amdgpu_mca_mp0_ras_sw_init(struct amdgpu_device * adev)74*7f544c54SHawking Zhang int amdgpu_mca_mp0_ras_sw_init(struct amdgpu_device *adev)
75*7f544c54SHawking Zhang {
76*7f544c54SHawking Zhang 	int err;
77*7f544c54SHawking Zhang 	struct amdgpu_mca_ras_block *ras;
78*7f544c54SHawking Zhang 
79*7f544c54SHawking Zhang 	if (!adev->mca.mp0.ras)
80*7f544c54SHawking Zhang 		return 0;
81*7f544c54SHawking Zhang 
82*7f544c54SHawking Zhang 	ras = adev->mca.mp0.ras;
83*7f544c54SHawking Zhang 
84*7f544c54SHawking Zhang 	err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
85*7f544c54SHawking Zhang 	if (err) {
86*7f544c54SHawking Zhang 		dev_err(adev->dev, "Failed to register mca.mp0 ras block!\n");
87*7f544c54SHawking Zhang 		return err;
88*7f544c54SHawking Zhang 	}
89*7f544c54SHawking Zhang 
90*7f544c54SHawking Zhang 	strcpy(ras->ras_block.ras_comm.name, "mca.mp0");
91*7f544c54SHawking Zhang 	ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__MCA;
92*7f544c54SHawking Zhang 	ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
93*7f544c54SHawking Zhang 	adev->mca.mp0.ras_if = &ras->ras_block.ras_comm;
94*7f544c54SHawking Zhang 
95*7f544c54SHawking Zhang 	return 0;
96*7f544c54SHawking Zhang }
97*7f544c54SHawking Zhang 
amdgpu_mca_mp1_ras_sw_init(struct amdgpu_device * adev)98*7f544c54SHawking Zhang int amdgpu_mca_mp1_ras_sw_init(struct amdgpu_device *adev)
99*7f544c54SHawking Zhang {
100*7f544c54SHawking Zhang 	int err;
101*7f544c54SHawking Zhang 	struct amdgpu_mca_ras_block *ras;
102*7f544c54SHawking Zhang 
103*7f544c54SHawking Zhang 	if (!adev->mca.mp1.ras)
104*7f544c54SHawking Zhang 		return 0;
105*7f544c54SHawking Zhang 
106*7f544c54SHawking Zhang 	ras = adev->mca.mp1.ras;
107*7f544c54SHawking Zhang 
108*7f544c54SHawking Zhang 	err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
109*7f544c54SHawking Zhang 	if (err) {
110*7f544c54SHawking Zhang 		dev_err(adev->dev, "Failed to register mca.mp1 ras block!\n");
111*7f544c54SHawking Zhang 		return err;
112*7f544c54SHawking Zhang 	}
113*7f544c54SHawking Zhang 
114*7f544c54SHawking Zhang 	strcpy(ras->ras_block.ras_comm.name, "mca.mp1");
115*7f544c54SHawking Zhang 	ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__MCA;
116*7f544c54SHawking Zhang 	ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
117*7f544c54SHawking Zhang 	adev->mca.mp1.ras_if = &ras->ras_block.ras_comm;
118*7f544c54SHawking Zhang 
119*7f544c54SHawking Zhang 	return 0;
120*7f544c54SHawking Zhang }
121*7f544c54SHawking Zhang 
amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device * adev)122*7f544c54SHawking Zhang int amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device *adev)
123*7f544c54SHawking Zhang {
124*7f544c54SHawking Zhang 	int err;
125*7f544c54SHawking Zhang 	struct amdgpu_mca_ras_block *ras;
126*7f544c54SHawking Zhang 
127*7f544c54SHawking Zhang 	if (!adev->mca.mpio.ras)
128*7f544c54SHawking Zhang 		return 0;
129*7f544c54SHawking Zhang 
130*7f544c54SHawking Zhang 	ras = adev->mca.mpio.ras;
131*7f544c54SHawking Zhang 
132*7f544c54SHawking Zhang 	err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
133*7f544c54SHawking Zhang 	if (err) {
134*7f544c54SHawking Zhang 		dev_err(adev->dev, "Failed to register mca.mpio ras block!\n");
135*7f544c54SHawking Zhang 		return err;
136*7f544c54SHawking Zhang 	}
137*7f544c54SHawking Zhang 
138*7f544c54SHawking Zhang 	strcpy(ras->ras_block.ras_comm.name, "mca.mpio");
139*7f544c54SHawking Zhang 	ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__MCA;
140*7f544c54SHawking Zhang 	ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
141*7f544c54SHawking Zhang 	adev->mca.mpio.ras_if = &ras->ras_block.ras_comm;
142*7f544c54SHawking Zhang 
143*7f544c54SHawking Zhang 	return 0;
144*7f544c54SHawking Zhang }
145