12fdb91a2SHawking Zhang /*
22fdb91a2SHawking Zhang * Copyright 2020 Advanced Micro Devices, Inc.
32fdb91a2SHawking Zhang *
42fdb91a2SHawking Zhang * Permission is hereby granted, free of charge, to any person obtaining a
52fdb91a2SHawking Zhang * copy of this software and associated documentation files (the "Software"),
62fdb91a2SHawking Zhang * to deal in the Software without restriction, including without limitation
72fdb91a2SHawking Zhang * the rights to use, copy, modify, merge, publish, distribute, sublicense,
82fdb91a2SHawking Zhang * and/or sell copies of the Software, and to permit persons to whom the
92fdb91a2SHawking Zhang * Software is furnished to do so, subject to the following conditions:
102fdb91a2SHawking Zhang *
112fdb91a2SHawking Zhang * The above copyright notice and this permission notice shall be included in
122fdb91a2SHawking Zhang * all copies or substantial portions of the Software.
132fdb91a2SHawking Zhang *
142fdb91a2SHawking Zhang * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
152fdb91a2SHawking Zhang * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
162fdb91a2SHawking Zhang * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
172fdb91a2SHawking Zhang * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
182fdb91a2SHawking Zhang * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
192fdb91a2SHawking Zhang * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
202fdb91a2SHawking Zhang * OTHER DEALINGS IN THE SOFTWARE.
212fdb91a2SHawking Zhang *
222fdb91a2SHawking Zhang */
232fdb91a2SHawking Zhang #include "amdgpu.h"
24b2459840SHawking Zhang #include "sdma/sdma_4_4_0_offset.h"
25b2459840SHawking Zhang #include "sdma/sdma_4_4_0_sh_mask.h"
26b2459840SHawking Zhang #include "soc15.h"
27b2459840SHawking Zhang #include "amdgpu_ras.h"
28b2459840SHawking Zhang
/*
 * Per-instance displacements that sdma_v4_4_get_reg_offset() adds to the
 * SDMA0 base and an SDMA0 register offset to reach the same register in
 * SDMA instances 1 through 4.
 */
#define SDMA1_REG_OFFSET	0x00600
#define SDMA2_REG_OFFSET	0x1cda0
#define SDMA3_REG_OFFSET	0x1d1a0
#define SDMA4_REG_OFFSET	0x1d5a0
33b2459840SHawking Zhang
34b2459840SHawking Zhang /* helper function that allow only use sdma0 register offset
35b2459840SHawking Zhang * to calculate register offset for all the sdma instances */
sdma_v4_4_get_reg_offset(struct amdgpu_device * adev,uint32_t instance,uint32_t offset)36b2459840SHawking Zhang static uint32_t sdma_v4_4_get_reg_offset(struct amdgpu_device *adev,
37b2459840SHawking Zhang uint32_t instance,
38b2459840SHawking Zhang uint32_t offset)
39b2459840SHawking Zhang {
40b2459840SHawking Zhang uint32_t sdma_base = adev->reg_offset[SDMA0_HWIP][0][0];
41b2459840SHawking Zhang
42b2459840SHawking Zhang switch (instance) {
43b2459840SHawking Zhang case 0:
44b2459840SHawking Zhang return (sdma_base + offset);
45b2459840SHawking Zhang case 1:
46b2459840SHawking Zhang return (sdma_base + SDMA1_REG_OFFSET + offset);
47b2459840SHawking Zhang case 2:
48b2459840SHawking Zhang return (sdma_base + SDMA2_REG_OFFSET + offset);
49b2459840SHawking Zhang case 3:
50b2459840SHawking Zhang return (sdma_base + SDMA3_REG_OFFSET + offset);
51b2459840SHawking Zhang case 4:
52b2459840SHawking Zhang return (sdma_base + SDMA4_REG_OFFSET + offset);
53b2459840SHawking Zhang default:
54b2459840SHawking Zhang break;
55b2459840SHawking Zhang }
56b2459840SHawking Zhang return 0;
57b2459840SHawking Zhang }
58b2459840SHawking Zhang
59b2459840SHawking Zhang static const struct soc15_ras_field_entry sdma_v4_4_ras_fields[] = {
60b2459840SHawking Zhang { "SDMA_MBANK_DATA_BUF0_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER),
61b2459840SHawking Zhang SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF0_SED),
62b2459840SHawking Zhang 0, 0,
63b2459840SHawking Zhang },
64b2459840SHawking Zhang { "SDMA_MBANK_DATA_BUF1_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER),
65b2459840SHawking Zhang SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF1_SED),
66b2459840SHawking Zhang 0, 0,
67b2459840SHawking Zhang },
68b2459840SHawking Zhang { "SDMA_MBANK_DATA_BUF2_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER),
69b2459840SHawking Zhang SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF2_SED),
70b2459840SHawking Zhang 0, 0,
71b2459840SHawking Zhang },
72b2459840SHawking Zhang { "SDMA_MBANK_DATA_BUF3_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER),
73b2459840SHawking Zhang SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF3_SED),
74b2459840SHawking Zhang 0, 0,
75b2459840SHawking Zhang },
76b2459840SHawking Zhang { "SDMA_MBANK_DATA_BUF4_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER),
77b2459840SHawking Zhang SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF4_SED),
78b2459840SHawking Zhang 0, 0,
79b2459840SHawking Zhang },
80b2459840SHawking Zhang { "SDMA_MBANK_DATA_BUF5_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER),
81b2459840SHawking Zhang SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF5_SED),
82b2459840SHawking Zhang 0, 0,
83b2459840SHawking Zhang },
84b2459840SHawking Zhang { "SDMA_MBANK_DATA_BUF6_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER),
85b2459840SHawking Zhang SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF6_SED),
86b2459840SHawking Zhang 0, 0,
87b2459840SHawking Zhang },
88b2459840SHawking Zhang { "SDMA_MBANK_DATA_BUF7_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER),
89b2459840SHawking Zhang SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF7_SED),
90b2459840SHawking Zhang 0, 0,
91b2459840SHawking Zhang },
92b2459840SHawking Zhang { "SDMA_MBANK_DATA_BUF8_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER),
93b2459840SHawking Zhang SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF8_SED),
94b2459840SHawking Zhang 0, 0,
95b2459840SHawking Zhang },
96b2459840SHawking Zhang { "SDMA_MBANK_DATA_BUF9_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER),
97b2459840SHawking Zhang SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF9_SED),
98b2459840SHawking Zhang 0, 0,
99b2459840SHawking Zhang },
100b2459840SHawking Zhang { "SDMA_MBANK_DATA_BUF10_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER),
101b2459840SHawking Zhang SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF10_SED),
102b2459840SHawking Zhang 0, 0,
103b2459840SHawking Zhang },
104b2459840SHawking Zhang { "SDMA_MBANK_DATA_BUF11_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER),
105b2459840SHawking Zhang SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF11_SED),
106b2459840SHawking Zhang 0, 0,
107b2459840SHawking Zhang },
108b2459840SHawking Zhang { "SDMA_MBANK_DATA_BUF12_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER),
109b2459840SHawking Zhang SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF12_SED),
110b2459840SHawking Zhang 0, 0,
111b2459840SHawking Zhang },
112b2459840SHawking Zhang { "SDMA_MBANK_DATA_BUF13_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER),
113b2459840SHawking Zhang SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF13_SED),
114b2459840SHawking Zhang 0, 0,
115b2459840SHawking Zhang },
116b2459840SHawking Zhang { "SDMA_MBANK_DATA_BUF14_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER),
117b2459840SHawking Zhang SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF14_SED),
118b2459840SHawking Zhang 0, 0,
119b2459840SHawking Zhang },
120b2459840SHawking Zhang { "SDMA_MBANK_DATA_BUF15_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER),
121b2459840SHawking Zhang SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF15_SED),
122b2459840SHawking Zhang 0, 0,
123b2459840SHawking Zhang },
124b2459840SHawking Zhang { "SDMA_UCODE_BUF_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER2),
125b2459840SHawking Zhang SOC15_REG_FIELD(SDMA0_EDC_COUNTER2, SDMA_UCODE_BUF_SED),
126b2459840SHawking Zhang 0, 0,
127b2459840SHawking Zhang },
128b2459840SHawking Zhang { "SDMA_RB_CMD_BUF_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER2),
129b2459840SHawking Zhang SOC15_REG_FIELD(SDMA0_EDC_COUNTER2, SDMA_RB_CMD_BUF_SED),
130b2459840SHawking Zhang 0, 0,
131b2459840SHawking Zhang },
132b2459840SHawking Zhang { "SDMA_IB_CMD_BUF_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER2),
133b2459840SHawking Zhang SOC15_REG_FIELD(SDMA0_EDC_COUNTER2, SDMA_IB_CMD_BUF_SED),
134b2459840SHawking Zhang 0, 0,
135b2459840SHawking Zhang },
136b2459840SHawking Zhang { "SDMA_UTCL1_RD_FIFO_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER2),
137b2459840SHawking Zhang SOC15_REG_FIELD(SDMA0_EDC_COUNTER2, SDMA_UTCL1_RD_FIFO_SED),
138b2459840SHawking Zhang 0, 0,
139b2459840SHawking Zhang },
140b2459840SHawking Zhang { "SDMA_UTCL1_RDBST_FIFO_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER2),
141b2459840SHawking Zhang SOC15_REG_FIELD(SDMA0_EDC_COUNTER2, SDMA_UTCL1_RDBST_FIFO_SED),
142b2459840SHawking Zhang 0, 0,
143b2459840SHawking Zhang },
144*64e2e717SStanley.Yang { "SDMA_UTCL1_WR_FIFO_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER2),
145*64e2e717SStanley.Yang SOC15_REG_FIELD(SDMA0_EDC_COUNTER2, SDMA_UTCL1_WR_FIFO_SED),
146*64e2e717SStanley.Yang 0, 0,
147*64e2e717SStanley.Yang },
148b2459840SHawking Zhang { "SDMA_DATA_LUT_FIFO_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER2),
149b2459840SHawking Zhang SOC15_REG_FIELD(SDMA0_EDC_COUNTER2, SDMA_DATA_LUT_FIFO_SED),
150b2459840SHawking Zhang 0, 0,
151b2459840SHawking Zhang },
152b2459840SHawking Zhang { "SDMA_SPLIT_DATA_BUF_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER2),
153b2459840SHawking Zhang SOC15_REG_FIELD(SDMA0_EDC_COUNTER2, SDMA_SPLIT_DATA_BUF_SED),
154b2459840SHawking Zhang 0, 0,
155b2459840SHawking Zhang },
156b2459840SHawking Zhang { "SDMA_MC_WR_ADDR_FIFO_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER2),
157b2459840SHawking Zhang SOC15_REG_FIELD(SDMA0_EDC_COUNTER2, SDMA_MC_WR_ADDR_FIFO_SED),
158b2459840SHawking Zhang 0, 0,
159b2459840SHawking Zhang },
160b2459840SHawking Zhang { "SDMA_MC_RDRET_BUF_SED", SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER2),
161b2459840SHawking Zhang SOC15_REG_FIELD(SDMA0_EDC_COUNTER2, SDMA_MC_WR_ADDR_FIFO_SED),
162b2459840SHawking Zhang 0, 0,
163b2459840SHawking Zhang },
164b2459840SHawking Zhang };
165b2459840SHawking Zhang
/*
 * Decode one raw EDC counter register value of a single SDMA instance.
 *
 * @adev:       device, used for logging
 * @reg_offset: SDMA0 offset of the register @value was read from; selects
 *              which entries of sdma_v4_4_ras_fields[] apply
 * @value:      raw register contents
 * @instance:   SDMA instance number, only used in the log message
 * @sec_count:  running total; every non-zero field count is added to it
 *
 * Only the SED fields are examined; double (multiple) bit error detection
 * is not supported.
 */
static void sdma_v4_4_get_ras_error_count(struct amdgpu_device *adev,
					  uint32_t reg_offset,
					  uint32_t value,
					  uint32_t instance,
					  uint32_t *sec_count)
{
	const struct soc15_ras_field_entry *field;
	uint32_t idx;
	uint32_t hits;

	for (idx = 0; idx < ARRAY_SIZE(sdma_v4_4_ras_fields); idx++) {
		field = &sdma_v4_4_ras_fields[idx];

		/* skip fields that live in the other EDC counter register */
		if (field->reg_offset != reg_offset)
			continue;

		/*
		 * The mask/shift defined for SDMA0 is valid for every
		 * instance, since all instances share the field layout.
		 */
		hits = (value & field->sec_count_mask) >> field->sec_count_shift;
		if (!hits)
			continue;

		dev_info(adev->dev, "Detected %s in SDMA%d, SED %d\n",
			 field->name, instance, hits);
		*sec_count += hits;
	}
}
194b2459840SHawking Zhang
/*
 * Read both EDC counter registers of one SDMA instance and fold the
 * detected errors into the caller's error data.
 *
 * @adev:             device to query
 * @instance:         SDMA instance number
 * @ras_error_status: points to a struct ras_err_data; its ue_count must
 *                    have been initialized by the caller, since this
 *                    function only adds to it
 *
 * Always returns 0.
 */
static int sdma_v4_4_query_ras_error_count_by_instance(struct amdgpu_device *adev,
						       uint32_t instance,
						       void *ras_error_status)
{
	struct ras_err_data *err_data = ras_error_status;
	uint32_t sec_count = 0;
	uint32_t reg_offset;
	uint32_t reg_value;

	/* double bit error is not supported, so only SED fields are decoded */
	reg_offset = sdma_v4_4_get_reg_offset(adev, instance, regSDMA0_EDC_COUNTER);
	reg_value = RREG32(reg_offset);
	if (reg_value)
		sdma_v4_4_get_ras_error_count(adev, regSDMA0_EDC_COUNTER, reg_value,
					      instance, &sec_count);

	reg_offset = sdma_v4_4_get_reg_offset(adev, instance, regSDMA0_EDC_COUNTER2);
	reg_value = RREG32(reg_offset);
	if (reg_value)
		sdma_v4_4_get_ras_error_count(adev, regSDMA0_EDC_COUNTER2, reg_value,
					      instance, &sec_count);

	/*
	 * SDMA RAS supports single bit uncorrectable error detection, so
	 * everything counted above is reported as uncorrectable.
	 */
	err_data->ue_count += sec_count;

	/* SDMA RAS does not support correctable errors. */
	err_data->ce_count = 0;

	return 0;
}
2352fdb91a2SHawking Zhang
sdma_v4_4_reset_ras_error_count(struct amdgpu_device * adev)236f5f0e4a0SHawking Zhang static void sdma_v4_4_reset_ras_error_count(struct amdgpu_device *adev)
237f5f0e4a0SHawking Zhang {
238f5f0e4a0SHawking Zhang int i;
239f5f0e4a0SHawking Zhang uint32_t reg_offset;
240f5f0e4a0SHawking Zhang
241f5f0e4a0SHawking Zhang /* write 0 to EDC_COUNTER reg to clear sdma edc counters */
242f5f0e4a0SHawking Zhang if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) {
243f5f0e4a0SHawking Zhang for (i = 0; i < adev->sdma.num_instances; i++) {
244f5f0e4a0SHawking Zhang reg_offset = sdma_v4_4_get_reg_offset(adev, i, regSDMA0_EDC_COUNTER);
245f5f0e4a0SHawking Zhang WREG32(reg_offset, 0);
246f5f0e4a0SHawking Zhang reg_offset = sdma_v4_4_get_reg_offset(adev, i, regSDMA0_EDC_COUNTER2);
247f5f0e4a0SHawking Zhang WREG32(reg_offset, 0);
248f5f0e4a0SHawking Zhang }
249f5f0e4a0SHawking Zhang }
250f5f0e4a0SHawking Zhang }
251f5f0e4a0SHawking Zhang
sdma_v4_4_query_ras_error_count(struct amdgpu_device * adev,void * ras_error_status)252bdc4292bSyipechai static void sdma_v4_4_query_ras_error_count(struct amdgpu_device *adev, void *ras_error_status)
253bdc4292bSyipechai {
254bdc4292bSyipechai int i = 0;
2558697a19eSyipechai
256bdc4292bSyipechai for (i = 0; i < adev->sdma.num_instances; i++) {
2578697a19eSyipechai if (sdma_v4_4_query_ras_error_count_by_instance(adev, i, ras_error_status)) {
258bdc4292bSyipechai dev_err(adev->dev, "Query ras error count failed in SDMA%d\n", i);
259bdc4292bSyipechai return;
260bdc4292bSyipechai }
261bdc4292bSyipechai }
262bdc4292bSyipechai
263bdc4292bSyipechai }
264bdc4292bSyipechai
265bdc4292bSyipechai const struct amdgpu_ras_block_hw_ops sdma_v4_4_ras_hw_ops = {
266b2459840SHawking Zhang .query_ras_error_count = sdma_v4_4_query_ras_error_count,
267f5f0e4a0SHawking Zhang .reset_ras_error_count = sdma_v4_4_reset_ras_error_count,
2682fdb91a2SHawking Zhang };
269bdc4292bSyipechai
270bdc4292bSyipechai struct amdgpu_sdma_ras sdma_v4_4_ras = {
271bdc4292bSyipechai .ras_block = {
272bdc4292bSyipechai .hw_ops = &sdma_v4_4_ras_hw_ops,
273bdc4292bSyipechai },
274bdc4292bSyipechai };
275