1c030f2e4Sxinhui pan /*
2c030f2e4Sxinhui pan * Copyright 2018 Advanced Micro Devices, Inc.
3c030f2e4Sxinhui pan *
4c030f2e4Sxinhui pan * Permission is hereby granted, free of charge, to any person obtaining a
5c030f2e4Sxinhui pan * copy of this software and associated documentation files (the "Software"),
6c030f2e4Sxinhui pan * to deal in the Software without restriction, including without limitation
7c030f2e4Sxinhui pan * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8c030f2e4Sxinhui pan * and/or sell copies of the Software, and to permit persons to whom the
9c030f2e4Sxinhui pan * Software is furnished to do so, subject to the following conditions:
10c030f2e4Sxinhui pan *
11c030f2e4Sxinhui pan * The above copyright notice and this permission notice shall be included in
12c030f2e4Sxinhui pan * all copies or substantial portions of the Software.
13c030f2e4Sxinhui pan *
14c030f2e4Sxinhui pan * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15c030f2e4Sxinhui pan * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16c030f2e4Sxinhui pan * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17c030f2e4Sxinhui pan * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18c030f2e4Sxinhui pan * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19c030f2e4Sxinhui pan * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20c030f2e4Sxinhui pan * OTHER DEALINGS IN THE SOFTWARE.
21c030f2e4Sxinhui pan *
22c030f2e4Sxinhui pan *
23c030f2e4Sxinhui pan */
24c030f2e4Sxinhui pan #include <linux/debugfs.h>
25c030f2e4Sxinhui pan #include <linux/list.h>
26c030f2e4Sxinhui pan #include <linux/module.h>
27f867723bSSam Ravnborg #include <linux/uaccess.h>
287c6e68c7SAndrey Grodzovsky #include <linux/reboot.h>
297c6e68c7SAndrey Grodzovsky #include <linux/syscalls.h>
3005adfd80SLuben Tuikov #include <linux/pm_runtime.h>
31f867723bSSam Ravnborg
32c030f2e4Sxinhui pan #include "amdgpu.h"
33c030f2e4Sxinhui pan #include "amdgpu_ras.h"
34b404ae82Sxinhui pan #include "amdgpu_atomfirmware.h"
3519744f5fSHawking Zhang #include "amdgpu_xgmi.h"
364e644fffSHawking Zhang #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
379af357bcSHawking Zhang #include "nbio_v4_3.h"
387692e1eeSTao Zhou #include "nbio_v7_9.h"
39f50160cfSStanley.Yang #include "atom.h"
4025a2b22eSAndrey Grodzovsky #include "amdgpu_reset.h"
4125a2b22eSAndrey Grodzovsky
4212b2cab7SMukul Joshi #ifdef CONFIG_X86_MCE_AMD
4312b2cab7SMukul Joshi #include <asm/mce.h>
44c030f2e4Sxinhui pan
4512b2cab7SMukul Joshi static bool notifier_registered;
4612b2cab7SMukul Joshi #endif
47eb0c3cd4SGuchun Chen static const char *RAS_FS_NAME = "ras";
48eb0c3cd4SGuchun Chen
/* Human-readable names for the RAS error types; indexed via ras_err_str()
 * using ffs() on a one-hot error type mask.
 */
const char *ras_error_string[] = {
	"none",
	"parity",
	"single_correctable",
	"multi_uncorrectable",
	"poison",
};
56c030f2e4Sxinhui pan
/* Human-readable names for the RAS IP blocks.
 * NOTE(review): entry order must match the AMDGPU_RAS_BLOCK__* enum —
 * confirm against amdgpu_ras.h when adding entries.
 */
const char *ras_block_string[] = {
	"umc",
	"sdma",
	"gfx",
	"mmhub",
	"athub",
	"pcie_bif",
	"hdp",
	"xgmi_wafl",
	"df",
	"smn",
	"sem",
	"mp0",
	"mp1",
	"fuse",
	"mca",
	"vcn",
	"jpeg",
};
76c030f2e4Sxinhui pan
/* Human-readable names for the MCA sub-blocks.
 * NOTE(review): order must match the AMDGPU_RAS_MCA_BLOCK__* enum — confirm.
 */
const char *ras_mca_block_string[] = {
	"mca_mp0",
	"mca_mp1",
	"mca_mpio",
	"mca_iohc",
};
83640ae42eSJohn Clements
/* List node tying one registered RAS block object into the per-device
 * list of RAS blocks.
 */
struct amdgpu_ras_block_list {
	/* ras block link */
	struct list_head node;

	/* the block object this node carries; ownership stays with the block */
	struct amdgpu_ras_block_object *ras_obj;
};
90d5e8ff5fSyipechai
get_ras_block_str(struct ras_common_if * ras_block)91640ae42eSJohn Clements const char *get_ras_block_str(struct ras_common_if *ras_block)
92640ae42eSJohn Clements {
93640ae42eSJohn Clements if (!ras_block)
94640ae42eSJohn Clements return "NULL";
95640ae42eSJohn Clements
96640ae42eSJohn Clements if (ras_block->block >= AMDGPU_RAS_BLOCK_COUNT)
97640ae42eSJohn Clements return "OUT OF RANGE";
98640ae42eSJohn Clements
99640ae42eSJohn Clements if (ras_block->block == AMDGPU_RAS_BLOCK__MCA)
100640ae42eSJohn Clements return ras_mca_block_string[ras_block->sub_block_index];
101640ae42eSJohn Clements
102640ae42eSJohn Clements return ras_block_string[ras_block->block];
103640ae42eSJohn Clements }
104640ae42eSJohn Clements
/* Name lookup by raw block id, with a bounds check (contrast with
 * get_ras_block_str() which takes a ras_common_if).
 */
#define ras_block_str(_BLOCK_) \
	(((_BLOCK_) < ARRAY_SIZE(ras_block_string)) ? ras_block_string[_BLOCK_] : "Out Of Range")

/* Map a one-hot error-type mask to its name via ffs().
 * NOTE(review): no bounds check here — assumes @i is a valid type mask.
 */
#define ras_err_str(i) (ras_error_string[ffs(i)])

#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)

/* inject address is 52 bits */
#define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52)

/* typical ECC bad page rate is 1 bad page per 100MB VRAM */
#define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL)
117c84d4670SGuchun Chen
/* Bookkeeping states for VRAM pages retired because of ECC errors. */
enum amdgpu_ras_retire_page_reservation {
	AMDGPU_RAS_RETIRE_PAGE_RESERVED,
	AMDGPU_RAS_RETIRE_PAGE_PENDING,
	AMDGPU_RAS_RETIRE_PAGE_FAULT,
};

/* NOTE(review): presumably non-zero while a RAS fatal-error interrupt is
 * being serviced; set/cleared outside this chunk — confirm at its writers.
 */
atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
1257c6e68c7SAndrey Grodzovsky
/* Forward declarations for the bad-page helpers defined later in this file. */
static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
				uint64_t addr);
static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
				uint64_t addr);
#ifdef CONFIG_X86_MCE_AMD
static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev);
/* devices registered for x86 MCE (machine check) notifications */
struct mce_notifier_adev_list {
	struct amdgpu_device *devs[MAX_GPU_INSTANCE];
	int num_gpu;
};
static struct mce_notifier_adev_list mce_adev_list;
#endif
1386e4be987STao Zhou
amdgpu_ras_set_error_query_ready(struct amdgpu_device * adev,bool ready)13961380faaSJohn Clements void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready)
14061380faaSJohn Clements {
141a9d82d2fSEvan Quan if (adev && amdgpu_ras_get_context(adev))
14261380faaSJohn Clements amdgpu_ras_get_context(adev)->error_query_ready = ready;
14361380faaSJohn Clements }
14461380faaSJohn Clements
amdgpu_ras_get_error_query_ready(struct amdgpu_device * adev)145f3167919SNirmoy Das static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev)
14661380faaSJohn Clements {
147a9d82d2fSEvan Quan if (adev && amdgpu_ras_get_context(adev))
14861380faaSJohn Clements return amdgpu_ras_get_context(adev)->error_query_ready;
14961380faaSJohn Clements
15061380faaSJohn Clements return false;
15161380faaSJohn Clements }
15261380faaSJohn Clements
/* Directly mark the VRAM page at @address (a VRAM offset) as bad and,
 * when the bad-page threshold mechanism is enabled, persist the record
 * to the RAS EEPROM. Debug/test path only — see the warnings it prints.
 *
 * Returns 0 on success or when the page was already retired, -EINVAL for
 * an out-of-range address.
 */
static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t address)
{
	struct eeprom_table_record err_rec;
	struct ras_err_data err_data = {0, 0, 0, NULL};

	/* reject offsets beyond VRAM or beyond the 52-bit inject limit */
	if ((address >= adev->gmc.mc_vram_size) ||
	    (address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
		dev_warn(adev->dev,
			 "RAS WARN: input address 0x%llx is invalid.\n",
			 address);
		return -EINVAL;
	}

	/* nothing to do when the page is already on the bad-page list */
	if (amdgpu_ras_check_bad_page(adev, address)) {
		dev_warn(adev->dev,
			 "RAS WARN: 0x%llx has already been marked as bad page!\n",
			 address);
		return 0;
	}

	memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
	err_data.err_addr = &err_rec;
	amdgpu_umc_fill_error_record(&err_data, address, address, 0, 0);

	if (amdgpu_bad_page_threshold != 0) {
		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
					 err_data.err_addr_cnt);
		amdgpu_ras_save_bad_pages(adev, NULL);
	}

	dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n");
	dev_warn(adev->dev, "Clear EEPROM:\n");
	dev_warn(adev->dev, " echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n");

	return 0;
}
189cbb8f989SJohn Clements
amdgpu_ras_debugfs_read(struct file * f,char __user * buf,size_t size,loff_t * pos)190c030f2e4Sxinhui pan static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
191c030f2e4Sxinhui pan size_t size, loff_t *pos)
192c030f2e4Sxinhui pan {
193c030f2e4Sxinhui pan struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
194c030f2e4Sxinhui pan struct ras_query_if info = {
195c030f2e4Sxinhui pan .head = obj->head,
196c030f2e4Sxinhui pan };
197c030f2e4Sxinhui pan ssize_t s;
198c030f2e4Sxinhui pan char val[128];
199c030f2e4Sxinhui pan
200761d86d3SDennis Li if (amdgpu_ras_query_error_status(obj->adev, &info))
201c030f2e4Sxinhui pan return -EINVAL;
202c030f2e4Sxinhui pan
2032a460963SCandice Li /* Hardware counter will be reset automatically after the query on Vega20 and Arcturus */
2042a460963SCandice Li if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
2052a460963SCandice Li obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
2062a460963SCandice Li if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
2072a460963SCandice Li dev_warn(obj->adev->dev, "Failed to reset error counter and error status");
2082a460963SCandice Li }
2092a460963SCandice Li
210c030f2e4Sxinhui pan s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
211c030f2e4Sxinhui pan "ue", info.ue_count,
212c030f2e4Sxinhui pan "ce", info.ce_count);
213c030f2e4Sxinhui pan if (*pos >= s)
214c030f2e4Sxinhui pan return 0;
215c030f2e4Sxinhui pan
216c030f2e4Sxinhui pan s -= *pos;
217c030f2e4Sxinhui pan s = min_t(u64, s, size);
218c030f2e4Sxinhui pan
219c030f2e4Sxinhui pan
220c030f2e4Sxinhui pan if (copy_to_user(buf, &val[*pos], s))
221c030f2e4Sxinhui pan return -EINVAL;
222c030f2e4Sxinhui pan
223c030f2e4Sxinhui pan *pos += s;
224c030f2e4Sxinhui pan
225c030f2e4Sxinhui pan return s;
226c030f2e4Sxinhui pan }
227c030f2e4Sxinhui pan
/* read-only debugfs node; one instance is created per RAS-enabled block */
static const struct file_operations amdgpu_ras_debugfs_ops = {
	.owner = THIS_MODULE,
	.read = amdgpu_ras_debugfs_read,
	.write = NULL,
	.llseek = default_llseek
};
234c030f2e4Sxinhui pan
amdgpu_ras_find_block_id_by_name(const char * name,int * block_id)23596ebb307Sxinhui pan static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
23696ebb307Sxinhui pan {
23796ebb307Sxinhui pan int i;
23896ebb307Sxinhui pan
23996ebb307Sxinhui pan for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
24096ebb307Sxinhui pan *block_id = i;
241640ae42eSJohn Clements if (strcmp(name, ras_block_string[i]) == 0)
24296ebb307Sxinhui pan return 0;
24396ebb307Sxinhui pan }
24496ebb307Sxinhui pan return -EINVAL;
24596ebb307Sxinhui pan }
24696ebb307Sxinhui pan
/* Parse a command written to the ras_ctrl debugfs node into @data.
 *
 * Accepted text forms (see the DOC comment below for full syntax):
 *   "disable <block>"
 *   "enable <block> <error>"
 *   "inject <block> <error> <sub_block> <address> <value> [<instance_mask>]"
 *   "retire_page <address>"
 * A write that does not look like ASCII text is instead interpreted as a
 * raw struct ras_debug_if image.
 *
 * Returns 0 on success, -EINVAL on malformed input. The whole write is
 * consumed up front (*pos is advanced to @size); partial writes are not
 * supported.
 */
static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
		const char __user *buf, size_t size,
		loff_t *pos, struct ras_debug_if *data)
{
	ssize_t s = min_t(u64, 64, size);
	char str[65];
	char block_name[33];
	char err[9] = "ue";
	int op = -1;
	int block_id;
	uint32_t sub_block;
	u64 address, value;
	/* default value is 0 if the mask is not set by user */
	u32 instance_mask = 0;

	if (*pos)
		return -EINVAL;
	*pos = size;

	memset(str, 0, sizeof(str));
	memset(data, 0, sizeof(*data));

	if (copy_from_user(str, buf, s))
		return -EINVAL;

	if (sscanf(str, "disable %32s", block_name) == 1)
		op = 0;
	else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
		op = 1;
	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
		op = 2;
	else if (strstr(str, "retire_page") != NULL)
		op = 3;
	else if (str[0] && str[1] && str[2] && str[3])
		/* ascii string, but commands are not matched. */
		return -EINVAL;

	if (op != -1) {
		if (op == 3) {
			/* retire_page takes a single address, hex or decimal */
			if (sscanf(str, "%*s 0x%llx", &address) != 1 &&
			    sscanf(str, "%*s %llu", &address) != 1)
				return -EINVAL;

			data->op = op;
			data->inject.address = address;

			return 0;
		}

		if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
			return -EINVAL;

		data->head.block = block_id;
		/* only ue and ce errors are supported */
		if (!memcmp("ue", err, 2))
			data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
		else if (!memcmp("ce", err, 2))
			data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
		else
			return -EINVAL;

		data->op = op;

		if (op == 2) {
			/* try the 4-field forms (with instance mask) first,
			 * then the older 3-field forms; hex then decimal for
			 * each — order matters, do not reorder these
			 */
			if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx 0x%x",
				   &sub_block, &address, &value, &instance_mask) != 4 &&
			    sscanf(str, "%*s %*s %*s %u %llu %llu %u",
				   &sub_block, &address, &value, &instance_mask) != 4 &&
			    sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
				   &sub_block, &address, &value) != 3 &&
			    sscanf(str, "%*s %*s %*s %u %llu %llu",
				   &sub_block, &address, &value) != 3)
				return -EINVAL;
			data->head.sub_block_index = sub_block;
			data->inject.address = address;
			data->inject.value = value;
			data->inject.instance_mask = instance_mask;
		}
	} else {
		/* not a text command: accept a raw struct ras_debug_if */
		if (size < sizeof(*data))
			return -EINVAL;

		if (copy_from_user(data, buf, sizeof(*data)))
			return -EINVAL;
	}

	return 0;
}
3357c6e68c7SAndrey Grodzovsky
amdgpu_ras_instance_mask_check(struct amdgpu_device * adev,struct ras_debug_if * data)336f464c5ddSTao Zhou static void amdgpu_ras_instance_mask_check(struct amdgpu_device *adev,
337f464c5ddSTao Zhou struct ras_debug_if *data)
338f464c5ddSTao Zhou {
339f464c5ddSTao Zhou int num_xcc = adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask) : 1;
340f464c5ddSTao Zhou uint32_t mask, inst_mask = data->inject.instance_mask;
341f464c5ddSTao Zhou
342f464c5ddSTao Zhou /* no need to set instance mask if there is only one instance */
343f464c5ddSTao Zhou if (num_xcc <= 1 && inst_mask) {
344f464c5ddSTao Zhou data->inject.instance_mask = 0;
345f464c5ddSTao Zhou dev_dbg(adev->dev,
346f464c5ddSTao Zhou "RAS inject mask(0x%x) isn't supported and force it to 0.\n",
347f464c5ddSTao Zhou inst_mask);
348f464c5ddSTao Zhou
349f464c5ddSTao Zhou return;
350f464c5ddSTao Zhou }
351f464c5ddSTao Zhou
352f464c5ddSTao Zhou switch (data->head.block) {
353f464c5ddSTao Zhou case AMDGPU_RAS_BLOCK__GFX:
354f464c5ddSTao Zhou mask = GENMASK(num_xcc - 1, 0);
355f464c5ddSTao Zhou break;
356f464c5ddSTao Zhou case AMDGPU_RAS_BLOCK__SDMA:
357f464c5ddSTao Zhou mask = GENMASK(adev->sdma.num_instances - 1, 0);
358f464c5ddSTao Zhou break;
359e3959cb5SStanley.Yang case AMDGPU_RAS_BLOCK__VCN:
360e3959cb5SStanley.Yang case AMDGPU_RAS_BLOCK__JPEG:
361e3959cb5SStanley.Yang mask = GENMASK(adev->vcn.num_vcn_inst - 1, 0);
362e3959cb5SStanley.Yang break;
363f464c5ddSTao Zhou default:
364e3959cb5SStanley.Yang mask = inst_mask;
365f464c5ddSTao Zhou break;
366f464c5ddSTao Zhou }
367f464c5ddSTao Zhou
368f464c5ddSTao Zhou /* remove invalid bits in instance mask */
369f464c5ddSTao Zhou data->inject.instance_mask &= mask;
370f464c5ddSTao Zhou if (inst_mask != data->inject.instance_mask)
371f464c5ddSTao Zhou dev_dbg(adev->dev,
372f464c5ddSTao Zhou "Adjust RAS inject mask 0x%x to 0x%x\n",
373f464c5ddSTao Zhou inst_mask, data->inject.instance_mask);
374f464c5ddSTao Zhou }
375f464c5ddSTao Zhou
37674abc221STom St Denis /**
37774abc221STom St Denis * DOC: AMDGPU RAS debugfs control interface
37836ea1bd2Sxinhui pan *
379737c375bSLuben Tuikov * The control interface accepts struct ras_debug_if which has two members.
38036ea1bd2Sxinhui pan *
38136ea1bd2Sxinhui pan * First member: ras_debug_if::head or ras_debug_if::inject.
38296ebb307Sxinhui pan *
38396ebb307Sxinhui pan * head is used to indicate which IP block will be under control.
38436ea1bd2Sxinhui pan *
38536ea1bd2Sxinhui pan * head has four members, they are block, type, sub_block_index, name.
38636ea1bd2Sxinhui pan * block: which IP will be under control.
38736ea1bd2Sxinhui pan * type: what kind of error will be enabled/disabled/injected.
 * sub_block_index: some IPs have subcomponents, e.g., GFX, SDMA.
38936ea1bd2Sxinhui pan * name: the name of IP.
39036ea1bd2Sxinhui pan *
3912c22ed0bSTao Zhou * inject has three more members than head, they are address, value and mask.
39236ea1bd2Sxinhui pan * As their names indicate, inject operation will write the
39336ea1bd2Sxinhui pan * value to the address.
39436ea1bd2Sxinhui pan *
395ef177d11SAlex Deucher * The second member: struct ras_debug_if::op.
396c688a06bSGuchun Chen * It has three kinds of operations.
397879e723dSAdam Zerella *
398879e723dSAdam Zerella * - 0: disable RAS on the block. Take ::head as its data.
399879e723dSAdam Zerella * - 1: enable RAS on the block. Take ::head as its data.
400879e723dSAdam Zerella * - 2: inject errors on the block. Take ::inject as its data.
40136ea1bd2Sxinhui pan *
40296ebb307Sxinhui pan * How to use the interface?
403ef177d11SAlex Deucher *
404737c375bSLuben Tuikov * In a program
405ef177d11SAlex Deucher *
406737c375bSLuben Tuikov * Copy the struct ras_debug_if in your code and initialize it.
407737c375bSLuben Tuikov * Write the struct to the control interface.
408ef177d11SAlex Deucher *
409737c375bSLuben Tuikov * From shell
41096ebb307Sxinhui pan *
411879e723dSAdam Zerella * .. code-block:: bash
412879e723dSAdam Zerella *
413737c375bSLuben Tuikov * echo "disable <block>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
414737c375bSLuben Tuikov * echo "enable <block> <error>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
4152c22ed0bSTao Zhou * echo "inject <block> <error> <sub-block> <address> <value> <mask>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
416879e723dSAdam Zerella *
417737c375bSLuben Tuikov * Where N, is the card which you want to affect.
418ef177d11SAlex Deucher *
419737c375bSLuben Tuikov * "disable" requires only the block.
420737c375bSLuben Tuikov * "enable" requires the block and error type.
421737c375bSLuben Tuikov * "inject" requires the block, error type, address, and value.
422c666bbf0SDwaipayan Ray *
423737c375bSLuben Tuikov * The block is one of: umc, sdma, gfx, etc.
42496ebb307Sxinhui pan * see ras_block_string[] for details
425c666bbf0SDwaipayan Ray *
426737c375bSLuben Tuikov * The error type is one of: ue, ce, where,
427737c375bSLuben Tuikov * ue is multi-uncorrectable
428737c375bSLuben Tuikov * ce is single-correctable
429c666bbf0SDwaipayan Ray *
 * The sub-block is the sub-block index, pass 0 if there is no sub-block.
431737c375bSLuben Tuikov * The address and value are hexadecimal numbers, leading 0x is optional.
4322c22ed0bSTao Zhou * The mask means instance mask, is optional, default value is 0x1.
43396ebb307Sxinhui pan *
434737c375bSLuben Tuikov * For instance,
435879e723dSAdam Zerella *
436879e723dSAdam Zerella * .. code-block:: bash
437879e723dSAdam Zerella *
43844494f96STao Zhou * echo inject umc ue 0x0 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
4392c22ed0bSTao Zhou * echo inject umc ce 0 0 0 3 > /sys/kernel/debug/dri/0/ras/ras_ctrl
44096ebb307Sxinhui pan * echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
44196ebb307Sxinhui pan *
442737c375bSLuben Tuikov * How to check the result of the operation?
44336ea1bd2Sxinhui pan *
444737c375bSLuben Tuikov * To check disable/enable, see "ras" features at,
44536ea1bd2Sxinhui pan * /sys/class/drm/card[0/1/2...]/device/ras/features
44636ea1bd2Sxinhui pan *
447737c375bSLuben Tuikov * To check inject, see the corresponding error count at,
448737c375bSLuben Tuikov * /sys/class/drm/card[0/1/2...]/device/ras/[gfx|sdma|umc|...]_err_count
44936ea1bd2Sxinhui pan *
450879e723dSAdam Zerella * .. note::
451ef177d11SAlex Deucher * Operations are only allowed on blocks which are supported.
452737c375bSLuben Tuikov * Check the "ras" mask at /sys/module/amdgpu/parameters/ras_mask
453ef177d11SAlex Deucher * to see which blocks support RAS on a particular asic.
454ef177d11SAlex Deucher *
45536ea1bd2Sxinhui pan */
/* Write handler for the ras_ctrl debugfs node; see the DOC comment above
 * for the accepted command syntax.
 *
 * Returns @size on success (the write is always fully consumed, including
 * the "query not ready" case, which is reported only via dev_warn), or a
 * negative errno on failure.
 */
static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
		const char __user *buf,
		size_t size, loff_t *pos)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
	struct ras_debug_if data;
	int ret = 0;

	if (!amdgpu_ras_get_error_query_ready(adev)) {
		dev_warn(adev->dev, "RAS WARN: error injection "
				"currently inaccessible\n");
		return size;
	}

	ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
	if (ret)
		return ret;

	/* op 3 == retire_page: bypasses the per-block support check below */
	if (data.op == 3) {
		ret = amdgpu_reserve_page_direct(adev, data.inject.address);
		if (!ret)
			return size;
		else
			return ret;
	}

	if (!amdgpu_ras_is_supported(adev, data.head.block))
		return -EINVAL;

	switch (data.op) {
	case 0:
		ret = amdgpu_ras_feature_enable(adev, &data.head, 0);
		break;
	case 1:
		ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
		break;
	case 2:
		/* reject addresses beyond VRAM (when the VRAM size is known)
		 * or beyond the 52-bit inject limit
		 */
		if ((data.inject.address >= adev->gmc.mc_vram_size &&
		    adev->gmc.mc_vram_size) ||
		    (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
			dev_warn(adev->dev, "RAS WARN: input address "
					"0x%llx is invalid.",
					data.inject.address);
			ret = -EINVAL;
			break;
		}

		/* umc ce/ue error injection for a bad page is not allowed;
		 * note this break leaves ret == 0, so the write is reported
		 * as consumed
		 */
		if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) &&
		    amdgpu_ras_check_bad_page(adev, data.inject.address)) {
			dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has "
					"already been marked as bad!\n",
					data.inject.address);
			break;
		}

		amdgpu_ras_instance_mask_check(adev, &data);

		/* data.inject.address is offset instead of absolute gpu address */
		ret = amdgpu_ras_error_inject(adev, &data.inject);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	if (ret)
		return ret;

	return size;
}
52736ea1bd2Sxinhui pan
528084fe13bSAndrey Grodzovsky /**
529084fe13bSAndrey Grodzovsky * DOC: AMDGPU RAS debugfs EEPROM table reset interface
530084fe13bSAndrey Grodzovsky *
531f77c7109SAlex Deucher * Some boards contain an EEPROM which is used to persistently store a list of
 * bad pages which experienced ECC errors in vram. This interface provides
533f77c7109SAlex Deucher * a way to reset the EEPROM, e.g., after testing error injection.
534f77c7109SAlex Deucher *
535f77c7109SAlex Deucher * Usage:
536f77c7109SAlex Deucher *
537f77c7109SAlex Deucher * .. code-block:: bash
538f77c7109SAlex Deucher *
539f77c7109SAlex Deucher * echo 1 > ../ras/ras_eeprom_reset
540f77c7109SAlex Deucher *
541f77c7109SAlex Deucher * will reset EEPROM table to 0 entries.
542f77c7109SAlex Deucher *
543084fe13bSAndrey Grodzovsky */
amdgpu_ras_debugfs_eeprom_write(struct file * f,const char __user * buf,size_t size,loff_t * pos)544cf696091SLuben Tuikov static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f,
545cf696091SLuben Tuikov const char __user *buf,
546084fe13bSAndrey Grodzovsky size_t size, loff_t *pos)
547084fe13bSAndrey Grodzovsky {
548bf0b91b7SGuchun Chen struct amdgpu_device *adev =
549bf0b91b7SGuchun Chen (struct amdgpu_device *)file_inode(f)->i_private;
550084fe13bSAndrey Grodzovsky int ret;
551084fe13bSAndrey Grodzovsky
552bf0b91b7SGuchun Chen ret = amdgpu_ras_eeprom_reset_table(
553bf0b91b7SGuchun Chen &(amdgpu_ras_get_context(adev)->eeprom_control));
554084fe13bSAndrey Grodzovsky
55563d4c081SLuben Tuikov if (!ret) {
556cf696091SLuben Tuikov /* Something was written to EEPROM.
557cf696091SLuben Tuikov */
558bf0b91b7SGuchun Chen amdgpu_ras_get_context(adev)->flags = RAS_DEFAULT_FLAGS;
559bf0b91b7SGuchun Chen return size;
560bf0b91b7SGuchun Chen } else {
561cf696091SLuben Tuikov return ret;
562bf0b91b7SGuchun Chen }
563084fe13bSAndrey Grodzovsky }
564084fe13bSAndrey Grodzovsky
/* write-only debugfs node accepting RAS control commands (see DOC above) */
static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
	.owner = THIS_MODULE,
	.read = NULL,
	.write = amdgpu_ras_debugfs_ctrl_write,
	.llseek = default_llseek
};

/* write-only debugfs node that resets the RAS EEPROM bad-page table */
static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
	.owner = THIS_MODULE,
	.read = NULL,
	.write = amdgpu_ras_debugfs_eeprom_write,
	.llseek = default_llseek
};
578084fe13bSAndrey Grodzovsky
579f77c7109SAlex Deucher /**
580f77c7109SAlex Deucher * DOC: AMDGPU RAS sysfs Error Count Interface
581f77c7109SAlex Deucher *
582ef177d11SAlex Deucher * It allows the user to read the error count for each IP block on the gpu through
583f77c7109SAlex Deucher * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
584f77c7109SAlex Deucher *
585f77c7109SAlex Deucher * It outputs the multiple lines which report the uncorrected (ue) and corrected
586f77c7109SAlex Deucher * (ce) error counts.
587f77c7109SAlex Deucher *
588f77c7109SAlex Deucher * The format of one line is below,
589f77c7109SAlex Deucher *
590f77c7109SAlex Deucher * [ce|ue]: count
591f77c7109SAlex Deucher *
592f77c7109SAlex Deucher * Example:
593f77c7109SAlex Deucher *
594f77c7109SAlex Deucher * .. code-block:: bash
595f77c7109SAlex Deucher *
596f77c7109SAlex Deucher * ue: 0
597f77c7109SAlex Deucher * ce: 1
598f77c7109SAlex Deucher *
599f77c7109SAlex Deucher */
amdgpu_ras_sysfs_read(struct device * dev,struct device_attribute * attr,char * buf)600c030f2e4Sxinhui pan static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
601c030f2e4Sxinhui pan struct device_attribute *attr, char *buf)
602c030f2e4Sxinhui pan {
603c030f2e4Sxinhui pan struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr);
604c030f2e4Sxinhui pan struct ras_query_if info = {
605c030f2e4Sxinhui pan .head = obj->head,
606c030f2e4Sxinhui pan };
607c030f2e4Sxinhui pan
60861380faaSJohn Clements if (!amdgpu_ras_get_error_query_ready(obj->adev))
60936000c7aSTian Tao return sysfs_emit(buf, "Query currently inaccessible\n");
61043c4d576SJohn Clements
611761d86d3SDennis Li if (amdgpu_ras_query_error_status(obj->adev, &info))
612c030f2e4Sxinhui pan return -EINVAL;
613c030f2e4Sxinhui pan
6142a460963SCandice Li if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
6152a460963SCandice Li obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
6161f0d8e37SMukul Joshi if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
6172a460963SCandice Li dev_warn(obj->adev->dev, "Failed to reset error counter and error status");
6181f0d8e37SMukul Joshi }
6191f0d8e37SMukul Joshi
62036000c7aSTian Tao return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count,
621c030f2e4Sxinhui pan "ce", info.ce_count);
622c030f2e4Sxinhui pan }
623c030f2e4Sxinhui pan
/* obj begin */

/* Manual refcounting helpers for ras_manager objects.
 * NOTE(review): no locking here — callers appear to rely on an external
 * lock for the obj list; confirm before using from a new context.
 */
#define get_obj(obj) do { (obj)->use++; } while (0)
#define alive_obj(obj) ((obj)->use)

put_obj(struct ras_manager * obj)629c030f2e4Sxinhui pan static inline void put_obj(struct ras_manager *obj)
630c030f2e4Sxinhui pan {
631f0872686SBernard Zhao if (obj && (--obj->use == 0))
632c030f2e4Sxinhui pan list_del(&obj->node);
633f0872686SBernard Zhao if (obj && (obj->use < 0))
634640ae42eSJohn Clements DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", get_ras_block_str(&obj->head));
635c030f2e4Sxinhui pan }
636c030f2e4Sxinhui pan
637c030f2e4Sxinhui pan /* make one obj and return it. */
amdgpu_ras_create_obj(struct amdgpu_device * adev,struct ras_common_if * head)638c030f2e4Sxinhui pan static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
639c030f2e4Sxinhui pan struct ras_common_if *head)
640c030f2e4Sxinhui pan {
641c030f2e4Sxinhui pan struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
642c030f2e4Sxinhui pan struct ras_manager *obj;
643c030f2e4Sxinhui pan
6448ab0d6f0SLuben Tuikov if (!adev->ras_enabled || !con)
645c030f2e4Sxinhui pan return NULL;
646c030f2e4Sxinhui pan
647c030f2e4Sxinhui pan if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
648c030f2e4Sxinhui pan return NULL;
649c030f2e4Sxinhui pan
650640ae42eSJohn Clements if (head->block == AMDGPU_RAS_BLOCK__MCA) {
651640ae42eSJohn Clements if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST)
652640ae42eSJohn Clements return NULL;
653640ae42eSJohn Clements
654640ae42eSJohn Clements obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index];
655640ae42eSJohn Clements } else
656c030f2e4Sxinhui pan obj = &con->objs[head->block];
657640ae42eSJohn Clements
658c030f2e4Sxinhui pan /* already exist. return obj? */
659c030f2e4Sxinhui pan if (alive_obj(obj))
660c030f2e4Sxinhui pan return NULL;
661c030f2e4Sxinhui pan
662c030f2e4Sxinhui pan obj->head = *head;
663c030f2e4Sxinhui pan obj->adev = adev;
664c030f2e4Sxinhui pan list_add(&obj->node, &con->head);
665c030f2e4Sxinhui pan get_obj(obj);
666c030f2e4Sxinhui pan
667c030f2e4Sxinhui pan return obj;
668c030f2e4Sxinhui pan }
669c030f2e4Sxinhui pan
670c030f2e4Sxinhui pan /* return an obj equal to head, or the first when head is NULL */
amdgpu_ras_find_obj(struct amdgpu_device * adev,struct ras_common_if * head)671f2a79be1SLe Ma struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
672c030f2e4Sxinhui pan struct ras_common_if *head)
673c030f2e4Sxinhui pan {
674c030f2e4Sxinhui pan struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
675c030f2e4Sxinhui pan struct ras_manager *obj;
676c030f2e4Sxinhui pan int i;
677c030f2e4Sxinhui pan
6788ab0d6f0SLuben Tuikov if (!adev->ras_enabled || !con)
679c030f2e4Sxinhui pan return NULL;
680c030f2e4Sxinhui pan
681c030f2e4Sxinhui pan if (head) {
682c030f2e4Sxinhui pan if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
683c030f2e4Sxinhui pan return NULL;
684c030f2e4Sxinhui pan
685640ae42eSJohn Clements if (head->block == AMDGPU_RAS_BLOCK__MCA) {
686640ae42eSJohn Clements if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST)
687640ae42eSJohn Clements return NULL;
688640ae42eSJohn Clements
689640ae42eSJohn Clements obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index];
690640ae42eSJohn Clements } else
691c030f2e4Sxinhui pan obj = &con->objs[head->block];
692c030f2e4Sxinhui pan
693640ae42eSJohn Clements if (alive_obj(obj))
694c030f2e4Sxinhui pan return obj;
695c030f2e4Sxinhui pan } else {
696640ae42eSJohn Clements for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT + AMDGPU_RAS_MCA_BLOCK_COUNT; i++) {
697c030f2e4Sxinhui pan obj = &con->objs[i];
698640ae42eSJohn Clements if (alive_obj(obj))
699c030f2e4Sxinhui pan return obj;
700c030f2e4Sxinhui pan }
701c030f2e4Sxinhui pan }
702c030f2e4Sxinhui pan
703c030f2e4Sxinhui pan return NULL;
704c030f2e4Sxinhui pan }
705c030f2e4Sxinhui pan /* obj end */
706c030f2e4Sxinhui pan
707c030f2e4Sxinhui pan /* feature ctl begin */
amdgpu_ras_is_feature_allowed(struct amdgpu_device * adev,struct ras_common_if * head)708c030f2e4Sxinhui pan static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
709c030f2e4Sxinhui pan struct ras_common_if *head)
710c030f2e4Sxinhui pan {
7118ab0d6f0SLuben Tuikov return adev->ras_hw_enabled & BIT(head->block);
712c030f2e4Sxinhui pan }
713c030f2e4Sxinhui pan
amdgpu_ras_is_feature_enabled(struct amdgpu_device * adev,struct ras_common_if * head)714c030f2e4Sxinhui pan static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
715c030f2e4Sxinhui pan struct ras_common_if *head)
716c030f2e4Sxinhui pan {
717c030f2e4Sxinhui pan struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
718c030f2e4Sxinhui pan
719c030f2e4Sxinhui pan return con->features & BIT(head->block);
720c030f2e4Sxinhui pan }
721c030f2e4Sxinhui pan
722c030f2e4Sxinhui pan /*
723c030f2e4Sxinhui pan * if obj is not created, then create one.
724c030f2e4Sxinhui pan * set feature enable flag.
725c030f2e4Sxinhui pan */
__amdgpu_ras_feature_enable(struct amdgpu_device * adev,struct ras_common_if * head,int enable)726c030f2e4Sxinhui pan static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
727c030f2e4Sxinhui pan struct ras_common_if *head, int enable)
728c030f2e4Sxinhui pan {
729c030f2e4Sxinhui pan struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
730c030f2e4Sxinhui pan struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
731c030f2e4Sxinhui pan
7325caf466aSxinhui pan /* If hardware does not support ras, then do not create obj.
7335caf466aSxinhui pan * But if hardware support ras, we can create the obj.
7345caf466aSxinhui pan * Ras framework checks con->hw_supported to see if it need do
7355caf466aSxinhui pan * corresponding initialization.
7365caf466aSxinhui pan * IP checks con->support to see if it need disable ras.
7375caf466aSxinhui pan */
738c030f2e4Sxinhui pan if (!amdgpu_ras_is_feature_allowed(adev, head))
739c030f2e4Sxinhui pan return 0;
740c030f2e4Sxinhui pan
741c030f2e4Sxinhui pan if (enable) {
742c030f2e4Sxinhui pan if (!obj) {
743c030f2e4Sxinhui pan obj = amdgpu_ras_create_obj(adev, head);
744c030f2e4Sxinhui pan if (!obj)
745c030f2e4Sxinhui pan return -EINVAL;
746c030f2e4Sxinhui pan } else {
747c030f2e4Sxinhui pan /* In case we create obj somewhere else */
748c030f2e4Sxinhui pan get_obj(obj);
749c030f2e4Sxinhui pan }
750c030f2e4Sxinhui pan con->features |= BIT(head->block);
751c030f2e4Sxinhui pan } else {
752c030f2e4Sxinhui pan if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
753c030f2e4Sxinhui pan con->features &= ~BIT(head->block);
754c030f2e4Sxinhui pan put_obj(obj);
755c030f2e4Sxinhui pan }
756c030f2e4Sxinhui pan }
757c030f2e4Sxinhui pan
758c030f2e4Sxinhui pan return 0;
759c030f2e4Sxinhui pan }
760c030f2e4Sxinhui pan
761c030f2e4Sxinhui pan /* wrapper of psp_ras_enable_features */
amdgpu_ras_feature_enable(struct amdgpu_device * adev,struct ras_common_if * head,bool enable)762c030f2e4Sxinhui pan int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
763c030f2e4Sxinhui pan struct ras_common_if *head, bool enable)
764c030f2e4Sxinhui pan {
765c030f2e4Sxinhui pan struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
7667fcffecfSArnd Bergmann union ta_ras_cmd_input *info;
7679f051d6fSHawking Zhang int ret;
768c030f2e4Sxinhui pan
769c030f2e4Sxinhui pan if (!con)
770c030f2e4Sxinhui pan return -EINVAL;
771c030f2e4Sxinhui pan
7726fc9d92cSHawking Zhang /* Do not enable ras feature if it is not allowed */
7736fc9d92cSHawking Zhang if (enable &&
7746fc9d92cSHawking Zhang head->block != AMDGPU_RAS_BLOCK__GFX &&
7756fc9d92cSHawking Zhang !amdgpu_ras_is_feature_allowed(adev, head))
7769f051d6fSHawking Zhang return 0;
7776fc9d92cSHawking Zhang
7786fc9d92cSHawking Zhang /* Only enable gfx ras feature from host side */
7796fc9d92cSHawking Zhang if (head->block == AMDGPU_RAS_BLOCK__GFX &&
7806fc9d92cSHawking Zhang !amdgpu_sriov_vf(adev) &&
7816fc9d92cSHawking Zhang !amdgpu_ras_intr_triggered()) {
7827fcffecfSArnd Bergmann info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL);
7837fcffecfSArnd Bergmann if (!info)
7847fcffecfSArnd Bergmann return -ENOMEM;
7857fcffecfSArnd Bergmann
786c030f2e4Sxinhui pan if (!enable) {
7877fcffecfSArnd Bergmann info->disable_features = (struct ta_ras_disable_features_input) {
788828cfa29Sxinhui pan .block_id = amdgpu_ras_block_to_ta(head->block),
789828cfa29Sxinhui pan .error_type = amdgpu_ras_error_to_ta(head->type),
790c030f2e4Sxinhui pan };
791c030f2e4Sxinhui pan } else {
7927fcffecfSArnd Bergmann info->enable_features = (struct ta_ras_enable_features_input) {
793828cfa29Sxinhui pan .block_id = amdgpu_ras_block_to_ta(head->block),
794828cfa29Sxinhui pan .error_type = amdgpu_ras_error_to_ta(head->type),
795c030f2e4Sxinhui pan };
796c030f2e4Sxinhui pan }
797c030f2e4Sxinhui pan
7987fcffecfSArnd Bergmann ret = psp_ras_enable_features(&adev->psp, info, enable);
799c030f2e4Sxinhui pan if (ret) {
800e4348849STao Zhou dev_err(adev->dev, "ras %s %s failed poison:%d ret:%d\n",
801c030f2e4Sxinhui pan enable ? "enable":"disable",
802640ae42eSJohn Clements get_ras_block_str(head),
803e4348849STao Zhou amdgpu_ras_is_poison_mode_supported(adev), ret);
804f387bb57SCong Liu kfree(info);
8059f051d6fSHawking Zhang return ret;
806c030f2e4Sxinhui pan }
8079f051d6fSHawking Zhang
8089f051d6fSHawking Zhang kfree(info);
809bff77e86SLe Ma }
810c030f2e4Sxinhui pan
811c030f2e4Sxinhui pan /* setup the obj */
812c030f2e4Sxinhui pan __amdgpu_ras_feature_enable(adev, head, enable);
8139f051d6fSHawking Zhang
8149f051d6fSHawking Zhang return 0;
815c030f2e4Sxinhui pan }
816c030f2e4Sxinhui pan
/* Only used in device probe stage and called only once.
 *
 * Handles the AMDGPU_RAS_FLAG_INIT_BY_VBIOS case, where the VBIOS
 * (rather than the driver) is expected to have enabled RAS in hardware,
 * so the enable/disable sequencing toward the RAS TA differs from the
 * plain amdgpu_ras_feature_enable() path.
 */
int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
		struct ras_common_if *head, bool enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int ret;

	if (!con)
		return -EINVAL;

	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
		if (enable) {
			/* There is no harm to issue a ras TA cmd regardless of
			 * the current ras state.
			 * If current state == target state, it will do nothing
			 * But sometimes it requests driver to reset and repost
			 * with error code -EAGAIN.
			 */
			ret = amdgpu_ras_feature_enable(adev, head, 1);
			/* With old ras TA, we might fail to enable ras.
			 * Log it and just setup the object.
			 * TODO need remove this WA in the future.
			 */
			if (ret == -EINVAL) {
				ret = __amdgpu_ras_feature_enable(adev, head, 1);
				if (!ret)
					dev_info(adev->dev,
						"RAS INFO: %s setup object\n",
						get_ras_block_str(head));
			}
		} else {
			/* setup the object then issue a ras TA disable cmd.*/
			ret = __amdgpu_ras_feature_enable(adev, head, 1);
			if (ret)
				return ret;

			/* gfx block ras disable cmd must send to ras-ta */
			if (head->block == AMDGPU_RAS_BLOCK__GFX)
				con->features |= BIT(head->block);

			ret = amdgpu_ras_feature_enable(adev, head, 0);

			/* clean gfx block ras features flag */
			if (adev->ras_enabled && head->block == AMDGPU_RAS_BLOCK__GFX)
				con->features &= ~BIT(head->block);
		}
	} else
		ret = amdgpu_ras_feature_enable(adev, head, enable);

	return ret;
}
86877de502bSxinhui pan
/* Tear down every live RAS feature.  With @bypass the PSP/TA is not
 * consulted and only driver-side objects and flags are released.
 * Returns the feature mask still set after the walk.
 */
static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;
	int ret;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		/* bypass psp.
		 * aka just release the obj and corresponding flags
		 */
		if (bypass)
			ret = __amdgpu_ras_feature_enable(adev, &obj->head, 0);
		else
			ret = amdgpu_ras_feature_enable(adev, &obj->head, 0);

		/* stop the walk on the first failure */
		if (ret)
			break;
	}

	return con->features;
}
890c030f2e4Sxinhui pan
/* Bring up RAS for every block: regular blocks first, then each MCA
 * sub-block.  With @bypass only driver-side objects are created (the
 * VBIOS already enabled RAS in hardware).  Returns the resulting
 * feature mask.
 */
static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	const enum amdgpu_ras_error_type default_ras_type = AMDGPU_RAS_ERROR__NONE;
	int i;

	for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
		struct ras_common_if head = {
			.block = i,
			.type = default_ras_type,
			.sub_block_index = 0,
		};

		/* MCA is enabled per sub-block by the second loop */
		if (i == AMDGPU_RAS_BLOCK__MCA)
			continue;

		/*
		 * bypass psp. vbios enable ras for us.
		 * so just create the obj
		 */
		if (bypass ? __amdgpu_ras_feature_enable(adev, &head, 1)
			   : amdgpu_ras_feature_enable(adev, &head, 1))
			break;
	}

	for (i = 0; i < AMDGPU_RAS_MCA_BLOCK_COUNT; i++) {
		struct ras_common_if head = {
			.block = AMDGPU_RAS_BLOCK__MCA,
			.type = default_ras_type,
			.sub_block_index = i,
		};

		if (bypass ? __amdgpu_ras_feature_enable(adev, &head, 1)
			   : amdgpu_ras_feature_enable(adev, &head, 1))
			break;
	}

	return con->features;
}
943c030f2e4Sxinhui pan /* feature ctl end */
944c030f2e4Sxinhui pan
amdgpu_ras_block_match_default(struct amdgpu_ras_block_object * block_obj,enum amdgpu_ras_block block)945e3d833f4Syipechai static int amdgpu_ras_block_match_default(struct amdgpu_ras_block_object *block_obj,
946e3d833f4Syipechai enum amdgpu_ras_block block)
947640ae42eSJohn Clements {
9486492e1b0Syipechai if (!block_obj)
9496492e1b0Syipechai return -EINVAL;
9506492e1b0Syipechai
951bdb3489cSyipechai if (block_obj->ras_comm.block == block)
9526492e1b0Syipechai return 0;
9536492e1b0Syipechai
9546492e1b0Syipechai return -EINVAL;
955640ae42eSJohn Clements }
9566492e1b0Syipechai
/* Walk adev->ras_list and return the first block object matching
 * (@block, @sub_block_index), preferring the object's own matcher when
 * one is registered.  Returns NULL when nothing matches.
 */
static struct amdgpu_ras_block_object *amdgpu_ras_get_ras_block(struct amdgpu_device *adev,
		enum amdgpu_ras_block block, uint32_t sub_block_index)
{
	struct amdgpu_ras_block_list *node, *tmp;
	struct amdgpu_ras_block_object *obj;

	if (block >= AMDGPU_RAS_BLOCK__LAST)
		return NULL;

	list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
		int matched;

		obj = node->ras_obj;
		if (!obj) {
			dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
			continue;
		}

		if (obj->ras_block_match)
			matched = (obj->ras_block_match(obj, block, sub_block_index) == 0);
		else
			matched = (amdgpu_ras_block_match_default(obj, block) == 0);

		if (matched)
			return obj;
	}

	return NULL;
}
984640ae42eSJohn Clements
/* Retrieve UMC ECC error information into @err_data, preferring the
 * SMU/DPM query path and falling back to direct UMC hw_ops queries when
 * the SMU reports -EOPNOTSUPP.  Any other DPM error leaves @err_data
 * untouched.
 */
static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_data *err_data)
{
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	/*
	 * choosing right query method according to
	 * whether smu support query error information
	 */
	ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(ras->umc_ecc));
	if (ret == -EOPNOTSUPP) {
		/* SMU cannot report ECC info: query the UMC block directly */
		if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
			adev->umc.ras->ras_block.hw_ops->query_ras_error_count)
			adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, err_data);

		/* umc query_ras_error_address is also responsible for clearing
		 * error status
		 */
		if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
			adev->umc.ras->ras_block.hw_ops->query_ras_error_address)
			adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, err_data);
	} else if (!ret) {
		/* SMU query succeeded: translate ras->umc_ecc via the
		 * ecc_info_* callbacks
		 */
		if (adev->umc.ras &&
			adev->umc.ras->ecc_info_query_ras_error_count)
			adev->umc.ras->ecc_info_query_ras_error_count(adev, err_data);

		if (adev->umc.ras &&
			adev->umc.ras->ecc_info_query_ras_error_address)
			adev->umc.ras->ecc_info_query_ras_error_address(adev, err_data);
	}
}
1016fdcb279dSStanley.Yang
1017c030f2e4Sxinhui pan /* query/inject/cure begin */
amdgpu_ras_query_error_status(struct amdgpu_device * adev,struct ras_query_if * info)1018761d86d3SDennis Li int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
1019c030f2e4Sxinhui pan struct ras_query_if *info)
1020c030f2e4Sxinhui pan {
10218b0fb0e9Syipechai struct amdgpu_ras_block_object *block_obj = NULL;
1022c030f2e4Sxinhui pan struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
10236f102dbaSTao Zhou struct ras_err_data err_data = {0, 0, 0, NULL};
1024c030f2e4Sxinhui pan
1025c030f2e4Sxinhui pan if (!obj)
1026c030f2e4Sxinhui pan return -EINVAL;
1027c030f2e4Sxinhui pan
1028c364e7a3SSrinivasan Shanmugam if (!info || info->head.block == AMDGPU_RAS_BLOCK_COUNT)
1029c364e7a3SSrinivasan Shanmugam return -EINVAL;
1030c364e7a3SSrinivasan Shanmugam
10317389a5b8Syipechai if (info->head.block == AMDGPU_RAS_BLOCK__UMC) {
1032fdcb279dSStanley.Yang amdgpu_ras_get_ecc_info(adev, &err_data);
10337389a5b8Syipechai } else {
10347389a5b8Syipechai block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
10358b0fb0e9Syipechai if (!block_obj || !block_obj->hw_ops) {
1036afa37315SLuben Tuikov dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
10378b0fb0e9Syipechai get_ras_block_str(&info->head));
10388b0fb0e9Syipechai return -EINVAL;
10393e81ee9aSHawking Zhang }
1040761d86d3SDennis Li
10418b0fb0e9Syipechai if (block_obj->hw_ops->query_ras_error_count)
10428b0fb0e9Syipechai block_obj->hw_ops->query_ras_error_count(adev, &err_data);
1043761d86d3SDennis Li
10447389a5b8Syipechai if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) ||
10457389a5b8Syipechai (info->head.block == AMDGPU_RAS_BLOCK__GFX) ||
10467389a5b8Syipechai (info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) {
10478b0fb0e9Syipechai if (block_obj->hw_ops->query_ras_error_status)
10488b0fb0e9Syipechai block_obj->hw_ops->query_ras_error_status(adev);
10496c245386Syipechai }
1050939e2258SHawking Zhang }
105105a58345STao Zhou
105205a58345STao Zhou obj->err_data.ue_count += err_data.ue_count;
105305a58345STao Zhou obj->err_data.ce_count += err_data.ce_count;
105405a58345STao Zhou
1055c030f2e4Sxinhui pan info->ue_count = obj->err_data.ue_count;
1056c030f2e4Sxinhui pan info->ce_count = obj->err_data.ce_count;
1057c030f2e4Sxinhui pan
10587c6e68c7SAndrey Grodzovsky if (err_data.ce_count) {
1059ffd6bde3SHawking Zhang if (!adev->aid_mask &&
1060ffd6bde3SHawking Zhang adev->smuio.funcs &&
1061a30f1286SHawking Zhang adev->smuio.funcs->get_socket_id &&
1062a30f1286SHawking Zhang adev->smuio.funcs->get_die_id) {
1063a30f1286SHawking Zhang dev_info(adev->dev, "socket: %d, die: %d "
1064a30f1286SHawking Zhang "%ld correctable hardware errors "
1065a30f1286SHawking Zhang "detected in %s block, no user "
1066a30f1286SHawking Zhang "action is needed.\n",
1067a30f1286SHawking Zhang adev->smuio.funcs->get_socket_id(adev),
1068a30f1286SHawking Zhang adev->smuio.funcs->get_die_id(adev),
1069a30f1286SHawking Zhang obj->err_data.ce_count,
1070640ae42eSJohn Clements get_ras_block_str(&info->head));
1071a30f1286SHawking Zhang } else {
10726952e99cSGuchun Chen dev_info(adev->dev, "%ld correctable hardware errors "
10736952e99cSGuchun Chen "detected in %s block, no user "
10746952e99cSGuchun Chen "action is needed.\n",
10756952e99cSGuchun Chen obj->err_data.ce_count,
1076640ae42eSJohn Clements get_ras_block_str(&info->head));
10777c6e68c7SAndrey Grodzovsky }
1078a30f1286SHawking Zhang }
10797c6e68c7SAndrey Grodzovsky if (err_data.ue_count) {
1080ffd6bde3SHawking Zhang if (!adev->aid_mask &&
1081ffd6bde3SHawking Zhang adev->smuio.funcs &&
1082a30f1286SHawking Zhang adev->smuio.funcs->get_socket_id &&
1083a30f1286SHawking Zhang adev->smuio.funcs->get_die_id) {
1084a30f1286SHawking Zhang dev_info(adev->dev, "socket: %d, die: %d "
1085a30f1286SHawking Zhang "%ld uncorrectable hardware errors "
1086a30f1286SHawking Zhang "detected in %s block\n",
1087a30f1286SHawking Zhang adev->smuio.funcs->get_socket_id(adev),
1088a30f1286SHawking Zhang adev->smuio.funcs->get_die_id(adev),
1089a30f1286SHawking Zhang obj->err_data.ue_count,
1090640ae42eSJohn Clements get_ras_block_str(&info->head));
1091a30f1286SHawking Zhang } else {
10926952e99cSGuchun Chen dev_info(adev->dev, "%ld uncorrectable hardware errors "
10936952e99cSGuchun Chen "detected in %s block\n",
10946952e99cSGuchun Chen obj->err_data.ue_count,
1095640ae42eSJohn Clements get_ras_block_str(&info->head));
10967c6e68c7SAndrey Grodzovsky }
1097a30f1286SHawking Zhang }
109805a58345STao Zhou
1099c030f2e4Sxinhui pan return 0;
1100c030f2e4Sxinhui pan }
1101c030f2e4Sxinhui pan
amdgpu_ras_reset_error_status(struct amdgpu_device * adev,enum amdgpu_ras_block block)1102761d86d3SDennis Li int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
1103761d86d3SDennis Li enum amdgpu_ras_block block)
1104761d86d3SDennis Li {
11058b0fb0e9Syipechai struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0);
11068b0fb0e9Syipechai
1107761d86d3SDennis Li if (!amdgpu_ras_is_supported(adev, block))
1108761d86d3SDennis Li return -EINVAL;
1109761d86d3SDennis Li
11108b0fb0e9Syipechai if (!block_obj || !block_obj->hw_ops) {
1111afa37315SLuben Tuikov dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
1112b6efdb02Syipechai ras_block_str(block));
11138b0fb0e9Syipechai return -EINVAL;
11148b0fb0e9Syipechai }
1115761d86d3SDennis Li
11168b0fb0e9Syipechai if (block_obj->hw_ops->reset_ras_error_count)
11178b0fb0e9Syipechai block_obj->hw_ops->reset_ras_error_count(adev);
11187780f503SDennis Li
11197389a5b8Syipechai if ((block == AMDGPU_RAS_BLOCK__GFX) ||
11207389a5b8Syipechai (block == AMDGPU_RAS_BLOCK__MMHUB)) {
11218b0fb0e9Syipechai if (block_obj->hw_ops->reset_ras_error_status)
11228b0fb0e9Syipechai block_obj->hw_ops->reset_ras_error_status(adev);
1123761d86d3SDennis Li }
1124761d86d3SDennis Li
1125761d86d3SDennis Li return 0;
1126761d86d3SDennis Li }
1127761d86d3SDennis Li
/* wrapper of psp_ras_trigger_error
 *
 * Inject an error described by @info into its RAS block, either through
 * the block's own ras_error_inject hw_op (GFX gets the raw ras_inject_if,
 * other blocks the translated TA input) or, by default, through the PSP
 * RAS TA.  Returns 0 on success or a negative error code.
 */
int amdgpu_ras_error_inject(struct amdgpu_device *adev,
		struct ras_inject_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	/* translate the request into the TA's input layout up front */
	struct ta_ras_trigger_error_input block_info = {
		.block_id =  amdgpu_ras_block_to_ta(info->head.block),
		.inject_error_type = amdgpu_ras_error_to_ta(info->head.type),
		.sub_block_index = info->head.sub_block_index,
		.address = info->address,
		.value = info->value,
	};
	int ret = -EINVAL;
	struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev,
							info->head.block,
							info->head.sub_block_index);

	/* inject on guest isn't allowed, return success directly */
	if (amdgpu_sriov_vf(adev))
		return 0;

	if (!obj)
		return -EINVAL;

	if (!block_obj || !block_obj->hw_ops)	{
		dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
			     get_ras_block_str(&info->head));
		return -EINVAL;
	}

	/* Calculate XGMI relative offset */
	if (adev->gmc.xgmi.num_physical_nodes > 1 &&
	    info->head.block != AMDGPU_RAS_BLOCK__GFX) {
		block_info.address =
			amdgpu_xgmi_get_relative_phy_addr(adev,
							  block_info.address);
	}

	if (block_obj->hw_ops->ras_error_inject) {
		if (info->head.block == AMDGPU_RAS_BLOCK__GFX)
			ret = block_obj->hw_ops->ras_error_inject(adev, info, info->instance_mask);
		else /* Special ras_error_inject is defined (e.g: xgmi) */
			ret = block_obj->hw_ops->ras_error_inject(adev, &block_info,
						info->instance_mask);
	} else {
		/* default path */
		ret = psp_ras_trigger_error(&adev->psp, &block_info, info->instance_mask);
	}

	if (ret)
		dev_err(adev->dev, "ras inject %s failed %d\n",
			get_ras_block_str(&info->head), ret);

	return ret;
}
1183c030f2e4Sxinhui pan
11844d9f771eSLuben Tuikov /**
11854a1c9a44SHawking Zhang * amdgpu_ras_query_error_count_helper -- Get error counter for specific IP
11864a1c9a44SHawking Zhang * @adev: pointer to AMD GPU device
11874a1c9a44SHawking Zhang * @ce_count: pointer to an integer to be set to the count of correctible errors.
11884a1c9a44SHawking Zhang * @ue_count: pointer to an integer to be set to the count of uncorrectible errors.
11894a1c9a44SHawking Zhang * @query_info: pointer to ras_query_if
11904a1c9a44SHawking Zhang *
11914a1c9a44SHawking Zhang * Return 0 for query success or do nothing, otherwise return an error
11924a1c9a44SHawking Zhang * on failures
11934a1c9a44SHawking Zhang */
amdgpu_ras_query_error_count_helper(struct amdgpu_device * adev,unsigned long * ce_count,unsigned long * ue_count,struct ras_query_if * query_info)11944a1c9a44SHawking Zhang static int amdgpu_ras_query_error_count_helper(struct amdgpu_device *adev,
11954a1c9a44SHawking Zhang unsigned long *ce_count,
11964a1c9a44SHawking Zhang unsigned long *ue_count,
11974a1c9a44SHawking Zhang struct ras_query_if *query_info)
11984a1c9a44SHawking Zhang {
11994a1c9a44SHawking Zhang int ret;
12004a1c9a44SHawking Zhang
12014a1c9a44SHawking Zhang if (!query_info)
12024a1c9a44SHawking Zhang /* do nothing if query_info is not specified */
12034a1c9a44SHawking Zhang return 0;
12044a1c9a44SHawking Zhang
12054a1c9a44SHawking Zhang ret = amdgpu_ras_query_error_status(adev, query_info);
12064a1c9a44SHawking Zhang if (ret)
12074a1c9a44SHawking Zhang return ret;
12084a1c9a44SHawking Zhang
12094a1c9a44SHawking Zhang *ce_count += query_info->ce_count;
12104a1c9a44SHawking Zhang *ue_count += query_info->ue_count;
12114a1c9a44SHawking Zhang
12124a1c9a44SHawking Zhang /* some hardware/IP supports read to clear
12134a1c9a44SHawking Zhang * no need to explictly reset the err status after the query call */
12144a1c9a44SHawking Zhang if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
12154a1c9a44SHawking Zhang adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
12164a1c9a44SHawking Zhang if (amdgpu_ras_reset_error_status(adev, query_info->head.block))
12174a1c9a44SHawking Zhang dev_warn(adev->dev,
12184a1c9a44SHawking Zhang "Failed to reset error counter and error status\n");
12194a1c9a44SHawking Zhang }
12204a1c9a44SHawking Zhang
12214a1c9a44SHawking Zhang return 0;
12224a1c9a44SHawking Zhang }
12234a1c9a44SHawking Zhang
12244a1c9a44SHawking Zhang /**
12254a1c9a44SHawking Zhang * amdgpu_ras_query_error_count -- Get error counts of all IPs or specific IP
1226bbe04decSIsabella Basso * @adev: pointer to AMD GPU device
1227bbe04decSIsabella Basso * @ce_count: pointer to an integer to be set to the count of correctible errors.
1228bbe04decSIsabella Basso * @ue_count: pointer to an integer to be set to the count of uncorrectible
12294d9f771eSLuben Tuikov * errors.
12304a1c9a44SHawking Zhang * @query_info: pointer to ras_query_if if the query request is only for
12314a1c9a44SHawking Zhang * specific ip block; if info is NULL, then the qurey request is for
12324a1c9a44SHawking Zhang * all the ip blocks that support query ras error counters/status
12334d9f771eSLuben Tuikov *
12344d9f771eSLuben Tuikov * If set, @ce_count or @ue_count, count and return the corresponding
12354d9f771eSLuben Tuikov * error counts in those integer pointers. Return 0 if the device
12364d9f771eSLuben Tuikov * supports RAS. Return -EOPNOTSUPP if the device doesn't support RAS.
12374d9f771eSLuben Tuikov */
amdgpu_ras_query_error_count(struct amdgpu_device * adev,unsigned long * ce_count,unsigned long * ue_count,struct ras_query_if * query_info)12384d9f771eSLuben Tuikov int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
1239a46751fbSLuben Tuikov unsigned long *ce_count,
12404a1c9a44SHawking Zhang unsigned long *ue_count,
12414a1c9a44SHawking Zhang struct ras_query_if *query_info)
1242c030f2e4Sxinhui pan {
1243c030f2e4Sxinhui pan struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1244c030f2e4Sxinhui pan struct ras_manager *obj;
1245a46751fbSLuben Tuikov unsigned long ce, ue;
12464a1c9a44SHawking Zhang int ret;
1247c030f2e4Sxinhui pan
12488ab0d6f0SLuben Tuikov if (!adev->ras_enabled || !con)
12494d9f771eSLuben Tuikov return -EOPNOTSUPP;
12504d9f771eSLuben Tuikov
12514d9f771eSLuben Tuikov /* Don't count since no reporting.
12524d9f771eSLuben Tuikov */
12534d9f771eSLuben Tuikov if (!ce_count && !ue_count)
12544d9f771eSLuben Tuikov return 0;
1255c030f2e4Sxinhui pan
1256a46751fbSLuben Tuikov ce = 0;
1257a46751fbSLuben Tuikov ue = 0;
12584a1c9a44SHawking Zhang if (!query_info) {
12594a1c9a44SHawking Zhang /* query all the ip blocks that support ras query interface */
1260c030f2e4Sxinhui pan list_for_each_entry(obj, &con->head, node) {
1261c030f2e4Sxinhui pan struct ras_query_if info = {
1262c030f2e4Sxinhui pan .head = obj->head,
1263c030f2e4Sxinhui pan };
1264c030f2e4Sxinhui pan
12654a1c9a44SHawking Zhang ret = amdgpu_ras_query_error_count_helper(adev, &ce, &ue, &info);
12664a1c9a44SHawking Zhang }
12674a1c9a44SHawking Zhang } else {
12684a1c9a44SHawking Zhang /* query specific ip block */
12694a1c9a44SHawking Zhang ret = amdgpu_ras_query_error_count_helper(adev, &ce, &ue, query_info);
12702a460963SCandice Li }
12712a460963SCandice Li
12724a1c9a44SHawking Zhang if (ret)
12734a1c9a44SHawking Zhang return ret;
1274c030f2e4Sxinhui pan
1275a46751fbSLuben Tuikov if (ce_count)
1276a46751fbSLuben Tuikov *ce_count = ce;
1277a46751fbSLuben Tuikov
1278a46751fbSLuben Tuikov if (ue_count)
1279a46751fbSLuben Tuikov *ue_count = ue;
12804d9f771eSLuben Tuikov
12814d9f771eSLuben Tuikov return 0;
1282c030f2e4Sxinhui pan }
1283c030f2e4Sxinhui pan /* query/inject/cure end */
1284c030f2e4Sxinhui pan
1285c030f2e4Sxinhui pan
1286c030f2e4Sxinhui pan /* sysfs begin */
1287c030f2e4Sxinhui pan
1288466b1793Sxinhui pan static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
1289466b1793Sxinhui pan struct ras_badpage **bps, unsigned int *count);
1290466b1793Sxinhui pan
amdgpu_ras_badpage_flags_str(unsigned int flags)1291466b1793Sxinhui pan static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
1292466b1793Sxinhui pan {
1293466b1793Sxinhui pan switch (flags) {
129452dd95f2SGuchun Chen case AMDGPU_RAS_RETIRE_PAGE_RESERVED:
1295466b1793Sxinhui pan return "R";
129652dd95f2SGuchun Chen case AMDGPU_RAS_RETIRE_PAGE_PENDING:
1297466b1793Sxinhui pan return "P";
129852dd95f2SGuchun Chen case AMDGPU_RAS_RETIRE_PAGE_FAULT:
1299466b1793Sxinhui pan default:
1300466b1793Sxinhui pan return "F";
1301aec576f9STom Rix }
1302466b1793Sxinhui pan }
1303466b1793Sxinhui pan
1304f77c7109SAlex Deucher /**
1305f77c7109SAlex Deucher * DOC: AMDGPU RAS sysfs gpu_vram_bad_pages Interface
1306466b1793Sxinhui pan *
1307466b1793Sxinhui pan * It allows user to read the bad pages of vram on the gpu through
1308466b1793Sxinhui pan * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages
1309466b1793Sxinhui pan *
1310466b1793Sxinhui pan * It outputs multiple lines, and each line stands for one gpu page.
1311466b1793Sxinhui pan *
1312466b1793Sxinhui pan * The format of one line is below,
1313466b1793Sxinhui pan * gpu pfn : gpu page size : flags
1314466b1793Sxinhui pan *
1315466b1793Sxinhui pan * gpu pfn and gpu page size are printed in hex format.
 * flags can be one of the characters below,
1317f77c7109SAlex Deucher *
1318466b1793Sxinhui pan * R: reserved, this gpu page is reserved and not able to use.
1319f77c7109SAlex Deucher *
1320466b1793Sxinhui pan * P: pending for reserve, this gpu page is marked as bad, will be reserved
1321466b1793Sxinhui pan * in next window of page_reserve.
1322f77c7109SAlex Deucher *
1323466b1793Sxinhui pan * F: unable to reserve. this gpu page can't be reserved due to some reasons.
1324466b1793Sxinhui pan *
1325f77c7109SAlex Deucher * Examples:
1326f77c7109SAlex Deucher *
1327f77c7109SAlex Deucher * .. code-block:: bash
1328f77c7109SAlex Deucher *
1329466b1793Sxinhui pan * 0x00000001 : 0x00001000 : R
1330466b1793Sxinhui pan * 0x00000002 : 0x00001000 : P
1331f77c7109SAlex Deucher *
1332466b1793Sxinhui pan */
1333466b1793Sxinhui pan
amdgpu_ras_sysfs_badpages_read(struct file * f,struct kobject * kobj,struct bin_attribute * attr,char * buf,loff_t ppos,size_t count)1334466b1793Sxinhui pan static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
1335466b1793Sxinhui pan struct kobject *kobj, struct bin_attribute *attr,
1336466b1793Sxinhui pan char *buf, loff_t ppos, size_t count)
1337466b1793Sxinhui pan {
1338466b1793Sxinhui pan struct amdgpu_ras *con =
1339466b1793Sxinhui pan container_of(attr, struct amdgpu_ras, badpages_attr);
1340466b1793Sxinhui pan struct amdgpu_device *adev = con->adev;
1341466b1793Sxinhui pan const unsigned int element_size =
1342466b1793Sxinhui pan sizeof("0xabcdabcd : 0x12345678 : R\n") - 1;
1343d6ee400eSSlava Abramov unsigned int start = div64_ul(ppos + element_size - 1, element_size);
1344d6ee400eSSlava Abramov unsigned int end = div64_ul(ppos + count - 1, element_size);
1345466b1793Sxinhui pan ssize_t s = 0;
1346466b1793Sxinhui pan struct ras_badpage *bps = NULL;
1347466b1793Sxinhui pan unsigned int bps_count = 0;
1348466b1793Sxinhui pan
1349466b1793Sxinhui pan memset(buf, 0, count);
1350466b1793Sxinhui pan
1351466b1793Sxinhui pan if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
1352466b1793Sxinhui pan return 0;
1353466b1793Sxinhui pan
1354466b1793Sxinhui pan for (; start < end && start < bps_count; start++)
1355466b1793Sxinhui pan s += scnprintf(&buf[s], element_size + 1,
1356466b1793Sxinhui pan "0x%08x : 0x%08x : %1s\n",
1357466b1793Sxinhui pan bps[start].bp,
1358466b1793Sxinhui pan bps[start].size,
1359466b1793Sxinhui pan amdgpu_ras_badpage_flags_str(bps[start].flags));
1360466b1793Sxinhui pan
1361466b1793Sxinhui pan kfree(bps);
1362466b1793Sxinhui pan
1363466b1793Sxinhui pan return s;
1364466b1793Sxinhui pan }
1365466b1793Sxinhui pan
amdgpu_ras_sysfs_features_read(struct device * dev,struct device_attribute * attr,char * buf)1366c030f2e4Sxinhui pan static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
1367c030f2e4Sxinhui pan struct device_attribute *attr, char *buf)
1368c030f2e4Sxinhui pan {
1369c030f2e4Sxinhui pan struct amdgpu_ras *con =
1370c030f2e4Sxinhui pan container_of(attr, struct amdgpu_ras, features_attr);
1371c030f2e4Sxinhui pan
13722cffcb66Sye xingchen return sysfs_emit(buf, "feature mask: 0x%x\n", con->features);
1373c030f2e4Sxinhui pan }
1374c030f2e4Sxinhui pan
amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device * adev)1375f848159bSGuchun Chen static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev)
1376f848159bSGuchun Chen {
1377f848159bSGuchun Chen struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1378f848159bSGuchun Chen
1379de1c0959SVitaly Prosyak if (adev->dev->kobj.sd)
1380f848159bSGuchun Chen sysfs_remove_file_from_group(&adev->dev->kobj,
1381f848159bSGuchun Chen &con->badpages_attr.attr,
1382f848159bSGuchun Chen RAS_FS_NAME);
1383f848159bSGuchun Chen }
1384f848159bSGuchun Chen
amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device * adev)1385c030f2e4Sxinhui pan static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
1386c030f2e4Sxinhui pan {
1387c030f2e4Sxinhui pan struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1388c030f2e4Sxinhui pan struct attribute *attrs[] = {
1389c030f2e4Sxinhui pan &con->features_attr.attr,
1390c030f2e4Sxinhui pan NULL
1391c030f2e4Sxinhui pan };
1392c030f2e4Sxinhui pan struct attribute_group group = {
1393eb0c3cd4SGuchun Chen .name = RAS_FS_NAME,
1394c030f2e4Sxinhui pan .attrs = attrs,
1395c030f2e4Sxinhui pan };
1396c030f2e4Sxinhui pan
1397de1c0959SVitaly Prosyak if (adev->dev->kobj.sd)
1398c030f2e4Sxinhui pan sysfs_remove_group(&adev->dev->kobj, &group);
1399c030f2e4Sxinhui pan
1400c030f2e4Sxinhui pan return 0;
1401c030f2e4Sxinhui pan }
1402c030f2e4Sxinhui pan
amdgpu_ras_sysfs_create(struct amdgpu_device * adev,struct ras_common_if * head)1403c030f2e4Sxinhui pan int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
14049252d33dSyipechai struct ras_common_if *head)
1405c030f2e4Sxinhui pan {
14069252d33dSyipechai struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
1407c030f2e4Sxinhui pan
1408c030f2e4Sxinhui pan if (!obj || obj->attr_inuse)
1409c030f2e4Sxinhui pan return -EINVAL;
1410c030f2e4Sxinhui pan
1411c030f2e4Sxinhui pan get_obj(obj);
1412c030f2e4Sxinhui pan
14139252d33dSyipechai snprintf(obj->fs_data.sysfs_name, sizeof(obj->fs_data.sysfs_name),
14149252d33dSyipechai "%s_err_count", head->name);
1415c030f2e4Sxinhui pan
1416c030f2e4Sxinhui pan obj->sysfs_attr = (struct device_attribute){
1417c030f2e4Sxinhui pan .attr = {
1418c030f2e4Sxinhui pan .name = obj->fs_data.sysfs_name,
1419c030f2e4Sxinhui pan .mode = S_IRUGO,
1420c030f2e4Sxinhui pan },
1421c030f2e4Sxinhui pan .show = amdgpu_ras_sysfs_read,
1422c030f2e4Sxinhui pan };
1423163def43Sxinhui pan sysfs_attr_init(&obj->sysfs_attr.attr);
1424c030f2e4Sxinhui pan
1425c030f2e4Sxinhui pan if (sysfs_add_file_to_group(&adev->dev->kobj,
1426c030f2e4Sxinhui pan &obj->sysfs_attr.attr,
1427eb0c3cd4SGuchun Chen RAS_FS_NAME)) {
1428c030f2e4Sxinhui pan put_obj(obj);
1429c030f2e4Sxinhui pan return -EINVAL;
1430c030f2e4Sxinhui pan }
1431c030f2e4Sxinhui pan
1432c030f2e4Sxinhui pan obj->attr_inuse = 1;
1433c030f2e4Sxinhui pan
1434c030f2e4Sxinhui pan return 0;
1435c030f2e4Sxinhui pan }
1436c030f2e4Sxinhui pan
amdgpu_ras_sysfs_remove(struct amdgpu_device * adev,struct ras_common_if * head)1437c030f2e4Sxinhui pan int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
1438c030f2e4Sxinhui pan struct ras_common_if *head)
1439c030f2e4Sxinhui pan {
1440c030f2e4Sxinhui pan struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
1441c030f2e4Sxinhui pan
1442c030f2e4Sxinhui pan if (!obj || !obj->attr_inuse)
1443c030f2e4Sxinhui pan return -EINVAL;
1444c030f2e4Sxinhui pan
1445de1c0959SVitaly Prosyak if (adev->dev->kobj.sd)
1446c030f2e4Sxinhui pan sysfs_remove_file_from_group(&adev->dev->kobj,
1447c030f2e4Sxinhui pan &obj->sysfs_attr.attr,
1448eb0c3cd4SGuchun Chen RAS_FS_NAME);
1449c030f2e4Sxinhui pan obj->attr_inuse = 0;
1450c030f2e4Sxinhui pan put_obj(obj);
1451c030f2e4Sxinhui pan
1452c030f2e4Sxinhui pan return 0;
1453c030f2e4Sxinhui pan }
1454c030f2e4Sxinhui pan
amdgpu_ras_sysfs_remove_all(struct amdgpu_device * adev)1455c030f2e4Sxinhui pan static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
1456c030f2e4Sxinhui pan {
1457c030f2e4Sxinhui pan struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1458c030f2e4Sxinhui pan struct ras_manager *obj, *tmp;
1459c030f2e4Sxinhui pan
1460c030f2e4Sxinhui pan list_for_each_entry_safe(obj, tmp, &con->head, node) {
1461c030f2e4Sxinhui pan amdgpu_ras_sysfs_remove(adev, &obj->head);
1462c030f2e4Sxinhui pan }
1463c030f2e4Sxinhui pan
1464f848159bSGuchun Chen if (amdgpu_bad_page_threshold != 0)
1465f848159bSGuchun Chen amdgpu_ras_sysfs_remove_bad_page_node(adev);
1466f848159bSGuchun Chen
1467c030f2e4Sxinhui pan amdgpu_ras_sysfs_remove_feature_node(adev);
1468c030f2e4Sxinhui pan
1469c030f2e4Sxinhui pan return 0;
1470c030f2e4Sxinhui pan }
1471c030f2e4Sxinhui pan /* sysfs end */
1472c030f2e4Sxinhui pan
1473ef177d11SAlex Deucher /**
1474ef177d11SAlex Deucher * DOC: AMDGPU RAS Reboot Behavior for Unrecoverable Errors
1475ef177d11SAlex Deucher *
1476ef177d11SAlex Deucher * Normally when there is an uncorrectable error, the driver will reset
1477ef177d11SAlex Deucher * the GPU to recover. However, in the event of an unrecoverable error,
1478ef177d11SAlex Deucher * the driver provides an interface to reboot the system automatically
1479ef177d11SAlex Deucher * in that event.
1480ef177d11SAlex Deucher *
1481ef177d11SAlex Deucher * The following file in debugfs provides that interface:
1482ef177d11SAlex Deucher * /sys/kernel/debug/dri/[0/1/2...]/ras/auto_reboot
1483ef177d11SAlex Deucher *
1484ef177d11SAlex Deucher * Usage:
1485ef177d11SAlex Deucher *
1486ef177d11SAlex Deucher * .. code-block:: bash
1487ef177d11SAlex Deucher *
1488ef177d11SAlex Deucher * echo true > .../ras/auto_reboot
1489ef177d11SAlex Deucher *
1490ef177d11SAlex Deucher */
1491c030f2e4Sxinhui pan /* debugfs begin */
/* Create the common RAS debugfs directory and control files under
 * <debugfs>/dri/<minor>/ras; returns the directory dentry so per-block
 * error-injection files can be added beneath it later.
 */
static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct amdgpu_ras_eeprom_control *eeprom = &con->eeprom_control;
	struct drm_minor *minor = adev_to_drm(adev)->primary;
	struct dentry *dir;

	dir = debugfs_create_dir(RAS_FS_NAME, minor->debugfs_root);
	/* RAS feature enable/disable and error injection control */
	debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, dir, adev,
			    &amdgpu_ras_debugfs_ctrl_ops);
	debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, dir, adev,
			    &amdgpu_ras_debugfs_eeprom_ops);
	debugfs_create_u32("bad_page_cnt_threshold", 0444, dir,
			   &con->bad_page_cnt_threshold);
	/* read-only views of the EEPROM record count and enable masks */
	debugfs_create_u32("ras_num_recs", 0444, dir, &eeprom->ras_num_recs);
	debugfs_create_x32("ras_hw_enabled", 0444, dir, &adev->ras_hw_enabled);
	debugfs_create_x32("ras_enabled", 0444, dir, &adev->ras_enabled);
	debugfs_create_file("ras_eeprom_size", S_IRUGO, dir, adev,
			    &amdgpu_ras_debugfs_eeprom_size_ops);
	con->de_ras_eeprom_table = debugfs_create_file("ras_eeprom_table",
						       S_IRUGO, dir, adev,
						       &amdgpu_ras_debugfs_eeprom_table_ops);
	amdgpu_ras_debugfs_set_ret_size(&con->eeprom_control);

	/*
	 * After one uncorrectable error happens, usually GPU recovery will
	 * be scheduled. But due to the known problem in GPU recovery failing
	 * to bring GPU back, below interface provides one direct way to
	 * user to reboot system automatically in such case within
	 * ERREVENT_ATHUB_INTERRUPT generated. Normal GPU recovery routine
	 * will never be called.
	 */
	debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, dir, &con->reboot);

	/*
	 * User could set this not to clean up hardware's error count register
	 * of RAS IPs during ras recovery.
	 */
	debugfs_create_bool("disable_ras_err_cnt_harvest", 0644, dir,
			    &con->disable_ras_err_cnt_harvest);
	return dir;
}
153436ea1bd2Sxinhui pan
amdgpu_ras_debugfs_create(struct amdgpu_device * adev,struct ras_fs_if * head,struct dentry * dir)1535cedf7884SArnd Bergmann static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
153688293c03SNirmoy Das struct ras_fs_if *head,
153788293c03SNirmoy Das struct dentry *dir)
1538c030f2e4Sxinhui pan {
1539c030f2e4Sxinhui pan struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
1540c030f2e4Sxinhui pan
154188293c03SNirmoy Das if (!obj || !dir)
1542450f30eaSGreg Kroah-Hartman return;
1543c030f2e4Sxinhui pan
1544c030f2e4Sxinhui pan get_obj(obj);
1545c030f2e4Sxinhui pan
1546c030f2e4Sxinhui pan memcpy(obj->fs_data.debugfs_name,
1547c030f2e4Sxinhui pan head->debugfs_name,
1548c030f2e4Sxinhui pan sizeof(obj->fs_data.debugfs_name));
1549c030f2e4Sxinhui pan
155088293c03SNirmoy Das debugfs_create_file(obj->fs_data.debugfs_name, S_IWUGO | S_IRUGO, dir,
155188293c03SNirmoy Das obj, &amdgpu_ras_debugfs_ops);
1552c030f2e4Sxinhui pan }
1553c030f2e4Sxinhui pan
amdgpu_ras_debugfs_create_all(struct amdgpu_device * adev)1554f9317014STao Zhou void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
1555f9317014STao Zhou {
1556f9317014STao Zhou struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
155788293c03SNirmoy Das struct dentry *dir;
1558c1509f3fSStanley.Yang struct ras_manager *obj;
1559f9317014STao Zhou struct ras_fs_if fs_info;
1560f9317014STao Zhou
1561f9317014STao Zhou /*
1562f9317014STao Zhou * it won't be called in resume path, no need to check
1563f9317014STao Zhou * suspend and gpu reset status
1564f9317014STao Zhou */
1565cedf7884SArnd Bergmann if (!IS_ENABLED(CONFIG_DEBUG_FS) || !con)
1566f9317014STao Zhou return;
1567f9317014STao Zhou
156888293c03SNirmoy Das dir = amdgpu_ras_debugfs_create_ctrl_node(adev);
1569f9317014STao Zhou
1570c1509f3fSStanley.Yang list_for_each_entry(obj, &con->head, node) {
1571f9317014STao Zhou if (amdgpu_ras_is_supported(adev, obj->head.block) &&
1572f9317014STao Zhou (obj->attr_inuse == 1)) {
1573f9317014STao Zhou sprintf(fs_info.debugfs_name, "%s_err_inject",
1574640ae42eSJohn Clements get_ras_block_str(&obj->head));
1575f9317014STao Zhou fs_info.head = obj->head;
157688293c03SNirmoy Das amdgpu_ras_debugfs_create(adev, &fs_info, dir);
1577f9317014STao Zhou }
1578f9317014STao Zhou }
1579f9317014STao Zhou }
1580f9317014STao Zhou
1581c030f2e4Sxinhui pan /* debugfs end */
1582c030f2e4Sxinhui pan
1583c030f2e4Sxinhui pan /* ras fs */
/* Template sysfs attributes shared by all devices; amdgpu_ras_fs_init()
 * copies them into each device's amdgpu_ras context before registering:
 * gpu_vram_bad_pages dumps the retired-page list, features shows the
 * enabled RAS feature mask.
 */
static BIN_ATTR(gpu_vram_bad_pages, S_IRUGO,
		amdgpu_ras_sysfs_badpages_read, NULL, 0);
static DEVICE_ATTR(features, S_IRUGO,
		amdgpu_ras_sysfs_features_read, NULL);
amdgpu_ras_fs_init(struct amdgpu_device * adev)1588c030f2e4Sxinhui pan static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
1589c030f2e4Sxinhui pan {
1590c3d4d45dSGuchun Chen struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1591c3d4d45dSGuchun Chen struct attribute_group group = {
1592c3d4d45dSGuchun Chen .name = RAS_FS_NAME,
1593c3d4d45dSGuchun Chen };
1594c3d4d45dSGuchun Chen struct attribute *attrs[] = {
1595c3d4d45dSGuchun Chen &con->features_attr.attr,
1596c3d4d45dSGuchun Chen NULL
1597c3d4d45dSGuchun Chen };
1598c3d4d45dSGuchun Chen struct bin_attribute *bin_attrs[] = {
1599c3d4d45dSGuchun Chen NULL,
1600c3d4d45dSGuchun Chen NULL,
1601c3d4d45dSGuchun Chen };
1602a069a9ebSAlex Deucher int r;
1603c030f2e4Sxinhui pan
1604c3d4d45dSGuchun Chen /* add features entry */
1605c3d4d45dSGuchun Chen con->features_attr = dev_attr_features;
1606c3d4d45dSGuchun Chen group.attrs = attrs;
1607c3d4d45dSGuchun Chen sysfs_attr_init(attrs[0]);
1608c3d4d45dSGuchun Chen
1609c3d4d45dSGuchun Chen if (amdgpu_bad_page_threshold != 0) {
1610c3d4d45dSGuchun Chen /* add bad_page_features entry */
1611c3d4d45dSGuchun Chen bin_attr_gpu_vram_bad_pages.private = NULL;
1612c3d4d45dSGuchun Chen con->badpages_attr = bin_attr_gpu_vram_bad_pages;
1613c3d4d45dSGuchun Chen bin_attrs[0] = &con->badpages_attr;
1614c3d4d45dSGuchun Chen group.bin_attrs = bin_attrs;
1615c3d4d45dSGuchun Chen sysfs_bin_attr_init(bin_attrs[0]);
1616c3d4d45dSGuchun Chen }
1617c3d4d45dSGuchun Chen
1618a069a9ebSAlex Deucher r = sysfs_create_group(&adev->dev->kobj, &group);
1619a069a9ebSAlex Deucher if (r)
1620a069a9ebSAlex Deucher dev_err(adev->dev, "Failed to create RAS sysfs group!");
1621f848159bSGuchun Chen
1622c030f2e4Sxinhui pan return 0;
1623c030f2e4Sxinhui pan }
1624c030f2e4Sxinhui pan
amdgpu_ras_fs_fini(struct amdgpu_device * adev)1625c030f2e4Sxinhui pan static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
1626c030f2e4Sxinhui pan {
162788293c03SNirmoy Das struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
162888293c03SNirmoy Das struct ras_manager *con_obj, *ip_obj, *tmp;
162988293c03SNirmoy Das
163088293c03SNirmoy Das if (IS_ENABLED(CONFIG_DEBUG_FS)) {
163188293c03SNirmoy Das list_for_each_entry_safe(con_obj, tmp, &con->head, node) {
163288293c03SNirmoy Das ip_obj = amdgpu_ras_find_obj(adev, &con_obj->head);
163388293c03SNirmoy Das if (ip_obj)
163488293c03SNirmoy Das put_obj(ip_obj);
163588293c03SNirmoy Das }
163688293c03SNirmoy Das }
163788293c03SNirmoy Das
1638c030f2e4Sxinhui pan amdgpu_ras_sysfs_remove_all(adev);
1639c030f2e4Sxinhui pan return 0;
1640c030f2e4Sxinhui pan }
1641c030f2e4Sxinhui pan /* ras fs end */
1642c030f2e4Sxinhui pan
1643c030f2e4Sxinhui pan /* ih begin */
1644b3c76814STao Zhou
/* For the hardware that cannot enable bif ring for both ras_controller_irq
 * and ras_err_event_athub_irq ih cookies, the driver has to poll the status
 * register to check whether the interrupt is triggered or not, and properly
 * ack the interrupt if it is there
 */
amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device * adev)1650b3c76814STao Zhou void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev)
1651b3c76814STao Zhou {
1652950d6425SStanley.Yang /* Fatal error events are handled on host side */
16538eba7205SCandice Li if (amdgpu_sriov_vf(adev))
1654b3c76814STao Zhou return;
1655b3c76814STao Zhou
1656b3c76814STao Zhou if (adev->nbio.ras &&
1657b3c76814STao Zhou adev->nbio.ras->handle_ras_controller_intr_no_bifring)
1658b3c76814STao Zhou adev->nbio.ras->handle_ras_controller_intr_no_bifring(adev);
1659b3c76814STao Zhou
1660b3c76814STao Zhou if (adev->nbio.ras &&
1661b3c76814STao Zhou adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring)
1662b3c76814STao Zhou adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring(adev);
1663b3c76814STao Zhou }
1664b3c76814STao Zhou
/* Handle a poison-consumption interrupt for @obj's block: optionally
 * confirm the poison status, notify the UMC poison handler, then either
 * reset the GPU or defer to the gfx consumption handler.
 */
static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *obj,
				struct amdgpu_iv_entry *entry)
{
	bool poison_stat = false;
	struct amdgpu_device *adev = obj->adev;
	struct amdgpu_ras_block_object *block_obj =
		amdgpu_ras_get_ras_block(adev, obj->head.block, 0);

	if (!block_obj)
		return;

	/* both query_poison_status and handle_poison_consumption are optional,
	 * but at least one of them should be implemented if we need poison
	 * consumption handler
	 */
	if (block_obj->hw_ops && block_obj->hw_ops->query_poison_status) {
		poison_stat = block_obj->hw_ops->query_poison_status(adev);
		if (!poison_stat) {
			/* Not poison consumption interrupt, no need to handle it */
			dev_info(adev->dev, "No RAS poison status in %s poison IH.\n",
					block_obj->ras_comm.name);

			return;
		}
	}

	amdgpu_umc_poison_handler(adev, false);

	/* NOTE(review): if query_poison_status exists but
	 * handle_poison_consumption does not, poison_stat keeps the query
	 * result (true at this point) and the reset path below is taken —
	 * looks intentional, confirm with the block implementations.
	 */
	if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
		poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);

	/* gpu reset is fallback for failed and default cases */
	if (poison_stat) {
		dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n",
				block_obj->ras_comm.name);
		amdgpu_ras_reset_gpu(adev);
	} else {
		amdgpu_gfx_poison_consumption_handler(adev, entry);
	}
}
170566f87949STao Zhou
amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager * obj,struct amdgpu_iv_entry * entry)170650a7d025STao Zhou static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj,
170750a7d025STao Zhou struct amdgpu_iv_entry *entry)
1708c030f2e4Sxinhui pan {
1709f524dd54STao Zhou dev_info(obj->adev->dev,
1710f524dd54STao Zhou "Poison is created, no user action is needed.\n");
171150a7d025STao Zhou }
171250a7d025STao Zhou
amdgpu_ras_interrupt_umc_handler(struct ras_manager * obj,struct amdgpu_iv_entry * entry)171350a7d025STao Zhou static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
171450a7d025STao Zhou struct amdgpu_iv_entry *entry)
171550a7d025STao Zhou {
171650a7d025STao Zhou struct ras_ih_data *data = &obj->ih_data;
171750a7d025STao Zhou struct ras_err_data err_data = {0, 0, 0, NULL};
171850a7d025STao Zhou int ret;
171950a7d025STao Zhou
172050a7d025STao Zhou if (!data->cb)
172150a7d025STao Zhou return;
172250a7d025STao Zhou
1723c030f2e4Sxinhui pan /* Let IP handle its data, maybe we need get the output
172450a7d025STao Zhou * from the callback to update the error type/count, etc
1725c030f2e4Sxinhui pan */
172650a7d025STao Zhou ret = data->cb(obj->adev, &err_data, entry);
1727c030f2e4Sxinhui pan /* ue will trigger an interrupt, and in that case
1728c030f2e4Sxinhui pan * we need do a reset to recovery the whole system.
1729c030f2e4Sxinhui pan * But leave IP do that recovery, here we just dispatch
1730c030f2e4Sxinhui pan * the error.
1731c030f2e4Sxinhui pan */
1732bd2280daSTao Zhou if (ret == AMDGPU_RAS_SUCCESS) {
173351437623STao Zhou /* these counts could be left as 0 if
173451437623STao Zhou * some blocks do not count error number
1735c030f2e4Sxinhui pan */
173651437623STao Zhou obj->err_data.ue_count += err_data.ue_count;
173751437623STao Zhou obj->err_data.ce_count += err_data.ce_count;
173851437623STao Zhou }
1739c030f2e4Sxinhui pan }
174050a7d025STao Zhou
/* Drain @obj's IH ring: copy out each queued IV entry and route it to
 * the poison or UMC handler depending on the block and poison support.
 */
static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
{
	struct ras_ih_data *data = &obj->ih_data;
	struct amdgpu_iv_entry entry;

	while (data->rptr != data->wptr) {
		/* read barrier before copying the payload — presumably pairs
		 * with the wmb() in amdgpu_ras_interrupt_dispatch(); confirm
		 */
		rmb();
		memcpy(&entry, &data->ring[data->rptr],
				data->element_size);

		/* make the copy globally visible before freeing the slot by
		 * advancing rptr
		 */
		wmb();
		data->rptr = (data->aligned_element_size +
				data->rptr) % data->ring_size;

		if (amdgpu_ras_is_poison_mode_supported(obj->adev)) {
			/* UMC raises creation events; other blocks consume poison */
			if (obj->head.block == AMDGPU_RAS_BLOCK__UMC)
				amdgpu_ras_interrupt_poison_creation_handler(obj, &entry);
			else
				amdgpu_ras_interrupt_poison_consumption_handler(obj, &entry);
		} else {
			if (obj->head.block == AMDGPU_RAS_BLOCK__UMC)
				amdgpu_ras_interrupt_umc_handler(obj, &entry);
			else
				dev_warn(obj->adev->dev,
					"No RAS interrupt handler for non-UMC block with poison disabled.\n");
		}
	}
}
1769c030f2e4Sxinhui pan
amdgpu_ras_interrupt_process_handler(struct work_struct * work)1770c030f2e4Sxinhui pan static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
1771c030f2e4Sxinhui pan {
1772c030f2e4Sxinhui pan struct ras_ih_data *data =
1773c030f2e4Sxinhui pan container_of(work, struct ras_ih_data, ih_work);
1774c030f2e4Sxinhui pan struct ras_manager *obj =
1775c030f2e4Sxinhui pan container_of(data, struct ras_manager, ih_data);
1776c030f2e4Sxinhui pan
1777c030f2e4Sxinhui pan amdgpu_ras_interrupt_handler(obj);
1778c030f2e4Sxinhui pan }
1779c030f2e4Sxinhui pan
amdgpu_ras_interrupt_dispatch(struct amdgpu_device * adev,struct ras_dispatch_if * info)1780c030f2e4Sxinhui pan int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
1781c030f2e4Sxinhui pan struct ras_dispatch_if *info)
1782c030f2e4Sxinhui pan {
1783*b8961633SMa Jun struct ras_manager *obj;
1784*b8961633SMa Jun struct ras_ih_data *data;
1785c030f2e4Sxinhui pan
1786*b8961633SMa Jun obj = amdgpu_ras_find_obj(adev, &info->head);
1787c030f2e4Sxinhui pan if (!obj)
1788c030f2e4Sxinhui pan return -EINVAL;
1789c030f2e4Sxinhui pan
1790*b8961633SMa Jun data = &obj->ih_data;
1791*b8961633SMa Jun
1792c030f2e4Sxinhui pan if (data->inuse == 0)
1793c030f2e4Sxinhui pan return 0;
1794c030f2e4Sxinhui pan
1795c030f2e4Sxinhui pan /* Might be overflow... */
1796c030f2e4Sxinhui pan memcpy(&data->ring[data->wptr], info->entry,
1797c030f2e4Sxinhui pan data->element_size);
1798c030f2e4Sxinhui pan
1799c030f2e4Sxinhui pan wmb();
1800c030f2e4Sxinhui pan data->wptr = (data->aligned_element_size +
1801c030f2e4Sxinhui pan data->wptr) % data->ring_size;
1802c030f2e4Sxinhui pan
1803c030f2e4Sxinhui pan schedule_work(&data->ih_work);
1804c030f2e4Sxinhui pan
1805c030f2e4Sxinhui pan return 0;
1806c030f2e4Sxinhui pan }
1807c030f2e4Sxinhui pan
amdgpu_ras_interrupt_remove_handler(struct amdgpu_device * adev,struct ras_common_if * head)1808c030f2e4Sxinhui pan int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
18099252d33dSyipechai struct ras_common_if *head)
1810c030f2e4Sxinhui pan {
18119252d33dSyipechai struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
1812c030f2e4Sxinhui pan struct ras_ih_data *data;
1813c030f2e4Sxinhui pan
1814c030f2e4Sxinhui pan if (!obj)
1815c030f2e4Sxinhui pan return -EINVAL;
1816c030f2e4Sxinhui pan
1817c030f2e4Sxinhui pan data = &obj->ih_data;
1818c030f2e4Sxinhui pan if (data->inuse == 0)
1819c030f2e4Sxinhui pan return 0;
1820c030f2e4Sxinhui pan
1821c030f2e4Sxinhui pan cancel_work_sync(&data->ih_work);
1822c030f2e4Sxinhui pan
1823c030f2e4Sxinhui pan kfree(data->ring);
1824c030f2e4Sxinhui pan memset(data, 0, sizeof(*data));
1825c030f2e4Sxinhui pan put_obj(obj);
1826c030f2e4Sxinhui pan
1827c030f2e4Sxinhui pan return 0;
1828c030f2e4Sxinhui pan }
1829c030f2e4Sxinhui pan
amdgpu_ras_interrupt_add_handler(struct amdgpu_device * adev,struct ras_common_if * head)1830c030f2e4Sxinhui pan int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
18319252d33dSyipechai struct ras_common_if *head)
1832c030f2e4Sxinhui pan {
18339252d33dSyipechai struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
1834c030f2e4Sxinhui pan struct ras_ih_data *data;
18359252d33dSyipechai struct amdgpu_ras_block_object *ras_obj;
1836c030f2e4Sxinhui pan
1837c030f2e4Sxinhui pan if (!obj) {
1838c030f2e4Sxinhui pan /* in case we registe the IH before enable ras feature */
18399252d33dSyipechai obj = amdgpu_ras_create_obj(adev, head);
1840c030f2e4Sxinhui pan if (!obj)
1841c030f2e4Sxinhui pan return -EINVAL;
1842c030f2e4Sxinhui pan } else
1843c030f2e4Sxinhui pan get_obj(obj);
1844c030f2e4Sxinhui pan
18459252d33dSyipechai ras_obj = container_of(head, struct amdgpu_ras_block_object, ras_comm);
18469252d33dSyipechai
1847c030f2e4Sxinhui pan data = &obj->ih_data;
1848c030f2e4Sxinhui pan /* add the callback.etc */
1849c030f2e4Sxinhui pan *data = (struct ras_ih_data) {
1850c030f2e4Sxinhui pan .inuse = 0,
18519252d33dSyipechai .cb = ras_obj->ras_cb,
1852c030f2e4Sxinhui pan .element_size = sizeof(struct amdgpu_iv_entry),
1853c030f2e4Sxinhui pan .rptr = 0,
1854c030f2e4Sxinhui pan .wptr = 0,
1855c030f2e4Sxinhui pan };
1856c030f2e4Sxinhui pan
1857c030f2e4Sxinhui pan INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler);
1858c030f2e4Sxinhui pan
1859c030f2e4Sxinhui pan data->aligned_element_size = ALIGN(data->element_size, 8);
1860c030f2e4Sxinhui pan /* the ring can store 64 iv entries. */
1861c030f2e4Sxinhui pan data->ring_size = 64 * data->aligned_element_size;
1862c030f2e4Sxinhui pan data->ring = kmalloc(data->ring_size, GFP_KERNEL);
1863c030f2e4Sxinhui pan if (!data->ring) {
1864c030f2e4Sxinhui pan put_obj(obj);
1865c030f2e4Sxinhui pan return -ENOMEM;
1866c030f2e4Sxinhui pan }
1867c030f2e4Sxinhui pan
1868c030f2e4Sxinhui pan /* IH is ready */
1869c030f2e4Sxinhui pan data->inuse = 1;
1870c030f2e4Sxinhui pan
1871c030f2e4Sxinhui pan return 0;
1872c030f2e4Sxinhui pan }
1873c030f2e4Sxinhui pan
amdgpu_ras_interrupt_remove_all(struct amdgpu_device * adev)1874c030f2e4Sxinhui pan static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
1875c030f2e4Sxinhui pan {
1876c030f2e4Sxinhui pan struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1877c030f2e4Sxinhui pan struct ras_manager *obj, *tmp;
1878c030f2e4Sxinhui pan
1879c030f2e4Sxinhui pan list_for_each_entry_safe(obj, tmp, &con->head, node) {
18809252d33dSyipechai amdgpu_ras_interrupt_remove_handler(adev, &obj->head);
1881c030f2e4Sxinhui pan }
1882c030f2e4Sxinhui pan
1883c030f2e4Sxinhui pan return 0;
1884c030f2e4Sxinhui pan }
1885c030f2e4Sxinhui pan /* ih end */
1886c030f2e4Sxinhui pan
1887313c8fd3SGuchun Chen /* traversal all IPs except NBIO to query error counter */
amdgpu_ras_log_on_err_counter(struct amdgpu_device * adev)1888313c8fd3SGuchun Chen static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
1889313c8fd3SGuchun Chen {
1890313c8fd3SGuchun Chen struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1891313c8fd3SGuchun Chen struct ras_manager *obj;
1892313c8fd3SGuchun Chen
18938ab0d6f0SLuben Tuikov if (!adev->ras_enabled || !con)
1894313c8fd3SGuchun Chen return;
1895313c8fd3SGuchun Chen
1896313c8fd3SGuchun Chen list_for_each_entry(obj, &con->head, node) {
1897313c8fd3SGuchun Chen struct ras_query_if info = {
1898313c8fd3SGuchun Chen .head = obj->head,
1899313c8fd3SGuchun Chen };
1900313c8fd3SGuchun Chen
1901313c8fd3SGuchun Chen /*
1902313c8fd3SGuchun Chen * PCIE_BIF IP has one different isr by ras controller
1903313c8fd3SGuchun Chen * interrupt, the specific ras counter query will be
1904313c8fd3SGuchun Chen * done in that isr. So skip such block from common
1905313c8fd3SGuchun Chen * sync flood interrupt isr calling.
1906313c8fd3SGuchun Chen */
1907313c8fd3SGuchun Chen if (info.head.block == AMDGPU_RAS_BLOCK__PCIE_BIF)
1908313c8fd3SGuchun Chen continue;
1909313c8fd3SGuchun Chen
1910cf63b702SStanley.Yang /*
1911cf63b702SStanley.Yang * this is a workaround for aldebaran, skip send msg to
1912cf63b702SStanley.Yang * smu to get ecc_info table due to smu handle get ecc
1913cf63b702SStanley.Yang * info table failed temporarily.
1914cf63b702SStanley.Yang * should be removed until smu fix handle ecc_info table.
1915cf63b702SStanley.Yang */
1916cf63b702SStanley.Yang if ((info.head.block == AMDGPU_RAS_BLOCK__UMC) &&
1917cf63b702SStanley.Yang (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2)))
1918cf63b702SStanley.Yang continue;
1919cf63b702SStanley.Yang
1920761d86d3SDennis Li amdgpu_ras_query_error_status(adev, &info);
19212a460963SCandice Li
19222a460963SCandice Li if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
19236da15a23SCandice Li adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4) &&
19246da15a23SCandice Li adev->ip_versions[MP0_HWIP][0] != IP_VERSION(13, 0, 0)) {
19252a460963SCandice Li if (amdgpu_ras_reset_error_status(adev, info.head.block))
19262a460963SCandice Li dev_warn(adev->dev, "Failed to reset error counter and error status");
19272a460963SCandice Li }
1928313c8fd3SGuchun Chen }
1929313c8fd3SGuchun Chen }
1930313c8fd3SGuchun Chen
19313f975d0fSStanley.Yang /* Parse RdRspStatus and WrRspStatus */
amdgpu_ras_error_status_query(struct amdgpu_device * adev,struct ras_query_if * info)1932cd92df93SLee Jones static void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
19333f975d0fSStanley.Yang struct ras_query_if *info)
19343f975d0fSStanley.Yang {
19358eb53bb2Syipechai struct amdgpu_ras_block_object *block_obj;
19363f975d0fSStanley.Yang /*
19373f975d0fSStanley.Yang * Only two block need to query read/write
19383f975d0fSStanley.Yang * RspStatus at current state
19393f975d0fSStanley.Yang */
19405e67bba3Syipechai if ((info->head.block != AMDGPU_RAS_BLOCK__GFX) &&
19415e67bba3Syipechai (info->head.block != AMDGPU_RAS_BLOCK__MMHUB))
19425e67bba3Syipechai return;
19435e67bba3Syipechai
1944b6efdb02Syipechai block_obj = amdgpu_ras_get_ras_block(adev,
1945b6efdb02Syipechai info->head.block,
1946b6efdb02Syipechai info->head.sub_block_index);
1947b6efdb02Syipechai
19488b0fb0e9Syipechai if (!block_obj || !block_obj->hw_ops) {
1949afa37315SLuben Tuikov dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
1950b6efdb02Syipechai get_ras_block_str(&info->head));
19518b0fb0e9Syipechai return;
19523f975d0fSStanley.Yang }
19538b0fb0e9Syipechai
19548b0fb0e9Syipechai if (block_obj->hw_ops->query_ras_error_status)
19558b0fb0e9Syipechai block_obj->hw_ops->query_ras_error_status(adev);
19565e67bba3Syipechai
19573f975d0fSStanley.Yang }
19583f975d0fSStanley.Yang
amdgpu_ras_query_err_status(struct amdgpu_device * adev)19593f975d0fSStanley.Yang static void amdgpu_ras_query_err_status(struct amdgpu_device *adev)
19603f975d0fSStanley.Yang {
19613f975d0fSStanley.Yang struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
19623f975d0fSStanley.Yang struct ras_manager *obj;
19633f975d0fSStanley.Yang
19648ab0d6f0SLuben Tuikov if (!adev->ras_enabled || !con)
19653f975d0fSStanley.Yang return;
19663f975d0fSStanley.Yang
19673f975d0fSStanley.Yang list_for_each_entry(obj, &con->head, node) {
19683f975d0fSStanley.Yang struct ras_query_if info = {
19693f975d0fSStanley.Yang .head = obj->head,
19703f975d0fSStanley.Yang };
19713f975d0fSStanley.Yang
19723f975d0fSStanley.Yang amdgpu_ras_error_status_query(adev, &info);
19733f975d0fSStanley.Yang }
19743f975d0fSStanley.Yang }
19753f975d0fSStanley.Yang
1976c030f2e4Sxinhui pan /* recovery begin */
1977466b1793Sxinhui pan
1978466b1793Sxinhui pan /* return 0 on success.
1979466b1793Sxinhui pan * caller need free bps.
1980466b1793Sxinhui pan */
amdgpu_ras_badpages_read(struct amdgpu_device * adev,struct ras_badpage ** bps,unsigned int * count)1981466b1793Sxinhui pan static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
1982466b1793Sxinhui pan struct ras_badpage **bps, unsigned int *count)
1983466b1793Sxinhui pan {
1984466b1793Sxinhui pan struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1985466b1793Sxinhui pan struct ras_err_handler_data *data;
1986466b1793Sxinhui pan int i = 0;
1987732f2a30SDennis Li int ret = 0, status;
1988466b1793Sxinhui pan
1989466b1793Sxinhui pan if (!con || !con->eh_data || !bps || !count)
1990466b1793Sxinhui pan return -EINVAL;
1991466b1793Sxinhui pan
1992466b1793Sxinhui pan mutex_lock(&con->recovery_lock);
1993466b1793Sxinhui pan data = con->eh_data;
1994466b1793Sxinhui pan if (!data || data->count == 0) {
1995466b1793Sxinhui pan *bps = NULL;
199646cf2fecSGuchun Chen ret = -EINVAL;
1997466b1793Sxinhui pan goto out;
1998466b1793Sxinhui pan }
1999466b1793Sxinhui pan
2000466b1793Sxinhui pan *bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL);
2001466b1793Sxinhui pan if (!*bps) {
2002466b1793Sxinhui pan ret = -ENOMEM;
2003466b1793Sxinhui pan goto out;
2004466b1793Sxinhui pan }
2005466b1793Sxinhui pan
2006466b1793Sxinhui pan for (; i < data->count; i++) {
2007466b1793Sxinhui pan (*bps)[i] = (struct ras_badpage){
20089dc23a63STao Zhou .bp = data->bps[i].retired_page,
2009466b1793Sxinhui pan .size = AMDGPU_GPU_PAGE_SIZE,
201052dd95f2SGuchun Chen .flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
2011466b1793Sxinhui pan };
2012ec6aae97SNirmoy Das status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr,
2013676deb38SDennis Li data->bps[i].retired_page);
2014732f2a30SDennis Li if (status == -EBUSY)
201552dd95f2SGuchun Chen (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
2016732f2a30SDennis Li else if (status == -ENOENT)
201752dd95f2SGuchun Chen (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT;
2018466b1793Sxinhui pan }
2019466b1793Sxinhui pan
2020466b1793Sxinhui pan *count = data->count;
2021466b1793Sxinhui pan out:
2022466b1793Sxinhui pan mutex_unlock(&con->recovery_lock);
2023466b1793Sxinhui pan return ret;
2024466b1793Sxinhui pan }
2025466b1793Sxinhui pan
amdgpu_ras_do_recovery(struct work_struct * work)2026c030f2e4Sxinhui pan static void amdgpu_ras_do_recovery(struct work_struct *work)
2027c030f2e4Sxinhui pan {
2028c030f2e4Sxinhui pan struct amdgpu_ras *ras =
2029c030f2e4Sxinhui pan container_of(work, struct amdgpu_ras, recovery_work);
2030b3dbd6d3SJohn Clements struct amdgpu_device *remote_adev = NULL;
2031b3dbd6d3SJohn Clements struct amdgpu_device *adev = ras->adev;
2032b3dbd6d3SJohn Clements struct list_head device_list, *device_list_handle = NULL;
2033c030f2e4Sxinhui pan
2034f75e94d8SGuchun Chen if (!ras->disable_ras_err_cnt_harvest) {
2035d95e8e97SDennis Li struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2036d95e8e97SDennis Li
2037b3dbd6d3SJohn Clements /* Build list of devices to query RAS related errors */
2038f75e94d8SGuchun Chen if (hive && adev->gmc.xgmi.num_physical_nodes > 1) {
2039b3dbd6d3SJohn Clements device_list_handle = &hive->device_list;
2040f75e94d8SGuchun Chen } else {
204112c17b9dSGuchun Chen INIT_LIST_HEAD(&device_list);
2042b3dbd6d3SJohn Clements list_add_tail(&adev->gmc.xgmi.head, &device_list);
2043b3dbd6d3SJohn Clements device_list_handle = &device_list;
2044b3dbd6d3SJohn Clements }
2045b3dbd6d3SJohn Clements
2046f75e94d8SGuchun Chen list_for_each_entry(remote_adev,
20473f975d0fSStanley.Yang device_list_handle, gmc.xgmi.head) {
20483f975d0fSStanley.Yang amdgpu_ras_query_err_status(remote_adev);
2049b3dbd6d3SJohn Clements amdgpu_ras_log_on_err_counter(remote_adev);
20503f975d0fSStanley.Yang }
2051d95e8e97SDennis Li
2052d95e8e97SDennis Li amdgpu_put_xgmi_hive(hive);
2053b3dbd6d3SJohn Clements }
2054313c8fd3SGuchun Chen
2055f1549c09SLikun Gao if (amdgpu_device_should_recover_gpu(ras->adev)) {
2056f1549c09SLikun Gao struct amdgpu_reset_context reset_context;
2057f1549c09SLikun Gao memset(&reset_context, 0, sizeof(reset_context));
2058f1549c09SLikun Gao
2059f1549c09SLikun Gao reset_context.method = AMD_RESET_METHOD_NONE;
2060f1549c09SLikun Gao reset_context.reset_req_dev = adev;
20611a11a65dSYiPeng Chai
20621a11a65dSYiPeng Chai /* Perform full reset in fatal error mode */
20631a11a65dSYiPeng Chai if (!amdgpu_ras_is_poison_mode_supported(ras->adev))
20641a11a65dSYiPeng Chai set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
20656c47a79bSYiPeng Chai else {
2066f1549c09SLikun Gao clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
2067f1549c09SLikun Gao
20686c47a79bSYiPeng Chai if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) {
20696c47a79bSYiPeng Chai ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE2_RESET;
20706c47a79bSYiPeng Chai reset_context.method = AMD_RESET_METHOD_MODE2;
20716c47a79bSYiPeng Chai }
20722c7cd280SYiPeng Chai
20732c7cd280SYiPeng Chai /* Fatal error occurs in poison mode, mode1 reset is used to
20742c7cd280SYiPeng Chai * recover gpu.
20752c7cd280SYiPeng Chai */
20762c7cd280SYiPeng Chai if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) {
20772c7cd280SYiPeng Chai ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET;
20782c7cd280SYiPeng Chai set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
20791b98a5f8SYiPeng Chai
20801b98a5f8SYiPeng Chai psp_fatal_error_recovery_quirk(&adev->psp);
20812c7cd280SYiPeng Chai }
20826c47a79bSYiPeng Chai }
20836c47a79bSYiPeng Chai
2084f1549c09SLikun Gao amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
2085f1549c09SLikun Gao }
2086c030f2e4Sxinhui pan atomic_set(&ras->in_recovery, 0);
2087c030f2e4Sxinhui pan }
2088c030f2e4Sxinhui pan
2089c030f2e4Sxinhui pan /* alloc/realloc bps array */
amdgpu_ras_realloc_eh_data_space(struct amdgpu_device * adev,struct ras_err_handler_data * data,int pages)2090c030f2e4Sxinhui pan static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
2091c030f2e4Sxinhui pan struct ras_err_handler_data *data, int pages)
2092c030f2e4Sxinhui pan {
2093c030f2e4Sxinhui pan unsigned int old_space = data->count + data->space_left;
2094c030f2e4Sxinhui pan unsigned int new_space = old_space + pages;
20959dc23a63STao Zhou unsigned int align_space = ALIGN(new_space, 512);
20969dc23a63STao Zhou void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
2097c030f2e4Sxinhui pan
2098676deb38SDennis Li if (!bps) {
2099c030f2e4Sxinhui pan return -ENOMEM;
21009dc23a63STao Zhou }
2101c030f2e4Sxinhui pan
2102c030f2e4Sxinhui pan if (data->bps) {
21039dc23a63STao Zhou memcpy(bps, data->bps,
2104c030f2e4Sxinhui pan data->count * sizeof(*data->bps));
2105c030f2e4Sxinhui pan kfree(data->bps);
2106c030f2e4Sxinhui pan }
2107c030f2e4Sxinhui pan
21089dc23a63STao Zhou data->bps = bps;
2109c030f2e4Sxinhui pan data->space_left += align_space - old_space;
2110c030f2e4Sxinhui pan return 0;
2111c030f2e4Sxinhui pan }
2112c030f2e4Sxinhui pan
2113c030f2e4Sxinhui pan /* it deal with vram only. */
amdgpu_ras_add_bad_pages(struct amdgpu_device * adev,struct eeprom_table_record * bps,int pages)2114c030f2e4Sxinhui pan int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
21159dc23a63STao Zhou struct eeprom_table_record *bps, int pages)
2116c030f2e4Sxinhui pan {
2117c030f2e4Sxinhui pan struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
211873aa8e1aSxinhui pan struct ras_err_handler_data *data;
2119c030f2e4Sxinhui pan int ret = 0;
2120676deb38SDennis Li uint32_t i;
2121c030f2e4Sxinhui pan
212273aa8e1aSxinhui pan if (!con || !con->eh_data || !bps || pages <= 0)
2123c030f2e4Sxinhui pan return 0;
2124c030f2e4Sxinhui pan
2125c030f2e4Sxinhui pan mutex_lock(&con->recovery_lock);
212673aa8e1aSxinhui pan data = con->eh_data;
2127c030f2e4Sxinhui pan if (!data)
2128c030f2e4Sxinhui pan goto out;
2129c030f2e4Sxinhui pan
2130676deb38SDennis Li for (i = 0; i < pages; i++) {
2131676deb38SDennis Li if (amdgpu_ras_check_bad_page_unlock(con,
2132676deb38SDennis Li bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
2133676deb38SDennis Li continue;
2134676deb38SDennis Li
2135676deb38SDennis Li if (!data->space_left &&
2136676deb38SDennis Li amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
2137c030f2e4Sxinhui pan ret = -ENOMEM;
2138c030f2e4Sxinhui pan goto out;
2139c030f2e4Sxinhui pan }
2140c030f2e4Sxinhui pan
2141ec6aae97SNirmoy Das amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr,
2142676deb38SDennis Li bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT,
2143676deb38SDennis Li AMDGPU_GPU_PAGE_SIZE);
21449dc23a63STao Zhou
2145676deb38SDennis Li memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps));
2146676deb38SDennis Li data->count++;
2147676deb38SDennis Li data->space_left--;
2148676deb38SDennis Li }
2149c030f2e4Sxinhui pan out:
2150c030f2e4Sxinhui pan mutex_unlock(&con->recovery_lock);
2151c030f2e4Sxinhui pan
2152c030f2e4Sxinhui pan return ret;
2153c030f2e4Sxinhui pan }
2154c030f2e4Sxinhui pan
215578ad00c9STao Zhou /*
215678ad00c9STao Zhou * write error record array to eeprom, the function should be
215778ad00c9STao Zhou * protected by recovery_lock
21584d33e0f1STao Zhou * new_cnt: new added UE count, excluding reserved bad pages, can be NULL
215978ad00c9STao Zhou */
amdgpu_ras_save_bad_pages(struct amdgpu_device * adev,unsigned long * new_cnt)21604d33e0f1STao Zhou int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
21614d33e0f1STao Zhou unsigned long *new_cnt)
216278ad00c9STao Zhou {
216378ad00c9STao Zhou struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
216478ad00c9STao Zhou struct ras_err_handler_data *data;
21658a3e801fSGuchun Chen struct amdgpu_ras_eeprom_control *control;
216678ad00c9STao Zhou int save_count;
216778ad00c9STao Zhou
21684d33e0f1STao Zhou if (!con || !con->eh_data) {
21694d33e0f1STao Zhou if (new_cnt)
21704d33e0f1STao Zhou *new_cnt = 0;
21714d33e0f1STao Zhou
217278ad00c9STao Zhou return 0;
21734d33e0f1STao Zhou }
217478ad00c9STao Zhou
2175d9a69fe5SCandice Li mutex_lock(&con->recovery_lock);
21768a3e801fSGuchun Chen control = &con->eeprom_control;
217778ad00c9STao Zhou data = con->eh_data;
21780686627bSLuben Tuikov save_count = data->count - control->ras_num_recs;
2179d9a69fe5SCandice Li mutex_unlock(&con->recovery_lock);
21804d33e0f1STao Zhou
21814d33e0f1STao Zhou if (new_cnt)
21824d33e0f1STao Zhou *new_cnt = save_count / adev->umc.retire_unit;
21834d33e0f1STao Zhou
218478ad00c9STao Zhou /* only new entries are saved */
2185b1628425SGuchun Chen if (save_count > 0) {
218663d4c081SLuben Tuikov if (amdgpu_ras_eeprom_append(control,
21870686627bSLuben Tuikov &data->bps[control->ras_num_recs],
21881fab841fSLuben Tuikov save_count)) {
21896952e99cSGuchun Chen dev_err(adev->dev, "Failed to save EEPROM table data!");
219078ad00c9STao Zhou return -EIO;
219178ad00c9STao Zhou }
219278ad00c9STao Zhou
2193b1628425SGuchun Chen dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count);
2194b1628425SGuchun Chen }
2195b1628425SGuchun Chen
219678ad00c9STao Zhou return 0;
219778ad00c9STao Zhou }
219878ad00c9STao Zhou
219978ad00c9STao Zhou /*
220078ad00c9STao Zhou * read error record array in eeprom and reserve enough space for
220178ad00c9STao Zhou * storing new bad pages
220278ad00c9STao Zhou */
amdgpu_ras_load_bad_pages(struct amdgpu_device * adev)220378ad00c9STao Zhou static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
220478ad00c9STao Zhou {
220578ad00c9STao Zhou struct amdgpu_ras_eeprom_control *control =
22066457205cSCandice Li &adev->psp.ras_context.ras->eeprom_control;
2207e4e6a589SLuben Tuikov struct eeprom_table_record *bps;
2208e4e6a589SLuben Tuikov int ret;
220978ad00c9STao Zhou
221078ad00c9STao Zhou /* no bad page record, skip eeprom access */
22110686627bSLuben Tuikov if (control->ras_num_recs == 0 || amdgpu_bad_page_threshold == 0)
2212e4e6a589SLuben Tuikov return 0;
221378ad00c9STao Zhou
22140686627bSLuben Tuikov bps = kcalloc(control->ras_num_recs, sizeof(*bps), GFP_KERNEL);
221578ad00c9STao Zhou if (!bps)
221678ad00c9STao Zhou return -ENOMEM;
221778ad00c9STao Zhou
22180686627bSLuben Tuikov ret = amdgpu_ras_eeprom_read(control, bps, control->ras_num_recs);
2219e4e6a589SLuben Tuikov if (ret)
22206952e99cSGuchun Chen dev_err(adev->dev, "Failed to load EEPROM table records!");
2221e4e6a589SLuben Tuikov else
22220686627bSLuben Tuikov ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs);
222378ad00c9STao Zhou
222478ad00c9STao Zhou kfree(bps);
222578ad00c9STao Zhou return ret;
222678ad00c9STao Zhou }
222778ad00c9STao Zhou
amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras * con,uint64_t addr)2228676deb38SDennis Li static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
2229676deb38SDennis Li uint64_t addr)
2230676deb38SDennis Li {
2231676deb38SDennis Li struct ras_err_handler_data *data = con->eh_data;
2232676deb38SDennis Li int i;
2233676deb38SDennis Li
2234676deb38SDennis Li addr >>= AMDGPU_GPU_PAGE_SHIFT;
2235676deb38SDennis Li for (i = 0; i < data->count; i++)
2236676deb38SDennis Li if (addr == data->bps[i].retired_page)
2237676deb38SDennis Li return true;
2238676deb38SDennis Li
2239676deb38SDennis Li return false;
2240676deb38SDennis Li }
2241676deb38SDennis Li
22426e4be987STao Zhou /*
22436e4be987STao Zhou * check if an address belongs to bad page
22446e4be987STao Zhou *
22456e4be987STao Zhou * Note: this check is only for umc block
22466e4be987STao Zhou */
amdgpu_ras_check_bad_page(struct amdgpu_device * adev,uint64_t addr)22476e4be987STao Zhou static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
22486e4be987STao Zhou uint64_t addr)
22496e4be987STao Zhou {
22506e4be987STao Zhou struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
22516e4be987STao Zhou bool ret = false;
22526e4be987STao Zhou
22536e4be987STao Zhou if (!con || !con->eh_data)
22546e4be987STao Zhou return ret;
22556e4be987STao Zhou
22566e4be987STao Zhou mutex_lock(&con->recovery_lock);
2257676deb38SDennis Li ret = amdgpu_ras_check_bad_page_unlock(con, addr);
22586e4be987STao Zhou mutex_unlock(&con->recovery_lock);
22596e4be987STao Zhou return ret;
22606e4be987STao Zhou }
22616e4be987STao Zhou
/* Derive con->bad_page_cnt_threshold from the module parameter.
 *
 * Policy for amdgpu_bad_page_threshold:
 * - negative (e.g. -2): compute a typical value from VRAM size
 *   (VRAM / RAS_BAD_PAGE_COVER), capped at @max_count;
 * - 0 < value < max record length: use the user value directly;
 * - 0: bad page retirement is disabled and the threshold is unused.
 */
static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
		uint32_t max_count)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (amdgpu_bad_page_threshold < 0) {
		u64 val = adev->gmc.mc_vram_size;

		do_div(val, RAS_BAD_PAGE_COVER);
		con->bad_page_cnt_threshold = min(lower_32_bits(val),
						  max_count);
	} else {
		con->bad_page_cnt_threshold = min_t(int, max_count,
						    amdgpu_bad_page_threshold);
	}
}
2298c84d4670SGuchun Chen
amdgpu_ras_recovery_init(struct amdgpu_device * adev)22991a6fc071STao Zhou int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
2300c030f2e4Sxinhui pan {
2301c030f2e4Sxinhui pan struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
23024d1337d2SAndrey Grodzovsky struct ras_err_handler_data **data;
2303e4e6a589SLuben Tuikov u32 max_eeprom_records_count = 0;
2304b82e65a9SGuchun Chen bool exc_err_limit = false;
230578ad00c9STao Zhou int ret;
2306c030f2e4Sxinhui pan
2307e0e146d5SStanley.Yang if (!con || amdgpu_sriov_vf(adev))
23084d1337d2SAndrey Grodzovsky return 0;
23094d1337d2SAndrey Grodzovsky
23101d9d2ca8SLuben Tuikov /* Allow access to RAS EEPROM via debugfs, when the ASIC
23111d9d2ca8SLuben Tuikov * supports RAS and debugfs is enabled, but when
23121d9d2ca8SLuben Tuikov * adev->ras_enabled is unset, i.e. when "ras_enable"
23131d9d2ca8SLuben Tuikov * module parameter is set to 0.
23141d9d2ca8SLuben Tuikov */
23151d9d2ca8SLuben Tuikov con->adev = adev;
23161d9d2ca8SLuben Tuikov
23171d9d2ca8SLuben Tuikov if (!adev->ras_enabled)
23181d9d2ca8SLuben Tuikov return 0;
23191d9d2ca8SLuben Tuikov
23201d9d2ca8SLuben Tuikov data = &con->eh_data;
23211a6fc071STao Zhou *data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
23221a6fc071STao Zhou if (!*data) {
23231a6fc071STao Zhou ret = -ENOMEM;
23241a6fc071STao Zhou goto out;
23251a6fc071STao Zhou }
2326c030f2e4Sxinhui pan
2327c030f2e4Sxinhui pan mutex_init(&con->recovery_lock);
2328c030f2e4Sxinhui pan INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
2329c030f2e4Sxinhui pan atomic_set(&con->in_recovery, 0);
233069691c82SStanley.Yang con->eeprom_control.bad_channel_bitmap = 0;
2331c030f2e4Sxinhui pan
23327f599fedSStanley.Yang max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(&con->eeprom_control);
2333e4e6a589SLuben Tuikov amdgpu_ras_validate_threshold(adev, max_eeprom_records_count);
2334c84d4670SGuchun Chen
2335e5086659Sshaoyunl /* Todo: During test the SMU might fail to read the eeprom through I2C
2336e5086659Sshaoyunl * when the GPU is pending on XGMI reset during probe time
2337e5086659Sshaoyunl * (Mostly after second bus reset), skip it now
2338e5086659Sshaoyunl */
2339e5086659Sshaoyunl if (adev->gmc.xgmi.pending_reset)
2340e5086659Sshaoyunl return 0;
2341b82e65a9SGuchun Chen ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
2342b82e65a9SGuchun Chen /*
2343b82e65a9SGuchun Chen * This calling fails when exc_err_limit is true or
2344b82e65a9SGuchun Chen * ret != 0.
2345b82e65a9SGuchun Chen */
2346b82e65a9SGuchun Chen if (exc_err_limit || ret)
23471a6fc071STao Zhou goto free;
234878ad00c9STao Zhou
23490686627bSLuben Tuikov if (con->eeprom_control.ras_num_recs) {
235078ad00c9STao Zhou ret = amdgpu_ras_load_bad_pages(adev);
235178ad00c9STao Zhou if (ret)
23521a6fc071STao Zhou goto free;
2353513befa6SStanley.Yang
2354bc143d8bSEvan Quan amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
235569691c82SStanley.Yang
235669691c82SStanley.Yang if (con->update_channel_flag == true) {
235769691c82SStanley.Yang amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
235869691c82SStanley.Yang con->update_channel_flag = false;
235969691c82SStanley.Yang }
236078ad00c9STao Zhou }
2361c030f2e4Sxinhui pan
236212b2cab7SMukul Joshi #ifdef CONFIG_X86_MCE_AMD
236312b2cab7SMukul Joshi if ((adev->asic_type == CHIP_ALDEBARAN) &&
236412b2cab7SMukul Joshi (adev->gmc.xgmi.connected_to_cpu))
236591a1a52dSMukul Joshi amdgpu_register_bad_pages_mca_notifier(adev);
236612b2cab7SMukul Joshi #endif
2367c030f2e4Sxinhui pan return 0;
23681a6fc071STao Zhou
23691a6fc071STao Zhou free:
23701a6fc071STao Zhou kfree((*data)->bps);
23711a6fc071STao Zhou kfree(*data);
23721995b3a3SFelix Kuehling con->eh_data = NULL;
23731a6fc071STao Zhou out:
2374cf696091SLuben Tuikov dev_warn(adev->dev, "Failed to initialize ras recovery! (%d)\n", ret);
23751a6fc071STao Zhou
2376b82e65a9SGuchun Chen /*
2377b82e65a9SGuchun Chen * Except error threshold exceeding case, other failure cases in this
2378b82e65a9SGuchun Chen * function would not fail amdgpu driver init.
2379b82e65a9SGuchun Chen */
2380b82e65a9SGuchun Chen if (!exc_err_limit)
2381b82e65a9SGuchun Chen ret = 0;
2382b82e65a9SGuchun Chen else
2383b82e65a9SGuchun Chen ret = -EINVAL;
2384b82e65a9SGuchun Chen
23851a6fc071STao Zhou return ret;
2386c030f2e4Sxinhui pan }
2387c030f2e4Sxinhui pan
amdgpu_ras_recovery_fini(struct amdgpu_device * adev)2388c030f2e4Sxinhui pan static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
2389c030f2e4Sxinhui pan {
2390c030f2e4Sxinhui pan struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2391c030f2e4Sxinhui pan struct ras_err_handler_data *data = con->eh_data;
2392c030f2e4Sxinhui pan
23931a6fc071STao Zhou /* recovery_init failed to init it, fini is useless */
23941a6fc071STao Zhou if (!data)
23951a6fc071STao Zhou return 0;
23961a6fc071STao Zhou
2397c030f2e4Sxinhui pan cancel_work_sync(&con->recovery_work);
2398c030f2e4Sxinhui pan
2399c030f2e4Sxinhui pan mutex_lock(&con->recovery_lock);
2400c030f2e4Sxinhui pan con->eh_data = NULL;
2401c030f2e4Sxinhui pan kfree(data->bps);
2402c030f2e4Sxinhui pan kfree(data);
2403c030f2e4Sxinhui pan mutex_unlock(&con->recovery_lock);
2404c030f2e4Sxinhui pan
2405c030f2e4Sxinhui pan return 0;
2406c030f2e4Sxinhui pan }
2407c030f2e4Sxinhui pan /* recovery end */
2408c030f2e4Sxinhui pan
amdgpu_ras_asic_supported(struct amdgpu_device * adev)2409084e2640SLuben Tuikov static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)
24105436ab94SStanley.Yang {
241182835055SYiPeng Chai if (amdgpu_sriov_vf(adev)) {
241282835055SYiPeng Chai switch (adev->ip_versions[MP0_HWIP][0]) {
241382835055SYiPeng Chai case IP_VERSION(13, 0, 2):
2414e81c4556SYiPeng Chai case IP_VERSION(13, 0, 6):
241582835055SYiPeng Chai return true;
241682835055SYiPeng Chai default:
241782835055SYiPeng Chai return false;
241882835055SYiPeng Chai }
241982835055SYiPeng Chai }
242082835055SYiPeng Chai
2421073285efSYiPeng Chai if (adev->asic_type == CHIP_IP_DISCOVERY) {
2422073285efSYiPeng Chai switch (adev->ip_versions[MP0_HWIP][0]) {
2423073285efSYiPeng Chai case IP_VERSION(13, 0, 0):
2424cb906ce3SStanley.Yang case IP_VERSION(13, 0, 6):
2425073285efSYiPeng Chai case IP_VERSION(13, 0, 10):
2426073285efSYiPeng Chai return true;
2427073285efSYiPeng Chai default:
2428073285efSYiPeng Chai return false;
2429073285efSYiPeng Chai }
2430073285efSYiPeng Chai }
2431073285efSYiPeng Chai
2432084e2640SLuben Tuikov return adev->asic_type == CHIP_VEGA10 ||
2433084e2640SLuben Tuikov adev->asic_type == CHIP_VEGA20 ||
2434084e2640SLuben Tuikov adev->asic_type == CHIP_ARCTURUS ||
243575f06251SHawking Zhang adev->asic_type == CHIP_ALDEBARAN ||
2436084e2640SLuben Tuikov adev->asic_type == CHIP_SIENNA_CICHLID;
24375436ab94SStanley.Yang }
24385436ab94SStanley.Yang
24395caf466aSxinhui pan /*
2440f50160cfSStanley.Yang * this is workaround for vega20 workstation sku,
2441f50160cfSStanley.Yang * force enable gfx ras, ignore vbios gfx ras flag
2442f50160cfSStanley.Yang * due to GC EDC can not write
2443f50160cfSStanley.Yang */
amdgpu_ras_get_quirks(struct amdgpu_device * adev)2444e509965eSLuben Tuikov static void amdgpu_ras_get_quirks(struct amdgpu_device *adev)
2445f50160cfSStanley.Yang {
2446f50160cfSStanley.Yang struct atom_context *ctx = adev->mode_info.atom_context;
2447f50160cfSStanley.Yang
2448f50160cfSStanley.Yang if (!ctx)
2449f50160cfSStanley.Yang return;
2450f50160cfSStanley.Yang
2451adf64e21SMario Limonciello if (strnstr(ctx->vbios_pn, "D16406",
2452adf64e21SMario Limonciello sizeof(ctx->vbios_pn)) ||
2453adf64e21SMario Limonciello strnstr(ctx->vbios_pn, "D36002",
2454adf64e21SMario Limonciello sizeof(ctx->vbios_pn)))
24558ab0d6f0SLuben Tuikov adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX);
2456f50160cfSStanley.Yang }
2457f50160cfSStanley.Yang
/*
 * Check the hardware's ras ability, which will be saved in ras_hw_enabled.
 * If hardware does not support ras, we can skip some ras initialization and
 * forbid some ras operations from IP.
 * If software itself (say, a boot parameter) limits the ras ability, we still
 * need to allow IP to do some limited operations, like disable. In such a
 * case, we have to initialize ras as normal, but need to check whether the
 * operation is allowed or not in each function.
 */
static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
{
	/* start from a clean slate: no blocks supported, none enabled */
	adev->ras_hw_enabled = adev->ras_enabled = 0;

	if (!amdgpu_ras_asic_supported(adev))
		return;

	/* dGPU path (not CPU-connected via XGMI, not an APP APU):
	 * capability comes from atomfirmware ECC flags */
	if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
		if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
			dev_info(adev->dev, "MEM ECC is active.\n");
			/* memory ECC covers UMC and DF blocks */
			adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC |
						 1 << AMDGPU_RAS_BLOCK__DF);
		} else {
			dev_info(adev->dev, "MEM ECC is not presented.\n");
		}

		if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
			dev_info(adev->dev, "SRAM ECC is active.\n");
			/* bare metal: SRAM ECC covers everything except
			 * UMC/DF; SRIOV guest: only a fixed subset */
			if (!amdgpu_sriov_vf(adev))
				adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
							  1 << AMDGPU_RAS_BLOCK__DF);
			else
				adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF |
							 1 << AMDGPU_RAS_BLOCK__SDMA |
							 1 << AMDGPU_RAS_BLOCK__GFX);

			/* VCN/JPEG RAS can be supported on both bare metal and
			 * SRIOV environment
			 */
			if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, 6, 0) ||
			    adev->ip_versions[VCN_HWIP][0] == IP_VERSION(4, 0, 0))
				adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
							 1 << AMDGPU_RAS_BLOCK__JPEG);
			else
				adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN |
							  1 << AMDGPU_RAS_BLOCK__JPEG);

			/*
			 * XGMI RAS is not supported if xgmi num physical nodes
			 * is zero
			 */
			if (!adev->gmc.xgmi.num_physical_nodes)
				adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__XGMI_WAFL);
		} else {
			dev_info(adev->dev, "SRAM ECC is not presented.\n");
		}
	} else {
		/* driver only manages a few IP blocks RAS feature
		 * when GPU is connected to cpu through XGMI */
		adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX |
					 1 << AMDGPU_RAS_BLOCK__SDMA |
					 1 << AMDGPU_RAS_BLOCK__MMHUB);
	}

	amdgpu_ras_get_quirks(adev);

	/* hw_supported needs to be aligned with RAS block mask. */
	adev->ras_hw_enabled &= AMDGPU_RAS_BLOCK_MASK;

	/*
	 * Disable ras feature for aqua vanjaram
	 * by default on apu platform.
	 * (there, ras is opt-in: amdgpu_ras_enable must be exactly 1;
	 * elsewhere it is opt-out: anything but 0 keeps ras on)
	 */
	if (adev->ip_versions[MP0_HWIP][0] == IP_VERSION(13, 0, 6) &&
	    adev->gmc.is_app_apu)
		adev->ras_enabled = amdgpu_ras_enable != 1 ? 0 :
			adev->ras_hw_enabled & amdgpu_ras_mask;
	else
		adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 :
			adev->ras_hw_enabled & amdgpu_ras_mask;
}
2539c030f2e4Sxinhui pan
amdgpu_ras_counte_dw(struct work_struct * work)254005adfd80SLuben Tuikov static void amdgpu_ras_counte_dw(struct work_struct *work)
254105adfd80SLuben Tuikov {
254205adfd80SLuben Tuikov struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,
254305adfd80SLuben Tuikov ras_counte_delay_work.work);
254405adfd80SLuben Tuikov struct amdgpu_device *adev = con->adev;
2545a3fbb0d8SGuchun Chen struct drm_device *dev = adev_to_drm(adev);
254605adfd80SLuben Tuikov unsigned long ce_count, ue_count;
254705adfd80SLuben Tuikov int res;
254805adfd80SLuben Tuikov
254905adfd80SLuben Tuikov res = pm_runtime_get_sync(dev->dev);
255005adfd80SLuben Tuikov if (res < 0)
255105adfd80SLuben Tuikov goto Out;
255205adfd80SLuben Tuikov
255305adfd80SLuben Tuikov /* Cache new values.
255405adfd80SLuben Tuikov */
25554a1c9a44SHawking Zhang if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, NULL) == 0) {
255605adfd80SLuben Tuikov atomic_set(&con->ras_ce_count, ce_count);
255705adfd80SLuben Tuikov atomic_set(&con->ras_ue_count, ue_count);
25584d9f771eSLuben Tuikov }
255905adfd80SLuben Tuikov
256005adfd80SLuben Tuikov pm_runtime_mark_last_busy(dev->dev);
256105adfd80SLuben Tuikov Out:
256205adfd80SLuben Tuikov pm_runtime_put_autosuspend(dev->dev);
256305adfd80SLuben Tuikov }
256405adfd80SLuben Tuikov
amdgpu_ras_query_poison_mode(struct amdgpu_device * adev)25652dd9032bSTao Zhou static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev)
25662dd9032bSTao Zhou {
25672dd9032bSTao Zhou struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
25682dd9032bSTao Zhou bool df_poison, umc_poison;
25692dd9032bSTao Zhou
25702dd9032bSTao Zhou /* poison setting is useless on SRIOV guest */
25712dd9032bSTao Zhou if (amdgpu_sriov_vf(adev) || !con)
25722dd9032bSTao Zhou return;
25732dd9032bSTao Zhou
25742dd9032bSTao Zhou /* Init poison supported flag, the default value is false */
25752dd9032bSTao Zhou if (adev->gmc.xgmi.connected_to_cpu) {
25762dd9032bSTao Zhou /* enabled by default when GPU is connected to CPU */
25772dd9032bSTao Zhou con->poison_supported = true;
25782dd9032bSTao Zhou } else if (adev->df.funcs &&
25792dd9032bSTao Zhou adev->df.funcs->query_ras_poison_mode &&
25802dd9032bSTao Zhou adev->umc.ras &&
25812dd9032bSTao Zhou adev->umc.ras->query_ras_poison_mode) {
25822dd9032bSTao Zhou df_poison =
25832dd9032bSTao Zhou adev->df.funcs->query_ras_poison_mode(adev);
25842dd9032bSTao Zhou umc_poison =
25852dd9032bSTao Zhou adev->umc.ras->query_ras_poison_mode(adev);
25862dd9032bSTao Zhou
25872dd9032bSTao Zhou /* Only poison is set in both DF and UMC, we can support it */
25882dd9032bSTao Zhou if (df_poison && umc_poison)
25892dd9032bSTao Zhou con->poison_supported = true;
25902dd9032bSTao Zhou else if (df_poison != umc_poison)
25912dd9032bSTao Zhou dev_warn(adev->dev,
25922dd9032bSTao Zhou "Poison setting is inconsistent in DF/UMC(%d:%d)!\n",
25932dd9032bSTao Zhou df_poison, umc_poison);
25942dd9032bSTao Zhou }
25952dd9032bSTao Zhou }
25962dd9032bSTao Zhou
amdgpu_ras_init(struct amdgpu_device * adev)2597c030f2e4Sxinhui pan int amdgpu_ras_init(struct amdgpu_device *adev)
2598c030f2e4Sxinhui pan {
2599c030f2e4Sxinhui pan struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2600c030f2e4Sxinhui pan int r;
2601c030f2e4Sxinhui pan
2602c030f2e4Sxinhui pan if (con)
2603c030f2e4Sxinhui pan return 0;
2604c030f2e4Sxinhui pan
2605c030f2e4Sxinhui pan con = kmalloc(sizeof(struct amdgpu_ras) +
2606640ae42eSJohn Clements sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT +
2607640ae42eSJohn Clements sizeof(struct ras_manager) * AMDGPU_RAS_MCA_BLOCK_COUNT,
2608c030f2e4Sxinhui pan GFP_KERNEL|__GFP_ZERO);
2609c030f2e4Sxinhui pan if (!con)
2610c030f2e4Sxinhui pan return -ENOMEM;
2611c030f2e4Sxinhui pan
261205adfd80SLuben Tuikov con->adev = adev;
261305adfd80SLuben Tuikov INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw);
261405adfd80SLuben Tuikov atomic_set(&con->ras_ce_count, 0);
261505adfd80SLuben Tuikov atomic_set(&con->ras_ue_count, 0);
261605adfd80SLuben Tuikov
2617c030f2e4Sxinhui pan con->objs = (struct ras_manager *)(con + 1);
2618c030f2e4Sxinhui pan
2619c030f2e4Sxinhui pan amdgpu_ras_set_context(adev, con);
2620c030f2e4Sxinhui pan
2621e509965eSLuben Tuikov amdgpu_ras_check_supported(adev);
2622e509965eSLuben Tuikov
26237ddd9770SOak Zeng if (!adev->ras_enabled || adev->asic_type == CHIP_VEGA10) {
2624970fd197SStanley.Yang /* set gfx block ras context feature for VEGA20 Gaming
2625970fd197SStanley.Yang * send ras disable cmd to ras ta during ras late init.
2626970fd197SStanley.Yang */
26278ab0d6f0SLuben Tuikov if (!adev->ras_enabled && adev->asic_type == CHIP_VEGA20) {
2628970fd197SStanley.Yang con->features |= BIT(AMDGPU_RAS_BLOCK__GFX);
2629970fd197SStanley.Yang
2630970fd197SStanley.Yang return 0;
2631970fd197SStanley.Yang }
2632970fd197SStanley.Yang
26335e91160aSGuchun Chen r = 0;
26345436ab94SStanley.Yang goto release_con;
2635fb2a3607SHawking Zhang }
2636fb2a3607SHawking Zhang
263769691c82SStanley.Yang con->update_channel_flag = false;
2638c030f2e4Sxinhui pan con->features = 0;
2639c030f2e4Sxinhui pan INIT_LIST_HEAD(&con->head);
2640108c6a63Sxinhui pan /* Might need get this flag from vbios. */
2641108c6a63Sxinhui pan con->flags = RAS_DEFAULT_FLAGS;
2642c030f2e4Sxinhui pan
26436e36f231SHawking Zhang /* initialize nbio ras function ahead of any other
26446e36f231SHawking Zhang * ras functions so hardware fatal error interrupt
26456e36f231SHawking Zhang * can be enabled as early as possible */
2646fdc94d3aSHawking Zhang switch (adev->ip_versions[NBIO_HWIP][0]) {
2647fdc94d3aSHawking Zhang case IP_VERSION(7, 4, 0):
2648fdc94d3aSHawking Zhang case IP_VERSION(7, 4, 1):
2649fdc94d3aSHawking Zhang case IP_VERSION(7, 4, 4):
2650fdc94d3aSHawking Zhang if (!adev->gmc.xgmi.connected_to_cpu)
26512e54fe5dSyipechai adev->nbio.ras = &nbio_v7_4_ras;
26526e36f231SHawking Zhang break;
26539af357bcSHawking Zhang case IP_VERSION(4, 3, 0):
26549af357bcSHawking Zhang if (adev->ras_hw_enabled & (1 << AMDGPU_RAS_BLOCK__DF))
26559af357bcSHawking Zhang /* unlike other generation of nbio ras,
26569af357bcSHawking Zhang * nbio v4_3 only support fatal error interrupt
26579af357bcSHawking Zhang * to inform software that DF is freezed due to
26589af357bcSHawking Zhang * system fatal error event. driver should not
26599af357bcSHawking Zhang * enable nbio ras in such case. Instead,
26609af357bcSHawking Zhang * check DF RAS */
26619af357bcSHawking Zhang adev->nbio.ras = &nbio_v4_3_ras;
26629af357bcSHawking Zhang break;
26637692e1eeSTao Zhou case IP_VERSION(7, 9, 0):
26647692e1eeSTao Zhou if (!adev->gmc.is_app_apu)
26657692e1eeSTao Zhou adev->nbio.ras = &nbio_v7_9_ras;
26667692e1eeSTao Zhou break;
26676e36f231SHawking Zhang default:
26686e36f231SHawking Zhang /* nbio ras is not available */
26696e36f231SHawking Zhang break;
26706e36f231SHawking Zhang }
26716e36f231SHawking Zhang
2672fdc94d3aSHawking Zhang /* nbio ras block needs to be enabled ahead of other ras blocks
2673fdc94d3aSHawking Zhang * to handle fatal error */
2674fdc94d3aSHawking Zhang r = amdgpu_nbio_ras_sw_init(adev);
2675fdc94d3aSHawking Zhang if (r)
2676fdc94d3aSHawking Zhang return r;
2677fdc94d3aSHawking Zhang
26782e54fe5dSyipechai if (adev->nbio.ras &&
26792e54fe5dSyipechai adev->nbio.ras->init_ras_controller_interrupt) {
26802e54fe5dSyipechai r = adev->nbio.ras->init_ras_controller_interrupt(adev);
26814e644fffSHawking Zhang if (r)
26825436ab94SStanley.Yang goto release_con;
26834e644fffSHawking Zhang }
26844e644fffSHawking Zhang
26852e54fe5dSyipechai if (adev->nbio.ras &&
26862e54fe5dSyipechai adev->nbio.ras->init_ras_err_event_athub_interrupt) {
26872e54fe5dSyipechai r = adev->nbio.ras->init_ras_err_event_athub_interrupt(adev);
26884e644fffSHawking Zhang if (r)
26895436ab94SStanley.Yang goto release_con;
26904e644fffSHawking Zhang }
26914e644fffSHawking Zhang
26922dd9032bSTao Zhou amdgpu_ras_query_poison_mode(adev);
2693e4348849STao Zhou
26945e91160aSGuchun Chen if (amdgpu_ras_fs_init(adev)) {
26955e91160aSGuchun Chen r = -EINVAL;
26965436ab94SStanley.Yang goto release_con;
26975e91160aSGuchun Chen }
2698c030f2e4Sxinhui pan
26996952e99cSGuchun Chen dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
27005d0f903fSxinhui pan "hardware ability[%x] ras_mask[%x]\n",
27018ab0d6f0SLuben Tuikov adev->ras_hw_enabled, adev->ras_enabled);
2702e509965eSLuben Tuikov
2703c030f2e4Sxinhui pan return 0;
27045436ab94SStanley.Yang release_con:
2705c030f2e4Sxinhui pan amdgpu_ras_set_context(adev, NULL);
2706c030f2e4Sxinhui pan kfree(con);
2707c030f2e4Sxinhui pan
27085e91160aSGuchun Chen return r;
2709c030f2e4Sxinhui pan }
2710c030f2e4Sxinhui pan
amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device * adev)27118f6368a9SJohn Clements int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev)
2712134d16d5SJohn Clements {
27138107e499SHawking Zhang if (adev->gmc.xgmi.connected_to_cpu ||
27148107e499SHawking Zhang adev->gmc.is_app_apu)
2715134d16d5SJohn Clements return 1;
2716134d16d5SJohn Clements return 0;
2717134d16d5SJohn Clements }
2718134d16d5SJohn Clements
amdgpu_persistent_edc_harvesting(struct amdgpu_device * adev,struct ras_common_if * ras_block)2719134d16d5SJohn Clements static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev,
2720134d16d5SJohn Clements struct ras_common_if *ras_block)
2721134d16d5SJohn Clements {
2722134d16d5SJohn Clements struct ras_query_if info = {
2723134d16d5SJohn Clements .head = *ras_block,
2724134d16d5SJohn Clements };
2725134d16d5SJohn Clements
2726134d16d5SJohn Clements if (!amdgpu_persistent_edc_harvesting_supported(adev))
2727134d16d5SJohn Clements return 0;
2728134d16d5SJohn Clements
2729134d16d5SJohn Clements if (amdgpu_ras_query_error_status(adev, &info) != 0)
2730134d16d5SJohn Clements DRM_WARN("RAS init harvest failure");
2731134d16d5SJohn Clements
2732134d16d5SJohn Clements if (amdgpu_ras_reset_error_status(adev, ras_block->block) != 0)
2733134d16d5SJohn Clements DRM_WARN("RAS init harvest reset failure");
2734134d16d5SJohn Clements
2735134d16d5SJohn Clements return 0;
2736134d16d5SJohn Clements }
2737134d16d5SJohn Clements
amdgpu_ras_is_poison_mode_supported(struct amdgpu_device * adev)2738e4348849STao Zhou bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev)
2739e4348849STao Zhou {
2740e4348849STao Zhou struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2741e4348849STao Zhou
2742e4348849STao Zhou if (!con)
2743e4348849STao Zhou return false;
2744e4348849STao Zhou
2745e4348849STao Zhou return con->poison_supported;
2746e4348849STao Zhou }
2747e4348849STao Zhou
2748b293e891SHawking Zhang /* helper function to handle common stuff in ip late init phase */
amdgpu_ras_block_late_init(struct amdgpu_device * adev,struct ras_common_if * ras_block)2749563285c8Syipechai int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
2750563285c8Syipechai struct ras_common_if *ras_block)
2751b293e891SHawking Zhang {
275229c9b6cdSyipechai struct amdgpu_ras_block_object *ras_obj = NULL;
275305adfd80SLuben Tuikov struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
27544a1c9a44SHawking Zhang struct ras_query_if *query_info;
275505adfd80SLuben Tuikov unsigned long ue_count, ce_count;
2756b293e891SHawking Zhang int r;
2757b293e891SHawking Zhang
2758b293e891SHawking Zhang /* disable RAS feature per IP block if it is not supported */
2759b293e891SHawking Zhang if (!amdgpu_ras_is_supported(adev, ras_block->block)) {
2760b293e891SHawking Zhang amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
2761b293e891SHawking Zhang return 0;
2762b293e891SHawking Zhang }
2763b293e891SHawking Zhang
2764b293e891SHawking Zhang r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1);
2765b293e891SHawking Zhang if (r) {
27669080a18fSCandice Li if (adev->in_suspend || amdgpu_in_reset(adev)) {
2767b293e891SHawking Zhang /* in resume phase, if fail to enable ras,
2768b293e891SHawking Zhang * clean up all ras fs nodes, and disable ras */
2769b293e891SHawking Zhang goto cleanup;
2770b293e891SHawking Zhang } else
2771b293e891SHawking Zhang return r;
2772b293e891SHawking Zhang }
2773b293e891SHawking Zhang
2774134d16d5SJohn Clements /* check for errors on warm reset edc persisant supported ASIC */
2775134d16d5SJohn Clements amdgpu_persistent_edc_harvesting(adev, ras_block);
2776134d16d5SJohn Clements
2777b293e891SHawking Zhang /* in resume phase, no need to create ras fs node */
277853b3f8f4SDennis Li if (adev->in_suspend || amdgpu_in_reset(adev))
2779b293e891SHawking Zhang return 0;
2780b293e891SHawking Zhang
2781563285c8Syipechai ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);
278236780606STao Zhou if (ras_obj->ras_cb || (ras_obj->hw_ops &&
278336780606STao Zhou (ras_obj->hw_ops->query_poison_status ||
278436780606STao Zhou ras_obj->hw_ops->handle_poison_consumption))) {
27859252d33dSyipechai r = amdgpu_ras_interrupt_add_handler(adev, ras_block);
2786b293e891SHawking Zhang if (r)
2787779596ceSTom Rix goto cleanup;
2788b293e891SHawking Zhang }
2789b293e891SHawking Zhang
2790f957138cSHawking Zhang if (ras_obj->hw_ops &&
2791f957138cSHawking Zhang (ras_obj->hw_ops->query_ras_error_count ||
2792f957138cSHawking Zhang ras_obj->hw_ops->query_ras_error_status)) {
27939252d33dSyipechai r = amdgpu_ras_sysfs_create(adev, ras_block);
2794b293e891SHawking Zhang if (r)
2795779596ceSTom Rix goto interrupt;
2796b293e891SHawking Zhang
279705adfd80SLuben Tuikov /* Those are the cached values at init.
279805adfd80SLuben Tuikov */
2799f957138cSHawking Zhang query_info = kzalloc(sizeof(*query_info), GFP_KERNEL);
28004a1c9a44SHawking Zhang if (!query_info)
28014a1c9a44SHawking Zhang return -ENOMEM;
28024a1c9a44SHawking Zhang memcpy(&query_info->head, ras_block, sizeof(struct ras_common_if));
28034a1c9a44SHawking Zhang
28044a1c9a44SHawking Zhang if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, query_info) == 0) {
280505adfd80SLuben Tuikov atomic_set(&con->ras_ce_count, ce_count);
280605adfd80SLuben Tuikov atomic_set(&con->ras_ue_count, ue_count);
28074d9f771eSLuben Tuikov }
280805adfd80SLuben Tuikov
28094a1c9a44SHawking Zhang kfree(query_info);
2810f957138cSHawking Zhang }
2811f957138cSHawking Zhang
2812b293e891SHawking Zhang return 0;
2813779596ceSTom Rix
2814779596ceSTom Rix interrupt:
2815563285c8Syipechai if (ras_obj->ras_cb)
28169252d33dSyipechai amdgpu_ras_interrupt_remove_handler(adev, ras_block);
2817779596ceSTom Rix cleanup:
2818b293e891SHawking Zhang amdgpu_ras_feature_enable(adev, ras_block, 0);
2819b293e891SHawking Zhang return r;
2820b293e891SHawking Zhang }
2821b293e891SHawking Zhang
/* Default ->ras_late_init callback for blocks that do not provide their
 * own: simply run the common late-init helper. */
static int amdgpu_ras_block_late_init_default(struct amdgpu_device *adev,
					      struct ras_common_if *ras_block)
{
	return amdgpu_ras_block_late_init(adev, ras_block);
}
2828b293e891SHawking Zhang /* helper function to remove ras fs node and interrupt handler */
amdgpu_ras_block_late_fini(struct amdgpu_device * adev,struct ras_common_if * ras_block)2829bdb3489cSyipechai void amdgpu_ras_block_late_fini(struct amdgpu_device *adev,
2830bdb3489cSyipechai struct ras_common_if *ras_block)
2831bdb3489cSyipechai {
2832563285c8Syipechai struct amdgpu_ras_block_object *ras_obj;
2833bdb3489cSyipechai if (!ras_block)
2834bdb3489cSyipechai return;
2835bdb3489cSyipechai
2836563285c8Syipechai amdgpu_ras_sysfs_remove(adev, ras_block);
2837bdb3489cSyipechai
2838563285c8Syipechai ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);
2839563285c8Syipechai if (ras_obj->ras_cb)
2840563285c8Syipechai amdgpu_ras_interrupt_remove_handler(adev, ras_block);
2841bdb3489cSyipechai }
2842bdb3489cSyipechai
/* Default ->ras_fini callback: run the common late fini.
 * (The old "return void-expression" form violated ISO C 6.8.6.4 and only
 * compiled as a GNU extension; call and fall off the end instead.) */
static void amdgpu_ras_block_late_fini_default(struct amdgpu_device *adev,
					       struct ras_common_if *ras_block)
{
	amdgpu_ras_block_late_fini(adev, ras_block);
}
284880e0c2cbSyipechai
2849a564808eSxinhui pan /* do some init work after IP late init as dependence.
2850511fdbc3Sxinhui pan * and it runs in resume/gpu reset/booting up cases.
2851a564808eSxinhui pan */
amdgpu_ras_resume(struct amdgpu_device * adev)2852511fdbc3Sxinhui pan void amdgpu_ras_resume(struct amdgpu_device *adev)
2853108c6a63Sxinhui pan {
2854108c6a63Sxinhui pan struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2855108c6a63Sxinhui pan struct ras_manager *obj, *tmp;
2856108c6a63Sxinhui pan
28578ab0d6f0SLuben Tuikov if (!adev->ras_enabled || !con) {
2858970fd197SStanley.Yang /* clean ras context for VEGA20 Gaming after send ras disable cmd */
2859970fd197SStanley.Yang amdgpu_release_ras_context(adev);
2860970fd197SStanley.Yang
2861108c6a63Sxinhui pan return;
2862970fd197SStanley.Yang }
2863108c6a63Sxinhui pan
2864108c6a63Sxinhui pan if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
2865191051a1Sxinhui pan /* Set up all other IPs which are not implemented. There is a
2866191051a1Sxinhui pan * tricky thing that IP's actual ras error type should be
2867191051a1Sxinhui pan * MULTI_UNCORRECTABLE, but as driver does not handle it, so
2868191051a1Sxinhui pan * ERROR_NONE make sense anyway.
2869191051a1Sxinhui pan */
2870191051a1Sxinhui pan amdgpu_ras_enable_all_features(adev, 1);
2871191051a1Sxinhui pan
2872191051a1Sxinhui pan /* We enable ras on all hw_supported block, but as boot
2873191051a1Sxinhui pan * parameter might disable some of them and one or more IP has
2874191051a1Sxinhui pan * not implemented yet. So we disable them on behalf.
2875191051a1Sxinhui pan */
2876108c6a63Sxinhui pan list_for_each_entry_safe(obj, tmp, &con->head, node) {
2877108c6a63Sxinhui pan if (!amdgpu_ras_is_supported(adev, obj->head.block)) {
2878108c6a63Sxinhui pan amdgpu_ras_feature_enable(adev, &obj->head, 0);
2879108c6a63Sxinhui pan /* there should be no any reference. */
2880108c6a63Sxinhui pan WARN_ON(alive_obj(obj));
2881108c6a63Sxinhui pan }
2882191051a1Sxinhui pan }
2883108c6a63Sxinhui pan }
2884108c6a63Sxinhui pan }
2885108c6a63Sxinhui pan
/*
 * Disable RAS features ahead of suspend.  A second pass is issued only if
 * feature bits survive the first one (the 0/1 argument selects the
 * disable mode -- see amdgpu_ras_disable_all_features for its semantics).
 */
void amdgpu_ras_suspend(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	/* nothing to do when RAS is off or uninitialized */
	if (!adev->ras_enabled || !con)
		return;

	amdgpu_ras_disable_all_features(adev, 0);
	/* Make sure all ras objects are disabled. */
	if (con->features)
		amdgpu_ras_disable_all_features(adev, 1);
}
2898511fdbc3Sxinhui pan
amdgpu_ras_late_init(struct amdgpu_device * adev)2899867e24caSyipechai int amdgpu_ras_late_init(struct amdgpu_device *adev)
2900867e24caSyipechai {
2901867e24caSyipechai struct amdgpu_ras_block_list *node, *tmp;
2902867e24caSyipechai struct amdgpu_ras_block_object *obj;
2903867e24caSyipechai int r;
2904867e24caSyipechai
2905950d6425SStanley.Yang /* Guest side doesn't need init ras feature */
2906950d6425SStanley.Yang if (amdgpu_sriov_vf(adev))
2907950d6425SStanley.Yang return 0;
2908950d6425SStanley.Yang
2909867e24caSyipechai list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
2910867e24caSyipechai if (!node->ras_obj) {
2911867e24caSyipechai dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
2912867e24caSyipechai continue;
2913867e24caSyipechai }
2914418abce2Syipechai
2915867e24caSyipechai obj = node->ras_obj;
2916867e24caSyipechai if (obj->ras_late_init) {
2917867e24caSyipechai r = obj->ras_late_init(adev, &obj->ras_comm);
2918867e24caSyipechai if (r) {
2919867e24caSyipechai dev_err(adev->dev, "%s failed to execute ras_late_init! ret:%d\n",
2920867e24caSyipechai obj->ras_comm.name, r);
2921867e24caSyipechai return r;
2922867e24caSyipechai }
2923418abce2Syipechai } else
2924418abce2Syipechai amdgpu_ras_block_late_init_default(adev, &obj->ras_comm);
2925867e24caSyipechai }
2926867e24caSyipechai
2927867e24caSyipechai return 0;
2928867e24caSyipechai }
2929867e24caSyipechai
2930c030f2e4Sxinhui pan /* do some fini work before IP fini as dependence */
amdgpu_ras_pre_fini(struct amdgpu_device * adev)2931c030f2e4Sxinhui pan int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
2932c030f2e4Sxinhui pan {
2933c030f2e4Sxinhui pan struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2934c030f2e4Sxinhui pan
29358ab0d6f0SLuben Tuikov if (!adev->ras_enabled || !con)
2936c030f2e4Sxinhui pan return 0;
2937c030f2e4Sxinhui pan
293872c8c97bSAndrey Grodzovsky
2939c030f2e4Sxinhui pan /* Need disable ras on all IPs here before ip [hw/sw]fini */
2940642c0401SYiPeng Chai if (con->features)
2941c030f2e4Sxinhui pan amdgpu_ras_disable_all_features(adev, 0);
2942c030f2e4Sxinhui pan amdgpu_ras_recovery_fini(adev);
2943c030f2e4Sxinhui pan return 0;
2944c030f2e4Sxinhui pan }
2945c030f2e4Sxinhui pan
/*
 * Final RAS teardown: run each registered block's ->ras_fini (falling back
 * to the common late fini), free every ras_list node, remove fs nodes and
 * interrupt handlers, cancel the CE/UE count worker and free the context.
 * Always returns 0.
 */
int amdgpu_ras_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras_block_list *ras_node, *tmp;
	struct amdgpu_ras_block_object *obj = NULL;
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!adev->ras_enabled || !con)
		return 0;

	list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) {
		if (ras_node->ras_obj) {
			obj = ras_node->ras_obj;
			/* per-block fini when supported and implemented,
			 * otherwise the common default teardown */
			if (amdgpu_ras_is_supported(adev, obj->ras_comm.block) &&
			    obj->ras_fini)
				obj->ras_fini(adev, &obj->ras_comm);
			else
				amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm);
		}

		/* Clear ras blocks from ras_list and free ras block list node */
		list_del(&ras_node->node);
		kfree(ras_node);
	}

	amdgpu_ras_fs_fini(adev);
	amdgpu_ras_interrupt_remove_all(adev);

	/* the per-block fini above should have cleared every feature bit */
	WARN(con->features, "Feature mask is not cleared");

	if (con->features)
		amdgpu_ras_disable_all_features(adev, 1);

	/* stop the CE/UE refresh worker before freeing the context it uses */
	cancel_delayed_work_sync(&con->ras_counte_delay_work);

	amdgpu_ras_set_context(adev, NULL);
	kfree(con);

	return 0;
}
29857c6e68c7SAndrey Grodzovsky
amdgpu_ras_global_ras_isr(struct amdgpu_device * adev)29867c6e68c7SAndrey Grodzovsky void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
29877c6e68c7SAndrey Grodzovsky {
29887c6e68c7SAndrey Grodzovsky if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
29892c7cd280SYiPeng Chai struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
29902c7cd280SYiPeng Chai
29916952e99cSGuchun Chen dev_info(adev->dev, "uncorrectable hardware error"
29926952e99cSGuchun Chen "(ERREVENT_ATHUB_INTERRUPT) detected!\n");
2993d5ea093eSAndrey Grodzovsky
29942c7cd280SYiPeng Chai ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
299561934624SGuchun Chen amdgpu_ras_reset_gpu(adev);
29967c6e68c7SAndrey Grodzovsky }
29977c6e68c7SAndrey Grodzovsky }
2998bb5c7235SWenhui Sheng
amdgpu_ras_need_emergency_restart(struct amdgpu_device * adev)2999bb5c7235SWenhui Sheng bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev)
3000bb5c7235SWenhui Sheng {
3001bb5c7235SWenhui Sheng if (adev->asic_type == CHIP_VEGA20 &&
3002bb5c7235SWenhui Sheng adev->pm.fw_version <= 0x283400) {
3003bb5c7235SWenhui Sheng return !(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) &&
3004bb5c7235SWenhui Sheng amdgpu_ras_intr_triggered();
3005bb5c7235SWenhui Sheng }
3006bb5c7235SWenhui Sheng
3007bb5c7235SWenhui Sheng return false;
3008bb5c7235SWenhui Sheng }
3009970fd197SStanley.Yang
amdgpu_release_ras_context(struct amdgpu_device * adev)3010970fd197SStanley.Yang void amdgpu_release_ras_context(struct amdgpu_device *adev)
3011970fd197SStanley.Yang {
3012970fd197SStanley.Yang struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
3013970fd197SStanley.Yang
3014970fd197SStanley.Yang if (!con)
3015970fd197SStanley.Yang return;
3016970fd197SStanley.Yang
30178ab0d6f0SLuben Tuikov if (!adev->ras_enabled && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)) {
3018970fd197SStanley.Yang con->features &= ~BIT(AMDGPU_RAS_BLOCK__GFX);
3019970fd197SStanley.Yang amdgpu_ras_set_context(adev, NULL);
3020970fd197SStanley.Yang kfree(con);
3021970fd197SStanley.Yang }
3022970fd197SStanley.Yang }
302312b2cab7SMukul Joshi
302412b2cab7SMukul Joshi #ifdef CONFIG_X86_MCE_AMD
/* Look up the CPU-connected amdgpu device whose XGMI physical node id
 * matches @node_id; NULL when no registered device matches. */
static struct amdgpu_device *find_adev(uint32_t node_id)
{
	int i;

	for (i = 0; i < mce_adev_list.num_gpu; i++) {
		struct amdgpu_device *adev = mce_adev_list.devs[i];

		if (adev && adev->gmc.xgmi.connected_to_cpu &&
		    adev->gmc.xgmi.physical_node_id == node_id)
			return adev;
	}

	return NULL;
}
304112b2cab7SMukul Joshi
/* Field-decode helpers for an MCA_IPID_UMC register value (m).
 * Bit positions follow the SMCA UMC_V2 layout -- TODO confirm vs. PPR. */
#define GET_MCA_IPID_GPUID(m) (((m) >> 44) & 0xF)
#define GET_UMC_INST(m) (((m) >> 21) & 0x7)
#define GET_CHAN_INDEX(m) ((((m) >> 12) & 0x3) | (((m) >> 18) & 0x4))
/* GPU ids reported in MCA_IPID_UMC start at this offset */
#define GPU_ID_OFFSET 8
304612b2cab7SMukul Joshi
/*
 * MCE decode-chain callback: for uncorrectable DramECC errors reported by
 * the host MCA banks (UMC_V2, i.e. CPU-connected GPU memory), locate the
 * owning GPU and retire the affected page via the UMC page-retirement path.
 */
static int amdgpu_bad_page_notifier(struct notifier_block *nb,
				    unsigned long val, void *data)
{
	struct mce *m = (struct mce *)data;
	struct amdgpu_device *adev = NULL;
	uint32_t gpu_id = 0;
	uint32_t umc_inst = 0, ch_inst = 0;

	/*
	 * If the error was generated in UMC_V2, which belongs to GPU UMCs,
	 * and error occurred in DramECC (Extended error code = 0) then only
	 * process the error, else bail out.
	 */
	if (!m || !((smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC_V2) &&
		    (XEC(m->status, 0x3f) == 0x0)))
		return NOTIFY_DONE;

	/*
	 * If it is correctable error, return.
	 */
	if (mce_is_correctable(m))
		return NOTIFY_OK;

	/*
	 * GPU Id is offset by GPU_ID_OFFSET in MCA_IPID_UMC register.
	 */
	gpu_id = GET_MCA_IPID_GPUID(m->ipid) - GPU_ID_OFFSET;

	adev = find_adev(gpu_id);
	if (!adev) {
		DRM_WARN("%s: Unable to find adev for gpu_id: %d\n", __func__,
			 gpu_id);
		return NOTIFY_DONE;
	}

	/*
	 * If it is uncorrectable error, then find out UMC instance and
	 * channel index.
	 */
	umc_inst = GET_UMC_INST(m->ipid);
	ch_inst = GET_CHAN_INDEX(m->ipid);

	dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d",
		 umc_inst, ch_inst);

	/* NOTIFY_OK only when the page retirement succeeded (returns 0) */
	if (!amdgpu_umc_page_retirement_mca(adev, m->addr, ch_inst, umc_inst))
		return NOTIFY_OK;
	else
		return NOTIFY_DONE;
}
309712b2cab7SMukul Joshi
/* MCE decode-chain entry, registered at uncorrected-error priority so GPU
 * DramECC errors reach amdgpu_bad_page_notifier for page retirement. */
static struct notifier_block amdgpu_bad_page_nb = {
	.notifier_call = amdgpu_bad_page_notifier,
	.priority = MCE_PRIO_UC,
};
310212b2cab7SMukul Joshi
/*
 * Track @adev in the private mce_adev_list and register the MCE decode
 * notifier (once, process-wide) so bad pages reported via host MCA can be
 * retired.
 */
static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev)
{
	/*
	 * Add the adev to the mce_adev_list.
	 * During mode2 reset, amdgpu device is temporarily
	 * removed from the mgpu_info list which can cause
	 * page retirement to fail.
	 * Use this list instead of mgpu_info to find the amdgpu
	 * device on which the UMC error was reported.
	 */
	/* NOTE(review): num_gpu is not bounds-checked against the devs[]
	 * capacity here -- presumably sized for the max GPU instances;
	 * confirm. */
	mce_adev_list.devs[mce_adev_list.num_gpu++] = adev;

	/*
	 * Register the x86 notifier only once
	 * with MCE subsystem.
	 */
	if (notifier_registered == false) {
		mce_register_decode_chain(&amdgpu_bad_page_nb);
		notifier_registered = true;
	}
}
312412b2cab7SMukul Joshi #endif
31257cab2124Syipechai
amdgpu_ras_get_context(struct amdgpu_device * adev)31267cab2124Syipechai struct amdgpu_ras *amdgpu_ras_get_context(struct amdgpu_device *adev)
31277cab2124Syipechai {
31287cab2124Syipechai if (!adev)
31297cab2124Syipechai return NULL;
31307cab2124Syipechai
31317cab2124Syipechai return adev->psp.ras_context.ras;
31327cab2124Syipechai }
31337cab2124Syipechai
amdgpu_ras_set_context(struct amdgpu_device * adev,struct amdgpu_ras * ras_con)31347cab2124Syipechai int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con)
31357cab2124Syipechai {
31367cab2124Syipechai if (!adev)
313769f91d32SYang Li return -EINVAL;
31387cab2124Syipechai
31397cab2124Syipechai adev->psp.ras_context.ras = ras_con;
31407cab2124Syipechai return 0;
31417cab2124Syipechai }
31427cab2124Syipechai
31437cab2124Syipechai /* check if ras is supported on block, say, sdma, gfx */
amdgpu_ras_is_supported(struct amdgpu_device * adev,unsigned int block)31447cab2124Syipechai int amdgpu_ras_is_supported(struct amdgpu_device *adev,
31457cab2124Syipechai unsigned int block)
31467cab2124Syipechai {
31478f453c51SYiPeng Chai int ret = 0;
31487cab2124Syipechai struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
31497cab2124Syipechai
31507cab2124Syipechai if (block >= AMDGPU_RAS_BLOCK_COUNT)
31517cab2124Syipechai return 0;
31528f453c51SYiPeng Chai
31538f453c51SYiPeng Chai ret = ras && (adev->ras_enabled & (1 << block));
31548f453c51SYiPeng Chai
31558f453c51SYiPeng Chai /* For the special asic with mem ecc enabled but sram ecc
31568f453c51SYiPeng Chai * not enabled, even if the ras block is not supported on
31578f453c51SYiPeng Chai * .ras_enabled, if the asic supports poison mode and the
31588f453c51SYiPeng Chai * ras block has ras configuration, it can be considered
31598f453c51SYiPeng Chai * that the ras block supports ras function.
31608f453c51SYiPeng Chai */
31618f453c51SYiPeng Chai if (!ret &&
3162bc0f8080SCandice Li (block == AMDGPU_RAS_BLOCK__GFX ||
3163bc0f8080SCandice Li block == AMDGPU_RAS_BLOCK__SDMA ||
3164bc0f8080SCandice Li block == AMDGPU_RAS_BLOCK__VCN ||
3165bc0f8080SCandice Li block == AMDGPU_RAS_BLOCK__JPEG) &&
31668f453c51SYiPeng Chai amdgpu_ras_is_poison_mode_supported(adev) &&
31678f453c51SYiPeng Chai amdgpu_ras_get_ras_block(adev, block, 0))
31688f453c51SYiPeng Chai ret = 1;
31698f453c51SYiPeng Chai
31708f453c51SYiPeng Chai return ret;
31717cab2124Syipechai }
31727cab2124Syipechai
amdgpu_ras_reset_gpu(struct amdgpu_device * adev)31737cab2124Syipechai int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
31747cab2124Syipechai {
31757cab2124Syipechai struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
31767cab2124Syipechai
31777cab2124Syipechai if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
317825a2b22eSAndrey Grodzovsky amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work);
31797cab2124Syipechai return 0;
31807cab2124Syipechai }
31817cab2124Syipechai
31827cab2124Syipechai
31836492e1b0Syipechai /* Register each ip ras block into amdgpu ras */
amdgpu_ras_register_ras_block(struct amdgpu_device * adev,struct amdgpu_ras_block_object * ras_block_obj)31846492e1b0Syipechai int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
31856492e1b0Syipechai struct amdgpu_ras_block_object *ras_block_obj)
31866492e1b0Syipechai {
3187d5e8ff5fSyipechai struct amdgpu_ras_block_list *ras_node;
31886492e1b0Syipechai if (!adev || !ras_block_obj)
31896492e1b0Syipechai return -EINVAL;
31906492e1b0Syipechai
3191d5e8ff5fSyipechai ras_node = kzalloc(sizeof(*ras_node), GFP_KERNEL);
3192d5e8ff5fSyipechai if (!ras_node)
3193d5e8ff5fSyipechai return -ENOMEM;
3194d5e8ff5fSyipechai
3195d5e8ff5fSyipechai INIT_LIST_HEAD(&ras_node->node);
3196d5e8ff5fSyipechai ras_node->ras_obj = ras_block_obj;
3197d5e8ff5fSyipechai list_add_tail(&ras_node->node, &adev->ras_list);
31986492e1b0Syipechai
31996492e1b0Syipechai return 0;
32006492e1b0Syipechai }
3201322a7e00SHawking Zhang
amdgpu_ras_get_error_type_name(uint32_t err_type,char * err_type_name)3202322a7e00SHawking Zhang void amdgpu_ras_get_error_type_name(uint32_t err_type, char *err_type_name)
3203322a7e00SHawking Zhang {
3204322a7e00SHawking Zhang if (!err_type_name)
3205322a7e00SHawking Zhang return;
3206322a7e00SHawking Zhang
3207322a7e00SHawking Zhang switch (err_type) {
3208322a7e00SHawking Zhang case AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE:
3209322a7e00SHawking Zhang sprintf(err_type_name, "correctable");
3210322a7e00SHawking Zhang break;
3211322a7e00SHawking Zhang case AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE:
3212322a7e00SHawking Zhang sprintf(err_type_name, "uncorrectable");
3213322a7e00SHawking Zhang break;
3214322a7e00SHawking Zhang default:
3215322a7e00SHawking Zhang sprintf(err_type_name, "unknown");
3216322a7e00SHawking Zhang break;
3217322a7e00SHawking Zhang }
3218322a7e00SHawking Zhang }
3219322a7e00SHawking Zhang
amdgpu_ras_inst_get_memory_id_field(struct amdgpu_device * adev,const struct amdgpu_ras_err_status_reg_entry * reg_entry,uint32_t instance,uint32_t * memory_id)3220322a7e00SHawking Zhang bool amdgpu_ras_inst_get_memory_id_field(struct amdgpu_device *adev,
3221322a7e00SHawking Zhang const struct amdgpu_ras_err_status_reg_entry *reg_entry,
3222322a7e00SHawking Zhang uint32_t instance,
3223322a7e00SHawking Zhang uint32_t *memory_id)
3224322a7e00SHawking Zhang {
3225322a7e00SHawking Zhang uint32_t err_status_lo_data, err_status_lo_offset;
3226322a7e00SHawking Zhang
3227322a7e00SHawking Zhang if (!reg_entry)
3228322a7e00SHawking Zhang return false;
3229322a7e00SHawking Zhang
3230322a7e00SHawking Zhang err_status_lo_offset =
3231322a7e00SHawking Zhang AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry->hwip, instance,
3232322a7e00SHawking Zhang reg_entry->seg_lo, reg_entry->reg_lo);
3233322a7e00SHawking Zhang err_status_lo_data = RREG32(err_status_lo_offset);
3234322a7e00SHawking Zhang
3235322a7e00SHawking Zhang if ((reg_entry->flags & AMDGPU_RAS_ERR_STATUS_VALID) &&
3236322a7e00SHawking Zhang !REG_GET_FIELD(err_status_lo_data, ERR_STATUS_LO, ERR_STATUS_VALID_FLAG))
3237322a7e00SHawking Zhang return false;
3238322a7e00SHawking Zhang
3239322a7e00SHawking Zhang *memory_id = REG_GET_FIELD(err_status_lo_data, ERR_STATUS_LO, MEMORY_ID);
3240322a7e00SHawking Zhang
3241322a7e00SHawking Zhang return true;
3242322a7e00SHawking Zhang }
3243322a7e00SHawking Zhang
amdgpu_ras_inst_get_err_cnt_field(struct amdgpu_device * adev,const struct amdgpu_ras_err_status_reg_entry * reg_entry,uint32_t instance,unsigned long * err_cnt)3244322a7e00SHawking Zhang bool amdgpu_ras_inst_get_err_cnt_field(struct amdgpu_device *adev,
3245322a7e00SHawking Zhang const struct amdgpu_ras_err_status_reg_entry *reg_entry,
3246322a7e00SHawking Zhang uint32_t instance,
3247322a7e00SHawking Zhang unsigned long *err_cnt)
3248322a7e00SHawking Zhang {
3249322a7e00SHawking Zhang uint32_t err_status_hi_data, err_status_hi_offset;
3250322a7e00SHawking Zhang
3251322a7e00SHawking Zhang if (!reg_entry)
3252322a7e00SHawking Zhang return false;
3253322a7e00SHawking Zhang
3254322a7e00SHawking Zhang err_status_hi_offset =
3255322a7e00SHawking Zhang AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry->hwip, instance,
3256322a7e00SHawking Zhang reg_entry->seg_hi, reg_entry->reg_hi);
3257322a7e00SHawking Zhang err_status_hi_data = RREG32(err_status_hi_offset);
3258322a7e00SHawking Zhang
3259322a7e00SHawking Zhang if ((reg_entry->flags & AMDGPU_RAS_ERR_INFO_VALID) &&
3260322a7e00SHawking Zhang !REG_GET_FIELD(err_status_hi_data, ERR_STATUS_HI, ERR_INFO_VALID_FLAG))
32619b337b7dSHawking Zhang /* keep the check here in case we need to refer to the result later */
32629b337b7dSHawking Zhang dev_dbg(adev->dev, "Invalid err_info field\n");
3263322a7e00SHawking Zhang
3264322a7e00SHawking Zhang /* read err count */
3265322a7e00SHawking Zhang *err_cnt = REG_GET_FIELD(err_status_hi_data, ERR_STATUS, ERR_CNT);
3266322a7e00SHawking Zhang
3267322a7e00SHawking Zhang return true;
3268322a7e00SHawking Zhang }
3269322a7e00SHawking Zhang
amdgpu_ras_inst_query_ras_error_count(struct amdgpu_device * adev,const struct amdgpu_ras_err_status_reg_entry * reg_list,uint32_t reg_list_size,const struct amdgpu_ras_memory_id_entry * mem_list,uint32_t mem_list_size,uint32_t instance,uint32_t err_type,unsigned long * err_count)3270322a7e00SHawking Zhang void amdgpu_ras_inst_query_ras_error_count(struct amdgpu_device *adev,
3271322a7e00SHawking Zhang const struct amdgpu_ras_err_status_reg_entry *reg_list,
3272322a7e00SHawking Zhang uint32_t reg_list_size,
3273322a7e00SHawking Zhang const struct amdgpu_ras_memory_id_entry *mem_list,
3274322a7e00SHawking Zhang uint32_t mem_list_size,
3275322a7e00SHawking Zhang uint32_t instance,
3276322a7e00SHawking Zhang uint32_t err_type,
3277322a7e00SHawking Zhang unsigned long *err_count)
3278322a7e00SHawking Zhang {
3279322a7e00SHawking Zhang uint32_t memory_id;
3280322a7e00SHawking Zhang unsigned long err_cnt;
3281322a7e00SHawking Zhang char err_type_name[16];
3282322a7e00SHawking Zhang uint32_t i, j;
3283322a7e00SHawking Zhang
3284322a7e00SHawking Zhang for (i = 0; i < reg_list_size; i++) {
32859b337b7dSHawking Zhang /* query memory_id from err_status_lo */
32869b337b7dSHawking Zhang if (!amdgpu_ras_inst_get_memory_id_field(adev, ®_list[i],
32879b337b7dSHawking Zhang instance, &memory_id))
32889b337b7dSHawking Zhang continue;
32899b337b7dSHawking Zhang
3290322a7e00SHawking Zhang /* query err_cnt from err_status_hi */
3291322a7e00SHawking Zhang if (!amdgpu_ras_inst_get_err_cnt_field(adev, ®_list[i],
3292322a7e00SHawking Zhang instance, &err_cnt) ||
3293322a7e00SHawking Zhang !err_cnt)
3294322a7e00SHawking Zhang continue;
3295322a7e00SHawking Zhang
3296322a7e00SHawking Zhang *err_count += err_cnt;
3297322a7e00SHawking Zhang
3298322a7e00SHawking Zhang /* log the errors */
3299322a7e00SHawking Zhang amdgpu_ras_get_error_type_name(err_type, err_type_name);
3300322a7e00SHawking Zhang if (!mem_list) {
3301322a7e00SHawking Zhang /* memory_list is not supported */
3302322a7e00SHawking Zhang dev_info(adev->dev,
3303322a7e00SHawking Zhang "%ld %s hardware errors detected in %s, instance: %d, memory_id: %d\n",
3304322a7e00SHawking Zhang err_cnt, err_type_name,
3305322a7e00SHawking Zhang reg_list[i].block_name,
3306322a7e00SHawking Zhang instance, memory_id);
3307322a7e00SHawking Zhang } else {
3308322a7e00SHawking Zhang for (j = 0; j < mem_list_size; j++) {
3309322a7e00SHawking Zhang if (memory_id == mem_list[j].memory_id) {
3310322a7e00SHawking Zhang dev_info(adev->dev,
3311322a7e00SHawking Zhang "%ld %s hardware errors detected in %s, instance: %d, memory block: %s\n",
3312322a7e00SHawking Zhang err_cnt, err_type_name,
3313322a7e00SHawking Zhang reg_list[i].block_name,
3314322a7e00SHawking Zhang instance, mem_list[j].name);
3315322a7e00SHawking Zhang break;
3316322a7e00SHawking Zhang }
3317322a7e00SHawking Zhang }
3318322a7e00SHawking Zhang }
3319322a7e00SHawking Zhang }
3320322a7e00SHawking Zhang }
3321e53a3250SHawking Zhang
amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device * adev,const struct amdgpu_ras_err_status_reg_entry * reg_list,uint32_t reg_list_size,uint32_t instance)3322e53a3250SHawking Zhang void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev,
3323e53a3250SHawking Zhang const struct amdgpu_ras_err_status_reg_entry *reg_list,
3324e53a3250SHawking Zhang uint32_t reg_list_size,
3325e53a3250SHawking Zhang uint32_t instance)
3326e53a3250SHawking Zhang {
3327e53a3250SHawking Zhang uint32_t err_status_lo_offset, err_status_hi_offset;
3328e53a3250SHawking Zhang uint32_t i;
3329e53a3250SHawking Zhang
3330e53a3250SHawking Zhang for (i = 0; i < reg_list_size; i++) {
3331e53a3250SHawking Zhang err_status_lo_offset =
3332e53a3250SHawking Zhang AMDGPU_RAS_REG_ENTRY_OFFSET(reg_list[i].hwip, instance,
3333e53a3250SHawking Zhang reg_list[i].seg_lo, reg_list[i].reg_lo);
3334e53a3250SHawking Zhang err_status_hi_offset =
3335e53a3250SHawking Zhang AMDGPU_RAS_REG_ENTRY_OFFSET(reg_list[i].hwip, instance,
3336e53a3250SHawking Zhang reg_list[i].seg_hi, reg_list[i].reg_hi);
3337e53a3250SHawking Zhang WREG32(err_status_lo_offset, 0);
3338e53a3250SHawking Zhang WREG32(err_status_hi_offset, 0);
3339e53a3250SHawking Zhang }
3340e53a3250SHawking Zhang }
3341