xref: /openbmc/linux/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c (revision c005e2f62f8421b13b9a31adb9db7281f1a19e68)
1c030f2e4Sxinhui pan /*
2c030f2e4Sxinhui pan  * Copyright 2018 Advanced Micro Devices, Inc.
3c030f2e4Sxinhui pan  *
4c030f2e4Sxinhui pan  * Permission is hereby granted, free of charge, to any person obtaining a
5c030f2e4Sxinhui pan  * copy of this software and associated documentation files (the "Software"),
6c030f2e4Sxinhui pan  * to deal in the Software without restriction, including without limitation
7c030f2e4Sxinhui pan  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8c030f2e4Sxinhui pan  * and/or sell copies of the Software, and to permit persons to whom the
9c030f2e4Sxinhui pan  * Software is furnished to do so, subject to the following conditions:
10c030f2e4Sxinhui pan  *
11c030f2e4Sxinhui pan  * The above copyright notice and this permission notice shall be included in
12c030f2e4Sxinhui pan  * all copies or substantial portions of the Software.
13c030f2e4Sxinhui pan  *
14c030f2e4Sxinhui pan  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15c030f2e4Sxinhui pan  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16c030f2e4Sxinhui pan  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17c030f2e4Sxinhui pan  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18c030f2e4Sxinhui pan  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19c030f2e4Sxinhui pan  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20c030f2e4Sxinhui pan  * OTHER DEALINGS IN THE SOFTWARE.
21c030f2e4Sxinhui pan  *
22c030f2e4Sxinhui pan  *
23c030f2e4Sxinhui pan  */
24c030f2e4Sxinhui pan #include <linux/debugfs.h>
25c030f2e4Sxinhui pan #include <linux/list.h>
26c030f2e4Sxinhui pan #include <linux/module.h>
27f867723bSSam Ravnborg #include <linux/uaccess.h>
287c6e68c7SAndrey Grodzovsky #include <linux/reboot.h>
297c6e68c7SAndrey Grodzovsky #include <linux/syscalls.h>
3005adfd80SLuben Tuikov #include <linux/pm_runtime.h>
31f867723bSSam Ravnborg 
32c030f2e4Sxinhui pan #include "amdgpu.h"
33c030f2e4Sxinhui pan #include "amdgpu_ras.h"
34b404ae82Sxinhui pan #include "amdgpu_atomfirmware.h"
3519744f5fSHawking Zhang #include "amdgpu_xgmi.h"
364e644fffSHawking Zhang #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
379af357bcSHawking Zhang #include "nbio_v4_3.h"
387692e1eeSTao Zhou #include "nbio_v7_9.h"
39f50160cfSStanley.Yang #include "atom.h"
4025a2b22eSAndrey Grodzovsky #include "amdgpu_reset.h"
4125a2b22eSAndrey Grodzovsky 
4212b2cab7SMukul Joshi #ifdef CONFIG_X86_MCE_AMD
4312b2cab7SMukul Joshi #include <asm/mce.h>
44c030f2e4Sxinhui pan 
4512b2cab7SMukul Joshi static bool notifier_registered;
4612b2cab7SMukul Joshi #endif
47eb0c3cd4SGuchun Chen static const char *RAS_FS_NAME = "ras";
48eb0c3cd4SGuchun Chen 
/*
 * Human-readable names for RAS error types. Indexed through ras_err_str(),
 * which maps an error-type bit mask to a name via ffs().
 */
const char *ras_error_string[] = {
	"none",
	"parity",
	"single_correctable",
	"multi_uncorrectable",
	"poison",
};
56c030f2e4Sxinhui pan 
/*
 * Human-readable names for RAS-capable IP blocks, indexed by
 * ras_common_if::block (see get_ras_block_str() and
 * amdgpu_ras_find_block_id_by_name()); entry order therefore must match
 * the block id enumeration.
 */
const char *ras_block_string[] = {
	"umc",
	"sdma",
	"gfx",
	"mmhub",
	"athub",
	"pcie_bif",
	"hdp",
	"xgmi_wafl",
	"df",
	"smn",
	"sem",
	"mp0",
	"mp1",
	"fuse",
	"mca",
	"vcn",
	"jpeg",
};
76c030f2e4Sxinhui pan 
/*
 * Names for MCA sub-blocks, indexed by ras_common_if::sub_block_index
 * when the block is AMDGPU_RAS_BLOCK__MCA (see get_ras_block_str()).
 */
const char *ras_mca_block_string[] = {
	"mca_mp0",
	"mca_mp1",
	"mca_mpio",
	"mca_iohc",
};
83640ae42eSJohn Clements 
/* List node tying one registered RAS block object into a per-device list. */
struct amdgpu_ras_block_list {
	/* ras block link */
	struct list_head node;

	/* the block object this node carries */
	struct amdgpu_ras_block_object *ras_obj;
};
90d5e8ff5fSyipechai 
get_ras_block_str(struct ras_common_if * ras_block)91640ae42eSJohn Clements const char *get_ras_block_str(struct ras_common_if *ras_block)
92640ae42eSJohn Clements {
93640ae42eSJohn Clements 	if (!ras_block)
94640ae42eSJohn Clements 		return "NULL";
95640ae42eSJohn Clements 
96640ae42eSJohn Clements 	if (ras_block->block >= AMDGPU_RAS_BLOCK_COUNT)
97640ae42eSJohn Clements 		return "OUT OF RANGE";
98640ae42eSJohn Clements 
99640ae42eSJohn Clements 	if (ras_block->block == AMDGPU_RAS_BLOCK__MCA)
100640ae42eSJohn Clements 		return ras_mca_block_string[ras_block->sub_block_index];
101640ae42eSJohn Clements 
102640ae42eSJohn Clements 	return ras_block_string[ras_block->block];
103640ae42eSJohn Clements }
104640ae42eSJohn Clements 
/* Bounds-checked mapping from a raw block id to its name. */
#define ras_block_str(_BLOCK_) \
	(((_BLOCK_) < ARRAY_SIZE(ras_block_string)) ? ras_block_string[_BLOCK_] : "Out Of Range")

/* Map an error-type bit mask to its name; ffs() yields the 1-based bit
 * position, and ras_error_string[0] ("none") covers a zero mask. */
#define ras_err_str(i) (ras_error_string[ffs(i)])

#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)

/* inject address is 52 bits */
#define	RAS_UMC_INJECT_ADDR_LIMIT	(0x1ULL << 52)

/* typical ECC bad page rate is 1 bad page per 100MB VRAM */
#define RAS_BAD_PAGE_COVER              (100 * 1024 * 1024ULL)

/* Reservation status of a retired (bad) VRAM page. */
enum amdgpu_ras_retire_page_reservation {
	AMDGPU_RAS_RETIRE_PAGE_RESERVED,
	AMDGPU_RAS_RETIRE_PAGE_PENDING,
	AMDGPU_RAS_RETIRE_PAGE_FAULT,
};
1237c6e68c7SAndrey Grodzovsky 
1247c6e68c7SAndrey Grodzovsky atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
1257c6e68c7SAndrey Grodzovsky 
126676deb38SDennis Li static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
127676deb38SDennis Li 				uint64_t addr);
1286e4be987STao Zhou static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
1296e4be987STao Zhou 				uint64_t addr);
13012b2cab7SMukul Joshi #ifdef CONFIG_X86_MCE_AMD
13191a1a52dSMukul Joshi static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev);
13291a1a52dSMukul Joshi struct mce_notifier_adev_list {
13391a1a52dSMukul Joshi 	struct amdgpu_device *devs[MAX_GPU_INSTANCE];
13491a1a52dSMukul Joshi 	int num_gpu;
13591a1a52dSMukul Joshi };
13691a1a52dSMukul Joshi static struct mce_notifier_adev_list mce_adev_list;
13712b2cab7SMukul Joshi #endif
1386e4be987STao Zhou 
/*
 * amdgpu_ras_set_error_query_ready - flag whether RAS error queries may run.
 * @adev: device; may be NULL (ignored).
 * @ready: new readiness state.
 *
 * No-op when the device or its RAS context is absent.
 */
void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready)
{
	struct amdgpu_ras *con;

	if (!adev)
		return;

	con = amdgpu_ras_get_context(adev);
	if (con)
		con->error_query_ready = ready;
}
14461380faaSJohn Clements 
amdgpu_ras_get_error_query_ready(struct amdgpu_device * adev)145f3167919SNirmoy Das static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev)
14661380faaSJohn Clements {
147a9d82d2fSEvan Quan 	if (adev && amdgpu_ras_get_context(adev))
14861380faaSJohn Clements 		return amdgpu_ras_get_context(adev)->error_query_ready;
14961380faaSJohn Clements 
15061380faaSJohn Clements 	return false;
15161380faaSJohn Clements }
15261380faaSJohn Clements 
/*
 * amdgpu_reserve_page_direct - directly retire one VRAM page at @address.
 * @adev: device.
 * @address: VRAM offset of the page to retire.
 *
 * Debug/test path (driven by the debugfs "retire_page" command): records the
 * page as bad and, unless bad-page handling is disabled via
 * amdgpu_bad_page_threshold == 0, persists the record to the RAS EEPROM table.
 *
 * Returns 0 on success or if the page was already marked bad; -EINVAL when
 * the address is outside VRAM or exceeds the 52-bit inject limit.
 */
static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t address)
{
	struct ras_err_data err_data = {0, 0, 0, NULL};
	struct eeprom_table_record err_rec;

	/* Address must lie inside VRAM and below RAS_UMC_INJECT_ADDR_LIMIT. */
	if ((address >= adev->gmc.mc_vram_size) ||
	    (address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
		dev_warn(adev->dev,
		         "RAS WARN: input address 0x%llx is invalid.\n",
		         address);
		return -EINVAL;
	}

	/* Already retired: nothing to do, report success. */
	if (amdgpu_ras_check_bad_page(adev, address)) {
		dev_warn(adev->dev,
			 "RAS WARN: 0x%llx has already been marked as bad page!\n",
			 address);
		return 0;
	}

	memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
	err_data.err_addr = &err_rec;
	amdgpu_umc_fill_error_record(&err_data, address, address, 0, 0);

	/* Threshold of 0 disables recording; otherwise track and persist. */
	if (amdgpu_bad_page_threshold != 0) {
		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
					 err_data.err_addr_cnt);
		amdgpu_ras_save_bad_pages(adev, NULL);
	}

	dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n");
	dev_warn(adev->dev, "Clear EEPROM:\n");
	dev_warn(adev->dev, "    echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n");

	return 0;
}
189cbb8f989SJohn Clements 
/*
 * amdgpu_ras_debugfs_read - debugfs read handler for a per-block error node.
 *
 * Queries the block's current uncorrected/corrected error counts, formats
 * them as "ue: N\nce: M\n", and copies the slice starting at *pos to user
 * space. Returns the number of bytes copied, 0 at EOF, or -EINVAL on a
 * failed query or failed copy_to_user().
 */
static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
					size_t size, loff_t *pos)
{
	struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
	struct ras_query_if info = {
		.head = obj->head,
	};
	ssize_t s;
	char val[128];

	if (amdgpu_ras_query_error_status(obj->adev, &info))
		return -EINVAL;

	/* Hardware counter will be reset automatically after the query on Vega20 and Arcturus */
	if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
	    obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
		if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
			dev_warn(obj->adev->dev, "Failed to reset error counter and error status");
	}

	s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
			"ue", info.ue_count,
			"ce", info.ce_count);
	/* Offset past the formatted string: EOF. */
	if (*pos >= s)
		return 0;

	/* Copy at most the remaining bytes, bounded by the user buffer. */
	s -= *pos;
	s = min_t(u64, s, size);


	if (copy_to_user(buf, &val[*pos], s))
		return -EINVAL;

	*pos += s;

	return s;
}
227c030f2e4Sxinhui pan 
/* File operations for the read-only per-block error-count debugfs node. */
static const struct file_operations amdgpu_ras_debugfs_ops = {
	.owner = THIS_MODULE,
	.read = amdgpu_ras_debugfs_read,
	.write = NULL,
	.llseek = default_llseek
};
234c030f2e4Sxinhui pan 
amdgpu_ras_find_block_id_by_name(const char * name,int * block_id)23596ebb307Sxinhui pan static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
23696ebb307Sxinhui pan {
23796ebb307Sxinhui pan 	int i;
23896ebb307Sxinhui pan 
23996ebb307Sxinhui pan 	for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
24096ebb307Sxinhui pan 		*block_id = i;
241640ae42eSJohn Clements 		if (strcmp(name, ras_block_string[i]) == 0)
24296ebb307Sxinhui pan 			return 0;
24396ebb307Sxinhui pan 	}
24496ebb307Sxinhui pan 	return -EINVAL;
24596ebb307Sxinhui pan }
24696ebb307Sxinhui pan 
/*
 * amdgpu_ras_debugfs_ctrl_parse_data - parse one write to the ras_ctrl node.
 *
 * Accepts either an ASCII command ("disable <block>", "enable <block> <err>",
 * "inject <block> <err> <sub> <addr> <val> [<mask>]", "retire_page <addr>")
 * or a raw struct ras_debug_if written as binary. Fills @data and consumes
 * the whole write (*pos is set to @size). Returns 0 on success, -EINVAL on
 * any parse or copy failure.
 */
static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
		const char __user *buf, size_t size,
		loff_t *pos, struct ras_debug_if *data)
{
	/* At most 64 bytes of command text are examined. */
	ssize_t s = min_t(u64, 64, size);
	char str[65];
	char block_name[33];
	char err[9] = "ue";
	int op = -1;
	int block_id;
	uint32_t sub_block;
	u64 address, value;
	/* default value is 0 if the mask is not set by user */
	u32 instance_mask = 0;

	/* Partial writes are not supported; each write must be complete. */
	if (*pos)
		return -EINVAL;
	*pos = size;

	memset(str, 0, sizeof(str));
	memset(data, 0, sizeof(*data));

	if (copy_from_user(str, buf, s))
		return -EINVAL;

	if (sscanf(str, "disable %32s", block_name) == 1)
		op = 0;
	else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
		op = 1;
	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
		op = 2;
	else if (strstr(str, "retire_page") != NULL)
		op = 3;
	else if (str[0] && str[1] && str[2] && str[3])
		/* ascii string, but commands are not matched. */
		return -EINVAL;

	if (op != -1) {
		/* retire_page only needs an address (hex or decimal). */
		if (op == 3) {
			if (sscanf(str, "%*s 0x%llx", &address) != 1 &&
			    sscanf(str, "%*s %llu", &address) != 1)
				return -EINVAL;

			data->op = op;
			data->inject.address = address;

			return 0;
		}

		if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
			return -EINVAL;

		data->head.block = block_id;
		/* only ue and ce errors are supported */
		if (!memcmp("ue", err, 2))
			data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
		else if (!memcmp("ce", err, 2))
			data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
		else
			return -EINVAL;

		data->op = op;

		if (op == 2) {
			/* inject args: sub-block, address, value, and an
			 * optional instance mask; hex and decimal forms are
			 * tried in turn (4-arg before 3-arg).
			 */
			if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx 0x%x",
				   &sub_block, &address, &value, &instance_mask) != 4 &&
			    sscanf(str, "%*s %*s %*s %u %llu %llu %u",
				   &sub_block, &address, &value, &instance_mask) != 4 &&
				sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
				   &sub_block, &address, &value) != 3 &&
			    sscanf(str, "%*s %*s %*s %u %llu %llu",
				   &sub_block, &address, &value) != 3)
				return -EINVAL;
			data->head.sub_block_index = sub_block;
			data->inject.address = address;
			data->inject.value = value;
			data->inject.instance_mask = instance_mask;
		}
	} else {
		/* Not ASCII: treat the write as a raw struct ras_debug_if. */
		if (size < sizeof(*data))
			return -EINVAL;

		if (copy_from_user(data, buf, sizeof(*data)))
			return -EINVAL;
	}

	return 0;
}
3357c6e68c7SAndrey Grodzovsky 
amdgpu_ras_instance_mask_check(struct amdgpu_device * adev,struct ras_debug_if * data)336f464c5ddSTao Zhou static void amdgpu_ras_instance_mask_check(struct amdgpu_device *adev,
337f464c5ddSTao Zhou 				struct ras_debug_if *data)
338f464c5ddSTao Zhou {
339f464c5ddSTao Zhou 	int num_xcc = adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask) : 1;
340f464c5ddSTao Zhou 	uint32_t mask, inst_mask = data->inject.instance_mask;
341f464c5ddSTao Zhou 
342f464c5ddSTao Zhou 	/* no need to set instance mask if there is only one instance */
343f464c5ddSTao Zhou 	if (num_xcc <= 1 && inst_mask) {
344f464c5ddSTao Zhou 		data->inject.instance_mask = 0;
345f464c5ddSTao Zhou 		dev_dbg(adev->dev,
346f464c5ddSTao Zhou 			"RAS inject mask(0x%x) isn't supported and force it to 0.\n",
347f464c5ddSTao Zhou 			inst_mask);
348f464c5ddSTao Zhou 
349f464c5ddSTao Zhou 		return;
350f464c5ddSTao Zhou 	}
351f464c5ddSTao Zhou 
352f464c5ddSTao Zhou 	switch (data->head.block) {
353f464c5ddSTao Zhou 	case AMDGPU_RAS_BLOCK__GFX:
354f464c5ddSTao Zhou 		mask = GENMASK(num_xcc - 1, 0);
355f464c5ddSTao Zhou 		break;
356f464c5ddSTao Zhou 	case AMDGPU_RAS_BLOCK__SDMA:
357f464c5ddSTao Zhou 		mask = GENMASK(adev->sdma.num_instances - 1, 0);
358f464c5ddSTao Zhou 		break;
359e3959cb5SStanley.Yang 	case AMDGPU_RAS_BLOCK__VCN:
360e3959cb5SStanley.Yang 	case AMDGPU_RAS_BLOCK__JPEG:
361e3959cb5SStanley.Yang 		mask = GENMASK(adev->vcn.num_vcn_inst - 1, 0);
362e3959cb5SStanley.Yang 		break;
363f464c5ddSTao Zhou 	default:
364e3959cb5SStanley.Yang 		mask = inst_mask;
365f464c5ddSTao Zhou 		break;
366f464c5ddSTao Zhou 	}
367f464c5ddSTao Zhou 
368f464c5ddSTao Zhou 	/* remove invalid bits in instance mask */
369f464c5ddSTao Zhou 	data->inject.instance_mask &= mask;
370f464c5ddSTao Zhou 	if (inst_mask != data->inject.instance_mask)
371f464c5ddSTao Zhou 		dev_dbg(adev->dev,
372f464c5ddSTao Zhou 			"Adjust RAS inject mask 0x%x to 0x%x\n",
373f464c5ddSTao Zhou 			inst_mask, data->inject.instance_mask);
374f464c5ddSTao Zhou }
375f464c5ddSTao Zhou 
37674abc221STom St Denis /**
37774abc221STom St Denis  * DOC: AMDGPU RAS debugfs control interface
37836ea1bd2Sxinhui pan  *
379737c375bSLuben Tuikov  * The control interface accepts struct ras_debug_if which has two members.
38036ea1bd2Sxinhui pan  *
38136ea1bd2Sxinhui pan  * First member: ras_debug_if::head or ras_debug_if::inject.
38296ebb307Sxinhui pan  *
38396ebb307Sxinhui pan  * head is used to indicate which IP block will be under control.
38436ea1bd2Sxinhui pan  *
38536ea1bd2Sxinhui pan  * head has four members, they are block, type, sub_block_index, name.
38636ea1bd2Sxinhui pan  * block: which IP will be under control.
38736ea1bd2Sxinhui pan  * type: what kind of error will be enabled/disabled/injected.
 * sub_block_index: some IPs have sub-components, e.g. GFX, sDMA.
38936ea1bd2Sxinhui pan  * name: the name of IP.
39036ea1bd2Sxinhui pan  *
3912c22ed0bSTao Zhou  * inject has three more members than head, they are address, value and mask.
39236ea1bd2Sxinhui pan  * As their names indicate, inject operation will write the
39336ea1bd2Sxinhui pan  * value to the address.
39436ea1bd2Sxinhui pan  *
395ef177d11SAlex Deucher  * The second member: struct ras_debug_if::op.
396c688a06bSGuchun Chen  * It has three kinds of operations.
397879e723dSAdam Zerella  *
398879e723dSAdam Zerella  * - 0: disable RAS on the block. Take ::head as its data.
399879e723dSAdam Zerella  * - 1: enable RAS on the block. Take ::head as its data.
400879e723dSAdam Zerella  * - 2: inject errors on the block. Take ::inject as its data.
40136ea1bd2Sxinhui pan  *
40296ebb307Sxinhui pan  * How to use the interface?
403ef177d11SAlex Deucher  *
404737c375bSLuben Tuikov  * In a program
405ef177d11SAlex Deucher  *
406737c375bSLuben Tuikov  * Copy the struct ras_debug_if in your code and initialize it.
407737c375bSLuben Tuikov  * Write the struct to the control interface.
408ef177d11SAlex Deucher  *
409737c375bSLuben Tuikov  * From shell
41096ebb307Sxinhui pan  *
411879e723dSAdam Zerella  * .. code-block:: bash
412879e723dSAdam Zerella  *
413737c375bSLuben Tuikov  *	echo "disable <block>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
414737c375bSLuben Tuikov  *	echo "enable  <block> <error>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
4152c22ed0bSTao Zhou  *	echo "inject  <block> <error> <sub-block> <address> <value> <mask>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
416879e723dSAdam Zerella  *
417737c375bSLuben Tuikov  * Where N, is the card which you want to affect.
418ef177d11SAlex Deucher  *
419737c375bSLuben Tuikov  * "disable" requires only the block.
420737c375bSLuben Tuikov  * "enable" requires the block and error type.
421737c375bSLuben Tuikov  * "inject" requires the block, error type, address, and value.
422c666bbf0SDwaipayan Ray  *
423737c375bSLuben Tuikov  * The block is one of: umc, sdma, gfx, etc.
42496ebb307Sxinhui pan  *	see ras_block_string[] for details
425c666bbf0SDwaipayan Ray  *
426737c375bSLuben Tuikov  * The error type is one of: ue, ce, where,
427737c375bSLuben Tuikov  *	ue is multi-uncorrectable
428737c375bSLuben Tuikov  *	ce is single-correctable
429c666bbf0SDwaipayan Ray  *
 * The sub-block is the sub-block index, pass 0 if there is no sub-block.
431737c375bSLuben Tuikov  * The address and value are hexadecimal numbers, leading 0x is optional.
4322c22ed0bSTao Zhou  * The mask means instance mask, is optional, default value is 0x1.
43396ebb307Sxinhui pan  *
434737c375bSLuben Tuikov  * For instance,
435879e723dSAdam Zerella  *
436879e723dSAdam Zerella  * .. code-block:: bash
437879e723dSAdam Zerella  *
43844494f96STao Zhou  *	echo inject umc ue 0x0 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
4392c22ed0bSTao Zhou  *	echo inject umc ce 0 0 0 3 > /sys/kernel/debug/dri/0/ras/ras_ctrl
44096ebb307Sxinhui pan  *	echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
44196ebb307Sxinhui pan  *
442737c375bSLuben Tuikov  * How to check the result of the operation?
44336ea1bd2Sxinhui pan  *
444737c375bSLuben Tuikov  * To check disable/enable, see "ras" features at,
44536ea1bd2Sxinhui pan  * /sys/class/drm/card[0/1/2...]/device/ras/features
44636ea1bd2Sxinhui pan  *
447737c375bSLuben Tuikov  * To check inject, see the corresponding error count at,
448737c375bSLuben Tuikov  * /sys/class/drm/card[0/1/2...]/device/ras/[gfx|sdma|umc|...]_err_count
44936ea1bd2Sxinhui pan  *
450879e723dSAdam Zerella  * .. note::
451ef177d11SAlex Deucher  *	Operations are only allowed on blocks which are supported.
452737c375bSLuben Tuikov  *	Check the "ras" mask at /sys/module/amdgpu/parameters/ras_mask
453ef177d11SAlex Deucher  *	to see which blocks support RAS on a particular asic.
454ef177d11SAlex Deucher  *
45536ea1bd2Sxinhui pan  */
/*
 * amdgpu_ras_debugfs_ctrl_write - debugfs write handler for the ras_ctrl
 * node; parses the command and dispatches on its op code.
 *
 * op 0/1 disable/enable RAS on the block, op 2 injects an error, op 3
 * retires a page directly. Returns @size on success (also when queries are
 * not yet ready, so the writer does not retry), or a negative errno.
 */
static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
					     const char __user *buf,
					     size_t size, loff_t *pos)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
	struct ras_debug_if data;
	int ret = 0;

	/* Injection is refused (but the write "succeeds") until RAS is ready. */
	if (!amdgpu_ras_get_error_query_ready(adev)) {
		dev_warn(adev->dev, "RAS WARN: error injection "
				"currently inaccessible\n");
		return size;
	}

	ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
	if (ret)
		return ret;

	/* op 3 (retire_page) does not target a specific RAS block. */
	if (data.op == 3) {
		ret = amdgpu_reserve_page_direct(adev, data.inject.address);
		if (!ret)
			return size;
		else
			return ret;
	}

	if (!amdgpu_ras_is_supported(adev, data.head.block))
		return -EINVAL;

	switch (data.op) {
	case 0:
		ret = amdgpu_ras_feature_enable(adev, &data.head, 0);
		break;
	case 1:
		ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
		break;
	case 2:
		/* Reject addresses outside VRAM (when VRAM size is known)
		 * or beyond the 52-bit inject limit.
		 */
		if ((data.inject.address >= adev->gmc.mc_vram_size &&
		    adev->gmc.mc_vram_size) ||
		    (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
			dev_warn(adev->dev, "RAS WARN: input address "
					"0x%llx is invalid.",
					data.inject.address);
			ret = -EINVAL;
			break;
		}

		/* umc ce/ue error injection for a bad page is not allowed */
		if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) &&
		    amdgpu_ras_check_bad_page(adev, data.inject.address)) {
			dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has "
				 "already been marked as bad!\n",
				 data.inject.address);
			break;
		}

		amdgpu_ras_instance_mask_check(adev, &data);

		/* data.inject.address is offset instead of absolute gpu address */
		ret = amdgpu_ras_error_inject(adev, &data.inject);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	if (ret)
		return ret;

	return size;
}
52736ea1bd2Sxinhui pan 
528084fe13bSAndrey Grodzovsky /**
529084fe13bSAndrey Grodzovsky  * DOC: AMDGPU RAS debugfs EEPROM table reset interface
530084fe13bSAndrey Grodzovsky  *
531f77c7109SAlex Deucher  * Some boards contain an EEPROM which is used to persistently store a list of
532ef177d11SAlex Deucher  * bad pages which experiences ECC errors in vram.  This interface provides
533f77c7109SAlex Deucher  * a way to reset the EEPROM, e.g., after testing error injection.
534f77c7109SAlex Deucher  *
535f77c7109SAlex Deucher  * Usage:
536f77c7109SAlex Deucher  *
537f77c7109SAlex Deucher  * .. code-block:: bash
538f77c7109SAlex Deucher  *
539f77c7109SAlex Deucher  *	echo 1 > ../ras/ras_eeprom_reset
540f77c7109SAlex Deucher  *
541f77c7109SAlex Deucher  * will reset EEPROM table to 0 entries.
542f77c7109SAlex Deucher  *
543084fe13bSAndrey Grodzovsky  */
amdgpu_ras_debugfs_eeprom_write(struct file * f,const char __user * buf,size_t size,loff_t * pos)544cf696091SLuben Tuikov static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f,
545cf696091SLuben Tuikov 					       const char __user *buf,
546084fe13bSAndrey Grodzovsky 					       size_t size, loff_t *pos)
547084fe13bSAndrey Grodzovsky {
548bf0b91b7SGuchun Chen 	struct amdgpu_device *adev =
549bf0b91b7SGuchun Chen 		(struct amdgpu_device *)file_inode(f)->i_private;
550084fe13bSAndrey Grodzovsky 	int ret;
551084fe13bSAndrey Grodzovsky 
552bf0b91b7SGuchun Chen 	ret = amdgpu_ras_eeprom_reset_table(
553bf0b91b7SGuchun Chen 		&(amdgpu_ras_get_context(adev)->eeprom_control));
554084fe13bSAndrey Grodzovsky 
55563d4c081SLuben Tuikov 	if (!ret) {
556cf696091SLuben Tuikov 		/* Something was written to EEPROM.
557cf696091SLuben Tuikov 		 */
558bf0b91b7SGuchun Chen 		amdgpu_ras_get_context(adev)->flags = RAS_DEFAULT_FLAGS;
559bf0b91b7SGuchun Chen 		return size;
560bf0b91b7SGuchun Chen 	} else {
561cf696091SLuben Tuikov 		return ret;
562bf0b91b7SGuchun Chen 	}
563084fe13bSAndrey Grodzovsky }
564084fe13bSAndrey Grodzovsky 
/* Write-only debugfs node for RAS control commands (ras_ctrl). */
static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
	.owner = THIS_MODULE,
	.read = NULL,
	.write = amdgpu_ras_debugfs_ctrl_write,
	.llseek = default_llseek
};

/* Write-only debugfs node that resets the RAS EEPROM table. */
static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
	.owner = THIS_MODULE,
	.read = NULL,
	.write = amdgpu_ras_debugfs_eeprom_write,
	.llseek = default_llseek
};
578084fe13bSAndrey Grodzovsky 
579f77c7109SAlex Deucher /**
580f77c7109SAlex Deucher  * DOC: AMDGPU RAS sysfs Error Count Interface
581f77c7109SAlex Deucher  *
582ef177d11SAlex Deucher  * It allows the user to read the error count for each IP block on the gpu through
583f77c7109SAlex Deucher  * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
584f77c7109SAlex Deucher  *
585f77c7109SAlex Deucher  * It outputs the multiple lines which report the uncorrected (ue) and corrected
586f77c7109SAlex Deucher  * (ce) error counts.
587f77c7109SAlex Deucher  *
588f77c7109SAlex Deucher  * The format of one line is below,
589f77c7109SAlex Deucher  *
590f77c7109SAlex Deucher  * [ce|ue]: count
591f77c7109SAlex Deucher  *
592f77c7109SAlex Deucher  * Example:
593f77c7109SAlex Deucher  *
594f77c7109SAlex Deucher  * .. code-block:: bash
595f77c7109SAlex Deucher  *
596f77c7109SAlex Deucher  *	ue: 0
597f77c7109SAlex Deucher  *	ce: 1
598f77c7109SAlex Deucher  *
599f77c7109SAlex Deucher  */
/*
 * amdgpu_ras_sysfs_read - sysfs show handler for a block's _err_count file.
 *
 * Emits "ue: N\nce: M\n" for the block owning @attr, or an "inaccessible"
 * message while RAS queries are not yet ready. Returns the emitted length
 * or -EINVAL when the query fails.
 */
static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr);
	struct ras_query_if info = {
		.head = obj->head,
	};

	if (!amdgpu_ras_get_error_query_ready(obj->adev))
		return sysfs_emit(buf, "Query currently inaccessible\n");

	if (amdgpu_ras_query_error_status(obj->adev, &info))
		return -EINVAL;

	/* Counters self-reset after the query on MP0 v11.0.2/v11.0.4;
	 * everything else needs an explicit reset.
	 */
	if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
	    obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
		if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
			dev_warn(obj->adev->dev, "Failed to reset error counter and error status");
	}

	return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count,
			  "ce", info.ce_count);
}
623c030f2e4Sxinhui pan 
/* obj begin */

/* Take one reference on a ras_manager object. */
#define get_obj(obj) do { (obj)->use++; } while (0)
/* Nonzero while the object holds at least one reference (is "alive"). */
#define alive_obj(obj) ((obj)->use)
628c030f2e4Sxinhui pan 
put_obj(struct ras_manager * obj)629c030f2e4Sxinhui pan static inline void put_obj(struct ras_manager *obj)
630c030f2e4Sxinhui pan {
631f0872686SBernard Zhao 	if (obj && (--obj->use == 0))
632c030f2e4Sxinhui pan 		list_del(&obj->node);
633f0872686SBernard Zhao 	if (obj && (obj->use < 0))
634640ae42eSJohn Clements 		DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", get_ras_block_str(&obj->head));
635c030f2e4Sxinhui pan }
636c030f2e4Sxinhui pan 
637c030f2e4Sxinhui pan /* make one obj and return it. */
amdgpu_ras_create_obj(struct amdgpu_device * adev,struct ras_common_if * head)638c030f2e4Sxinhui pan static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
639c030f2e4Sxinhui pan 		struct ras_common_if *head)
640c030f2e4Sxinhui pan {
641c030f2e4Sxinhui pan 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
642c030f2e4Sxinhui pan 	struct ras_manager *obj;
643c030f2e4Sxinhui pan 
6448ab0d6f0SLuben Tuikov 	if (!adev->ras_enabled || !con)
645c030f2e4Sxinhui pan 		return NULL;
646c030f2e4Sxinhui pan 
647c030f2e4Sxinhui pan 	if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
648c030f2e4Sxinhui pan 		return NULL;
649c030f2e4Sxinhui pan 
650640ae42eSJohn Clements 	if (head->block == AMDGPU_RAS_BLOCK__MCA) {
651640ae42eSJohn Clements 		if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST)
652640ae42eSJohn Clements 			return NULL;
653640ae42eSJohn Clements 
654640ae42eSJohn Clements 		obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index];
655640ae42eSJohn Clements 	} else
656c030f2e4Sxinhui pan 		obj = &con->objs[head->block];
657640ae42eSJohn Clements 
658c030f2e4Sxinhui pan 	/* already exist. return obj? */
659c030f2e4Sxinhui pan 	if (alive_obj(obj))
660c030f2e4Sxinhui pan 		return NULL;
661c030f2e4Sxinhui pan 
662c030f2e4Sxinhui pan 	obj->head = *head;
663c030f2e4Sxinhui pan 	obj->adev = adev;
664c030f2e4Sxinhui pan 	list_add(&obj->node, &con->head);
665c030f2e4Sxinhui pan 	get_obj(obj);
666c030f2e4Sxinhui pan 
667c030f2e4Sxinhui pan 	return obj;
668c030f2e4Sxinhui pan }
669c030f2e4Sxinhui pan 
670c030f2e4Sxinhui pan /* return an obj equal to head, or the first when head is NULL */
amdgpu_ras_find_obj(struct amdgpu_device * adev,struct ras_common_if * head)671f2a79be1SLe Ma struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
672c030f2e4Sxinhui pan 		struct ras_common_if *head)
673c030f2e4Sxinhui pan {
674c030f2e4Sxinhui pan 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
675c030f2e4Sxinhui pan 	struct ras_manager *obj;
676c030f2e4Sxinhui pan 	int i;
677c030f2e4Sxinhui pan 
6788ab0d6f0SLuben Tuikov 	if (!adev->ras_enabled || !con)
679c030f2e4Sxinhui pan 		return NULL;
680c030f2e4Sxinhui pan 
681c030f2e4Sxinhui pan 	if (head) {
682c030f2e4Sxinhui pan 		if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
683c030f2e4Sxinhui pan 			return NULL;
684c030f2e4Sxinhui pan 
685640ae42eSJohn Clements 		if (head->block == AMDGPU_RAS_BLOCK__MCA) {
686640ae42eSJohn Clements 			if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST)
687640ae42eSJohn Clements 				return NULL;
688640ae42eSJohn Clements 
689640ae42eSJohn Clements 			obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index];
690640ae42eSJohn Clements 		} else
691c030f2e4Sxinhui pan 			obj = &con->objs[head->block];
692c030f2e4Sxinhui pan 
693640ae42eSJohn Clements 		if (alive_obj(obj))
694c030f2e4Sxinhui pan 			return obj;
695c030f2e4Sxinhui pan 	} else {
696640ae42eSJohn Clements 		for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT + AMDGPU_RAS_MCA_BLOCK_COUNT; i++) {
697c030f2e4Sxinhui pan 			obj = &con->objs[i];
698640ae42eSJohn Clements 			if (alive_obj(obj))
699c030f2e4Sxinhui pan 				return obj;
700c030f2e4Sxinhui pan 		}
701c030f2e4Sxinhui pan 	}
702c030f2e4Sxinhui pan 
703c030f2e4Sxinhui pan 	return NULL;
704c030f2e4Sxinhui pan }
705c030f2e4Sxinhui pan /* obj end */
706c030f2e4Sxinhui pan 
707c030f2e4Sxinhui pan /* feature ctl begin */
/* Nonzero when the hardware supports RAS for @head's block.
 * Returns the raw bit from ras_hw_enabled, not a normalized bool. */
static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
					 struct ras_common_if *head)
{
	return adev->ras_hw_enabled & BIT(head->block);
}
713c030f2e4Sxinhui pan 
/* Nonzero when the RAS feature for @head's block is currently enabled
 * (tracked in con->features).  Returns the raw bit, not a normalized bool. */
static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	return con->features & BIT(head->block);
}
721c030f2e4Sxinhui pan 
722c030f2e4Sxinhui pan /*
723c030f2e4Sxinhui pan  * if obj is not created, then create one.
724c030f2e4Sxinhui pan  * set feature enable flag.
725c030f2e4Sxinhui pan  */
__amdgpu_ras_feature_enable(struct amdgpu_device * adev,struct ras_common_if * head,int enable)726c030f2e4Sxinhui pan static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
727c030f2e4Sxinhui pan 		struct ras_common_if *head, int enable)
728c030f2e4Sxinhui pan {
729c030f2e4Sxinhui pan 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
730c030f2e4Sxinhui pan 	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
731c030f2e4Sxinhui pan 
7325caf466aSxinhui pan 	/* If hardware does not support ras, then do not create obj.
7335caf466aSxinhui pan 	 * But if hardware support ras, we can create the obj.
7345caf466aSxinhui pan 	 * Ras framework checks con->hw_supported to see if it need do
7355caf466aSxinhui pan 	 * corresponding initialization.
7365caf466aSxinhui pan 	 * IP checks con->support to see if it need disable ras.
7375caf466aSxinhui pan 	 */
738c030f2e4Sxinhui pan 	if (!amdgpu_ras_is_feature_allowed(adev, head))
739c030f2e4Sxinhui pan 		return 0;
740c030f2e4Sxinhui pan 
741c030f2e4Sxinhui pan 	if (enable) {
742c030f2e4Sxinhui pan 		if (!obj) {
743c030f2e4Sxinhui pan 			obj = amdgpu_ras_create_obj(adev, head);
744c030f2e4Sxinhui pan 			if (!obj)
745c030f2e4Sxinhui pan 				return -EINVAL;
746c030f2e4Sxinhui pan 		} else {
747c030f2e4Sxinhui pan 			/* In case we create obj somewhere else */
748c030f2e4Sxinhui pan 			get_obj(obj);
749c030f2e4Sxinhui pan 		}
750c030f2e4Sxinhui pan 		con->features |= BIT(head->block);
751c030f2e4Sxinhui pan 	} else {
752c030f2e4Sxinhui pan 		if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
753c030f2e4Sxinhui pan 			con->features &= ~BIT(head->block);
754c030f2e4Sxinhui pan 			put_obj(obj);
755c030f2e4Sxinhui pan 		}
756c030f2e4Sxinhui pan 	}
757c030f2e4Sxinhui pan 
758c030f2e4Sxinhui pan 	return 0;
759c030f2e4Sxinhui pan }
760c030f2e4Sxinhui pan 
761c030f2e4Sxinhui pan /* wrapper of psp_ras_enable_features */
amdgpu_ras_feature_enable(struct amdgpu_device * adev,struct ras_common_if * head,bool enable)762c030f2e4Sxinhui pan int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
763c030f2e4Sxinhui pan 		struct ras_common_if *head, bool enable)
764c030f2e4Sxinhui pan {
765c030f2e4Sxinhui pan 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
7667fcffecfSArnd Bergmann 	union ta_ras_cmd_input *info;
7679f051d6fSHawking Zhang 	int ret;
768c030f2e4Sxinhui pan 
769c030f2e4Sxinhui pan 	if (!con)
770c030f2e4Sxinhui pan 		return -EINVAL;
771c030f2e4Sxinhui pan 
7726fc9d92cSHawking Zhang 	/* Do not enable ras feature if it is not allowed */
7736fc9d92cSHawking Zhang 	if (enable &&
7746fc9d92cSHawking Zhang 	    head->block != AMDGPU_RAS_BLOCK__GFX &&
7756fc9d92cSHawking Zhang 	    !amdgpu_ras_is_feature_allowed(adev, head))
7769f051d6fSHawking Zhang 		return 0;
7776fc9d92cSHawking Zhang 
7786fc9d92cSHawking Zhang 	/* Only enable gfx ras feature from host side */
7796fc9d92cSHawking Zhang 	if (head->block == AMDGPU_RAS_BLOCK__GFX &&
7806fc9d92cSHawking Zhang 	    !amdgpu_sriov_vf(adev) &&
7816fc9d92cSHawking Zhang 	    !amdgpu_ras_intr_triggered()) {
7827fcffecfSArnd Bergmann 		info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL);
7837fcffecfSArnd Bergmann 		if (!info)
7847fcffecfSArnd Bergmann 			return -ENOMEM;
7857fcffecfSArnd Bergmann 
786c030f2e4Sxinhui pan 		if (!enable) {
7877fcffecfSArnd Bergmann 			info->disable_features = (struct ta_ras_disable_features_input) {
788828cfa29Sxinhui pan 				.block_id =  amdgpu_ras_block_to_ta(head->block),
789828cfa29Sxinhui pan 				.error_type = amdgpu_ras_error_to_ta(head->type),
790c030f2e4Sxinhui pan 			};
791c030f2e4Sxinhui pan 		} else {
7927fcffecfSArnd Bergmann 			info->enable_features = (struct ta_ras_enable_features_input) {
793828cfa29Sxinhui pan 				.block_id =  amdgpu_ras_block_to_ta(head->block),
794828cfa29Sxinhui pan 				.error_type = amdgpu_ras_error_to_ta(head->type),
795c030f2e4Sxinhui pan 			};
796c030f2e4Sxinhui pan 		}
797c030f2e4Sxinhui pan 
7987fcffecfSArnd Bergmann 		ret = psp_ras_enable_features(&adev->psp, info, enable);
799c030f2e4Sxinhui pan 		if (ret) {
800e4348849STao Zhou 			dev_err(adev->dev, "ras %s %s failed poison:%d ret:%d\n",
801c030f2e4Sxinhui pan 				enable ? "enable":"disable",
802640ae42eSJohn Clements 				get_ras_block_str(head),
803e4348849STao Zhou 				amdgpu_ras_is_poison_mode_supported(adev), ret);
804f387bb57SCong Liu 			kfree(info);
8059f051d6fSHawking Zhang 			return ret;
806c030f2e4Sxinhui pan 		}
8079f051d6fSHawking Zhang 
8089f051d6fSHawking Zhang 		kfree(info);
809bff77e86SLe Ma 	}
810c030f2e4Sxinhui pan 
811c030f2e4Sxinhui pan 	/* setup the obj */
812c030f2e4Sxinhui pan 	__amdgpu_ras_feature_enable(adev, head, enable);
8139f051d6fSHawking Zhang 
8149f051d6fSHawking Zhang 	return 0;
815c030f2e4Sxinhui pan }
816c030f2e4Sxinhui pan 
/* Only used in device probe stage and called only once.
 *
 * Boot-time variant of amdgpu_ras_feature_enable().  When the vbios has
 * already initialized RAS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS) the driver-side
 * object state and the RAS TA must be reconciled carefully; otherwise this
 * is a plain pass-through to amdgpu_ras_feature_enable().
 */
int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
		struct ras_common_if *head, bool enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int ret;

	if (!con)
		return -EINVAL;

	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
		if (enable) {
			/* There is no harm to issue a ras TA cmd regardless of
			 * the current ras state.
			 * If current state == target state, it will do nothing
			 * But sometimes it requests driver to reset and repost
			 * with error code -EAGAIN.
			 */
			ret = amdgpu_ras_feature_enable(adev, head, 1);
			/* With old ras TA, we might fail to enable ras.
			 * Log it and just setup the object.
			 * TODO need remove this WA in the future.
			 */
			if (ret == -EINVAL) {
				ret = __amdgpu_ras_feature_enable(adev, head, 1);
				if (!ret)
					dev_info(adev->dev,
						"RAS INFO: %s setup object\n",
						get_ras_block_str(head));
			}
		} else {
			/* setup the object then issue a ras TA disable cmd.*/
			ret = __amdgpu_ras_feature_enable(adev, head, 1);
			if (ret)
				return ret;

			/* gfx block ras disable cmd must send to ras-ta, so the
			 * feature bit is set temporarily before the disable call */
			if (head->block == AMDGPU_RAS_BLOCK__GFX)
				con->features |= BIT(head->block);

			ret = amdgpu_ras_feature_enable(adev, head, 0);

			/* clean gfx block ras features flag */
			if (adev->ras_enabled && head->block == AMDGPU_RAS_BLOCK__GFX)
				con->features &= ~BIT(head->block);
		}
	} else
		ret = amdgpu_ras_feature_enable(adev, head, enable);

	return ret;
}
86877de502bSxinhui pan 
/* Disable every live RAS feature.  With @bypass the psp/TA is skipped and
 * only the driver-side objects and flags are released.  Returns the
 * feature mask still set after the walk. */
static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		int err;

		/* bypass psp: just release the obj and corresponding flags */
		err = bypass ? __amdgpu_ras_feature_enable(adev, &obj->head, 0)
			     : amdgpu_ras_feature_enable(adev, &obj->head, 0);
		if (err)
			break;
	}

	return con->features;
}
890c030f2e4Sxinhui pan 
/* Enable RAS for every block, then for every MCA sub-block.  With @bypass
 * the psp/TA is skipped (vbios already enabled ras) and only the driver
 * objects are created.  Each loop stops at the first failure.  Returns the
 * resulting feature mask. */
static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	const enum amdgpu_ras_error_type default_ras_type = AMDGPU_RAS_ERROR__NONE;
	int i;

	for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
		struct ras_common_if head = {
			.block = i,
			.type = default_ras_type,
			.sub_block_index = 0,
		};

		/* MCA sub-blocks are handled by the second loop below */
		if (i == AMDGPU_RAS_BLOCK__MCA)
			continue;

		if (bypass ? __amdgpu_ras_feature_enable(adev, &head, 1)
			   : amdgpu_ras_feature_enable(adev, &head, 1))
			break;
	}

	for (i = 0; i < AMDGPU_RAS_MCA_BLOCK_COUNT; i++) {
		struct ras_common_if head = {
			.block = AMDGPU_RAS_BLOCK__MCA,
			.type = default_ras_type,
			.sub_block_index = i,
		};

		if (bypass ? __amdgpu_ras_feature_enable(adev, &head, 1)
			   : amdgpu_ras_feature_enable(adev, &head, 1))
			break;
	}

	return con->features;
}
943c030f2e4Sxinhui pan /* feature ctl end */
944c030f2e4Sxinhui pan 
amdgpu_ras_block_match_default(struct amdgpu_ras_block_object * block_obj,enum amdgpu_ras_block block)945e3d833f4Syipechai static int amdgpu_ras_block_match_default(struct amdgpu_ras_block_object *block_obj,
946e3d833f4Syipechai 		enum amdgpu_ras_block block)
947640ae42eSJohn Clements {
9486492e1b0Syipechai 	if (!block_obj)
9496492e1b0Syipechai 		return -EINVAL;
9506492e1b0Syipechai 
951bdb3489cSyipechai 	if (block_obj->ras_comm.block == block)
9526492e1b0Syipechai 		return 0;
9536492e1b0Syipechai 
9546492e1b0Syipechai 	return -EINVAL;
955640ae42eSJohn Clements }
9566492e1b0Syipechai 
/* Walk adev->ras_list for the block object matching @block (and
 * @sub_block_index when the object supplies its own matcher).  Returns
 * NULL when @block is out of range or nothing matches. */
static struct amdgpu_ras_block_object *amdgpu_ras_get_ras_block(struct amdgpu_device *adev,
					enum amdgpu_ras_block block, uint32_t sub_block_index)
{
	struct amdgpu_ras_block_list *entry, *next;
	struct amdgpu_ras_block_object *obj;

	if (block >= AMDGPU_RAS_BLOCK__LAST)
		return NULL;

	list_for_each_entry_safe(entry, next, &adev->ras_list, node) {
		int matched;

		obj = entry->ras_obj;
		if (!obj) {
			dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
			continue;
		}

		/* prefer the object's own matcher over the default one */
		matched = obj->ras_block_match ?
			obj->ras_block_match(obj, block, sub_block_index) == 0 :
			amdgpu_ras_block_match_default(obj, block) == 0;
		if (matched)
			return obj;
	}

	return NULL;
}
984640ae42eSJohn Clements 
/* Collect UMC ECC error counts/addresses into @err_data, choosing between
 * the SMU ECC-info query and the direct UMC hw_ops query. */
static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_data *err_data)
{
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	/*
	 * choosing right query method according to
	 * whether smu support query error information
	 */
	ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(ras->umc_ecc));
	if (ret == -EOPNOTSUPP) {
		/* SMU path unsupported: query the UMC IP block directly */
		if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
			adev->umc.ras->ras_block.hw_ops->query_ras_error_count)
			adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, err_data);

		/* umc query_ras_error_address is also responsible for clearing
		 * error status
		 */
		if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
		    adev->umc.ras->ras_block.hw_ops->query_ras_error_address)
			adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, err_data);
	} else if (!ret) {
		/* SMU query succeeded: derive counts/addresses from the
		 * ecc_info cache filled by amdgpu_dpm_get_ecc_info() */
		if (adev->umc.ras &&
			adev->umc.ras->ecc_info_query_ras_error_count)
			adev->umc.ras->ecc_info_query_ras_error_count(adev, err_data);

		if (adev->umc.ras &&
			adev->umc.ras->ecc_info_query_ras_error_address)
			adev->umc.ras->ecc_info_query_ras_error_address(adev, err_data);
	}
	/* any other return value: leave err_data untouched */
}
1016fdcb279dSStanley.Yang 
1017c030f2e4Sxinhui pan /* query/inject/cure begin */
amdgpu_ras_query_error_status(struct amdgpu_device * adev,struct ras_query_if * info)1018761d86d3SDennis Li int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
1019c030f2e4Sxinhui pan 				  struct ras_query_if *info)
1020c030f2e4Sxinhui pan {
10218b0fb0e9Syipechai 	struct amdgpu_ras_block_object *block_obj = NULL;
1022c030f2e4Sxinhui pan 	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
10236f102dbaSTao Zhou 	struct ras_err_data err_data = {0, 0, 0, NULL};
1024c030f2e4Sxinhui pan 
1025c030f2e4Sxinhui pan 	if (!obj)
1026c030f2e4Sxinhui pan 		return -EINVAL;
1027c030f2e4Sxinhui pan 
1028c364e7a3SSrinivasan Shanmugam 	if (!info || info->head.block == AMDGPU_RAS_BLOCK_COUNT)
1029c364e7a3SSrinivasan Shanmugam 		return -EINVAL;
1030c364e7a3SSrinivasan Shanmugam 
10317389a5b8Syipechai 	if (info->head.block == AMDGPU_RAS_BLOCK__UMC) {
1032fdcb279dSStanley.Yang 		amdgpu_ras_get_ecc_info(adev, &err_data);
10337389a5b8Syipechai 	} else {
10347389a5b8Syipechai 		block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
10358b0fb0e9Syipechai 		if (!block_obj || !block_obj->hw_ops)   {
1036afa37315SLuben Tuikov 			dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
10378b0fb0e9Syipechai 				     get_ras_block_str(&info->head));
10388b0fb0e9Syipechai 			return -EINVAL;
10393e81ee9aSHawking Zhang 		}
1040761d86d3SDennis Li 
10418b0fb0e9Syipechai 		if (block_obj->hw_ops->query_ras_error_count)
10428b0fb0e9Syipechai 			block_obj->hw_ops->query_ras_error_count(adev, &err_data);
1043761d86d3SDennis Li 
10447389a5b8Syipechai 		if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) ||
10457389a5b8Syipechai 		    (info->head.block == AMDGPU_RAS_BLOCK__GFX) ||
10467389a5b8Syipechai 		    (info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) {
10478b0fb0e9Syipechai 				if (block_obj->hw_ops->query_ras_error_status)
10488b0fb0e9Syipechai 					block_obj->hw_ops->query_ras_error_status(adev);
10496c245386Syipechai 			}
1050939e2258SHawking Zhang 	}
105105a58345STao Zhou 
105205a58345STao Zhou 	obj->err_data.ue_count += err_data.ue_count;
105305a58345STao Zhou 	obj->err_data.ce_count += err_data.ce_count;
105405a58345STao Zhou 
1055c030f2e4Sxinhui pan 	info->ue_count = obj->err_data.ue_count;
1056c030f2e4Sxinhui pan 	info->ce_count = obj->err_data.ce_count;
1057c030f2e4Sxinhui pan 
10587c6e68c7SAndrey Grodzovsky 	if (err_data.ce_count) {
1059ffd6bde3SHawking Zhang 		if (!adev->aid_mask &&
1060ffd6bde3SHawking Zhang 		    adev->smuio.funcs &&
1061a30f1286SHawking Zhang 		    adev->smuio.funcs->get_socket_id &&
1062a30f1286SHawking Zhang 		    adev->smuio.funcs->get_die_id) {
1063a30f1286SHawking Zhang 			dev_info(adev->dev, "socket: %d, die: %d "
1064a30f1286SHawking Zhang 					"%ld correctable hardware errors "
1065a30f1286SHawking Zhang 					"detected in %s block, no user "
1066a30f1286SHawking Zhang 					"action is needed.\n",
1067a30f1286SHawking Zhang 					adev->smuio.funcs->get_socket_id(adev),
1068a30f1286SHawking Zhang 					adev->smuio.funcs->get_die_id(adev),
1069a30f1286SHawking Zhang 					obj->err_data.ce_count,
1070640ae42eSJohn Clements 					get_ras_block_str(&info->head));
1071a30f1286SHawking Zhang 		} else {
10726952e99cSGuchun Chen 			dev_info(adev->dev, "%ld correctable hardware errors "
10736952e99cSGuchun Chen 					"detected in %s block, no user "
10746952e99cSGuchun Chen 					"action is needed.\n",
10756952e99cSGuchun Chen 					obj->err_data.ce_count,
1076640ae42eSJohn Clements 					get_ras_block_str(&info->head));
10777c6e68c7SAndrey Grodzovsky 		}
1078a30f1286SHawking Zhang 	}
10797c6e68c7SAndrey Grodzovsky 	if (err_data.ue_count) {
1080ffd6bde3SHawking Zhang 		if (!adev->aid_mask &&
1081ffd6bde3SHawking Zhang 		    adev->smuio.funcs &&
1082a30f1286SHawking Zhang 		    adev->smuio.funcs->get_socket_id &&
1083a30f1286SHawking Zhang 		    adev->smuio.funcs->get_die_id) {
1084a30f1286SHawking Zhang 			dev_info(adev->dev, "socket: %d, die: %d "
1085a30f1286SHawking Zhang 					"%ld uncorrectable hardware errors "
1086a30f1286SHawking Zhang 					"detected in %s block\n",
1087a30f1286SHawking Zhang 					adev->smuio.funcs->get_socket_id(adev),
1088a30f1286SHawking Zhang 					adev->smuio.funcs->get_die_id(adev),
1089a30f1286SHawking Zhang 					obj->err_data.ue_count,
1090640ae42eSJohn Clements 					get_ras_block_str(&info->head));
1091a30f1286SHawking Zhang 		} else {
10926952e99cSGuchun Chen 			dev_info(adev->dev, "%ld uncorrectable hardware errors "
10936952e99cSGuchun Chen 					"detected in %s block\n",
10946952e99cSGuchun Chen 					obj->err_data.ue_count,
1095640ae42eSJohn Clements 					get_ras_block_str(&info->head));
10967c6e68c7SAndrey Grodzovsky 		}
1097a30f1286SHawking Zhang 	}
109805a58345STao Zhou 
1099c030f2e4Sxinhui pan 	return 0;
1100c030f2e4Sxinhui pan }
1101c030f2e4Sxinhui pan 
amdgpu_ras_reset_error_status(struct amdgpu_device * adev,enum amdgpu_ras_block block)1102761d86d3SDennis Li int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
1103761d86d3SDennis Li 		enum amdgpu_ras_block block)
1104761d86d3SDennis Li {
11058b0fb0e9Syipechai 	struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0);
11068b0fb0e9Syipechai 
1107761d86d3SDennis Li 	if (!amdgpu_ras_is_supported(adev, block))
1108761d86d3SDennis Li 		return -EINVAL;
1109761d86d3SDennis Li 
11108b0fb0e9Syipechai 	if (!block_obj || !block_obj->hw_ops)   {
1111afa37315SLuben Tuikov 		dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
1112b6efdb02Syipechai 			     ras_block_str(block));
11138b0fb0e9Syipechai 		return -EINVAL;
11148b0fb0e9Syipechai 	}
1115761d86d3SDennis Li 
11168b0fb0e9Syipechai 	if (block_obj->hw_ops->reset_ras_error_count)
11178b0fb0e9Syipechai 		block_obj->hw_ops->reset_ras_error_count(adev);
11187780f503SDennis Li 
11197389a5b8Syipechai 	if ((block == AMDGPU_RAS_BLOCK__GFX) ||
11207389a5b8Syipechai 	    (block == AMDGPU_RAS_BLOCK__MMHUB)) {
11218b0fb0e9Syipechai 		if (block_obj->hw_ops->reset_ras_error_status)
11228b0fb0e9Syipechai 			block_obj->hw_ops->reset_ras_error_status(adev);
1123761d86d3SDennis Li 	}
1124761d86d3SDennis Li 
1125761d86d3SDennis Li 	return 0;
1126761d86d3SDennis Li }
1127761d86d3SDennis Li 
/* wrapper of psp_ras_trigger_error
 *
 * Inject a RAS error described by @info into its block.  GFX blocks get
 * the raw ras_inject_if; other blocks with a custom ras_error_inject op
 * (e.g. xgmi) get the translated TA payload; everything else falls back
 * to psp_ras_trigger_error().  Returns 0 on success (always on SRIOV
 * guests, where injection is skipped), -EINVAL on a missing object or
 * hw_ops, or the inject call's error code.
 */
int amdgpu_ras_error_inject(struct amdgpu_device *adev,
		struct ras_inject_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	/* TA-facing translation of the inject request */
	struct ta_ras_trigger_error_input block_info = {
		.block_id =  amdgpu_ras_block_to_ta(info->head.block),
		.inject_error_type = amdgpu_ras_error_to_ta(info->head.type),
		.sub_block_index = info->head.sub_block_index,
		.address = info->address,
		.value = info->value,
	};
	int ret = -EINVAL;
	struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev,
							info->head.block,
							info->head.sub_block_index);

	/* inject on guest isn't allowed, return success directly */
	if (amdgpu_sriov_vf(adev))
		return 0;

	if (!obj)
		return -EINVAL;

	if (!block_obj || !block_obj->hw_ops)	{
		dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
			     get_ras_block_str(&info->head));
		return -EINVAL;
	}

	/* Calculate XGMI relative offset (non-GFX blocks on multi-node
	 * configurations address memory through the XGMI mapping) */
	if (adev->gmc.xgmi.num_physical_nodes > 1 &&
	    info->head.block != AMDGPU_RAS_BLOCK__GFX) {
		block_info.address =
			amdgpu_xgmi_get_relative_phy_addr(adev,
							  block_info.address);
	}

	if (block_obj->hw_ops->ras_error_inject) {
		/* GFX takes the raw request; other custom injectors take the
		 * TA-translated payload */
		if (info->head.block == AMDGPU_RAS_BLOCK__GFX)
			ret = block_obj->hw_ops->ras_error_inject(adev, info, info->instance_mask);
		else /* Special ras_error_inject is defined (e.g: xgmi) */
			ret = block_obj->hw_ops->ras_error_inject(adev, &block_info,
						info->instance_mask);
	} else {
		/* default path */
		ret = psp_ras_trigger_error(&adev->psp, &block_info, info->instance_mask);
	}

	if (ret)
		dev_err(adev->dev, "ras inject %s failed %d\n",
			get_ras_block_str(&info->head), ret);

	return ret;
}
1183c030f2e4Sxinhui pan 
11844d9f771eSLuben Tuikov /**
11854a1c9a44SHawking Zhang  * amdgpu_ras_query_error_count_helper -- Get error counter for specific IP
11864a1c9a44SHawking Zhang  * @adev: pointer to AMD GPU device
11874a1c9a44SHawking Zhang  * @ce_count: pointer to an integer to be set to the count of correctible errors.
11884a1c9a44SHawking Zhang  * @ue_count: pointer to an integer to be set to the count of uncorrectible errors.
11894a1c9a44SHawking Zhang  * @query_info: pointer to ras_query_if
11904a1c9a44SHawking Zhang  *
11914a1c9a44SHawking Zhang  * Return 0 for query success or do nothing, otherwise return an error
11924a1c9a44SHawking Zhang  * on failures
11934a1c9a44SHawking Zhang  */
amdgpu_ras_query_error_count_helper(struct amdgpu_device * adev,unsigned long * ce_count,unsigned long * ue_count,struct ras_query_if * query_info)11944a1c9a44SHawking Zhang static int amdgpu_ras_query_error_count_helper(struct amdgpu_device *adev,
11954a1c9a44SHawking Zhang 					       unsigned long *ce_count,
11964a1c9a44SHawking Zhang 					       unsigned long *ue_count,
11974a1c9a44SHawking Zhang 					       struct ras_query_if *query_info)
11984a1c9a44SHawking Zhang {
11994a1c9a44SHawking Zhang 	int ret;
12004a1c9a44SHawking Zhang 
12014a1c9a44SHawking Zhang 	if (!query_info)
12024a1c9a44SHawking Zhang 		/* do nothing if query_info is not specified */
12034a1c9a44SHawking Zhang 		return 0;
12044a1c9a44SHawking Zhang 
12054a1c9a44SHawking Zhang 	ret = amdgpu_ras_query_error_status(adev, query_info);
12064a1c9a44SHawking Zhang 	if (ret)
12074a1c9a44SHawking Zhang 		return ret;
12084a1c9a44SHawking Zhang 
12094a1c9a44SHawking Zhang 	*ce_count += query_info->ce_count;
12104a1c9a44SHawking Zhang 	*ue_count += query_info->ue_count;
12114a1c9a44SHawking Zhang 
12124a1c9a44SHawking Zhang 	/* some hardware/IP supports read to clear
12134a1c9a44SHawking Zhang 	 * no need to explictly reset the err status after the query call */
12144a1c9a44SHawking Zhang 	if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
12154a1c9a44SHawking Zhang 	    adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
12164a1c9a44SHawking Zhang 		if (amdgpu_ras_reset_error_status(adev, query_info->head.block))
12174a1c9a44SHawking Zhang 			dev_warn(adev->dev,
12184a1c9a44SHawking Zhang 				 "Failed to reset error counter and error status\n");
12194a1c9a44SHawking Zhang 	}
12204a1c9a44SHawking Zhang 
12214a1c9a44SHawking Zhang 	return 0;
12224a1c9a44SHawking Zhang }
12234a1c9a44SHawking Zhang 
12244a1c9a44SHawking Zhang /**
12254a1c9a44SHawking Zhang  * amdgpu_ras_query_error_count -- Get error counts of all IPs or specific IP
1226bbe04decSIsabella Basso  * @adev: pointer to AMD GPU device
1227bbe04decSIsabella Basso  * @ce_count: pointer to an integer to be set to the count of correctible errors.
1228bbe04decSIsabella Basso  * @ue_count: pointer to an integer to be set to the count of uncorrectible
12294d9f771eSLuben Tuikov  * errors.
12304a1c9a44SHawking Zhang  * @query_info: pointer to ras_query_if if the query request is only for
12314a1c9a44SHawking Zhang  * specific ip block; if info is NULL, then the qurey request is for
12324a1c9a44SHawking Zhang  * all the ip blocks that support query ras error counters/status
12334d9f771eSLuben Tuikov  *
12344d9f771eSLuben Tuikov  * If set, @ce_count or @ue_count, count and return the corresponding
12354d9f771eSLuben Tuikov  * error counts in those integer pointers. Return 0 if the device
12364d9f771eSLuben Tuikov  * supports RAS. Return -EOPNOTSUPP if the device doesn't support RAS.
12374d9f771eSLuben Tuikov  */
amdgpu_ras_query_error_count(struct amdgpu_device * adev,unsigned long * ce_count,unsigned long * ue_count,struct ras_query_if * query_info)12384d9f771eSLuben Tuikov int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
1239a46751fbSLuben Tuikov 				 unsigned long *ce_count,
12404a1c9a44SHawking Zhang 				 unsigned long *ue_count,
12414a1c9a44SHawking Zhang 				 struct ras_query_if *query_info)
1242c030f2e4Sxinhui pan {
1243c030f2e4Sxinhui pan 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1244c030f2e4Sxinhui pan 	struct ras_manager *obj;
1245a46751fbSLuben Tuikov 	unsigned long ce, ue;
12464a1c9a44SHawking Zhang 	int ret;
1247c030f2e4Sxinhui pan 
12488ab0d6f0SLuben Tuikov 	if (!adev->ras_enabled || !con)
12494d9f771eSLuben Tuikov 		return -EOPNOTSUPP;
12504d9f771eSLuben Tuikov 
12514d9f771eSLuben Tuikov 	/* Don't count since no reporting.
12524d9f771eSLuben Tuikov 	 */
12534d9f771eSLuben Tuikov 	if (!ce_count && !ue_count)
12544d9f771eSLuben Tuikov 		return 0;
1255c030f2e4Sxinhui pan 
1256a46751fbSLuben Tuikov 	ce = 0;
1257a46751fbSLuben Tuikov 	ue = 0;
12584a1c9a44SHawking Zhang 	if (!query_info) {
12594a1c9a44SHawking Zhang 		/* query all the ip blocks that support ras query interface */
1260c030f2e4Sxinhui pan 		list_for_each_entry(obj, &con->head, node) {
1261c030f2e4Sxinhui pan 			struct ras_query_if info = {
1262c030f2e4Sxinhui pan 				.head = obj->head,
1263c030f2e4Sxinhui pan 			};
1264c030f2e4Sxinhui pan 
12654a1c9a44SHawking Zhang 			ret = amdgpu_ras_query_error_count_helper(adev, &ce, &ue, &info);
12664a1c9a44SHawking Zhang 		}
12674a1c9a44SHawking Zhang 	} else {
12684a1c9a44SHawking Zhang 		/* query specific ip block */
12694a1c9a44SHawking Zhang 		ret = amdgpu_ras_query_error_count_helper(adev, &ce, &ue, query_info);
12702a460963SCandice Li 	}
12712a460963SCandice Li 
12724a1c9a44SHawking Zhang 	if (ret)
12734a1c9a44SHawking Zhang 		return ret;
1274c030f2e4Sxinhui pan 
1275a46751fbSLuben Tuikov 	if (ce_count)
1276a46751fbSLuben Tuikov 		*ce_count = ce;
1277a46751fbSLuben Tuikov 
1278a46751fbSLuben Tuikov 	if (ue_count)
1279a46751fbSLuben Tuikov 		*ue_count = ue;
12804d9f771eSLuben Tuikov 
12814d9f771eSLuben Tuikov 	return 0;
1282c030f2e4Sxinhui pan }
1283c030f2e4Sxinhui pan /* query/inject/cure end */
1284c030f2e4Sxinhui pan 
1285c030f2e4Sxinhui pan 
1286c030f2e4Sxinhui pan /* sysfs begin */
1287c030f2e4Sxinhui pan 
1288466b1793Sxinhui pan static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
1289466b1793Sxinhui pan 		struct ras_badpage **bps, unsigned int *count);
1290466b1793Sxinhui pan 
amdgpu_ras_badpage_flags_str(unsigned int flags)1291466b1793Sxinhui pan static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
1292466b1793Sxinhui pan {
1293466b1793Sxinhui pan 	switch (flags) {
129452dd95f2SGuchun Chen 	case AMDGPU_RAS_RETIRE_PAGE_RESERVED:
1295466b1793Sxinhui pan 		return "R";
129652dd95f2SGuchun Chen 	case AMDGPU_RAS_RETIRE_PAGE_PENDING:
1297466b1793Sxinhui pan 		return "P";
129852dd95f2SGuchun Chen 	case AMDGPU_RAS_RETIRE_PAGE_FAULT:
1299466b1793Sxinhui pan 	default:
1300466b1793Sxinhui pan 		return "F";
1301aec576f9STom Rix 	}
1302466b1793Sxinhui pan }
1303466b1793Sxinhui pan 
1304f77c7109SAlex Deucher /**
1305f77c7109SAlex Deucher  * DOC: AMDGPU RAS sysfs gpu_vram_bad_pages Interface
1306466b1793Sxinhui pan  *
1307466b1793Sxinhui pan  * It allows user to read the bad pages of vram on the gpu through
1308466b1793Sxinhui pan  * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages
1309466b1793Sxinhui pan  *
1310466b1793Sxinhui pan  * It outputs multiple lines, and each line stands for one gpu page.
1311466b1793Sxinhui pan  *
1312466b1793Sxinhui pan  * The format of one line is below,
1313466b1793Sxinhui pan  * gpu pfn : gpu page size : flags
1314466b1793Sxinhui pan  *
1315466b1793Sxinhui pan  * gpu pfn and gpu page size are printed in hex format.
1316466b1793Sxinhui pan  * flags can be one of below character,
1317f77c7109SAlex Deucher  *
1318466b1793Sxinhui pan  * R: reserved, this gpu page is reserved and not able to use.
1319f77c7109SAlex Deucher  *
1320466b1793Sxinhui pan  * P: pending for reserve, this gpu page is marked as bad, will be reserved
1321466b1793Sxinhui pan  * in next window of page_reserve.
1322f77c7109SAlex Deucher  *
1323466b1793Sxinhui pan  * F: unable to reserve. this gpu page can't be reserved due to some reasons.
1324466b1793Sxinhui pan  *
1325f77c7109SAlex Deucher  * Examples:
1326f77c7109SAlex Deucher  *
1327f77c7109SAlex Deucher  * .. code-block:: bash
1328f77c7109SAlex Deucher  *
1329466b1793Sxinhui pan  *	0x00000001 : 0x00001000 : R
1330466b1793Sxinhui pan  *	0x00000002 : 0x00001000 : P
1331f77c7109SAlex Deucher  *
1332466b1793Sxinhui pan  */
1333466b1793Sxinhui pan 
amdgpu_ras_sysfs_badpages_read(struct file * f,struct kobject * kobj,struct bin_attribute * attr,char * buf,loff_t ppos,size_t count)1334466b1793Sxinhui pan static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
1335466b1793Sxinhui pan 		struct kobject *kobj, struct bin_attribute *attr,
1336466b1793Sxinhui pan 		char *buf, loff_t ppos, size_t count)
1337466b1793Sxinhui pan {
1338466b1793Sxinhui pan 	struct amdgpu_ras *con =
1339466b1793Sxinhui pan 		container_of(attr, struct amdgpu_ras, badpages_attr);
1340466b1793Sxinhui pan 	struct amdgpu_device *adev = con->adev;
1341466b1793Sxinhui pan 	const unsigned int element_size =
1342466b1793Sxinhui pan 		sizeof("0xabcdabcd : 0x12345678 : R\n") - 1;
1343d6ee400eSSlava Abramov 	unsigned int start = div64_ul(ppos + element_size - 1, element_size);
1344d6ee400eSSlava Abramov 	unsigned int end = div64_ul(ppos + count - 1, element_size);
1345466b1793Sxinhui pan 	ssize_t s = 0;
1346466b1793Sxinhui pan 	struct ras_badpage *bps = NULL;
1347466b1793Sxinhui pan 	unsigned int bps_count = 0;
1348466b1793Sxinhui pan 
1349466b1793Sxinhui pan 	memset(buf, 0, count);
1350466b1793Sxinhui pan 
1351466b1793Sxinhui pan 	if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
1352466b1793Sxinhui pan 		return 0;
1353466b1793Sxinhui pan 
1354466b1793Sxinhui pan 	for (; start < end && start < bps_count; start++)
1355466b1793Sxinhui pan 		s += scnprintf(&buf[s], element_size + 1,
1356466b1793Sxinhui pan 				"0x%08x : 0x%08x : %1s\n",
1357466b1793Sxinhui pan 				bps[start].bp,
1358466b1793Sxinhui pan 				bps[start].size,
1359466b1793Sxinhui pan 				amdgpu_ras_badpage_flags_str(bps[start].flags));
1360466b1793Sxinhui pan 
1361466b1793Sxinhui pan 	kfree(bps);
1362466b1793Sxinhui pan 
1363466b1793Sxinhui pan 	return s;
1364466b1793Sxinhui pan }
1365466b1793Sxinhui pan 
amdgpu_ras_sysfs_features_read(struct device * dev,struct device_attribute * attr,char * buf)1366c030f2e4Sxinhui pan static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
1367c030f2e4Sxinhui pan 		struct device_attribute *attr, char *buf)
1368c030f2e4Sxinhui pan {
1369c030f2e4Sxinhui pan 	struct amdgpu_ras *con =
1370c030f2e4Sxinhui pan 		container_of(attr, struct amdgpu_ras, features_attr);
1371c030f2e4Sxinhui pan 
13722cffcb66Sye xingchen 	return sysfs_emit(buf, "feature mask: 0x%x\n", con->features);
1373c030f2e4Sxinhui pan }
1374c030f2e4Sxinhui pan 
amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device * adev)1375f848159bSGuchun Chen static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev)
1376f848159bSGuchun Chen {
1377f848159bSGuchun Chen 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1378f848159bSGuchun Chen 
1379de1c0959SVitaly Prosyak 	if (adev->dev->kobj.sd)
1380f848159bSGuchun Chen 		sysfs_remove_file_from_group(&adev->dev->kobj,
1381f848159bSGuchun Chen 				&con->badpages_attr.attr,
1382f848159bSGuchun Chen 				RAS_FS_NAME);
1383f848159bSGuchun Chen }
1384f848159bSGuchun Chen 
amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device * adev)1385c030f2e4Sxinhui pan static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
1386c030f2e4Sxinhui pan {
1387c030f2e4Sxinhui pan 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1388c030f2e4Sxinhui pan 	struct attribute *attrs[] = {
1389c030f2e4Sxinhui pan 		&con->features_attr.attr,
1390c030f2e4Sxinhui pan 		NULL
1391c030f2e4Sxinhui pan 	};
1392c030f2e4Sxinhui pan 	struct attribute_group group = {
1393eb0c3cd4SGuchun Chen 		.name = RAS_FS_NAME,
1394c030f2e4Sxinhui pan 		.attrs = attrs,
1395c030f2e4Sxinhui pan 	};
1396c030f2e4Sxinhui pan 
1397de1c0959SVitaly Prosyak 	if (adev->dev->kobj.sd)
1398c030f2e4Sxinhui pan 		sysfs_remove_group(&adev->dev->kobj, &group);
1399c030f2e4Sxinhui pan 
1400c030f2e4Sxinhui pan 	return 0;
1401c030f2e4Sxinhui pan }
1402c030f2e4Sxinhui pan 
amdgpu_ras_sysfs_create(struct amdgpu_device * adev,struct ras_common_if * head)1403c030f2e4Sxinhui pan int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
14049252d33dSyipechai 		struct ras_common_if *head)
1405c030f2e4Sxinhui pan {
14069252d33dSyipechai 	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
1407c030f2e4Sxinhui pan 
1408c030f2e4Sxinhui pan 	if (!obj || obj->attr_inuse)
1409c030f2e4Sxinhui pan 		return -EINVAL;
1410c030f2e4Sxinhui pan 
1411c030f2e4Sxinhui pan 	get_obj(obj);
1412c030f2e4Sxinhui pan 
14139252d33dSyipechai 	snprintf(obj->fs_data.sysfs_name, sizeof(obj->fs_data.sysfs_name),
14149252d33dSyipechai 		"%s_err_count", head->name);
1415c030f2e4Sxinhui pan 
1416c030f2e4Sxinhui pan 	obj->sysfs_attr = (struct device_attribute){
1417c030f2e4Sxinhui pan 		.attr = {
1418c030f2e4Sxinhui pan 			.name = obj->fs_data.sysfs_name,
1419c030f2e4Sxinhui pan 			.mode = S_IRUGO,
1420c030f2e4Sxinhui pan 		},
1421c030f2e4Sxinhui pan 			.show = amdgpu_ras_sysfs_read,
1422c030f2e4Sxinhui pan 	};
1423163def43Sxinhui pan 	sysfs_attr_init(&obj->sysfs_attr.attr);
1424c030f2e4Sxinhui pan 
1425c030f2e4Sxinhui pan 	if (sysfs_add_file_to_group(&adev->dev->kobj,
1426c030f2e4Sxinhui pan 				&obj->sysfs_attr.attr,
1427eb0c3cd4SGuchun Chen 				RAS_FS_NAME)) {
1428c030f2e4Sxinhui pan 		put_obj(obj);
1429c030f2e4Sxinhui pan 		return -EINVAL;
1430c030f2e4Sxinhui pan 	}
1431c030f2e4Sxinhui pan 
1432c030f2e4Sxinhui pan 	obj->attr_inuse = 1;
1433c030f2e4Sxinhui pan 
1434c030f2e4Sxinhui pan 	return 0;
1435c030f2e4Sxinhui pan }
1436c030f2e4Sxinhui pan 
amdgpu_ras_sysfs_remove(struct amdgpu_device * adev,struct ras_common_if * head)1437c030f2e4Sxinhui pan int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
1438c030f2e4Sxinhui pan 		struct ras_common_if *head)
1439c030f2e4Sxinhui pan {
1440c030f2e4Sxinhui pan 	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
1441c030f2e4Sxinhui pan 
1442c030f2e4Sxinhui pan 	if (!obj || !obj->attr_inuse)
1443c030f2e4Sxinhui pan 		return -EINVAL;
1444c030f2e4Sxinhui pan 
1445de1c0959SVitaly Prosyak 	if (adev->dev->kobj.sd)
1446c030f2e4Sxinhui pan 		sysfs_remove_file_from_group(&adev->dev->kobj,
1447c030f2e4Sxinhui pan 				&obj->sysfs_attr.attr,
1448eb0c3cd4SGuchun Chen 				RAS_FS_NAME);
1449c030f2e4Sxinhui pan 	obj->attr_inuse = 0;
1450c030f2e4Sxinhui pan 	put_obj(obj);
1451c030f2e4Sxinhui pan 
1452c030f2e4Sxinhui pan 	return 0;
1453c030f2e4Sxinhui pan }
1454c030f2e4Sxinhui pan 
amdgpu_ras_sysfs_remove_all(struct amdgpu_device * adev)1455c030f2e4Sxinhui pan static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
1456c030f2e4Sxinhui pan {
1457c030f2e4Sxinhui pan 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1458c030f2e4Sxinhui pan 	struct ras_manager *obj, *tmp;
1459c030f2e4Sxinhui pan 
1460c030f2e4Sxinhui pan 	list_for_each_entry_safe(obj, tmp, &con->head, node) {
1461c030f2e4Sxinhui pan 		amdgpu_ras_sysfs_remove(adev, &obj->head);
1462c030f2e4Sxinhui pan 	}
1463c030f2e4Sxinhui pan 
1464f848159bSGuchun Chen 	if (amdgpu_bad_page_threshold != 0)
1465f848159bSGuchun Chen 		amdgpu_ras_sysfs_remove_bad_page_node(adev);
1466f848159bSGuchun Chen 
1467c030f2e4Sxinhui pan 	amdgpu_ras_sysfs_remove_feature_node(adev);
1468c030f2e4Sxinhui pan 
1469c030f2e4Sxinhui pan 	return 0;
1470c030f2e4Sxinhui pan }
1471c030f2e4Sxinhui pan /* sysfs end */
1472c030f2e4Sxinhui pan 
1473ef177d11SAlex Deucher /**
1474ef177d11SAlex Deucher  * DOC: AMDGPU RAS Reboot Behavior for Unrecoverable Errors
1475ef177d11SAlex Deucher  *
1476ef177d11SAlex Deucher  * Normally when there is an uncorrectable error, the driver will reset
1477ef177d11SAlex Deucher  * the GPU to recover.  However, in the event of an unrecoverable error,
1478ef177d11SAlex Deucher  * the driver provides an interface to reboot the system automatically
1479ef177d11SAlex Deucher  * in that event.
1480ef177d11SAlex Deucher  *
1481ef177d11SAlex Deucher  * The following file in debugfs provides that interface:
1482ef177d11SAlex Deucher  * /sys/kernel/debug/dri/[0/1/2...]/ras/auto_reboot
1483ef177d11SAlex Deucher  *
1484ef177d11SAlex Deucher  * Usage:
1485ef177d11SAlex Deucher  *
1486ef177d11SAlex Deucher  * .. code-block:: bash
1487ef177d11SAlex Deucher  *
1488ef177d11SAlex Deucher  *	echo true > .../ras/auto_reboot
1489ef177d11SAlex Deucher  *
1490ef177d11SAlex Deucher  */
1491c030f2e4Sxinhui pan /* debugfs begin */
amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device * adev)1492ea1b8c9bSNirmoy Das static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
149336ea1bd2Sxinhui pan {
149436ea1bd2Sxinhui pan 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1495740f42a2SLuben Tuikov 	struct amdgpu_ras_eeprom_control *eeprom = &con->eeprom_control;
14964a580877SLuben Tuikov 	struct drm_minor  *minor = adev_to_drm(adev)->primary;
1497ef0d7d20SLuben Tuikov 	struct dentry     *dir;
149836ea1bd2Sxinhui pan 
149988293c03SNirmoy Das 	dir = debugfs_create_dir(RAS_FS_NAME, minor->debugfs_root);
150088293c03SNirmoy Das 	debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, dir, adev,
150188293c03SNirmoy Das 			    &amdgpu_ras_debugfs_ctrl_ops);
150288293c03SNirmoy Das 	debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, dir, adev,
150388293c03SNirmoy Das 			    &amdgpu_ras_debugfs_eeprom_ops);
15047fb64071SLuben Tuikov 	debugfs_create_u32("bad_page_cnt_threshold", 0444, dir,
15057fb64071SLuben Tuikov 			   &con->bad_page_cnt_threshold);
1506740f42a2SLuben Tuikov 	debugfs_create_u32("ras_num_recs", 0444, dir, &eeprom->ras_num_recs);
1507ef0d7d20SLuben Tuikov 	debugfs_create_x32("ras_hw_enabled", 0444, dir, &adev->ras_hw_enabled);
1508ef0d7d20SLuben Tuikov 	debugfs_create_x32("ras_enabled", 0444, dir, &adev->ras_enabled);
1509c65b0805SLuben Tuikov 	debugfs_create_file("ras_eeprom_size", S_IRUGO, dir, adev,
1510c65b0805SLuben Tuikov 			    &amdgpu_ras_debugfs_eeprom_size_ops);
1511c65b0805SLuben Tuikov 	con->de_ras_eeprom_table = debugfs_create_file("ras_eeprom_table",
1512c65b0805SLuben Tuikov 						       S_IRUGO, dir, adev,
1513c65b0805SLuben Tuikov 						       &amdgpu_ras_debugfs_eeprom_table_ops);
1514c65b0805SLuben Tuikov 	amdgpu_ras_debugfs_set_ret_size(&con->eeprom_control);
1515c688a06bSGuchun Chen 
1516c688a06bSGuchun Chen 	/*
1517c688a06bSGuchun Chen 	 * After one uncorrectable error happens, usually GPU recovery will
1518c688a06bSGuchun Chen 	 * be scheduled. But due to the known problem in GPU recovery failing
1519c688a06bSGuchun Chen 	 * to bring GPU back, below interface provides one direct way to
1520c688a06bSGuchun Chen 	 * user to reboot system automatically in such case within
1521c688a06bSGuchun Chen 	 * ERREVENT_ATHUB_INTERRUPT generated. Normal GPU recovery routine
1522c688a06bSGuchun Chen 	 * will never be called.
1523c688a06bSGuchun Chen 	 */
152488293c03SNirmoy Das 	debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, dir, &con->reboot);
152566459e1dSGuchun Chen 
152666459e1dSGuchun Chen 	/*
152766459e1dSGuchun Chen 	 * User could set this not to clean up hardware's error count register
152866459e1dSGuchun Chen 	 * of RAS IPs during ras recovery.
152966459e1dSGuchun Chen 	 */
153088293c03SNirmoy Das 	debugfs_create_bool("disable_ras_err_cnt_harvest", 0644, dir,
153188293c03SNirmoy Das 			    &con->disable_ras_err_cnt_harvest);
153288293c03SNirmoy Das 	return dir;
153336ea1bd2Sxinhui pan }
153436ea1bd2Sxinhui pan 
amdgpu_ras_debugfs_create(struct amdgpu_device * adev,struct ras_fs_if * head,struct dentry * dir)1535cedf7884SArnd Bergmann static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
153688293c03SNirmoy Das 				      struct ras_fs_if *head,
153788293c03SNirmoy Das 				      struct dentry *dir)
1538c030f2e4Sxinhui pan {
1539c030f2e4Sxinhui pan 	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
1540c030f2e4Sxinhui pan 
154188293c03SNirmoy Das 	if (!obj || !dir)
1542450f30eaSGreg Kroah-Hartman 		return;
1543c030f2e4Sxinhui pan 
1544c030f2e4Sxinhui pan 	get_obj(obj);
1545c030f2e4Sxinhui pan 
1546c030f2e4Sxinhui pan 	memcpy(obj->fs_data.debugfs_name,
1547c030f2e4Sxinhui pan 			head->debugfs_name,
1548c030f2e4Sxinhui pan 			sizeof(obj->fs_data.debugfs_name));
1549c030f2e4Sxinhui pan 
155088293c03SNirmoy Das 	debugfs_create_file(obj->fs_data.debugfs_name, S_IWUGO | S_IRUGO, dir,
155188293c03SNirmoy Das 			    obj, &amdgpu_ras_debugfs_ops);
1552c030f2e4Sxinhui pan }
1553c030f2e4Sxinhui pan 
amdgpu_ras_debugfs_create_all(struct amdgpu_device * adev)1554f9317014STao Zhou void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
1555f9317014STao Zhou {
1556f9317014STao Zhou 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
155788293c03SNirmoy Das 	struct dentry *dir;
1558c1509f3fSStanley.Yang 	struct ras_manager *obj;
1559f9317014STao Zhou 	struct ras_fs_if fs_info;
1560f9317014STao Zhou 
1561f9317014STao Zhou 	/*
1562f9317014STao Zhou 	 * it won't be called in resume path, no need to check
1563f9317014STao Zhou 	 * suspend and gpu reset status
1564f9317014STao Zhou 	 */
1565cedf7884SArnd Bergmann 	if (!IS_ENABLED(CONFIG_DEBUG_FS) || !con)
1566f9317014STao Zhou 		return;
1567f9317014STao Zhou 
156888293c03SNirmoy Das 	dir = amdgpu_ras_debugfs_create_ctrl_node(adev);
1569f9317014STao Zhou 
1570c1509f3fSStanley.Yang 	list_for_each_entry(obj, &con->head, node) {
1571f9317014STao Zhou 		if (amdgpu_ras_is_supported(adev, obj->head.block) &&
1572f9317014STao Zhou 			(obj->attr_inuse == 1)) {
1573f9317014STao Zhou 			sprintf(fs_info.debugfs_name, "%s_err_inject",
1574640ae42eSJohn Clements 					get_ras_block_str(&obj->head));
1575f9317014STao Zhou 			fs_info.head = obj->head;
157688293c03SNirmoy Das 			amdgpu_ras_debugfs_create(adev, &fs_info, dir);
1577f9317014STao Zhou 		}
1578f9317014STao Zhou 	}
1579f9317014STao Zhou }
1580f9317014STao Zhou 
1581c030f2e4Sxinhui pan /* debugfs end */
1582c030f2e4Sxinhui pan 
1583c030f2e4Sxinhui pan /* ras fs */
1584c3d4d45dSGuchun Chen static BIN_ATTR(gpu_vram_bad_pages, S_IRUGO,
1585c3d4d45dSGuchun Chen 		amdgpu_ras_sysfs_badpages_read, NULL, 0);
1586c3d4d45dSGuchun Chen static DEVICE_ATTR(features, S_IRUGO,
1587c3d4d45dSGuchun Chen 		amdgpu_ras_sysfs_features_read, NULL);
amdgpu_ras_fs_init(struct amdgpu_device * adev)1588c030f2e4Sxinhui pan static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
1589c030f2e4Sxinhui pan {
1590c3d4d45dSGuchun Chen 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1591c3d4d45dSGuchun Chen 	struct attribute_group group = {
1592c3d4d45dSGuchun Chen 		.name = RAS_FS_NAME,
1593c3d4d45dSGuchun Chen 	};
1594c3d4d45dSGuchun Chen 	struct attribute *attrs[] = {
1595c3d4d45dSGuchun Chen 		&con->features_attr.attr,
1596c3d4d45dSGuchun Chen 		NULL
1597c3d4d45dSGuchun Chen 	};
1598c3d4d45dSGuchun Chen 	struct bin_attribute *bin_attrs[] = {
1599c3d4d45dSGuchun Chen 		NULL,
1600c3d4d45dSGuchun Chen 		NULL,
1601c3d4d45dSGuchun Chen 	};
1602a069a9ebSAlex Deucher 	int r;
1603c030f2e4Sxinhui pan 
1604c3d4d45dSGuchun Chen 	/* add features entry */
1605c3d4d45dSGuchun Chen 	con->features_attr = dev_attr_features;
1606c3d4d45dSGuchun Chen 	group.attrs = attrs;
1607c3d4d45dSGuchun Chen 	sysfs_attr_init(attrs[0]);
1608c3d4d45dSGuchun Chen 
1609c3d4d45dSGuchun Chen 	if (amdgpu_bad_page_threshold != 0) {
1610c3d4d45dSGuchun Chen 		/* add bad_page_features entry */
1611c3d4d45dSGuchun Chen 		bin_attr_gpu_vram_bad_pages.private = NULL;
1612c3d4d45dSGuchun Chen 		con->badpages_attr = bin_attr_gpu_vram_bad_pages;
1613c3d4d45dSGuchun Chen 		bin_attrs[0] = &con->badpages_attr;
1614c3d4d45dSGuchun Chen 		group.bin_attrs = bin_attrs;
1615c3d4d45dSGuchun Chen 		sysfs_bin_attr_init(bin_attrs[0]);
1616c3d4d45dSGuchun Chen 	}
1617c3d4d45dSGuchun Chen 
1618a069a9ebSAlex Deucher 	r = sysfs_create_group(&adev->dev->kobj, &group);
1619a069a9ebSAlex Deucher 	if (r)
1620a069a9ebSAlex Deucher 		dev_err(adev->dev, "Failed to create RAS sysfs group!");
1621f848159bSGuchun Chen 
1622c030f2e4Sxinhui pan 	return 0;
1623c030f2e4Sxinhui pan }
1624c030f2e4Sxinhui pan 
amdgpu_ras_fs_fini(struct amdgpu_device * adev)1625c030f2e4Sxinhui pan static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
1626c030f2e4Sxinhui pan {
162788293c03SNirmoy Das 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
162888293c03SNirmoy Das 	struct ras_manager *con_obj, *ip_obj, *tmp;
162988293c03SNirmoy Das 
163088293c03SNirmoy Das 	if (IS_ENABLED(CONFIG_DEBUG_FS)) {
163188293c03SNirmoy Das 		list_for_each_entry_safe(con_obj, tmp, &con->head, node) {
163288293c03SNirmoy Das 			ip_obj = amdgpu_ras_find_obj(adev, &con_obj->head);
163388293c03SNirmoy Das 			if (ip_obj)
163488293c03SNirmoy Das 				put_obj(ip_obj);
163588293c03SNirmoy Das 		}
163688293c03SNirmoy Das 	}
163788293c03SNirmoy Das 
1638c030f2e4Sxinhui pan 	amdgpu_ras_sysfs_remove_all(adev);
1639c030f2e4Sxinhui pan 	return 0;
1640c030f2e4Sxinhui pan }
1641c030f2e4Sxinhui pan /* ras fs end */
1642c030f2e4Sxinhui pan 
1643c030f2e4Sxinhui pan /* ih begin */
1644b3c76814STao Zhou 
1645b3c76814STao Zhou /* For the hardware that cannot enable bif ring for both ras_controller_irq
1646b3c76814STao Zhou  * and ras_err_evnet_athub_irq ih cookies, the driver has to poll status
1647b3c76814STao Zhou  * register to check whether the interrupt is triggered or not, and properly
1648b3c76814STao Zhou  * ack the interrupt if it is there
1649b3c76814STao Zhou  */
amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device * adev)1650b3c76814STao Zhou void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev)
1651b3c76814STao Zhou {
1652950d6425SStanley.Yang 	/* Fatal error events are handled on host side */
16538eba7205SCandice Li 	if (amdgpu_sriov_vf(adev))
1654b3c76814STao Zhou 		return;
1655b3c76814STao Zhou 
1656b3c76814STao Zhou 	if (adev->nbio.ras &&
1657b3c76814STao Zhou 	    adev->nbio.ras->handle_ras_controller_intr_no_bifring)
1658b3c76814STao Zhou 		adev->nbio.ras->handle_ras_controller_intr_no_bifring(adev);
1659b3c76814STao Zhou 
1660b3c76814STao Zhou 	if (adev->nbio.ras &&
1661b3c76814STao Zhou 	    adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring)
1662b3c76814STao Zhou 		adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring(adev);
1663b3c76814STao Zhou }
1664b3c76814STao Zhou 
amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager * obj,struct amdgpu_iv_entry * entry)166566f87949STao Zhou static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *obj,
166666f87949STao Zhou 				struct amdgpu_iv_entry *entry)
166766f87949STao Zhou {
1668b63ac5d3STao Zhou 	bool poison_stat = false;
166966f87949STao Zhou 	struct amdgpu_device *adev = obj->adev;
167066f87949STao Zhou 	struct amdgpu_ras_block_object *block_obj =
167166f87949STao Zhou 		amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
167266f87949STao Zhou 
1673ac7b25d9SYiPeng Chai 	if (!block_obj)
1674b63ac5d3STao Zhou 		return;
1675b63ac5d3STao Zhou 
1676b63ac5d3STao Zhou 	/* both query_poison_status and handle_poison_consumption are optional,
1677b63ac5d3STao Zhou 	 * but at least one of them should be implemented if we need poison
1678b63ac5d3STao Zhou 	 * consumption handler
1679b63ac5d3STao Zhou 	 */
1680ac7b25d9SYiPeng Chai 	if (block_obj->hw_ops && block_obj->hw_ops->query_poison_status) {
1681b63ac5d3STao Zhou 		poison_stat = block_obj->hw_ops->query_poison_status(adev);
1682b63ac5d3STao Zhou 		if (!poison_stat) {
1683b63ac5d3STao Zhou 			/* Not poison consumption interrupt, no need to handle it */
1684b63ac5d3STao Zhou 			dev_info(adev->dev, "No RAS poison status in %s poison IH.\n",
1685b63ac5d3STao Zhou 					block_obj->ras_comm.name);
1686b63ac5d3STao Zhou 
1687b63ac5d3STao Zhou 			return;
1688b63ac5d3STao Zhou 		}
1689b63ac5d3STao Zhou 	}
1690b63ac5d3STao Zhou 
16911ed0e176STao Zhou 	amdgpu_umc_poison_handler(adev, false);
169266f87949STao Zhou 
1693ac7b25d9SYiPeng Chai 	if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
169466f87949STao Zhou 		poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
169566f87949STao Zhou 
1696b63ac5d3STao Zhou 	/* gpu reset is fallback for failed and default cases */
1697b63ac5d3STao Zhou 	if (poison_stat) {
1698b63ac5d3STao Zhou 		dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n",
1699b63ac5d3STao Zhou 				block_obj->ras_comm.name);
170066f87949STao Zhou 		amdgpu_ras_reset_gpu(adev);
1701ac7b25d9SYiPeng Chai 	} else {
1702ac7b25d9SYiPeng Chai 		amdgpu_gfx_poison_consumption_handler(adev, entry);
170366f87949STao Zhou 	}
1704b63ac5d3STao Zhou }
170566f87949STao Zhou 
amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager * obj,struct amdgpu_iv_entry * entry)170650a7d025STao Zhou static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj,
170750a7d025STao Zhou 				struct amdgpu_iv_entry *entry)
1708c030f2e4Sxinhui pan {
1709f524dd54STao Zhou 	dev_info(obj->adev->dev,
1710f524dd54STao Zhou 		"Poison is created, no user action is needed.\n");
171150a7d025STao Zhou }
171250a7d025STao Zhou 
amdgpu_ras_interrupt_umc_handler(struct ras_manager * obj,struct amdgpu_iv_entry * entry)171350a7d025STao Zhou static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
171450a7d025STao Zhou 				struct amdgpu_iv_entry *entry)
171550a7d025STao Zhou {
171650a7d025STao Zhou 	struct ras_ih_data *data = &obj->ih_data;
171750a7d025STao Zhou 	struct ras_err_data err_data = {0, 0, 0, NULL};
171850a7d025STao Zhou 	int ret;
171950a7d025STao Zhou 
172050a7d025STao Zhou 	if (!data->cb)
172150a7d025STao Zhou 		return;
172250a7d025STao Zhou 
1723c030f2e4Sxinhui pan 	/* Let IP handle its data, maybe we need get the output
172450a7d025STao Zhou 	 * from the callback to update the error type/count, etc
1725c030f2e4Sxinhui pan 	 */
172650a7d025STao Zhou 	ret = data->cb(obj->adev, &err_data, entry);
1727c030f2e4Sxinhui pan 	/* ue will trigger an interrupt, and in that case
1728c030f2e4Sxinhui pan 	 * we need do a reset to recovery the whole system.
1729c030f2e4Sxinhui pan 	 * But leave IP do that recovery, here we just dispatch
1730c030f2e4Sxinhui pan 	 * the error.
1731c030f2e4Sxinhui pan 	 */
1732bd2280daSTao Zhou 	if (ret == AMDGPU_RAS_SUCCESS) {
173351437623STao Zhou 		/* these counts could be left as 0 if
173451437623STao Zhou 		 * some blocks do not count error number
1735c030f2e4Sxinhui pan 		 */
173651437623STao Zhou 		obj->err_data.ue_count += err_data.ue_count;
173751437623STao Zhou 		obj->err_data.ce_count += err_data.ce_count;
173851437623STao Zhou 	}
1739c030f2e4Sxinhui pan }
174050a7d025STao Zhou 
amdgpu_ras_interrupt_handler(struct ras_manager * obj)174150a7d025STao Zhou static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
174250a7d025STao Zhou {
174350a7d025STao Zhou 	struct ras_ih_data *data = &obj->ih_data;
174450a7d025STao Zhou 	struct amdgpu_iv_entry entry;
174550a7d025STao Zhou 
174650a7d025STao Zhou 	while (data->rptr != data->wptr) {
174750a7d025STao Zhou 		rmb();
174850a7d025STao Zhou 		memcpy(&entry, &data->ring[data->rptr],
174950a7d025STao Zhou 				data->element_size);
175050a7d025STao Zhou 
175150a7d025STao Zhou 		wmb();
175250a7d025STao Zhou 		data->rptr = (data->aligned_element_size +
175350a7d025STao Zhou 				data->rptr) % data->ring_size;
175450a7d025STao Zhou 
175550a7d025STao Zhou 		if (amdgpu_ras_is_poison_mode_supported(obj->adev)) {
175650a7d025STao Zhou 			if (obj->head.block == AMDGPU_RAS_BLOCK__UMC)
175750a7d025STao Zhou 				amdgpu_ras_interrupt_poison_creation_handler(obj, &entry);
175866f87949STao Zhou 			else
175966f87949STao Zhou 				amdgpu_ras_interrupt_poison_consumption_handler(obj, &entry);
176050a7d025STao Zhou 		} else {
176150a7d025STao Zhou 			if (obj->head.block == AMDGPU_RAS_BLOCK__UMC)
176250a7d025STao Zhou 				amdgpu_ras_interrupt_umc_handler(obj, &entry);
176350a7d025STao Zhou 			else
176450a7d025STao Zhou 				dev_warn(obj->adev->dev,
176550a7d025STao Zhou 					"No RAS interrupt handler for non-UMC block with poison disabled.\n");
1766c030f2e4Sxinhui pan 		}
1767c030f2e4Sxinhui pan 	}
1768f524dd54STao Zhou }
1769c030f2e4Sxinhui pan 
amdgpu_ras_interrupt_process_handler(struct work_struct * work)1770c030f2e4Sxinhui pan static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
1771c030f2e4Sxinhui pan {
1772c030f2e4Sxinhui pan 	struct ras_ih_data *data =
1773c030f2e4Sxinhui pan 		container_of(work, struct ras_ih_data, ih_work);
1774c030f2e4Sxinhui pan 	struct ras_manager *obj =
1775c030f2e4Sxinhui pan 		container_of(data, struct ras_manager, ih_data);
1776c030f2e4Sxinhui pan 
1777c030f2e4Sxinhui pan 	amdgpu_ras_interrupt_handler(obj);
1778c030f2e4Sxinhui pan }
1779c030f2e4Sxinhui pan 
/* Queue one IV entry onto the per-block interrupt ring and kick the
 * drain worker.
 *
 * Returns -EINVAL if no ras_manager exists for info->head, otherwise 0
 * (including when the handler is not armed, in which case the entry is
 * silently dropped).
 */
int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
		struct ras_dispatch_if *info)
{
	struct ras_manager *obj;
	struct ras_ih_data *data;

	obj = amdgpu_ras_find_obj(adev, &info->head);
	if (!obj)
		return -EINVAL;

	data = &obj->ih_data;

	if (data->inuse == 0)
		return 0;

	/* Might be overflow... there is no rptr check here, so a burst of
	 * entries larger than the ring can overwrite unprocessed ones —
	 * NOTE(review): confirm whether that is acceptable for this path.
	 */
	memcpy(&data->ring[data->wptr], info->entry,
			data->element_size);

	/* publish the entry before advancing wptr so the worker never
	 * reads a partially-written record */
	wmb();
	data->wptr = (data->aligned_element_size +
			data->wptr) % data->ring_size;

	schedule_work(&data->ih_work);

	return 0;
}
1807c030f2e4Sxinhui pan 
amdgpu_ras_interrupt_remove_handler(struct amdgpu_device * adev,struct ras_common_if * head)1808c030f2e4Sxinhui pan int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
18099252d33dSyipechai 		struct ras_common_if *head)
1810c030f2e4Sxinhui pan {
18119252d33dSyipechai 	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
1812c030f2e4Sxinhui pan 	struct ras_ih_data *data;
1813c030f2e4Sxinhui pan 
1814c030f2e4Sxinhui pan 	if (!obj)
1815c030f2e4Sxinhui pan 		return -EINVAL;
1816c030f2e4Sxinhui pan 
1817c030f2e4Sxinhui pan 	data = &obj->ih_data;
1818c030f2e4Sxinhui pan 	if (data->inuse == 0)
1819c030f2e4Sxinhui pan 		return 0;
1820c030f2e4Sxinhui pan 
1821c030f2e4Sxinhui pan 	cancel_work_sync(&data->ih_work);
1822c030f2e4Sxinhui pan 
1823c030f2e4Sxinhui pan 	kfree(data->ring);
1824c030f2e4Sxinhui pan 	memset(data, 0, sizeof(*data));
1825c030f2e4Sxinhui pan 	put_obj(obj);
1826c030f2e4Sxinhui pan 
1827c030f2e4Sxinhui pan 	return 0;
1828c030f2e4Sxinhui pan }
1829c030f2e4Sxinhui pan 
/* Register an interrupt handler for the RAS block described by @head.
 *
 * Creates (or takes a reference on) the matching ras_manager, wires up
 * the block's ras_cb callback and allocates the IV ring drained by
 * amdgpu_ras_interrupt_process_handler().  Returns 0 on success,
 * -EINVAL if no ras_manager could be created, -ENOMEM if the ring
 * allocation fails.
 */
int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
	struct ras_ih_data *data;
	struct amdgpu_ras_block_object *ras_obj;

	if (!obj) {
		/* in case we register the IH before enabling the ras feature */
		obj = amdgpu_ras_create_obj(adev, head);
		if (!obj)
			return -EINVAL;
	} else
		get_obj(obj);

	/* @head is embedded in the block object; recover the container to
	 * reach its ras_cb */
	ras_obj = container_of(head, struct amdgpu_ras_block_object, ras_comm);

	data = &obj->ih_data;
	/* add the callback.etc */
	*data = (struct ras_ih_data) {
		.inuse = 0,
		.cb = ras_obj->ras_cb,
		.element_size = sizeof(struct amdgpu_iv_entry),
		.rptr = 0,
		.wptr = 0,
	};

	INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler);

	data->aligned_element_size = ALIGN(data->element_size, 8);
	/* the ring can store 64 iv entries. */
	data->ring_size = 64 * data->aligned_element_size;
	data->ring = kmalloc(data->ring_size, GFP_KERNEL);
	if (!data->ring) {
		put_obj(obj);
		return -ENOMEM;
	}

	/* IH is ready */
	data->inuse = 1;

	return 0;
}
1873c030f2e4Sxinhui pan 
amdgpu_ras_interrupt_remove_all(struct amdgpu_device * adev)1874c030f2e4Sxinhui pan static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
1875c030f2e4Sxinhui pan {
1876c030f2e4Sxinhui pan 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1877c030f2e4Sxinhui pan 	struct ras_manager *obj, *tmp;
1878c030f2e4Sxinhui pan 
1879c030f2e4Sxinhui pan 	list_for_each_entry_safe(obj, tmp, &con->head, node) {
18809252d33dSyipechai 		amdgpu_ras_interrupt_remove_handler(adev, &obj->head);
1881c030f2e4Sxinhui pan 	}
1882c030f2e4Sxinhui pan 
1883c030f2e4Sxinhui pan 	return 0;
1884c030f2e4Sxinhui pan }
1885c030f2e4Sxinhui pan /* ih end */
1886c030f2e4Sxinhui pan 
/* Traverse all IPs except NBIO to query their error counters. */
static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;

	if (!adev->ras_enabled || !con)
		return;

	list_for_each_entry(obj, &con->head, node) {
		struct ras_query_if info = {
			.head = obj->head,
		};

		/*
		 * PCIE_BIF IP has one different isr by ras controller
		 * interrupt, the specific ras counter query will be
		 * done in that isr. So skip such block from common
		 * sync flood interrupt isr calling.
		 */
		if (info.head.block == AMDGPU_RAS_BLOCK__PCIE_BIF)
			continue;

		/*
		 * this is a workaround for aldebaran, skip send msg to
		 * smu to get ecc_info table due to smu handle get ecc
		 * info table failed temporarily.
		 * should be removed until smu fix handle ecc_info table.
		 */
		if ((info.head.block == AMDGPU_RAS_BLOCK__UMC) &&
			(adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2)))
			continue;

		amdgpu_ras_query_error_status(adev, &info);

		/* NOTE(review): the excluded MP0 versions presumably reset
		 * their counters elsewhere — confirm before changing this
		 * list. */
		if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
		    adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4) &&
		    adev->ip_versions[MP0_HWIP][0] != IP_VERSION(13, 0, 0)) {
			if (amdgpu_ras_reset_error_status(adev, info.head.block))
				dev_warn(adev->dev, "Failed to reset error counter and error status");
		}
	}
}
1930313c8fd3SGuchun Chen 
19313f975d0fSStanley.Yang /* Parse RdRspStatus and WrRspStatus */
amdgpu_ras_error_status_query(struct amdgpu_device * adev,struct ras_query_if * info)1932cd92df93SLee Jones static void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
19333f975d0fSStanley.Yang 					  struct ras_query_if *info)
19343f975d0fSStanley.Yang {
19358eb53bb2Syipechai 	struct amdgpu_ras_block_object *block_obj;
19363f975d0fSStanley.Yang 	/*
19373f975d0fSStanley.Yang 	 * Only two block need to query read/write
19383f975d0fSStanley.Yang 	 * RspStatus at current state
19393f975d0fSStanley.Yang 	 */
19405e67bba3Syipechai 	if ((info->head.block != AMDGPU_RAS_BLOCK__GFX) &&
19415e67bba3Syipechai 		(info->head.block != AMDGPU_RAS_BLOCK__MMHUB))
19425e67bba3Syipechai 		return;
19435e67bba3Syipechai 
1944b6efdb02Syipechai 	block_obj = amdgpu_ras_get_ras_block(adev,
1945b6efdb02Syipechai 					info->head.block,
1946b6efdb02Syipechai 					info->head.sub_block_index);
1947b6efdb02Syipechai 
19488b0fb0e9Syipechai 	if (!block_obj || !block_obj->hw_ops) {
1949afa37315SLuben Tuikov 		dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
1950b6efdb02Syipechai 			     get_ras_block_str(&info->head));
19518b0fb0e9Syipechai 		return;
19523f975d0fSStanley.Yang 	}
19538b0fb0e9Syipechai 
19548b0fb0e9Syipechai 	if (block_obj->hw_ops->query_ras_error_status)
19558b0fb0e9Syipechai 		block_obj->hw_ops->query_ras_error_status(adev);
19565e67bba3Syipechai 
19573f975d0fSStanley.Yang }
19583f975d0fSStanley.Yang 
amdgpu_ras_query_err_status(struct amdgpu_device * adev)19593f975d0fSStanley.Yang static void amdgpu_ras_query_err_status(struct amdgpu_device *adev)
19603f975d0fSStanley.Yang {
19613f975d0fSStanley.Yang 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
19623f975d0fSStanley.Yang 	struct ras_manager *obj;
19633f975d0fSStanley.Yang 
19648ab0d6f0SLuben Tuikov 	if (!adev->ras_enabled || !con)
19653f975d0fSStanley.Yang 		return;
19663f975d0fSStanley.Yang 
19673f975d0fSStanley.Yang 	list_for_each_entry(obj, &con->head, node) {
19683f975d0fSStanley.Yang 		struct ras_query_if info = {
19693f975d0fSStanley.Yang 			.head = obj->head,
19703f975d0fSStanley.Yang 		};
19713f975d0fSStanley.Yang 
19723f975d0fSStanley.Yang 		amdgpu_ras_error_status_query(adev, &info);
19733f975d0fSStanley.Yang 	}
19743f975d0fSStanley.Yang }
19753f975d0fSStanley.Yang 
1976c030f2e4Sxinhui pan /* recovery begin */
1977466b1793Sxinhui pan 
1978466b1793Sxinhui pan /* return 0 on success.
1979466b1793Sxinhui pan  * caller need free bps.
1980466b1793Sxinhui pan  */
amdgpu_ras_badpages_read(struct amdgpu_device * adev,struct ras_badpage ** bps,unsigned int * count)1981466b1793Sxinhui pan static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
1982466b1793Sxinhui pan 		struct ras_badpage **bps, unsigned int *count)
1983466b1793Sxinhui pan {
1984466b1793Sxinhui pan 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1985466b1793Sxinhui pan 	struct ras_err_handler_data *data;
1986466b1793Sxinhui pan 	int i = 0;
1987732f2a30SDennis Li 	int ret = 0, status;
1988466b1793Sxinhui pan 
1989466b1793Sxinhui pan 	if (!con || !con->eh_data || !bps || !count)
1990466b1793Sxinhui pan 		return -EINVAL;
1991466b1793Sxinhui pan 
1992466b1793Sxinhui pan 	mutex_lock(&con->recovery_lock);
1993466b1793Sxinhui pan 	data = con->eh_data;
1994466b1793Sxinhui pan 	if (!data || data->count == 0) {
1995466b1793Sxinhui pan 		*bps = NULL;
199646cf2fecSGuchun Chen 		ret = -EINVAL;
1997466b1793Sxinhui pan 		goto out;
1998466b1793Sxinhui pan 	}
1999466b1793Sxinhui pan 
2000466b1793Sxinhui pan 	*bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL);
2001466b1793Sxinhui pan 	if (!*bps) {
2002466b1793Sxinhui pan 		ret = -ENOMEM;
2003466b1793Sxinhui pan 		goto out;
2004466b1793Sxinhui pan 	}
2005466b1793Sxinhui pan 
2006466b1793Sxinhui pan 	for (; i < data->count; i++) {
2007466b1793Sxinhui pan 		(*bps)[i] = (struct ras_badpage){
20089dc23a63STao Zhou 			.bp = data->bps[i].retired_page,
2009466b1793Sxinhui pan 			.size = AMDGPU_GPU_PAGE_SIZE,
201052dd95f2SGuchun Chen 			.flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
2011466b1793Sxinhui pan 		};
2012ec6aae97SNirmoy Das 		status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr,
2013676deb38SDennis Li 				data->bps[i].retired_page);
2014732f2a30SDennis Li 		if (status == -EBUSY)
201552dd95f2SGuchun Chen 			(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
2016732f2a30SDennis Li 		else if (status == -ENOENT)
201752dd95f2SGuchun Chen 			(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT;
2018466b1793Sxinhui pan 	}
2019466b1793Sxinhui pan 
2020466b1793Sxinhui pan 	*count = data->count;
2021466b1793Sxinhui pan out:
2022466b1793Sxinhui pan 	mutex_unlock(&con->recovery_lock);
2023466b1793Sxinhui pan 	return ret;
2024466b1793Sxinhui pan }
2025466b1793Sxinhui pan 
/* Deferred GPU recovery after a RAS fatal error.
 *
 * Runs from ras->recovery_work.  First harvests error counters from
 * every device in the XGMI hive (unless harvesting is disabled), then
 * performs a GPU reset whose mode (mode1 / mode2 / full) is selected
 * from ras->gpu_reset_flags and poison-mode support.
 */
static void amdgpu_ras_do_recovery(struct work_struct *work)
{
	struct amdgpu_ras *ras =
		container_of(work, struct amdgpu_ras, recovery_work);
	struct amdgpu_device *remote_adev = NULL;
	struct amdgpu_device *adev = ras->adev;
	struct list_head device_list, *device_list_handle =  NULL;

	if (!ras->disable_ras_err_cnt_harvest) {
		struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);

		/* Build list of devices to query RAS related errors */
		if  (hive && adev->gmc.xgmi.num_physical_nodes > 1) {
			device_list_handle = &hive->device_list;
		} else {
			/* single GPU: a one-element list of ourselves */
			INIT_LIST_HEAD(&device_list);
			list_add_tail(&adev->gmc.xgmi.head, &device_list);
			device_list_handle = &device_list;
		}

		list_for_each_entry(remote_adev,
				device_list_handle, gmc.xgmi.head) {
			amdgpu_ras_query_err_status(remote_adev);
			amdgpu_ras_log_on_err_counter(remote_adev);
		}

		amdgpu_put_xgmi_hive(hive);
	}

	if (amdgpu_device_should_recover_gpu(ras->adev)) {
		struct amdgpu_reset_context reset_context;
		memset(&reset_context, 0, sizeof(reset_context));

		reset_context.method = AMD_RESET_METHOD_NONE;
		reset_context.reset_req_dev = adev;

		/* Perform full reset in fatal error mode */
		if (!amdgpu_ras_is_poison_mode_supported(ras->adev))
			set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
		else {
			clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

			if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) {
				ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE2_RESET;
				reset_context.method = AMD_RESET_METHOD_MODE2;
			}

			/* Fatal error occurs in poison mode, mode1 reset is used to
			 * recover gpu.  Note this can override the MODE2 choice
			 * above when both flags are set.
			 */
			if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) {
				ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET;
				set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

				psp_fatal_error_recovery_quirk(&adev->psp);
			}
		}

		amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
	}
	atomic_set(&ras->in_recovery, 0);
}
2088c030f2e4Sxinhui pan 
2089c030f2e4Sxinhui pan /* alloc/realloc bps array */
amdgpu_ras_realloc_eh_data_space(struct amdgpu_device * adev,struct ras_err_handler_data * data,int pages)2090c030f2e4Sxinhui pan static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
2091c030f2e4Sxinhui pan 		struct ras_err_handler_data *data, int pages)
2092c030f2e4Sxinhui pan {
2093c030f2e4Sxinhui pan 	unsigned int old_space = data->count + data->space_left;
2094c030f2e4Sxinhui pan 	unsigned int new_space = old_space + pages;
20959dc23a63STao Zhou 	unsigned int align_space = ALIGN(new_space, 512);
20969dc23a63STao Zhou 	void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
2097c030f2e4Sxinhui pan 
2098676deb38SDennis Li 	if (!bps) {
2099c030f2e4Sxinhui pan 		return -ENOMEM;
21009dc23a63STao Zhou 	}
2101c030f2e4Sxinhui pan 
2102c030f2e4Sxinhui pan 	if (data->bps) {
21039dc23a63STao Zhou 		memcpy(bps, data->bps,
2104c030f2e4Sxinhui pan 				data->count * sizeof(*data->bps));
2105c030f2e4Sxinhui pan 		kfree(data->bps);
2106c030f2e4Sxinhui pan 	}
2107c030f2e4Sxinhui pan 
21089dc23a63STao Zhou 	data->bps = bps;
2109c030f2e4Sxinhui pan 	data->space_left += align_space - old_space;
2110c030f2e4Sxinhui pan 	return 0;
2111c030f2e4Sxinhui pan }
2112c030f2e4Sxinhui pan 
2113c030f2e4Sxinhui pan /* it deal with vram only. */
/* Append new bad-page records to the in-memory list (VRAM only).
 *
 * Duplicates already on the list are skipped; each newly recorded page
 * is also reserved in the VRAM manager so it can never be handed out
 * again.  Returns 0 on success (including the nothing-to-do cases),
 * -ENOMEM if the records array cannot be grown.
 */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
		struct eeprom_table_record *bps, int pages)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	int ret = 0;
	uint32_t i;

	if (!con || !con->eh_data || !bps || pages <= 0)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;

	for (i = 0; i < pages; i++) {
		/* skip pages that are already recorded */
		if (amdgpu_ras_check_bad_page_unlock(con,
			bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
			continue;

		/* grow the array in chunks of 256 records when exhausted */
		if (!data->space_left &&
			amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
			ret = -ENOMEM;
			goto out;
		}

		amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr,
			bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT,
			AMDGPU_GPU_PAGE_SIZE);

		memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps));
		data->count++;
		data->space_left--;
	}
out:
	mutex_unlock(&con->recovery_lock);

	return ret;
}
2154c030f2e4Sxinhui pan 
215578ad00c9STao Zhou /*
215678ad00c9STao Zhou  * write error record array to eeprom, the function should be
215778ad00c9STao Zhou  * protected by recovery_lock
21584d33e0f1STao Zhou  * new_cnt: new added UE count, excluding reserved bad pages, can be NULL
215978ad00c9STao Zhou  */
int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
		unsigned long *new_cnt)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	struct amdgpu_ras_eeprom_control *control;
	int save_count;

	if (!con || !con->eh_data) {
		if (new_cnt)
			*new_cnt = 0;

		return 0;
	}

	mutex_lock(&con->recovery_lock);
	control = &con->eeprom_control;
	data = con->eh_data;
	/* records at index >= ras_num_recs have not been flushed yet */
	save_count = data->count - control->ras_num_recs;
	mutex_unlock(&con->recovery_lock);

	if (new_cnt)
		*new_cnt = save_count / adev->umc.retire_unit;

	/* only new entries are saved */
	if (save_count > 0) {
		/* NOTE(review): the append happens after dropping
		 * recovery_lock — presumably safe because callers hold it,
		 * per the comment above; confirm. */
		if (amdgpu_ras_eeprom_append(control,
					     &data->bps[control->ras_num_recs],
					     save_count)) {
			dev_err(adev->dev, "Failed to save EEPROM table data!");
			return -EIO;
		}

		dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count);
	}

	return 0;
}
219878ad00c9STao Zhou 
219978ad00c9STao Zhou /*
220078ad00c9STao Zhou  * read error record array in eeprom and reserve enough space for
220178ad00c9STao Zhou  * storing new bad pages
220278ad00c9STao Zhou  */
amdgpu_ras_load_bad_pages(struct amdgpu_device * adev)220378ad00c9STao Zhou static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
220478ad00c9STao Zhou {
220578ad00c9STao Zhou 	struct amdgpu_ras_eeprom_control *control =
22066457205cSCandice Li 		&adev->psp.ras_context.ras->eeprom_control;
2207e4e6a589SLuben Tuikov 	struct eeprom_table_record *bps;
2208e4e6a589SLuben Tuikov 	int ret;
220978ad00c9STao Zhou 
221078ad00c9STao Zhou 	/* no bad page record, skip eeprom access */
22110686627bSLuben Tuikov 	if (control->ras_num_recs == 0 || amdgpu_bad_page_threshold == 0)
2212e4e6a589SLuben Tuikov 		return 0;
221378ad00c9STao Zhou 
22140686627bSLuben Tuikov 	bps = kcalloc(control->ras_num_recs, sizeof(*bps), GFP_KERNEL);
221578ad00c9STao Zhou 	if (!bps)
221678ad00c9STao Zhou 		return -ENOMEM;
221778ad00c9STao Zhou 
22180686627bSLuben Tuikov 	ret = amdgpu_ras_eeprom_read(control, bps, control->ras_num_recs);
2219e4e6a589SLuben Tuikov 	if (ret)
22206952e99cSGuchun Chen 		dev_err(adev->dev, "Failed to load EEPROM table records!");
2221e4e6a589SLuben Tuikov 	else
22220686627bSLuben Tuikov 		ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs);
222378ad00c9STao Zhou 
222478ad00c9STao Zhou 	kfree(bps);
222578ad00c9STao Zhou 	return ret;
222678ad00c9STao Zhou }
222778ad00c9STao Zhou 
amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras * con,uint64_t addr)2228676deb38SDennis Li static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
2229676deb38SDennis Li 				uint64_t addr)
2230676deb38SDennis Li {
2231676deb38SDennis Li 	struct ras_err_handler_data *data = con->eh_data;
2232676deb38SDennis Li 	int i;
2233676deb38SDennis Li 
2234676deb38SDennis Li 	addr >>= AMDGPU_GPU_PAGE_SHIFT;
2235676deb38SDennis Li 	for (i = 0; i < data->count; i++)
2236676deb38SDennis Li 		if (addr == data->bps[i].retired_page)
2237676deb38SDennis Li 			return true;
2238676deb38SDennis Li 
2239676deb38SDennis Li 	return false;
2240676deb38SDennis Li }
2241676deb38SDennis Li 
22426e4be987STao Zhou /*
22436e4be987STao Zhou  * check if an address belongs to bad page
22446e4be987STao Zhou  *
22456e4be987STao Zhou  * Note: this check is only for umc block
22466e4be987STao Zhou  */
amdgpu_ras_check_bad_page(struct amdgpu_device * adev,uint64_t addr)22476e4be987STao Zhou static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
22486e4be987STao Zhou 				uint64_t addr)
22496e4be987STao Zhou {
22506e4be987STao Zhou 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
22516e4be987STao Zhou 	bool ret = false;
22526e4be987STao Zhou 
22536e4be987STao Zhou 	if (!con || !con->eh_data)
22546e4be987STao Zhou 		return ret;
22556e4be987STao Zhou 
22566e4be987STao Zhou 	mutex_lock(&con->recovery_lock);
2257676deb38SDennis Li 	ret = amdgpu_ras_check_bad_page_unlock(con, addr);
22586e4be987STao Zhou 	mutex_unlock(&con->recovery_lock);
22596e4be987STao Zhou 	return ret;
22606e4be987STao Zhou }
22616e4be987STao Zhou 
/* Compute con->bad_page_cnt_threshold from the module parameter and
 * @max_count (the EEPROM table's maximum record count).
 */
static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
					  uint32_t max_count)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	/*
	 * Justification of value bad_page_cnt_threshold in ras structure
	 *
	 * Generally, 0 <= amdgpu_bad_page_threshold <= max record length
	 * in eeprom or amdgpu_bad_page_threshold == -2, introduce two
	 * scenarios accordingly.
	 *
	 * Bad page retirement enablement:
	 *    - If amdgpu_bad_page_threshold = -2,
	 *      bad_page_cnt_threshold = typical value by formula.
	 *
	 *    - When the value from user is 0 < amdgpu_bad_page_threshold <
	 *      max record length in eeprom, use it directly.
	 *
	 * Bad page retirement disablement:
	 *    - If amdgpu_bad_page_threshold = 0, bad page retirement
	 *      functionality is disabled, and bad_page_cnt_threshold will
	 *      take no effect.
	 */

	if (amdgpu_bad_page_threshold < 0) {
		/* "typical value by formula": one page per
		 * RAS_BAD_PAGE_COVER bytes of VRAM, capped at max_count */
		u64 val = adev->gmc.mc_vram_size;

		do_div(val, RAS_BAD_PAGE_COVER);
		con->bad_page_cnt_threshold = min(lower_32_bits(val),
						  max_count);
	} else {
		con->bad_page_cnt_threshold = min_t(int, max_count,
						    amdgpu_bad_page_threshold);
	}
}
2298c84d4670SGuchun Chen 
/* Initialize the RAS bad-page recovery machinery.
 *
 * Allocates the error-handler data, sets up the recovery work item,
 * initializes the EEPROM control and loads previously retired pages.
 * Returns 0 in most failure cases so driver init can continue;
 * only the bad-page-threshold-exceeded case fails with -EINVAL.
 */
int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data **data;
	u32  max_eeprom_records_count = 0;
	bool exc_err_limit = false;
	int ret;

	if (!con || amdgpu_sriov_vf(adev))
		return 0;

	/* Allow access to RAS EEPROM via debugfs, when the ASIC
	 * supports RAS and debugfs is enabled, but when
	 * adev->ras_enabled is unset, i.e. when "ras_enable"
	 * module parameter is set to 0.
	 */
	con->adev = adev;

	if (!adev->ras_enabled)
		return 0;

	data = &con->eh_data;
	*data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
	if (!*data) {
		ret = -ENOMEM;
		goto out;
	}

	mutex_init(&con->recovery_lock);
	INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
	atomic_set(&con->in_recovery, 0);
	con->eeprom_control.bad_channel_bitmap = 0;

	max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(&con->eeprom_control);
	amdgpu_ras_validate_threshold(adev, max_eeprom_records_count);

	/* Todo: During test the SMU might fail to read the eeprom through I2C
	 * when the GPU is pending on XGMI reset during probe time
	 * (Mostly after second bus reset), skip it now
	 */
	if (adev->gmc.xgmi.pending_reset)
		return 0;
	ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
	/*
	 * This calling fails when exc_err_limit is true or
	 * ret != 0.
	 */
	if (exc_err_limit || ret)
		goto free;

	if (con->eeprom_control.ras_num_recs) {
		ret = amdgpu_ras_load_bad_pages(adev);
		if (ret)
			goto free;

		/* tell the SMU how many pages are already retired */
		amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);

		if (con->update_channel_flag == true) {
			amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
			con->update_channel_flag = false;
		}
	}

#ifdef CONFIG_X86_MCE_AMD
	if ((adev->asic_type == CHIP_ALDEBARAN) &&
	    (adev->gmc.xgmi.connected_to_cpu))
		amdgpu_register_bad_pages_mca_notifier(adev);
#endif
	return 0;

free:
	kfree((*data)->bps);
	kfree(*data);
	con->eh_data = NULL;
out:
	dev_warn(adev->dev, "Failed to initialize ras recovery! (%d)\n", ret);

	/*
	 * Except error threshold exceeding case, other failure cases in this
	 * function would not fail amdgpu driver init.
	 */
	if (!exc_err_limit)
		ret = 0;
	else
		ret = -EINVAL;

	return ret;
}
2387c030f2e4Sxinhui pan 
amdgpu_ras_recovery_fini(struct amdgpu_device * adev)2388c030f2e4Sxinhui pan static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
2389c030f2e4Sxinhui pan {
2390c030f2e4Sxinhui pan 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2391c030f2e4Sxinhui pan 	struct ras_err_handler_data *data = con->eh_data;
2392c030f2e4Sxinhui pan 
23931a6fc071STao Zhou 	/* recovery_init failed to init it, fini is useless */
23941a6fc071STao Zhou 	if (!data)
23951a6fc071STao Zhou 		return 0;
23961a6fc071STao Zhou 
2397c030f2e4Sxinhui pan 	cancel_work_sync(&con->recovery_work);
2398c030f2e4Sxinhui pan 
2399c030f2e4Sxinhui pan 	mutex_lock(&con->recovery_lock);
2400c030f2e4Sxinhui pan 	con->eh_data = NULL;
2401c030f2e4Sxinhui pan 	kfree(data->bps);
2402c030f2e4Sxinhui pan 	kfree(data);
2403c030f2e4Sxinhui pan 	mutex_unlock(&con->recovery_lock);
2404c030f2e4Sxinhui pan 
2405c030f2e4Sxinhui pan 	return 0;
2406c030f2e4Sxinhui pan }
2407c030f2e4Sxinhui pan /* recovery end */
2408c030f2e4Sxinhui pan 
amdgpu_ras_asic_supported(struct amdgpu_device * adev)2409084e2640SLuben Tuikov static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)
24105436ab94SStanley.Yang {
241182835055SYiPeng Chai 	if (amdgpu_sriov_vf(adev)) {
241282835055SYiPeng Chai 		switch (adev->ip_versions[MP0_HWIP][0]) {
241382835055SYiPeng Chai 		case IP_VERSION(13, 0, 2):
2414e81c4556SYiPeng Chai 		case IP_VERSION(13, 0, 6):
241582835055SYiPeng Chai 			return true;
241682835055SYiPeng Chai 		default:
241782835055SYiPeng Chai 			return false;
241882835055SYiPeng Chai 		}
241982835055SYiPeng Chai 	}
242082835055SYiPeng Chai 
2421073285efSYiPeng Chai 	if (adev->asic_type == CHIP_IP_DISCOVERY) {
2422073285efSYiPeng Chai 		switch (adev->ip_versions[MP0_HWIP][0]) {
2423073285efSYiPeng Chai 		case IP_VERSION(13, 0, 0):
2424cb906ce3SStanley.Yang 		case IP_VERSION(13, 0, 6):
2425073285efSYiPeng Chai 		case IP_VERSION(13, 0, 10):
2426073285efSYiPeng Chai 			return true;
2427073285efSYiPeng Chai 		default:
2428073285efSYiPeng Chai 			return false;
2429073285efSYiPeng Chai 		}
2430073285efSYiPeng Chai 	}
2431073285efSYiPeng Chai 
2432084e2640SLuben Tuikov 	return adev->asic_type == CHIP_VEGA10 ||
2433084e2640SLuben Tuikov 		adev->asic_type == CHIP_VEGA20 ||
2434084e2640SLuben Tuikov 		adev->asic_type == CHIP_ARCTURUS ||
243575f06251SHawking Zhang 		adev->asic_type == CHIP_ALDEBARAN ||
2436084e2640SLuben Tuikov 		adev->asic_type == CHIP_SIENNA_CICHLID;
24375436ab94SStanley.Yang }
24385436ab94SStanley.Yang 
24395caf466aSxinhui pan /*
2440f50160cfSStanley.Yang  * this is workaround for vega20 workstation sku,
2441f50160cfSStanley.Yang  * force enable gfx ras, ignore vbios gfx ras flag
2442f50160cfSStanley.Yang  * due to GC EDC can not write
2443f50160cfSStanley.Yang  */
amdgpu_ras_get_quirks(struct amdgpu_device * adev)2444e509965eSLuben Tuikov static void amdgpu_ras_get_quirks(struct amdgpu_device *adev)
2445f50160cfSStanley.Yang {
2446f50160cfSStanley.Yang 	struct atom_context *ctx = adev->mode_info.atom_context;
2447f50160cfSStanley.Yang 
2448f50160cfSStanley.Yang 	if (!ctx)
2449f50160cfSStanley.Yang 		return;
2450f50160cfSStanley.Yang 
2451adf64e21SMario Limonciello 	if (strnstr(ctx->vbios_pn, "D16406",
2452adf64e21SMario Limonciello 		    sizeof(ctx->vbios_pn)) ||
2453adf64e21SMario Limonciello 		strnstr(ctx->vbios_pn, "D36002",
2454adf64e21SMario Limonciello 			sizeof(ctx->vbios_pn)))
24558ab0d6f0SLuben Tuikov 		adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX);
2456f50160cfSStanley.Yang }
2457f50160cfSStanley.Yang 
2458f50160cfSStanley.Yang /*
24595caf466aSxinhui pan  * check hardware's ras ability which will be saved in hw_supported.
24605caf466aSxinhui pan  * if hardware does not support ras, we can skip some ras initializtion and
24615caf466aSxinhui pan  * forbid some ras operations from IP.
24625caf466aSxinhui pan  * if software itself, say boot parameter, limit the ras ability. We still
24635caf466aSxinhui pan  * need allow IP do some limited operations, like disable. In such case,
24645caf466aSxinhui pan  * we have to initialize ras as normal. but need check if operation is
24655caf466aSxinhui pan  * allowed or not in each function.
24665caf466aSxinhui pan  */
static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
{
	/* Start from a clean slate; bits are OR-ed in below. */
	adev->ras_hw_enabled = adev->ras_enabled = 0;

	if (!amdgpu_ras_asic_supported(adev))
		return;

	/* dGPU path: RAS capability is derived from vbios ECC flags. */
	if (!adev->gmc.xgmi.connected_to_cpu &&	!adev->gmc.is_app_apu) {
		if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
			dev_info(adev->dev, "MEM ECC is active.\n");
			adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC |
						   1 << AMDGPU_RAS_BLOCK__DF);
		} else {
			dev_info(adev->dev, "MEM ECC is not presented.\n");
		}

		if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
			dev_info(adev->dev, "SRAM ECC is active.\n");
			/* bare metal: enable everything except UMC/DF
			 * (those are governed by the MEM ECC flag above);
			 * SRIOV: only a fixed subset of blocks.
			 */
			if (!amdgpu_sriov_vf(adev))
				adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
							    1 << AMDGPU_RAS_BLOCK__DF);
			else
				adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF |
								1 << AMDGPU_RAS_BLOCK__SDMA |
								1 << AMDGPU_RAS_BLOCK__GFX);

			/* VCN/JPEG RAS can be supported on both bare metal and
			 * SRIOV environment
			 */
			if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, 6, 0) ||
			    adev->ip_versions[VCN_HWIP][0] == IP_VERSION(4, 0, 0))
				adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
							1 << AMDGPU_RAS_BLOCK__JPEG);
			else
				adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN |
							1 << AMDGPU_RAS_BLOCK__JPEG);

			/*
			 * XGMI RAS is not supported if xgmi num physical nodes
			 * is zero
			 */
			if (!adev->gmc.xgmi.num_physical_nodes)
				adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__XGMI_WAFL);
		} else {
			dev_info(adev->dev, "SRAM ECC is not presented.\n");
		}
	} else {
		/* driver only manages a few IP blocks RAS feature
		 * when GPU is connected cpu through XGMI */
		adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX |
					   1 << AMDGPU_RAS_BLOCK__SDMA |
					   1 << AMDGPU_RAS_BLOCK__MMHUB);
	}

	/* Apply per-SKU overrides (e.g. force GFX RAS on some vbios). */
	amdgpu_ras_get_quirks(adev);

	/* hw_supported needs to be aligned with RAS block mask. */
	adev->ras_hw_enabled &= AMDGPU_RAS_BLOCK_MASK;


	/*
	 * Disable ras feature for aqua vanjaram
	 * by default on apu platform.
	 */
	if (adev->ip_versions[MP0_HWIP][0] == IP_VERSION(13, 0, 6) &&
	    adev->gmc.is_app_apu)
		adev->ras_enabled = amdgpu_ras_enable != 1 ? 0 :
			adev->ras_hw_enabled & amdgpu_ras_mask;
	else
		adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 :
			adev->ras_hw_enabled & amdgpu_ras_mask;
}
2539c030f2e4Sxinhui pan 
/* Delayed work item: refresh the cached correctable/uncorrectable
 * error counters while holding a runtime-PM reference.
 */
static void amdgpu_ras_counte_dw(struct work_struct *work)
{
	struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,
					      ras_counte_delay_work.work);
	struct amdgpu_device *adev = con->adev;
	struct drm_device *dev = adev_to_drm(adev);
	unsigned long ce_count, ue_count;
	int res;

	/* pm_runtime_get_sync() bumps the usage count even on failure,
	 * so the put below must run on every path.
	 */
	res = pm_runtime_get_sync(dev->dev);
	if (res < 0)
		goto Out;

	/* Cache new values.
	 */
	if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, NULL) == 0) {
		atomic_set(&con->ras_ce_count, ce_count);
		atomic_set(&con->ras_ue_count, ue_count);
	}

	pm_runtime_mark_last_busy(dev->dev);
Out:
	pm_runtime_put_autosuspend(dev->dev);
}
256405adfd80SLuben Tuikov 
amdgpu_ras_query_poison_mode(struct amdgpu_device * adev)25652dd9032bSTao Zhou static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev)
25662dd9032bSTao Zhou {
25672dd9032bSTao Zhou 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
25682dd9032bSTao Zhou 	bool df_poison, umc_poison;
25692dd9032bSTao Zhou 
25702dd9032bSTao Zhou 	/* poison setting is useless on SRIOV guest */
25712dd9032bSTao Zhou 	if (amdgpu_sriov_vf(adev) || !con)
25722dd9032bSTao Zhou 		return;
25732dd9032bSTao Zhou 
25742dd9032bSTao Zhou 	/* Init poison supported flag, the default value is false */
25752dd9032bSTao Zhou 	if (adev->gmc.xgmi.connected_to_cpu) {
25762dd9032bSTao Zhou 		/* enabled by default when GPU is connected to CPU */
25772dd9032bSTao Zhou 		con->poison_supported = true;
25782dd9032bSTao Zhou 	} else if (adev->df.funcs &&
25792dd9032bSTao Zhou 	    adev->df.funcs->query_ras_poison_mode &&
25802dd9032bSTao Zhou 	    adev->umc.ras &&
25812dd9032bSTao Zhou 	    adev->umc.ras->query_ras_poison_mode) {
25822dd9032bSTao Zhou 		df_poison =
25832dd9032bSTao Zhou 			adev->df.funcs->query_ras_poison_mode(adev);
25842dd9032bSTao Zhou 		umc_poison =
25852dd9032bSTao Zhou 			adev->umc.ras->query_ras_poison_mode(adev);
25862dd9032bSTao Zhou 
25872dd9032bSTao Zhou 		/* Only poison is set in both DF and UMC, we can support it */
25882dd9032bSTao Zhou 		if (df_poison && umc_poison)
25892dd9032bSTao Zhou 			con->poison_supported = true;
25902dd9032bSTao Zhou 		else if (df_poison != umc_poison)
25912dd9032bSTao Zhou 			dev_warn(adev->dev,
25922dd9032bSTao Zhou 				"Poison setting is inconsistent in DF/UMC(%d:%d)!\n",
25932dd9032bSTao Zhou 				df_poison, umc_poison);
25942dd9032bSTao Zhou 	}
25952dd9032bSTao Zhou }
25962dd9032bSTao Zhou 
amdgpu_ras_init(struct amdgpu_device * adev)2597c030f2e4Sxinhui pan int amdgpu_ras_init(struct amdgpu_device *adev)
2598c030f2e4Sxinhui pan {
2599c030f2e4Sxinhui pan 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2600c030f2e4Sxinhui pan 	int r;
2601c030f2e4Sxinhui pan 
2602c030f2e4Sxinhui pan 	if (con)
2603c030f2e4Sxinhui pan 		return 0;
2604c030f2e4Sxinhui pan 
2605c030f2e4Sxinhui pan 	con = kmalloc(sizeof(struct amdgpu_ras) +
2606640ae42eSJohn Clements 			sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT +
2607640ae42eSJohn Clements 			sizeof(struct ras_manager) * AMDGPU_RAS_MCA_BLOCK_COUNT,
2608c030f2e4Sxinhui pan 			GFP_KERNEL|__GFP_ZERO);
2609c030f2e4Sxinhui pan 	if (!con)
2610c030f2e4Sxinhui pan 		return -ENOMEM;
2611c030f2e4Sxinhui pan 
261205adfd80SLuben Tuikov 	con->adev = adev;
261305adfd80SLuben Tuikov 	INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw);
261405adfd80SLuben Tuikov 	atomic_set(&con->ras_ce_count, 0);
261505adfd80SLuben Tuikov 	atomic_set(&con->ras_ue_count, 0);
261605adfd80SLuben Tuikov 
2617c030f2e4Sxinhui pan 	con->objs = (struct ras_manager *)(con + 1);
2618c030f2e4Sxinhui pan 
2619c030f2e4Sxinhui pan 	amdgpu_ras_set_context(adev, con);
2620c030f2e4Sxinhui pan 
2621e509965eSLuben Tuikov 	amdgpu_ras_check_supported(adev);
2622e509965eSLuben Tuikov 
26237ddd9770SOak Zeng 	if (!adev->ras_enabled || adev->asic_type == CHIP_VEGA10) {
2624970fd197SStanley.Yang 		/* set gfx block ras context feature for VEGA20 Gaming
2625970fd197SStanley.Yang 		 * send ras disable cmd to ras ta during ras late init.
2626970fd197SStanley.Yang 		 */
26278ab0d6f0SLuben Tuikov 		if (!adev->ras_enabled && adev->asic_type == CHIP_VEGA20) {
2628970fd197SStanley.Yang 			con->features |= BIT(AMDGPU_RAS_BLOCK__GFX);
2629970fd197SStanley.Yang 
2630970fd197SStanley.Yang 			return 0;
2631970fd197SStanley.Yang 		}
2632970fd197SStanley.Yang 
26335e91160aSGuchun Chen 		r = 0;
26345436ab94SStanley.Yang 		goto release_con;
2635fb2a3607SHawking Zhang 	}
2636fb2a3607SHawking Zhang 
263769691c82SStanley.Yang 	con->update_channel_flag = false;
2638c030f2e4Sxinhui pan 	con->features = 0;
2639c030f2e4Sxinhui pan 	INIT_LIST_HEAD(&con->head);
2640108c6a63Sxinhui pan 	/* Might need get this flag from vbios. */
2641108c6a63Sxinhui pan 	con->flags = RAS_DEFAULT_FLAGS;
2642c030f2e4Sxinhui pan 
26436e36f231SHawking Zhang 	/* initialize nbio ras function ahead of any other
26446e36f231SHawking Zhang 	 * ras functions so hardware fatal error interrupt
26456e36f231SHawking Zhang 	 * can be enabled as early as possible */
2646fdc94d3aSHawking Zhang 	switch (adev->ip_versions[NBIO_HWIP][0]) {
2647fdc94d3aSHawking Zhang 	case IP_VERSION(7, 4, 0):
2648fdc94d3aSHawking Zhang 	case IP_VERSION(7, 4, 1):
2649fdc94d3aSHawking Zhang 	case IP_VERSION(7, 4, 4):
2650fdc94d3aSHawking Zhang 		if (!adev->gmc.xgmi.connected_to_cpu)
26512e54fe5dSyipechai 			adev->nbio.ras = &nbio_v7_4_ras;
26526e36f231SHawking Zhang 		break;
26539af357bcSHawking Zhang 	case IP_VERSION(4, 3, 0):
26549af357bcSHawking Zhang 		if (adev->ras_hw_enabled & (1 << AMDGPU_RAS_BLOCK__DF))
26559af357bcSHawking Zhang 			/* unlike other generation of nbio ras,
26569af357bcSHawking Zhang 			 * nbio v4_3 only support fatal error interrupt
26579af357bcSHawking Zhang 			 * to inform software that DF is freezed due to
26589af357bcSHawking Zhang 			 * system fatal error event. driver should not
26599af357bcSHawking Zhang 			 * enable nbio ras in such case. Instead,
26609af357bcSHawking Zhang 			 * check DF RAS */
26619af357bcSHawking Zhang 			adev->nbio.ras = &nbio_v4_3_ras;
26629af357bcSHawking Zhang 		break;
26637692e1eeSTao Zhou 	case IP_VERSION(7, 9, 0):
26647692e1eeSTao Zhou 		if (!adev->gmc.is_app_apu)
26657692e1eeSTao Zhou 			adev->nbio.ras = &nbio_v7_9_ras;
26667692e1eeSTao Zhou 		break;
26676e36f231SHawking Zhang 	default:
26686e36f231SHawking Zhang 		/* nbio ras is not available */
26696e36f231SHawking Zhang 		break;
26706e36f231SHawking Zhang 	}
26716e36f231SHawking Zhang 
2672fdc94d3aSHawking Zhang 	/* nbio ras block needs to be enabled ahead of other ras blocks
2673fdc94d3aSHawking Zhang 	 * to handle fatal error */
2674fdc94d3aSHawking Zhang 	r = amdgpu_nbio_ras_sw_init(adev);
2675fdc94d3aSHawking Zhang 	if (r)
2676fdc94d3aSHawking Zhang 		return r;
2677fdc94d3aSHawking Zhang 
26782e54fe5dSyipechai 	if (adev->nbio.ras &&
26792e54fe5dSyipechai 	    adev->nbio.ras->init_ras_controller_interrupt) {
26802e54fe5dSyipechai 		r = adev->nbio.ras->init_ras_controller_interrupt(adev);
26814e644fffSHawking Zhang 		if (r)
26825436ab94SStanley.Yang 			goto release_con;
26834e644fffSHawking Zhang 	}
26844e644fffSHawking Zhang 
26852e54fe5dSyipechai 	if (adev->nbio.ras &&
26862e54fe5dSyipechai 	    adev->nbio.ras->init_ras_err_event_athub_interrupt) {
26872e54fe5dSyipechai 		r = adev->nbio.ras->init_ras_err_event_athub_interrupt(adev);
26884e644fffSHawking Zhang 		if (r)
26895436ab94SStanley.Yang 			goto release_con;
26904e644fffSHawking Zhang 	}
26914e644fffSHawking Zhang 
26922dd9032bSTao Zhou 	amdgpu_ras_query_poison_mode(adev);
2693e4348849STao Zhou 
26945e91160aSGuchun Chen 	if (amdgpu_ras_fs_init(adev)) {
26955e91160aSGuchun Chen 		r = -EINVAL;
26965436ab94SStanley.Yang 		goto release_con;
26975e91160aSGuchun Chen 	}
2698c030f2e4Sxinhui pan 
26996952e99cSGuchun Chen 	dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
27005d0f903fSxinhui pan 		 "hardware ability[%x] ras_mask[%x]\n",
27018ab0d6f0SLuben Tuikov 		 adev->ras_hw_enabled, adev->ras_enabled);
2702e509965eSLuben Tuikov 
2703c030f2e4Sxinhui pan 	return 0;
27045436ab94SStanley.Yang release_con:
2705c030f2e4Sxinhui pan 	amdgpu_ras_set_context(adev, NULL);
2706c030f2e4Sxinhui pan 	kfree(con);
2707c030f2e4Sxinhui pan 
27085e91160aSGuchun Chen 	return r;
2709c030f2e4Sxinhui pan }
2710c030f2e4Sxinhui pan 
amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device * adev)27118f6368a9SJohn Clements int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev)
2712134d16d5SJohn Clements {
27138107e499SHawking Zhang 	if (adev->gmc.xgmi.connected_to_cpu ||
27148107e499SHawking Zhang 	    adev->gmc.is_app_apu)
2715134d16d5SJohn Clements 		return 1;
2716134d16d5SJohn Clements 	return 0;
2717134d16d5SJohn Clements }
2718134d16d5SJohn Clements 
amdgpu_persistent_edc_harvesting(struct amdgpu_device * adev,struct ras_common_if * ras_block)2719134d16d5SJohn Clements static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev,
2720134d16d5SJohn Clements 					struct ras_common_if *ras_block)
2721134d16d5SJohn Clements {
2722134d16d5SJohn Clements 	struct ras_query_if info = {
2723134d16d5SJohn Clements 		.head = *ras_block,
2724134d16d5SJohn Clements 	};
2725134d16d5SJohn Clements 
2726134d16d5SJohn Clements 	if (!amdgpu_persistent_edc_harvesting_supported(adev))
2727134d16d5SJohn Clements 		return 0;
2728134d16d5SJohn Clements 
2729134d16d5SJohn Clements 	if (amdgpu_ras_query_error_status(adev, &info) != 0)
2730134d16d5SJohn Clements 		DRM_WARN("RAS init harvest failure");
2731134d16d5SJohn Clements 
2732134d16d5SJohn Clements 	if (amdgpu_ras_reset_error_status(adev, ras_block->block) != 0)
2733134d16d5SJohn Clements 		DRM_WARN("RAS init harvest reset failure");
2734134d16d5SJohn Clements 
2735134d16d5SJohn Clements 	return 0;
2736134d16d5SJohn Clements }
2737134d16d5SJohn Clements 
amdgpu_ras_is_poison_mode_supported(struct amdgpu_device * adev)2738e4348849STao Zhou bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev)
2739e4348849STao Zhou {
2740e4348849STao Zhou        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2741e4348849STao Zhou 
2742e4348849STao Zhou        if (!con)
2743e4348849STao Zhou                return false;
2744e4348849STao Zhou 
2745e4348849STao Zhou        return con->poison_supported;
2746e4348849STao Zhou }
2747e4348849STao Zhou 
2748b293e891SHawking Zhang /* helper function to handle common stuff in ip late init phase */
amdgpu_ras_block_late_init(struct amdgpu_device * adev,struct ras_common_if * ras_block)2749563285c8Syipechai int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
2750563285c8Syipechai 			 struct ras_common_if *ras_block)
2751b293e891SHawking Zhang {
275229c9b6cdSyipechai 	struct amdgpu_ras_block_object *ras_obj = NULL;
275305adfd80SLuben Tuikov 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
27544a1c9a44SHawking Zhang 	struct ras_query_if *query_info;
275505adfd80SLuben Tuikov 	unsigned long ue_count, ce_count;
2756b293e891SHawking Zhang 	int r;
2757b293e891SHawking Zhang 
2758b293e891SHawking Zhang 	/* disable RAS feature per IP block if it is not supported */
2759b293e891SHawking Zhang 	if (!amdgpu_ras_is_supported(adev, ras_block->block)) {
2760b293e891SHawking Zhang 		amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
2761b293e891SHawking Zhang 		return 0;
2762b293e891SHawking Zhang 	}
2763b293e891SHawking Zhang 
2764b293e891SHawking Zhang 	r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1);
2765b293e891SHawking Zhang 	if (r) {
27669080a18fSCandice Li 		if (adev->in_suspend || amdgpu_in_reset(adev)) {
2767b293e891SHawking Zhang 			/* in resume phase, if fail to enable ras,
2768b293e891SHawking Zhang 			 * clean up all ras fs nodes, and disable ras */
2769b293e891SHawking Zhang 			goto cleanup;
2770b293e891SHawking Zhang 		} else
2771b293e891SHawking Zhang 			return r;
2772b293e891SHawking Zhang 	}
2773b293e891SHawking Zhang 
2774134d16d5SJohn Clements 	/* check for errors on warm reset edc persisant supported ASIC */
2775134d16d5SJohn Clements 	amdgpu_persistent_edc_harvesting(adev, ras_block);
2776134d16d5SJohn Clements 
2777b293e891SHawking Zhang 	/* in resume phase, no need to create ras fs node */
277853b3f8f4SDennis Li 	if (adev->in_suspend || amdgpu_in_reset(adev))
2779b293e891SHawking Zhang 		return 0;
2780b293e891SHawking Zhang 
2781563285c8Syipechai 	ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);
278236780606STao Zhou 	if (ras_obj->ras_cb || (ras_obj->hw_ops &&
278336780606STao Zhou 	    (ras_obj->hw_ops->query_poison_status ||
278436780606STao Zhou 	    ras_obj->hw_ops->handle_poison_consumption))) {
27859252d33dSyipechai 		r = amdgpu_ras_interrupt_add_handler(adev, ras_block);
2786b293e891SHawking Zhang 		if (r)
2787779596ceSTom Rix 			goto cleanup;
2788b293e891SHawking Zhang 	}
2789b293e891SHawking Zhang 
2790f957138cSHawking Zhang 	if (ras_obj->hw_ops &&
2791f957138cSHawking Zhang 	    (ras_obj->hw_ops->query_ras_error_count ||
2792f957138cSHawking Zhang 	     ras_obj->hw_ops->query_ras_error_status)) {
27939252d33dSyipechai 		r = amdgpu_ras_sysfs_create(adev, ras_block);
2794b293e891SHawking Zhang 		if (r)
2795779596ceSTom Rix 			goto interrupt;
2796b293e891SHawking Zhang 
279705adfd80SLuben Tuikov 		/* Those are the cached values at init.
279805adfd80SLuben Tuikov 		 */
2799f957138cSHawking Zhang 		query_info = kzalloc(sizeof(*query_info), GFP_KERNEL);
28004a1c9a44SHawking Zhang 		if (!query_info)
28014a1c9a44SHawking Zhang 			return -ENOMEM;
28024a1c9a44SHawking Zhang 		memcpy(&query_info->head, ras_block, sizeof(struct ras_common_if));
28034a1c9a44SHawking Zhang 
28044a1c9a44SHawking Zhang 		if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, query_info) == 0) {
280505adfd80SLuben Tuikov 			atomic_set(&con->ras_ce_count, ce_count);
280605adfd80SLuben Tuikov 			atomic_set(&con->ras_ue_count, ue_count);
28074d9f771eSLuben Tuikov 		}
280805adfd80SLuben Tuikov 
28094a1c9a44SHawking Zhang 		kfree(query_info);
2810f957138cSHawking Zhang 	}
2811f957138cSHawking Zhang 
2812b293e891SHawking Zhang 	return 0;
2813779596ceSTom Rix 
2814779596ceSTom Rix interrupt:
2815563285c8Syipechai 	if (ras_obj->ras_cb)
28169252d33dSyipechai 		amdgpu_ras_interrupt_remove_handler(adev, ras_block);
2817779596ceSTom Rix cleanup:
2818b293e891SHawking Zhang 	amdgpu_ras_feature_enable(adev, ras_block, 0);
2819b293e891SHawking Zhang 	return r;
2820b293e891SHawking Zhang }
2821b293e891SHawking Zhang 
/* Default late-init hook used for RAS blocks that do not supply their
 * own ras_late_init callback.
 */
static int amdgpu_ras_block_late_init_default(struct amdgpu_device *adev,
			 struct ras_common_if *ras_block)
{
	return amdgpu_ras_block_late_init(adev, ras_block);
}
2827418abce2Syipechai 
2828b293e891SHawking Zhang /* helper function to remove ras fs node and interrupt handler */
amdgpu_ras_block_late_fini(struct amdgpu_device * adev,struct ras_common_if * ras_block)2829bdb3489cSyipechai void amdgpu_ras_block_late_fini(struct amdgpu_device *adev,
2830bdb3489cSyipechai 			  struct ras_common_if *ras_block)
2831bdb3489cSyipechai {
2832563285c8Syipechai 	struct amdgpu_ras_block_object *ras_obj;
2833bdb3489cSyipechai 	if (!ras_block)
2834bdb3489cSyipechai 		return;
2835bdb3489cSyipechai 
2836563285c8Syipechai 	amdgpu_ras_sysfs_remove(adev, ras_block);
2837bdb3489cSyipechai 
2838563285c8Syipechai 	ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);
2839563285c8Syipechai 	if (ras_obj->ras_cb)
2840563285c8Syipechai 		amdgpu_ras_interrupt_remove_handler(adev, ras_block);
2841bdb3489cSyipechai }
2842bdb3489cSyipechai 
/* Default late-fini hook for RAS blocks without their own ras_fini. */
static void amdgpu_ras_block_late_fini_default(struct amdgpu_device *adev,
			  struct ras_common_if *ras_block)
{
	/* "return <void expr>;" is not valid ISO C (GNU extension only);
	 * call the helper plainly.
	 */
	amdgpu_ras_block_late_fini(adev, ras_block);
}
284880e0c2cbSyipechai 
2849a564808eSxinhui pan /* do some init work after IP late init as dependence.
2850511fdbc3Sxinhui pan  * and it runs in resume/gpu reset/booting up cases.
2851a564808eSxinhui pan  */
void amdgpu_ras_resume(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	if (!adev->ras_enabled || !con) {
		/* clean ras context for VEGA20 Gaming after send ras disable cmd */
		amdgpu_release_ras_context(adev);

		return;
	}

	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
		/* Set up all other IPs which are not implemented. There is a
		 * tricky thing that IP's actual ras error type should be
		 * MULTI_UNCORRECTABLE, but as driver does not handle it, so
		 * ERROR_NONE make sense anyway.
		 */
		amdgpu_ras_enable_all_features(adev, 1);

		/* We enable ras on all hw_supported block, but as boot
		 * parameter might disable some of them and one or more IP has
		 * not implemented yet. So we disable them on behalf.
		 */
		/* _safe variant: amdgpu_ras_feature_enable() may drop the
		 * object from con->head while we iterate.
		 */
		list_for_each_entry_safe(obj, tmp, &con->head, node) {
			if (!amdgpu_ras_is_supported(adev, obj->head.block)) {
				amdgpu_ras_feature_enable(adev, &obj->head, 0);
				/* there should be no any reference. */
				WARN_ON(alive_obj(obj));
			}
		}
	}
}
2885108c6a63Sxinhui pan 
amdgpu_ras_suspend(struct amdgpu_device * adev)2886511fdbc3Sxinhui pan void amdgpu_ras_suspend(struct amdgpu_device *adev)
2887511fdbc3Sxinhui pan {
2888511fdbc3Sxinhui pan 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2889511fdbc3Sxinhui pan 
28908ab0d6f0SLuben Tuikov 	if (!adev->ras_enabled || !con)
2891511fdbc3Sxinhui pan 		return;
2892511fdbc3Sxinhui pan 
2893511fdbc3Sxinhui pan 	amdgpu_ras_disable_all_features(adev, 0);
2894511fdbc3Sxinhui pan 	/* Make sure all ras objects are disabled. */
2895511fdbc3Sxinhui pan 	if (con->features)
2896511fdbc3Sxinhui pan 		amdgpu_ras_disable_all_features(adev, 1);
2897511fdbc3Sxinhui pan }
2898511fdbc3Sxinhui pan 
amdgpu_ras_late_init(struct amdgpu_device * adev)2899867e24caSyipechai int amdgpu_ras_late_init(struct amdgpu_device *adev)
2900867e24caSyipechai {
2901867e24caSyipechai 	struct amdgpu_ras_block_list *node, *tmp;
2902867e24caSyipechai 	struct amdgpu_ras_block_object *obj;
2903867e24caSyipechai 	int r;
2904867e24caSyipechai 
2905950d6425SStanley.Yang 	/* Guest side doesn't need init ras feature */
2906950d6425SStanley.Yang 	if (amdgpu_sriov_vf(adev))
2907950d6425SStanley.Yang 		return 0;
2908950d6425SStanley.Yang 
2909867e24caSyipechai 	list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
2910867e24caSyipechai 		if (!node->ras_obj) {
2911867e24caSyipechai 			dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
2912867e24caSyipechai 			continue;
2913867e24caSyipechai 		}
2914418abce2Syipechai 
2915867e24caSyipechai 		obj = node->ras_obj;
2916867e24caSyipechai 		if (obj->ras_late_init) {
2917867e24caSyipechai 			r = obj->ras_late_init(adev, &obj->ras_comm);
2918867e24caSyipechai 			if (r) {
2919867e24caSyipechai 				dev_err(adev->dev, "%s failed to execute ras_late_init! ret:%d\n",
2920867e24caSyipechai 					obj->ras_comm.name, r);
2921867e24caSyipechai 				return r;
2922867e24caSyipechai 			}
2923418abce2Syipechai 		} else
2924418abce2Syipechai 			amdgpu_ras_block_late_init_default(adev, &obj->ras_comm);
2925867e24caSyipechai 	}
2926867e24caSyipechai 
2927867e24caSyipechai 	return 0;
2928867e24caSyipechai }
2929867e24caSyipechai 
2930c030f2e4Sxinhui pan /* do some fini work before IP fini as dependence */
amdgpu_ras_pre_fini(struct amdgpu_device * adev)2931c030f2e4Sxinhui pan int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
2932c030f2e4Sxinhui pan {
2933c030f2e4Sxinhui pan 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2934c030f2e4Sxinhui pan 
29358ab0d6f0SLuben Tuikov 	if (!adev->ras_enabled || !con)
2936c030f2e4Sxinhui pan 		return 0;
2937c030f2e4Sxinhui pan 
293872c8c97bSAndrey Grodzovsky 
2939c030f2e4Sxinhui pan 	/* Need disable ras on all IPs here before ip [hw/sw]fini */
2940642c0401SYiPeng Chai 	if (con->features)
2941c030f2e4Sxinhui pan 		amdgpu_ras_disable_all_features(adev, 0);
2942c030f2e4Sxinhui pan 	amdgpu_ras_recovery_fini(adev);
2943c030f2e4Sxinhui pan 	return 0;
2944c030f2e4Sxinhui pan }
2945c030f2e4Sxinhui pan 
int amdgpu_ras_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras_block_list *ras_node, *tmp;
	struct amdgpu_ras_block_object *obj = NULL;
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!adev->ras_enabled || !con)
		return 0;

	/* Tear down every registered RAS block, preferring the block's
	 * own ras_fini over the common default fini.
	 */
	list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) {
		if (ras_node->ras_obj) {
			obj = ras_node->ras_obj;
			if (amdgpu_ras_is_supported(adev, obj->ras_comm.block) &&
			    obj->ras_fini)
				obj->ras_fini(adev, &obj->ras_comm);
			else
				amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm);
		}

		/* Clear ras blocks from ras_list and free ras block list node */
		list_del(&ras_node->node);
		kfree(ras_node);
	}

	amdgpu_ras_fs_fini(adev);
	amdgpu_ras_interrupt_remove_all(adev);

	/* All features should already be off by now; warn, then force
	 * them off as a last resort before freeing the context.
	 */
	WARN(con->features, "Feature mask is not cleared");

	if (con->features)
		amdgpu_ras_disable_all_features(adev, 1);

	/* Stop the counter-refresh work before the context goes away. */
	cancel_delayed_work_sync(&con->ras_counte_delay_work);

	amdgpu_ras_set_context(adev, NULL);
	kfree(con);

	return 0;
}
29857c6e68c7SAndrey Grodzovsky 
amdgpu_ras_global_ras_isr(struct amdgpu_device * adev)29867c6e68c7SAndrey Grodzovsky void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
29877c6e68c7SAndrey Grodzovsky {
29887c6e68c7SAndrey Grodzovsky 	if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
29892c7cd280SYiPeng Chai 		struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
29902c7cd280SYiPeng Chai 
29916952e99cSGuchun Chen 		dev_info(adev->dev, "uncorrectable hardware error"
29926952e99cSGuchun Chen 			"(ERREVENT_ATHUB_INTERRUPT) detected!\n");
2993d5ea093eSAndrey Grodzovsky 
29942c7cd280SYiPeng Chai 		ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
299561934624SGuchun Chen 		amdgpu_ras_reset_gpu(adev);
29967c6e68c7SAndrey Grodzovsky 	}
29977c6e68c7SAndrey Grodzovsky }
2998bb5c7235SWenhui Sheng 
amdgpu_ras_need_emergency_restart(struct amdgpu_device * adev)2999bb5c7235SWenhui Sheng bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev)
3000bb5c7235SWenhui Sheng {
3001bb5c7235SWenhui Sheng 	if (adev->asic_type == CHIP_VEGA20 &&
3002bb5c7235SWenhui Sheng 	    adev->pm.fw_version <= 0x283400) {
3003bb5c7235SWenhui Sheng 		return !(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) &&
3004bb5c7235SWenhui Sheng 				amdgpu_ras_intr_triggered();
3005bb5c7235SWenhui Sheng 	}
3006bb5c7235SWenhui Sheng 
3007bb5c7235SWenhui Sheng 	return false;
3008bb5c7235SWenhui Sheng }
3009970fd197SStanley.Yang 
amdgpu_release_ras_context(struct amdgpu_device * adev)3010970fd197SStanley.Yang void amdgpu_release_ras_context(struct amdgpu_device *adev)
3011970fd197SStanley.Yang {
3012970fd197SStanley.Yang 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
3013970fd197SStanley.Yang 
3014970fd197SStanley.Yang 	if (!con)
3015970fd197SStanley.Yang 		return;
3016970fd197SStanley.Yang 
30178ab0d6f0SLuben Tuikov 	if (!adev->ras_enabled && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)) {
3018970fd197SStanley.Yang 		con->features &= ~BIT(AMDGPU_RAS_BLOCK__GFX);
3019970fd197SStanley.Yang 		amdgpu_ras_set_context(adev, NULL);
3020970fd197SStanley.Yang 		kfree(con);
3021970fd197SStanley.Yang 	}
3022970fd197SStanley.Yang }
302312b2cab7SMukul Joshi 
302412b2cab7SMukul Joshi #ifdef CONFIG_X86_MCE_AMD
/*
 * Look up the amdgpu device whose XGMI physical node id matches
 * @node_id among the CPU-connected GPUs tracked in mce_adev_list.
 * Returns NULL when no such device is registered.
 */
static struct amdgpu_device *find_adev(uint32_t node_id)
{
	int i;

	for (i = 0; i < mce_adev_list.num_gpu; i++) {
		struct amdgpu_device *candidate = mce_adev_list.devs[i];

		if (candidate && candidate->gmc.xgmi.connected_to_cpu &&
		    candidate->gmc.xgmi.physical_node_id == node_id)
			return candidate;
	}

	return NULL;
}
304112b2cab7SMukul Joshi 
/* Field extractors for the MCA_IPID_UMC register value (@m):
 * GPU id in bits [47:44], UMC instance in bits [23:21], and the channel
 * index assembled from bits [13:12] plus bit 20 mapped into bit 2.
 */
#define GET_MCA_IPID_GPUID(m)	(((m) >> 44) & 0xF)
#define GET_UMC_INST(m)		(((m) >> 21) & 0x7)
#define GET_CHAN_INDEX(m)	((((m) >> 12) & 0x3) | (((m) >> 18) & 0x4))
/* GPU id stored in MCA_IPID_UMC is biased by this offset */
#define GPU_ID_OFFSET		8
304612b2cab7SMukul Joshi 
amdgpu_bad_page_notifier(struct notifier_block * nb,unsigned long val,void * data)304712b2cab7SMukul Joshi static int amdgpu_bad_page_notifier(struct notifier_block *nb,
304812b2cab7SMukul Joshi 				    unsigned long val, void *data)
304912b2cab7SMukul Joshi {
305012b2cab7SMukul Joshi 	struct mce *m = (struct mce *)data;
305112b2cab7SMukul Joshi 	struct amdgpu_device *adev = NULL;
305212b2cab7SMukul Joshi 	uint32_t gpu_id = 0;
3053cd4c99f1STao Zhou 	uint32_t umc_inst = 0, ch_inst = 0;
305412b2cab7SMukul Joshi 
305512b2cab7SMukul Joshi 	/*
305612b2cab7SMukul Joshi 	 * If the error was generated in UMC_V2, which belongs to GPU UMCs,
305712b2cab7SMukul Joshi 	 * and error occurred in DramECC (Extended error code = 0) then only
305812b2cab7SMukul Joshi 	 * process the error, else bail out.
305912b2cab7SMukul Joshi 	 */
306091f75eb4SYazen Ghannam 	if (!m || !((smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC_V2) &&
306112b2cab7SMukul Joshi 		    (XEC(m->status, 0x3f) == 0x0)))
306212b2cab7SMukul Joshi 		return NOTIFY_DONE;
306312b2cab7SMukul Joshi 
306412b2cab7SMukul Joshi 	/*
306512b2cab7SMukul Joshi 	 * If it is correctable error, return.
306612b2cab7SMukul Joshi 	 */
306712b2cab7SMukul Joshi 	if (mce_is_correctable(m))
306812b2cab7SMukul Joshi 		return NOTIFY_OK;
306912b2cab7SMukul Joshi 
307012b2cab7SMukul Joshi 	/*
307112b2cab7SMukul Joshi 	 * GPU Id is offset by GPU_ID_OFFSET in MCA_IPID_UMC register.
307212b2cab7SMukul Joshi 	 */
307312b2cab7SMukul Joshi 	gpu_id = GET_MCA_IPID_GPUID(m->ipid) - GPU_ID_OFFSET;
307412b2cab7SMukul Joshi 
307512b2cab7SMukul Joshi 	adev = find_adev(gpu_id);
307612b2cab7SMukul Joshi 	if (!adev) {
307712b2cab7SMukul Joshi 		DRM_WARN("%s: Unable to find adev for gpu_id: %d\n", __func__,
307812b2cab7SMukul Joshi 								gpu_id);
307912b2cab7SMukul Joshi 		return NOTIFY_DONE;
308012b2cab7SMukul Joshi 	}
308112b2cab7SMukul Joshi 
308212b2cab7SMukul Joshi 	/*
308312b2cab7SMukul Joshi 	 * If it is uncorrectable error, then find out UMC instance and
308412b2cab7SMukul Joshi 	 * channel index.
308512b2cab7SMukul Joshi 	 */
308612b2cab7SMukul Joshi 	umc_inst = GET_UMC_INST(m->ipid);
308712b2cab7SMukul Joshi 	ch_inst = GET_CHAN_INDEX(m->ipid);
308812b2cab7SMukul Joshi 
308912b2cab7SMukul Joshi 	dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d",
309012b2cab7SMukul Joshi 			     umc_inst, ch_inst);
309112b2cab7SMukul Joshi 
309224b82292STao Zhou 	if (!amdgpu_umc_page_retirement_mca(adev, m->addr, ch_inst, umc_inst))
309312b2cab7SMukul Joshi 		return NOTIFY_OK;
309424b82292STao Zhou 	else
309524b82292STao Zhou 		return NOTIFY_DONE;
309612b2cab7SMukul Joshi }
309712b2cab7SMukul Joshi 
/* Notifier hooked into the x86 MCE decode chain (at uncorrectable-error
 * priority) so GPU pages can be retired when the CPU reports UMC errors.
 */
static struct notifier_block amdgpu_bad_page_nb = {
	.notifier_call  = amdgpu_bad_page_notifier,
	.priority       = MCE_PRIO_UC,
};
310212b2cab7SMukul Joshi 
amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device * adev)310391a1a52dSMukul Joshi static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev)
310412b2cab7SMukul Joshi {
310512b2cab7SMukul Joshi 	/*
310691a1a52dSMukul Joshi 	 * Add the adev to the mce_adev_list.
310791a1a52dSMukul Joshi 	 * During mode2 reset, amdgpu device is temporarily
310891a1a52dSMukul Joshi 	 * removed from the mgpu_info list which can cause
310991a1a52dSMukul Joshi 	 * page retirement to fail.
311091a1a52dSMukul Joshi 	 * Use this list instead of mgpu_info to find the amdgpu
311191a1a52dSMukul Joshi 	 * device on which the UMC error was reported.
311291a1a52dSMukul Joshi 	 */
311391a1a52dSMukul Joshi 	mce_adev_list.devs[mce_adev_list.num_gpu++] = adev;
311491a1a52dSMukul Joshi 
311591a1a52dSMukul Joshi 	/*
311612b2cab7SMukul Joshi 	 * Register the x86 notifier only once
311712b2cab7SMukul Joshi 	 * with MCE subsystem.
311812b2cab7SMukul Joshi 	 */
311912b2cab7SMukul Joshi 	if (notifier_registered == false) {
312012b2cab7SMukul Joshi 		mce_register_decode_chain(&amdgpu_bad_page_nb);
312112b2cab7SMukul Joshi 		notifier_registered = true;
312212b2cab7SMukul Joshi 	}
312312b2cab7SMukul Joshi }
312412b2cab7SMukul Joshi #endif
31257cab2124Syipechai 
amdgpu_ras_get_context(struct amdgpu_device * adev)31267cab2124Syipechai struct amdgpu_ras *amdgpu_ras_get_context(struct amdgpu_device *adev)
31277cab2124Syipechai {
31287cab2124Syipechai 	if (!adev)
31297cab2124Syipechai 		return NULL;
31307cab2124Syipechai 
31317cab2124Syipechai 	return adev->psp.ras_context.ras;
31327cab2124Syipechai }
31337cab2124Syipechai 
amdgpu_ras_set_context(struct amdgpu_device * adev,struct amdgpu_ras * ras_con)31347cab2124Syipechai int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con)
31357cab2124Syipechai {
31367cab2124Syipechai 	if (!adev)
313769f91d32SYang Li 		return -EINVAL;
31387cab2124Syipechai 
31397cab2124Syipechai 	adev->psp.ras_context.ras = ras_con;
31407cab2124Syipechai 	return 0;
31417cab2124Syipechai }
31427cab2124Syipechai 
31437cab2124Syipechai /* check if ras is supported on block, say, sdma, gfx */
amdgpu_ras_is_supported(struct amdgpu_device * adev,unsigned int block)31447cab2124Syipechai int amdgpu_ras_is_supported(struct amdgpu_device *adev,
31457cab2124Syipechai 		unsigned int block)
31467cab2124Syipechai {
31478f453c51SYiPeng Chai 	int ret = 0;
31487cab2124Syipechai 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
31497cab2124Syipechai 
31507cab2124Syipechai 	if (block >= AMDGPU_RAS_BLOCK_COUNT)
31517cab2124Syipechai 		return 0;
31528f453c51SYiPeng Chai 
31538f453c51SYiPeng Chai 	ret = ras && (adev->ras_enabled & (1 << block));
31548f453c51SYiPeng Chai 
31558f453c51SYiPeng Chai 	/* For the special asic with mem ecc enabled but sram ecc
31568f453c51SYiPeng Chai 	 * not enabled, even if the ras block is not supported on
31578f453c51SYiPeng Chai 	 * .ras_enabled, if the asic supports poison mode and the
31588f453c51SYiPeng Chai 	 * ras block has ras configuration, it can be considered
31598f453c51SYiPeng Chai 	 * that the ras block supports ras function.
31608f453c51SYiPeng Chai 	 */
31618f453c51SYiPeng Chai 	if (!ret &&
3162bc0f8080SCandice Li 	    (block == AMDGPU_RAS_BLOCK__GFX ||
3163bc0f8080SCandice Li 	     block == AMDGPU_RAS_BLOCK__SDMA ||
3164bc0f8080SCandice Li 	     block == AMDGPU_RAS_BLOCK__VCN ||
3165bc0f8080SCandice Li 	     block == AMDGPU_RAS_BLOCK__JPEG) &&
31668f453c51SYiPeng Chai 	    amdgpu_ras_is_poison_mode_supported(adev) &&
31678f453c51SYiPeng Chai 	    amdgpu_ras_get_ras_block(adev, block, 0))
31688f453c51SYiPeng Chai 		ret = 1;
31698f453c51SYiPeng Chai 
31708f453c51SYiPeng Chai 	return ret;
31717cab2124Syipechai }
31727cab2124Syipechai 
amdgpu_ras_reset_gpu(struct amdgpu_device * adev)31737cab2124Syipechai int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
31747cab2124Syipechai {
31757cab2124Syipechai 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
31767cab2124Syipechai 
31777cab2124Syipechai 	if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
317825a2b22eSAndrey Grodzovsky 		amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work);
31797cab2124Syipechai 	return 0;
31807cab2124Syipechai }
31817cab2124Syipechai 
31827cab2124Syipechai 
31836492e1b0Syipechai /* Register each ip ras block into amdgpu ras */
amdgpu_ras_register_ras_block(struct amdgpu_device * adev,struct amdgpu_ras_block_object * ras_block_obj)31846492e1b0Syipechai int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
31856492e1b0Syipechai 		struct amdgpu_ras_block_object *ras_block_obj)
31866492e1b0Syipechai {
3187d5e8ff5fSyipechai 	struct amdgpu_ras_block_list *ras_node;
31886492e1b0Syipechai 	if (!adev || !ras_block_obj)
31896492e1b0Syipechai 		return -EINVAL;
31906492e1b0Syipechai 
3191d5e8ff5fSyipechai 	ras_node = kzalloc(sizeof(*ras_node), GFP_KERNEL);
3192d5e8ff5fSyipechai 	if (!ras_node)
3193d5e8ff5fSyipechai 		return -ENOMEM;
3194d5e8ff5fSyipechai 
3195d5e8ff5fSyipechai 	INIT_LIST_HEAD(&ras_node->node);
3196d5e8ff5fSyipechai 	ras_node->ras_obj = ras_block_obj;
3197d5e8ff5fSyipechai 	list_add_tail(&ras_node->node, &adev->ras_list);
31986492e1b0Syipechai 
31996492e1b0Syipechai 	return 0;
32006492e1b0Syipechai }
3201322a7e00SHawking Zhang 
/*
 * Write a human-readable name for @err_type into @err_type_name
 * ("correctable", "uncorrectable", or "unknown"). The destination
 * buffer must be large enough (callers pass at least 16 bytes);
 * a NULL destination is silently ignored.
 */
void amdgpu_ras_get_error_type_name(uint32_t err_type, char *err_type_name)
{
	const char *name;

	if (!err_type_name)
		return;

	switch (err_type) {
	case AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE:
		name = "correctable";
		break;
	case AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE:
		name = "uncorrectable";
		break;
	default:
		name = "unknown";
		break;
	}

	sprintf(err_type_name, "%s", name);
}
3219322a7e00SHawking Zhang 
/*
 * Read the err_status_lo register of the block instance described by
 * @reg_entry/@instance and extract its MEMORY_ID field into @memory_id.
 *
 * Returns false when @reg_entry is NULL, or when the entry demands a
 * validity check (AMDGPU_RAS_ERR_STATUS_VALID) and the register's
 * ERR_STATUS_VALID_FLAG is clear; returns true otherwise.
 */
bool amdgpu_ras_inst_get_memory_id_field(struct amdgpu_device *adev,
					 const struct amdgpu_ras_err_status_reg_entry *reg_entry,
					 uint32_t instance,
					 uint32_t *memory_id)
{
	uint32_t err_status_lo_data, err_status_lo_offset;

	if (!reg_entry)
		return false;

	/* compute the per-instance MMIO offset of err_status_lo */
	err_status_lo_offset =
		AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry->hwip, instance,
					    reg_entry->seg_lo, reg_entry->reg_lo);
	err_status_lo_data = RREG32(err_status_lo_offset);

	/* honour the valid flag only for entries that declare it */
	if ((reg_entry->flags & AMDGPU_RAS_ERR_STATUS_VALID) &&
	    !REG_GET_FIELD(err_status_lo_data, ERR_STATUS_LO, ERR_STATUS_VALID_FLAG))
		return false;

	*memory_id = REG_GET_FIELD(err_status_lo_data, ERR_STATUS_LO, MEMORY_ID);

	return true;
}
3243322a7e00SHawking Zhang 
/*
 * Read the err_status_hi register of the block instance described by
 * @reg_entry/@instance and extract the error count into @err_cnt.
 *
 * Unlike the memory-id variant, a clear ERR_INFO_VALID_FLAG is NOT
 * treated as fatal: the count is read regardless and only a debug
 * message is emitted. Returns false only when @reg_entry is NULL.
 */
bool amdgpu_ras_inst_get_err_cnt_field(struct amdgpu_device *adev,
				       const struct amdgpu_ras_err_status_reg_entry *reg_entry,
				       uint32_t instance,
				       unsigned long *err_cnt)
{
	uint32_t err_status_hi_data, err_status_hi_offset;

	if (!reg_entry)
		return false;

	/* compute the per-instance MMIO offset of err_status_hi */
	err_status_hi_offset =
		AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry->hwip, instance,
					    reg_entry->seg_hi, reg_entry->reg_hi);
	err_status_hi_data = RREG32(err_status_hi_offset);

	if ((reg_entry->flags & AMDGPU_RAS_ERR_INFO_VALID) &&
	    !REG_GET_FIELD(err_status_hi_data, ERR_STATUS_HI, ERR_INFO_VALID_FLAG))
		/* keep the check here in case we need to refer to the result later */
		dev_dbg(adev->dev, "Invalid err_info field\n");

	/* read err count
	 * NOTE(review): the field lookup uses ERR_STATUS rather than
	 * ERR_STATUS_HI as above — presumably the register layouts share
	 * the ERR_CNT field; confirm against the register headers.
	 */
	*err_cnt = REG_GET_FIELD(err_status_hi_data, ERR_STATUS, ERR_CNT);

	return true;
}
3269322a7e00SHawking Zhang 
/*
 * Walk @reg_list (of @reg_list_size entries) for one block @instance,
 * accumulate hardware error counts into @err_count, and log each
 * non-zero counter. When @mem_list is provided, the raw memory_id read
 * from the registers is translated to a human-readable memory block
 * name; otherwise the numeric memory_id is logged. @err_type is only
 * used for the log message (via amdgpu_ras_get_error_type_name).
 */
void amdgpu_ras_inst_query_ras_error_count(struct amdgpu_device *adev,
					   const struct amdgpu_ras_err_status_reg_entry *reg_list,
					   uint32_t reg_list_size,
					   const struct amdgpu_ras_memory_id_entry *mem_list,
					   uint32_t mem_list_size,
					   uint32_t instance,
					   uint32_t err_type,
					   unsigned long *err_count)
{
	uint32_t memory_id;
	unsigned long err_cnt;
	char err_type_name[16];
	uint32_t i, j;

	for (i = 0; i < reg_list_size; i++) {
		/* query memory_id from err_status_lo; skip entries whose
		 * validity flag is not set */
		if (!amdgpu_ras_inst_get_memory_id_field(adev, &reg_list[i],
							 instance, &memory_id))
			continue;

		/* query err_cnt from err_status_hi; skip zero counts */
		if (!amdgpu_ras_inst_get_err_cnt_field(adev, &reg_list[i],
						       instance, &err_cnt) ||
		    !err_cnt)
			continue;

		*err_count += err_cnt;

		/* log the errors */
		amdgpu_ras_get_error_type_name(err_type, err_type_name);
		if (!mem_list) {
			/* memory_list is not supported */
			dev_info(adev->dev,
				 "%ld %s hardware errors detected in %s, instance: %d, memory_id: %d\n",
				 err_cnt, err_type_name,
				 reg_list[i].block_name,
				 instance, memory_id);
		} else {
			/* map the raw memory_id to its block name; ids not
			 * present in mem_list are silently not logged */
			for (j = 0; j < mem_list_size; j++) {
				if (memory_id == mem_list[j].memory_id) {
					dev_info(adev->dev,
						 "%ld %s hardware errors detected in %s, instance: %d, memory block: %s\n",
						 err_cnt, err_type_name,
						 reg_list[i].block_name,
						 instance, mem_list[j].name);
					break;
				}
			}
		}
	}
}
3321e53a3250SHawking Zhang 
/*
 * Clear the err_status_lo/hi register pair for every entry in @reg_list
 * on the given block @instance by writing zero to each register.
 */
void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev,
					   const struct amdgpu_ras_err_status_reg_entry *reg_list,
					   uint32_t reg_list_size,
					   uint32_t instance)
{
	uint32_t lo_offset, hi_offset;
	uint32_t idx;

	for (idx = 0; idx < reg_list_size; idx++) {
		lo_offset =
			AMDGPU_RAS_REG_ENTRY_OFFSET(reg_list[idx].hwip, instance,
						    reg_list[idx].seg_lo,
						    reg_list[idx].reg_lo);
		hi_offset =
			AMDGPU_RAS_REG_ENTRY_OFFSET(reg_list[idx].hwip, instance,
						    reg_list[idx].seg_hi,
						    reg_list[idx].reg_hi);
		WREG32(lo_offset, 0);
		WREG32(hi_offset, 0);
	}
}
3341