1 /*
2  * Copyright 2019 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  *
22  */
23 
24 #include "amdgpu_ras.h"
25 
26 int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, void *ras_ih_info)
27 {
28 	int r;
29 	struct ras_ih_if *ih_info = (struct ras_ih_if *)ras_ih_info;
30 	struct ras_fs_if fs_info = {
31 		.sysfs_name = "umc_err_count",
32 		.debugfs_name = "umc_err_inject",
33 	};
34 
35 	if (!ih_info)
36 		return -EINVAL;
37 
38 	if (!adev->gmc.umc_ras_if) {
39 		adev->gmc.umc_ras_if =
40 			kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
41 		if (!adev->gmc.umc_ras_if)
42 			return -ENOMEM;
43 		adev->gmc.umc_ras_if->block = AMDGPU_RAS_BLOCK__UMC;
44 		adev->gmc.umc_ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
45 		adev->gmc.umc_ras_if->sub_block_index = 0;
46 		strcpy(adev->gmc.umc_ras_if->name, "umc");
47 	}
48 	ih_info->head = fs_info.head = *adev->gmc.umc_ras_if;
49 
50 	r = amdgpu_ras_late_init(adev, adev->gmc.umc_ras_if,
51 				 &fs_info, ih_info);
52 	if (r)
53 		goto free;
54 
55 	if (amdgpu_ras_is_supported(adev, adev->gmc.umc_ras_if->block)) {
56 		r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
57 		if (r)
58 			goto late_fini;
59 	} else {
60 		r = 0;
61 		goto free;
62 	}
63 
64 	/* ras init of specific umc version */
65 	if (adev->umc.funcs && adev->umc.funcs->err_cnt_init)
66 		adev->umc.funcs->err_cnt_init(adev);
67 
68 	return 0;
69 
70 late_fini:
71 	amdgpu_ras_late_fini(adev, adev->gmc.umc_ras_if, ih_info);
72 free:
73 	kfree(adev->gmc.umc_ras_if);
74 	adev->gmc.umc_ras_if = NULL;
75 	return r;
76 }
77 
78 int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
79 		void *ras_error_status,
80 		struct amdgpu_iv_entry *entry)
81 {
82 	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
83 
84 	if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
85 		return AMDGPU_RAS_SUCCESS;
86 
87 	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
88 	if (adev->umc.funcs &&
89 	    adev->umc.funcs->query_ras_error_count)
90 	    adev->umc.funcs->query_ras_error_count(adev, ras_error_status);
91 
92 	if (adev->umc.funcs &&
93 	    adev->umc.funcs->query_ras_error_address &&
94 	    adev->umc.max_ras_err_cnt_per_query) {
95 		err_data->err_addr =
96 			kcalloc(adev->umc.max_ras_err_cnt_per_query,
97 				sizeof(struct eeprom_table_record), GFP_KERNEL);
98 		/* still call query_ras_error_address to clear error status
99 		 * even NOMEM error is encountered
100 		 */
101 		if(!err_data->err_addr)
102 			DRM_WARN("Failed to alloc memory for umc error address record!\n");
103 
104 		/* umc query_ras_error_address is also responsible for clearing
105 		 * error status
106 		 */
107 		adev->umc.funcs->query_ras_error_address(adev, ras_error_status);
108 	}
109 
110 	/* only uncorrectable error needs gpu reset */
111 	if (err_data->ue_count) {
112 		if (err_data->err_addr_cnt &&
113 		    amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
114 						err_data->err_addr_cnt))
115 			DRM_WARN("Failed to add ras bad page!\n");
116 
117 		amdgpu_ras_reset_gpu(adev, 0);
118 	}
119 
120 	kfree(err_data->err_addr);
121 	return AMDGPU_RAS_SUCCESS;
122 }
123 
124 int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
125 		struct amdgpu_irq_src *source,
126 		struct amdgpu_iv_entry *entry)
127 {
128 	struct ras_common_if *ras_if = adev->gmc.umc_ras_if;
129 	struct ras_dispatch_if ih_data = {
130 		.entry = entry,
131 	};
132 
133 	if (!ras_if)
134 		return 0;
135 
136 	ih_data.head = *ras_if;
137 
138 	amdgpu_ras_interrupt_dispatch(adev, &ih_data);
139 	return 0;
140 }
141