1 /* 2 * Copyright 2019 Advanced Micro Devices, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 21 * 22 */ 23 24 #include "amdgpu.h" 25 #include "umc_v6_7.h" 26 27 static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev, 28 struct ras_err_data *err_data, uint64_t err_addr, 29 uint32_t ch_inst, uint32_t umc_inst) 30 { 31 switch (adev->ip_versions[UMC_HWIP][0]) { 32 case IP_VERSION(6, 7, 0): 33 umc_v6_7_convert_error_address(adev, 34 err_data, err_addr, ch_inst, umc_inst); 35 break; 36 default: 37 dev_warn(adev->dev, 38 "UMC address to Physical address translation is not supported\n"); 39 return AMDGPU_RAS_FAIL; 40 } 41 42 return AMDGPU_RAS_SUCCESS; 43 } 44 45 int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev, 46 uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst) 47 { 48 struct ras_err_data err_data = {0, 0, 0, NULL}; 49 int ret = AMDGPU_RAS_FAIL; 50 51 err_data.err_addr = 52 kcalloc(adev->umc.max_ras_err_cnt_per_query, 53 sizeof(struct eeprom_table_record), GFP_KERNEL); 54 if (!err_data.err_addr) { 55 dev_warn(adev->dev, 56 "Failed to alloc memory for umc error record in MCA notifier!\n"); 57 return AMDGPU_RAS_FAIL; 58 } 59 60 /* 61 * Translate UMC channel address to Physical address 62 */ 63 ret = amdgpu_umc_convert_error_address(adev, &err_data, err_addr, 64 ch_inst, umc_inst); 65 if (ret) 66 goto out; 67 68 if (amdgpu_bad_page_threshold != 0) { 69 amdgpu_ras_add_bad_pages(adev, err_data.err_addr, 70 err_data.err_addr_cnt); 71 amdgpu_ras_save_bad_pages(adev, NULL); 72 } 73 74 out: 75 kfree(err_data.err_addr); 76 return ret; 77 } 78 79 static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, 80 void *ras_error_status, 81 struct amdgpu_iv_entry *entry, 82 bool reset) 83 { 84 struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; 85 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 86 int ret = 0; 87 88 kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); 89 ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc)); 90 if (ret == -EOPNOTSUPP) { 91 if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && 92 adev->umc.ras->ras_block.hw_ops->query_ras_error_count) 93 adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, ras_error_status); 94 95 if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && 96 adev->umc.ras->ras_block.hw_ops->query_ras_error_address && 97 adev->umc.max_ras_err_cnt_per_query) { 98 err_data->err_addr = 99 kcalloc(adev->umc.max_ras_err_cnt_per_query, 100 sizeof(struct eeprom_table_record), GFP_KERNEL); 101 102 /* still call query_ras_error_address to clear error status 103 * even NOMEM error is encountered 104 */ 105 if(!err_data->err_addr) 106 dev_warn(adev->dev, "Failed to alloc memory for " 107 "umc error address record!\n"); 108 109 /* umc query_ras_error_address is also responsible for clearing 110 * error status 111 */ 112 adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, ras_error_status); 113 } 114 } else if (!ret) { 115 if (adev->umc.ras && 116 adev->umc.ras->ecc_info_query_ras_error_count) 117 adev->umc.ras->ecc_info_query_ras_error_count(adev, ras_error_status); 118 119 if (adev->umc.ras && 120 adev->umc.ras->ecc_info_query_ras_error_address && 121 adev->umc.max_ras_err_cnt_per_query) { 122 err_data->err_addr = 123 kcalloc(adev->umc.max_ras_err_cnt_per_query, 124 sizeof(struct eeprom_table_record), GFP_KERNEL); 125 126 /* still call query_ras_error_address to clear error status 127 * even NOMEM error is encountered 128 */ 129 if(!err_data->err_addr) 130 dev_warn(adev->dev, "Failed to alloc memory for " 131 "umc error address record!\n"); 132 133 /* umc query_ras_error_address is also responsible for clearing 134 * error status 135 */ 136 adev->umc.ras->ecc_info_query_ras_error_address(adev, ras_error_status); 137 } 138 } 139 140 /* only uncorrectable error needs gpu reset */ 141 if (err_data->ue_count) { 142 dev_info(adev->dev, "%ld uncorrectable hardware errors " 143 "detected in UMC block\n", 144 err_data->ue_count); 145 146 if ((amdgpu_bad_page_threshold != 0) && 147 err_data->err_addr_cnt) { 148 amdgpu_ras_add_bad_pages(adev, err_data->err_addr, 149 err_data->err_addr_cnt); 150 amdgpu_ras_save_bad_pages(adev, &(err_data->ue_count)); 151 152 amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs); 153 154 if (con->update_channel_flag == true) { 155 amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap); 156 con->update_channel_flag = false; 157 } 158 } 159 160 if (reset) 161 amdgpu_ras_reset_gpu(adev); 162 } 163 164 kfree(err_data->err_addr); 165 return AMDGPU_RAS_SUCCESS; 166 } 167 168 int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset) 169 { 170 int ret = AMDGPU_RAS_SUCCESS; 171 172 if (adev->gmc.xgmi.connected_to_cpu || 173 adev->gmc.is_app_apu) { 174 if (reset) { 175 /* MCA poison handler is only responsible for GPU reset, 176 * let MCA notifier do page retirement. 177 */ 178 kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); 179 amdgpu_ras_reset_gpu(adev); 180 } 181 return ret; 182 } 183 184 if (!amdgpu_sriov_vf(adev)) { 185 struct ras_err_data err_data = {0, 0, 0, NULL}; 186 struct ras_common_if head = { 187 .block = AMDGPU_RAS_BLOCK__UMC, 188 }; 189 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head); 190 191 ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset); 192 193 if (ret == AMDGPU_RAS_SUCCESS && obj) { 194 obj->err_data.ue_count += err_data.ue_count; 195 obj->err_data.ce_count += err_data.ce_count; 196 } 197 } else { 198 if (adev->virt.ops && adev->virt.ops->ras_poison_handler) 199 adev->virt.ops->ras_poison_handler(adev); 200 else 201 dev_warn(adev->dev, 202 "No ras_poison_handler interface in SRIOV!\n"); 203 } 204 205 return ret; 206 } 207 208 int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, 209 void *ras_error_status, 210 struct amdgpu_iv_entry *entry) 211 { 212 return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true); 213 } 214 215 int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev) 216 { 217 int err; 218 struct amdgpu_umc_ras *ras; 219 220 if (!adev->umc.ras) 221 return 0; 222 223 ras = adev->umc.ras; 224 225 err = amdgpu_ras_register_ras_block(adev, &ras->ras_block); 226 if (err) { 227 dev_err(adev->dev, "Failed to register umc ras block!\n"); 228 return err; 229 } 230 231 strcpy(adev->umc.ras->ras_block.ras_comm.name, "umc"); 232 ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__UMC; 233 ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE; 234 adev->umc.ras_if = &ras->ras_block.ras_comm; 235 236 if (!ras->ras_block.ras_late_init) 237 ras->ras_block.ras_late_init = amdgpu_umc_ras_late_init; 238 239 if (!ras->ras_block.ras_cb) 240 ras->ras_block.ras_cb = amdgpu_umc_process_ras_data_cb; 241 242 return 0; 243 } 244 245 int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block) 246 { 247 int r; 248 249 r = amdgpu_ras_block_late_init(adev, ras_block); 250 if (r) 251 return r; 252 253 if (amdgpu_ras_is_supported(adev, ras_block->block)) { 254 r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0); 255 if (r) 256 goto late_fini; 257 } 258 259 /* ras init of specific umc version */ 260 if (adev->umc.ras && 261 adev->umc.ras->err_cnt_init) 262 adev->umc.ras->err_cnt_init(adev); 263 264 return 0; 265 266 late_fini: 267 amdgpu_ras_block_late_fini(adev, ras_block); 268 return r; 269 } 270 271 int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev, 272 struct amdgpu_irq_src *source, 273 struct amdgpu_iv_entry *entry) 274 { 275 struct ras_common_if *ras_if = adev->umc.ras_if; 276 struct ras_dispatch_if ih_data = { 277 .entry = entry, 278 }; 279 280 if (!ras_if) 281 return 0; 282 283 ih_data.head = *ras_if; 284 285 amdgpu_ras_interrupt_dispatch(adev, &ih_data); 286 return 0; 287 } 288 289 void amdgpu_umc_fill_error_record(struct ras_err_data *err_data, 290 uint64_t err_addr, 291 uint64_t retired_page, 292 uint32_t channel_index, 293 uint32_t umc_inst) 294 { 295 struct eeprom_table_record *err_rec = 296 &err_data->err_addr[err_data->err_addr_cnt]; 297 298 err_rec->address = err_addr; 299 /* page frame address is saved */ 300 err_rec->retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT; 301 err_rec->ts = (uint64_t)ktime_get_real_seconds(); 302 err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE; 303 err_rec->cu = 0; 304 err_rec->mem_channel = channel_index; 305 err_rec->mcumc_id = umc_inst; 306 307 err_data->err_addr_cnt++; 308 } 309 310 int amdgpu_umc_loop_channels(struct amdgpu_device *adev, 311 umc_func func, void *data) 312 { 313 uint32_t node_inst = 0; 314 uint32_t umc_inst = 0; 315 uint32_t ch_inst = 0; 316 int ret = 0; 317 318 if (adev->umc.node_inst_num) { 319 LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) { 320 ret = func(adev, node_inst, umc_inst, ch_inst, data); 321 if (ret) { 322 dev_err(adev->dev, "Node %d umc %d ch %d func returns %d\n", 323 node_inst, umc_inst, ch_inst, ret); 324 return ret; 325 } 326 } 327 } else { 328 LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) { 329 ret = func(adev, 0, umc_inst, ch_inst, data); 330 if (ret) { 331 dev_err(adev->dev, "Umc %d ch %d func returns %d\n", 332 umc_inst, ch_inst, ret); 333 return ret; 334 } 335 } 336 } 337 338 return 0; 339 } 340