1 /* 2 * Copyright 2018 Advanced Micro Devices, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 
21 * 22 * 23 */ 24 #include <linux/debugfs.h> 25 #include <linux/list.h> 26 #include <linux/module.h> 27 #include <linux/uaccess.h> 28 #include <linux/reboot.h> 29 #include <linux/syscalls.h> 30 31 #include "amdgpu.h" 32 #include "amdgpu_ras.h" 33 #include "amdgpu_atomfirmware.h" 34 #include "amdgpu_xgmi.h" 35 #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" 36 37 static const char *RAS_FS_NAME = "ras"; 38 39 const char *ras_error_string[] = { 40 "none", 41 "parity", 42 "single_correctable", 43 "multi_uncorrectable", 44 "poison", 45 }; 46 47 const char *ras_block_string[] = { 48 "umc", 49 "sdma", 50 "gfx", 51 "mmhub", 52 "athub", 53 "pcie_bif", 54 "hdp", 55 "xgmi_wafl", 56 "df", 57 "smn", 58 "sem", 59 "mp0", 60 "mp1", 61 "fuse", 62 }; 63 64 #define ras_err_str(i) (ras_error_string[ffs(i)]) 65 #define ras_block_str(i) (ras_block_string[i]) 66 67 #define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS) 68 69 /* inject address is 52 bits */ 70 #define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52) 71 72 /* typical ECC bad page rate(1 bad page per 100MB VRAM) */ 73 #define RAS_BAD_PAGE_RATE (100 * 1024 * 1024ULL) 74 75 enum amdgpu_ras_retire_page_reservation { 76 AMDGPU_RAS_RETIRE_PAGE_RESERVED, 77 AMDGPU_RAS_RETIRE_PAGE_PENDING, 78 AMDGPU_RAS_RETIRE_PAGE_FAULT, 79 }; 80 81 atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0); 82 83 static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, 84 uint64_t addr); 85 static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, 86 uint64_t addr); 87 88 void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready) 89 { 90 if (adev && amdgpu_ras_get_context(adev)) 91 amdgpu_ras_get_context(adev)->error_query_ready = ready; 92 } 93 94 static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev) 95 { 96 if (adev && amdgpu_ras_get_context(adev)) 97 return amdgpu_ras_get_context(adev)->error_query_ready; 98 99 return false; 100 } 101 102 static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user 
*buf, 103 size_t size, loff_t *pos) 104 { 105 struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private; 106 struct ras_query_if info = { 107 .head = obj->head, 108 }; 109 ssize_t s; 110 char val[128]; 111 112 if (amdgpu_ras_query_error_status(obj->adev, &info)) 113 return -EINVAL; 114 115 s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n", 116 "ue", info.ue_count, 117 "ce", info.ce_count); 118 if (*pos >= s) 119 return 0; 120 121 s -= *pos; 122 s = min_t(u64, s, size); 123 124 125 if (copy_to_user(buf, &val[*pos], s)) 126 return -EINVAL; 127 128 *pos += s; 129 130 return s; 131 } 132 133 static const struct file_operations amdgpu_ras_debugfs_ops = { 134 .owner = THIS_MODULE, 135 .read = amdgpu_ras_debugfs_read, 136 .write = NULL, 137 .llseek = default_llseek 138 }; 139 140 static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id) 141 { 142 int i; 143 144 for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) { 145 *block_id = i; 146 if (strcmp(name, ras_block_str(i)) == 0) 147 return 0; 148 } 149 return -EINVAL; 150 } 151 152 static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, 153 const char __user *buf, size_t size, 154 loff_t *pos, struct ras_debug_if *data) 155 { 156 ssize_t s = min_t(u64, 64, size); 157 char str[65]; 158 char block_name[33]; 159 char err[9] = "ue"; 160 int op = -1; 161 int block_id; 162 uint32_t sub_block; 163 u64 address, value; 164 165 if (*pos) 166 return -EINVAL; 167 *pos = size; 168 169 memset(str, 0, sizeof(str)); 170 memset(data, 0, sizeof(*data)); 171 172 if (copy_from_user(str, buf, s)) 173 return -EINVAL; 174 175 if (sscanf(str, "disable %32s", block_name) == 1) 176 op = 0; 177 else if (sscanf(str, "enable %32s %8s", block_name, err) == 2) 178 op = 1; 179 else if (sscanf(str, "inject %32s %8s", block_name, err) == 2) 180 op = 2; 181 else if (str[0] && str[1] && str[2] && str[3]) 182 /* ascii string, but commands are not matched. 
*/ 183 return -EINVAL; 184 185 if (op != -1) { 186 if (amdgpu_ras_find_block_id_by_name(block_name, &block_id)) 187 return -EINVAL; 188 189 data->head.block = block_id; 190 /* only ue and ce errors are supported */ 191 if (!memcmp("ue", err, 2)) 192 data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE; 193 else if (!memcmp("ce", err, 2)) 194 data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE; 195 else 196 return -EINVAL; 197 198 data->op = op; 199 200 if (op == 2) { 201 if (sscanf(str, "%*s %*s %*s %u %llu %llu", 202 &sub_block, &address, &value) != 3) 203 if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx", 204 &sub_block, &address, &value) != 3) 205 return -EINVAL; 206 data->head.sub_block_index = sub_block; 207 data->inject.address = address; 208 data->inject.value = value; 209 } 210 } else { 211 if (size < sizeof(*data)) 212 return -EINVAL; 213 214 if (copy_from_user(data, buf, sizeof(*data))) 215 return -EINVAL; 216 } 217 218 return 0; 219 } 220 221 /** 222 * DOC: AMDGPU RAS debugfs control interface 223 * 224 * It accepts struct ras_debug_if who has two members. 225 * 226 * First member: ras_debug_if::head or ras_debug_if::inject. 227 * 228 * head is used to indicate which IP block will be under control. 229 * 230 * head has four members, they are block, type, sub_block_index, name. 231 * block: which IP will be under control. 232 * type: what kind of error will be enabled/disabled/injected. 233 * sub_block_index: some IPs have subcomponets. say, GFX, sDMA. 234 * name: the name of IP. 235 * 236 * inject has two more members than head, they are address, value. 237 * As their names indicate, inject operation will write the 238 * value to the address. 239 * 240 * The second member: struct ras_debug_if::op. 241 * It has three kinds of operations. 242 * 243 * - 0: disable RAS on the block. Take ::head as its data. 244 * - 1: enable RAS on the block. Take ::head as its data. 245 * - 2: inject errors on the block. Take ::inject as its data. 
*
 * How to use the interface?
 *
 * Programs
 *
 * Copy the definition of struct ras_debug_if into your code and initialize it.
 * Write the struct to the control node.
 *
 * Shells
 *
 * .. code-block:: bash
 *
 *	echo op block [error [sub_block address value]] > .../ras/ras_ctrl
 *
 * Parameters:
 *
 * op: disable, enable, inject
 *	disable: only block is needed
 *	enable: block and error are needed
 *	inject: block, error, sub_block, address and value are needed
 * block: umc, sdma, gfx, .........
 *	see ras_block_string[] for details
 * error: ue, ce
 *	ue: multi_uncorrectable
 *	ce: single_correctable
 * sub_block:
 *	sub block index, pass 0 if there is no sub block
 *
 * Here are some example bash commands:
 *
 * .. code-block:: bash
 *
 *	echo inject umc ue 0x0 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo inject umc ce 0 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *
 * How to check the result?
 *
 * For disable/enable, please check ras features at
 * /sys/class/drm/card[0/1/2...]/device/ras/features
 *
 * For inject, please check the corresponding error count at
 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
 *
 * .. note::
 *	Operations are only allowed on blocks which are supported.
 *	Please check ras mask at /sys/module/amdgpu/parameters/ras_mask
 *	to see which blocks support RAS on a particular asic.
294 * 295 */ 296 static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf, 297 size_t size, loff_t *pos) 298 { 299 struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private; 300 struct ras_debug_if data; 301 int ret = 0; 302 303 if (!amdgpu_ras_get_error_query_ready(adev)) { 304 dev_warn(adev->dev, "RAS WARN: error injection " 305 "currently inaccessible\n"); 306 return size; 307 } 308 309 ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data); 310 if (ret) 311 return -EINVAL; 312 313 if (!amdgpu_ras_is_supported(adev, data.head.block)) 314 return -EINVAL; 315 316 switch (data.op) { 317 case 0: 318 ret = amdgpu_ras_feature_enable(adev, &data.head, 0); 319 break; 320 case 1: 321 ret = amdgpu_ras_feature_enable(adev, &data.head, 1); 322 break; 323 case 2: 324 if ((data.inject.address >= adev->gmc.mc_vram_size) || 325 (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) { 326 dev_warn(adev->dev, "RAS WARN: input address " 327 "0x%llx is invalid.", 328 data.inject.address); 329 ret = -EINVAL; 330 break; 331 } 332 333 /* umc ce/ue error injection for a bad page is not allowed */ 334 if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) && 335 amdgpu_ras_check_bad_page(adev, data.inject.address)) { 336 dev_warn(adev->dev, "RAS WARN: 0x%llx has been marked " 337 "as bad before error injection!\n", 338 data.inject.address); 339 break; 340 } 341 342 /* data.inject.address is offset instead of absolute gpu address */ 343 ret = amdgpu_ras_error_inject(adev, &data.inject); 344 break; 345 default: 346 ret = -EINVAL; 347 break; 348 } 349 350 if (ret) 351 return -EINVAL; 352 353 return size; 354 } 355 356 /** 357 * DOC: AMDGPU RAS debugfs EEPROM table reset interface 358 * 359 * Some boards contain an EEPROM which is used to persistently store a list of 360 * bad pages which experiences ECC errors in vram. This interface provides 361 * a way to reset the EEPROM, e.g., after testing error injection. 
362 * 363 * Usage: 364 * 365 * .. code-block:: bash 366 * 367 * echo 1 > ../ras/ras_eeprom_reset 368 * 369 * will reset EEPROM table to 0 entries. 370 * 371 */ 372 static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, const char __user *buf, 373 size_t size, loff_t *pos) 374 { 375 struct amdgpu_device *adev = 376 (struct amdgpu_device *)file_inode(f)->i_private; 377 int ret; 378 379 ret = amdgpu_ras_eeprom_reset_table( 380 &(amdgpu_ras_get_context(adev)->eeprom_control)); 381 382 if (ret == 1) { 383 amdgpu_ras_get_context(adev)->flags = RAS_DEFAULT_FLAGS; 384 return size; 385 } else { 386 return -EIO; 387 } 388 } 389 390 static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = { 391 .owner = THIS_MODULE, 392 .read = NULL, 393 .write = amdgpu_ras_debugfs_ctrl_write, 394 .llseek = default_llseek 395 }; 396 397 static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = { 398 .owner = THIS_MODULE, 399 .read = NULL, 400 .write = amdgpu_ras_debugfs_eeprom_write, 401 .llseek = default_llseek 402 }; 403 404 /** 405 * DOC: AMDGPU RAS sysfs Error Count Interface 406 * 407 * It allows the user to read the error count for each IP block on the gpu through 408 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count 409 * 410 * It outputs the multiple lines which report the uncorrected (ue) and corrected 411 * (ce) error counts. 412 * 413 * The format of one line is below, 414 * 415 * [ce|ue]: count 416 * 417 * Example: 418 * 419 * .. 
code-block:: bash 420 * 421 * ue: 0 422 * ce: 1 423 * 424 */ 425 static ssize_t amdgpu_ras_sysfs_read(struct device *dev, 426 struct device_attribute *attr, char *buf) 427 { 428 struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr); 429 struct ras_query_if info = { 430 .head = obj->head, 431 }; 432 433 if (!amdgpu_ras_get_error_query_ready(obj->adev)) 434 return sysfs_emit(buf, "Query currently inaccessible\n"); 435 436 if (amdgpu_ras_query_error_status(obj->adev, &info)) 437 return -EINVAL; 438 439 return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count, 440 "ce", info.ce_count); 441 } 442 443 /* obj begin */ 444 445 #define get_obj(obj) do { (obj)->use++; } while (0) 446 #define alive_obj(obj) ((obj)->use) 447 448 static inline void put_obj(struct ras_manager *obj) 449 { 450 if (obj && (--obj->use == 0)) 451 list_del(&obj->node); 452 if (obj && (obj->use < 0)) 453 DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", obj->head.name); 454 } 455 456 /* make one obj and return it. */ 457 static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev, 458 struct ras_common_if *head) 459 { 460 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 461 struct ras_manager *obj; 462 463 if (!adev->ras_features || !con) 464 return NULL; 465 466 if (head->block >= AMDGPU_RAS_BLOCK_COUNT) 467 return NULL; 468 469 obj = &con->objs[head->block]; 470 /* already exist. return obj? 
*/ 471 if (alive_obj(obj)) 472 return NULL; 473 474 obj->head = *head; 475 obj->adev = adev; 476 list_add(&obj->node, &con->head); 477 get_obj(obj); 478 479 return obj; 480 } 481 482 /* return an obj equal to head, or the first when head is NULL */ 483 struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, 484 struct ras_common_if *head) 485 { 486 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 487 struct ras_manager *obj; 488 int i; 489 490 if (!adev->ras_features || !con) 491 return NULL; 492 493 if (head) { 494 if (head->block >= AMDGPU_RAS_BLOCK_COUNT) 495 return NULL; 496 497 obj = &con->objs[head->block]; 498 499 if (alive_obj(obj)) { 500 WARN_ON(head->block != obj->head.block); 501 return obj; 502 } 503 } else { 504 for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) { 505 obj = &con->objs[i]; 506 if (alive_obj(obj)) { 507 WARN_ON(i != obj->head.block); 508 return obj; 509 } 510 } 511 } 512 513 return NULL; 514 } 515 /* obj end */ 516 517 static void amdgpu_ras_parse_status_code(struct amdgpu_device *adev, 518 const char* invoke_type, 519 const char* block_name, 520 enum ta_ras_status ret) 521 { 522 switch (ret) { 523 case TA_RAS_STATUS__SUCCESS: 524 return; 525 case TA_RAS_STATUS__ERROR_RAS_NOT_AVAILABLE: 526 dev_warn(adev->dev, 527 "RAS WARN: %s %s currently unavailable\n", 528 invoke_type, 529 block_name); 530 break; 531 default: 532 dev_err(adev->dev, 533 "RAS ERROR: %s %s error failed ret 0x%X\n", 534 invoke_type, 535 block_name, 536 ret); 537 } 538 } 539 540 /* feature ctl begin */ 541 static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev, 542 struct ras_common_if *head) 543 { 544 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 545 546 return con->hw_supported & BIT(head->block); 547 } 548 549 static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev, 550 struct ras_common_if *head) 551 { 552 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 553 554 return con->features & BIT(head->block); 555 } 556 557 
/* 558 * if obj is not created, then create one. 559 * set feature enable flag. 560 */ 561 static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev, 562 struct ras_common_if *head, int enable) 563 { 564 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 565 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); 566 567 /* If hardware does not support ras, then do not create obj. 568 * But if hardware support ras, we can create the obj. 569 * Ras framework checks con->hw_supported to see if it need do 570 * corresponding initialization. 571 * IP checks con->support to see if it need disable ras. 572 */ 573 if (!amdgpu_ras_is_feature_allowed(adev, head)) 574 return 0; 575 if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head))) 576 return 0; 577 578 if (enable) { 579 if (!obj) { 580 obj = amdgpu_ras_create_obj(adev, head); 581 if (!obj) 582 return -EINVAL; 583 } else { 584 /* In case we create obj somewhere else */ 585 get_obj(obj); 586 } 587 con->features |= BIT(head->block); 588 } else { 589 if (obj && amdgpu_ras_is_feature_enabled(adev, head)) { 590 /* skip clean gfx ras context feature for VEGA20 Gaming. 
591 * will clean later 592 */ 593 if (!(!adev->ras_features && con->features & BIT(AMDGPU_RAS_BLOCK__GFX))) 594 con->features &= ~BIT(head->block); 595 put_obj(obj); 596 } 597 } 598 599 return 0; 600 } 601 602 /* wrapper of psp_ras_enable_features */ 603 int amdgpu_ras_feature_enable(struct amdgpu_device *adev, 604 struct ras_common_if *head, bool enable) 605 { 606 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 607 union ta_ras_cmd_input *info; 608 int ret; 609 610 if (!con) 611 return -EINVAL; 612 613 info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL); 614 if (!info) 615 return -ENOMEM; 616 617 if (!enable) { 618 info->disable_features = (struct ta_ras_disable_features_input) { 619 .block_id = amdgpu_ras_block_to_ta(head->block), 620 .error_type = amdgpu_ras_error_to_ta(head->type), 621 }; 622 } else { 623 info->enable_features = (struct ta_ras_enable_features_input) { 624 .block_id = amdgpu_ras_block_to_ta(head->block), 625 .error_type = amdgpu_ras_error_to_ta(head->type), 626 }; 627 } 628 629 /* Do not enable if it is not allowed. */ 630 WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head)); 631 /* Are we alerady in that state we are going to set? */ 632 if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head))) { 633 ret = 0; 634 goto out; 635 } 636 637 if (!amdgpu_ras_intr_triggered()) { 638 ret = psp_ras_enable_features(&adev->psp, info, enable); 639 if (ret) { 640 amdgpu_ras_parse_status_code(adev, 641 enable ? "enable":"disable", 642 ras_block_str(head->block), 643 (enum ta_ras_status)ret); 644 if (ret == TA_RAS_STATUS__RESET_NEEDED) 645 ret = -EAGAIN; 646 else 647 ret = -EINVAL; 648 649 goto out; 650 } 651 } 652 653 /* setup the obj */ 654 __amdgpu_ras_feature_enable(adev, head, enable); 655 ret = 0; 656 out: 657 kfree(info); 658 return ret; 659 } 660 661 /* Only used in device probe stage and called only once. 
*/ 662 int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev, 663 struct ras_common_if *head, bool enable) 664 { 665 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 666 int ret; 667 668 if (!con) 669 return -EINVAL; 670 671 if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) { 672 if (enable) { 673 /* There is no harm to issue a ras TA cmd regardless of 674 * the currecnt ras state. 675 * If current state == target state, it will do nothing 676 * But sometimes it requests driver to reset and repost 677 * with error code -EAGAIN. 678 */ 679 ret = amdgpu_ras_feature_enable(adev, head, 1); 680 /* With old ras TA, we might fail to enable ras. 681 * Log it and just setup the object. 682 * TODO need remove this WA in the future. 683 */ 684 if (ret == -EINVAL) { 685 ret = __amdgpu_ras_feature_enable(adev, head, 1); 686 if (!ret) 687 dev_info(adev->dev, 688 "RAS INFO: %s setup object\n", 689 ras_block_str(head->block)); 690 } 691 } else { 692 /* setup the object then issue a ras TA disable cmd.*/ 693 ret = __amdgpu_ras_feature_enable(adev, head, 1); 694 if (ret) 695 return ret; 696 697 /* gfx block ras dsiable cmd must send to ras-ta */ 698 if (head->block == AMDGPU_RAS_BLOCK__GFX) 699 con->features |= BIT(head->block); 700 701 ret = amdgpu_ras_feature_enable(adev, head, 0); 702 } 703 } else 704 ret = amdgpu_ras_feature_enable(adev, head, enable); 705 706 return ret; 707 } 708 709 static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev, 710 bool bypass) 711 { 712 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 713 struct ras_manager *obj, *tmp; 714 715 list_for_each_entry_safe(obj, tmp, &con->head, node) { 716 /* bypass psp. 
717 * aka just release the obj and corresponding flags 718 */ 719 if (bypass) { 720 if (__amdgpu_ras_feature_enable(adev, &obj->head, 0)) 721 break; 722 } else { 723 if (amdgpu_ras_feature_enable(adev, &obj->head, 0)) 724 break; 725 } 726 } 727 728 return con->features; 729 } 730 731 static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev, 732 bool bypass) 733 { 734 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 735 int ras_block_count = AMDGPU_RAS_BLOCK_COUNT; 736 int i; 737 const enum amdgpu_ras_error_type default_ras_type = 738 AMDGPU_RAS_ERROR__NONE; 739 740 for (i = 0; i < ras_block_count; i++) { 741 struct ras_common_if head = { 742 .block = i, 743 .type = default_ras_type, 744 .sub_block_index = 0, 745 }; 746 strcpy(head.name, ras_block_str(i)); 747 if (bypass) { 748 /* 749 * bypass psp. vbios enable ras for us. 750 * so just create the obj 751 */ 752 if (__amdgpu_ras_feature_enable(adev, &head, 1)) 753 break; 754 } else { 755 if (amdgpu_ras_feature_enable(adev, &head, 1)) 756 break; 757 } 758 } 759 760 return con->features; 761 } 762 /* feature ctl end */ 763 764 /* query/inject/cure begin */ 765 int amdgpu_ras_query_error_status(struct amdgpu_device *adev, 766 struct ras_query_if *info) 767 { 768 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); 769 struct ras_err_data err_data = {0, 0, 0, NULL}; 770 int i; 771 772 if (!obj) 773 return -EINVAL; 774 775 switch (info->head.block) { 776 case AMDGPU_RAS_BLOCK__UMC: 777 if (adev->umc.ras_funcs && 778 adev->umc.ras_funcs->query_ras_error_count) 779 adev->umc.ras_funcs->query_ras_error_count(adev, &err_data); 780 /* umc query_ras_error_address is also responsible for clearing 781 * error status 782 */ 783 if (adev->umc.ras_funcs && 784 adev->umc.ras_funcs->query_ras_error_address) 785 adev->umc.ras_funcs->query_ras_error_address(adev, &err_data); 786 break; 787 case AMDGPU_RAS_BLOCK__SDMA: 788 if (adev->sdma.funcs->query_ras_error_count) { 789 for (i = 0; i < 
adev->sdma.num_instances; i++) 790 adev->sdma.funcs->query_ras_error_count(adev, i, 791 &err_data); 792 } 793 break; 794 case AMDGPU_RAS_BLOCK__GFX: 795 if (adev->gfx.ras_funcs && 796 adev->gfx.ras_funcs->query_ras_error_count) 797 adev->gfx.ras_funcs->query_ras_error_count(adev, &err_data); 798 799 if (adev->gfx.ras_funcs && 800 adev->gfx.ras_funcs->query_ras_error_status) 801 adev->gfx.ras_funcs->query_ras_error_status(adev); 802 break; 803 case AMDGPU_RAS_BLOCK__MMHUB: 804 if (adev->mmhub.ras_funcs && 805 adev->mmhub.ras_funcs->query_ras_error_count) 806 adev->mmhub.ras_funcs->query_ras_error_count(adev, &err_data); 807 808 if (adev->mmhub.ras_funcs && 809 adev->mmhub.ras_funcs->query_ras_error_status) 810 adev->mmhub.ras_funcs->query_ras_error_status(adev); 811 break; 812 case AMDGPU_RAS_BLOCK__PCIE_BIF: 813 if (adev->nbio.ras_funcs && 814 adev->nbio.ras_funcs->query_ras_error_count) 815 adev->nbio.ras_funcs->query_ras_error_count(adev, &err_data); 816 break; 817 case AMDGPU_RAS_BLOCK__XGMI_WAFL: 818 if (adev->gmc.xgmi.ras_funcs && 819 adev->gmc.xgmi.ras_funcs->query_ras_error_count) 820 adev->gmc.xgmi.ras_funcs->query_ras_error_count(adev, &err_data); 821 break; 822 default: 823 break; 824 } 825 826 obj->err_data.ue_count += err_data.ue_count; 827 obj->err_data.ce_count += err_data.ce_count; 828 829 info->ue_count = obj->err_data.ue_count; 830 info->ce_count = obj->err_data.ce_count; 831 832 if (err_data.ce_count) { 833 dev_info(adev->dev, "%ld correctable hardware errors " 834 "detected in %s block, no user " 835 "action is needed.\n", 836 obj->err_data.ce_count, 837 ras_block_str(info->head.block)); 838 } 839 if (err_data.ue_count) { 840 dev_info(adev->dev, "%ld uncorrectable hardware errors " 841 "detected in %s block\n", 842 obj->err_data.ue_count, 843 ras_block_str(info->head.block)); 844 } 845 846 return 0; 847 } 848 849 int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, 850 enum amdgpu_ras_block block) 851 { 852 if 
(!amdgpu_ras_is_supported(adev, block)) 853 return -EINVAL; 854 855 switch (block) { 856 case AMDGPU_RAS_BLOCK__GFX: 857 if (adev->gfx.ras_funcs && 858 adev->gfx.ras_funcs->reset_ras_error_count) 859 adev->gfx.ras_funcs->reset_ras_error_count(adev); 860 861 if (adev->gfx.ras_funcs && 862 adev->gfx.ras_funcs->reset_ras_error_status) 863 adev->gfx.ras_funcs->reset_ras_error_status(adev); 864 break; 865 case AMDGPU_RAS_BLOCK__MMHUB: 866 if (adev->mmhub.ras_funcs && 867 adev->mmhub.ras_funcs->reset_ras_error_count) 868 adev->mmhub.ras_funcs->reset_ras_error_count(adev); 869 break; 870 case AMDGPU_RAS_BLOCK__SDMA: 871 if (adev->sdma.funcs->reset_ras_error_count) 872 adev->sdma.funcs->reset_ras_error_count(adev); 873 break; 874 default: 875 break; 876 } 877 878 return 0; 879 } 880 881 /* Trigger XGMI/WAFL error */ 882 static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev, 883 struct ta_ras_trigger_error_input *block_info) 884 { 885 int ret; 886 887 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 888 dev_warn(adev->dev, "Failed to disallow df cstate"); 889 890 if (amdgpu_dpm_allow_xgmi_power_down(adev, false)) 891 dev_warn(adev->dev, "Failed to disallow XGMI power down"); 892 893 ret = psp_ras_trigger_error(&adev->psp, block_info); 894 895 if (amdgpu_ras_intr_triggered()) 896 return ret; 897 898 if (amdgpu_dpm_allow_xgmi_power_down(adev, true)) 899 dev_warn(adev->dev, "Failed to allow XGMI power down"); 900 901 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW)) 902 dev_warn(adev->dev, "Failed to allow df cstate"); 903 904 return ret; 905 } 906 907 /* wrapper of psp_ras_trigger_error */ 908 int amdgpu_ras_error_inject(struct amdgpu_device *adev, 909 struct ras_inject_if *info) 910 { 911 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); 912 struct ta_ras_trigger_error_input block_info = { 913 .block_id = amdgpu_ras_block_to_ta(info->head.block), 914 .inject_error_type = amdgpu_ras_error_to_ta(info->head.type), 915 .sub_block_index = 
info->head.sub_block_index, 916 .address = info->address, 917 .value = info->value, 918 }; 919 int ret = 0; 920 921 if (!obj) 922 return -EINVAL; 923 924 /* Calculate XGMI relative offset */ 925 if (adev->gmc.xgmi.num_physical_nodes > 1) { 926 block_info.address = 927 amdgpu_xgmi_get_relative_phy_addr(adev, 928 block_info.address); 929 } 930 931 switch (info->head.block) { 932 case AMDGPU_RAS_BLOCK__GFX: 933 if (adev->gfx.ras_funcs && 934 adev->gfx.ras_funcs->ras_error_inject) 935 ret = adev->gfx.ras_funcs->ras_error_inject(adev, info); 936 else 937 ret = -EINVAL; 938 break; 939 case AMDGPU_RAS_BLOCK__UMC: 940 case AMDGPU_RAS_BLOCK__SDMA: 941 case AMDGPU_RAS_BLOCK__MMHUB: 942 case AMDGPU_RAS_BLOCK__PCIE_BIF: 943 ret = psp_ras_trigger_error(&adev->psp, &block_info); 944 break; 945 case AMDGPU_RAS_BLOCK__XGMI_WAFL: 946 ret = amdgpu_ras_error_inject_xgmi(adev, &block_info); 947 break; 948 default: 949 dev_info(adev->dev, "%s error injection is not supported yet\n", 950 ras_block_str(info->head.block)); 951 ret = -EINVAL; 952 } 953 954 amdgpu_ras_parse_status_code(adev, 955 "inject", 956 ras_block_str(info->head.block), 957 (enum ta_ras_status)ret); 958 959 return ret; 960 } 961 962 /* get the total error counts on all IPs */ 963 unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev, 964 bool is_ce) 965 { 966 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 967 struct ras_manager *obj; 968 struct ras_err_data data = {0, 0}; 969 970 if (!adev->ras_features || !con) 971 return 0; 972 973 list_for_each_entry(obj, &con->head, node) { 974 struct ras_query_if info = { 975 .head = obj->head, 976 }; 977 978 if (amdgpu_ras_query_error_status(adev, &info)) 979 return 0; 980 981 data.ce_count += info.ce_count; 982 data.ue_count += info.ue_count; 983 } 984 985 return is_ce ? 
data.ce_count : data.ue_count; 986 } 987 /* query/inject/cure end */ 988 989 990 /* sysfs begin */ 991 992 static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, 993 struct ras_badpage **bps, unsigned int *count); 994 995 static char *amdgpu_ras_badpage_flags_str(unsigned int flags) 996 { 997 switch (flags) { 998 case AMDGPU_RAS_RETIRE_PAGE_RESERVED: 999 return "R"; 1000 case AMDGPU_RAS_RETIRE_PAGE_PENDING: 1001 return "P"; 1002 case AMDGPU_RAS_RETIRE_PAGE_FAULT: 1003 default: 1004 return "F"; 1005 } 1006 } 1007 1008 /** 1009 * DOC: AMDGPU RAS sysfs gpu_vram_bad_pages Interface 1010 * 1011 * It allows user to read the bad pages of vram on the gpu through 1012 * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages 1013 * 1014 * It outputs multiple lines, and each line stands for one gpu page. 1015 * 1016 * The format of one line is below, 1017 * gpu pfn : gpu page size : flags 1018 * 1019 * gpu pfn and gpu page size are printed in hex format. 1020 * flags can be one of below character, 1021 * 1022 * R: reserved, this gpu page is reserved and not able to use. 1023 * 1024 * P: pending for reserve, this gpu page is marked as bad, will be reserved 1025 * in next window of page_reserve. 1026 * 1027 * F: unable to reserve. this gpu page can't be reserved due to some reasons. 1028 * 1029 * Examples: 1030 * 1031 * .. 
code-block:: bash 1032 * 1033 * 0x00000001 : 0x00001000 : R 1034 * 0x00000002 : 0x00001000 : P 1035 * 1036 */ 1037 1038 static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f, 1039 struct kobject *kobj, struct bin_attribute *attr, 1040 char *buf, loff_t ppos, size_t count) 1041 { 1042 struct amdgpu_ras *con = 1043 container_of(attr, struct amdgpu_ras, badpages_attr); 1044 struct amdgpu_device *adev = con->adev; 1045 const unsigned int element_size = 1046 sizeof("0xabcdabcd : 0x12345678 : R\n") - 1; 1047 unsigned int start = div64_ul(ppos + element_size - 1, element_size); 1048 unsigned int end = div64_ul(ppos + count - 1, element_size); 1049 ssize_t s = 0; 1050 struct ras_badpage *bps = NULL; 1051 unsigned int bps_count = 0; 1052 1053 memset(buf, 0, count); 1054 1055 if (amdgpu_ras_badpages_read(adev, &bps, &bps_count)) 1056 return 0; 1057 1058 for (; start < end && start < bps_count; start++) 1059 s += scnprintf(&buf[s], element_size + 1, 1060 "0x%08x : 0x%08x : %1s\n", 1061 bps[start].bp, 1062 bps[start].size, 1063 amdgpu_ras_badpage_flags_str(bps[start].flags)); 1064 1065 kfree(bps); 1066 1067 return s; 1068 } 1069 1070 static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev, 1071 struct device_attribute *attr, char *buf) 1072 { 1073 struct amdgpu_ras *con = 1074 container_of(attr, struct amdgpu_ras, features_attr); 1075 1076 return scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features); 1077 } 1078 1079 static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev) 1080 { 1081 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1082 1083 sysfs_remove_file_from_group(&adev->dev->kobj, 1084 &con->badpages_attr.attr, 1085 RAS_FS_NAME); 1086 } 1087 1088 static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev) 1089 { 1090 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1091 struct attribute *attrs[] = { 1092 &con->features_attr.attr, 1093 NULL 1094 }; 1095 struct attribute_group group = { 1096 
.name = RAS_FS_NAME, 1097 .attrs = attrs, 1098 }; 1099 1100 sysfs_remove_group(&adev->dev->kobj, &group); 1101 1102 return 0; 1103 } 1104 1105 int amdgpu_ras_sysfs_create(struct amdgpu_device *adev, 1106 struct ras_fs_if *head) 1107 { 1108 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head); 1109 1110 if (!obj || obj->attr_inuse) 1111 return -EINVAL; 1112 1113 get_obj(obj); 1114 1115 memcpy(obj->fs_data.sysfs_name, 1116 head->sysfs_name, 1117 sizeof(obj->fs_data.sysfs_name)); 1118 1119 obj->sysfs_attr = (struct device_attribute){ 1120 .attr = { 1121 .name = obj->fs_data.sysfs_name, 1122 .mode = S_IRUGO, 1123 }, 1124 .show = amdgpu_ras_sysfs_read, 1125 }; 1126 sysfs_attr_init(&obj->sysfs_attr.attr); 1127 1128 if (sysfs_add_file_to_group(&adev->dev->kobj, 1129 &obj->sysfs_attr.attr, 1130 RAS_FS_NAME)) { 1131 put_obj(obj); 1132 return -EINVAL; 1133 } 1134 1135 obj->attr_inuse = 1; 1136 1137 return 0; 1138 } 1139 1140 int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev, 1141 struct ras_common_if *head) 1142 { 1143 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); 1144 1145 if (!obj || !obj->attr_inuse) 1146 return -EINVAL; 1147 1148 sysfs_remove_file_from_group(&adev->dev->kobj, 1149 &obj->sysfs_attr.attr, 1150 RAS_FS_NAME); 1151 obj->attr_inuse = 0; 1152 put_obj(obj); 1153 1154 return 0; 1155 } 1156 1157 static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev) 1158 { 1159 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1160 struct ras_manager *obj, *tmp; 1161 1162 list_for_each_entry_safe(obj, tmp, &con->head, node) { 1163 amdgpu_ras_sysfs_remove(adev, &obj->head); 1164 } 1165 1166 if (amdgpu_bad_page_threshold != 0) 1167 amdgpu_ras_sysfs_remove_bad_page_node(adev); 1168 1169 amdgpu_ras_sysfs_remove_feature_node(adev); 1170 1171 return 0; 1172 } 1173 /* sysfs end */ 1174 1175 /** 1176 * DOC: AMDGPU RAS Reboot Behavior for Unrecoverable Errors 1177 * 1178 * Normally when there is an uncorrectable error, the driver will reset 
1179 * the GPU to recover. However, in the event of an unrecoverable error, 1180 * the driver provides an interface to reboot the system automatically 1181 * in that event. 1182 * 1183 * The following file in debugfs provides that interface: 1184 * /sys/kernel/debug/dri/[0/1/2...]/ras/auto_reboot 1185 * 1186 * Usage: 1187 * 1188 * .. code-block:: bash 1189 * 1190 * echo true > .../ras/auto_reboot 1191 * 1192 */ 1193 /* debugfs begin */ 1194 static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev) 1195 { 1196 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1197 struct dentry *dir; 1198 struct drm_minor *minor = adev_to_drm(adev)->primary; 1199 1200 dir = debugfs_create_dir(RAS_FS_NAME, minor->debugfs_root); 1201 debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, dir, adev, 1202 &amdgpu_ras_debugfs_ctrl_ops); 1203 debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, dir, adev, 1204 &amdgpu_ras_debugfs_eeprom_ops); 1205 1206 /* 1207 * After one uncorrectable error happens, usually GPU recovery will 1208 * be scheduled. But due to the known problem in GPU recovery failing 1209 * to bring GPU back, below interface provides one direct way to 1210 * user to reboot system automatically in such case within 1211 * ERREVENT_ATHUB_INTERRUPT generated. Normal GPU recovery routine 1212 * will never be called. 1213 */ 1214 debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, dir, &con->reboot); 1215 1216 /* 1217 * User could set this not to clean up hardware's error count register 1218 * of RAS IPs during ras recovery. 
1219 */ 1220 debugfs_create_bool("disable_ras_err_cnt_harvest", 0644, dir, 1221 &con->disable_ras_err_cnt_harvest); 1222 return dir; 1223 } 1224 1225 static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev, 1226 struct ras_fs_if *head, 1227 struct dentry *dir) 1228 { 1229 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head); 1230 1231 if (!obj || !dir) 1232 return; 1233 1234 get_obj(obj); 1235 1236 memcpy(obj->fs_data.debugfs_name, 1237 head->debugfs_name, 1238 sizeof(obj->fs_data.debugfs_name)); 1239 1240 debugfs_create_file(obj->fs_data.debugfs_name, S_IWUGO | S_IRUGO, dir, 1241 obj, &amdgpu_ras_debugfs_ops); 1242 } 1243 1244 void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev) 1245 { 1246 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1247 struct dentry *dir; 1248 struct ras_manager *obj; 1249 struct ras_fs_if fs_info; 1250 1251 /* 1252 * it won't be called in resume path, no need to check 1253 * suspend and gpu reset status 1254 */ 1255 if (!IS_ENABLED(CONFIG_DEBUG_FS) || !con) 1256 return; 1257 1258 dir = amdgpu_ras_debugfs_create_ctrl_node(adev); 1259 1260 list_for_each_entry(obj, &con->head, node) { 1261 if (amdgpu_ras_is_supported(adev, obj->head.block) && 1262 (obj->attr_inuse == 1)) { 1263 sprintf(fs_info.debugfs_name, "%s_err_inject", 1264 ras_block_str(obj->head.block)); 1265 fs_info.head = obj->head; 1266 amdgpu_ras_debugfs_create(adev, &fs_info, dir); 1267 } 1268 } 1269 } 1270 1271 /* debugfs end */ 1272 1273 /* ras fs */ 1274 static BIN_ATTR(gpu_vram_bad_pages, S_IRUGO, 1275 amdgpu_ras_sysfs_badpages_read, NULL, 0); 1276 static DEVICE_ATTR(features, S_IRUGO, 1277 amdgpu_ras_sysfs_features_read, NULL); 1278 static int amdgpu_ras_fs_init(struct amdgpu_device *adev) 1279 { 1280 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1281 struct attribute_group group = { 1282 .name = RAS_FS_NAME, 1283 }; 1284 struct attribute *attrs[] = { 1285 &con->features_attr.attr, 1286 NULL 1287 }; 1288 struct bin_attribute 
*bin_attrs[] = { 1289 NULL, 1290 NULL, 1291 }; 1292 int r; 1293 1294 /* add features entry */ 1295 con->features_attr = dev_attr_features; 1296 group.attrs = attrs; 1297 sysfs_attr_init(attrs[0]); 1298 1299 if (amdgpu_bad_page_threshold != 0) { 1300 /* add bad_page_features entry */ 1301 bin_attr_gpu_vram_bad_pages.private = NULL; 1302 con->badpages_attr = bin_attr_gpu_vram_bad_pages; 1303 bin_attrs[0] = &con->badpages_attr; 1304 group.bin_attrs = bin_attrs; 1305 sysfs_bin_attr_init(bin_attrs[0]); 1306 } 1307 1308 r = sysfs_create_group(&adev->dev->kobj, &group); 1309 if (r) 1310 dev_err(adev->dev, "Failed to create RAS sysfs group!"); 1311 1312 return 0; 1313 } 1314 1315 static int amdgpu_ras_fs_fini(struct amdgpu_device *adev) 1316 { 1317 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1318 struct ras_manager *con_obj, *ip_obj, *tmp; 1319 1320 if (IS_ENABLED(CONFIG_DEBUG_FS)) { 1321 list_for_each_entry_safe(con_obj, tmp, &con->head, node) { 1322 ip_obj = amdgpu_ras_find_obj(adev, &con_obj->head); 1323 if (ip_obj) 1324 put_obj(ip_obj); 1325 } 1326 } 1327 1328 amdgpu_ras_sysfs_remove_all(adev); 1329 return 0; 1330 } 1331 /* ras fs end */ 1332 1333 /* ih begin */ 1334 static void amdgpu_ras_interrupt_handler(struct ras_manager *obj) 1335 { 1336 struct ras_ih_data *data = &obj->ih_data; 1337 struct amdgpu_iv_entry entry; 1338 int ret; 1339 struct ras_err_data err_data = {0, 0, 0, NULL}; 1340 1341 while (data->rptr != data->wptr) { 1342 rmb(); 1343 memcpy(&entry, &data->ring[data->rptr], 1344 data->element_size); 1345 1346 wmb(); 1347 data->rptr = (data->aligned_element_size + 1348 data->rptr) % data->ring_size; 1349 1350 /* Let IP handle its data, maybe we need get the output 1351 * from the callback to udpate the error type/count, etc 1352 */ 1353 if (data->cb) { 1354 ret = data->cb(obj->adev, &err_data, &entry); 1355 /* ue will trigger an interrupt, and in that case 1356 * we need do a reset to recovery the whole system. 
1357 * But leave IP do that recovery, here we just dispatch 1358 * the error. 1359 */ 1360 if (ret == AMDGPU_RAS_SUCCESS) { 1361 /* these counts could be left as 0 if 1362 * some blocks do not count error number 1363 */ 1364 obj->err_data.ue_count += err_data.ue_count; 1365 obj->err_data.ce_count += err_data.ce_count; 1366 } 1367 } 1368 } 1369 } 1370 1371 static void amdgpu_ras_interrupt_process_handler(struct work_struct *work) 1372 { 1373 struct ras_ih_data *data = 1374 container_of(work, struct ras_ih_data, ih_work); 1375 struct ras_manager *obj = 1376 container_of(data, struct ras_manager, ih_data); 1377 1378 amdgpu_ras_interrupt_handler(obj); 1379 } 1380 1381 int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev, 1382 struct ras_dispatch_if *info) 1383 { 1384 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); 1385 struct ras_ih_data *data = &obj->ih_data; 1386 1387 if (!obj) 1388 return -EINVAL; 1389 1390 if (data->inuse == 0) 1391 return 0; 1392 1393 /* Might be overflow... 
*/ 1394 memcpy(&data->ring[data->wptr], info->entry, 1395 data->element_size); 1396 1397 wmb(); 1398 data->wptr = (data->aligned_element_size + 1399 data->wptr) % data->ring_size; 1400 1401 schedule_work(&data->ih_work); 1402 1403 return 0; 1404 } 1405 1406 int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev, 1407 struct ras_ih_if *info) 1408 { 1409 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); 1410 struct ras_ih_data *data; 1411 1412 if (!obj) 1413 return -EINVAL; 1414 1415 data = &obj->ih_data; 1416 if (data->inuse == 0) 1417 return 0; 1418 1419 cancel_work_sync(&data->ih_work); 1420 1421 kfree(data->ring); 1422 memset(data, 0, sizeof(*data)); 1423 put_obj(obj); 1424 1425 return 0; 1426 } 1427 1428 int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev, 1429 struct ras_ih_if *info) 1430 { 1431 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); 1432 struct ras_ih_data *data; 1433 1434 if (!obj) { 1435 /* in case we registe the IH before enable ras feature */ 1436 obj = amdgpu_ras_create_obj(adev, &info->head); 1437 if (!obj) 1438 return -EINVAL; 1439 } else 1440 get_obj(obj); 1441 1442 data = &obj->ih_data; 1443 /* add the callback.etc */ 1444 *data = (struct ras_ih_data) { 1445 .inuse = 0, 1446 .cb = info->cb, 1447 .element_size = sizeof(struct amdgpu_iv_entry), 1448 .rptr = 0, 1449 .wptr = 0, 1450 }; 1451 1452 INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler); 1453 1454 data->aligned_element_size = ALIGN(data->element_size, 8); 1455 /* the ring can store 64 iv entries. 
*/ 1456 data->ring_size = 64 * data->aligned_element_size; 1457 data->ring = kmalloc(data->ring_size, GFP_KERNEL); 1458 if (!data->ring) { 1459 put_obj(obj); 1460 return -ENOMEM; 1461 } 1462 1463 /* IH is ready */ 1464 data->inuse = 1; 1465 1466 return 0; 1467 } 1468 1469 static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev) 1470 { 1471 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1472 struct ras_manager *obj, *tmp; 1473 1474 list_for_each_entry_safe(obj, tmp, &con->head, node) { 1475 struct ras_ih_if info = { 1476 .head = obj->head, 1477 }; 1478 amdgpu_ras_interrupt_remove_handler(adev, &info); 1479 } 1480 1481 return 0; 1482 } 1483 /* ih end */ 1484 1485 /* traversal all IPs except NBIO to query error counter */ 1486 static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev) 1487 { 1488 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1489 struct ras_manager *obj; 1490 1491 if (!adev->ras_features || !con) 1492 return; 1493 1494 list_for_each_entry(obj, &con->head, node) { 1495 struct ras_query_if info = { 1496 .head = obj->head, 1497 }; 1498 1499 /* 1500 * PCIE_BIF IP has one different isr by ras controller 1501 * interrupt, the specific ras counter query will be 1502 * done in that isr. So skip such block from common 1503 * sync flood interrupt isr calling. 
1504 */ 1505 if (info.head.block == AMDGPU_RAS_BLOCK__PCIE_BIF) 1506 continue; 1507 1508 amdgpu_ras_query_error_status(adev, &info); 1509 } 1510 } 1511 1512 /* Parse RdRspStatus and WrRspStatus */ 1513 static void amdgpu_ras_error_status_query(struct amdgpu_device *adev, 1514 struct ras_query_if *info) 1515 { 1516 /* 1517 * Only two block need to query read/write 1518 * RspStatus at current state 1519 */ 1520 switch (info->head.block) { 1521 case AMDGPU_RAS_BLOCK__GFX: 1522 if (adev->gfx.ras_funcs && 1523 adev->gfx.ras_funcs->query_ras_error_status) 1524 adev->gfx.ras_funcs->query_ras_error_status(adev); 1525 break; 1526 case AMDGPU_RAS_BLOCK__MMHUB: 1527 if (adev->mmhub.ras_funcs && 1528 adev->mmhub.ras_funcs->query_ras_error_status) 1529 adev->mmhub.ras_funcs->query_ras_error_status(adev); 1530 break; 1531 default: 1532 break; 1533 } 1534 } 1535 1536 static void amdgpu_ras_query_err_status(struct amdgpu_device *adev) 1537 { 1538 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1539 struct ras_manager *obj; 1540 1541 if (!adev->ras_features || !con) 1542 return; 1543 1544 list_for_each_entry(obj, &con->head, node) { 1545 struct ras_query_if info = { 1546 .head = obj->head, 1547 }; 1548 1549 amdgpu_ras_error_status_query(adev, &info); 1550 } 1551 } 1552 1553 /* recovery begin */ 1554 1555 /* return 0 on success. 1556 * caller need free bps. 
1557 */ 1558 static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, 1559 struct ras_badpage **bps, unsigned int *count) 1560 { 1561 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1562 struct ras_err_handler_data *data; 1563 int i = 0; 1564 int ret = 0, status; 1565 1566 if (!con || !con->eh_data || !bps || !count) 1567 return -EINVAL; 1568 1569 mutex_lock(&con->recovery_lock); 1570 data = con->eh_data; 1571 if (!data || data->count == 0) { 1572 *bps = NULL; 1573 ret = -EINVAL; 1574 goto out; 1575 } 1576 1577 *bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL); 1578 if (!*bps) { 1579 ret = -ENOMEM; 1580 goto out; 1581 } 1582 1583 for (; i < data->count; i++) { 1584 (*bps)[i] = (struct ras_badpage){ 1585 .bp = data->bps[i].retired_page, 1586 .size = AMDGPU_GPU_PAGE_SIZE, 1587 .flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED, 1588 }; 1589 status = amdgpu_vram_mgr_query_page_status( 1590 ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM), 1591 data->bps[i].retired_page); 1592 if (status == -EBUSY) 1593 (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING; 1594 else if (status == -ENOENT) 1595 (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT; 1596 } 1597 1598 *count = data->count; 1599 out: 1600 mutex_unlock(&con->recovery_lock); 1601 return ret; 1602 } 1603 1604 static void amdgpu_ras_do_recovery(struct work_struct *work) 1605 { 1606 struct amdgpu_ras *ras = 1607 container_of(work, struct amdgpu_ras, recovery_work); 1608 struct amdgpu_device *remote_adev = NULL; 1609 struct amdgpu_device *adev = ras->adev; 1610 struct list_head device_list, *device_list_handle = NULL; 1611 1612 if (!ras->disable_ras_err_cnt_harvest) { 1613 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 1614 1615 /* Build list of devices to query RAS related errors */ 1616 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) { 1617 device_list_handle = &hive->device_list; 1618 } else { 1619 INIT_LIST_HEAD(&device_list); 1620 list_add_tail(&adev->gmc.xgmi.head, &device_list); 
1621 device_list_handle = &device_list; 1622 } 1623 1624 list_for_each_entry(remote_adev, 1625 device_list_handle, gmc.xgmi.head) { 1626 amdgpu_ras_query_err_status(remote_adev); 1627 amdgpu_ras_log_on_err_counter(remote_adev); 1628 } 1629 1630 amdgpu_put_xgmi_hive(hive); 1631 } 1632 1633 if (amdgpu_device_should_recover_gpu(ras->adev)) 1634 amdgpu_device_gpu_recover(ras->adev, NULL); 1635 atomic_set(&ras->in_recovery, 0); 1636 } 1637 1638 /* alloc/realloc bps array */ 1639 static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev, 1640 struct ras_err_handler_data *data, int pages) 1641 { 1642 unsigned int old_space = data->count + data->space_left; 1643 unsigned int new_space = old_space + pages; 1644 unsigned int align_space = ALIGN(new_space, 512); 1645 void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL); 1646 1647 if (!bps) { 1648 kfree(bps); 1649 return -ENOMEM; 1650 } 1651 1652 if (data->bps) { 1653 memcpy(bps, data->bps, 1654 data->count * sizeof(*data->bps)); 1655 kfree(data->bps); 1656 } 1657 1658 data->bps = bps; 1659 data->space_left += align_space - old_space; 1660 return 0; 1661 } 1662 1663 /* it deal with vram only. 
*/ 1664 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, 1665 struct eeprom_table_record *bps, int pages) 1666 { 1667 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1668 struct ras_err_handler_data *data; 1669 int ret = 0; 1670 uint32_t i; 1671 1672 if (!con || !con->eh_data || !bps || pages <= 0) 1673 return 0; 1674 1675 mutex_lock(&con->recovery_lock); 1676 data = con->eh_data; 1677 if (!data) 1678 goto out; 1679 1680 for (i = 0; i < pages; i++) { 1681 if (amdgpu_ras_check_bad_page_unlock(con, 1682 bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT)) 1683 continue; 1684 1685 if (!data->space_left && 1686 amdgpu_ras_realloc_eh_data_space(adev, data, 256)) { 1687 ret = -ENOMEM; 1688 goto out; 1689 } 1690 1691 amdgpu_vram_mgr_reserve_range( 1692 ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM), 1693 bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT, 1694 AMDGPU_GPU_PAGE_SIZE); 1695 1696 memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps)); 1697 data->count++; 1698 data->space_left--; 1699 } 1700 out: 1701 mutex_unlock(&con->recovery_lock); 1702 1703 return ret; 1704 } 1705 1706 /* 1707 * write error record array to eeprom, the function should be 1708 * protected by recovery_lock 1709 */ 1710 int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) 1711 { 1712 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1713 struct ras_err_handler_data *data; 1714 struct amdgpu_ras_eeprom_control *control; 1715 int save_count; 1716 1717 if (!con || !con->eh_data) 1718 return 0; 1719 1720 control = &con->eeprom_control; 1721 data = con->eh_data; 1722 save_count = data->count - control->num_recs; 1723 /* only new entries are saved */ 1724 if (save_count > 0) { 1725 if (amdgpu_ras_eeprom_process_recods(control, 1726 &data->bps[control->num_recs], 1727 true, 1728 save_count)) { 1729 dev_err(adev->dev, "Failed to save EEPROM table data!"); 1730 return -EIO; 1731 } 1732 1733 dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count); 1734 } 1735 1736 
return 0; 1737 } 1738 1739 /* 1740 * read error record array in eeprom and reserve enough space for 1741 * storing new bad pages 1742 */ 1743 static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) 1744 { 1745 struct amdgpu_ras_eeprom_control *control = 1746 &adev->psp.ras.ras->eeprom_control; 1747 struct eeprom_table_record *bps = NULL; 1748 int ret = 0; 1749 1750 /* no bad page record, skip eeprom access */ 1751 if (!control->num_recs || (amdgpu_bad_page_threshold == 0)) 1752 return ret; 1753 1754 bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL); 1755 if (!bps) 1756 return -ENOMEM; 1757 1758 if (amdgpu_ras_eeprom_process_recods(control, bps, false, 1759 control->num_recs)) { 1760 dev_err(adev->dev, "Failed to load EEPROM table records!"); 1761 ret = -EIO; 1762 goto out; 1763 } 1764 1765 ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs); 1766 1767 out: 1768 kfree(bps); 1769 return ret; 1770 } 1771 1772 static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, 1773 uint64_t addr) 1774 { 1775 struct ras_err_handler_data *data = con->eh_data; 1776 int i; 1777 1778 addr >>= AMDGPU_GPU_PAGE_SHIFT; 1779 for (i = 0; i < data->count; i++) 1780 if (addr == data->bps[i].retired_page) 1781 return true; 1782 1783 return false; 1784 } 1785 1786 /* 1787 * check if an address belongs to bad page 1788 * 1789 * Note: this check is only for umc block 1790 */ 1791 static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, 1792 uint64_t addr) 1793 { 1794 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1795 bool ret = false; 1796 1797 if (!con || !con->eh_data) 1798 return ret; 1799 1800 mutex_lock(&con->recovery_lock); 1801 ret = amdgpu_ras_check_bad_page_unlock(con, addr); 1802 mutex_unlock(&con->recovery_lock); 1803 return ret; 1804 } 1805 1806 static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev, 1807 uint32_t max_length) 1808 { 1809 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1810 int tmp_threshold 
= amdgpu_bad_page_threshold; 1811 u64 val; 1812 1813 /* 1814 * Justification of value bad_page_cnt_threshold in ras structure 1815 * 1816 * Generally, -1 <= amdgpu_bad_page_threshold <= max record length 1817 * in eeprom, and introduce two scenarios accordingly. 1818 * 1819 * Bad page retirement enablement: 1820 * - If amdgpu_bad_page_threshold = -1, 1821 * bad_page_cnt_threshold = typical value by formula. 1822 * 1823 * - When the value from user is 0 < amdgpu_bad_page_threshold < 1824 * max record length in eeprom, use it directly. 1825 * 1826 * Bad page retirement disablement: 1827 * - If amdgpu_bad_page_threshold = 0, bad page retirement 1828 * functionality is disabled, and bad_page_cnt_threshold will 1829 * take no effect. 1830 */ 1831 1832 if (tmp_threshold < -1) 1833 tmp_threshold = -1; 1834 else if (tmp_threshold > max_length) 1835 tmp_threshold = max_length; 1836 1837 if (tmp_threshold == -1) { 1838 val = adev->gmc.mc_vram_size; 1839 do_div(val, RAS_BAD_PAGE_RATE); 1840 con->bad_page_cnt_threshold = min(lower_32_bits(val), 1841 max_length); 1842 } else { 1843 con->bad_page_cnt_threshold = tmp_threshold; 1844 } 1845 } 1846 1847 int amdgpu_ras_recovery_init(struct amdgpu_device *adev) 1848 { 1849 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1850 struct ras_err_handler_data **data; 1851 uint32_t max_eeprom_records_len = 0; 1852 bool exc_err_limit = false; 1853 int ret; 1854 1855 if (adev->ras_features && con) 1856 data = &con->eh_data; 1857 else 1858 return 0; 1859 1860 *data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO); 1861 if (!*data) { 1862 ret = -ENOMEM; 1863 goto out; 1864 } 1865 1866 mutex_init(&con->recovery_lock); 1867 INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery); 1868 atomic_set(&con->in_recovery, 0); 1869 con->adev = adev; 1870 1871 max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length(); 1872 amdgpu_ras_validate_threshold(adev, max_eeprom_records_len); 1873 1874 /* Todo: During test the SMU might fail to read 
the eeprom through I2C 1875 * when the GPU is pending on XGMI reset during probe time 1876 * (Mostly after second bus reset), skip it now 1877 */ 1878 if (adev->gmc.xgmi.pending_reset) 1879 return 0; 1880 ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit); 1881 /* 1882 * This calling fails when exc_err_limit is true or 1883 * ret != 0. 1884 */ 1885 if (exc_err_limit || ret) 1886 goto free; 1887 1888 if (con->eeprom_control.num_recs) { 1889 ret = amdgpu_ras_load_bad_pages(adev); 1890 if (ret) 1891 goto free; 1892 } 1893 1894 return 0; 1895 1896 free: 1897 kfree((*data)->bps); 1898 kfree(*data); 1899 con->eh_data = NULL; 1900 out: 1901 dev_warn(adev->dev, "Failed to initialize ras recovery!\n"); 1902 1903 /* 1904 * Except error threshold exceeding case, other failure cases in this 1905 * function would not fail amdgpu driver init. 1906 */ 1907 if (!exc_err_limit) 1908 ret = 0; 1909 else 1910 ret = -EINVAL; 1911 1912 return ret; 1913 } 1914 1915 static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) 1916 { 1917 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1918 struct ras_err_handler_data *data = con->eh_data; 1919 1920 /* recovery_init failed to init it, fini is useless */ 1921 if (!data) 1922 return 0; 1923 1924 cancel_work_sync(&con->recovery_work); 1925 1926 mutex_lock(&con->recovery_lock); 1927 con->eh_data = NULL; 1928 kfree(data->bps); 1929 kfree(data); 1930 mutex_unlock(&con->recovery_lock); 1931 1932 return 0; 1933 } 1934 /* recovery end */ 1935 1936 /* return 0 if ras will reset gpu and repost.*/ 1937 int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev, 1938 unsigned int block) 1939 { 1940 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 1941 1942 if (!ras) 1943 return -EINVAL; 1944 1945 ras->flags |= AMDGPU_RAS_FLAG_INIT_NEED_RESET; 1946 return 0; 1947 } 1948 1949 static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev) 1950 { 1951 return adev->asic_type == CHIP_VEGA10 || 1952 adev->asic_type == 
CHIP_VEGA20 || 1953 adev->asic_type == CHIP_ARCTURUS || 1954 adev->asic_type == CHIP_ALDEBARAN || 1955 adev->asic_type == CHIP_SIENNA_CICHLID; 1956 } 1957 1958 /* 1959 * check hardware's ras ability which will be saved in hw_supported. 1960 * if hardware does not support ras, we can skip some ras initializtion and 1961 * forbid some ras operations from IP. 1962 * if software itself, say boot parameter, limit the ras ability. We still 1963 * need allow IP do some limited operations, like disable. In such case, 1964 * we have to initialize ras as normal. but need check if operation is 1965 * allowed or not in each function. 1966 */ 1967 static void amdgpu_ras_check_supported(struct amdgpu_device *adev, 1968 uint32_t *hw_supported, uint32_t *supported) 1969 { 1970 *hw_supported = 0; 1971 *supported = 0; 1972 1973 if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw || 1974 !amdgpu_ras_asic_supported(adev)) 1975 return; 1976 1977 if (!adev->gmc.xgmi.connected_to_cpu) { 1978 if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { 1979 dev_info(adev->dev, "MEM ECC is active.\n"); 1980 *hw_supported |= (1 << AMDGPU_RAS_BLOCK__UMC | 1981 1 << AMDGPU_RAS_BLOCK__DF); 1982 } else { 1983 dev_info(adev->dev, "MEM ECC is not presented.\n"); 1984 } 1985 1986 if (amdgpu_atomfirmware_sram_ecc_supported(adev)) { 1987 dev_info(adev->dev, "SRAM ECC is active.\n"); 1988 *hw_supported |= ~(1 << AMDGPU_RAS_BLOCK__UMC | 1989 1 << AMDGPU_RAS_BLOCK__DF); 1990 } else { 1991 dev_info(adev->dev, "SRAM ECC is not presented.\n"); 1992 } 1993 } else { 1994 /* driver only manages a few IP blocks RAS feature 1995 * when GPU is connected cpu through XGMI */ 1996 *hw_supported |= (1 << AMDGPU_RAS_BLOCK__GFX | 1997 1 << AMDGPU_RAS_BLOCK__SDMA | 1998 1 << AMDGPU_RAS_BLOCK__MMHUB); 1999 } 2000 2001 /* hw_supported needs to be aligned with RAS block mask. */ 2002 *hw_supported &= AMDGPU_RAS_BLOCK_MASK; 2003 2004 *supported = amdgpu_ras_enable == 0 ? 
2005 0 : *hw_supported & amdgpu_ras_mask; 2006 adev->ras_features = *supported; 2007 } 2008 2009 int amdgpu_ras_init(struct amdgpu_device *adev) 2010 { 2011 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2012 int r; 2013 2014 if (con) 2015 return 0; 2016 2017 con = kmalloc(sizeof(struct amdgpu_ras) + 2018 sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT, 2019 GFP_KERNEL|__GFP_ZERO); 2020 if (!con) 2021 return -ENOMEM; 2022 2023 con->objs = (struct ras_manager *)(con + 1); 2024 2025 amdgpu_ras_set_context(adev, con); 2026 2027 amdgpu_ras_check_supported(adev, &con->hw_supported, 2028 &con->supported); 2029 if (!con->hw_supported || (adev->asic_type == CHIP_VEGA10)) { 2030 /* set gfx block ras context feature for VEGA20 Gaming 2031 * send ras disable cmd to ras ta during ras late init. 2032 */ 2033 if (!adev->ras_features && adev->asic_type == CHIP_VEGA20) { 2034 con->features |= BIT(AMDGPU_RAS_BLOCK__GFX); 2035 2036 return 0; 2037 } 2038 2039 r = 0; 2040 goto release_con; 2041 } 2042 2043 con->features = 0; 2044 INIT_LIST_HEAD(&con->head); 2045 /* Might need get this flag from vbios. 
*/ 2046 con->flags = RAS_DEFAULT_FLAGS; 2047 2048 /* initialize nbio ras function ahead of any other 2049 * ras functions so hardware fatal error interrupt 2050 * can be enabled as early as possible */ 2051 switch (adev->asic_type) { 2052 case CHIP_VEGA20: 2053 case CHIP_ARCTURUS: 2054 case CHIP_ALDEBARAN: 2055 if (!adev->gmc.xgmi.connected_to_cpu) 2056 adev->nbio.ras_funcs = &nbio_v7_4_ras_funcs; 2057 break; 2058 default: 2059 /* nbio ras is not available */ 2060 break; 2061 } 2062 2063 if (adev->nbio.ras_funcs && 2064 adev->nbio.ras_funcs->init_ras_controller_interrupt) { 2065 r = adev->nbio.ras_funcs->init_ras_controller_interrupt(adev); 2066 if (r) 2067 goto release_con; 2068 } 2069 2070 if (adev->nbio.ras_funcs && 2071 adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt) { 2072 r = adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt(adev); 2073 if (r) 2074 goto release_con; 2075 } 2076 2077 if (amdgpu_ras_fs_init(adev)) { 2078 r = -EINVAL; 2079 goto release_con; 2080 } 2081 2082 dev_info(adev->dev, "RAS INFO: ras initialized successfully, " 2083 "hardware ability[%x] ras_mask[%x]\n", 2084 con->hw_supported, con->supported); 2085 return 0; 2086 release_con: 2087 amdgpu_ras_set_context(adev, NULL); 2088 kfree(con); 2089 2090 return r; 2091 } 2092 2093 /* helper function to handle common stuff in ip late init phase */ 2094 int amdgpu_ras_late_init(struct amdgpu_device *adev, 2095 struct ras_common_if *ras_block, 2096 struct ras_fs_if *fs_info, 2097 struct ras_ih_if *ih_info) 2098 { 2099 int r; 2100 2101 /* disable RAS feature per IP block if it is not supported */ 2102 if (!amdgpu_ras_is_supported(adev, ras_block->block)) { 2103 amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0); 2104 return 0; 2105 } 2106 2107 r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1); 2108 if (r) { 2109 if (r == -EAGAIN) { 2110 /* request gpu reset. 
will run again */ 2111 amdgpu_ras_request_reset_on_boot(adev, 2112 ras_block->block); 2113 return 0; 2114 } else if (adev->in_suspend || amdgpu_in_reset(adev)) { 2115 /* in resume phase, if fail to enable ras, 2116 * clean up all ras fs nodes, and disable ras */ 2117 goto cleanup; 2118 } else 2119 return r; 2120 } 2121 2122 /* in resume phase, no need to create ras fs node */ 2123 if (adev->in_suspend || amdgpu_in_reset(adev)) 2124 return 0; 2125 2126 if (ih_info->cb) { 2127 r = amdgpu_ras_interrupt_add_handler(adev, ih_info); 2128 if (r) 2129 goto interrupt; 2130 } 2131 2132 r = amdgpu_ras_sysfs_create(adev, fs_info); 2133 if (r) 2134 goto sysfs; 2135 2136 return 0; 2137 cleanup: 2138 amdgpu_ras_sysfs_remove(adev, ras_block); 2139 sysfs: 2140 if (ih_info->cb) 2141 amdgpu_ras_interrupt_remove_handler(adev, ih_info); 2142 interrupt: 2143 amdgpu_ras_feature_enable(adev, ras_block, 0); 2144 return r; 2145 } 2146 2147 /* helper function to remove ras fs node and interrupt handler */ 2148 void amdgpu_ras_late_fini(struct amdgpu_device *adev, 2149 struct ras_common_if *ras_block, 2150 struct ras_ih_if *ih_info) 2151 { 2152 if (!ras_block || !ih_info) 2153 return; 2154 2155 amdgpu_ras_sysfs_remove(adev, ras_block); 2156 if (ih_info->cb) 2157 amdgpu_ras_interrupt_remove_handler(adev, ih_info); 2158 amdgpu_ras_feature_enable(adev, ras_block, 0); 2159 } 2160 2161 /* do some init work after IP late init as dependence. 2162 * and it runs in resume/gpu reset/booting up cases. 2163 */ 2164 void amdgpu_ras_resume(struct amdgpu_device *adev) 2165 { 2166 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2167 struct ras_manager *obj, *tmp; 2168 2169 if (!adev->ras_features || !con) { 2170 /* clean ras context for VEGA20 Gaming after send ras disable cmd */ 2171 amdgpu_release_ras_context(adev); 2172 2173 return; 2174 } 2175 2176 if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) { 2177 /* Set up all other IPs which are not implemented. 
There is a 2178 * tricky thing that IP's actual ras error type should be 2179 * MULTI_UNCORRECTABLE, but as driver does not handle it, so 2180 * ERROR_NONE make sense anyway. 2181 */ 2182 amdgpu_ras_enable_all_features(adev, 1); 2183 2184 /* We enable ras on all hw_supported block, but as boot 2185 * parameter might disable some of them and one or more IP has 2186 * not implemented yet. So we disable them on behalf. 2187 */ 2188 list_for_each_entry_safe(obj, tmp, &con->head, node) { 2189 if (!amdgpu_ras_is_supported(adev, obj->head.block)) { 2190 amdgpu_ras_feature_enable(adev, &obj->head, 0); 2191 /* there should be no any reference. */ 2192 WARN_ON(alive_obj(obj)); 2193 } 2194 } 2195 } 2196 2197 if (con->flags & AMDGPU_RAS_FLAG_INIT_NEED_RESET) { 2198 con->flags &= ~AMDGPU_RAS_FLAG_INIT_NEED_RESET; 2199 /* setup ras obj state as disabled. 2200 * for init_by_vbios case. 2201 * if we want to enable ras, just enable it in a normal way. 2202 * If we want do disable it, need setup ras obj as enabled, 2203 * then issue another TA disable cmd. 2204 * See feature_enable_on_boot 2205 */ 2206 amdgpu_ras_disable_all_features(adev, 1); 2207 amdgpu_ras_reset_gpu(adev); 2208 } 2209 } 2210 2211 void amdgpu_ras_suspend(struct amdgpu_device *adev) 2212 { 2213 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2214 2215 if (!adev->ras_features || !con) 2216 return; 2217 2218 amdgpu_ras_disable_all_features(adev, 0); 2219 /* Make sure all ras objects are disabled. 
*/ 2220 if (con->features) 2221 amdgpu_ras_disable_all_features(adev, 1); 2222 } 2223 2224 /* do some fini work before IP fini as dependence */ 2225 int amdgpu_ras_pre_fini(struct amdgpu_device *adev) 2226 { 2227 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2228 2229 if (!adev->ras_features || !con) 2230 return 0; 2231 2232 /* Need disable ras on all IPs here before ip [hw/sw]fini */ 2233 amdgpu_ras_disable_all_features(adev, 0); 2234 amdgpu_ras_recovery_fini(adev); 2235 return 0; 2236 } 2237 2238 int amdgpu_ras_fini(struct amdgpu_device *adev) 2239 { 2240 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2241 2242 if (!adev->ras_features || !con) 2243 return 0; 2244 2245 amdgpu_ras_fs_fini(adev); 2246 amdgpu_ras_interrupt_remove_all(adev); 2247 2248 WARN(con->features, "Feature mask is not cleared"); 2249 2250 if (con->features) 2251 amdgpu_ras_disable_all_features(adev, 1); 2252 2253 amdgpu_ras_set_context(adev, NULL); 2254 kfree(con); 2255 2256 return 0; 2257 } 2258 2259 void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) 2260 { 2261 uint32_t hw_supported, supported; 2262 2263 amdgpu_ras_check_supported(adev, &hw_supported, &supported); 2264 if (!hw_supported) 2265 return; 2266 2267 if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { 2268 dev_info(adev->dev, "uncorrectable hardware error" 2269 "(ERREVENT_ATHUB_INTERRUPT) detected!\n"); 2270 2271 amdgpu_ras_reset_gpu(adev); 2272 } 2273 } 2274 2275 bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev) 2276 { 2277 if (adev->asic_type == CHIP_VEGA20 && 2278 adev->pm.fw_version <= 0x283400) { 2279 return !(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) && 2280 amdgpu_ras_intr_triggered(); 2281 } 2282 2283 return false; 2284 } 2285 2286 void amdgpu_release_ras_context(struct amdgpu_device *adev) 2287 { 2288 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2289 2290 if (!con) 2291 return; 2292 2293 if (!adev->ras_features && con->features & 
BIT(AMDGPU_RAS_BLOCK__GFX)) { 2294 con->features &= ~BIT(AMDGPU_RAS_BLOCK__GFX); 2295 amdgpu_ras_set_context(adev, NULL); 2296 kfree(con); 2297 } 2298 } 2299