/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 *
 */
#include <linux/debugfs.h>
#include <linux/list.h>
#include <linux/module.h>
#include "amdgpu.h"
#include "amdgpu_ras.h"
#include "amdgpu_atomfirmware.h"

struct ras_ih_data {
	/* interrupt bottom half */
	struct work_struct ih_work;
	int inuse;
	/* IP callback */
	ras_ih_cb cb;
	/* full of entries */
	unsigned char *ring;
	unsigned int ring_size;
	unsigned int element_size;
	unsigned int aligned_element_size;
	unsigned int rptr;
	unsigned int wptr;
};

struct ras_fs_data {
	char sysfs_name[32];
	char debugfs_name[32];
};

struct ras_err_data {
	unsigned long ue_count;
	unsigned long ce_count;
};

struct ras_err_handler_data {
	/* point to bad pages array */
	struct {
		unsigned long bp;
		struct amdgpu_bo *bo;
	} *bps;
	/* the count of entries */
	int count;
	/* the space can place new entries */
	int space_left;
	/* last reserved entry's index + 1 */
	int last_reserved;
};

struct ras_manager {
	struct ras_common_if head;
	/* reference count */
	int use;
	/* ras block link */
	struct list_head node;
	/* the device */
	struct amdgpu_device *adev;
	/* debugfs */
	struct dentry *ent;
	/* sysfs */
	struct device_attribute sysfs_attr;
	int attr_inuse;

	/* fs node name */
	struct ras_fs_data fs_data;

	/* IH data */
	struct ras_ih_data ih_data;

	struct ras_err_data err_data;
};

struct ras_badpage {
	unsigned int bp;
	unsigned int size;
	unsigned int flags;
};

const char *ras_error_string[] = {
	"none",
	"parity",
	"single_correctable",
	"multi_uncorrectable",
	"poison",
};

const char *ras_block_string[] = {
	"umc",
	"sdma",
	"gfx",
	"mmhub",
	"athub",
	"pcie_bif",
	"hdp",
	"xgmi_wafl",
	"df",
	"smn",
	"sem",
	"mp0",
	"mp1",
	"fuse",
};

#define ras_err_str(i) (ras_error_string[ffs(i)])
#define ras_block_str(i) (ras_block_string[i])

#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS 1
#define AMDGPU_RAS_FLAG_INIT_NEED_RESET 2
#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)

static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev,
		uint64_t offset, uint64_t size,
		struct amdgpu_bo **bo_ptr);
static int amdgpu_ras_release_vram(struct amdgpu_device *adev,
		struct amdgpu_bo **bo_ptr);

static void amdgpu_ras_self_test(struct amdgpu_device *adev)
{
	/* TODO */
}

static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
		size_t size, loff_t *pos)
{
	struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
	struct ras_query_if info = {
		.head = obj->head,
	};
	ssize_t s;
	char val[128];

	if (amdgpu_ras_error_query(obj->adev, &info))
		return -EINVAL;

	s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
			"ue", info.ue_count,
			"ce", info.ce_count);
	if (*pos >= s)
		return 0;

	s -= *pos;
	s = min_t(u64, s, size);

	if (copy_to_user(buf, &val[*pos], s))
		return -EINVAL;

	*pos += s;

	return s;
}

static const struct file_operations amdgpu_ras_debugfs_ops = {
	.owner = THIS_MODULE,
	.read = amdgpu_ras_debugfs_read,
	.write = NULL,
	.llseek = default_llseek
};

static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
		*block_id = i;
		if (strcmp(name, ras_block_str(i)) == 0)
			return 0;
	}
	return -EINVAL;
}

static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
		const char __user *buf, size_t size,
		loff_t *pos, struct ras_debug_if *data)
{
	ssize_t s = min_t(u64, 64, size);
	char str[65];
	char block_name[33];
	char err[9] = "ue";
	int op = -1;
	int block_id;
	u64 address, value;

	if (*pos)
		return -EINVAL;
	*pos = size;

	memset(str, 0, sizeof(str));
	memset(data, 0, sizeof(*data));

	if (copy_from_user(str, buf, s))
		return -EINVAL;

	if (sscanf(str, "disable %32s", block_name) == 1)
		op = 0;
	else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
		op = 1;
	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
		op = 2;
	else if (str[0] && str[1] && str[2] && str[3])
		/* ascii string, but commands are not matched. */
		return -EINVAL;

	if (op != -1) {
		if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
			return -EINVAL;

		data->head.block = block_id;
		data->head.type = memcmp("ue", err, 2) == 0 ?
			AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE :
			AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
		data->op = op;

		if (op == 2) {
			if (sscanf(str, "%*s %*s %*s %llu %llu",
						&address, &value) != 2)
				if (sscanf(str, "%*s %*s %*s 0x%llx 0x%llx",
							&address, &value) != 2)
					return -EINVAL;
			data->inject.address = address;
			data->inject.value = value;
		}
	} else {
		if (size < sizeof(*data))
			return -EINVAL;

		if (copy_from_user(data, buf, sizeof(*data)))
			return -EINVAL;
	}

	return 0;
}
/**
 * DOC: AMDGPU RAS debugfs control interface
 *
 * It accepts struct ras_debug_if which has two members.
 *
 * First member: ras_debug_if::head or ras_debug_if::inject.
 *
 * head is used to indicate which IP block will be under control.
 *
 * head has four members: block, type, sub_block_index and name.
 * block: which IP will be under control.
 * type: what kind of error will be enabled/disabled/injected.
 * sub_block_index: some IPs have subcomponents, say, GFX and SDMA.
 * name: the name of the IP.
 *
 * inject has two more members than head, namely address and value.
 * As their names indicate, the inject operation will write the
 * value to the address.
 *
 * Second member: struct ras_debug_if::op.
 * It has three kinds of operations.
 * 0: disable RAS on the block. Takes ::head as its data.
 * 1: enable RAS on the block. Takes ::head as its data.
 * 2: inject errors on the block. Takes ::inject as its data.
 *
 * How to use the interface?
 * programs:
 *	copy the struct ras_debug_if into your code and initialize it.
 *	write the struct to the control node.
 *
 * bash:
 *	echo op block [error [address value]] > .../ras/ras_ctrl
 *	op: disable, enable, inject
 *		disable: only block is needed
 *		enable: block and error are needed
 *		inject: error, address and value are needed
 *	block: umc, sdma, gfx, .........
 *		see ras_block_string[] for details
 *	error: ue, ce
 *		ue: multi_uncorrectable
 *		ce: single_correctable
 *
 * here are some examples for bash commands,
 *	echo inject umc ue 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo inject umc ce 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *
 * How to check the result?
 *
 * For disable/enable, please check the ras features at
 * /sys/class/drm/card[0/1/2...]/device/ras/features
 *
 * For inject, please check the corresponding err count at
 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
 *
 * NOTE: operation is only allowed on blocks which are supported.
 * Please check the ras mask at /sys/module/amdgpu/parameters/ras_mask
 */
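
/*
 * A minimal userspace sketch of the struct-based path described above,
 * assuming the program carries its own copy of struct ras_debug_if and of
 * the block/error type enums; the debugfs path and the missing error
 * handling are illustrative only:
 *
 *	int fd = open("/sys/kernel/debug/dri/0/ras/ras_ctrl", O_WRONLY);
 *	struct ras_debug_if data = { 0 };
 *
 *	data.head.block = AMDGPU_RAS_BLOCK__UMC;
 *	data.head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
 *	data.op = 1;	(enable RAS on the block)
 *	write(fd, &data, sizeof(data));
 *	close(fd);
 *
 * Note that a binary write smaller than sizeof(struct ras_debug_if) is
 * rejected with -EINVAL by the parser above.
 */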
static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf,
		size_t size, loff_t *pos)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
	struct ras_debug_if data;
	struct amdgpu_bo *bo;
	int ret = 0;

	ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
	if (ret)
		return -EINVAL;

	if (!amdgpu_ras_is_supported(adev, data.head.block))
		return -EINVAL;

	switch (data.op) {
	case 0:
		ret = amdgpu_ras_feature_enable(adev, &data.head, 0);
		break;
	case 1:
		ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
		break;
	case 2:
		ret = amdgpu_ras_reserve_vram(adev,
				data.inject.address, PAGE_SIZE, &bo);
		if (ret) {
			/* address was offset, now it is absolute. */
			data.inject.address += adev->gmc.vram_start;
			if (data.inject.address > adev->gmc.vram_end)
				break;
		} else
			data.inject.address = amdgpu_bo_gpu_offset(bo);
		ret = amdgpu_ras_error_inject(adev, &data.inject);
		amdgpu_ras_release_vram(adev, &bo);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	if (ret)
		return -EINVAL;

	return size;
}

static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
	.owner = THIS_MODULE,
	.read = NULL,
	.write = amdgpu_ras_debugfs_ctrl_write,
	.llseek = default_llseek
};

static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr);
	struct ras_query_if info = {
		.head = obj->head,
	};

	if (amdgpu_ras_error_query(obj->adev, &info))
		return -EINVAL;

	return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n",
			"ue", info.ue_count,
			"ce", info.ce_count);
}

/* obj begin */

#define get_obj(obj) do { (obj)->use++; } while (0)
#define alive_obj(obj) ((obj)->use)

static inline void put_obj(struct ras_manager *obj)
{
	if (obj && --obj->use == 0)
		list_del(&obj->node);
	if (obj && obj->use < 0) {
		DRM_ERROR("RAS ERROR: Unbalanced obj(%s) use\n", obj->head.name);
	}
}

/* make one obj and return it. */
static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;

	if (!con)
		return NULL;

	if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
		return NULL;

	obj = &con->objs[head->block];
	/* already exist. return obj? */
	if (alive_obj(obj))
		return NULL;

	obj->head = *head;
	obj->adev = adev;
	list_add(&obj->node, &con->head);
	get_obj(obj);

	return obj;
}

/* return an obj equal to head, or the first when head is NULL */
static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;
	int i;

	if (!con)
		return NULL;

	if (head) {
		if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
			return NULL;

		obj = &con->objs[head->block];

		if (alive_obj(obj)) {
			WARN_ON(head->block != obj->head.block);
			return obj;
		}
	} else {
		for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
			obj = &con->objs[i];
			if (alive_obj(obj)) {
				WARN_ON(i != obj->head.block);
				return obj;
			}
		}
	}

	return NULL;
}
/* obj end */

/* feature ctl begin */
static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	return con->hw_supported & BIT(head->block);
}

static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	return con->features & BIT(head->block);
}

/*
 * if obj is not created, then create one.
 * set feature enable flag.
 */
static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
		struct ras_common_if *head, int enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	/* If hardware does not support ras, then do not create obj.
	 * But if hardware supports ras, we can create the obj.
	 * The ras framework checks con->hw_supported to see if it needs to do
	 * the corresponding initialization.
	 * An IP checks con->support to see if it needs to disable ras.
	 */
	if (!amdgpu_ras_is_feature_allowed(adev, head))
		return 0;
	if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
		return 0;

	if (enable) {
		if (!obj) {
			obj = amdgpu_ras_create_obj(adev, head);
			if (!obj)
				return -EINVAL;
		} else {
			/* In case we create obj somewhere else */
			get_obj(obj);
		}
		con->features |= BIT(head->block);
	} else {
		if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
			con->features &= ~BIT(head->block);
			put_obj(obj);
		}
	}

	return 0;
}

/* wrapper of psp_ras_enable_features */
int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
		struct ras_common_if *head, bool enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	union ta_ras_cmd_input info;
	int ret;

	if (!con)
		return -EINVAL;

	if (!enable) {
		info.disable_features = (struct ta_ras_disable_features_input) {
			.block_id = amdgpu_ras_block_to_ta(head->block),
			.error_type = amdgpu_ras_error_to_ta(head->type),
		};
	} else {
		info.enable_features = (struct ta_ras_enable_features_input) {
			.block_id = amdgpu_ras_block_to_ta(head->block),
			.error_type = amdgpu_ras_error_to_ta(head->type),
		};
	}

	/* Do not enable if it is not allowed. */
	WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
	/* Are we already in the state we are going to set? */
	if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
		return 0;

	ret = psp_ras_enable_features(&adev->psp, &info, enable);
	if (ret) {
		DRM_ERROR("RAS ERROR: %s %s feature failed ret %d\n",
				enable ? "enable":"disable",
				ras_block_str(head->block),
				ret);
		if (ret == TA_RAS_STATUS__RESET_NEEDED)
			return -EAGAIN;
		return -EINVAL;
	}

	/* setup the obj */
	__amdgpu_ras_feature_enable(adev, head, enable);

	return 0;
}

/* Only used in device probe stage and called only once. */
int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
		struct ras_common_if *head, bool enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int ret;

	if (!con)
		return -EINVAL;

	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
		if (enable) {
			/* There is no harm in issuing a ras TA cmd regardless
			 * of the current ras state.
			 * If current state == target state, it will do nothing.
			 * But sometimes it requests the driver to reset and
			 * repost with error code -EAGAIN.
			 */
			ret = amdgpu_ras_feature_enable(adev, head, 1);
			/* With an old ras TA, we might fail to enable ras.
			 * Log it and just set up the object.
			 * TODO: remove this workaround in the future.
			 */
			if (ret == -EINVAL) {
				ret = __amdgpu_ras_feature_enable(adev, head, 1);
				if (!ret)
					DRM_INFO("RAS INFO: %s setup object\n",
						ras_block_str(head->block));
			}
		} else {
			/* setup the object then issue a ras TA disable cmd. */
			ret = __amdgpu_ras_feature_enable(adev, head, 1);
			if (ret)
				return ret;

			ret = amdgpu_ras_feature_enable(adev, head, 0);
		}
	} else
		ret = amdgpu_ras_feature_enable(adev, head, enable);

	return ret;
}
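
/*
 * A hypothetical example of the boot-time call an IP block would make; the
 * ras_if instance and its field values are illustrative only:
 *
 *	struct ras_common_if ras_if = {
 *		.block = AMDGPU_RAS_BLOCK__GFX,
 *		.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
 *		.sub_block_index = 0,
 *		.name = "gfx",
 *	};
 *	int r = amdgpu_ras_feature_enable_on_boot(adev, &ras_if, true);
 *
 * On -EAGAIN the ras TA wants a GPU reset and re-post; callers can flag
 * that via amdgpu_ras_request_reset_on_boot() below so that
 * amdgpu_ras_resume() handles it (see AMDGPU_RAS_FLAG_INIT_NEED_RESET).
 */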

static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		/* bypass psp.
		 * aka just release the obj and corresponding flags
		 */
		if (bypass) {
			if (__amdgpu_ras_feature_enable(adev, &obj->head, 0))
				break;
		} else {
			if (amdgpu_ras_feature_enable(adev, &obj->head, 0))
				break;
		}
	}

	return con->features;
}

static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
	int i;
	const enum amdgpu_ras_error_type default_ras_type =
		AMDGPU_RAS_ERROR__NONE;

	for (i = 0; i < ras_block_count; i++) {
		struct ras_common_if head = {
			.block = i,
			.type = default_ras_type,
			.sub_block_index = 0,
		};
		strcpy(head.name, ras_block_str(i));
		if (bypass) {
			/*
			 * bypass psp. vbios enables ras for us.
			 * so just create the obj
			 */
			if (__amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		} else {
			if (amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		}
	}

	return con->features;
}
/* feature ctl end */

/* query/inject/cure begin */
int amdgpu_ras_error_query(struct amdgpu_device *adev,
		struct ras_query_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);

	if (!obj)
		return -EINVAL;
	/* TODO might read the register to read the count */

	info->ue_count = obj->err_data.ue_count;
	info->ce_count = obj->err_data.ce_count;

	return 0;
}

/* wrapper of psp_ras_trigger_error */
int amdgpu_ras_error_inject(struct amdgpu_device *adev,
		struct ras_inject_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ta_ras_trigger_error_input block_info = {
		.block_id = amdgpu_ras_block_to_ta(info->head.block),
		.inject_error_type = amdgpu_ras_error_to_ta(info->head.type),
		.sub_block_index = info->head.sub_block_index,
		.address = info->address,
		.value = info->value,
	};
	int ret = 0;

	if (!obj)
		return -EINVAL;

	ret = psp_ras_trigger_error(&adev->psp, &block_info);
	if (ret)
		DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n",
				ras_block_str(info->head.block),
				ret);

	return ret;
}

int amdgpu_ras_error_cure(struct amdgpu_device *adev,
		struct ras_cure_if *info)
{
	/* psp fw has no cure interface for now. */
	return 0;
}

/* get the total error counts on all IPs */
int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
		bool is_ce)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;
	struct ras_err_data data = {0, 0};

	if (!con)
		return -EINVAL;

	list_for_each_entry(obj, &con->head, node) {
		struct ras_query_if info = {
			.head = obj->head,
		};

		if (amdgpu_ras_error_query(adev, &info))
			return -EINVAL;

		data.ce_count += info.ce_count;
		data.ue_count += info.ue_count;
	}

	return is_ce ? data.ce_count : data.ue_count;
}
/* query/inject/cure end */


/* sysfs begin */

static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
		struct ras_badpage **bps, unsigned int *count);

static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
{
	switch (flags) {
	case 0:
		return "R";
	case 1:
		return "P";
	case 2:
	default:
		return "F";
	}
}

/*
 * DOC: ras sysfs gpu_vram_bad_pages interface
 *
 * It allows the user to read the bad pages of vram on the gpu through
 * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages
 *
 * It outputs multiple lines, and each line stands for one gpu page.
 *
 * The format of one line is below,
 * gpu pfn : gpu page size : flags
 *
 * gpu pfn and gpu page size are printed in hex format.
 * flags can be one of the below characters,
 * R: reserved, this gpu page is reserved and not able to be used.
 * P: pending for reserve, this gpu page is marked as bad and will be
 *    reserved in the next window of page_reserve.
 * F: unable to reserve. this gpu page can't be reserved for some reason.
 *
 * examples:
 * 0x00000001 : 0x00001000 : R
 * 0x00000002 : 0x00001000 : P
 */

static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
		struct kobject *kobj, struct bin_attribute *attr,
		char *buf, loff_t ppos, size_t count)
{
	struct amdgpu_ras *con =
		container_of(attr, struct amdgpu_ras, badpages_attr);
	struct amdgpu_device *adev = con->adev;
	const unsigned int element_size =
		sizeof("0xabcdabcd : 0x12345678 : R\n") - 1;
	unsigned int start = div64_ul(ppos + element_size - 1, element_size);
	unsigned int end = div64_ul(ppos + count - 1, element_size);
	ssize_t s = 0;
	struct ras_badpage *bps = NULL;
	unsigned int bps_count = 0;

	memset(buf, 0, count);

	if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
		return 0;

	for (; start < end && start < bps_count; start++)
		s += scnprintf(&buf[s], element_size + 1,
				"0x%08x : 0x%08x : %1s\n",
				bps[start].bp,
				bps[start].size,
				amdgpu_ras_badpage_flags_str(bps[start].flags));

	kfree(bps);

	return s;
}

static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct amdgpu_ras *con =
		container_of(attr, struct amdgpu_ras, features_attr);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = ddev->dev_private;
	struct ras_common_if head;
	int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
	int i;
	ssize_t s;
	struct ras_manager *obj;

	s = scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);

	for (i = 0; i < ras_block_count; i++) {
		head.block = i;

		if (amdgpu_ras_is_feature_enabled(adev, &head)) {
			obj = amdgpu_ras_find_obj(adev, &head);
			s += scnprintf(&buf[s], PAGE_SIZE - s,
					"%s: %s\n",
					ras_block_str(i),
					ras_err_str(obj->head.type));
		} else
			s += scnprintf(&buf[s], PAGE_SIZE - s,
					"%s: disabled\n",
					ras_block_str(i));
	}

	return s;
}

static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct attribute *attrs[] = {
		&con->features_attr.attr,
		NULL
	};
	struct bin_attribute *bin_attrs[] = {
		&con->badpages_attr,
		NULL
	};
	struct attribute_group group = {
		.name = "ras",
		.attrs = attrs,
		.bin_attrs = bin_attrs,
	};

	con->features_attr = (struct device_attribute) {
		.attr = {
			.name = "features",
			.mode = S_IRUGO,
		},
		.show = amdgpu_ras_sysfs_features_read,
	};

	con->badpages_attr = (struct bin_attribute) {
		.attr = {
			.name = "gpu_vram_bad_pages",
			.mode = S_IRUGO,
		},
		.size = 0,
		.private = NULL,
		.read = amdgpu_ras_sysfs_badpages_read,
	};

	sysfs_attr_init(attrs[0]);
	sysfs_bin_attr_init(bin_attrs[0]);

	return sysfs_create_group(&adev->dev->kobj, &group);
}
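
/*
 * The group created above gives each device a sysfs layout roughly like the
 * sketch below (illustrative; the per-block nodes only appear once an IP has
 * registered them with amdgpu_ras_sysfs_create(), and their names are chosen
 * by the IP, e.g. gfx_err_count):
 *
 *	/sys/class/drm/card0/device/ras/
 *		features
 *		gpu_vram_bad_pages
 *		<block>_err_count
 */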

static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct attribute *attrs[] = {
		&con->features_attr.attr,
		NULL
	};
	struct bin_attribute *bin_attrs[] = {
		&con->badpages_attr,
		NULL
	};
	struct attribute_group group = {
		.name = "ras",
		.attrs = attrs,
		.bin_attrs = bin_attrs,
	};

	sysfs_remove_group(&adev->dev->kobj, &group);

	return 0;
}

int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
		struct ras_fs_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);

	if (!obj || obj->attr_inuse)
		return -EINVAL;

	get_obj(obj);

	memcpy(obj->fs_data.sysfs_name,
			head->sysfs_name,
			sizeof(obj->fs_data.sysfs_name));

	obj->sysfs_attr = (struct device_attribute){
		.attr = {
			.name = obj->fs_data.sysfs_name,
			.mode = S_IRUGO,
		},
		.show = amdgpu_ras_sysfs_read,
	};
	sysfs_attr_init(&obj->sysfs_attr.attr);

	if (sysfs_add_file_to_group(&adev->dev->kobj,
				&obj->sysfs_attr.attr,
				"ras")) {
		put_obj(obj);
		return -EINVAL;
	}

	obj->attr_inuse = 1;

	return 0;
}

int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	if (!obj || !obj->attr_inuse)
		return -EINVAL;

	sysfs_remove_file_from_group(&adev->dev->kobj,
				&obj->sysfs_attr.attr,
				"ras");
	obj->attr_inuse = 0;
	put_obj(obj);

	return 0;
}

static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		amdgpu_ras_sysfs_remove(adev, &obj->head);
	}

	amdgpu_ras_sysfs_remove_feature_node(adev);

	return 0;
}
/* sysfs end */

/* debugfs begin */
static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct drm_minor *minor = adev->ddev->primary;

	con->dir = debugfs_create_dir("ras", minor->debugfs_root);
	con->ent = debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir,
				adev, &amdgpu_ras_debugfs_ctrl_ops);
}

void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
		struct ras_fs_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);

	if (!obj || obj->ent)
		return;

	get_obj(obj);

	memcpy(obj->fs_data.debugfs_name,
			head->debugfs_name,
			sizeof(obj->fs_data.debugfs_name));

	obj->ent = debugfs_create_file(obj->fs_data.debugfs_name,
				S_IWUGO | S_IRUGO, con->dir, obj,
				&amdgpu_ras_debugfs_ops);
}

void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	if (!obj || !obj->ent)
		return;

	debugfs_remove(obj->ent);
	obj->ent = NULL;
	put_obj(obj);
}

static void amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		amdgpu_ras_debugfs_remove(adev, &obj->head);
	}

	debugfs_remove(con->ent);
	debugfs_remove(con->dir);
	con->dir = NULL;
	con->ent = NULL;
}
/* debugfs end */

/* ras fs */

static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
{
	amdgpu_ras_sysfs_create_feature_node(adev);
	amdgpu_ras_debugfs_create_ctrl_node(adev);

	return 0;
}

static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
{
	amdgpu_ras_debugfs_remove_all(adev);
	amdgpu_ras_sysfs_remove_all(adev);
	return 0;
}
/* ras fs end */

/* ih begin */
static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
{
	struct ras_ih_data *data = &obj->ih_data;
	struct amdgpu_iv_entry entry;
	int ret;

	while (data->rptr != data->wptr) {
		rmb();
		memcpy(&entry, &data->ring[data->rptr],
				data->element_size);

		wmb();
		data->rptr = (data->aligned_element_size +
				data->rptr) % data->ring_size;

		/* Let the IP handle its data, maybe we need to get the output
		 * from the callback to update the error type/count, etc.
		 */
		if (data->cb) {
			ret = data->cb(obj->adev, &entry);
			/* ue will trigger an interrupt, and in that case
			 * we need to do a reset to recover the whole system.
			 * But leave the IP to do that recovery, here we just
			 * dispatch the error.
			 */
			if (ret == AMDGPU_RAS_UE) {
				obj->err_data.ue_count++;
			}
			/* Might need to get the ce count from a register, but
			 * not all IPs save a ce count; some IPs just use one
			 * bit or two bits to indicate that ce happened.
			 */
		}
	}
}

static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
{
	struct ras_ih_data *data =
		container_of(work, struct ras_ih_data, ih_work);
	struct ras_manager *obj =
		container_of(data, struct ras_manager, ih_data);

	amdgpu_ras_interrupt_handler(obj);
}

int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
		struct ras_dispatch_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj)
		return -EINVAL;

	data = &obj->ih_data;
	if (data->inuse == 0)
		return 0;

	/* Might be overflow... */
	memcpy(&data->ring[data->wptr], info->entry,
			data->element_size);

	wmb();
	data->wptr = (data->aligned_element_size +
			data->wptr) % data->ring_size;

	schedule_work(&data->ih_work);

	return 0;
}

int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
		struct ras_ih_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj)
		return -EINVAL;

	data = &obj->ih_data;
	if (data->inuse == 0)
		return 0;

	cancel_work_sync(&data->ih_work);

	kfree(data->ring);
	memset(data, 0, sizeof(*data));
	put_obj(obj);

	return 0;
}

int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
		struct ras_ih_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj) {
		/* in case we register the IH before enabling the ras feature */
		obj = amdgpu_ras_create_obj(adev, &info->head);
		if (!obj)
			return -EINVAL;
	} else
		get_obj(obj);

	data = &obj->ih_data;
	/* add the callback, etc. */
	*data = (struct ras_ih_data) {
		.inuse = 0,
		.cb = info->cb,
		.element_size = sizeof(struct amdgpu_iv_entry),
		.rptr = 0,
		.wptr = 0,
	};

	INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler);

	data->aligned_element_size = ALIGN(data->element_size, 8);
	/* the ring can store 64 iv entries. */
	data->ring_size = 64 * data->aligned_element_size;
	data->ring = kmalloc(data->ring_size, GFP_KERNEL);
	if (!data->ring) {
		put_obj(obj);
		return -ENOMEM;
	}

	/* IH is ready */
	data->inuse = 1;

	return 0;
}
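
/*
 * A rough sketch of how an IP block is expected to use the two entry points
 * above; the callback and variable names are illustrative only.  At init
 * time the block registers its bottom-half callback:
 *
 *	struct ras_ih_if ih_info = {
 *		.head = *ras_if,	(the block's ras_common_if)
 *		.cb = my_block_ras_cb,	(a hypothetical ras_ih_cb)
 *	};
 *	amdgpu_ras_interrupt_add_handler(adev, &ih_info);
 *
 * and from its interrupt handler it only queues the raw IV entry:
 *
 *	struct ras_dispatch_if dispatch = {
 *		.head = *ras_if,
 *		.entry = entry,		(struct amdgpu_iv_entry * from the ISR)
 *	};
 *	amdgpu_ras_interrupt_dispatch(adev, &dispatch);
 *
 * The memcpy into the ring plus the deferred work keep the top half short;
 * the callback itself runs later in amdgpu_ras_interrupt_handler().
 */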

static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		struct ras_ih_if info = {
			.head = obj->head,
		};
		amdgpu_ras_interrupt_remove_handler(adev, &info);
	}

	return 0;
}
/* ih end */

/* recovery begin */

/* return 0 on success.
 * caller needs to free bps.
 */
static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
		struct ras_badpage **bps, unsigned int *count)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	int i = 0;
	int ret = 0;

	if (!con || !con->eh_data || !bps || !count)
		return -EINVAL;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data || data->count == 0) {
		*bps = NULL;
		goto out;
	}

	*bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL);
	if (!*bps) {
		ret = -ENOMEM;
		goto out;
	}

	for (; i < data->count; i++) {
		(*bps)[i] = (struct ras_badpage){
			.bp = data->bps[i].bp,
			.size = AMDGPU_GPU_PAGE_SIZE,
			.flags = 0,
		};

		if (data->last_reserved <= i)
			(*bps)[i].flags = 1;
		else if (data->bps[i].bo == NULL)
			(*bps)[i].flags = 2;
	}

	*count = data->count;
out:
	mutex_unlock(&con->recovery_lock);
	return ret;
}

static void amdgpu_ras_do_recovery(struct work_struct *work)
{
	struct amdgpu_ras *ras =
		container_of(work, struct amdgpu_ras, recovery_work);

	amdgpu_device_gpu_recover(ras->adev, 0);
	atomic_set(&ras->in_recovery, 0);
}

static int amdgpu_ras_release_vram(struct amdgpu_device *adev,
		struct amdgpu_bo **bo_ptr)
{
	/* no need to free it actually. */
	amdgpu_bo_free_kernel(bo_ptr, NULL, NULL);
	return 0;
}
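
/*
 * Note on the reservation trick used below: a bad page is taken out of
 * circulation by creating a kernel BO, clamping every allowed placement to
 * exactly the [offset, offset + size) range and pinning the BO there, so
 * the VRAM manager can never hand that page out again.  Releasing the BO
 * (amdgpu_ras_release_vram() above) makes the range allocatable again.
 */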

/* reserve vram with size@offset */
static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev,
		uint64_t offset, uint64_t size,
		struct amdgpu_bo **bo_ptr)
{
	struct ttm_operation_ctx ctx = { false, false };
	struct amdgpu_bo_param bp;
	int r = 0;
	int i;
	struct amdgpu_bo *bo;

	if (bo_ptr)
		*bo_ptr = NULL;
	memset(&bp, 0, sizeof(bp));
	bp.size = size;
	bp.byte_align = PAGE_SIZE;
	bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
	bp.flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS |
		AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
	bp.type = ttm_bo_type_kernel;
	bp.resv = NULL;

	r = amdgpu_bo_create(adev, &bp, &bo);
	if (r)
		return -EINVAL;

	r = amdgpu_bo_reserve(bo, false);
	if (r)
		goto error_reserve;

	offset = ALIGN(offset, PAGE_SIZE);
	for (i = 0; i < bo->placement.num_placement; ++i) {
		bo->placements[i].fpfn = offset >> PAGE_SHIFT;
		bo->placements[i].lpfn = (offset + size) >> PAGE_SHIFT;
	}

	ttm_bo_mem_put(&bo->tbo, &bo->tbo.mem);
	r = ttm_bo_mem_space(&bo->tbo, &bo->placement, &bo->tbo.mem, &ctx);
	if (r)
		goto error_pin;

	r = amdgpu_bo_pin_restricted(bo,
			AMDGPU_GEM_DOMAIN_VRAM,
			offset,
			offset + size);
	if (r)
		goto error_pin;

	if (bo_ptr)
		*bo_ptr = bo;

	amdgpu_bo_unreserve(bo);
	return r;

error_pin:
	amdgpu_bo_unreserve(bo);
error_reserve:
	amdgpu_bo_unref(&bo);
	return r;
}

/* alloc/realloc bps array */
static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
		struct ras_err_handler_data *data, int pages)
{
	unsigned int old_space = data->count + data->space_left;
	unsigned int new_space = old_space + pages;
	unsigned int align_space = ALIGN(new_space, 1024);
	void *tmp = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);

	if (!tmp)
		return -ENOMEM;

	if (data->bps) {
		memcpy(tmp, data->bps,
				data->count * sizeof(*data->bps));
		kfree(data->bps);
	}

	data->bps = tmp;
	data->space_left += align_space - old_space;
	return 0;
}

/* it deals with vram only. */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
		unsigned long *bps, int pages)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	int i = pages;
	int ret = 0;

	if (!con || !con->eh_data || !bps || pages <= 0)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;

	if (data->space_left <= pages)
		if (amdgpu_ras_realloc_eh_data_space(adev, data, pages)) {
			ret = -ENOMEM;
			goto out;
		}

	while (i--)
		data->bps[data->count++].bp = bps[i];

	data->space_left -= pages;
out:
	mutex_unlock(&con->recovery_lock);

	return ret;
}

/* called in gpu recovery/init */
int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	uint64_t bp;
	struct amdgpu_bo *bo;
	int i;

	if (!con || !con->eh_data)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;
	/* reserve vram at driver post stage. */
	for (i = data->last_reserved; i < data->count; i++) {
		bp = data->bps[i].bp;

		if (amdgpu_ras_reserve_vram(adev, bp << PAGE_SHIFT,
					PAGE_SIZE, &bo))
			DRM_ERROR("RAS ERROR: reserve vram %llx fail\n", bp);

		data->bps[i].bo = bo;
		data->last_reserved = i + 1;
	}
out:
	mutex_unlock(&con->recovery_lock);
	return 0;
}

/* called when the driver unloads */
static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	struct amdgpu_bo *bo;
	int i;

	if (!con || !con->eh_data)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;

	for (i = data->last_reserved - 1; i >= 0; i--) {
		bo = data->bps[i].bo;

		amdgpu_ras_release_vram(adev, &bo);

		data->bps[i].bo = bo;
		data->last_reserved = i;
	}
out:
	mutex_unlock(&con->recovery_lock);
	return 0;
}

static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
{
	/* TODO
	 * write the array to eeprom when SMU is disabled.
	 */
	return 0;
}

static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
{
	/* TODO
	 * read the array from eeprom when SMU is disabled.
	 */
1454 */ 1455 return 0; 1456 } 1457 1458 static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) 1459 { 1460 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1461 struct ras_err_handler_data **data = &con->eh_data; 1462 1463 *data = kmalloc(sizeof(**data), 1464 GFP_KERNEL|__GFP_ZERO); 1465 if (!*data) 1466 return -ENOMEM; 1467 1468 mutex_init(&con->recovery_lock); 1469 INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery); 1470 atomic_set(&con->in_recovery, 0); 1471 con->adev = adev; 1472 1473 amdgpu_ras_load_bad_pages(adev); 1474 amdgpu_ras_reserve_bad_pages(adev); 1475 1476 return 0; 1477 } 1478 1479 static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) 1480 { 1481 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1482 struct ras_err_handler_data *data = con->eh_data; 1483 1484 cancel_work_sync(&con->recovery_work); 1485 amdgpu_ras_save_bad_pages(adev); 1486 amdgpu_ras_release_bad_pages(adev); 1487 1488 mutex_lock(&con->recovery_lock); 1489 con->eh_data = NULL; 1490 kfree(data->bps); 1491 kfree(data); 1492 mutex_unlock(&con->recovery_lock); 1493 1494 return 0; 1495 } 1496 /* recovery end */ 1497 1498 /* return 0 if ras will reset gpu and repost.*/ 1499 int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev, 1500 unsigned int block) 1501 { 1502 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 1503 1504 if (!ras) 1505 return -EINVAL; 1506 1507 ras->flags |= AMDGPU_RAS_FLAG_INIT_NEED_RESET; 1508 return 0; 1509 } 1510 1511 /* 1512 * check hardware's ras ability which will be saved in hw_supported. 1513 * if hardware does not support ras, we can skip some ras initializtion and 1514 * forbid some ras operations from IP. 1515 * if software itself, say boot parameter, limit the ras ability. We still 1516 * need allow IP do some limited operations, like disable. In such case, 1517 * we have to initialize ras as normal. but need check if operation is 1518 * allowed or not in each function. 1519 */ 1520 static void amdgpu_ras_check_supported(struct amdgpu_device *adev, 1521 uint32_t *hw_supported, uint32_t *supported) 1522 { 1523 *hw_supported = 0; 1524 *supported = 0; 1525 1526 if (amdgpu_sriov_vf(adev) || 1527 adev->asic_type != CHIP_VEGA20) 1528 return; 1529 1530 if (adev->is_atom_fw && 1531 (amdgpu_atomfirmware_mem_ecc_supported(adev) || 1532 amdgpu_atomfirmware_sram_ecc_supported(adev))) 1533 *hw_supported = AMDGPU_RAS_BLOCK_MASK; 1534 1535 *supported = amdgpu_ras_enable == 0 ? 1536 0 : *hw_supported & amdgpu_ras_mask; 1537 } 1538 1539 int amdgpu_ras_init(struct amdgpu_device *adev) 1540 { 1541 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1542 1543 if (con) 1544 return 0; 1545 1546 con = kmalloc(sizeof(struct amdgpu_ras) + 1547 sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT, 1548 GFP_KERNEL|__GFP_ZERO); 1549 if (!con) 1550 return -ENOMEM; 1551 1552 con->objs = (struct ras_manager *)(con + 1); 1553 1554 amdgpu_ras_set_context(adev, con); 1555 1556 amdgpu_ras_check_supported(adev, &con->hw_supported, 1557 &con->supported); 1558 con->features = 0; 1559 INIT_LIST_HEAD(&con->head); 1560 /* Might need get this flag from vbios. 
	con->flags = RAS_DEFAULT_FLAGS;

	if (amdgpu_ras_recovery_init(adev))
		goto recovery_out;

	amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;

	if (amdgpu_ras_fs_init(adev))
		goto fs_out;

	amdgpu_ras_self_test(adev);

	DRM_INFO("RAS INFO: ras initialized successfully, "
			"hardware ability[%x] ras_mask[%x]\n",
			con->hw_supported, con->supported);
	return 0;
fs_out:
	amdgpu_ras_recovery_fini(adev);
recovery_out:
	amdgpu_ras_set_context(adev, NULL);
	kfree(con);

	return -EINVAL;
}

/* do some init work after IP late init as dependence.
 * and it runs in resume/gpu reset/booting up cases.
 */
void amdgpu_ras_resume(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	if (!con)
		return;

	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
		/* Set up all other IPs which are not implemented. There is a
		 * tricky thing that an IP's actual ras error type should be
		 * MULTI_UNCORRECTABLE, but as the driver does not handle it,
		 * ERROR_NONE makes sense anyway.
		 */
		amdgpu_ras_enable_all_features(adev, 1);

		/* We enable ras on all hw_supported blocks, but the boot
		 * parameter might disable some of them and one or more IPs
		 * might not have implemented ras yet. So we disable them on
		 * their behalf.
		 */
		list_for_each_entry_safe(obj, tmp, &con->head, node) {
			if (!amdgpu_ras_is_supported(adev, obj->head.block)) {
				amdgpu_ras_feature_enable(adev, &obj->head, 0);
				/* there should not be any reference left. */
				WARN_ON(alive_obj(obj));
			}
		}
	}

	if (con->flags & AMDGPU_RAS_FLAG_INIT_NEED_RESET) {
		con->flags &= ~AMDGPU_RAS_FLAG_INIT_NEED_RESET;
		/* set up the ras obj state as disabled.
		 * This is only for the init_by_vbios case.
		 * If we want to enable ras, just enable it in a normal way.
		 * If we want to disable it, we need to set up the ras obj as
		 * enabled, then issue another TA disable cmd.
		 * See feature_enable_on_boot
		 */
		amdgpu_ras_disable_all_features(adev, 1);
		amdgpu_ras_reset_gpu(adev, 0);
	}
}

void amdgpu_ras_suspend(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return;

	amdgpu_ras_disable_all_features(adev, 0);
	/* Make sure all ras objects are disabled. */
	if (con->features)
		amdgpu_ras_disable_all_features(adev, 1);
}

/* do some fini work before IP fini as dependence */
int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return 0;

	/* Need to disable ras on all IPs here before ip [hw/sw]fini */
	amdgpu_ras_disable_all_features(adev, 0);
	amdgpu_ras_recovery_fini(adev);
	return 0;
}

int amdgpu_ras_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return 0;

	amdgpu_ras_fs_fini(adev);
	amdgpu_ras_interrupt_remove_all(adev);

	WARN(con->features, "Feature mask is not cleared");

	if (con->features)
		amdgpu_ras_disable_all_features(adev, 1);

	amdgpu_ras_set_context(adev, NULL);
	kfree(con);

	return 0;
}