/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 *
 */
#include <linux/debugfs.h>
#include <linux/list.h>
#include <linux/module.h>
#include "amdgpu.h"
#include "amdgpu_ras.h"
#include "amdgpu_atomfirmware.h"

struct ras_ih_data {
	/* interrupt bottom half */
	struct work_struct ih_work;
	int inuse;
	/* IP callback */
	ras_ih_cb cb;
	/* ring full of entries */
	unsigned char *ring;
	unsigned int ring_size;
	unsigned int element_size;
	unsigned int aligned_element_size;
	unsigned int rptr;
	unsigned int wptr;
};

struct ras_fs_data {
	char sysfs_name[32];
	char debugfs_name[32];
};

struct ras_err_data {
	unsigned long ue_count;
	unsigned long ce_count;
};

struct ras_err_handler_data {
	/* points to the bad page array */
	struct {
		unsigned long bp;
		struct amdgpu_bo *bo;
	} *bps;
	/* the count of entries */
	int count;
	/* space remaining for new entries */
	int space_left;
	/* last reserved entry's index + 1 */
	int last_reserved;
};

struct ras_manager {
	struct ras_common_if head;
	/* reference count */
	int use;
	/* ras block link */
	struct list_head node;
	/* the device */
	struct amdgpu_device *adev;
	/* debugfs */
	struct dentry *ent;
	/* sysfs */
	struct device_attribute sysfs_attr;
	int attr_inuse;

	/* fs node name */
	struct ras_fs_data fs_data;

	/* IH data */
	struct ras_ih_data ih_data;

	struct ras_err_data err_data;
};

const char *ras_error_string[] = {
	"none",
	"parity",
	"single_correctable",
	"multi_uncorrectable",
	"poison",
};

const char *ras_block_string[] = {
	"umc",
	"sdma",
	"gfx",
	"mmhub",
	"athub",
	"pcie_bif",
	"hdp",
	"xgmi_wafl",
	"df",
	"smn",
	"sem",
	"mp0",
	"mp1",
	"fuse",
};

#define ras_err_str(i) (ras_error_string[ffs(i)])
#define ras_block_str(i) (ras_block_string[i])

#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS 1
#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)

static void amdgpu_ras_self_test(struct amdgpu_device *adev)
{
	/* TODO */
}

static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
					size_t size, loff_t *pos)
{
	struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
	struct ras_query_if info = {
		.head = obj->head,
	};
	ssize_t s;
	char val[128];

	if (amdgpu_ras_error_query(obj->adev, &info))
		return -EINVAL;

	s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
			"ue", info.ue_count,
			"ce", info.ce_count);
	if (*pos >= s)
		return 0;

	s -= *pos;
	s = min_t(u64, s, size);

	if (copy_to_user(buf, &val[*pos], s))
		return -EINVAL;

	*pos += s;

	return s;
}

static const struct file_operations amdgpu_ras_debugfs_ops = {
	.owner = THIS_MODULE,
	.read = amdgpu_ras_debugfs_read,
	.write = NULL,
	.llseek = default_llseek
};

static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
		*block_id = i;
		if (strcmp(name, ras_block_str(i)) == 0)
			return 0;
	}
	return -EINVAL;
}

static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
		const char __user *buf, size_t size,
		loff_t *pos, struct ras_debug_if *data)
{
	ssize_t s = min_t(u64, 64, size);
	char str[65];
	char block_name[33];
	char err[9] = "ue";
	int op = -1;
	int block_id;
	u64 address, value;

	if (*pos)
		return -EINVAL;
	*pos = size;

	memset(str, 0, sizeof(str));
	memset(data, 0, sizeof(*data));

	if (copy_from_user(str, buf, s))
		return -EINVAL;

	if (sscanf(str, "disable %32s", block_name) == 1)
		op = 0;
	else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
		op = 1;
	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
		op = 2;
	else if (str[0] && str[1] && str[2] && str[3])
		/* ascii string, but commands are not matched. */
		return -EINVAL;

	if (op != -1) {
		if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
			return -EINVAL;

		data->head.block = block_id;
		data->head.type = memcmp("ue", err, 2) == 0 ?
			AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE :
			AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
		data->op = op;

		if (op == 2) {
			if (sscanf(str, "%*s %*s %*s %llu %llu",
						&address, &value) != 2)
				if (sscanf(str, "%*s %*s %*s 0x%llx 0x%llx",
							&address, &value) != 2)
					return -EINVAL;
			data->inject.address = address;
			data->inject.value = value;
		}
	} else {
		if (size < sizeof(*data))
			return -EINVAL;

		if (copy_from_user(data, buf, sizeof(*data)))
			return -EINVAL;
	}

	return 0;
}
/*
 * DOC: ras debugfs control interface
 *
 * It accepts a struct ras_debug_if which has two members.
 *
 * First member: ras_debug_if::head or ras_debug_if::inject.
 *
 * head is used to indicate which IP block will be under control.
 *
 * head has four members: block, type, sub_block_index, name.
 * block: which IP will be under control.
 * type: what kind of error will be enabled/disabled/injected.
 * sub_block_index: some IPs have subcomponents, say, GFX, SDMA.
 * name: the name of the IP.
 *
 * inject has two more members than head: address and value.
 * As their names indicate, the inject operation will write the
 * value to the address.
 *
 * Second member: struct ras_debug_if::op.
 * It has three kinds of operations.
 * 0: disable RAS on the block. Takes ::head as its data.
 * 1: enable RAS on the block. Takes ::head as its data.
 * 2: inject errors on the block. Takes ::inject as its data.
 *
 * How to use the interface?
 * programs:
 * copy the struct ras_debug_if in your code and initialize it.
 * write the struct to the control node.
 *
 * bash:
 * echo op block [error [address value]] > .../ras/ras_ctrl
 *	op: disable, enable, inject
 *		disable: only block is needed
 *		enable: block and error are needed
 *		inject: error, address, value are needed
 *	block: umc, sdma, gfx, .........
 *		see ras_block_string[] for details
 *	error: ue, ce
 *		ue: multi_uncorrectable
 *		ce: single_correctable
 *
 * here are some examples for bash commands:
 *	echo inject umc ue 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo inject umc ce 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *
 * How to check the result?
 *
 * For disable/enable, please check ras features at
 * /sys/class/drm/card[0/1/2...]/device/ras/features
 *
 * For inject, please check the corresponding err count at
 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
 *
 * NOTE: operation is only allowed on blocks which are supported.
 * Please check the ras mask at /sys/module/amdgpu/parameters/ras_mask
 */
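/*
 * Illustrative userspace sketch of the "programs" path described above.
 * This is not part of the driver; it assumes the struct ras_debug_if
 * layout is copied from amdgpu_ras.h into the userspace program and that
 * the control node lives at the debugfs path shown in the bash examples.
 *
 *	struct ras_debug_if data = { 0 };
 *	int fd;
 *
 *	data.head.block = AMDGPU_RAS_BLOCK__UMC;
 *	data.head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
 *	data.op = 1;	// 1 == enable, see ::op above
 *
 *	fd = open("/sys/kernel/debug/dri/0/ras/ras_ctrl", O_WRONLY);
 *	if (fd >= 0) {
 *		// a buffer that is not an ascii command and is at least
 *		// sizeof(struct ras_debug_if) bytes is taken as the raw struct
 *		write(fd, &data, sizeof(data));
 *		close(fd);
 *	}
 */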
static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf,
		size_t size, loff_t *pos)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
	struct ras_debug_if data;
	int ret = 0;

	ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
	if (ret)
		return -EINVAL;

	if (!amdgpu_ras_is_supported(adev, data.head.block))
		return -EINVAL;

	switch (data.op) {
	case 0:
		ret = amdgpu_ras_feature_enable(adev, &data.head, 0);
		break;
	case 1:
		ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
		break;
	case 2:
		ret = amdgpu_ras_error_inject(adev, &data.inject);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	if (ret)
		return -EINVAL;

	return size;
}

static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
	.owner = THIS_MODULE,
	.read = NULL,
	.write = amdgpu_ras_debugfs_ctrl_write,
	.llseek = default_llseek
};

static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr);
	struct ras_query_if info = {
		.head = obj->head,
	};

	if (amdgpu_ras_error_query(obj->adev, &info))
		return -EINVAL;

	return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n",
			"ue", info.ue_count,
			"ce", info.ce_count);
}

/* obj begin */

#define get_obj(obj) do { (obj)->use++; } while (0)
#define alive_obj(obj) ((obj)->use)

static inline void put_obj(struct ras_manager *obj)
{
	if (obj && --obj->use == 0)
		list_del(&obj->node);
	if (obj && obj->use < 0) {
		DRM_ERROR("RAS ERROR: Unbalanced obj(%s) use\n", obj->head.name);
	}
}

/* make one obj and return it. */
static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;

	if (!con)
		return NULL;

	if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
		return NULL;

	obj = &con->objs[head->block];
	/* already exists, return obj? */
	if (alive_obj(obj))
		return NULL;

	obj->head = *head;
	obj->adev = adev;
	list_add(&obj->node, &con->head);
	get_obj(obj);

	return obj;
}

/* return an obj equal to head, or the first when head is NULL */
static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;
	int i;

	if (!con)
		return NULL;

	if (head) {
		if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
			return NULL;

		obj = &con->objs[head->block];

		if (alive_obj(obj)) {
			WARN_ON(head->block != obj->head.block);
			return obj;
		}
	} else {
		for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
			obj = &con->objs[i];
			if (alive_obj(obj)) {
				WARN_ON(i != obj->head.block);
				return obj;
			}
		}
	}

	return NULL;
}
/* obj end */

/* feature ctl begin */
static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	return con->hw_supported & BIT(head->block);
}

static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	return con->features & BIT(head->block);
}

/*
 * if obj is not created, then create one.
 * set feature enable flag.
 */
static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
		struct ras_common_if *head, int enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	/* If the hardware does not support ras, then do not create the obj.
	 * But if the hardware does support ras, we can create the obj.
	 * The ras framework checks con->hw_supported to see if it needs to do
	 * the corresponding initialization.
	 * An IP checks con->support to see if it needs to disable ras.
	 */
	if (!amdgpu_ras_is_feature_allowed(adev, head))
		return 0;
	if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
		return 0;

	if (enable) {
		if (!obj) {
			obj = amdgpu_ras_create_obj(adev, head);
			if (!obj)
				return -EINVAL;
		} else {
			/* In case we created the obj somewhere else */
			get_obj(obj);
		}
		con->features |= BIT(head->block);
	} else {
		if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
			con->features &= ~BIT(head->block);
			put_obj(obj);
		}
	}

	return 0;
}

/* wrapper of psp_ras_enable_features */
int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
		struct ras_common_if *head, bool enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	union ta_ras_cmd_input info;
	int ret;

	if (!con)
		return -EINVAL;

	if (!enable) {
		info.disable_features = (struct ta_ras_disable_features_input) {
			.block_id = amdgpu_ras_block_to_ta(head->block),
			.error_type = amdgpu_ras_error_to_ta(head->type),
		};
	} else {
		info.enable_features = (struct ta_ras_enable_features_input) {
			.block_id = amdgpu_ras_block_to_ta(head->block),
			.error_type = amdgpu_ras_error_to_ta(head->type),
		};
	}

	/* Do not enable if it is not allowed. */
	WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
	/* Are we already in the state we are going to set? */
	if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
		return 0;

	ret = psp_ras_enable_features(&adev->psp, &info, enable);
	if (ret) {
		DRM_ERROR("RAS ERROR: %s %s feature failed ret %d\n",
				enable ? "enable":"disable",
				ras_block_str(head->block),
				ret);
		return -EINVAL;
	}

	/* setup the obj */
	__amdgpu_ras_feature_enable(adev, head, enable);

	return 0;
}

static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		/* bypass psp.
		 * aka just release the obj and corresponding flags
		 */
		if (bypass) {
			if (__amdgpu_ras_feature_enable(adev, &obj->head, 0))
				break;
		} else {
			if (amdgpu_ras_feature_enable(adev, &obj->head, 0))
				break;
		}
	}

	return con->features;
}

static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
	int i;

	for (i = 0; i < ras_block_count; i++) {
		struct ras_common_if head = {
			.block = i,
			.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
			.sub_block_index = 0,
		};
		strcpy(head.name, ras_block_str(i));
		if (bypass) {
			/*
			 * bypass psp. the vbios enables ras for us.
			 * so just create the obj
			 */
			if (__amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		} else {
			if (amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		}
	}

	return con->features;
}
/* feature ctl end */

/* query/inject/cure begin */
int amdgpu_ras_error_query(struct amdgpu_device *adev,
		struct ras_query_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);

	if (!obj)
		return -EINVAL;
	/* TODO: might read the register to read the count */

	info->ue_count = obj->err_data.ue_count;
	info->ce_count = obj->err_data.ce_count;

	return 0;
}

/* wrapper of psp_ras_trigger_error */
int amdgpu_ras_error_inject(struct amdgpu_device *adev,
		struct ras_inject_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ta_ras_trigger_error_input block_info = {
		.block_id = amdgpu_ras_block_to_ta(info->head.block),
		.inject_error_type = amdgpu_ras_error_to_ta(info->head.type),
		.sub_block_index = info->head.sub_block_index,
		.address = info->address,
		.value = info->value,
	};
	int ret = 0;

	if (!obj)
		return -EINVAL;

	ret = psp_ras_trigger_error(&adev->psp, &block_info);
	if (ret)
		DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n",
				ras_block_str(info->head.block),
				ret);

	return ret;
}

int amdgpu_ras_error_cure(struct amdgpu_device *adev,
		struct ras_cure_if *info)
{
	/* psp fw has no cure interface for now. */
	return 0;
}

/* get the total error counts on all IPs */
int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
		bool is_ce)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;
	struct ras_err_data data = {0, 0};

	if (!con)
		return -EINVAL;

	list_for_each_entry(obj, &con->head, node) {
		struct ras_query_if info = {
			.head = obj->head,
		};

		if (amdgpu_ras_error_query(adev, &info))
			return -EINVAL;

		data.ce_count += info.ce_count;
		data.ue_count += info.ue_count;
	}

	return is_ce ? data.ce_count : data.ue_count;
}
/* query/inject/cure end */


/* sysfs begin */

static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct amdgpu_ras *con =
		container_of(attr, struct amdgpu_ras, features_attr);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = ddev->dev_private;
	struct ras_common_if head;
	int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
	int i;
	ssize_t s;
	struct ras_manager *obj;

	s = scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);

	for (i = 0; i < ras_block_count; i++) {
		head.block = i;

		if (amdgpu_ras_is_feature_enabled(adev, &head)) {
			obj = amdgpu_ras_find_obj(adev, &head);
			s += scnprintf(&buf[s], PAGE_SIZE - s,
					"%s: %s\n",
					ras_block_str(i),
					ras_err_str(obj->head.type));
		} else
			s += scnprintf(&buf[s], PAGE_SIZE - s,
					"%s: disabled\n",
					ras_block_str(i));
	}

	return s;
}

static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct attribute *attrs[] = {
		&con->features_attr.attr,
		NULL
	};
	struct attribute_group group = {
		.name = "ras",
		.attrs = attrs,
	};

	con->features_attr = (struct device_attribute) {
		.attr = {
			.name = "features",
			.mode = S_IRUGO,
		},
		.show = amdgpu_ras_sysfs_features_read,
	};
	sysfs_attr_init(attrs[0]);

	return sysfs_create_group(&adev->dev->kobj, &group);
}

static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct attribute *attrs[] = {
		&con->features_attr.attr,
		NULL
	};
	struct attribute_group group = {
		.name = "ras",
		.attrs = attrs,
	};

	sysfs_remove_group(&adev->dev->kobj, &group);

	return 0;
}

int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
		struct ras_fs_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);

	if (!obj || obj->attr_inuse)
		return -EINVAL;

	get_obj(obj);

	memcpy(obj->fs_data.sysfs_name,
			head->sysfs_name,
			sizeof(obj->fs_data.sysfs_name));

	obj->sysfs_attr = (struct device_attribute){
		.attr = {
			.name = obj->fs_data.sysfs_name,
			.mode = S_IRUGO,
		},
		.show = amdgpu_ras_sysfs_read,
	};
	sysfs_attr_init(&obj->sysfs_attr.attr);

	if (sysfs_add_file_to_group(&adev->dev->kobj,
				&obj->sysfs_attr.attr,
				"ras")) {
		put_obj(obj);
		return -EINVAL;
	}

	obj->attr_inuse = 1;

	return 0;
}

int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	if (!obj || !obj->attr_inuse)
		return -EINVAL;

	sysfs_remove_file_from_group(&adev->dev->kobj,
			&obj->sysfs_attr.attr,
			"ras");
	obj->attr_inuse = 0;
	put_obj(obj);

	return 0;
}

static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		amdgpu_ras_sysfs_remove(adev, &obj->head);
	}

	amdgpu_ras_sysfs_remove_feature_node(adev);

	return 0;
}
/* sysfs end */

/* debugfs begin */
static int amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct drm_minor *minor = adev->ddev->primary;
	struct dentry *root = minor->debugfs_root, *dir;
	struct dentry *ent;

	dir = debugfs_create_dir("ras", root);
	if (IS_ERR(dir))
		return -EINVAL;

	con->dir = dir;

	ent = debugfs_create_file("ras_ctrl",
			S_IWUGO | S_IRUGO, con->dir,
			adev, &amdgpu_ras_debugfs_ctrl_ops);
	if (IS_ERR(ent)) {
		debugfs_remove(con->dir);
		return -EINVAL;
	}

	con->ent = ent;
	return 0;
}

int amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
		struct ras_fs_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
	struct dentry *ent;

	if (!obj || obj->ent)
		return -EINVAL;

	get_obj(obj);

	memcpy(obj->fs_data.debugfs_name,
			head->debugfs_name,
			sizeof(obj->fs_data.debugfs_name));

	ent = debugfs_create_file(obj->fs_data.debugfs_name,
			S_IWUGO | S_IRUGO, con->dir,
			obj, &amdgpu_ras_debugfs_ops);

	if (IS_ERR(ent))
		return -EINVAL;

	obj->ent = ent;

	return 0;
}

int amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	if (!obj || !obj->ent)
		return 0;

	debugfs_remove(obj->ent);
	obj->ent = NULL;
	put_obj(obj);

	return 0;
}

static int amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		amdgpu_ras_debugfs_remove(adev, &obj->head);
	}

	debugfs_remove(con->ent);
	debugfs_remove(con->dir);
	con->dir = NULL;
	con->ent = NULL;

	return 0;
}
/* debugfs end */

/* ras fs */

static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
{
	amdgpu_ras_sysfs_create_feature_node(adev);
	amdgpu_ras_debugfs_create_ctrl_node(adev);

	return 0;
}

static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
{
	amdgpu_ras_debugfs_remove_all(adev);
	amdgpu_ras_sysfs_remove_all(adev);
	return 0;
}
/* ras fs end */

/* ih begin */
static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
{
	struct ras_ih_data *data = &obj->ih_data;
	struct amdgpu_iv_entry entry;
	int ret;

	while (data->rptr != data->wptr) {
		rmb();
		memcpy(&entry, &data->ring[data->rptr],
				data->element_size);

		wmb();
		data->rptr = (data->aligned_element_size +
				data->rptr) % data->ring_size;

		/* Let the IP handle its data; we may need to get the
		 * output from the callback to update the error type/count, etc.
		 */
		if (data->cb) {
			ret = data->cb(obj->adev, &entry);
			/* ue will trigger an interrupt, and in that case
			 * we need to do a reset to recover the whole system.
			 * But leave it to the IP to do that recovery; here we
			 * just dispatch the error.
			 */
			if (ret == AMDGPU_RAS_UE) {
				obj->err_data.ue_count++;
			}
			/* Might need to get the ce count from a register, but
			 * not every IP saves the ce count; some IPs just use
			 * one bit or two bits to indicate that a ce happened.
			 */
		}
	}
}

static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
{
	struct ras_ih_data *data =
		container_of(work, struct ras_ih_data, ih_work);
	struct ras_manager *obj =
		container_of(data, struct ras_manager, ih_data);

	amdgpu_ras_interrupt_handler(obj);
}

int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
		struct ras_dispatch_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data = &obj->ih_data;

	if (!obj)
		return -EINVAL;

	if (data->inuse == 0)
		return 0;

	/* Might be overflow... */
	memcpy(&data->ring[data->wptr], info->entry,
			data->element_size);

	wmb();
	data->wptr = (data->aligned_element_size +
			data->wptr) % data->ring_size;

	schedule_work(&data->ih_work);

	return 0;
}

int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
		struct ras_ih_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj)
		return -EINVAL;

	data = &obj->ih_data;
	if (data->inuse == 0)
		return 0;

	cancel_work_sync(&data->ih_work);

	kfree(data->ring);
	memset(data, 0, sizeof(*data));
	put_obj(obj);

	return 0;
}

int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
		struct ras_ih_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj) {
		/* in case we register the IH before enabling the ras feature */
		obj = amdgpu_ras_create_obj(adev, &info->head);
		if (!obj)
			return -EINVAL;
	} else
		get_obj(obj);

	data = &obj->ih_data;
	/* add the callback, etc. */
	*data = (struct ras_ih_data) {
		.inuse = 0,
		.cb = info->cb,
		.element_size = sizeof(struct amdgpu_iv_entry),
		.rptr = 0,
		.wptr = 0,
	};

	INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler);

	data->aligned_element_size = ALIGN(data->element_size, 8);
	/* the ring can store 64 iv entries. */
	data->ring_size = 64 * data->aligned_element_size;
	data->ring = kmalloc(data->ring_size, GFP_KERNEL);
	if (!data->ring) {
		put_obj(obj);
		return -ENOMEM;
	}

	/* IH is ready */
	data->inuse = 1;

	return 0;
}

static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		struct ras_ih_if info = {
			.head = obj->head,
		};
		amdgpu_ras_interrupt_remove_handler(adev, &info);
	}

	return 0;
}
/* ih end */

/* recovery begin */
static void amdgpu_ras_do_recovery(struct work_struct *work)
{
	struct amdgpu_ras *ras =
		container_of(work, struct amdgpu_ras, recovery_work);

	amdgpu_device_gpu_recover(ras->adev, 0);
	atomic_set(&ras->in_recovery, 0);
}

static int amdgpu_ras_release_vram(struct amdgpu_device *adev,
		struct amdgpu_bo **bo_ptr)
{
	/* no need to free it actually. */
	amdgpu_bo_free_kernel(bo_ptr, NULL, NULL);
	return 0;
}

/* reserve vram with size@offset */
static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev,
		uint64_t offset, uint64_t size,
		struct amdgpu_bo **bo_ptr)
{
	struct ttm_operation_ctx ctx = { false, false };
	struct amdgpu_bo_param bp;
	int r = 0;
	int i;
	struct amdgpu_bo *bo;

	if (bo_ptr)
		*bo_ptr = NULL;
	memset(&bp, 0, sizeof(bp));
	bp.size = size;
	bp.byte_align = PAGE_SIZE;
	bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
	bp.flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS |
		AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
	bp.type = ttm_bo_type_kernel;
	bp.resv = NULL;

	r = amdgpu_bo_create(adev, &bp, &bo);
	if (r)
		return -EINVAL;

	r = amdgpu_bo_reserve(bo, false);
	if (r)
		goto error_reserve;

	offset = ALIGN(offset, PAGE_SIZE);
	for (i = 0; i < bo->placement.num_placement; ++i) {
		bo->placements[i].fpfn = offset >> PAGE_SHIFT;
		bo->placements[i].lpfn = (offset + size) >> PAGE_SHIFT;
	}

	ttm_bo_mem_put(&bo->tbo, &bo->tbo.mem);
	r = ttm_bo_mem_space(&bo->tbo, &bo->placement, &bo->tbo.mem, &ctx);
	if (r)
		goto error_pin;

	r = amdgpu_bo_pin_restricted(bo,
			AMDGPU_GEM_DOMAIN_VRAM,
			offset,
			offset + size);
	if (r)
		goto error_pin;

	if (bo_ptr)
		*bo_ptr = bo;

	amdgpu_bo_unreserve(bo);
	return r;

error_pin:
	amdgpu_bo_unreserve(bo);
error_reserve:
	amdgpu_bo_unref(&bo);
	return r;
}

/* alloc/realloc bps array */
static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
		struct ras_err_handler_data *data, int pages)
{
	unsigned int old_space = data->count + data->space_left;
	unsigned int new_space = old_space + pages;
	unsigned int align_space = ALIGN(new_space, 1024);
	void *tmp = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);

	if (!tmp)
		return -ENOMEM;

	if (data->bps) {
		memcpy(tmp, data->bps,
				data->count * sizeof(*data->bps));
		kfree(data->bps);
	}

	data->bps = tmp;
	data->space_left += align_space - old_space;
	return 0;
}

/* it deals with vram only. */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
		unsigned long *bps, int pages)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	int i = pages;
	int ret = 0;

	if (!con || !con->eh_data || !bps || pages <= 0)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;

	if (data->space_left <= pages)
		if (amdgpu_ras_realloc_eh_data_space(adev, data, pages)) {
			ret = -ENOMEM;
			goto out;
		}

	while (i--)
		data->bps[data->count++].bp = bps[i];

	data->space_left -= pages;
out:
	mutex_unlock(&con->recovery_lock);

	return ret;
}

/* called in gpu recovery/init */
int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	uint64_t bp;
	struct amdgpu_bo *bo;
	int i;

	if (!con || !con->eh_data)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;
	/* reserve vram at driver post stage. */
	for (i = data->last_reserved; i < data->count; i++) {
		bp = data->bps[i].bp;

		if (amdgpu_ras_reserve_vram(adev, bp << PAGE_SHIFT,
					PAGE_SIZE, &bo))
			DRM_ERROR("RAS ERROR: reserve vram %llx fail\n", bp);

		data->bps[i].bo = bo;
		data->last_reserved = i + 1;
	}
out:
	mutex_unlock(&con->recovery_lock);
	return 0;
}

/* called when the driver unloads */
static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	struct amdgpu_bo *bo;
	int i;

	if (!con || !con->eh_data)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;

	for (i = data->last_reserved - 1; i >= 0; i--) {
		bo = data->bps[i].bo;

		amdgpu_ras_release_vram(adev, &bo);

		data->bps[i].bo = bo;
		data->last_reserved = i;
	}
out:
	mutex_unlock(&con->recovery_lock);
	return 0;
}

static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
{
	/* TODO
	 * write the array to eeprom when SMU is disabled.
	 */
	return 0;
}

static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
{
	/* TODO
	 * read the array from eeprom when SMU is disabled.
	 */
	return 0;
}

static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data **data = &con->eh_data;

	*data = kmalloc(sizeof(**data),
			GFP_KERNEL|__GFP_ZERO);
	if (!*data)
		return -ENOMEM;

	mutex_init(&con->recovery_lock);
	INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
	atomic_set(&con->in_recovery, 0);
	con->adev = adev;

	amdgpu_ras_load_bad_pages(adev);
	amdgpu_ras_reserve_bad_pages(adev);

	return 0;
}

static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data = con->eh_data;

	cancel_work_sync(&con->recovery_work);
	amdgpu_ras_save_bad_pages(adev);
	amdgpu_ras_release_bad_pages(adev);

	mutex_lock(&con->recovery_lock);
	con->eh_data = NULL;
	kfree(data->bps);
	kfree(data);
	mutex_unlock(&con->recovery_lock);

	return 0;
}
/* recovery end */

/*
 * check the hardware's ras ability, which will be saved in hw_supported.
 * if the hardware does not support ras, we can skip some ras initialization
 * and forbid some ras operations from the IPs.
 * if software itself, say a boot parameter, limits the ras ability, we still
 * need to allow the IPs to do some limited operations, like disable. In such
 * a case, we have to initialize ras as normal, but need to check whether an
 * operation is allowed or not in each function.
 */
static void amdgpu_ras_check_supported(struct amdgpu_device *adev,
		uint32_t *hw_supported, uint32_t *supported)
{
	*hw_supported = 0;
	*supported = 0;

	if (amdgpu_sriov_vf(adev) ||
			adev->asic_type != CHIP_VEGA20)
		return;

	if (adev->is_atom_fw &&
			(amdgpu_atomfirmware_mem_ecc_supported(adev) ||
			 amdgpu_atomfirmware_sram_ecc_supported(adev)))
		*hw_supported = AMDGPU_RAS_BLOCK_MASK;

	*supported = amdgpu_ras_enable == 0 ?
		0 : *hw_supported & amdgpu_ras_mask;
}

int amdgpu_ras_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (con)
		return 0;

	con = kmalloc(sizeof(struct amdgpu_ras) +
			sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT,
			GFP_KERNEL|__GFP_ZERO);
	if (!con)
		return -ENOMEM;

	con->objs = (struct ras_manager *)(con + 1);

	amdgpu_ras_set_context(adev, con);

	amdgpu_ras_check_supported(adev, &con->hw_supported,
			&con->supported);
	con->features = 0;
	INIT_LIST_HEAD(&con->head);
	/* Might need to get this flag from the vbios. */
	con->flags = RAS_DEFAULT_FLAGS;

	if (amdgpu_ras_recovery_init(adev))
		goto recovery_out;

	amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;

	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
		amdgpu_ras_enable_all_features(adev, 1);

	if (amdgpu_ras_fs_init(adev))
		goto fs_out;

	amdgpu_ras_self_test(adev);

	DRM_INFO("RAS INFO: ras initialized successfully, "
			"hardware ability[%x] ras_mask[%x]\n",
			con->hw_supported, con->supported);
	return 0;
fs_out:
	amdgpu_ras_recovery_fini(adev);
recovery_out:
	amdgpu_ras_set_context(adev, NULL);
	kfree(con);

	return -EINVAL;
}

/* do some init work after IP late init, as it depends on IP late init */
void amdgpu_ras_post_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	if (!con)
		return;

	/* We enable ras on all hw_supported blocks, but a boot parameter
	 * might disable some of them, and one or more IPs may not have
	 * implemented ras yet. So we disable those on their behalf.
	 */
	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
		list_for_each_entry_safe(obj, tmp, &con->head, node) {
			if (!amdgpu_ras_is_supported(adev, obj->head.block)) {
				amdgpu_ras_feature_enable(adev, &obj->head, 0);
				/* there should not be any reference left. */
				WARN_ON(alive_obj(obj));
			}
		}
	}
}

/* do some fini work before IP fini, as IP fini depends on it */
int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return 0;

	/* Need to disable ras on all IPs here before ip [hw/sw]fini */
	amdgpu_ras_disable_all_features(adev, 0);
	amdgpu_ras_recovery_fini(adev);
	return 0;
}

int amdgpu_ras_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return 0;

	amdgpu_ras_fs_fini(adev);
	amdgpu_ras_interrupt_remove_all(adev);

	WARN(con->features, "Feature mask is not cleared");

	if (con->features)
		amdgpu_ras_disable_all_features(adev, 1);

	amdgpu_ras_set_context(adev, NULL);
	kfree(con);

	return 0;
}