/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 *
 */
#include <linux/debugfs.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/uaccess.h>

#include "amdgpu.h"
#include "amdgpu_ras.h"
#include "amdgpu_atomfirmware.h"

const char *ras_error_string[] = {
	"none",
	"parity",
	"single_correctable",
	"multi_uncorrectable",
	"poison",
};

const char *ras_block_string[] = {
	"umc",
	"sdma",
	"gfx",
	"mmhub",
	"athub",
	"pcie_bif",
	"hdp",
	"xgmi_wafl",
	"df",
	"smn",
	"sem",
	"mp0",
	"mp1",
	"fuse",
};

#define ras_err_str(i) (ras_error_string[ffs(i)])
#define ras_block_str(i) (ras_block_string[i])

#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS 1
#define AMDGPU_RAS_FLAG_INIT_NEED_RESET 2
#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)

static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev,
		uint64_t offset, uint64_t size,
		struct amdgpu_bo **bo_ptr);
static int amdgpu_ras_release_vram(struct amdgpu_device *adev,
		struct amdgpu_bo **bo_ptr);

static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
		size_t size, loff_t *pos)
{
	struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
	struct ras_query_if info = {
		.head = obj->head,
	};
	ssize_t s;
	char val[128];

	if (amdgpu_ras_error_query(obj->adev, &info))
		return -EINVAL;

	s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
			"ue", info.ue_count,
			"ce", info.ce_count);
	if (*pos >= s)
		return 0;

	s -= *pos;
	s = min_t(u64, s, size);

	if (copy_to_user(buf, &val[*pos], s))
		return -EINVAL;

	*pos += s;

	return s;
}

static const struct file_operations amdgpu_ras_debugfs_ops = {
	.owner = THIS_MODULE,
	.read = amdgpu_ras_debugfs_read,
	.write = NULL,
	.llseek = default_llseek
};

static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
		*block_id = i;
		if (strcmp(name, ras_block_str(i)) == 0)
			return 0;
	}
	return -EINVAL;
}
static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
		const char __user *buf, size_t size,
		loff_t *pos, struct ras_debug_if *data)
{
	ssize_t s = min_t(u64, 64, size);
	char str[65];
	char block_name[33];
	char err[9] = "ue";
	int op = -1;
	int block_id;
	u64 address, value;

	if (*pos)
		return -EINVAL;
	*pos = size;

	memset(str, 0, sizeof(str));
	memset(data, 0, sizeof(*data));

	if (copy_from_user(str, buf, s))
		return -EINVAL;

	if (sscanf(str, "disable %32s", block_name) == 1)
		op = 0;
	else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
		op = 1;
	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
		op = 2;
	else if (str[0] && str[1] && str[2] && str[3])
		/* ascii string, but commands are not matched. */
		return -EINVAL;

	if (op != -1) {
		if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
			return -EINVAL;

		data->head.block = block_id;
		data->head.type = memcmp("ue", err, 2) == 0 ?
			AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE :
			AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
		data->op = op;

		if (op == 2) {
			if (sscanf(str, "%*s %*s %*s %llu %llu",
						&address, &value) != 2)
				if (sscanf(str, "%*s %*s %*s 0x%llx 0x%llx",
							&address, &value) != 2)
					return -EINVAL;
			data->inject.address = address;
			data->inject.value = value;
		}
	} else {
		if (size < sizeof(*data))
			return -EINVAL;

		if (copy_from_user(data, buf, sizeof(*data)))
			return -EINVAL;
	}

	return 0;
}
/**
 * DOC: AMDGPU RAS debugfs control interface
 *
 * It accepts a struct ras_debug_if which has two members.
 *
 * First member: ras_debug_if::head or ras_debug_if::inject.
 *
 * head is used to indicate which IP block will be under control.
 *
 * head has four members, they are block, type, sub_block_index, name.
 * block: which IP will be under control.
 * type: what kind of error will be enabled/disabled/injected.
 * sub_block_index: some IPs have subcomponents. say, GFX, sDMA.
 * name: the name of the IP.
 *
 * inject has two more members than head, they are address, value.
 * As their names indicate, the inject operation will write the
 * value to the address.
 *
 * Second member: struct ras_debug_if::op.
 * It has three kinds of operations.
 * 0: disable RAS on the block. Take ::head as its data.
 * 1: enable RAS on the block. Take ::head as its data.
 * 2: inject errors on the block. Take ::inject as its data.
 *
 * How to use the interface?
 * programs:
 * copy the struct ras_debug_if in your code and initialize it.
 * write the struct to the control node.
 *
 * bash:
 * echo op block [error [address value]] > .../ras/ras_ctrl
 *	op: disable, enable, inject
 *		disable: only block is needed
 *		enable: block and error are needed
 *		inject: error, address, value are needed
 *	block: umc, sdma, gfx, .........
 *		see ras_block_string[] for details
 *	error: ue, ce
 *		ue: multi_uncorrectable
 *		ce: single_correctable
 *
 * here are some examples for bash commands,
 *	echo inject umc ue 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo inject umc ce 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *
 * How to check the result?
 *
 * For disable/enable, please check the ras features at
 * /sys/class/drm/card[0/1/2...]/device/ras/features
 *
 * For inject, please check the corresponding err count at
 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
 *
 * NOTE: operation is only allowed on blocks which are supported.
 * Please check the ras mask at /sys/module/amdgpu/parameters/ras_mask
 */
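
/*
 * A minimal userspace sketch of the "programs" path described above
 * (illustrative only, not part of the driver; it assumes the ras_debug_if
 * layout and the AMDGPU_RAS_* enums from amdgpu_ras.h are visible to the
 * tool):
 *
 *	struct ras_debug_if data = { 0 };
 *	int fd = open("/sys/kernel/debug/dri/0/ras/ras_ctrl", O_WRONLY);
 *
 *	data.head.block = AMDGPU_RAS_BLOCK__UMC;
 *	data.head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
 *	data.op = 1;		// 1 == enable, see ::op above
 *	write(fd, &data, sizeof(data));
 *	close(fd);
 */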
static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf,
		size_t size, loff_t *pos)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
	struct ras_debug_if data;
	struct amdgpu_bo *bo;
	int ret = 0;

	ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
	if (ret)
		return -EINVAL;

	if (!amdgpu_ras_is_supported(adev, data.head.block))
		return -EINVAL;

	switch (data.op) {
	case 0:
		ret = amdgpu_ras_feature_enable(adev, &data.head, 0);
		break;
	case 1:
		ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
		break;
	case 2:
		ret = amdgpu_ras_reserve_vram(adev,
				data.inject.address, PAGE_SIZE, &bo);
		if (ret) {
			/* address was an offset, now it is absolute. */
			data.inject.address += adev->gmc.vram_start;
			if (data.inject.address > adev->gmc.vram_end)
				break;
		} else
			data.inject.address = amdgpu_bo_gpu_offset(bo);
		ret = amdgpu_ras_error_inject(adev, &data.inject);
		amdgpu_ras_release_vram(adev, &bo);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	if (ret)
		return -EINVAL;

	return size;
}

static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
	.owner = THIS_MODULE,
	.read = NULL,
	.write = amdgpu_ras_debugfs_ctrl_write,
	.llseek = default_llseek
};

static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr);
	struct ras_query_if info = {
		.head = obj->head,
	};

	if (amdgpu_ras_error_query(obj->adev, &info))
		return -EINVAL;

	return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n",
			"ue", info.ue_count,
			"ce", info.ce_count);
}

/* obj begin */

#define get_obj(obj) do { (obj)->use++; } while (0)
#define alive_obj(obj) ((obj)->use)

static inline void put_obj(struct ras_manager *obj)
{
	if (obj && --obj->use == 0)
		list_del(&obj->node);
	if (obj && obj->use < 0) {
		DRM_ERROR("RAS ERROR: Unbalanced obj(%s) use\n", obj->head.name);
	}
}

/* make one obj and return it. */
static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;

	if (!con)
		return NULL;

	if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
		return NULL;

	obj = &con->objs[head->block];
	/* already exists. return obj? */
	if (alive_obj(obj))
		return NULL;

	obj->head = *head;
	obj->adev = adev;
	list_add(&obj->node, &con->head);
	get_obj(obj);

	return obj;
}

/* return an obj equal to head, or the first when head is NULL */
static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;
	int i;

	if (!con)
		return NULL;

	if (head) {
		if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
			return NULL;

		obj = &con->objs[head->block];

		if (alive_obj(obj)) {
			WARN_ON(head->block != obj->head.block);
			return obj;
		}
	} else {
		for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
			obj = &con->objs[i];
			if (alive_obj(obj)) {
				WARN_ON(i != obj->head.block);
				return obj;
			}
		}
	}

	return NULL;
}
/* obj end */

/* feature ctl begin */
static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	return con->hw_supported & BIT(head->block);
}

static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	return con->features & BIT(head->block);
}

/*
 * If the obj is not created yet, then create one.
 * Set the feature enable flag.
 */
static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
		struct ras_common_if *head, int enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	/* If the hardware does not support ras, then do not create the obj.
	 * But if the hardware does support ras, we can create the obj.
	 * The ras framework checks con->hw_supported to see if it needs to do
	 * the corresponding initialization.
	 * An IP checks con->support to see if it needs to disable ras.
	 */
	if (!amdgpu_ras_is_feature_allowed(adev, head))
		return 0;
	if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
		return 0;

	if (enable) {
		if (!obj) {
			obj = amdgpu_ras_create_obj(adev, head);
			if (!obj)
				return -EINVAL;
		} else {
			/* In case we created the obj somewhere else */
			get_obj(obj);
		}
		con->features |= BIT(head->block);
	} else {
		if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
			con->features &= ~BIT(head->block);
			put_obj(obj);
		}
	}

	return 0;
}

/* wrapper of psp_ras_enable_features */
int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
		struct ras_common_if *head, bool enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	union ta_ras_cmd_input info;
	int ret;

	if (!con)
		return -EINVAL;

	if (!enable) {
		info.disable_features = (struct ta_ras_disable_features_input) {
			.block_id = amdgpu_ras_block_to_ta(head->block),
			.error_type = amdgpu_ras_error_to_ta(head->type),
		};
	} else {
		info.enable_features = (struct ta_ras_enable_features_input) {
			.block_id = amdgpu_ras_block_to_ta(head->block),
			.error_type = amdgpu_ras_error_to_ta(head->type),
		};
	}

	/* Do not enable if it is not allowed. */
	WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
	/* Are we already in the state we are going to set? */
	if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
		return 0;

	ret = psp_ras_enable_features(&adev->psp, &info, enable);
	if (ret) {
		DRM_ERROR("RAS ERROR: %s %s feature failed ret %d\n",
				enable ? "enable":"disable",
				ras_block_str(head->block),
				ret);
		if (ret == TA_RAS_STATUS__RESET_NEEDED)
			return -EAGAIN;
		return -EINVAL;
	}

	/* setup the obj */
	__amdgpu_ras_feature_enable(adev, head, enable);

	return 0;
}

/* Only used in device probe stage and called only once. */
int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
		struct ras_common_if *head, bool enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int ret;

	if (!con)
		return -EINVAL;

	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
		if (enable) {
			/* There is no harm in issuing a ras TA cmd regardless
			 * of the current ras state.
			 * If the current state == target state, it does
			 * nothing. But sometimes it requests the driver to
			 * reset and repost with error code -EAGAIN.
			 */
			ret = amdgpu_ras_feature_enable(adev, head, 1);
			/* With an old ras TA, we might fail to enable ras.
			 * Log it and just set up the object.
			 * TODO: remove this WA in the future.
			 */
			if (ret == -EINVAL) {
				ret = __amdgpu_ras_feature_enable(adev, head, 1);
				if (!ret)
					DRM_INFO("RAS INFO: %s setup object\n",
						ras_block_str(head->block));
			}
		} else {
			/* setup the object then issue a ras TA disable cmd. */
			ret = __amdgpu_ras_feature_enable(adev, head, 1);
			if (ret)
				return ret;

			ret = amdgpu_ras_feature_enable(adev, head, 0);
		}
	} else
		ret = amdgpu_ras_feature_enable(adev, head, enable);

	return ret;
}

static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		/* bypass psp.
		 * aka just release the obj and corresponding flags
		 */
		if (bypass) {
			if (__amdgpu_ras_feature_enable(adev, &obj->head, 0))
				break;
		} else {
			if (amdgpu_ras_feature_enable(adev, &obj->head, 0))
				break;
		}
	}

	return con->features;
}

static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
	int i;
	const enum amdgpu_ras_error_type default_ras_type =
		AMDGPU_RAS_ERROR__NONE;

	for (i = 0; i < ras_block_count; i++) {
		struct ras_common_if head = {
			.block = i,
			.type = default_ras_type,
			.sub_block_index = 0,
		};
		strcpy(head.name, ras_block_str(i));
		if (bypass) {
			/*
			 * bypass psp. vbios enables ras for us.
			 * so just create the obj
			 */
			if (__amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		} else {
			if (amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		}
	}

	return con->features;
}
/* feature ctl end */

/* query/inject/cure begin */
int amdgpu_ras_error_query(struct amdgpu_device *adev,
		struct ras_query_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);

	if (!obj)
		return -EINVAL;
	/* TODO might read the register to read the count */

	info->ue_count = obj->err_data.ue_count;
	info->ce_count = obj->err_data.ce_count;

	return 0;
}

/* wrapper of psp_ras_trigger_error */
int amdgpu_ras_error_inject(struct amdgpu_device *adev,
		struct ras_inject_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ta_ras_trigger_error_input block_info = {
		.block_id = amdgpu_ras_block_to_ta(info->head.block),
		.inject_error_type = amdgpu_ras_error_to_ta(info->head.type),
		.sub_block_index = info->head.sub_block_index,
		.address = info->address,
		.value = info->value,
	};
	int ret = 0;

	if (!obj)
		return -EINVAL;

	if (block_info.block_id != TA_RAS_BLOCK__UMC) {
		DRM_INFO("%s error injection is not supported yet\n",
			 ras_block_str(info->head.block));
		return -EINVAL;
	}

	ret = psp_ras_trigger_error(&adev->psp, &block_info);
	if (ret)
		DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n",
				ras_block_str(info->head.block),
				ret);

	return ret;
}

int amdgpu_ras_error_cure(struct amdgpu_device *adev,
		struct ras_cure_if *info)
{
	/* psp fw has no cure interface for now. */
	return 0;
}

/* get the total error counts on all IPs */
int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
		bool is_ce)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;
	struct ras_err_data data = {0, 0};

	if (!con)
		return -EINVAL;

	list_for_each_entry(obj, &con->head, node) {
		struct ras_query_if info = {
			.head = obj->head,
		};

		if (amdgpu_ras_error_query(adev, &info))
			return -EINVAL;

		data.ce_count += info.ce_count;
		data.ue_count += info.ue_count;
	}

	return is_ce ? data.ce_count : data.ue_count;
}
/* query/inject/cure end */


/* sysfs begin */

static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
		struct ras_badpage **bps, unsigned int *count);

static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
{
	switch (flags) {
	case 0:
		return "R";
	case 1:
		return "P";
	case 2:
	default:
		return "F";
	}
}

/**
 * DOC: ras sysfs gpu_vram_bad_pages interface
 *
 * It allows the user to read the bad pages of vram on the gpu through
 * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages
 *
 * It outputs multiple lines, and each line stands for one gpu page.
 *
 * The format of one line is below,
 * gpu pfn : gpu page size : flags
 *
 * gpu pfn and gpu page size are printed in hex format.
 * flags can be one of the characters below,
 * R: reserved, this gpu page is reserved and not able to be used.
 * P: pending for reserve, this gpu page is marked as bad and will be
 *    reserved in the next window of page_reserve.
 * F: unable to reserve. this gpu page can't be reserved due to some reasons.
 *
 * examples:
 * 0x00000001 : 0x00001000 : R
 * 0x00000002 : 0x00001000 : P
 */
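
/*
 * A minimal userspace sketch for consuming the node above (illustrative
 * only, not part of the driver): read one line of gpu_vram_bad_pages and
 * decode it with sscanf. The variable names are made up for the example.
 *
 *	unsigned int pfn, size;
 *	char flag;
 *
 *	if (sscanf(line, "0x%x : 0x%x : %c", &pfn, &size, &flag) == 3)
 *		printf("bad page at pfn 0x%x (%u bytes), state %c\n",
 *		       pfn, size, flag);
 */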
static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
		struct kobject *kobj, struct bin_attribute *attr,
		char *buf, loff_t ppos, size_t count)
{
	struct amdgpu_ras *con =
		container_of(attr, struct amdgpu_ras, badpages_attr);
	struct amdgpu_device *adev = con->adev;
	const unsigned int element_size =
		sizeof("0xabcdabcd : 0x12345678 : R\n") - 1;
	unsigned int start = div64_ul(ppos + element_size - 1, element_size);
	unsigned int end = div64_ul(ppos + count - 1, element_size);
	ssize_t s = 0;
	struct ras_badpage *bps = NULL;
	unsigned int bps_count = 0;

	memset(buf, 0, count);

	if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
		return 0;

	for (; start < end && start < bps_count; start++)
		s += scnprintf(&buf[s], element_size + 1,
				"0x%08x : 0x%08x : %1s\n",
				bps[start].bp,
				bps[start].size,
				amdgpu_ras_badpage_flags_str(bps[start].flags));

	kfree(bps);

	return s;
}

static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct amdgpu_ras *con =
		container_of(attr, struct amdgpu_ras, features_attr);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = ddev->dev_private;
	struct ras_common_if head;
	int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
	int i;
	ssize_t s;
	struct ras_manager *obj;

	s = scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);

	for (i = 0; i < ras_block_count; i++) {
		head.block = i;

		if (amdgpu_ras_is_feature_enabled(adev, &head)) {
			obj = amdgpu_ras_find_obj(adev, &head);
			s += scnprintf(&buf[s], PAGE_SIZE - s,
					"%s: %s\n",
					ras_block_str(i),
					ras_err_str(obj->head.type));
		} else
			s += scnprintf(&buf[s], PAGE_SIZE - s,
					"%s: disabled\n",
					ras_block_str(i));
	}

	return s;
}
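
/*
 * Example of what the "features" node rendered above typically looks like
 * (values are illustrative only; the exact mask and per-block types depend
 * on the hardware and on how ras was enabled):
 *
 *	feature mask: 0x3
 *	umc: multi_uncorrectable
 *	sdma: multi_uncorrectable
 *	gfx: disabled
 *	...
 */
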
static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct attribute *attrs[] = {
		&con->features_attr.attr,
		NULL
	};
	struct bin_attribute *bin_attrs[] = {
		&con->badpages_attr,
		NULL
	};
	struct attribute_group group = {
		.name = "ras",
		.attrs = attrs,
		.bin_attrs = bin_attrs,
	};

	con->features_attr = (struct device_attribute) {
		.attr = {
			.name = "features",
			.mode = S_IRUGO,
		},
		.show = amdgpu_ras_sysfs_features_read,
	};

	con->badpages_attr = (struct bin_attribute) {
		.attr = {
			.name = "gpu_vram_bad_pages",
			.mode = S_IRUGO,
		},
		.size = 0,
		.private = NULL,
		.read = amdgpu_ras_sysfs_badpages_read,
	};

	sysfs_attr_init(attrs[0]);
	sysfs_bin_attr_init(bin_attrs[0]);

	return sysfs_create_group(&adev->dev->kobj, &group);
}

static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct attribute *attrs[] = {
		&con->features_attr.attr,
		NULL
	};
	struct bin_attribute *bin_attrs[] = {
		&con->badpages_attr,
		NULL
	};
	struct attribute_group group = {
		.name = "ras",
		.attrs = attrs,
		.bin_attrs = bin_attrs,
	};

	sysfs_remove_group(&adev->dev->kobj, &group);

	return 0;
}

int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
		struct ras_fs_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);

	if (!obj || obj->attr_inuse)
		return -EINVAL;

	get_obj(obj);

	memcpy(obj->fs_data.sysfs_name,
			head->sysfs_name,
			sizeof(obj->fs_data.sysfs_name));

	obj->sysfs_attr = (struct device_attribute){
		.attr = {
			.name = obj->fs_data.sysfs_name,
			.mode = S_IRUGO,
		},
		.show = amdgpu_ras_sysfs_read,
	};
	sysfs_attr_init(&obj->sysfs_attr.attr);

	if (sysfs_add_file_to_group(&adev->dev->kobj,
				&obj->sysfs_attr.attr,
				"ras")) {
		put_obj(obj);
		return -EINVAL;
	}

	obj->attr_inuse = 1;

	return 0;
}

int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	if (!obj || !obj->attr_inuse)
		return -EINVAL;

	sysfs_remove_file_from_group(&adev->dev->kobj,
				&obj->sysfs_attr.attr,
				"ras");
	obj->attr_inuse = 0;
	put_obj(obj);

	return 0;
}

static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		amdgpu_ras_sysfs_remove(adev, &obj->head);
	}

	amdgpu_ras_sysfs_remove_feature_node(adev);

	return 0;
}
/* sysfs end */

/* debugfs begin */
static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct drm_minor *minor = adev->ddev->primary;

	con->dir = debugfs_create_dir("ras", minor->debugfs_root);
	con->ent = debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir,
				adev, &amdgpu_ras_debugfs_ctrl_ops);
}

void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
		struct ras_fs_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);

	if (!obj || obj->ent)
		return;

	get_obj(obj);

	memcpy(obj->fs_data.debugfs_name,
			head->debugfs_name,
			sizeof(obj->fs_data.debugfs_name));

	obj->ent = debugfs_create_file(obj->fs_data.debugfs_name,
				       S_IWUGO | S_IRUGO, con->dir, obj,
				       &amdgpu_ras_debugfs_ops);
}

void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	if (!obj || !obj->ent)
		return;

	debugfs_remove(obj->ent);
	obj->ent = NULL;
	put_obj(obj);
}

static void amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		amdgpu_ras_debugfs_remove(adev, &obj->head);
	}

	debugfs_remove(con->ent);
	debugfs_remove(con->dir);
	con->dir = NULL;
	con->ent = NULL;
}
/* debugfs end */

/* ras fs */

static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
{
	amdgpu_ras_sysfs_create_feature_node(adev);
	amdgpu_ras_debugfs_create_ctrl_node(adev);

	return 0;
}

static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
{
	amdgpu_ras_debugfs_remove_all(adev);
	amdgpu_ras_sysfs_remove_all(adev);
	return 0;
}
/* ras fs end */

/* ih begin */
static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
{
	struct ras_ih_data *data = &obj->ih_data;
	struct amdgpu_iv_entry entry;
	int ret;

	while (data->rptr != data->wptr) {
		rmb();
		memcpy(&entry, &data->ring[data->rptr],
				data->element_size);

		wmb();
		data->rptr = (data->aligned_element_size +
				data->rptr) % data->ring_size;

		/* Let the IP handle its data, maybe we need to get the output
		 * from the callback to update the error type/count, etc
		 */
		if (data->cb) {
			ret = data->cb(obj->adev, &entry);
			/* ue will trigger an interrupt, and in that case
			 * we need to do a reset to recover the whole system.
			 * But leave the IP to do that recovery, here we just
			 * dispatch the error.
			 */
			if (ret == AMDGPU_RAS_UE) {
				obj->err_data.ue_count++;
			}
			/* Might need to get the ce count by register, but not
			 * all IPs save a ce count, some IPs just use one or
			 * two bits to indicate that a ce happened.
			 */
		}
	}
}

static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
{
	struct ras_ih_data *data =
		container_of(work, struct ras_ih_data, ih_work);
	struct ras_manager *obj =
		container_of(data, struct ras_manager, ih_data);

	amdgpu_ras_interrupt_handler(obj);
}

int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
		struct ras_dispatch_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data = &obj->ih_data;

	if (!obj)
		return -EINVAL;

	if (data->inuse == 0)
		return 0;

	/* Might be overflow... */
	memcpy(&data->ring[data->wptr], info->entry,
			data->element_size);

	wmb();
	data->wptr = (data->aligned_element_size +
			data->wptr) % data->ring_size;

	schedule_work(&data->ih_work);

	return 0;
}

int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
		struct ras_ih_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj)
		return -EINVAL;

	data = &obj->ih_data;
	if (data->inuse == 0)
		return 0;

	cancel_work_sync(&data->ih_work);

	kfree(data->ring);
	memset(data, 0, sizeof(*data));
	put_obj(obj);

	return 0;
}
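
/*
 * A rough sketch of how an IP block is expected to hook into this path
 * (illustrative only; the callback name below is made up, ras_if is assumed
 * to point at the IP's struct ras_common_if):
 *
 *	struct ras_ih_if ih_info = {
 *		.head = *ras_if,
 *		.cb = my_block_process_ras_data_cb,
 *	};
 *
 *	amdgpu_ras_interrupt_add_handler(adev, &ih_info);
 *
 * Later, the IP's interrupt handler forwards the iv entry with
 * amdgpu_ras_interrupt_dispatch(), and the work item above runs the cb.
 */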
int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
		struct ras_ih_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj) {
		/* in case we register the IH before enabling the ras feature */
		obj = amdgpu_ras_create_obj(adev, &info->head);
		if (!obj)
			return -EINVAL;
	} else
		get_obj(obj);

	data = &obj->ih_data;
	/* add the callback, etc. */
	*data = (struct ras_ih_data) {
		.inuse = 0,
		.cb = info->cb,
		.element_size = sizeof(struct amdgpu_iv_entry),
		.rptr = 0,
		.wptr = 0,
	};

	INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler);

	data->aligned_element_size = ALIGN(data->element_size, 8);
	/* the ring can store 64 iv entries. */
	data->ring_size = 64 * data->aligned_element_size;
	data->ring = kmalloc(data->ring_size, GFP_KERNEL);
	if (!data->ring) {
		put_obj(obj);
		return -ENOMEM;
	}

	/* IH is ready */
	data->inuse = 1;

	return 0;
}

static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		struct ras_ih_if info = {
			.head = obj->head,
		};
		amdgpu_ras_interrupt_remove_handler(adev, &info);
	}

	return 0;
}
/* ih end */

/* recovery begin */

/* return 0 on success.
 * caller needs to free bps.
 */
static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
		struct ras_badpage **bps, unsigned int *count)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	int i = 0;
	int ret = 0;

	if (!con || !con->eh_data || !bps || !count)
		return -EINVAL;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data || data->count == 0) {
		*bps = NULL;
		goto out;
	}

	*bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL);
	if (!*bps) {
		ret = -ENOMEM;
		goto out;
	}

	for (; i < data->count; i++) {
		(*bps)[i] = (struct ras_badpage){
			.bp = data->bps[i].bp,
			.size = AMDGPU_GPU_PAGE_SIZE,
			.flags = 0,
		};

		if (data->last_reserved <= i)
			(*bps)[i].flags = 1;
		else if (data->bps[i].bo == NULL)
			(*bps)[i].flags = 2;
	}

	*count = data->count;
out:
	mutex_unlock(&con->recovery_lock);
	return ret;
}

static void amdgpu_ras_do_recovery(struct work_struct *work)
{
	struct amdgpu_ras *ras =
		container_of(work, struct amdgpu_ras, recovery_work);

	amdgpu_device_gpu_recover(ras->adev, 0);
	atomic_set(&ras->in_recovery, 0);
}

static int amdgpu_ras_release_vram(struct amdgpu_device *adev,
		struct amdgpu_bo **bo_ptr)
{
	/* no need to free it actually. */
	amdgpu_bo_free_kernel(bo_ptr, NULL, NULL);
	return 0;
}

/* reserve vram with size@offset */
static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev,
		uint64_t offset, uint64_t size,
		struct amdgpu_bo **bo_ptr)
{
	struct ttm_operation_ctx ctx = { false, false };
	struct amdgpu_bo_param bp;
	int r = 0;
	int i;
	struct amdgpu_bo *bo;

	if (bo_ptr)
		*bo_ptr = NULL;
	memset(&bp, 0, sizeof(bp));
	bp.size = size;
	bp.byte_align = PAGE_SIZE;
	bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
	bp.flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS |
		AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
	bp.type = ttm_bo_type_kernel;
	bp.resv = NULL;

	r = amdgpu_bo_create(adev, &bp, &bo);
	if (r)
		return -EINVAL;

	r = amdgpu_bo_reserve(bo, false);
	if (r)
		goto error_reserve;

	offset = ALIGN(offset, PAGE_SIZE);
	for (i = 0; i < bo->placement.num_placement; ++i) {
		bo->placements[i].fpfn = offset >> PAGE_SHIFT;
		bo->placements[i].lpfn = (offset + size) >> PAGE_SHIFT;
	}

	ttm_bo_mem_put(&bo->tbo, &bo->tbo.mem);
	r = ttm_bo_mem_space(&bo->tbo, &bo->placement, &bo->tbo.mem, &ctx);
	if (r)
		goto error_pin;

	r = amdgpu_bo_pin_restricted(bo,
			AMDGPU_GEM_DOMAIN_VRAM,
			offset,
			offset + size);
	if (r)
		goto error_pin;

	if (bo_ptr)
		*bo_ptr = bo;

	amdgpu_bo_unreserve(bo);
	return r;

error_pin:
	amdgpu_bo_unreserve(bo);
error_reserve:
	amdgpu_bo_unref(&bo);
	return r;
}

/* alloc/realloc bps array */
static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
		struct ras_err_handler_data *data, int pages)
{
	unsigned int old_space = data->count + data->space_left;
	unsigned int new_space = old_space + pages;
	unsigned int align_space = ALIGN(new_space, 1024);
	void *tmp = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);

	if (!tmp)
		return -ENOMEM;

	if (data->bps) {
		memcpy(tmp, data->bps,
				data->count * sizeof(*data->bps));
		kfree(data->bps);
	}

	data->bps = tmp;
	data->space_left += align_space - old_space;
	return 0;
}

/* it deals with vram only. */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
		unsigned long *bps, int pages)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	int i = pages;
	int ret = 0;

	if (!con || !con->eh_data || !bps || pages <= 0)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;

	if (data->space_left <= pages)
		if (amdgpu_ras_realloc_eh_data_space(adev, data, pages)) {
			ret = -ENOMEM;
			goto out;
		}

	while (i--)
		data->bps[data->count++].bp = bps[i];

	data->space_left -= pages;
out:
	mutex_unlock(&con->recovery_lock);

	return ret;
}

/* called in gpu recovery/init */
int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	uint64_t bp;
	struct amdgpu_bo *bo;
	int i;

	if (!con || !con->eh_data)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;
	/* reserve vram at driver post stage. */
	for (i = data->last_reserved; i < data->count; i++) {
		bp = data->bps[i].bp;

		if (amdgpu_ras_reserve_vram(adev, bp << PAGE_SHIFT,
					PAGE_SIZE, &bo))
			DRM_ERROR("RAS ERROR: reserve vram %llx fail\n", bp);

		data->bps[i].bo = bo;
		data->last_reserved = i + 1;
	}
out:
	mutex_unlock(&con->recovery_lock);
	return 0;
}

/* called when the driver unloads */
static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	struct amdgpu_bo *bo;
	int i;

	if (!con || !con->eh_data)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;

	for (i = data->last_reserved - 1; i >= 0; i--) {
		bo = data->bps[i].bo;

		amdgpu_ras_release_vram(adev, &bo);

		data->bps[i].bo = bo;
		data->last_reserved = i;
	}
out:
	mutex_unlock(&con->recovery_lock);
	return 0;
}

static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
{
	/* TODO
	 * write the array to eeprom when SMU is disabled.
	 */
	return 0;
}

static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
{
	/* TODO
	 * read the array from eeprom when SMU is disabled.
	 */
	return 0;
}

static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data **data = &con->eh_data;

	*data = kmalloc(sizeof(**data),
			GFP_KERNEL|__GFP_ZERO);
	if (!*data)
		return -ENOMEM;

	mutex_init(&con->recovery_lock);
	INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
	atomic_set(&con->in_recovery, 0);
	con->adev = adev;

	amdgpu_ras_load_bad_pages(adev);
	amdgpu_ras_reserve_bad_pages(adev);

	return 0;
}

static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data = con->eh_data;

	cancel_work_sync(&con->recovery_work);
	amdgpu_ras_save_bad_pages(adev);
	amdgpu_ras_release_bad_pages(adev);

	mutex_lock(&con->recovery_lock);
	con->eh_data = NULL;
	kfree(data->bps);
	kfree(data);
	mutex_unlock(&con->recovery_lock);

	return 0;
}
/* recovery end */

/* return 0 if ras will reset gpu and repost. */
int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
		unsigned int block)
{
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!ras)
		return -EINVAL;

	ras->flags |= AMDGPU_RAS_FLAG_INIT_NEED_RESET;
	return 0;
}

/*
 * check hardware's ras ability which will be saved in hw_supported.
 * if the hardware does not support ras, we can skip some ras initialization
 * and forbid some ras operations from IPs.
 * if software itself, say a boot parameter, limits the ras ability, we still
 * need to allow the IPs to do some limited operations, like disable. In such
 * a case, we have to initialize ras as normal, but need to check whether the
 * operation is allowed or not in each function.
 */
static void amdgpu_ras_check_supported(struct amdgpu_device *adev,
		uint32_t *hw_supported, uint32_t *supported)
{
	*hw_supported = 0;
	*supported = 0;

	if (amdgpu_sriov_vf(adev) ||
			adev->asic_type != CHIP_VEGA20)
		return;

	if (adev->is_atom_fw &&
			(amdgpu_atomfirmware_mem_ecc_supported(adev) ||
			 amdgpu_atomfirmware_sram_ecc_supported(adev)))
		*hw_supported = AMDGPU_RAS_BLOCK_MASK;

	*supported = amdgpu_ras_enable == 0 ?
				0 : *hw_supported & amdgpu_ras_mask;
}

int amdgpu_ras_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (con)
		return 0;

	con = kmalloc(sizeof(struct amdgpu_ras) +
			sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT,
			GFP_KERNEL|__GFP_ZERO);
	if (!con)
		return -ENOMEM;

	con->objs = (struct ras_manager *)(con + 1);

	amdgpu_ras_set_context(adev, con);

	amdgpu_ras_check_supported(adev, &con->hw_supported,
			&con->supported);
	if (!con->hw_supported) {
		amdgpu_ras_set_context(adev, NULL);
		kfree(con);
		return 0;
	}

	con->features = 0;
	INIT_LIST_HEAD(&con->head);
	/* Might need to get this flag from vbios. */
	con->flags = RAS_DEFAULT_FLAGS;

	if (amdgpu_ras_recovery_init(adev))
		goto recovery_out;

	amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;

	if (amdgpu_ras_fs_init(adev))
		goto fs_out;

	DRM_INFO("RAS INFO: ras initialized successfully, "
			"hardware ability[%x] ras_mask[%x]\n",
			con->hw_supported, con->supported);
	return 0;
fs_out:
	amdgpu_ras_recovery_fini(adev);
recovery_out:
	amdgpu_ras_set_context(adev, NULL);
	kfree(con);

	return -EINVAL;
}

/* do some init work after IP late init as a dependence.
 * it runs in the resume/gpu reset/booting up cases.
 */
void amdgpu_ras_resume(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	if (!con)
		return;

	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
		/* Set up all other IPs which are not implemented. There is a
		 * tricky thing that the IP's actual ras error type should be
		 * MULTI_UNCORRECTABLE, but as the driver does not handle it,
		 * ERROR_NONE makes sense anyway.
		 */
		amdgpu_ras_enable_all_features(adev, 1);

		/* We enable ras on all hw_supported blocks, but a boot
		 * parameter might disable some of them and one or more IPs
		 * may not be implemented yet. So we disable them on their
		 * behalf.
		 */
		list_for_each_entry_safe(obj, tmp, &con->head, node) {
			if (!amdgpu_ras_is_supported(adev, obj->head.block)) {
				amdgpu_ras_feature_enable(adev, &obj->head, 0);
				/* there should be no reference left. */
				WARN_ON(alive_obj(obj));
			}
		}
	}

	if (con->flags & AMDGPU_RAS_FLAG_INIT_NEED_RESET) {
		con->flags &= ~AMDGPU_RAS_FLAG_INIT_NEED_RESET;
		/* set up the ras obj state as disabled.
		 * This is for the init_by_vbios case.
		 * If we want to enable ras, just enable it in a normal way.
		 * If we want to disable it, we need to set up the ras obj as
		 * enabled, then issue another TA disable cmd.
		 * See feature_enable_on_boot
		 */
		amdgpu_ras_disable_all_features(adev, 1);
		amdgpu_ras_reset_gpu(adev, 0);
	}
}

void amdgpu_ras_suspend(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return;

	amdgpu_ras_disable_all_features(adev, 0);
	/* Make sure all ras objects are disabled. */
	if (con->features)
		amdgpu_ras_disable_all_features(adev, 1);
}

/* do some fini work before IP fini as a dependence */
int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return 0;

	/* Need to disable ras on all IPs here before ip [hw/sw]fini */
	amdgpu_ras_disable_all_features(adev, 0);
	amdgpu_ras_recovery_fini(adev);
	return 0;
}

int amdgpu_ras_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return 0;

	amdgpu_ras_fs_fini(adev);
	amdgpu_ras_interrupt_remove_all(adev);

	WARN(con->features, "Feature mask is not cleared");

	if (con->features)
		amdgpu_ras_disable_all_features(adev, 1);

	amdgpu_ras_set_context(adev, NULL);
	kfree(con);

	return 0;
}