/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 *
 */
#include <linux/debugfs.h>
#include <linux/list.h>
#include <linux/module.h>
#include "amdgpu.h"
#include "amdgpu_ras.h"
#include "amdgpu_atomfirmware.h"

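/*
 * Per-IP interrupt bottom-half state.  amdgpu_ras_interrupt_dispatch()
 * copies IV entries into @ring at @wptr and schedules @ih_work;
 * amdgpu_ras_interrupt_handler() consumes them at @rptr.  Both pointers
 * advance in @aligned_element_size steps, modulo @ring_size.
 */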
struct ras_ih_data {
	/* interrupt bottom half */
	struct work_struct ih_work;
	int inuse;
	/* IP callback */
	ras_ih_cb cb;
	/* ring buffer holding the IV entries */
	unsigned char *ring;
	unsigned int ring_size;
	unsigned int element_size;
	unsigned int aligned_element_size;
	unsigned int rptr;
	unsigned int wptr;
};

struct ras_fs_data {
	char sysfs_name[32];
	char debugfs_name[32];
};

struct ras_err_data {
	unsigned long ue_count;
	unsigned long ce_count;
};

struct ras_err_handler_data {
	/* pointer to the bad pages array */
	struct {
		unsigned long bp;
		struct amdgpu_bo *bo;
	} *bps;
	/* the count of entries */
	int count;
	/* the space left for new entries */
	int space_left;
	/* last reserved entry's index + 1 */
	int last_reserved;
};

struct ras_manager {
	struct ras_common_if head;
	/* reference count */
	int use;
	/* ras block link */
	struct list_head node;
	/* the device */
	struct amdgpu_device *adev;
	/* debugfs */
	struct dentry *ent;
	/* sysfs */
	struct device_attribute sysfs_attr;
	int attr_inuse;

	/* fs node name */
	struct ras_fs_data fs_data;

	/* IH data */
	struct ras_ih_data ih_data;

	struct ras_err_data err_data;
};

const char *ras_error_string[] = {
	"none",
	"parity",
	"single_correctable",
	"multi_uncorrectable",
	"poison",
};

const char *ras_block_string[] = {
	"umc",
	"sdma",
	"gfx",
	"mmhub",
	"athub",
	"pcie_bif",
	"hdp",
	"xgmi_wafl",
	"df",
	"smn",
	"sem",
	"mp0",
	"mp1",
	"fuse",
};

#define ras_err_str(i) (ras_error_string[ffs(i)])
#define ras_block_str(i) (ras_block_string[i])
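/*
 * Note: the error type in ras_common_if::type is treated as a single-bit
 * flag, so ras_err_str() uses ffs() to turn that bit into an index into
 * ras_error_string[]; ras_block_str() takes a plain block index.
 */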

#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS 1
#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)

static void amdgpu_ras_self_test(struct amdgpu_device *adev)
{
	/* TODO */
}

static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
					size_t size, loff_t *pos)
{
	struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
	struct ras_query_if info = {
		.head = obj->head,
	};
	ssize_t s;
	char val[128];

	if (amdgpu_ras_error_query(obj->adev, &info))
		return -EINVAL;

	s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
			"ue", info.ue_count,
			"ce", info.ce_count);
	if (*pos >= s)
		return 0;

	s -= *pos;
	s = min_t(u64, s, size);

	if (copy_to_user(buf, &val[*pos], s))
		return -EINVAL;

	*pos += s;

	return s;
}

static const struct file_operations amdgpu_ras_debugfs_ops = {
	.owner = THIS_MODULE,
	.read = amdgpu_ras_debugfs_read,
	.write = NULL,
	.llseek = default_llseek
};

static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
		*block_id = i;
		if (strcmp(name, ras_block_str(i)) == 0)
			return 0;
	}
	return -EINVAL;
}

static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
		const char __user *buf, size_t size,
		loff_t *pos, struct ras_debug_if *data)
{
	ssize_t s = min_t(u64, 64, size);
	char str[65];
	char block_name[33];
	char err[9] = "ue";
	int op = -1;
	int block_id;
	u64 address, value;

	if (*pos)
		return -EINVAL;
	*pos = size;

	memset(str, 0, sizeof(str));
	memset(data, 0, sizeof(*data));

	if (copy_from_user(str, buf, s))
		return -EINVAL;

	if (sscanf(str, "disable %32s", block_name) == 1)
		op = 0;
	else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
		op = 1;
	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
		op = 2;
	else if (str[0] && str[1] && str[2] && str[3])
		/* ascii string, but the command did not match. */
		return -EINVAL;

	if (op != -1) {
		if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
			return -EINVAL;

		data->head.block = block_id;
		data->head.type = memcmp("ue", err, 2) == 0 ?
			AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE :
			AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
		data->op = op;

		if (op == 2) {
			if (sscanf(str, "%*s %*s %*s %llu %llu",
						&address, &value) != 2)
				if (sscanf(str, "%*s %*s %*s 0x%llx 0x%llx",
							&address, &value) != 2)
					return -EINVAL;
			data->inject.address = address;
			data->inject.value = value;
		}
	} else {
		if (size < sizeof(*data))
			return -EINVAL;

		if (copy_from_user(data, buf, sizeof(*data)))
			return -EINVAL;
	}

	return 0;
}
/*
 * DOC: ras debugfs control interface
 *
 * It accepts a struct ras_debug_if, which has two members.
 *
 * First member: ras_debug_if::head or ras_debug_if::inject.
 *
 * head is used to indicate which IP block will be under control.
 *
 * head has four members: block, type, sub_block_index, name.
 * block: which IP will be under control.
 * type: what kind of error will be enabled/disabled/injected.
 * sub_block_index: some IPs have subcomponents, e.g. GFX, SDMA.
 * name: the name of the IP.
 *
 * inject has two more members than head: address and value.
 * As their names indicate, the inject operation writes the
 * value to the address.
 *
 * Second member: struct ras_debug_if::op.
 * It has three kinds of operations.
 *  0: disable RAS on the block. Takes ::head as its data.
 *  1: enable RAS on the block. Takes ::head as its data.
 *  2: inject errors on the block. Takes ::inject as its data.
 *
 * How to use the interface?
 * programs:
 * copy the struct ras_debug_if into your code, initialize it, and
 * write the struct to the control node.
 *
 * bash:
 * echo op block [error [address value]] > .../ras/ras_ctrl
 *	op: disable, enable, inject
 *		disable: only block is needed
 *		enable: block and error are needed
 *		inject: error, address, value are needed
 *	block: umc, sdma, gfx, .........
 *		see ras_block_string[] for details
 *	error: ue, ce
 *		ue: multi_uncorrectable
 *		ce: single_correctable
 *
 * here are some examples of bash commands:
 *	echo inject umc ue 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo inject umc ce 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *
 * How to check the result?
 *
 * For disable/enable, please check the ras features at
 * /sys/class/drm/card[0/1/2...]/device/ras/features
 *
 * For inject, please check the corresponding error count at
 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
 *
 * NOTE: operations are only allowed on blocks which are supported.
 * Please check the ras mask at /sys/module/amdgpu/parameters/ras_mask
 */
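/*
 * For illustration only, a minimal userspace sketch of the "programs" path
 * described above (assumes card 0 and that struct ras_debug_if has been
 * copied from the driver headers; not a definitive tool):
 *
 *	struct ras_debug_if data = { 0 };
 *	int fd = open("/sys/kernel/debug/dri/0/ras/ras_ctrl", O_WRONLY);
 *
 *	data.op = 1;		// enable
 *	data.head.block = AMDGPU_RAS_BLOCK__UMC;
 *	data.head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
 *	if (write(fd, &data, sizeof(data)) != sizeof(data))
 *		perror("ras_ctrl");
 *	close(fd);
 *
 * The bash interface above is usually the more convenient way to drive it.
 */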
static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf,
		size_t size, loff_t *pos)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
	struct ras_debug_if data;
	int ret = 0;

	ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
	if (ret)
		return -EINVAL;

	if (!amdgpu_ras_is_supported(adev, data.head.block))
		return -EINVAL;

	switch (data.op) {
	case 0:
		ret = amdgpu_ras_feature_enable(adev, &data.head, 0);
		break;
	case 1:
		ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
		break;
	case 2:
		ret = amdgpu_ras_error_inject(adev, &data.inject);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	if (ret)
		return -EINVAL;

	return size;
}

static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
	.owner = THIS_MODULE,
	.read = NULL,
	.write = amdgpu_ras_debugfs_ctrl_write,
	.llseek = default_llseek
};

static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr);
	struct ras_query_if info = {
		.head = obj->head,
	};

	if (amdgpu_ras_error_query(obj->adev, &info))
		return -EINVAL;

	return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n",
			"ue", info.ue_count,
			"ce", info.ce_count);
}

/* obj begin */

#define get_obj(obj) do { (obj)->use++; } while (0)
#define alive_obj(obj) ((obj)->use)

static inline void put_obj(struct ras_manager *obj)
{
	if (obj && --obj->use == 0)
		list_del(&obj->node);
	if (obj && obj->use < 0) {
		DRM_ERROR("RAS ERROR: Unbalanced obj(%s) use\n", obj->head.name);
	}
}

/* make one obj and return it. */
static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;

	if (!con)
		return NULL;

	if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
		return NULL;

	obj = &con->objs[head->block];
	/* already exists. return obj? */
	if (alive_obj(obj))
		return NULL;

	obj->head = *head;
	obj->adev = adev;
	list_add(&obj->node, &con->head);
	get_obj(obj);

	return obj;
}

/* return an obj equal to head, or the first when head is NULL */
static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;
	int i;

	if (!con)
		return NULL;

	if (head) {
		if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
			return NULL;

		obj = &con->objs[head->block];

		if (alive_obj(obj)) {
			WARN_ON(head->block != obj->head.block);
			return obj;
		}
	} else {
		for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
			obj = &con->objs[i];
			if (alive_obj(obj)) {
				WARN_ON(i != obj->head.block);
				return obj;
			}
		}
	}

	return NULL;
}
/* obj end */

/* feature ctl begin */
static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	return con->hw_supported & BIT(head->block);
}

static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	return con->features & BIT(head->block);
}

/*
 * if obj is not created, then create one.
 * set feature enable flag.
 */
static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
		struct ras_common_if *head, int enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	/* If the hardware does not support ras, then do not create the obj.
	 * But if the hardware does support ras, we can create the obj.
	 * The ras framework checks con->hw_supported to see if it needs to do
	 * the corresponding initialization.
	 * An IP checks con->supported to see if it needs to disable ras.
	 */
	if (!amdgpu_ras_is_feature_allowed(adev, head))
		return 0;
	if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
		return 0;

	if (enable) {
		if (!obj) {
			obj = amdgpu_ras_create_obj(adev, head);
			if (!obj)
				return -EINVAL;
		} else {
			/* In case we create the obj somewhere else */
			get_obj(obj);
		}
		con->features |= BIT(head->block);
	} else {
		if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
			con->features &= ~BIT(head->block);
			put_obj(obj);
		}
	}

	return 0;
}

/* wrapper of psp_ras_enable_features */
int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
		struct ras_common_if *head, bool enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	union ta_ras_cmd_input info;
	int ret;

	if (!con)
		return -EINVAL;

	if (!enable) {
		info.disable_features = (struct ta_ras_disable_features_input) {
			.block_id =  amdgpu_ras_block_to_ta(head->block),
			.error_type = amdgpu_ras_error_to_ta(head->type),
		};
	} else {
		info.enable_features = (struct ta_ras_enable_features_input) {
			.block_id =  amdgpu_ras_block_to_ta(head->block),
			.error_type = amdgpu_ras_error_to_ta(head->type),
		};
	}

	/* Do not enable if it is not allowed. */
	WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
	/* Are we already in the state we are going to set? */
	if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
		return 0;

	ret = psp_ras_enable_features(&adev->psp, &info, enable);
	if (ret) {
		DRM_ERROR("RAS ERROR: %s %s feature failed ret %d\n",
				enable ? "enable":"disable",
				ras_block_str(head->block),
				ret);
		return -EINVAL;
	}

	/* setup the obj */
	__amdgpu_ras_feature_enable(adev, head, enable);

	return 0;
}
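/*
 * Typical call flow (illustrative sketch only): an IP block fills in a
 * ras_common_if describing itself and toggles the feature through the
 * wrapper above, e.g.
 *
 *	struct ras_common_if head = {
 *		.block = AMDGPU_RAS_BLOCK__GFX,	// assumed block enum value
 *		.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
 *		.sub_block_index = 0,
 *		.name = "gfx",
 *	};
 *
 *	amdgpu_ras_feature_enable(adev, &head, true);
 *
 * __amdgpu_ras_feature_enable() only manages the obj/feature bookkeeping and
 * is used when the vbios, rather than the psp, owns the actual enablement.
 */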

static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		/* bypass psp.
		 * aka just release the obj and the corresponding flags
		 */
		if (bypass) {
			if (__amdgpu_ras_feature_enable(adev, &obj->head, 0))
				break;
		} else {
			if (amdgpu_ras_feature_enable(adev, &obj->head, 0))
				break;
		}
	}

	return con->features;
}

static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
	int i;

	for (i = 0; i < ras_block_count; i++) {
		struct ras_common_if head = {
			.block = i,
			.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
			.sub_block_index = 0,
		};
		strcpy(head.name, ras_block_str(i));
		if (bypass) {
			/*
			 * bypass psp. the vbios enables ras for us,
			 * so just create the obj
			 */
			if (__amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		} else {
			if (amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		}
	}

	return con->features;
}
/* feature ctl end */

/* query/inject/cure begin */
int amdgpu_ras_error_query(struct amdgpu_device *adev,
		struct ras_query_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);

	if (!obj)
		return -EINVAL;
	/* TODO: might read registers to get the count */

	info->ue_count = obj->err_data.ue_count;
	info->ce_count = obj->err_data.ce_count;

	return 0;
}

/* wrapper of psp_ras_trigger_error */
int amdgpu_ras_error_inject(struct amdgpu_device *adev,
		struct ras_inject_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ta_ras_trigger_error_input block_info = {
		.block_id =  amdgpu_ras_block_to_ta(info->head.block),
		.inject_error_type = amdgpu_ras_error_to_ta(info->head.type),
		.sub_block_index = info->head.sub_block_index,
		.address = info->address,
		.value = info->value,
	};
	int ret = 0;

	if (!obj)
		return -EINVAL;

	ret = psp_ras_trigger_error(&adev->psp, &block_info);
	if (ret)
		DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n",
				ras_block_str(info->head.block),
				ret);

	return ret;
}

int amdgpu_ras_error_cure(struct amdgpu_device *adev,
		struct ras_cure_if *info)
{
	/* psp fw has no cure interface for now. */
	return 0;
}

/* get the total error counts on all IPs */
int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
		bool is_ce)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;
	struct ras_err_data data = {0, 0};

	if (!con)
		return -EINVAL;

	list_for_each_entry(obj, &con->head, node) {
		struct ras_query_if info = {
			.head = obj->head,
		};

		if (amdgpu_ras_error_query(adev, &info))
			return -EINVAL;

		data.ce_count += info.ce_count;
		data.ue_count += info.ue_count;
	}

	return is_ce ? data.ce_count : data.ue_count;
}
/* query/inject/cure end */


/* sysfs begin */

static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct amdgpu_ras *con =
		container_of(attr, struct amdgpu_ras, features_attr);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = ddev->dev_private;
	struct ras_common_if head;
	int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
	int i;
	ssize_t s;
	struct ras_manager *obj;

	s = scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);

	for (i = 0; i < ras_block_count; i++) {
		head.block = i;

		if (amdgpu_ras_is_feature_enabled(adev, &head)) {
			obj = amdgpu_ras_find_obj(adev, &head);
			s += scnprintf(&buf[s], PAGE_SIZE - s,
					"%s: %s\n",
					ras_block_str(i),
					ras_err_str(obj->head.type));
		} else
			s += scnprintf(&buf[s], PAGE_SIZE - s,
					"%s: disabled\n",
					ras_block_str(i));
	}

	return s;
}

static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct attribute *attrs[] = {
		&con->features_attr.attr,
		NULL
	};
	struct attribute_group group = {
		.name = "ras",
		.attrs = attrs,
	};

	con->features_attr = (struct device_attribute) {
		.attr = {
			.name = "features",
			.mode = S_IRUGO,
		},
			.show = amdgpu_ras_sysfs_features_read,
	};
	sysfs_attr_init(attrs[0]);

	return sysfs_create_group(&adev->dev->kobj, &group);
}

static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct attribute *attrs[] = {
		&con->features_attr.attr,
		NULL
	};
	struct attribute_group group = {
		.name = "ras",
		.attrs = attrs,
	};

	sysfs_remove_group(&adev->dev->kobj, &group);

	return 0;
}

int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
		struct ras_fs_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);

	if (!obj || obj->attr_inuse)
		return -EINVAL;

	get_obj(obj);

	memcpy(obj->fs_data.sysfs_name,
			head->sysfs_name,
			sizeof(obj->fs_data.sysfs_name));

	obj->sysfs_attr = (struct device_attribute){
		.attr = {
			.name = obj->fs_data.sysfs_name,
			.mode = S_IRUGO,
		},
			.show = amdgpu_ras_sysfs_read,
	};
	sysfs_attr_init(&obj->sysfs_attr.attr);

	if (sysfs_add_file_to_group(&adev->dev->kobj,
				&obj->sysfs_attr.attr,
				"ras")) {
		put_obj(obj);
		return -EINVAL;
	}

	obj->attr_inuse = 1;

	return 0;
}

int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	if (!obj || !obj->attr_inuse)
		return -EINVAL;

	sysfs_remove_file_from_group(&adev->dev->kobj,
				&obj->sysfs_attr.attr,
				"ras");
	obj->attr_inuse = 0;
	put_obj(obj);

	return 0;
}

static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		amdgpu_ras_sysfs_remove(adev, &obj->head);
	}

	amdgpu_ras_sysfs_remove_feature_node(adev);

	return 0;
}
/* sysfs end */

/* debugfs begin */
static int amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct drm_minor *minor = adev->ddev->primary;
	struct dentry *root = minor->debugfs_root, *dir;
	struct dentry *ent;

	dir = debugfs_create_dir("ras", root);
	if (IS_ERR(dir))
		return -EINVAL;

	con->dir = dir;

	ent = debugfs_create_file("ras_ctrl",
			S_IWUGO | S_IRUGO, con->dir,
			adev, &amdgpu_ras_debugfs_ctrl_ops);
	if (IS_ERR(ent)) {
		debugfs_remove(con->dir);
		return -EINVAL;
	}

	con->ent = ent;
	return 0;
}

int amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
		struct ras_fs_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
	struct dentry *ent;

	if (!obj || obj->ent)
		return -EINVAL;

	get_obj(obj);

	memcpy(obj->fs_data.debugfs_name,
			head->debugfs_name,
			sizeof(obj->fs_data.debugfs_name));

	ent = debugfs_create_file(obj->fs_data.debugfs_name,
			S_IWUGO | S_IRUGO, con->dir,
			obj, &amdgpu_ras_debugfs_ops);

	if (IS_ERR(ent)) {
		/* drop the reference taken above on failure */
		put_obj(obj);
		return -EINVAL;
	}

	obj->ent = ent;

	return 0;
}

int amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	if (!obj || !obj->ent)
		return 0;

	debugfs_remove(obj->ent);
	obj->ent = NULL;
	put_obj(obj);

	return 0;
}

static int amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		amdgpu_ras_debugfs_remove(adev, &obj->head);
	}

	debugfs_remove(con->ent);
	debugfs_remove(con->dir);
	con->dir = NULL;
	con->ent = NULL;

	return 0;
}
/* debugfs end */

/* ras fs */

static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
{
	amdgpu_ras_sysfs_create_feature_node(adev);
	amdgpu_ras_debugfs_create_ctrl_node(adev);

	return 0;
}

static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
{
	amdgpu_ras_debugfs_remove_all(adev);
	amdgpu_ras_sysfs_remove_all(adev);
	return 0;
}
/* ras fs end */

/* ih begin */
static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
{
	struct ras_ih_data *data = &obj->ih_data;
	struct amdgpu_iv_entry entry;
	int ret;

	while (data->rptr != data->wptr) {
		rmb();
		memcpy(&entry, &data->ring[data->rptr],
				data->element_size);

		wmb();
		data->rptr = (data->aligned_element_size +
				data->rptr) % data->ring_size;

		/* Let the IP handle its data, maybe we need to get the output
		 * from the callback to update the error type/count, etc.
		 */
		if (data->cb) {
			ret = data->cb(obj->adev, &entry);
			/* ue will trigger an interrupt, and in that case
			 * we need to do a reset to recover the whole system.
			 * But leave the IP to do that recovery, here we just
			 * dispatch the error.
			 */
			if (ret == AMDGPU_RAS_UE) {
				obj->err_data.ue_count++;
			}
			/* Might need to get the ce count by register, but not
			 * all IPs save a ce count, some IPs just use one or
			 * two bits to indicate that a ce happened.
			 */
		}
	}
}

static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
{
	struct ras_ih_data *data =
		container_of(work, struct ras_ih_data, ih_work);
	struct ras_manager *obj =
		container_of(data, struct ras_manager, ih_data);

	amdgpu_ras_interrupt_handler(obj);
}

int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
		struct ras_dispatch_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj)
		return -EINVAL;

	data = &obj->ih_data;
	if (data->inuse == 0)
		return 0;

	/* Might be overflow... */
	memcpy(&data->ring[data->wptr], info->entry,
			data->element_size);

	wmb();
	data->wptr = (data->aligned_element_size +
			data->wptr) % data->ring_size;

	schedule_work(&data->ih_work);

	return 0;
}

int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
		struct ras_ih_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj)
		return -EINVAL;

	data = &obj->ih_data;
	if (data->inuse == 0)
		return 0;

	cancel_work_sync(&data->ih_work);

	kfree(data->ring);
	memset(data, 0, sizeof(*data));
	put_obj(obj);

	return 0;
}

int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
		struct ras_ih_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj) {
		/* in case we register the IH before enabling the ras feature */
		obj = amdgpu_ras_create_obj(adev, &info->head);
		if (!obj)
			return -EINVAL;
	} else
		get_obj(obj);

	data = &obj->ih_data;
	/* add the callback, etc. */
	*data = (struct ras_ih_data) {
		.inuse = 0,
		.cb = info->cb,
		.element_size = sizeof(struct amdgpu_iv_entry),
		.rptr = 0,
		.wptr = 0,
	};

	INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler);

	data->aligned_element_size = ALIGN(data->element_size, 8);
	/* the ring can store 64 iv entries. */
	data->ring_size = 64 * data->aligned_element_size;
	data->ring = kmalloc(data->ring_size, GFP_KERNEL);
	if (!data->ring) {
		put_obj(obj);
		return -ENOMEM;
	}

	/* IH is ready */
	data->inuse = 1;

	return 0;
}
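/*
 * Illustrative sketch (not taken from an existing IP): a block that owns a
 * RAS-capable interrupt source would typically register its bottom-half
 * callback once and drop it at fini time, e.g.
 *
 *	struct ras_ih_if ih_info = {
 *		.head = *ras_if,		// ras_common_if for the block
 *		.cb = my_ip_process_ras_cb,	// hypothetical IP callback
 *	};
 *
 *	amdgpu_ras_interrupt_add_handler(adev, &ih_info);
 *	...
 *	amdgpu_ras_interrupt_remove_handler(adev, &ih_info);
 *
 * The top half then forwards each IV entry through
 * amdgpu_ras_interrupt_dispatch().
 */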

static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		struct ras_ih_if info = {
			.head = obj->head,
		};
		amdgpu_ras_interrupt_remove_handler(adev, &info);
	}

	return 0;
}
/* ih end */

/* recovery begin */
static void amdgpu_ras_do_recovery(struct work_struct *work)
{
	struct amdgpu_ras *ras =
		container_of(work, struct amdgpu_ras, recovery_work);

	amdgpu_device_gpu_recover(ras->adev, 0);
	atomic_set(&ras->in_recovery, 0);
}

static int amdgpu_ras_release_vram(struct amdgpu_device *adev,
		struct amdgpu_bo **bo_ptr)
{
	/* no need to free it actually. */
	amdgpu_bo_free_kernel(bo_ptr, NULL, NULL);
	return 0;
}

/* reserve vram with size@offset */
static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev,
		uint64_t offset, uint64_t size,
		struct amdgpu_bo **bo_ptr)
{
	struct ttm_operation_ctx ctx = { false, false };
	struct amdgpu_bo_param bp;
	int r = 0;
	int i;
	struct amdgpu_bo *bo;

	if (bo_ptr)
		*bo_ptr = NULL;
	memset(&bp, 0, sizeof(bp));
	bp.size = size;
	bp.byte_align = PAGE_SIZE;
	bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
	bp.flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS |
		AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
	bp.type = ttm_bo_type_kernel;
	bp.resv = NULL;

	r = amdgpu_bo_create(adev, &bp, &bo);
	if (r)
		return -EINVAL;

	r = amdgpu_bo_reserve(bo, false);
	if (r)
		goto error_reserve;

	offset = ALIGN(offset, PAGE_SIZE);
	for (i = 0; i < bo->placement.num_placement; ++i) {
		bo->placements[i].fpfn = offset >> PAGE_SHIFT;
		bo->placements[i].lpfn = (offset + size) >> PAGE_SHIFT;
	}

	ttm_bo_mem_put(&bo->tbo, &bo->tbo.mem);
	r = ttm_bo_mem_space(&bo->tbo, &bo->placement, &bo->tbo.mem, &ctx);
	if (r)
		goto error_pin;

	r = amdgpu_bo_pin_restricted(bo,
			AMDGPU_GEM_DOMAIN_VRAM,
			offset,
			offset + size);
	if (r)
		goto error_pin;

	if (bo_ptr)
		*bo_ptr = bo;

	amdgpu_bo_unreserve(bo);
	return r;

error_pin:
	amdgpu_bo_unreserve(bo);
error_reserve:
	amdgpu_bo_unref(&bo);
	return r;
}

/* alloc/realloc bps array */
static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
		struct ras_err_handler_data *data, int pages)
{
	unsigned int old_space = data->count + data->space_left;
	unsigned int new_space = old_space + pages;
	unsigned int align_space = ALIGN(new_space, 1024);
	void *tmp = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);

	if (!tmp)
		return -ENOMEM;

	if (data->bps) {
		memcpy(tmp, data->bps,
				data->count * sizeof(*data->bps));
		kfree(data->bps);
	}

	data->bps = tmp;
	data->space_left += align_space - old_space;
	return 0;
}

/* it deals with vram only. */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
		unsigned long *bps, int pages)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	int i = pages;
	int ret = 0;

	if (!con || !con->eh_data || !bps || pages <= 0)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;

	if (data->space_left <= pages)
		if (amdgpu_ras_realloc_eh_data_space(adev, data, pages)) {
			ret = -ENOMEM;
			goto out;
		}

	while (i--)
		data->bps[data->count++].bp = bps[i];

	data->space_left -= pages;
out:
	mutex_unlock(&con->recovery_lock);

	return ret;
}

/* called in gpu recovery/init */
int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	uint64_t bp;
	struct amdgpu_bo *bo;
	int i;

	if (!con || !con->eh_data)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;
	/* reserve vram at driver post stage. */
	for (i = data->last_reserved; i < data->count; i++) {
		bp = data->bps[i].bp;

		if (amdgpu_ras_reserve_vram(adev, bp << PAGE_SHIFT,
					PAGE_SIZE, &bo))
			DRM_ERROR("RAS ERROR: reserve vram %llx failed\n", bp);

		data->bps[i].bo = bo;
		data->last_reserved = i + 1;
	}
out:
	mutex_unlock(&con->recovery_lock);
	return 0;
}

/* called when the driver unloads */
static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	struct amdgpu_bo *bo;
	int i;

	if (!con || !con->eh_data)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;

	for (i = data->last_reserved - 1; i >= 0; i--) {
		bo = data->bps[i].bo;

		amdgpu_ras_release_vram(adev, &bo);

		data->bps[i].bo = bo;
		data->last_reserved = i;
	}
out:
	mutex_unlock(&con->recovery_lock);
	return 0;
}

static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
{
	/* TODO
	 * write the array to eeprom when the SMU is disabled.
	 */
	return 0;
}

static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
{
	/* TODO
	 * read the array from eeprom when the SMU is disabled.
	 */
	return 0;
}

static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data **data = &con->eh_data;

	*data = kmalloc(sizeof(**data),
			GFP_KERNEL|__GFP_ZERO);
	if (!*data)
		return -ENOMEM;

	mutex_init(&con->recovery_lock);
	INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
	atomic_set(&con->in_recovery, 0);
	con->adev = adev;

	amdgpu_ras_load_bad_pages(adev);
	amdgpu_ras_reserve_bad_pages(adev);

	return 0;
}

static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data = con->eh_data;

	cancel_work_sync(&con->recovery_work);
	amdgpu_ras_save_bad_pages(adev);
	amdgpu_ras_release_bad_pages(adev);

	mutex_lock(&con->recovery_lock);
	con->eh_data = NULL;
	kfree(data->bps);
	kfree(data);
	mutex_unlock(&con->recovery_lock);

	return 0;
}
/* recovery end */

/*
 * check the hardware's ras ability, which will be saved in hw_supported.
 * if the hardware does not support ras, we can skip some ras initialization
 * and forbid some ras operations from IPs.
 * if software itself, say a boot parameter, limits the ras ability, we still
 * need to allow IPs to do some limited operations, like disable. In such a
 * case, we have to initialize ras as normal, but need to check whether an
 * operation is allowed or not in each function.
 */
static void amdgpu_ras_check_supported(struct amdgpu_device *adev,
		uint32_t *hw_supported, uint32_t *supported)
{
	*hw_supported = 0;
	*supported = 0;

	if (amdgpu_sriov_vf(adev) ||
			adev->asic_type != CHIP_VEGA20)
		return;

	if (adev->is_atom_fw &&
			(amdgpu_atomfirmware_mem_ecc_supported(adev) ||
			 amdgpu_atomfirmware_sram_ecc_supported(adev)))
		*hw_supported = AMDGPU_RAS_BLOCK_MASK;

	*supported = amdgpu_ras_enable == 0 ?
				0 : *hw_supported & amdgpu_ras_mask;
}
1341 
1342 int amdgpu_ras_init(struct amdgpu_device *adev)
1343 {
1344 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1345 
1346 	if (con)
1347 		return 0;
1348 
1349 	con = kmalloc(sizeof(struct amdgpu_ras) +
1350 			sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT,
1351 			GFP_KERNEL|__GFP_ZERO);
1352 	if (!con)
1353 		return -ENOMEM;
1354 
1355 	con->objs = (struct ras_manager *)(con + 1);
1356 
1357 	amdgpu_ras_set_context(adev, con);
1358 
1359 	amdgpu_ras_check_supported(adev, &con->hw_supported,
1360 			&con->supported);
1361 	con->features = 0;
1362 	INIT_LIST_HEAD(&con->head);
1363 	/* Might need get this flag from vbios. */
1364 	con->flags = RAS_DEFAULT_FLAGS;
1365 
1366 	if (amdgpu_ras_recovery_init(adev))
1367 		goto recovery_out;
1368 
1369 	amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;
1370 
1371 	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
1372 		amdgpu_ras_enable_all_features(adev, 1);
1373 
1374 	if (amdgpu_ras_fs_init(adev))
1375 		goto fs_out;
1376 
1377 	amdgpu_ras_self_test(adev);
1378 
1379 	DRM_INFO("RAS INFO: ras initialized successfully, "
1380 			"hardware ability[%x] ras_mask[%x]\n",
1381 			con->hw_supported, con->supported);
1382 	return 0;
1383 fs_out:
1384 	amdgpu_ras_recovery_fini(adev);
1385 recovery_out:
1386 	amdgpu_ras_set_context(adev, NULL);
1387 	kfree(con);
1388 
1389 	return -EINVAL;
1390 }
1391 
1392 /* do some init work after IP late init as dependence */
1393 void amdgpu_ras_post_init(struct amdgpu_device *adev)
1394 {
1395 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1396 	struct ras_manager *obj, *tmp;
1397 
1398 	if (!con)
1399 		return;
1400 
1401 	/* We enable ras on all hw_supported block, but as boot parameter might
1402 	 * disable some of them and one or more IP has not implemented yet.
1403 	 * So we disable them on behalf.
1404 	 */
1405 	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
1406 		list_for_each_entry_safe(obj, tmp, &con->head, node) {
1407 			if (!amdgpu_ras_is_supported(adev, obj->head.block)) {
1408 				amdgpu_ras_feature_enable(adev, &obj->head, 0);
1409 				/* there should be no any reference. */
1410 				WARN_ON(alive_obj(obj));
1411 			}
1412 		};
1413 	}
1414 }
1415 
1416 /* do some fini work before IP fini as dependence */
1417 int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
1418 {
1419 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1420 
1421 	if (!con)
1422 		return 0;
1423 
1424 	/* Need disable ras on all IPs here before ip [hw/sw]fini */
1425 	amdgpu_ras_disable_all_features(adev, 0);
1426 	amdgpu_ras_recovery_fini(adev);
1427 	return 0;
1428 }
1429 
1430 int amdgpu_ras_fini(struct amdgpu_device *adev)
1431 {
1432 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1433 
1434 	if (!con)
1435 		return 0;
1436 
1437 	amdgpu_ras_fs_fini(adev);
1438 	amdgpu_ras_interrupt_remove_all(adev);
1439 
1440 	WARN(con->features, "Feature mask is not cleared");
1441 
1442 	if (con->features)
1443 		amdgpu_ras_disable_all_features(adev, 1);
1444 
1445 	amdgpu_ras_set_context(adev, NULL);
1446 	kfree(con);
1447 
1448 	return 0;
1449 }
1450