1 /*
2  * Copyright 2018 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  *
22  *
23  */
24 #include <linux/list.h>
25 #include "amdgpu.h"
26 #include "amdgpu_xgmi.h"
27 #include "amdgpu_smu.h"
28 #include "amdgpu_ras.h"
29 #include "soc15.h"
30 #include "df/df_3_6_offset.h"
31 #include "xgmi/xgmi_4_0_0_smn.h"
32 #include "xgmi/xgmi_4_0_0_sh_mask.h"
33 #include "wafl/wafl2_4_0_0_smn.h"
34 #include "wafl/wafl2_4_0_0_sh_mask.h"
35 
36 static DEFINE_MUTEX(xgmi_mutex);
37 
38 #define AMDGPU_MAX_XGMI_HIVE			8
39 #define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE		4
40 
41 static struct amdgpu_hive_info xgmi_hives[AMDGPU_MAX_XGMI_HIVE];
static unsigned hive_count;
43 
44 static const int xgmi_pcs_err_status_reg_vg20[] = {
45 	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS,
46 	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000,
47 };
48 
49 static const int wafl_pcs_err_status_reg_vg20[] = {
50 	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS,
51 	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
52 };
53 
54 static const int xgmi_pcs_err_status_reg_arct[] = {
55 	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS,
56 	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000,
57 	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x500000,
58 	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x600000,
59 	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x700000,
60 	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x800000,
61 };
62 
/* same as vg20 */
64 static const int wafl_pcs_err_status_reg_arct[] = {
65 	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS,
66 	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
67 };
68 
69 static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = {
70 	{"XGMI PCS DataLossErr",
71 	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)},
72 	{"XGMI PCS TrainingErr",
73 	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TrainingErr)},
74 	{"XGMI PCS CRCErr",
75 	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, CRCErr)},
76 	{"XGMI PCS BERExceededErr",
77 	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, BERExceededErr)},
78 	{"XGMI PCS TxMetaDataErr",
79 	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TxMetaDataErr)},
80 	{"XGMI PCS ReplayBufParityErr",
81 	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayBufParityErr)},
82 	{"XGMI PCS DataParityErr",
83 	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataParityErr)},
84 	{"XGMI PCS ReplayFifoOverflowErr",
85 	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
86 	{"XGMI PCS ReplayFifoUnderflowErr",
87 	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
88 	{"XGMI PCS ElasticFifoOverflowErr",
89 	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
90 	{"XGMI PCS DeskewErr",
91 	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DeskewErr)},
92 	{"XGMI PCS DataStartupLimitErr",
93 	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataStartupLimitErr)},
94 	{"XGMI PCS FCInitTimeoutErr",
95 	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, FCInitTimeoutErr)},
96 	{"XGMI PCS RecoveryTimeoutErr",
97 	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
98 	{"XGMI PCS ReadySerialTimeoutErr",
99 	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
100 	{"XGMI PCS ReadySerialAttemptErr",
101 	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
102 	{"XGMI PCS RecoveryAttemptErr",
103 	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryAttemptErr)},
104 	{"XGMI PCS RecoveryRelockAttemptErr",
105 	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
106 };
107 
108 static const struct amdgpu_pcs_ras_field wafl_pcs_ras_fields[] = {
109 	{"WAFL PCS DataLossErr",
110 	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataLossErr)},
111 	{"WAFL PCS TrainingErr",
112 	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TrainingErr)},
113 	{"WAFL PCS CRCErr",
114 	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, CRCErr)},
115 	{"WAFL PCS BERExceededErr",
116 	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, BERExceededErr)},
117 	{"WAFL PCS TxMetaDataErr",
118 	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TxMetaDataErr)},
119 	{"WAFL PCS ReplayBufParityErr",
120 	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayBufParityErr)},
121 	{"WAFL PCS DataParityErr",
122 	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataParityErr)},
123 	{"WAFL PCS ReplayFifoOverflowErr",
124 	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
125 	{"WAFL PCS ReplayFifoUnderflowErr",
126 	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
127 	{"WAFL PCS ElasticFifoOverflowErr",
128 	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
129 	{"WAFL PCS DeskewErr",
130 	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DeskewErr)},
131 	{"WAFL PCS DataStartupLimitErr",
132 	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataStartupLimitErr)},
133 	{"WAFL PCS FCInitTimeoutErr",
134 	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, FCInitTimeoutErr)},
135 	{"WAFL PCS RecoveryTimeoutErr",
136 	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
137 	{"WAFL PCS ReadySerialTimeoutErr",
138 	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
139 	{"WAFL PCS ReadySerialAttemptErr",
140 	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
141 	{"WAFL PCS RecoveryAttemptErr",
142 	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryAttemptErr)},
143 	{"WAFL PCS RecoveryRelockAttemptErr",
144 	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
145 };
146 
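/*
 * Despite its name this helper takes no lock; it simply exposes the head of
 * the hive's device list to callers.
 */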
147 void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info *hive)
148 {
149 	return &hive->device_list;
150 }
151 
152 /**
153  * DOC: AMDGPU XGMI Support
154  *
155  * XGMI is a high speed interconnect that joins multiple GPU cards
156  * into a homogeneous memory space that is organized by a collective
157  * hive ID and individual node IDs, both of which are 64-bit numbers.
158  *
159  * The file xgmi_device_id contains the unique per GPU device ID and
160  * is stored in the /sys/class/drm/card${cardno}/device/ directory.
161  *
162  * Inside the device directory a sub-directory 'xgmi_hive_info' is
163  * created which contains the hive ID and the list of nodes.
164  *
165  * The hive ID is stored in:
166  *   /sys/class/drm/card${cardno}/device/xgmi_hive_info/xgmi_hive_id
167  *
168  * The node information is stored in numbered directories:
169  *   /sys/class/drm/card${cardno}/device/xgmi_hive_info/node${nodeno}/xgmi_device_id
170  *
 * Each device has its own xgmi_hive_info directory with a mirrored
 * set of node sub-directories.
 *
 * The XGMI memory space is built by contiguously adding the
 * power-of-two-padded VRAM space of each node to the next.
176  *
177  */
178 
179 
180 static ssize_t amdgpu_xgmi_show_hive_id(struct device *dev,
181 		struct device_attribute *attr, char *buf)
182 {
183 	struct amdgpu_hive_info *hive =
184 			container_of(attr, struct amdgpu_hive_info, dev_attr);
185 
186 	return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id);
187 }
188 
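/*
 * Create the "xgmi_hive_info" kobject under the first device's sysfs
 * directory and expose the hive id through it.
 */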
189 static int amdgpu_xgmi_sysfs_create(struct amdgpu_device *adev,
190 				    struct amdgpu_hive_info *hive)
191 {
192 	int ret = 0;
193 
194 	if (WARN_ON(hive->kobj))
195 		return -EINVAL;
196 
197 	hive->kobj = kobject_create_and_add("xgmi_hive_info", &adev->dev->kobj);
198 	if (!hive->kobj) {
199 		dev_err(adev->dev, "XGMI: Failed to allocate sysfs entry!\n");
200 		return -EINVAL;
201 	}
202 
203 	hive->dev_attr = (struct device_attribute) {
204 		.attr = {
205 			.name = "xgmi_hive_id",
206 			.mode = S_IRUGO,
207 
208 		},
209 		.show = amdgpu_xgmi_show_hive_id,
210 	};
211 
212 	ret = sysfs_create_file(hive->kobj, &hive->dev_attr.attr);
213 	if (ret) {
214 		dev_err(adev->dev, "XGMI: Failed to create device file xgmi_hive_id\n");
215 		kobject_del(hive->kobj);
216 		kobject_put(hive->kobj);
217 		hive->kobj = NULL;
218 	}
219 
220 	return ret;
221 }
222 
223 static void amdgpu_xgmi_sysfs_destroy(struct amdgpu_device *adev,
224 				    struct amdgpu_hive_info *hive)
225 {
226 	sysfs_remove_file(hive->kobj, &hive->dev_attr.attr);
227 	kobject_del(hive->kobj);
228 	kobject_put(hive->kobj);
229 	hive->kobj = NULL;
230 }
231 
232 static ssize_t amdgpu_xgmi_show_device_id(struct device *dev,
233 				     struct device_attribute *attr,
234 				     char *buf)
235 {
236 	struct drm_device *ddev = dev_get_drvdata(dev);
237 	struct amdgpu_device *adev = ddev->dev_private;
238 
239 	return snprintf(buf, PAGE_SIZE, "%llu\n", adev->gmc.xgmi.node_id);
240 
241 }
242 
243 #define AMDGPU_XGMI_SET_FICAA(o)	((o) | 0x456801)
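/*
 * Read the accumulated xGMI error count through the DF indirect (FICAA)
 * interface and clear the status afterwards.  The 0x456801 constant above
 * carries the FICAA control bits and is treated as opaque here.
 */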
244 static ssize_t amdgpu_xgmi_show_error(struct device *dev,
245 				      struct device_attribute *attr,
246 				      char *buf)
247 {
248 	struct drm_device *ddev = dev_get_drvdata(dev);
249 	struct amdgpu_device *adev = ddev->dev_private;
250 	uint32_t ficaa_pie_ctl_in, ficaa_pie_status_in;
251 	uint64_t fica_out;
252 	unsigned int error_count = 0;
253 
254 	ficaa_pie_ctl_in = AMDGPU_XGMI_SET_FICAA(0x200);
255 	ficaa_pie_status_in = AMDGPU_XGMI_SET_FICAA(0x208);
256 
257 	fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_ctl_in);
258 	if (fica_out != 0x1f)
259 		pr_err("xGMI error counters not enabled!\n");
260 
261 	fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_status_in);
262 
263 	if ((fica_out & 0xffff) == 2)
264 		error_count = ((fica_out >> 62) & 0x1) + (fica_out >> 63);
265 
266 	adev->df.funcs->set_fica(adev, ficaa_pie_status_in, 0, 0);
267 
	return snprintf(buf, PAGE_SIZE, "%u\n", error_count);
269 }
270 
271 
272 static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL);
273 static DEVICE_ATTR(xgmi_error, S_IRUGO, amdgpu_xgmi_show_error, NULL);
274 
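/*
 * Create the per-device sysfs files (xgmi_device_id, xgmi_error) and the
 * links between this device and the hive info folder.
 */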
275 static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev,
276 					 struct amdgpu_hive_info *hive)
277 {
278 	int ret = 0;
279 	char node[10] = { 0 };
280 
281 	/* Create xgmi device id file */
282 	ret = device_create_file(adev->dev, &dev_attr_xgmi_device_id);
283 	if (ret) {
284 		dev_err(adev->dev, "XGMI: Failed to create device file xgmi_device_id\n");
285 		return ret;
286 	}
287 
288 	/* Create xgmi error file */
289 	ret = device_create_file(adev->dev, &dev_attr_xgmi_error);
290 	if (ret)
291 		pr_err("failed to create xgmi_error\n");
292 
293 
	/* The hive info folder lives under the first device; every other device links to it */
295 	if (adev != hive->adev) {
296 		ret = sysfs_create_link(&adev->dev->kobj, hive->kobj,
297 					"xgmi_hive_info");
298 		if (ret) {
299 			dev_err(adev->dev, "XGMI: Failed to create link to hive info");
300 			goto remove_file;
301 		}
302 	}
303 
304 	sprintf(node, "node%d", hive->number_devices);
	/* Create a sysfs link from the hive folder back to this device */
306 	ret = sysfs_create_link(hive->kobj, &adev->dev->kobj, node);
307 	if (ret) {
308 		dev_err(adev->dev, "XGMI: Failed to create link from hive info");
309 		goto remove_link;
310 	}
311 
312 	goto success;
313 
314 
315 remove_link:
	sysfs_remove_link(&adev->dev->kobj, "xgmi_hive_info");
317 
remove_file:
	device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
	device_remove_file(adev->dev, &dev_attr_xgmi_error);
320 
321 success:
322 	return ret;
323 }
324 
325 static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev,
326 					  struct amdgpu_hive_info *hive)
327 {
	device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
	device_remove_file(adev->dev, &dev_attr_xgmi_error);
	sysfs_remove_link(&adev->dev->kobj, "xgmi_hive_info");
	sysfs_remove_link(hive->kobj, adev->ddev->unique);
331 }
332 
333 
334 
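/*
 * Look up the hive this device belongs to by hive id, creating a new entry
 * (including its sysfs node) on first use.  When @lock is set the hive's
 * hive_lock is taken before returning and the caller must drop it.
 */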
335 struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock)
336 {
337 	int i;
338 	struct amdgpu_hive_info *tmp;
339 
340 	if (!adev->gmc.xgmi.hive_id)
341 		return NULL;
342 
343 	mutex_lock(&xgmi_mutex);
344 
345 	for (i = 0 ; i < hive_count; ++i) {
346 		tmp = &xgmi_hives[i];
347 		if (tmp->hive_id == adev->gmc.xgmi.hive_id) {
348 			if (lock)
349 				mutex_lock(&tmp->hive_lock);
350 			mutex_unlock(&xgmi_mutex);
351 			return tmp;
352 		}
353 	}
354 	if (i >= AMDGPU_MAX_XGMI_HIVE) {
355 		mutex_unlock(&xgmi_mutex);
356 		return NULL;
357 	}
358 
	/* initialize a new hive if one does not exist yet */
360 	tmp = &xgmi_hives[hive_count++];
361 
362 	if (amdgpu_xgmi_sysfs_create(adev, tmp)) {
363 		mutex_unlock(&xgmi_mutex);
364 		return NULL;
365 	}
366 
367 	tmp->adev = adev;
368 	tmp->hive_id = adev->gmc.xgmi.hive_id;
369 	INIT_LIST_HEAD(&tmp->device_list);
370 	mutex_init(&tmp->hive_lock);
371 	mutex_init(&tmp->reset_lock);
372 	task_barrier_init(&tmp->tb);
373 
374 	if (lock)
375 		mutex_lock(&tmp->hive_lock);
376 	tmp->pstate = -1;
377 	mutex_unlock(&xgmi_mutex);
378 
379 	return tmp;
380 }
381 
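/*
 * Request a new xGMI link pstate for this device.  The hive-wide pstate is
 * only updated once every device in the hive reports the same value (or on
 * the Vega20 high-pstate special case).
 */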
382 int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
383 {
384 	int ret = 0;
385 	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
386 	struct amdgpu_device *tmp_adev;
387 	bool update_hive_pstate = true;
388 	bool is_high_pstate = pstate && adev->asic_type == CHIP_VEGA20;
389 
390 	if (!hive)
391 		return 0;
392 
393 	mutex_lock(&hive->hive_lock);
394 
395 	if (hive->pstate == pstate) {
396 		adev->pstate = is_high_pstate ? pstate : adev->pstate;
397 		goto out;
398 	}
399 
400 	dev_dbg(adev->dev, "Set xgmi pstate %d.\n", pstate);
401 
402 	ret = amdgpu_dpm_set_xgmi_pstate(adev, pstate);
403 	if (ret) {
404 		dev_err(adev->dev,
405 			"XGMI: Set pstate failure on device %llx, hive %llx, ret %d",
406 			adev->gmc.xgmi.node_id,
407 			adev->gmc.xgmi.hive_id, ret);
408 		goto out;
409 	}
410 
411 	/* Update device pstate */
412 	adev->pstate = pstate;
413 
414 	/*
415 	 * Update the hive pstate only all devices of the hive
416 	 * are in the same pstate
417 	 */
418 	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
419 		if (tmp_adev->pstate != adev->pstate) {
420 			update_hive_pstate = false;
421 			break;
422 		}
423 	}
424 	if (update_hive_pstate || is_high_pstate)
425 		hive->pstate = pstate;
426 
427 out:
428 	mutex_unlock(&hive->hive_lock);
429 
430 	return ret;
431 }
432 
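/* Push this device's current view of the hive topology to its PSP. */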
433 int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev)
434 {
435 	int ret = -EINVAL;
436 
	/* Each PSP needs to be given the latest topology */
438 	ret = psp_xgmi_set_topology_info(&adev->psp,
439 					 hive->number_devices,
440 					 &adev->psp.xgmi_context.top_info);
441 	if (ret)
442 		dev_err(adev->dev,
443 			"XGMI: Set topology failure on device %llx, hive %llx, ret %d",
444 			adev->gmc.xgmi.node_id,
445 			adev->gmc.xgmi.hive_id, ret);
446 
447 	return ret;
448 }
449 
450 
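/*
 * Return the number of xGMI hops between @adev and @peer_adev according to
 * the topology reported by the PSP, or -EINVAL if the peer is not found.
 */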
451 int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
452 		struct amdgpu_device *peer_adev)
453 {
454 	struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
455 	int i;
456 
457 	for (i = 0 ; i < top->num_nodes; ++i)
458 		if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
459 			return top->nodes[i].num_hops;
	return -EINVAL;
461 }
462 
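/*
 * Register this device with its xGMI hive: query (or fake, when no PSP is
 * present) the hive and node ids, add the device to the hive list, exchange
 * topology information with every other device in the hive and create the
 * sysfs entries.
 */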
463 int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
464 {
465 	struct psp_xgmi_topology_info *top_info;
466 	struct amdgpu_hive_info *hive;
467 	struct amdgpu_xgmi	*entry;
468 	struct amdgpu_device *tmp_adev = NULL;
469 
470 	int count = 0, ret = 0;
471 
472 	if (!adev->gmc.xgmi.supported)
473 		return 0;
474 
475 	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
476 		ret = psp_xgmi_initialize(&adev->psp);
477 		if (ret) {
478 			dev_err(adev->dev,
479 				"XGMI: Failed to initialize xgmi session\n");
480 			return ret;
481 		}
482 
483 		ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id);
484 		if (ret) {
485 			dev_err(adev->dev,
486 				"XGMI: Failed to get hive id\n");
487 			return ret;
488 		}
489 
490 		ret = psp_xgmi_get_node_id(&adev->psp, &adev->gmc.xgmi.node_id);
491 		if (ret) {
492 			dev_err(adev->dev,
493 				"XGMI: Failed to get node id\n");
494 			return ret;
495 		}
496 	} else {
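		/* No PSP block: fall back to fixed dummy hive/node ids */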
497 		adev->gmc.xgmi.hive_id = 16;
498 		adev->gmc.xgmi.node_id = adev->gmc.xgmi.physical_node_id + 16;
499 	}
500 
501 	hive = amdgpu_get_xgmi_hive(adev, 1);
502 	if (!hive) {
503 		ret = -EINVAL;
504 		dev_err(adev->dev,
505 			"XGMI: node 0x%llx, can not match hive 0x%llx in the hive list.\n",
506 			adev->gmc.xgmi.node_id, adev->gmc.xgmi.hive_id);
507 		goto exit;
508 	}
509 
510 	/* Set default device pstate */
511 	adev->pstate = -1;
512 
513 	top_info = &adev->psp.xgmi_context.top_info;
514 
515 	list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
516 	list_for_each_entry(entry, &hive->device_list, head)
517 		top_info->nodes[count++].node_id = entry->node_id;
518 	top_info->num_nodes = count;
519 	hive->number_devices = count;
520 
521 	task_barrier_add_task(&hive->tb);
522 
523 	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
524 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			/* update the node list on every other device in the hive */
526 			if (tmp_adev != adev) {
527 				top_info = &tmp_adev->psp.xgmi_context.top_info;
528 				top_info->nodes[count - 1].node_id =
529 					adev->gmc.xgmi.node_id;
530 				top_info->num_nodes = count;
531 			}
532 			ret = amdgpu_xgmi_update_topology(hive, tmp_adev);
533 			if (ret)
				goto exit_unlock;
535 		}
536 
537 		/* get latest topology info for each device from psp */
538 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
539 			ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
540 					&tmp_adev->psp.xgmi_context.top_info);
541 			if (ret) {
542 				dev_err(tmp_adev->dev,
543 					"XGMI: Get topology failure on device %llx, hive %llx, ret %d",
544 					tmp_adev->gmc.xgmi.node_id,
545 					tmp_adev->gmc.xgmi.hive_id, ret);
				/* TODO: decide whether to continue with some nodes failed or to disable the whole hive */
				goto exit_unlock;
548 			}
549 		}
550 	}
551 
	if (!ret)
		ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);

exit_unlock:
	mutex_unlock(&hive->hive_lock);
557 exit:
558 	if (!ret)
559 		dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n",
560 			 adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id);
561 	else
562 		dev_err(adev->dev, "XGMI: Failed to add node %d, hive 0x%llx ret: %d\n",
563 			adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id,
564 			ret);
565 
566 	return ret;
567 }
568 
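/*
 * Detach this device from its hive; the hive itself (sysfs entry and locks)
 * is torn down when the last device is removed.  The PSP xGMI session is
 * terminated in either case.
 */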
569 int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
570 {
571 	struct amdgpu_hive_info *hive;
572 
573 	if (!adev->gmc.xgmi.supported)
574 		return -EINVAL;
575 
576 	hive = amdgpu_get_xgmi_hive(adev, 1);
577 	if (!hive)
578 		return -EINVAL;
579 
	/* destroy the hive once the last device has been removed */
	if (!(--hive->number_devices)) {
581 		amdgpu_xgmi_sysfs_destroy(adev, hive);
582 		mutex_destroy(&hive->hive_lock);
583 		mutex_destroy(&hive->reset_lock);
584 	} else {
585 		task_barrier_rem_task(&hive->tb);
586 		amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
587 		mutex_unlock(&hive->hive_lock);
588 	}
589 
590 	return psp_xgmi_terminate(&adev->psp);
591 }
592 
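/*
 * Register the XGMI/WAFL block with the RAS framework (error counting only,
 * no interrupt handler) if this device is part of a multi-node hive.
 */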
593 int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev)
594 {
595 	int r;
596 	struct ras_ih_if ih_info = {
597 		.cb = NULL,
598 	};
599 	struct ras_fs_if fs_info = {
600 		.sysfs_name = "xgmi_wafl_err_count",
601 	};
602 
603 	if (!adev->gmc.xgmi.supported ||
604 	    adev->gmc.xgmi.num_physical_nodes == 0)
605 		return 0;
606 
607 	if (!adev->gmc.xgmi.ras_if) {
608 		adev->gmc.xgmi.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
609 		if (!adev->gmc.xgmi.ras_if)
610 			return -ENOMEM;
611 		adev->gmc.xgmi.ras_if->block = AMDGPU_RAS_BLOCK__XGMI_WAFL;
612 		adev->gmc.xgmi.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
613 		adev->gmc.xgmi.ras_if->sub_block_index = 0;
614 		strcpy(adev->gmc.xgmi.ras_if->name, "xgmi_wafl");
615 	}
616 	ih_info.head = fs_info.head = *adev->gmc.xgmi.ras_if;
617 	r = amdgpu_ras_late_init(adev, adev->gmc.xgmi.ras_if,
618 				 &fs_info, &ih_info);
619 	if (r || !amdgpu_ras_is_supported(adev, adev->gmc.xgmi.ras_if->block)) {
620 		kfree(adev->gmc.xgmi.ras_if);
621 		adev->gmc.xgmi.ras_if = NULL;
622 	}
623 
624 	return r;
625 }
626 
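/* Unregister the XGMI/WAFL block from RAS and free its common-if struct. */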
627 void amdgpu_xgmi_ras_fini(struct amdgpu_device *adev)
628 {
629 	if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL) &&
630 			adev->gmc.xgmi.ras_if) {
631 		struct ras_common_if *ras_if = adev->gmc.xgmi.ras_if;
632 		struct ras_ih_if ih_info = {
633 			.cb = NULL,
634 		};
635 
636 		amdgpu_ras_late_fini(adev, ras_if, &ih_info);
637 		kfree(ras_if);
638 	}
639 }
640 
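/*
 * Add the DRAM base address of this device's DF instance to @addr.  DF
 * C-states are disallowed around the register reads; if the required DF
 * callbacks are missing, @addr is returned unchanged.
 */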
641 uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
642 					   uint64_t addr)
643 {
644 	uint32_t df_inst_id;
645 	uint64_t dram_base_addr = 0;
646 	const struct amdgpu_df_funcs *df_funcs = adev->df.funcs;
647 
648 	if ((!df_funcs)                 ||
649 	    (!df_funcs->get_df_inst_id) ||
650 	    (!df_funcs->get_dram_base_addr)) {
651 		dev_warn(adev->dev,
652 			 "XGMI: relative phy_addr algorithm is not supported\n");
653 		return addr;
654 	}
655 
656 	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) {
657 		dev_warn(adev->dev,
658 			 "failed to disable DF-Cstate, DF register may not be accessible\n");
659 		return addr;
660 	}
661 
662 	df_inst_id = df_funcs->get_df_inst_id(adev);
663 	dram_base_addr = df_funcs->get_dram_base_addr(adev, df_inst_id);
664 
665 	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW))
666 		dev_warn(adev->dev, "failed to enable DF-Cstate\n");
667 
668 	return addr + dram_base_addr;
669 }
670 
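/*
 * Decode one PCS error status word: walk the per-field table (xGMI or WAFL
 * depending on @is_xgmi_pcs) and add every non-zero field to @ue_count.
 * Only uncorrectable errors are reported; @ce_count is left untouched.
 */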
671 static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
672 					      uint32_t value,
673 					      uint32_t *ue_count,
674 					      uint32_t *ce_count,
675 					      bool is_xgmi_pcs)
676 {
677 	int i;
678 	int ue_cnt;
679 
680 	if (is_xgmi_pcs) {
681 		/* query xgmi pcs error status,
682 		 * only ue is supported */
		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_ras_fields); i++) {
684 			ue_cnt = (value &
685 				  xgmi_pcs_ras_fields[i].pcs_err_mask) >>
686 				  xgmi_pcs_ras_fields[i].pcs_err_shift;
687 			if (ue_cnt) {
688 				dev_info(adev->dev, "%s detected\n",
689 					 xgmi_pcs_ras_fields[i].err_name);
690 				*ue_count += ue_cnt;
691 			}
692 		}
693 	} else {
694 		/* query wafl pcs error status,
695 		 * only ue is supported */
696 		for (i = 0; i < ARRAY_SIZE(wafl_pcs_ras_fields); i++) {
697 			ue_cnt = (value &
698 				  wafl_pcs_ras_fields[i].pcs_err_mask) >>
699 				  wafl_pcs_ras_fields[i].pcs_err_shift;
700 			if (ue_cnt) {
701 				dev_info(adev->dev, "%s detected\n",
702 					 wafl_pcs_ras_fields[i].err_name);
703 				*ue_count += ue_cnt;
704 			}
705 		}
706 	}
707 
708 	return 0;
709 }
710 
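/*
 * RAS callback: read all xGMI and WAFL PCS error status registers for this
 * ASIC and accumulate the decoded counts into @ras_error_status.
 */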
711 int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
712 				      void *ras_error_status)
713 {
714 	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
715 	int i;
716 	uint32_t data;
717 	uint32_t ue_cnt = 0, ce_cnt = 0;
718 
719 	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL))
720 		return -EINVAL;
721 
722 	err_data->ue_count = 0;
723 	err_data->ce_count = 0;
724 
725 	switch (adev->asic_type) {
726 	case CHIP_ARCTURUS:
727 		/* check xgmi pcs error */
728 		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++) {
729 			data = RREG32_PCIE(xgmi_pcs_err_status_reg_arct[i]);
730 			if (data)
731 				amdgpu_xgmi_query_pcs_error_status(adev,
732 						data, &ue_cnt, &ce_cnt, true);
733 		}
734 		/* check wafl pcs error */
735 		for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_arct); i++) {
736 			data = RREG32_PCIE(wafl_pcs_err_status_reg_arct[i]);
737 			if (data)
738 				amdgpu_xgmi_query_pcs_error_status(adev,
739 						data, &ue_cnt, &ce_cnt, false);
740 		}
741 		break;
742 	case CHIP_VEGA20:
743 	default:
744 		/* check xgmi pcs error */
745 		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++) {
746 			data = RREG32_PCIE(xgmi_pcs_err_status_reg_vg20[i]);
747 			if (data)
748 				amdgpu_xgmi_query_pcs_error_status(adev,
749 						data, &ue_cnt, &ce_cnt, true);
750 		}
751 		/* check wafl pcs error */
752 		for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_vg20); i++) {
753 			data = RREG32_PCIE(wafl_pcs_err_status_reg_vg20[i]);
754 			if (data)
755 				amdgpu_xgmi_query_pcs_error_status(adev,
756 						data, &ue_cnt, &ce_cnt, false);
757 		}
758 		break;
759 	}
760 
761 	err_data->ue_count += ue_cnt;
762 	err_data->ce_count += ce_cnt;
763 
764 	return 0;
765 }
766