/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 *
 */
#include <linux/list.h>
#include "amdgpu.h"
#include "amdgpu_xgmi.h"
#include "amdgpu_smu.h"
#include "amdgpu_ras.h"
#include "soc15.h"
#include "df/df_3_6_offset.h"
#include "xgmi/xgmi_4_0_0_smn.h"
#include "xgmi/xgmi_4_0_0_sh_mask.h"
#include "wafl/wafl2_4_0_0_smn.h"
#include "wafl/wafl2_4_0_0_sh_mask.h"

static DEFINE_MUTEX(xgmi_mutex);

#define AMDGPU_MAX_XGMI_HIVE			8
#define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE		4

static struct amdgpu_hive_info xgmi_hives[AMDGPU_MAX_XGMI_HIVE];
static unsigned hive_count = 0;

static const int xgmi_pcs_err_status_reg_vg20[] = {
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000,
};

static const int wafl_pcs_err_status_reg_vg20[] = {
	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS,
	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
};

static const int xgmi_pcs_err_status_reg_arct[] = {
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x500000,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x600000,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x700000,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x800000,
};

/* same as vg20 */
static const int wafl_pcs_err_status_reg_arct[] = {
	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS,
	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
};

static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = {
	{"XGMI PCS DataLossErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)},
	{"XGMI PCS TrainingErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TrainingErr)},
	{"XGMI PCS CRCErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, CRCErr)},
	{"XGMI PCS BERExceededErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, BERExceededErr)},
	{"XGMI PCS TxMetaDataErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TxMetaDataErr)},
	{"XGMI PCS ReplayBufParityErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayBufParityErr)},
	{"XGMI PCS DataParityErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataParityErr)},
	{"XGMI PCS ReplayFifoOverflowErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
	{"XGMI PCS ReplayFifoUnderflowErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
	{"XGMI PCS ElasticFifoOverflowErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
	{"XGMI PCS DeskewErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DeskewErr)},
	{"XGMI PCS DataStartupLimitErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataStartupLimitErr)},
	{"XGMI PCS FCInitTimeoutErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, FCInitTimeoutErr)},
	{"XGMI PCS RecoveryTimeoutErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
	{"XGMI PCS ReadySerialTimeoutErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
	{"XGMI PCS ReadySerialAttemptErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
	{"XGMI PCS RecoveryAttemptErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryAttemptErr)},
	{"XGMI PCS RecoveryRelockAttemptErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
};

static const struct amdgpu_pcs_ras_field wafl_pcs_ras_fields[] = {
	{"WAFL PCS DataLossErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataLossErr)},
	{"WAFL PCS TrainingErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TrainingErr)},
	{"WAFL PCS CRCErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, CRCErr)},
	{"WAFL PCS BERExceededErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, BERExceededErr)},
	{"WAFL PCS TxMetaDataErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TxMetaDataErr)},
	{"WAFL PCS ReplayBufParityErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayBufParityErr)},
	{"WAFL PCS DataParityErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataParityErr)},
	{"WAFL PCS ReplayFifoOverflowErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
	{"WAFL PCS ReplayFifoUnderflowErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
	{"WAFL PCS ElasticFifoOverflowErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
	{"WAFL PCS DeskewErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DeskewErr)},
	{"WAFL PCS DataStartupLimitErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataStartupLimitErr)},
	{"WAFL PCS FCInitTimeoutErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, FCInitTimeoutErr)},
	{"WAFL PCS RecoveryTimeoutErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
	{"WAFL PCS ReadySerialTimeoutErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
	{"WAFL PCS ReadySerialAttemptErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
	{"WAFL PCS RecoveryAttemptErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryAttemptErr)},
	{"WAFL PCS RecoveryRelockAttemptErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
};
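
/*
 * Each entry in the tables above pairs a printable name with the field's
 * mask/shift pair taken from the generated register headers.  Assuming the
 * usual soc15.h definition, SOC15_REG_FIELD(reg, field) expands to
 * reg##__##field##_MASK and reg##__##field##__SHIFT, so an entry such as
 *
 *	{"XGMI PCS CRCErr",
 *	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, CRCErr)},
 *
 * initializes .pcs_err_mask and .pcs_err_shift, which
 * amdgpu_xgmi_query_pcs_error_status() later evaluates as
 * (value & pcs_err_mask) >> pcs_err_shift to extract the error field.
 */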

/**
 * DOC: AMDGPU XGMI Support
 *
 * XGMI is a high speed interconnect that joins multiple GPU cards
 * into a homogeneous memory space that is organized by a collective
 * hive ID and individual node IDs, both of which are 64-bit numbers.
 *
 * The file xgmi_device_id contains the unique per GPU device ID and
 * is stored in the /sys/class/drm/card${cardno}/device/ directory.
 *
 * Inside the device directory a sub-directory 'xgmi_hive_info' is
 * created which contains the hive ID and the list of nodes.
 *
 * The hive ID is stored in:
 *   /sys/class/drm/card${cardno}/device/xgmi_hive_info/xgmi_hive_id
 *
 * The node information is stored in numbered directories:
 *   /sys/class/drm/card${cardno}/device/xgmi_hive_info/node${nodeno}/xgmi_device_id
 *
 * Each device has its own xgmi_hive_info directory with a mirrored
 * set of node sub-directories.
 *
 * The XGMI memory space is built by contiguously adding the power-of-two
 * padded VRAM space from each node to each other.
 *
 */
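
/*
 * As a rough illustration of the layout described above (card numbers, IDs
 * and sizes are made up), a two-GPU hive exposed as card0 and card1 might
 * look like:
 *
 *   /sys/class/drm/card0/device/xgmi_device_id              -> 123450
 *   /sys/class/drm/card0/device/xgmi_hive_info/xgmi_hive_id -> 98765
 *   /sys/class/drm/card0/device/xgmi_hive_info/node1/xgmi_device_id
 *   /sys/class/drm/card0/device/xgmi_hive_info/node2/xgmi_device_id
 *
 * with the same mirrored tree visible under card1.  For the memory space,
 * if each node's VRAM is padded to node_segment_size, the node with physical
 * node id N starts at N * node_segment_size; see
 * amdgpu_xgmi_get_relative_phy_addr() below for the exact computation this
 * file performs.
 */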


static ssize_t amdgpu_xgmi_show_hive_id(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct amdgpu_hive_info *hive =
			container_of(attr, struct amdgpu_hive_info, dev_attr);

	return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id);
}

static int amdgpu_xgmi_sysfs_create(struct amdgpu_device *adev,
				    struct amdgpu_hive_info *hive)
{
	int ret = 0;

	if (WARN_ON(hive->kobj))
		return -EINVAL;

	hive->kobj = kobject_create_and_add("xgmi_hive_info", &adev->dev->kobj);
	if (!hive->kobj) {
		dev_err(adev->dev, "XGMI: Failed to allocate sysfs entry!\n");
		return -EINVAL;
	}

	hive->dev_attr = (struct device_attribute) {
		.attr = {
			.name = "xgmi_hive_id",
			.mode = S_IRUGO,
		},
		.show = amdgpu_xgmi_show_hive_id,
	};

	ret = sysfs_create_file(hive->kobj, &hive->dev_attr.attr);
	if (ret) {
		dev_err(adev->dev, "XGMI: Failed to create device file xgmi_hive_id\n");
		kobject_del(hive->kobj);
		kobject_put(hive->kobj);
		hive->kobj = NULL;
	}

	return ret;
}

static void amdgpu_xgmi_sysfs_destroy(struct amdgpu_device *adev,
				    struct amdgpu_hive_info *hive)
{
	sysfs_remove_file(hive->kobj, &hive->dev_attr.attr);
	kobject_del(hive->kobj);
	kobject_put(hive->kobj);
	hive->kobj = NULL;
}

static ssize_t amdgpu_xgmi_show_device_id(struct device *dev,
				     struct device_attribute *attr,
				     char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = ddev->dev_private;

	return snprintf(buf, PAGE_SIZE, "%llu\n", adev->gmc.xgmi.node_id);
}

#define AMDGPU_XGMI_SET_FICAA(o)	((o) | 0x456801)
static ssize_t amdgpu_xgmi_show_error(struct device *dev,
				      struct device_attribute *attr,
				      char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = ddev->dev_private;
	uint32_t ficaa_pie_ctl_in, ficaa_pie_status_in;
	uint64_t fica_out;
	unsigned int error_count = 0;

	ficaa_pie_ctl_in = AMDGPU_XGMI_SET_FICAA(0x200);
	ficaa_pie_status_in = AMDGPU_XGMI_SET_FICAA(0x208);

	fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_ctl_in);
	if (fica_out != 0x1f)
		pr_err("xGMI error counters not enabled!\n");

	fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_status_in);

	if ((fica_out & 0xffff) == 2)
		error_count = ((fica_out >> 62) & 0x1) + (fica_out >> 63);

	adev->df.funcs->set_fica(adev, ficaa_pie_status_in, 0, 0);

	return snprintf(buf, PAGE_SIZE, "%u\n", error_count);
}
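
/*
 * A rough sketch of what amdgpu_xgmi_show_error() reports, with the hardware
 * semantics of the magic numbers treated as assumptions rather than fact: the
 * FICAA helper ORs the PIE offset (0x200 control, 0x208 status) into an
 * indirect data-fabric access address, get_fica()/set_fica() perform the
 * indirect read/write, and the two top bits of the 64-bit status word are
 * summed into error_count before the status is written back to zero.  Reading
 * the attribute from userspace might look like:
 *
 *	$ cat /sys/class/drm/card0/device/xgmi_error
 *	0
 */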


static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL);
static DEVICE_ATTR(xgmi_error, S_IRUGO, amdgpu_xgmi_show_error, NULL);

static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev,
					 struct amdgpu_hive_info *hive)
{
	int ret = 0;
	char node[10] = { 0 };

	/* Create xgmi device id file */
	ret = device_create_file(adev->dev, &dev_attr_xgmi_device_id);
	if (ret) {
		dev_err(adev->dev, "XGMI: Failed to create device file xgmi_device_id\n");
		return ret;
	}

	/* Create xgmi error file */
	ret = device_create_file(adev->dev, &dev_attr_xgmi_error);
	if (ret)
		pr_err("failed to create xgmi_error\n");

	/* Create a sysfs link to the hive info folder owned by the first device */
	if (adev != hive->adev) {
		ret = sysfs_create_link(&adev->dev->kobj, hive->kobj,
					"xgmi_hive_info");
		if (ret) {
			dev_err(adev->dev, "XGMI: Failed to create link to hive info");
			goto remove_file;
		}
	}

	sprintf(node, "node%d", hive->number_devices);
	/* Create a sysfs link from the hive folder back to this device */
	ret = sysfs_create_link(hive->kobj, &adev->dev->kobj, node);
	if (ret) {
		dev_err(adev->dev, "XGMI: Failed to create link from hive info");
		goto remove_link;
	}

	goto success;

remove_link:
	sysfs_remove_link(&adev->dev->kobj, "xgmi_hive_info");

remove_file:
	device_remove_file(adev->dev, &dev_attr_xgmi_device_id);

success:
	return ret;
}
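
/*
 * Taken together, for a hypothetical second GPU joining a hive whose owner is
 * card0, the calls above would leave something like (paths illustrative):
 *
 *   card1/device/xgmi_device_id          - plain attribute
 *   card1/device/xgmi_error              - plain attribute
 *   card1/device/xgmi_hive_info          - symlink to the hive kobject
 *                                          that lives under card0
 *   card0/device/xgmi_hive_info/node2    - symlink back to card1's device
 *
 * The hive owner (hive->adev) skips the xgmi_hive_info link because the hive
 * directory already lives under its own device directory.
 */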

static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev,
					  struct amdgpu_hive_info *hive)
{
	char node[10];
	memset(node, 0, sizeof(node));

	device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
	device_remove_file(adev->dev, &dev_attr_xgmi_error);

	if (adev != hive->adev)
		sysfs_remove_link(&adev->dev->kobj, "xgmi_hive_info");

	sprintf(node, "node%d", hive->number_devices);
	sysfs_remove_link(hive->kobj, node);
}

struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock)
{
	int i;
	struct amdgpu_hive_info *tmp;

	if (!adev->gmc.xgmi.hive_id)
		return NULL;

	mutex_lock(&xgmi_mutex);

	for (i = 0; i < hive_count; ++i) {
		tmp = &xgmi_hives[i];
		if (tmp->hive_id == adev->gmc.xgmi.hive_id) {
			if (lock)
				mutex_lock(&tmp->hive_lock);
			mutex_unlock(&xgmi_mutex);
			return tmp;
		}
	}
	if (i >= AMDGPU_MAX_XGMI_HIVE) {
		mutex_unlock(&xgmi_mutex);
		return NULL;
	}

	/* initialize a new hive if it doesn't exist */
	tmp = &xgmi_hives[hive_count++];

	if (amdgpu_xgmi_sysfs_create(adev, tmp)) {
		mutex_unlock(&xgmi_mutex);
		return NULL;
	}

	tmp->adev = adev;
	tmp->hive_id = adev->gmc.xgmi.hive_id;
	INIT_LIST_HEAD(&tmp->device_list);
	mutex_init(&tmp->hive_lock);
	atomic_set(&tmp->in_reset, 0);
	task_barrier_init(&tmp->tb);

	if (lock)
		mutex_lock(&tmp->hive_lock);
	tmp->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN;
	tmp->hi_req_gpu = NULL;
	/*
	 * The hive pstate on boot is high in vega20, so we have to go to low
	 * pstate after boot.
	 */
	tmp->hi_req_count = AMDGPU_MAX_XGMI_DEVICE_PER_HIVE;
	mutex_unlock(&xgmi_mutex);

	return tmp;
}
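
/*
 * A minimal usage sketch (not a call site in this file): callers that pass
 * lock = 1 get the hive back with hive_lock already held and must drop it
 * themselves, mirroring what amdgpu_xgmi_add_device() does below:
 *
 *	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 1);
 *
 *	if (hive) {
 *		... walk hive->device_list ...
 *		mutex_unlock(&hive->hive_lock);
 *	}
 *
 * With lock = 0 only the global xgmi_mutex is taken internally and nothing is
 * left held on return.
 */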

int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
{
	int ret = 0;
	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
	struct amdgpu_device *request_adev;
	bool is_hi_req = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20;
	bool init_low;
	bool locked;

	/* fw bug so temporarily disable pstate switching */
	return 0;

	if (!hive || adev->asic_type != CHIP_VEGA20)
		return 0;

	/* dereference the hive only after the NULL check above */
	request_adev = hive->hi_req_gpu ? hive->hi_req_gpu : adev;
	init_low = hive->pstate == AMDGPU_XGMI_PSTATE_UNKNOWN;

	locked = atomic_read(&hive->in_reset) ? false : true;
	if (locked)
		mutex_lock(&hive->hive_lock);

	if (is_hi_req)
		hive->hi_req_count++;
	else
		hive->hi_req_count--;

	/*
	 * Vega20 only needs a single peer to request pstate high for the hive
	 * to go high, but all peers must request pstate low for the hive to go
	 * low.
	 */
	if (hive->pstate == pstate ||
			(!is_hi_req && hive->hi_req_count && !init_low))
		goto out;

	dev_dbg(request_adev->dev, "Set xgmi pstate %d.\n", pstate);

	ret = amdgpu_dpm_set_xgmi_pstate(request_adev, pstate);
	if (ret) {
		dev_err(request_adev->dev,
			"XGMI: Set pstate failure on device %llx, hive %llx, ret %d",
			request_adev->gmc.xgmi.node_id,
			request_adev->gmc.xgmi.hive_id, ret);
		goto out;
	}

	if (init_low) {
		hive->pstate = hive->hi_req_count ?
					hive->pstate : AMDGPU_XGMI_PSTATE_MIN;
	} else {
		hive->pstate = pstate;
		hive->hi_req_gpu = pstate != AMDGPU_XGMI_PSTATE_MIN ?
							adev : NULL;
	}
out:
	if (locked)
		mutex_unlock(&hive->hive_lock);
	return ret;
}
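
/*
 * Worked example of the hi_req_count accounting above (numbers illustrative):
 * a freshly created hive starts with hi_req_count ==
 * AMDGPU_MAX_XGMI_DEVICE_PER_HIVE (4) and pstate UNKNOWN.  As each of four
 * GPUs boots and requests the low pstate, the count drops 4 -> 3 -> 2 -> 1 ->
 * 0; each request reaches amdgpu_dpm_set_xgmi_pstate(), but hive->pstate is
 * only latched to AMDGPU_XGMI_PSTATE_MIN once the count hits zero.  After
 * that, the first peer asking for AMDGPU_XGMI_PSTATE_MAX_VEGA20 bumps the
 * count to 1 and switches the whole hive high immediately, and the hive only
 * drops low again once every high requester has released its request and the
 * count is back to zero.
 */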

int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev)
{
	int ret;

	/* Each PSP needs to set the latest topology */
	ret = psp_xgmi_set_topology_info(&adev->psp,
					 hive->number_devices,
					 &adev->psp.xgmi_context.top_info);
	if (ret)
		dev_err(adev->dev,
			"XGMI: Set topology failure on device %llx, hive %llx, ret %d",
			adev->gmc.xgmi.node_id,
			adev->gmc.xgmi.hive_id, ret);

	return ret;
}


int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
		struct amdgpu_device *peer_adev)
{
	struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
	int i;

	for (i = 0; i < top->num_nodes; ++i)
		if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
			return top->nodes[i].num_hops;
	return -EINVAL;
}

int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
{
	struct psp_xgmi_topology_info *top_info;
	struct amdgpu_hive_info *hive;
	struct amdgpu_xgmi *entry;
	struct amdgpu_device *tmp_adev = NULL;
	int count = 0, ret = 0;

	if (!adev->gmc.xgmi.supported)
		return 0;

	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
		ret = psp_xgmi_initialize(&adev->psp);
		if (ret) {
			dev_err(adev->dev,
				"XGMI: Failed to initialize xgmi session\n");
			return ret;
		}

		ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id);
		if (ret) {
			dev_err(adev->dev,
				"XGMI: Failed to get hive id\n");
			return ret;
		}

		ret = psp_xgmi_get_node_id(&adev->psp, &adev->gmc.xgmi.node_id);
		if (ret) {
			dev_err(adev->dev,
				"XGMI: Failed to get node id\n");
			return ret;
		}
	} else {
		adev->gmc.xgmi.hive_id = 16;
		adev->gmc.xgmi.node_id = adev->gmc.xgmi.physical_node_id + 16;
	}

	hive = amdgpu_get_xgmi_hive(adev, 1);
	if (!hive) {
		ret = -EINVAL;
		dev_err(adev->dev,
			"XGMI: node 0x%llx, can not match hive 0x%llx in the hive list.\n",
			adev->gmc.xgmi.node_id, adev->gmc.xgmi.hive_id);
		goto exit;
	}

	top_info = &adev->psp.xgmi_context.top_info;

	list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
	list_for_each_entry(entry, &hive->device_list, head)
		top_info->nodes[count++].node_id = entry->node_id;
	top_info->num_nodes = count;
	hive->number_devices = count;

	task_barrier_add_task(&hive->tb);

	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			/* update the node list for the other devices in the hive */
			if (tmp_adev != adev) {
				top_info = &tmp_adev->psp.xgmi_context.top_info;
				top_info->nodes[count - 1].node_id =
					adev->gmc.xgmi.node_id;
				top_info->num_nodes = count;
			}
			ret = amdgpu_xgmi_update_topology(hive, tmp_adev);
			if (ret)
				goto exit;
		}

		/* get the latest topology info for each device from PSP */
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
					&tmp_adev->psp.xgmi_context.top_info);
			if (ret) {
				dev_err(tmp_adev->dev,
					"XGMI: Get topology failure on device %llx, hive %llx, ret %d",
					tmp_adev->gmc.xgmi.node_id,
					tmp_adev->gmc.xgmi.hive_id, ret);
				/* TODO: continue with some node failed or disable the whole hive */
				goto exit;
			}
		}
	}

	if (!ret)
		ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);

	mutex_unlock(&hive->hive_lock);
exit:
	if (!ret)
		dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n",
			 adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id);
	else
		dev_err(adev->dev, "XGMI: Failed to add node %d, hive 0x%llx ret: %d\n",
			adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id,
			ret);

	return ret;
}

int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
{
	struct amdgpu_hive_info *hive;

	if (!adev->gmc.xgmi.supported)
		return -EINVAL;

	hive = amdgpu_get_xgmi_hive(adev, 1);
	if (!hive)
		return -EINVAL;

	task_barrier_rem_task(&hive->tb);
	amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
	mutex_unlock(&hive->hive_lock);

	if (!(--hive->number_devices)) {
		amdgpu_xgmi_sysfs_destroy(adev, hive);
		mutex_destroy(&hive->hive_lock);
	}

	return psp_xgmi_terminate(&adev->psp);
}

int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev)
{
	int r;
	struct ras_ih_if ih_info = {
		.cb = NULL,
	};
	struct ras_fs_if fs_info = {
		.sysfs_name = "xgmi_wafl_err_count",
	};

	if (!adev->gmc.xgmi.supported ||
	    adev->gmc.xgmi.num_physical_nodes == 0)
		return 0;

	amdgpu_xgmi_reset_ras_error_count(adev);

	if (!adev->gmc.xgmi.ras_if) {
		adev->gmc.xgmi.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
		if (!adev->gmc.xgmi.ras_if)
			return -ENOMEM;
		adev->gmc.xgmi.ras_if->block = AMDGPU_RAS_BLOCK__XGMI_WAFL;
		adev->gmc.xgmi.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
		adev->gmc.xgmi.ras_if->sub_block_index = 0;
		strcpy(adev->gmc.xgmi.ras_if->name, "xgmi_wafl");
	}
	ih_info.head = fs_info.head = *adev->gmc.xgmi.ras_if;
	r = amdgpu_ras_late_init(adev, adev->gmc.xgmi.ras_if,
				 &fs_info, &ih_info);
	if (r || !amdgpu_ras_is_supported(adev, adev->gmc.xgmi.ras_if->block)) {
		kfree(adev->gmc.xgmi.ras_if);
		adev->gmc.xgmi.ras_if = NULL;
	}

	return r;
}
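
/*
 * Once the RAS block is registered, the accumulated XGMI/WAFL error count is
 * exposed through the generic amdgpu RAS sysfs interface under the
 * fs_info.sysfs_name chosen above.  On a typical system that might be read as
 * something like the following (the exact path layout is owned by the RAS
 * core, so this is only an illustration):
 *
 *	$ cat /sys/class/drm/card0/device/ras/xgmi_wafl_err_count
 */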

void amdgpu_xgmi_ras_fini(struct amdgpu_device *adev)
{
	if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL) &&
			adev->gmc.xgmi.ras_if) {
		struct ras_common_if *ras_if = adev->gmc.xgmi.ras_if;
		struct ras_ih_if ih_info = {
			.cb = NULL,
		};

		amdgpu_ras_late_fini(adev, ras_if, &ih_info);
		kfree(ras_if);
	}
}

uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
					   uint64_t addr)
{
	struct amdgpu_xgmi *xgmi = &adev->gmc.xgmi;

	return (addr + xgmi->physical_node_id * xgmi->node_segment_size);
}
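
/*
 * Worked example for the helper above, with made-up numbers: on a node with
 * physical_node_id == 2 and node_segment_size == 0x400000000 (16 GB per
 * segment), a node-local address of 0x1000 maps to the hive-relative address
 * 0x1000 + 2 * 0x400000000 = 0x800001000.
 */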

static void pcs_clear_status(struct amdgpu_device *adev, uint32_t pcs_status_reg)
{
	WREG32_PCIE(pcs_status_reg, 0xFFFFFFFF);
	WREG32_PCIE(pcs_status_reg, 0);
}

void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
{
	uint32_t i;

	switch (adev->asic_type) {
	case CHIP_ARCTURUS:
		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++)
			pcs_clear_status(adev,
					 xgmi_pcs_err_status_reg_arct[i]);
		break;
	case CHIP_VEGA20:
		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++)
			pcs_clear_status(adev,
					 xgmi_pcs_err_status_reg_vg20[i]);
		break;
	default:
		break;
	}
}

static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
					      uint32_t value,
					      uint32_t *ue_count,
					      uint32_t *ce_count,
					      bool is_xgmi_pcs)
{
	int i;
	int ue_cnt;

	if (is_xgmi_pcs) {
		/* query xgmi pcs error status, only ue is supported */
		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_ras_fields); i++) {
			ue_cnt = (value &
				  xgmi_pcs_ras_fields[i].pcs_err_mask) >>
				  xgmi_pcs_ras_fields[i].pcs_err_shift;
			if (ue_cnt) {
				dev_info(adev->dev, "%s detected\n",
					 xgmi_pcs_ras_fields[i].err_name);
				*ue_count += ue_cnt;
			}
		}
	} else {
		/* query wafl pcs error status, only ue is supported */
		for (i = 0; i < ARRAY_SIZE(wafl_pcs_ras_fields); i++) {
			ue_cnt = (value &
				  wafl_pcs_ras_fields[i].pcs_err_mask) >>
				  wafl_pcs_ras_fields[i].pcs_err_shift;
			if (ue_cnt) {
				dev_info(adev->dev, "%s detected\n",
					 wafl_pcs_ras_fields[i].err_name);
				*ue_count += ue_cnt;
			}
		}
	}

	return 0;
}
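
/*
 * Extraction example with an invented status value: if the CRCErr field of
 * XGMI0_PCS_GOPX16_PCS_ERROR_STATUS were bit 2 (mask 0x4, shift 2) and the
 * register read back as value == 0x4, then (value & mask) >> shift == 1, a
 * single "XGMI PCS CRCErr detected" line is printed and *ue_count is bumped
 * by one.  The real bit positions come from the generated sh_mask header, not
 * from this comment.
 */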

int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
				      void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
	int i;
	uint32_t data;
	uint32_t ue_cnt = 0, ce_cnt = 0;

	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL))
		return -EINVAL;

	err_data->ue_count = 0;
	err_data->ce_count = 0;

	switch (adev->asic_type) {
	case CHIP_ARCTURUS:
		/* check xgmi pcs error */
		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++) {
			data = RREG32_PCIE(xgmi_pcs_err_status_reg_arct[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev,
						data, &ue_cnt, &ce_cnt, true);
		}
		/* check wafl pcs error */
		for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_arct); i++) {
			data = RREG32_PCIE(wafl_pcs_err_status_reg_arct[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev,
						data, &ue_cnt, &ce_cnt, false);
		}
		break;
	case CHIP_VEGA20:
	default:
		/* check xgmi pcs error */
		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++) {
			data = RREG32_PCIE(xgmi_pcs_err_status_reg_vg20[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev,
						data, &ue_cnt, &ce_cnt, true);
		}
		/* check wafl pcs error */
		for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_vg20); i++) {
			data = RREG32_PCIE(wafl_pcs_err_status_reg_vg20[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev,
						data, &ue_cnt, &ce_cnt, false);
		}
		break;
	}

	amdgpu_xgmi_reset_ras_error_count(adev);

	err_data->ue_count += ue_cnt;
	err_data->ce_count += ce_cnt;

	return 0;
}
786