1 /* 2 * Copyright 2018 Advanced Micro Devices, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 21 * 22 * 23 */ 24 #include <linux/list.h> 25 #include "amdgpu.h" 26 #include "amdgpu_xgmi.h" 27 #include "amdgpu_smu.h" 28 29 30 static DEFINE_MUTEX(xgmi_mutex); 31 32 #define AMDGPU_MAX_XGMI_HIVE 8 33 #define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE 4 34 35 static struct amdgpu_hive_info xgmi_hives[AMDGPU_MAX_XGMI_HIVE]; 36 static unsigned hive_count = 0; 37 38 void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info *hive) 39 { 40 return &hive->device_list; 41 } 42 43 /** 44 * DOC: AMDGPU XGMI Support 45 * 46 * XGMI is a high speed interconnect that joins multiple GPU cards 47 * into a homogeneous memory space that is organized by a collective 48 * hive ID and individual node IDs, both of which are 64-bit numbers. 49 * 50 * The file xgmi_device_id contains the unique per GPU device ID and 51 * is stored in the /sys/class/drm/card${cardno}/device/ directory. 52 * 53 * Inside the device directory a sub-directory 'xgmi_hive_info' is 54 * created which contains the hive ID and the list of nodes. 55 * 56 * The hive ID is stored in: 57 * /sys/class/drm/card${cardno}/device/xgmi_hive_info/xgmi_hive_id 58 * 59 * The node information is stored in numbered directories: 60 * /sys/class/drm/card${cardno}/device/xgmi_hive_info/node${nodeno}/xgmi_device_id 61 * 62 * Each device has their own xgmi_hive_info direction with a mirror 63 * set of node sub-directories. 64 * 65 * The XGMI memory space is built by contiguously adding the power of 66 * two padded VRAM space from each node to each other. 67 * 68 */ 69 70 71 static ssize_t amdgpu_xgmi_show_hive_id(struct device *dev, 72 struct device_attribute *attr, char *buf) 73 { 74 struct amdgpu_hive_info *hive = 75 container_of(attr, struct amdgpu_hive_info, dev_attr); 76 77 return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id); 78 } 79 80 static int amdgpu_xgmi_sysfs_create(struct amdgpu_device *adev, 81 struct amdgpu_hive_info *hive) 82 { 83 int ret = 0; 84 85 if (WARN_ON(hive->kobj)) 86 return -EINVAL; 87 88 hive->kobj = kobject_create_and_add("xgmi_hive_info", &adev->dev->kobj); 89 if (!hive->kobj) { 90 dev_err(adev->dev, "XGMI: Failed to allocate sysfs entry!\n"); 91 return -EINVAL; 92 } 93 94 hive->dev_attr = (struct device_attribute) { 95 .attr = { 96 .name = "xgmi_hive_id", 97 .mode = S_IRUGO, 98 99 }, 100 .show = amdgpu_xgmi_show_hive_id, 101 }; 102 103 ret = sysfs_create_file(hive->kobj, &hive->dev_attr.attr); 104 if (ret) { 105 dev_err(adev->dev, "XGMI: Failed to create device file xgmi_hive_id\n"); 106 kobject_del(hive->kobj); 107 kobject_put(hive->kobj); 108 hive->kobj = NULL; 109 } 110 111 return ret; 112 } 113 114 static void amdgpu_xgmi_sysfs_destroy(struct amdgpu_device *adev, 115 struct amdgpu_hive_info *hive) 116 { 117 sysfs_remove_file(hive->kobj, &hive->dev_attr.attr); 118 kobject_del(hive->kobj); 119 kobject_put(hive->kobj); 120 hive->kobj = NULL; 121 } 122 123 static ssize_t amdgpu_xgmi_show_device_id(struct device *dev, 124 struct device_attribute *attr, 125 char *buf) 126 { 127 struct drm_device *ddev = dev_get_drvdata(dev); 128 struct amdgpu_device *adev = ddev->dev_private; 129 130 return snprintf(buf, PAGE_SIZE, "%llu\n", adev->gmc.xgmi.node_id); 131 132 } 133 134 135 static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL); 136 137 138 static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev, 139 struct amdgpu_hive_info *hive) 140 { 141 int ret = 0; 142 char node[10] = { 0 }; 143 144 /* Create xgmi device id file */ 145 ret = device_create_file(adev->dev, &dev_attr_xgmi_device_id); 146 if (ret) { 147 dev_err(adev->dev, "XGMI: Failed to create device file xgmi_device_id\n"); 148 return ret; 149 } 150 151 /* Create sysfs link to hive info folder on the first device */ 152 if (adev != hive->adev) { 153 ret = sysfs_create_link(&adev->dev->kobj, hive->kobj, 154 "xgmi_hive_info"); 155 if (ret) { 156 dev_err(adev->dev, "XGMI: Failed to create link to hive info"); 157 goto remove_file; 158 } 159 } 160 161 sprintf(node, "node%d", hive->number_devices); 162 /* Create sysfs link form the hive folder to yourself */ 163 ret = sysfs_create_link(hive->kobj, &adev->dev->kobj, node); 164 if (ret) { 165 dev_err(adev->dev, "XGMI: Failed to create link from hive info"); 166 goto remove_link; 167 } 168 169 goto success; 170 171 172 remove_link: 173 sysfs_remove_link(&adev->dev->kobj, adev->ddev->unique); 174 175 remove_file: 176 device_remove_file(adev->dev, &dev_attr_xgmi_device_id); 177 178 success: 179 return ret; 180 } 181 182 static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev, 183 struct amdgpu_hive_info *hive) 184 { 185 device_remove_file(adev->dev, &dev_attr_xgmi_device_id); 186 sysfs_remove_link(&adev->dev->kobj, adev->ddev->unique); 187 sysfs_remove_link(hive->kobj, adev->ddev->unique); 188 } 189 190 191 192 struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock) 193 { 194 int i; 195 struct amdgpu_hive_info *tmp; 196 197 if (!adev->gmc.xgmi.hive_id) 198 return NULL; 199 200 mutex_lock(&xgmi_mutex); 201 202 for (i = 0 ; i < hive_count; ++i) { 203 tmp = &xgmi_hives[i]; 204 if (tmp->hive_id == adev->gmc.xgmi.hive_id) { 205 if (lock) 206 mutex_lock(&tmp->hive_lock); 207 mutex_unlock(&xgmi_mutex); 208 return tmp; 209 } 210 } 211 if (i >= AMDGPU_MAX_XGMI_HIVE) { 212 mutex_unlock(&xgmi_mutex); 213 return NULL; 214 } 215 216 /* initialize new hive if not exist */ 217 tmp = &xgmi_hives[hive_count++]; 218 219 if (amdgpu_xgmi_sysfs_create(adev, tmp)) { 220 mutex_unlock(&xgmi_mutex); 221 return NULL; 222 } 223 224 tmp->adev = adev; 225 tmp->hive_id = adev->gmc.xgmi.hive_id; 226 INIT_LIST_HEAD(&tmp->device_list); 227 mutex_init(&tmp->hive_lock); 228 mutex_init(&tmp->reset_lock); 229 230 if (lock) 231 mutex_lock(&tmp->hive_lock); 232 tmp->pstate = -1; 233 mutex_unlock(&xgmi_mutex); 234 235 return tmp; 236 } 237 238 int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate) 239 { 240 int ret = 0; 241 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0); 242 243 if (!hive) 244 return 0; 245 246 if (hive->pstate == pstate) 247 return 0; 248 249 dev_dbg(adev->dev, "Set xgmi pstate %d.\n", pstate); 250 251 if (is_support_sw_smu(adev)) 252 ret = smu_set_xgmi_pstate(&adev->smu, pstate); 253 if (ret) 254 dev_err(adev->dev, 255 "XGMI: Set pstate failure on device %llx, hive %llx, ret %d", 256 adev->gmc.xgmi.node_id, 257 adev->gmc.xgmi.hive_id, ret); 258 259 return ret; 260 } 261 262 int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev) 263 { 264 int ret = -EINVAL; 265 266 /* Each psp need to set the latest topology */ 267 ret = psp_xgmi_set_topology_info(&adev->psp, 268 hive->number_devices, 269 &adev->psp.xgmi_context.top_info); 270 if (ret) 271 dev_err(adev->dev, 272 "XGMI: Set topology failure on device %llx, hive %llx, ret %d", 273 adev->gmc.xgmi.node_id, 274 adev->gmc.xgmi.hive_id, ret); 275 276 return ret; 277 } 278 279 280 int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev, 281 struct amdgpu_device *peer_adev) 282 { 283 struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info; 284 int i; 285 286 for (i = 0 ; i < top->num_nodes; ++i) 287 if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id) 288 return top->nodes[i].num_hops; 289 return -EINVAL; 290 } 291 292 int amdgpu_xgmi_add_device(struct amdgpu_device *adev) 293 { 294 struct psp_xgmi_topology_info *top_info; 295 struct amdgpu_hive_info *hive; 296 struct amdgpu_xgmi *entry; 297 struct amdgpu_device *tmp_adev = NULL; 298 299 int count = 0, ret = -EINVAL; 300 301 if (!adev->gmc.xgmi.supported) 302 return 0; 303 304 ret = psp_xgmi_get_node_id(&adev->psp, &adev->gmc.xgmi.node_id); 305 if (ret) { 306 dev_err(adev->dev, 307 "XGMI: Failed to get node id\n"); 308 return ret; 309 } 310 311 ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id); 312 if (ret) { 313 dev_err(adev->dev, 314 "XGMI: Failed to get hive id\n"); 315 return ret; 316 } 317 318 hive = amdgpu_get_xgmi_hive(adev, 1); 319 if (!hive) { 320 ret = -EINVAL; 321 dev_err(adev->dev, 322 "XGMI: node 0x%llx, can not match hive 0x%llx in the hive list.\n", 323 adev->gmc.xgmi.node_id, adev->gmc.xgmi.hive_id); 324 goto exit; 325 } 326 327 top_info = &adev->psp.xgmi_context.top_info; 328 329 list_add_tail(&adev->gmc.xgmi.head, &hive->device_list); 330 list_for_each_entry(entry, &hive->device_list, head) 331 top_info->nodes[count++].node_id = entry->node_id; 332 top_info->num_nodes = count; 333 hive->number_devices = count; 334 335 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 336 /* update node list for other device in the hive */ 337 if (tmp_adev != adev) { 338 top_info = &tmp_adev->psp.xgmi_context.top_info; 339 top_info->nodes[count - 1].node_id = adev->gmc.xgmi.node_id; 340 top_info->num_nodes = count; 341 } 342 ret = amdgpu_xgmi_update_topology(hive, tmp_adev); 343 if (ret) 344 goto exit; 345 } 346 347 /* get latest topology info for each device from psp */ 348 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 349 ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count, 350 &tmp_adev->psp.xgmi_context.top_info); 351 if (ret) { 352 dev_err(tmp_adev->dev, 353 "XGMI: Get topology failure on device %llx, hive %llx, ret %d", 354 tmp_adev->gmc.xgmi.node_id, 355 tmp_adev->gmc.xgmi.hive_id, ret); 356 /* To do : continue with some node failed or disable the whole hive */ 357 goto exit; 358 } 359 } 360 361 if (!ret) 362 ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive); 363 364 365 mutex_unlock(&hive->hive_lock); 366 exit: 367 if (!ret) 368 dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n", 369 adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id); 370 else 371 dev_err(adev->dev, "XGMI: Failed to add node %d, hive 0x%llx ret: %d\n", 372 adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id, 373 ret); 374 375 return ret; 376 } 377 378 void amdgpu_xgmi_remove_device(struct amdgpu_device *adev) 379 { 380 struct amdgpu_hive_info *hive; 381 382 if (!adev->gmc.xgmi.supported) 383 return; 384 385 hive = amdgpu_get_xgmi_hive(adev, 1); 386 if (!hive) 387 return; 388 389 if (!(hive->number_devices--)) { 390 amdgpu_xgmi_sysfs_destroy(adev, hive); 391 mutex_destroy(&hive->hive_lock); 392 mutex_destroy(&hive->reset_lock); 393 } else { 394 amdgpu_xgmi_sysfs_rem_dev_info(adev, hive); 395 mutex_unlock(&hive->hive_lock); 396 } 397 } 398