1 /* 2 * Copyright 2015-2017 Advanced Micro Devices, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 21 */ 22 23 #include <linux/pci.h> 24 #include <linux/acpi.h> 25 #include "kfd_crat.h" 26 #include "kfd_priv.h" 27 #include "kfd_topology.h" 28 #include "kfd_iommu.h" 29 #include "amdgpu_amdkfd.h" 30 31 /* GPU Processor ID base for dGPUs for which VCRAT needs to be created. 32 * GPU processor ID are expressed with Bit[31]=1. 33 * The base is set to 0x8000_0000 + 0x1000 to avoid collision with GPU IDs 34 * used in the CRAT. 
 */
static uint32_t gpu_processor_id_low = 0x80001000;

/* Return the next available gpu_processor_id and increment it for next GPU
 * @total_cu_count - Total CUs present in the GPU including ones
 *		     masked off
 */
static inline unsigned int get_and_inc_gpu_processor_id(
				unsigned int total_cu_count)
{
	/* NOTE(review): current_id is a signed int holding a uint32_t value
	 * with bit 31 set; confirm the signed intermediate is intentional.
	 */
	int current_id = gpu_processor_id_low;

	gpu_processor_id_low += total_cu_count;
	return current_id;
}

/* Static table to describe GPU Cache information */
struct kfd_gpu_cache_info {
	uint32_t	cache_size;
	uint32_t	cache_level;
	uint32_t	flags;
	/* Indicates how many Compute Units share this cache
	 * Value = 1 indicates the cache is not shared
	 */
	uint32_t	num_cu_shared;
};

static struct kfd_gpu_cache_info kaveri_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,

	},
	{
		/* Scalar L1 Instruction Cache (in SQC module) per bank */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache (in SQC module) per bank */
		.cache_size = 8,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},

	/* TODO: Add L2 Cache information */
};


static struct kfd_gpu_cache_info carrizo_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache (in SQC module) per bank */
		.cache_size = 8,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 4,
	},
	{
		/* Scalar L1 Data Cache (in SQC module) per bank. */
		.cache_size = 4,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 4,
	},

	/* TODO: Add L2 Cache information */
};

/* NOTE: In future if more information is added to struct kfd_gpu_cache_info
 * the following ASICs may need a separate table.
 */
#define hawaii_cache_info kaveri_cache_info
#define tonga_cache_info carrizo_cache_info
#define fiji_cache_info carrizo_cache_info
#define polaris10_cache_info carrizo_cache_info
#define polaris11_cache_info carrizo_cache_info
#define polaris12_cache_info carrizo_cache_info
#define vegam_cache_info carrizo_cache_info
/* TODO - check & update Vega10 cache details */
#define vega10_cache_info carrizo_cache_info
#define raven_cache_info carrizo_cache_info

/* Copy the CPU compute-unit fields of a CRAT CU subtype into the node
 * properties of topology device @dev.
 */
static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev,
		struct crat_subtype_computeunit *cu)
{
	dev->node_props.cpu_cores_count = cu->num_cpu_cores;
	dev->node_props.cpu_core_id_base = cu->processor_id_low;
	if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT)
		dev->node_props.capability |= HSA_CAP_ATS_PRESENT;

	pr_debug("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores,
			cu->processor_id_low);
}

/* Copy the GPU compute-unit fields of a CRAT CU subtype into the node
 * properties of topology device @dev.
 */
static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev,
		struct crat_subtype_computeunit *cu)
{
	dev->node_props.simd_id_base = cu->processor_id_low;
	dev->node_props.simd_count = cu->num_simd_cores;
	dev->node_props.lds_size_in_kb = cu->lds_size_in_kb;
	dev->node_props.max_waves_per_simd = cu->max_waves_simd;
	dev->node_props.wave_front_size = cu->wave_front_size;
	dev->node_props.array_count = cu->array_count;
	dev->node_props.cu_per_simd_array =
						cu->num_cu_per_array;
	dev->node_props.simd_per_cu = cu->num_simd_per_cu;
	dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu;
	if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE)
		dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE;
	pr_debug("CU GPU: id_base=%d\n", cu->processor_id_low);
}

/* kfd_parse_subtype_cu - parse compute unit subtypes and attach it to correct
 * topology device present in the device_list
 */
static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu,
				struct list_head *device_list)
{
	struct kfd_topology_device *dev;

	pr_debug("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n",
			cu->proximity_domain, cu->hsa_capability);
	/* A CU subtype may describe CPU cores, GPU SIMDs, or both */
	list_for_each_entry(dev, device_list, list) {
		if (cu->proximity_domain == dev->proximity_domain) {
			if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT)
				kfd_populated_cu_info_cpu(dev, cu);

			if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT)
				kfd_populated_cu_info_gpu(dev, cu);
			break;
		}
	}

	return 0;
}

/* Find an existing memory bank of @dev with matching heap type, flags and
 * width, so multiple physical banks of the same kind can be aggregated.
 * Returns NULL if no such bank has been added yet.
 */
static struct kfd_mem_properties *
find_subtype_mem(uint32_t heap_type, uint32_t flags, uint32_t width,
		struct kfd_topology_device *dev)
{
	struct kfd_mem_properties *props;

	list_for_each_entry(props, &dev->mem_props, list) {
		if (props->heap_type == heap_type
				&& props->flags == flags
				&& props->width == width)
			return props;
	}

	return NULL;
}
/* kfd_parse_subtype_mem - parse memory subtypes and attach it to correct
 * topology device present in the device_list
 */
static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem,
				struct list_head *device_list)
{
	struct kfd_mem_properties *props;
	struct kfd_topology_device *dev;
	uint32_t heap_type;
	uint64_t size_in_bytes;
	uint32_t flags = 0;
	uint32_t width;

	pr_debug("Found memory entry in CRAT table with proximity_domain=%d\n",
			mem->proximity_domain);
	list_for_each_entry(dev, device_list, list) {
		if (mem->proximity_domain == dev->proximity_domain) {
			/* We're on GPU node */
			if (dev->node_props.cpu_cores_count == 0) {
				/* APU */
				if (mem->visibility_type == 0)
					heap_type =
						HSA_MEM_HEAP_TYPE_FB_PRIVATE;
				/* dGPU */
				else
					heap_type = mem->visibility_type;
			} else
				heap_type = HSA_MEM_HEAP_TYPE_SYSTEM;

			if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE)
				flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE;
			if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE)
				flags |= HSA_MEM_FLAGS_NON_VOLATILE;

			/* 64-bit size is split across two 32-bit fields */
			size_in_bytes =
				((uint64_t)mem->length_high << 32) +
							mem->length_low;
			width = mem->width;

			/* Multiple banks of the same type are aggregated into
			 * one. User mode doesn't care about multiple physical
			 * memory segments. It's managed as a single virtual
			 * heap for user mode.
			 */
			props = find_subtype_mem(heap_type, flags, width, dev);
			if (props) {
				props->size_in_bytes += size_in_bytes;
				break;
			}

			props = kfd_alloc_struct(props);
			if (!props)
				return -ENOMEM;

			props->heap_type = heap_type;
			props->flags = flags;
			props->size_in_bytes = size_in_bytes;
			props->width = width;

			dev->node_props.mem_banks_count++;
			list_add_tail(&props->list, &dev->mem_props);

			break;
		}
	}

	return 0;
}

/* kfd_parse_subtype_cache - parse cache subtypes and attach it to correct
 * topology device present in the device_list
 */
static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache,
			struct list_head *device_list)
{
	struct kfd_cache_properties *props;
	struct kfd_topology_device *dev;
	uint32_t id;
	uint32_t total_num_of_cu;

	id = cache->processor_id_low;

	pr_debug("Found cache entry in CRAT table with processor_id=%d\n", id);
	list_for_each_entry(dev, device_list, list) {
		total_num_of_cu =
				(dev->node_props.array_count *
				dev->node_props.cu_per_simd_array);

		/* Cache information in CRAT doesn't have proximity_domain
		 * information as it is associated with a CPU core or GPU
		 * Compute Unit. So map the cache using CPU core Id or SIMD
		 * (GPU) ID.
		 * TODO: This works because currently we can safely assume that
		 *  Compute Units are parsed before caches are parsed. In
		 *  future, remove this dependency
		 */
		/* NOTE(review): the CPU range check uses '<=' while the SIMD
		 * range check uses '<' — confirm the inclusive upper bound
		 * for the CPU core range is intentional.
		 */
		if ((id >= dev->node_props.cpu_core_id_base &&
			id <= dev->node_props.cpu_core_id_base +
				dev->node_props.cpu_cores_count) ||
			(id >= dev->node_props.simd_id_base &&
			id < dev->node_props.simd_id_base +
				total_num_of_cu)) {
			props = kfd_alloc_struct(props);
			if (!props)
				return -ENOMEM;

			props->processor_id_low = id;
			props->cache_level = cache->cache_level;
			props->cache_size = cache->cache_size;
			props->cacheline_size = cache->cache_line_size;
			props->cachelines_per_tag = cache->lines_per_tag;
			props->cache_assoc = cache->associativity;
			props->cache_latency = cache->cache_latency;
			memcpy(props->sibling_map, cache->sibling_map,
					sizeof(props->sibling_map));

			/* Translate CRAT cache-type flags to HSA cache types */
			if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE)
				props->cache_type |= HSA_CACHE_TYPE_DATA;
			if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE)
				props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION;
			if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE)
				props->cache_type |= HSA_CACHE_TYPE_CPU;
			if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE)
				props->cache_type |= HSA_CACHE_TYPE_HSACU;

			dev->cache_count++;
			dev->node_props.caches_count++;
			list_add_tail(&props->list, &dev->cache_props);

			break;
		}
	}

	return 0;
}

/* kfd_parse_subtype_iolink - parse iolink subtypes and attach it to correct
 * topology device present in the device_list
 */
static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink,
					struct list_head *device_list)
350 { 351 struct kfd_iolink_properties *props = NULL, *props2; 352 struct kfd_topology_device *dev, *to_dev; 353 uint32_t id_from; 354 uint32_t id_to; 355 356 id_from = iolink->proximity_domain_from; 357 id_to = iolink->proximity_domain_to; 358 359 pr_debug("Found IO link entry in CRAT table with id_from=%d, id_to %d\n", 360 id_from, id_to); 361 list_for_each_entry(dev, device_list, list) { 362 if (id_from == dev->proximity_domain) { 363 props = kfd_alloc_struct(props); 364 if (!props) 365 return -ENOMEM; 366 367 props->node_from = id_from; 368 props->node_to = id_to; 369 props->ver_maj = iolink->version_major; 370 props->ver_min = iolink->version_minor; 371 props->iolink_type = iolink->io_interface_type; 372 373 if (props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS) 374 props->weight = 20; 375 else if (props->iolink_type == CRAT_IOLINK_TYPE_XGMI) 376 props->weight = 15 * iolink->num_hops_xgmi; 377 else 378 props->weight = node_distance(id_from, id_to); 379 380 props->min_latency = iolink->minimum_latency; 381 props->max_latency = iolink->maximum_latency; 382 props->min_bandwidth = iolink->minimum_bandwidth_mbs; 383 props->max_bandwidth = iolink->maximum_bandwidth_mbs; 384 props->rec_transfer_size = 385 iolink->recommended_transfer_size; 386 387 dev->io_link_count++; 388 dev->node_props.io_links_count++; 389 list_add_tail(&props->list, &dev->io_link_props); 390 break; 391 } 392 } 393 394 /* CPU topology is created before GPUs are detected, so CPU->GPU 395 * links are not built at that time. If a PCIe type is discovered, it 396 * means a GPU is detected and we are adding GPU->CPU to the topology. 397 * At this time, also add the corresponded CPU->GPU link if GPU 398 * is large bar. 399 * For xGMI, we only added the link with one direction in the crat 400 * table, add corresponded reversed direction link now. 
401 */ 402 if (props && (iolink->flags & CRAT_IOLINK_FLAGS_BI_DIRECTIONAL)) { 403 to_dev = kfd_topology_device_by_proximity_domain(id_to); 404 if (!to_dev) 405 return -ENODEV; 406 /* same everything but the other direction */ 407 props2 = kmemdup(props, sizeof(*props2), GFP_KERNEL); 408 props2->node_from = id_to; 409 props2->node_to = id_from; 410 props2->kobj = NULL; 411 to_dev->io_link_count++; 412 to_dev->node_props.io_links_count++; 413 list_add_tail(&props2->list, &to_dev->io_link_props); 414 } 415 416 return 0; 417 } 418 419 /* kfd_parse_subtype - parse subtypes and attach it to correct topology device 420 * present in the device_list 421 * @sub_type_hdr - subtype section of crat_image 422 * @device_list - list of topology devices present in this crat_image 423 */ 424 static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr, 425 struct list_head *device_list) 426 { 427 struct crat_subtype_computeunit *cu; 428 struct crat_subtype_memory *mem; 429 struct crat_subtype_cache *cache; 430 struct crat_subtype_iolink *iolink; 431 int ret = 0; 432 433 switch (sub_type_hdr->type) { 434 case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY: 435 cu = (struct crat_subtype_computeunit *)sub_type_hdr; 436 ret = kfd_parse_subtype_cu(cu, device_list); 437 break; 438 case CRAT_SUBTYPE_MEMORY_AFFINITY: 439 mem = (struct crat_subtype_memory *)sub_type_hdr; 440 ret = kfd_parse_subtype_mem(mem, device_list); 441 break; 442 case CRAT_SUBTYPE_CACHE_AFFINITY: 443 cache = (struct crat_subtype_cache *)sub_type_hdr; 444 ret = kfd_parse_subtype_cache(cache, device_list); 445 break; 446 case CRAT_SUBTYPE_TLB_AFFINITY: 447 /* 448 * For now, nothing to do here 449 */ 450 pr_debug("Found TLB entry in CRAT table (not processing)\n"); 451 break; 452 case CRAT_SUBTYPE_CCOMPUTE_AFFINITY: 453 /* 454 * For now, nothing to do here 455 */ 456 pr_debug("Found CCOMPUTE entry in CRAT table (not processing)\n"); 457 break; 458 case CRAT_SUBTYPE_IOLINK_AFFINITY: 459 iolink = (struct crat_subtype_iolink 
				*)sub_type_hdr;
		ret = kfd_parse_subtype_iolink(iolink, device_list);
		break;
	default:
		pr_warn("Unknown subtype %d in CRAT\n",
				sub_type_hdr->type);
	}

	return ret;
}

/* kfd_parse_crat_table - parse CRAT table. For each node present in CRAT
 * create a kfd_topology_device and add in to device_list. Also parse
 * CRAT subtypes and attach it to appropriate kfd_topology_device
 * @crat_image - input image containing CRAT
 * @device_list - [OUT] list of kfd_topology_device generated after
 *		  parsing crat_image
 * @proximity_domain - Proximity domain of the first device in the table
 *
 * Return - 0 if successful else -ve value
 */
int kfd_parse_crat_table(void *crat_image, struct list_head *device_list,
			 uint32_t proximity_domain)
{
	struct kfd_topology_device *top_dev = NULL;
	struct crat_subtype_generic *sub_type_hdr;
	uint16_t node_id;
	int ret = 0;
	struct crat_header *crat_table = (struct crat_header *)crat_image;
	uint16_t num_nodes;
	uint32_t image_len;

	if (!crat_image)
		return -EINVAL;

	if (!list_empty(device_list)) {
		pr_warn("Error device list should be empty\n");
		return -EINVAL;
	}

	num_nodes = crat_table->num_domains;
	image_len = crat_table->length;

	pr_info("Parsing CRAT table with %d nodes\n", num_nodes);

	/* One topology device per CRAT domain, with consecutive
	 * proximity domains starting at @proximity_domain
	 */
	for (node_id = 0; node_id < num_nodes; node_id++) {
		top_dev = kfd_create_topology_device(device_list);
		if (!top_dev)
			break;
		top_dev->proximity_domain = proximity_domain++;
	}

	if (!top_dev) {
		ret = -ENOMEM;
		goto err;
	}

	memcpy(top_dev->oem_id, crat_table->oem_id, CRAT_OEMID_LENGTH);
	memcpy(top_dev->oem_table_id, crat_table->oem_table_id,
			CRAT_OEMTABLEID_LENGTH);
	top_dev->oem_revision = crat_table->oem_revision;

	/* Walk the variable-length subtype entries that follow the header */
	sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1);
	while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) <
			((char *)crat_image) + image_len) {
		if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) {
			ret = kfd_parse_subtype(sub_type_hdr, device_list);
			if (ret)
				break;
		}

		sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
				sub_type_hdr->length);
	}

err:
	if (ret)
		kfd_release_topology_device_list(device_list);

	return ret;
}

/* Helper function. See kfd_fill_gpu_cache_info for parameter description */
static int fill_in_pcache(struct crat_subtype_cache *pcache,
			struct kfd_gpu_cache_info *pcache_info,
			struct kfd_cu_info *cu_info,
			int mem_available,
			int cu_bitmask,
			int cache_type, unsigned int cu_processor_id,
			int cu_block)
{
	unsigned int cu_sibling_map_mask;
	int first_active_cu;

	/* First check if enough memory is available */
	if (sizeof(struct crat_subtype_cache) > mem_available)
		return -ENOMEM;

	/* Isolate the num_cu_shared-wide window of the CU bitmap that
	 * corresponds to this cache instance
	 */
	cu_sibling_map_mask = cu_bitmask;
	cu_sibling_map_mask >>= cu_block;
	cu_sibling_map_mask &=
		((1 << pcache_info[cache_type].num_cu_shared) - 1);
	first_active_cu = ffs(cu_sibling_map_mask);

	/* CU could be inactive. In case of shared cache find the first active
	 * CU. and incase of non-shared cache check if the CU is inactive.
If 565 * inactive active skip it 566 */ 567 if (first_active_cu) { 568 memset(pcache, 0, sizeof(struct crat_subtype_cache)); 569 pcache->type = CRAT_SUBTYPE_CACHE_AFFINITY; 570 pcache->length = sizeof(struct crat_subtype_cache); 571 pcache->flags = pcache_info[cache_type].flags; 572 pcache->processor_id_low = cu_processor_id 573 + (first_active_cu - 1); 574 pcache->cache_level = pcache_info[cache_type].cache_level; 575 pcache->cache_size = pcache_info[cache_type].cache_size; 576 577 /* Sibling map is w.r.t processor_id_low, so shift out 578 * inactive CU 579 */ 580 cu_sibling_map_mask = 581 cu_sibling_map_mask >> (first_active_cu - 1); 582 583 pcache->sibling_map[0] = (uint8_t)(cu_sibling_map_mask & 0xFF); 584 pcache->sibling_map[1] = 585 (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF); 586 pcache->sibling_map[2] = 587 (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF); 588 pcache->sibling_map[3] = 589 (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF); 590 return 0; 591 } 592 return 1; 593 } 594 595 /* kfd_fill_gpu_cache_info - Fill GPU cache info using kfd_gpu_cache_info 596 * tables 597 * 598 * @kdev - [IN] GPU device 599 * @gpu_processor_id - [IN] GPU processor ID to which these caches 600 * associate 601 * @available_size - [IN] Amount of memory available in pcache 602 * @cu_info - [IN] Compute Unit info obtained from KGD 603 * @pcache - [OUT] memory into which cache data is to be filled in. 604 * @size_filled - [OUT] amount of data used up in pcache. 
 * @num_of_entries - [OUT] number of caches added
 */
static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev,
			int gpu_processor_id,
			int available_size,
			struct kfd_cu_info *cu_info,
			struct crat_subtype_cache *pcache,
			int *size_filled,
			int *num_of_entries)
{
	struct kfd_gpu_cache_info *pcache_info;
	int num_of_cache_types = 0;
	int i, j, k;
	int ct = 0;
	int mem_available = available_size;
	unsigned int cu_processor_id;
	int ret;

	/* Select the static cache table matching the ASIC family */
	switch (kdev->device_info->asic_family) {
	case CHIP_KAVERI:
		pcache_info = kaveri_cache_info;
		num_of_cache_types = ARRAY_SIZE(kaveri_cache_info);
		break;
	case CHIP_HAWAII:
		pcache_info = hawaii_cache_info;
		num_of_cache_types = ARRAY_SIZE(hawaii_cache_info);
		break;
	case CHIP_CARRIZO:
		pcache_info = carrizo_cache_info;
		num_of_cache_types = ARRAY_SIZE(carrizo_cache_info);
		break;
	case CHIP_TONGA:
		pcache_info = tonga_cache_info;
		num_of_cache_types = ARRAY_SIZE(tonga_cache_info);
		break;
	case CHIP_FIJI:
		pcache_info = fiji_cache_info;
		num_of_cache_types = ARRAY_SIZE(fiji_cache_info);
		break;
	case CHIP_POLARIS10:
		pcache_info = polaris10_cache_info;
		num_of_cache_types = ARRAY_SIZE(polaris10_cache_info);
		break;
	case CHIP_POLARIS11:
		pcache_info = polaris11_cache_info;
		num_of_cache_types = ARRAY_SIZE(polaris11_cache_info);
		break;
	case CHIP_POLARIS12:
		pcache_info = polaris12_cache_info;
		num_of_cache_types = ARRAY_SIZE(polaris12_cache_info);
		break;
	case CHIP_VEGAM:
		pcache_info = vegam_cache_info;
		num_of_cache_types = ARRAY_SIZE(vegam_cache_info);
		break;
	case CHIP_VEGA10:
	case CHIP_VEGA12:
	case CHIP_VEGA20:
		pcache_info = vega10_cache_info;
		num_of_cache_types = ARRAY_SIZE(vega10_cache_info);
		break;
	case CHIP_RAVEN:
		pcache_info = raven_cache_info;
		num_of_cache_types = ARRAY_SIZE(raven_cache_info);
		break;
	default:
		return -EINVAL;
	}

	*size_filled = 0;
	*num_of_entries = 0;

	/* For each type of cache listed in the kfd_gpu_cache_info table,
	 * go through all available Compute Units.
	 * The [i,j,k] loop will
	 *		if kfd_gpu_cache_info.num_cu_shared = 1
	 *			will parse through all available CU
	 *		If (kfd_gpu_cache_info.num_cu_shared != 1)
	 *			then it will consider only one CU from
	 *			the shared unit
	 */

	for (ct = 0; ct < num_of_cache_types; ct++) {
		cu_processor_id = gpu_processor_id;
		for (i = 0; i < cu_info->num_shader_engines; i++) {
			for (j = 0; j < cu_info->num_shader_arrays_per_engine;
				j++) {
				for (k = 0; k < cu_info->num_cu_per_sh;
					k += pcache_info[ct].num_cu_shared) {

					ret = fill_in_pcache(pcache,
						pcache_info,
						cu_info,
						mem_available,
						cu_info->cu_bitmap[i][j],
						ct,
						cu_processor_id,
						k);

					if (ret < 0)
						break;

					/* ret == 0: an entry was written;
					 * ret == 1: block inactive, skipped
					 */
					if (!ret) {
						pcache++;
						(*num_of_entries)++;
						mem_available -=
							sizeof(*pcache);
						(*size_filled) +=
							sizeof(*pcache);
					}

					/* Move to next CU block */
					cu_processor_id +=
						pcache_info[ct].num_cu_shared;
				}
			}
		}
	}

	pr_debug("Added [%d] GPU cache entries\n", *num_of_entries);

	return 0;
}

/*
 * kfd_create_crat_image_acpi - Allocates memory for CRAT image and
 * copies CRAT from ACPI (if available).
 * NOTE: Call kfd_destroy_crat_image to free CRAT image memory
 *
 * @crat_image: CRAT read from ACPI.
If no CRAT in ACPI then 735 * crat_image will be NULL 736 * @size: [OUT] size of crat_image 737 * 738 * Return 0 if successful else return error code 739 */ 740 int kfd_create_crat_image_acpi(void **crat_image, size_t *size) 741 { 742 struct acpi_table_header *crat_table; 743 acpi_status status; 744 void *pcrat_image; 745 746 if (!crat_image) 747 return -EINVAL; 748 749 *crat_image = NULL; 750 751 /* Fetch the CRAT table from ACPI */ 752 status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table); 753 if (status == AE_NOT_FOUND) { 754 pr_warn("CRAT table not found\n"); 755 return -ENODATA; 756 } else if (ACPI_FAILURE(status)) { 757 const char *err = acpi_format_exception(status); 758 759 pr_err("CRAT table error: %s\n", err); 760 return -EINVAL; 761 } 762 763 if (ignore_crat) { 764 pr_info("CRAT table disabled by module option\n"); 765 return -ENODATA; 766 } 767 768 pcrat_image = kmemdup(crat_table, crat_table->length, GFP_KERNEL); 769 if (!pcrat_image) 770 return -ENOMEM; 771 772 *crat_image = pcrat_image; 773 *size = crat_table->length; 774 775 return 0; 776 } 777 778 /* Memory required to create Virtual CRAT. 779 * Since there is no easy way to predict the amount of memory required, the 780 * following amount are allocated for CPU and GPU Virtual CRAT. This is 781 * expected to cover all known conditions. But to be safe additional check 782 * is put in the code to ensure we don't overwrite. 
 */
#define VCRAT_SIZE_FOR_CPU	(2 * PAGE_SIZE)
#define VCRAT_SIZE_FOR_GPU	(3 * PAGE_SIZE)

/* kfd_fill_cu_for_cpu - Fill in Compute info for the given CPU NUMA node
 *
 * @numa_node_id: CPU NUMA node id
 * @avail_size: Available size in the memory
 * @sub_type_hdr: Memory into which compute info will be filled in
 *
 * Return 0 if successful else return -ve value
 */
static int kfd_fill_cu_for_cpu(int numa_node_id, int *avail_size,
				int proximity_domain,
				struct crat_subtype_computeunit *sub_type_hdr)
{
	const struct cpumask *cpumask;

	*avail_size -= sizeof(struct crat_subtype_computeunit);
	if (*avail_size < 0)
		return -ENOMEM;

	memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit));

	/* Fill in subtype header data */
	sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY;
	sub_type_hdr->length = sizeof(struct crat_subtype_computeunit);
	sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;

	cpumask = cpumask_of_node(numa_node_id);

	/* Fill in CU data */
	sub_type_hdr->flags |= CRAT_CU_FLAGS_CPU_PRESENT;
	sub_type_hdr->proximity_domain = proximity_domain;
	sub_type_hdr->processor_id_low = kfd_numa_node_to_apic_id(numa_node_id);
	if (sub_type_hdr->processor_id_low == -1)
		return -EINVAL;

	sub_type_hdr->num_cpu_cores = cpumask_weight(cpumask);

	return 0;
}

/* kfd_fill_mem_info_for_cpu - Fill in Memory info for the given CPU NUMA node
 *
 * @numa_node_id: CPU NUMA node id
 * @avail_size: Available size in the memory
 * @sub_type_hdr: Memory into which compute info will be filled in
 *
 * Return 0 if successful else return -ve value
 */
static int kfd_fill_mem_info_for_cpu(int numa_node_id, int *avail_size,
			int proximity_domain,
			struct crat_subtype_memory *sub_type_hdr)
{
	uint64_t mem_in_bytes = 0;
	pg_data_t *pgdat;
	int zone_type;

	*avail_size -= sizeof(struct
			crat_subtype_memory);
	if (*avail_size < 0)
		return -ENOMEM;

	memset(sub_type_hdr, 0, sizeof(struct crat_subtype_memory));

	/* Fill in subtype header data */
	sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY;
	sub_type_hdr->length = sizeof(struct crat_subtype_memory);
	sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;

	/* Fill in Memory Subunit data */

	/* Unlike si_meminfo, si_meminfo_node is not exported. So
	 * the following lines are duplicated from si_meminfo_node
	 * function
	 */
	pgdat = NODE_DATA(numa_node_id);
	for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
		mem_in_bytes += zone_managed_pages(&pgdat->node_zones[zone_type]);
	mem_in_bytes <<= PAGE_SHIFT;

	sub_type_hdr->length_low = lower_32_bits(mem_in_bytes);
	sub_type_hdr->length_high = upper_32_bits(mem_in_bytes);
	sub_type_hdr->proximity_domain = proximity_domain;

	return 0;
}

#ifdef CONFIG_X86_64
/* Emit one IO-link subtype from NUMA node @numa_node_id to every other
 * online NUMA node; *num_entries is set to the number of links written.
 */
static int kfd_fill_iolink_info_for_cpu(int numa_node_id, int *avail_size,
				uint32_t *num_entries,
				struct crat_subtype_iolink *sub_type_hdr)
{
	int nid;
	struct cpuinfo_x86 *c = &cpu_data(0);
	uint8_t link_type;

	if (c->x86_vendor == X86_VENDOR_AMD)
		link_type = CRAT_IOLINK_TYPE_HYPERTRANSPORT;
	else
		link_type = CRAT_IOLINK_TYPE_QPI_1_1;

	*num_entries = 0;

	/* Create IO links from this node to other CPU nodes */
	for_each_online_node(nid) {
		if (nid == numa_node_id) /* node itself */
			continue;

		*avail_size -= sizeof(struct crat_subtype_iolink);
		if (*avail_size < 0)
			return -ENOMEM;

		memset(sub_type_hdr, 0, sizeof(struct crat_subtype_iolink));

		/* Fill in subtype header data */
		sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;
		sub_type_hdr->length = sizeof(struct crat_subtype_iolink);
		sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;

		/* Fill in IO link data */
		sub_type_hdr->proximity_domain_from = numa_node_id;
		sub_type_hdr->proximity_domain_to = nid;
		sub_type_hdr->io_interface_type = link_type;

		(*num_entries)++;
		sub_type_hdr++;
	}

	return 0;
}
#endif

/* kfd_create_vcrat_image_cpu - Create Virtual CRAT for CPU
 *
 * @pcrat_image: Fill in VCRAT for CPU
 * @size:	[IN] allocated size of crat_image.
 *		[OUT] actual size of data filled in crat_image
 */
static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size)
{
	struct crat_header *crat_table = (struct crat_header *)pcrat_image;
	struct acpi_table_header *acpi_table;
	acpi_status status;
	struct crat_subtype_generic *sub_type_hdr;
	int avail_size = *size;
	int numa_node_id;
#ifdef CONFIG_X86_64
	uint32_t entries = 0;
#endif
	int ret = 0;

	if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_CPU)
		return -EINVAL;

	/* Fill in CRAT Header.
	 * Modify length and total_entries as subunits are added.
	 */
	avail_size -= sizeof(struct crat_header);
	if (avail_size < 0)
		return -ENOMEM;

	memset(crat_table, 0, sizeof(struct crat_header));
	memcpy(&crat_table->signature, CRAT_SIGNATURE,
			sizeof(crat_table->signature));
	crat_table->length = sizeof(struct crat_header);

	/* OEM fields are copied from the ACPI DSDT header when available */
	status = acpi_get_table("DSDT", 0, &acpi_table);
	if (status != AE_OK)
		pr_warn("DSDT table not found for OEM information\n");
	else {
		crat_table->oem_revision = acpi_table->revision;
		memcpy(crat_table->oem_id, acpi_table->oem_id,
				CRAT_OEMID_LENGTH);
		memcpy(crat_table->oem_table_id, acpi_table->oem_table_id,
				CRAT_OEMTABLEID_LENGTH);
	}
	crat_table->total_entries = 0;
	crat_table->num_domains = 0;

	sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1);

	for_each_online_node(numa_node_id) {
		/* Skip NUMA nodes with no CPUs */
		if (kfd_numa_node_to_apic_id(numa_node_id) == -1)
			continue;

		/* Fill in Subtype: Compute Unit */
		ret = kfd_fill_cu_for_cpu(numa_node_id, &avail_size,
			crat_table->num_domains,
			(struct crat_subtype_computeunit *)sub_type_hdr);
		if (ret < 0)
			return ret;
		crat_table->length += sub_type_hdr->length;
		crat_table->total_entries++;

		sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
				sub_type_hdr->length);

		/* Fill in Subtype: Memory */
		ret = kfd_fill_mem_info_for_cpu(numa_node_id, &avail_size,
			crat_table->num_domains,
			(struct crat_subtype_memory *)sub_type_hdr);
		if (ret < 0)
			return ret;
		crat_table->length += sub_type_hdr->length;
		crat_table->total_entries++;

		sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
				sub_type_hdr->length);

		/* Fill in Subtype: IO Link */
#ifdef CONFIG_X86_64
		ret = kfd_fill_iolink_info_for_cpu(numa_node_id, &avail_size,
				&entries,
				(struct crat_subtype_iolink *)sub_type_hdr);
		if (ret < 0)
			return ret;
		crat_table->length += (sub_type_hdr->length * entries);
		crat_table->total_entries += entries;

		sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
				sub_type_hdr->length * entries);
#else
		pr_info("IO link not available for non x86 platforms\n");
#endif

		crat_table->num_domains++;
	}

	/* TODO: Add cache Subtype for CPU.
	 * Currently, CPU cache information is available in function
	 * detect_cache_attributes(cpu) defined in the file
	 * ./arch/x86/kernel/cpu/intel_cacheinfo.c. This function is not
	 * exported and to get the same information the code needs to be
	 * duplicated.
	 */

	*size = crat_table->length;
	pr_info("Virtual CRAT table created for CPU\n");

	return 0;
}

/* Fill one CRAT memory-affinity subtype describing GPU local memory of
 * visibility @type and length @size for the given proximity domain.
 */
static int kfd_fill_gpu_memory_affinity(int *avail_size,
		struct kfd_dev *kdev, uint8_t type, uint64_t size,
		struct crat_subtype_memory *sub_type_hdr,
		uint32_t proximity_domain,
		const struct kfd_local_mem_info *local_mem_info)
{
	*avail_size -= sizeof(struct crat_subtype_memory);
	if (*avail_size < 0)
		return -ENOMEM;

	memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_memory));
	sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY;
	sub_type_hdr->length = sizeof(struct crat_subtype_memory);
	sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED;

	sub_type_hdr->proximity_domain = proximity_domain;

	pr_debug("Fill gpu memory affinity - type 0x%x size 0x%llx\n",
			type, size);

	sub_type_hdr->length_low = lower_32_bits(size);
	sub_type_hdr->length_high = upper_32_bits(size);

	sub_type_hdr->width = local_mem_info->vram_width;
	sub_type_hdr->visibility_type = type;

	return 0;
}

/* kfd_fill_gpu_direct_io_link - Fill in direct io link from GPU
 * to its NUMA node
 * @avail_size: Available size in the memory
 * @kdev - [IN] GPU device
 * @sub_type_hdr: Memory into which io link info will be filled in
 *
@proximity_domain - proximity domain of the GPU node 1061 * 1062 * Return 0 if successful else return -ve value 1063 */ 1064 static int kfd_fill_gpu_direct_io_link_to_cpu(int *avail_size, 1065 struct kfd_dev *kdev, 1066 struct crat_subtype_iolink *sub_type_hdr, 1067 uint32_t proximity_domain) 1068 { 1069 *avail_size -= sizeof(struct crat_subtype_iolink); 1070 if (*avail_size < 0) 1071 return -ENOMEM; 1072 1073 memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink)); 1074 1075 /* Fill in subtype header data */ 1076 sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY; 1077 sub_type_hdr->length = sizeof(struct crat_subtype_iolink); 1078 sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED; 1079 if (kfd_dev_is_large_bar(kdev)) 1080 sub_type_hdr->flags |= CRAT_IOLINK_FLAGS_BI_DIRECTIONAL; 1081 1082 /* Fill in IOLINK subtype. 1083 * TODO: Fill-in other fields of iolink subtype 1084 */ 1085 sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS; 1086 sub_type_hdr->proximity_domain_from = proximity_domain; 1087 #ifdef CONFIG_NUMA 1088 if (kdev->pdev->dev.numa_node == NUMA_NO_NODE) 1089 sub_type_hdr->proximity_domain_to = 0; 1090 else 1091 sub_type_hdr->proximity_domain_to = kdev->pdev->dev.numa_node; 1092 #else 1093 sub_type_hdr->proximity_domain_to = 0; 1094 #endif 1095 return 0; 1096 } 1097 1098 static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size, 1099 struct kfd_dev *kdev, 1100 struct kfd_dev *peer_kdev, 1101 struct crat_subtype_iolink *sub_type_hdr, 1102 uint32_t proximity_domain_from, 1103 uint32_t proximity_domain_to) 1104 { 1105 *avail_size -= sizeof(struct crat_subtype_iolink); 1106 if (*avail_size < 0) 1107 return -ENOMEM; 1108 1109 memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink)); 1110 1111 sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY; 1112 sub_type_hdr->length = sizeof(struct crat_subtype_iolink); 1113 sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED | 1114 CRAT_IOLINK_FLAGS_BI_DIRECTIONAL; 1115 1116 
sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI; 1117 sub_type_hdr->proximity_domain_from = proximity_domain_from; 1118 sub_type_hdr->proximity_domain_to = proximity_domain_to; 1119 sub_type_hdr->num_hops_xgmi = 1120 amdgpu_amdkfd_get_xgmi_hops_count(kdev->kgd, peer_kdev->kgd); 1121 return 0; 1122 } 1123 1124 /* kfd_create_vcrat_image_gpu - Create Virtual CRAT for CPU 1125 * 1126 * @pcrat_image: Fill in VCRAT for GPU 1127 * @size: [IN] allocated size of crat_image. 1128 * [OUT] actual size of data filled in crat_image 1129 */ 1130 static int kfd_create_vcrat_image_gpu(void *pcrat_image, 1131 size_t *size, struct kfd_dev *kdev, 1132 uint32_t proximity_domain) 1133 { 1134 struct crat_header *crat_table = (struct crat_header *)pcrat_image; 1135 struct crat_subtype_generic *sub_type_hdr; 1136 struct kfd_local_mem_info local_mem_info; 1137 struct kfd_topology_device *peer_dev; 1138 struct crat_subtype_computeunit *cu; 1139 struct kfd_cu_info cu_info; 1140 int avail_size = *size; 1141 uint32_t total_num_of_cu; 1142 int num_of_cache_entries = 0; 1143 int cache_mem_filled = 0; 1144 uint32_t nid = 0; 1145 int ret = 0; 1146 1147 if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_GPU) 1148 return -EINVAL; 1149 1150 /* Fill the CRAT Header. 1151 * Modify length and total_entries as subunits are added. 
1152 */ 1153 avail_size -= sizeof(struct crat_header); 1154 if (avail_size < 0) 1155 return -ENOMEM; 1156 1157 memset(crat_table, 0, sizeof(struct crat_header)); 1158 1159 memcpy(&crat_table->signature, CRAT_SIGNATURE, 1160 sizeof(crat_table->signature)); 1161 /* Change length as we add more subtypes*/ 1162 crat_table->length = sizeof(struct crat_header); 1163 crat_table->num_domains = 1; 1164 crat_table->total_entries = 0; 1165 1166 /* Fill in Subtype: Compute Unit 1167 * First fill in the sub type header and then sub type data 1168 */ 1169 avail_size -= sizeof(struct crat_subtype_computeunit); 1170 if (avail_size < 0) 1171 return -ENOMEM; 1172 1173 sub_type_hdr = (struct crat_subtype_generic *)(crat_table + 1); 1174 memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit)); 1175 1176 sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY; 1177 sub_type_hdr->length = sizeof(struct crat_subtype_computeunit); 1178 sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; 1179 1180 /* Fill CU subtype data */ 1181 cu = (struct crat_subtype_computeunit *)sub_type_hdr; 1182 cu->flags |= CRAT_CU_FLAGS_GPU_PRESENT; 1183 cu->proximity_domain = proximity_domain; 1184 1185 amdgpu_amdkfd_get_cu_info(kdev->kgd, &cu_info); 1186 cu->num_simd_per_cu = cu_info.simd_per_cu; 1187 cu->num_simd_cores = cu_info.simd_per_cu * cu_info.cu_active_number; 1188 cu->max_waves_simd = cu_info.max_waves_per_simd; 1189 1190 cu->wave_front_size = cu_info.wave_front_size; 1191 cu->array_count = cu_info.num_shader_arrays_per_engine * 1192 cu_info.num_shader_engines; 1193 total_num_of_cu = (cu->array_count * cu_info.num_cu_per_sh); 1194 cu->processor_id_low = get_and_inc_gpu_processor_id(total_num_of_cu); 1195 cu->num_cu_per_array = cu_info.num_cu_per_sh; 1196 cu->max_slots_scatch_cu = cu_info.max_scratch_slots_per_cu; 1197 cu->num_banks = cu_info.num_shader_engines; 1198 cu->lds_size_in_kb = cu_info.lds_size; 1199 1200 cu->hsa_capability = 0; 1201 1202 /* Check if this node supports IOMMU. 
During parsing this flag will 1203 * translate to HSA_CAP_ATS_PRESENT 1204 */ 1205 if (!kfd_iommu_check_device(kdev)) 1206 cu->hsa_capability |= CRAT_CU_FLAGS_IOMMU_PRESENT; 1207 1208 crat_table->length += sub_type_hdr->length; 1209 crat_table->total_entries++; 1210 1211 /* Fill in Subtype: Memory. Only on systems with large BAR (no 1212 * private FB), report memory as public. On other systems 1213 * report the total FB size (public+private) as a single 1214 * private heap. 1215 */ 1216 amdgpu_amdkfd_get_local_mem_info(kdev->kgd, &local_mem_info); 1217 sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + 1218 sub_type_hdr->length); 1219 1220 if (debug_largebar) 1221 local_mem_info.local_mem_size_private = 0; 1222 1223 if (local_mem_info.local_mem_size_private == 0) 1224 ret = kfd_fill_gpu_memory_affinity(&avail_size, 1225 kdev, HSA_MEM_HEAP_TYPE_FB_PUBLIC, 1226 local_mem_info.local_mem_size_public, 1227 (struct crat_subtype_memory *)sub_type_hdr, 1228 proximity_domain, 1229 &local_mem_info); 1230 else 1231 ret = kfd_fill_gpu_memory_affinity(&avail_size, 1232 kdev, HSA_MEM_HEAP_TYPE_FB_PRIVATE, 1233 local_mem_info.local_mem_size_public + 1234 local_mem_info.local_mem_size_private, 1235 (struct crat_subtype_memory *)sub_type_hdr, 1236 proximity_domain, 1237 &local_mem_info); 1238 if (ret < 0) 1239 return ret; 1240 1241 crat_table->length += sizeof(struct crat_subtype_memory); 1242 crat_table->total_entries++; 1243 1244 /* TODO: Fill in cache information. 
This information is NOT readily 1245 * available in KGD 1246 */ 1247 sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + 1248 sub_type_hdr->length); 1249 ret = kfd_fill_gpu_cache_info(kdev, cu->processor_id_low, 1250 avail_size, 1251 &cu_info, 1252 (struct crat_subtype_cache *)sub_type_hdr, 1253 &cache_mem_filled, 1254 &num_of_cache_entries); 1255 1256 if (ret < 0) 1257 return ret; 1258 1259 crat_table->length += cache_mem_filled; 1260 crat_table->total_entries += num_of_cache_entries; 1261 avail_size -= cache_mem_filled; 1262 1263 /* Fill in Subtype: IO_LINKS 1264 * Only direct links are added here which is Link from GPU to 1265 * to its NUMA node. Indirect links are added by userspace. 1266 */ 1267 sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + 1268 cache_mem_filled); 1269 ret = kfd_fill_gpu_direct_io_link_to_cpu(&avail_size, kdev, 1270 (struct crat_subtype_iolink *)sub_type_hdr, proximity_domain); 1271 1272 if (ret < 0) 1273 return ret; 1274 1275 crat_table->length += sub_type_hdr->length; 1276 crat_table->total_entries++; 1277 1278 1279 /* Fill in Subtype: IO_LINKS 1280 * Direct links from GPU to other GPUs through xGMI. 1281 * We will loop GPUs that already be processed (with lower value 1282 * of proximity_domain), add the link for the GPUs with same 1283 * hive id (from this GPU to other GPU) . The reversed iolink 1284 * (from other GPU to this GPU) will be added 1285 * in kfd_parse_subtype_iolink. 
1286 */ 1287 if (kdev->hive_id) { 1288 for (nid = 0; nid < proximity_domain; ++nid) { 1289 peer_dev = kfd_topology_device_by_proximity_domain(nid); 1290 if (!peer_dev->gpu) 1291 continue; 1292 if (peer_dev->gpu->hive_id != kdev->hive_id) 1293 continue; 1294 sub_type_hdr = (typeof(sub_type_hdr))( 1295 (char *)sub_type_hdr + 1296 sizeof(struct crat_subtype_iolink)); 1297 ret = kfd_fill_gpu_xgmi_link_to_gpu( 1298 &avail_size, kdev, peer_dev->gpu, 1299 (struct crat_subtype_iolink *)sub_type_hdr, 1300 proximity_domain, nid); 1301 if (ret < 0) 1302 return ret; 1303 crat_table->length += sub_type_hdr->length; 1304 crat_table->total_entries++; 1305 } 1306 } 1307 *size = crat_table->length; 1308 pr_info("Virtual CRAT table created for GPU\n"); 1309 1310 return ret; 1311 } 1312 1313 /* kfd_create_crat_image_virtual - Allocates memory for CRAT image and 1314 * creates a Virtual CRAT (VCRAT) image 1315 * 1316 * NOTE: Call kfd_destroy_crat_image to free CRAT image memory 1317 * 1318 * @crat_image: VCRAT image created because ACPI does not have a 1319 * CRAT for this device 1320 * @size: [OUT] size of virtual crat_image 1321 * @flags: COMPUTE_UNIT_CPU - Create VCRAT for CPU device 1322 * COMPUTE_UNIT_GPU - Create VCRAT for GPU 1323 * (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU) - Create VCRAT for APU 1324 * -- this option is not currently implemented. 1325 * The assumption is that all AMD APUs will have CRAT 1326 * @kdev: Valid kfd_device required if flags contain COMPUTE_UNIT_GPU 1327 * 1328 * Return 0 if successful else return -ve value 1329 */ 1330 int kfd_create_crat_image_virtual(void **crat_image, size_t *size, 1331 int flags, struct kfd_dev *kdev, 1332 uint32_t proximity_domain) 1333 { 1334 void *pcrat_image = NULL; 1335 int ret = 0; 1336 1337 if (!crat_image) 1338 return -EINVAL; 1339 1340 *crat_image = NULL; 1341 1342 /* Allocate one VCRAT_SIZE_FOR_CPU for CPU virtual CRAT image and 1343 * VCRAT_SIZE_FOR_GPU for GPU virtual CRAT image. 
This should cover 1344 * all the current conditions. A check is put not to overwrite beyond 1345 * allocated size 1346 */ 1347 switch (flags) { 1348 case COMPUTE_UNIT_CPU: 1349 pcrat_image = kmalloc(VCRAT_SIZE_FOR_CPU, GFP_KERNEL); 1350 if (!pcrat_image) 1351 return -ENOMEM; 1352 *size = VCRAT_SIZE_FOR_CPU; 1353 ret = kfd_create_vcrat_image_cpu(pcrat_image, size); 1354 break; 1355 case COMPUTE_UNIT_GPU: 1356 if (!kdev) 1357 return -EINVAL; 1358 pcrat_image = kmalloc(VCRAT_SIZE_FOR_GPU, GFP_KERNEL); 1359 if (!pcrat_image) 1360 return -ENOMEM; 1361 *size = VCRAT_SIZE_FOR_GPU; 1362 ret = kfd_create_vcrat_image_gpu(pcrat_image, size, kdev, 1363 proximity_domain); 1364 break; 1365 case (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU): 1366 /* TODO: */ 1367 ret = -EINVAL; 1368 pr_err("VCRAT not implemented for APU\n"); 1369 break; 1370 default: 1371 ret = -EINVAL; 1372 } 1373 1374 if (!ret) 1375 *crat_image = pcrat_image; 1376 else 1377 kfree(pcrat_image); 1378 1379 return ret; 1380 } 1381 1382 1383 /* kfd_destroy_crat_image 1384 * 1385 * @crat_image: [IN] - crat_image from kfd_create_crat_image_xxx(..) 1386 * 1387 */ 1388 void kfd_destroy_crat_image(void *crat_image) 1389 { 1390 kfree(crat_image); 1391 } 1392