// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright 2015-2022 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include <linux/pci.h>
#include <linux/acpi.h>
#include "kfd_crat.h"
#include "kfd_priv.h"
#include "kfd_topology.h"
#include "kfd_iommu.h"
#include "amdgpu.h"
#include "amdgpu_amdkfd.h"

/* GPU Processor ID base for dGPUs for which VCRAT needs to be created.
 * GPU processor IDs are expressed with Bit[31]=1.
 * The base is set to 0x8000_0000 + 0x1000 to avoid collision with GPU IDs
 * used in the CRAT.
 */
static uint32_t gpu_processor_id_low = 0x80001000;

/* Return the next available gpu_processor_id and increment it for the next GPU
 * @total_cu_count - Total CUs present in the GPU, including ones
 *		     masked off
 */
static inline unsigned int get_and_inc_gpu_processor_id(
				unsigned int total_cu_count)
{
	int current_id = gpu_processor_id_low;

	gpu_processor_id_low += total_cu_count;
	return current_id;
}

/* Static table to describe GPU Cache information */
struct kfd_gpu_cache_info {
	uint32_t	cache_size;
	uint32_t	cache_level;
	uint32_t	flags;
	/* Indicates how many Compute Units share this cache
	 * within a SA. Value = 1 indicates the cache is not shared
	 */
	uint32_t	num_cu_shared;
};

static struct kfd_gpu_cache_info kaveri_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache (in SQC module) per bank */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache (in SQC module) per bank */
		.cache_size = 8,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},

	/* TODO: Add L2 Cache information */
};


static struct kfd_gpu_cache_info carrizo_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache (in SQC module) per bank */
		.cache_size = 8,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 4,
	},
	{
		/* Scalar L1 Data Cache (in SQC module) per bank. */
		.cache_size = 4,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 4,
	},

	/* TODO: Add L2 Cache information */
};

#define hawaii_cache_info kaveri_cache_info
#define tonga_cache_info carrizo_cache_info
#define fiji_cache_info carrizo_cache_info
#define polaris10_cache_info carrizo_cache_info
#define polaris11_cache_info carrizo_cache_info
#define polaris12_cache_info carrizo_cache_info
#define vegam_cache_info carrizo_cache_info

/* NOTE: L1 cache information has been updated and L2/L3
 * cache information has been added for Vega10 and
 * newer ASICs. The unit for cache_size is KiB.
 * In the future, cache details must be checked and
 * updated for every new ASIC.
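 *
 * For example (illustrative only, not taken from any specific ASIC),
 * an entry such as
 *   { .cache_size = 16, .cache_level = 1,
 *     .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE |
 *		 CRAT_CACHE_FLAGS_SIMD_CACHE),
 *     .num_cu_shared = 2 }
 * would describe a 16 KiB L1 data cache shared by two CUs within a SA;
 * the per-ASIC tables below all follow this pattern.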
142 */ 143 144 static struct kfd_gpu_cache_info vega10_cache_info[] = { 145 { 146 /* TCP L1 Cache per CU */ 147 .cache_size = 16, 148 .cache_level = 1, 149 .flags = (CRAT_CACHE_FLAGS_ENABLED | 150 CRAT_CACHE_FLAGS_DATA_CACHE | 151 CRAT_CACHE_FLAGS_SIMD_CACHE), 152 .num_cu_shared = 1, 153 }, 154 { 155 /* Scalar L1 Instruction Cache per SQC */ 156 .cache_size = 32, 157 .cache_level = 1, 158 .flags = (CRAT_CACHE_FLAGS_ENABLED | 159 CRAT_CACHE_FLAGS_INST_CACHE | 160 CRAT_CACHE_FLAGS_SIMD_CACHE), 161 .num_cu_shared = 3, 162 }, 163 { 164 /* Scalar L1 Data Cache per SQC */ 165 .cache_size = 16, 166 .cache_level = 1, 167 .flags = (CRAT_CACHE_FLAGS_ENABLED | 168 CRAT_CACHE_FLAGS_DATA_CACHE | 169 CRAT_CACHE_FLAGS_SIMD_CACHE), 170 .num_cu_shared = 3, 171 }, 172 { 173 /* L2 Data Cache per GPU (Total Tex Cache) */ 174 .cache_size = 4096, 175 .cache_level = 2, 176 .flags = (CRAT_CACHE_FLAGS_ENABLED | 177 CRAT_CACHE_FLAGS_DATA_CACHE | 178 CRAT_CACHE_FLAGS_SIMD_CACHE), 179 .num_cu_shared = 16, 180 }, 181 }; 182 183 static struct kfd_gpu_cache_info raven_cache_info[] = { 184 { 185 /* TCP L1 Cache per CU */ 186 .cache_size = 16, 187 .cache_level = 1, 188 .flags = (CRAT_CACHE_FLAGS_ENABLED | 189 CRAT_CACHE_FLAGS_DATA_CACHE | 190 CRAT_CACHE_FLAGS_SIMD_CACHE), 191 .num_cu_shared = 1, 192 }, 193 { 194 /* Scalar L1 Instruction Cache per SQC */ 195 .cache_size = 32, 196 .cache_level = 1, 197 .flags = (CRAT_CACHE_FLAGS_ENABLED | 198 CRAT_CACHE_FLAGS_INST_CACHE | 199 CRAT_CACHE_FLAGS_SIMD_CACHE), 200 .num_cu_shared = 3, 201 }, 202 { 203 /* Scalar L1 Data Cache per SQC */ 204 .cache_size = 16, 205 .cache_level = 1, 206 .flags = (CRAT_CACHE_FLAGS_ENABLED | 207 CRAT_CACHE_FLAGS_DATA_CACHE | 208 CRAT_CACHE_FLAGS_SIMD_CACHE), 209 .num_cu_shared = 3, 210 }, 211 { 212 /* L2 Data Cache per GPU (Total Tex Cache) */ 213 .cache_size = 1024, 214 .cache_level = 2, 215 .flags = (CRAT_CACHE_FLAGS_ENABLED | 216 CRAT_CACHE_FLAGS_DATA_CACHE | 217 CRAT_CACHE_FLAGS_SIMD_CACHE), 218 .num_cu_shared = 11, 219 }, 220 }; 221 222 static struct kfd_gpu_cache_info renoir_cache_info[] = { 223 { 224 /* TCP L1 Cache per CU */ 225 .cache_size = 16, 226 .cache_level = 1, 227 .flags = (CRAT_CACHE_FLAGS_ENABLED | 228 CRAT_CACHE_FLAGS_DATA_CACHE | 229 CRAT_CACHE_FLAGS_SIMD_CACHE), 230 .num_cu_shared = 1, 231 }, 232 { 233 /* Scalar L1 Instruction Cache per SQC */ 234 .cache_size = 32, 235 .cache_level = 1, 236 .flags = (CRAT_CACHE_FLAGS_ENABLED | 237 CRAT_CACHE_FLAGS_INST_CACHE | 238 CRAT_CACHE_FLAGS_SIMD_CACHE), 239 .num_cu_shared = 3, 240 }, 241 { 242 /* Scalar L1 Data Cache per SQC */ 243 .cache_size = 16, 244 .cache_level = 1, 245 .flags = (CRAT_CACHE_FLAGS_ENABLED | 246 CRAT_CACHE_FLAGS_DATA_CACHE | 247 CRAT_CACHE_FLAGS_SIMD_CACHE), 248 .num_cu_shared = 3, 249 }, 250 { 251 /* L2 Data Cache per GPU (Total Tex Cache) */ 252 .cache_size = 1024, 253 .cache_level = 2, 254 .flags = (CRAT_CACHE_FLAGS_ENABLED | 255 CRAT_CACHE_FLAGS_DATA_CACHE | 256 CRAT_CACHE_FLAGS_SIMD_CACHE), 257 .num_cu_shared = 8, 258 }, 259 }; 260 261 static struct kfd_gpu_cache_info vega12_cache_info[] = { 262 { 263 /* TCP L1 Cache per CU */ 264 .cache_size = 16, 265 .cache_level = 1, 266 .flags = (CRAT_CACHE_FLAGS_ENABLED | 267 CRAT_CACHE_FLAGS_DATA_CACHE | 268 CRAT_CACHE_FLAGS_SIMD_CACHE), 269 .num_cu_shared = 1, 270 }, 271 { 272 /* Scalar L1 Instruction Cache per SQC */ 273 .cache_size = 32, 274 .cache_level = 1, 275 .flags = (CRAT_CACHE_FLAGS_ENABLED | 276 CRAT_CACHE_FLAGS_INST_CACHE | 277 CRAT_CACHE_FLAGS_SIMD_CACHE), 278 .num_cu_shared = 3, 279 }, 280 { 281 /* Scalar L1 Data 
Cache per SQC */ 282 .cache_size = 16, 283 .cache_level = 1, 284 .flags = (CRAT_CACHE_FLAGS_ENABLED | 285 CRAT_CACHE_FLAGS_DATA_CACHE | 286 CRAT_CACHE_FLAGS_SIMD_CACHE), 287 .num_cu_shared = 3, 288 }, 289 { 290 /* L2 Data Cache per GPU (Total Tex Cache) */ 291 .cache_size = 2048, 292 .cache_level = 2, 293 .flags = (CRAT_CACHE_FLAGS_ENABLED | 294 CRAT_CACHE_FLAGS_DATA_CACHE | 295 CRAT_CACHE_FLAGS_SIMD_CACHE), 296 .num_cu_shared = 5, 297 }, 298 }; 299 300 static struct kfd_gpu_cache_info vega20_cache_info[] = { 301 { 302 /* TCP L1 Cache per CU */ 303 .cache_size = 16, 304 .cache_level = 1, 305 .flags = (CRAT_CACHE_FLAGS_ENABLED | 306 CRAT_CACHE_FLAGS_DATA_CACHE | 307 CRAT_CACHE_FLAGS_SIMD_CACHE), 308 .num_cu_shared = 1, 309 }, 310 { 311 /* Scalar L1 Instruction Cache per SQC */ 312 .cache_size = 32, 313 .cache_level = 1, 314 .flags = (CRAT_CACHE_FLAGS_ENABLED | 315 CRAT_CACHE_FLAGS_INST_CACHE | 316 CRAT_CACHE_FLAGS_SIMD_CACHE), 317 .num_cu_shared = 3, 318 }, 319 { 320 /* Scalar L1 Data Cache per SQC */ 321 .cache_size = 16, 322 .cache_level = 1, 323 .flags = (CRAT_CACHE_FLAGS_ENABLED | 324 CRAT_CACHE_FLAGS_DATA_CACHE | 325 CRAT_CACHE_FLAGS_SIMD_CACHE), 326 .num_cu_shared = 3, 327 }, 328 { 329 /* L2 Data Cache per GPU (Total Tex Cache) */ 330 .cache_size = 8192, 331 .cache_level = 2, 332 .flags = (CRAT_CACHE_FLAGS_ENABLED | 333 CRAT_CACHE_FLAGS_DATA_CACHE | 334 CRAT_CACHE_FLAGS_SIMD_CACHE), 335 .num_cu_shared = 16, 336 }, 337 }; 338 339 static struct kfd_gpu_cache_info aldebaran_cache_info[] = { 340 { 341 /* TCP L1 Cache per CU */ 342 .cache_size = 16, 343 .cache_level = 1, 344 .flags = (CRAT_CACHE_FLAGS_ENABLED | 345 CRAT_CACHE_FLAGS_DATA_CACHE | 346 CRAT_CACHE_FLAGS_SIMD_CACHE), 347 .num_cu_shared = 1, 348 }, 349 { 350 /* Scalar L1 Instruction Cache per SQC */ 351 .cache_size = 32, 352 .cache_level = 1, 353 .flags = (CRAT_CACHE_FLAGS_ENABLED | 354 CRAT_CACHE_FLAGS_INST_CACHE | 355 CRAT_CACHE_FLAGS_SIMD_CACHE), 356 .num_cu_shared = 2, 357 }, 358 { 359 /* Scalar L1 Data Cache per SQC */ 360 .cache_size = 16, 361 .cache_level = 1, 362 .flags = (CRAT_CACHE_FLAGS_ENABLED | 363 CRAT_CACHE_FLAGS_DATA_CACHE | 364 CRAT_CACHE_FLAGS_SIMD_CACHE), 365 .num_cu_shared = 2, 366 }, 367 { 368 /* L2 Data Cache per GPU (Total Tex Cache) */ 369 .cache_size = 8192, 370 .cache_level = 2, 371 .flags = (CRAT_CACHE_FLAGS_ENABLED | 372 CRAT_CACHE_FLAGS_DATA_CACHE | 373 CRAT_CACHE_FLAGS_SIMD_CACHE), 374 .num_cu_shared = 14, 375 }, 376 }; 377 378 static struct kfd_gpu_cache_info navi10_cache_info[] = { 379 { 380 /* TCP L1 Cache per CU */ 381 .cache_size = 16, 382 .cache_level = 1, 383 .flags = (CRAT_CACHE_FLAGS_ENABLED | 384 CRAT_CACHE_FLAGS_DATA_CACHE | 385 CRAT_CACHE_FLAGS_SIMD_CACHE), 386 .num_cu_shared = 1, 387 }, 388 { 389 /* Scalar L1 Instruction Cache per SQC */ 390 .cache_size = 32, 391 .cache_level = 1, 392 .flags = (CRAT_CACHE_FLAGS_ENABLED | 393 CRAT_CACHE_FLAGS_INST_CACHE | 394 CRAT_CACHE_FLAGS_SIMD_CACHE), 395 .num_cu_shared = 2, 396 }, 397 { 398 /* Scalar L1 Data Cache per SQC */ 399 .cache_size = 16, 400 .cache_level = 1, 401 .flags = (CRAT_CACHE_FLAGS_ENABLED | 402 CRAT_CACHE_FLAGS_DATA_CACHE | 403 CRAT_CACHE_FLAGS_SIMD_CACHE), 404 .num_cu_shared = 2, 405 }, 406 { 407 /* GL1 Data Cache per SA */ 408 .cache_size = 128, 409 .cache_level = 1, 410 .flags = (CRAT_CACHE_FLAGS_ENABLED | 411 CRAT_CACHE_FLAGS_DATA_CACHE | 412 CRAT_CACHE_FLAGS_SIMD_CACHE), 413 .num_cu_shared = 10, 414 }, 415 { 416 /* L2 Data Cache per GPU (Total Tex Cache) */ 417 .cache_size = 4096, 418 .cache_level = 2, 419 .flags = 
(CRAT_CACHE_FLAGS_ENABLED | 420 CRAT_CACHE_FLAGS_DATA_CACHE | 421 CRAT_CACHE_FLAGS_SIMD_CACHE), 422 .num_cu_shared = 10, 423 }, 424 }; 425 426 static struct kfd_gpu_cache_info vangogh_cache_info[] = { 427 { 428 /* TCP L1 Cache per CU */ 429 .cache_size = 16, 430 .cache_level = 1, 431 .flags = (CRAT_CACHE_FLAGS_ENABLED | 432 CRAT_CACHE_FLAGS_DATA_CACHE | 433 CRAT_CACHE_FLAGS_SIMD_CACHE), 434 .num_cu_shared = 1, 435 }, 436 { 437 /* Scalar L1 Instruction Cache per SQC */ 438 .cache_size = 32, 439 .cache_level = 1, 440 .flags = (CRAT_CACHE_FLAGS_ENABLED | 441 CRAT_CACHE_FLAGS_INST_CACHE | 442 CRAT_CACHE_FLAGS_SIMD_CACHE), 443 .num_cu_shared = 2, 444 }, 445 { 446 /* Scalar L1 Data Cache per SQC */ 447 .cache_size = 16, 448 .cache_level = 1, 449 .flags = (CRAT_CACHE_FLAGS_ENABLED | 450 CRAT_CACHE_FLAGS_DATA_CACHE | 451 CRAT_CACHE_FLAGS_SIMD_CACHE), 452 .num_cu_shared = 2, 453 }, 454 { 455 /* GL1 Data Cache per SA */ 456 .cache_size = 128, 457 .cache_level = 1, 458 .flags = (CRAT_CACHE_FLAGS_ENABLED | 459 CRAT_CACHE_FLAGS_DATA_CACHE | 460 CRAT_CACHE_FLAGS_SIMD_CACHE), 461 .num_cu_shared = 8, 462 }, 463 { 464 /* L2 Data Cache per GPU (Total Tex Cache) */ 465 .cache_size = 1024, 466 .cache_level = 2, 467 .flags = (CRAT_CACHE_FLAGS_ENABLED | 468 CRAT_CACHE_FLAGS_DATA_CACHE | 469 CRAT_CACHE_FLAGS_SIMD_CACHE), 470 .num_cu_shared = 8, 471 }, 472 }; 473 474 static struct kfd_gpu_cache_info navi14_cache_info[] = { 475 { 476 /* TCP L1 Cache per CU */ 477 .cache_size = 16, 478 .cache_level = 1, 479 .flags = (CRAT_CACHE_FLAGS_ENABLED | 480 CRAT_CACHE_FLAGS_DATA_CACHE | 481 CRAT_CACHE_FLAGS_SIMD_CACHE), 482 .num_cu_shared = 1, 483 }, 484 { 485 /* Scalar L1 Instruction Cache per SQC */ 486 .cache_size = 32, 487 .cache_level = 1, 488 .flags = (CRAT_CACHE_FLAGS_ENABLED | 489 CRAT_CACHE_FLAGS_INST_CACHE | 490 CRAT_CACHE_FLAGS_SIMD_CACHE), 491 .num_cu_shared = 2, 492 }, 493 { 494 /* Scalar L1 Data Cache per SQC */ 495 .cache_size = 16, 496 .cache_level = 1, 497 .flags = (CRAT_CACHE_FLAGS_ENABLED | 498 CRAT_CACHE_FLAGS_DATA_CACHE | 499 CRAT_CACHE_FLAGS_SIMD_CACHE), 500 .num_cu_shared = 2, 501 }, 502 { 503 /* GL1 Data Cache per SA */ 504 .cache_size = 128, 505 .cache_level = 1, 506 .flags = (CRAT_CACHE_FLAGS_ENABLED | 507 CRAT_CACHE_FLAGS_DATA_CACHE | 508 CRAT_CACHE_FLAGS_SIMD_CACHE), 509 .num_cu_shared = 12, 510 }, 511 { 512 /* L2 Data Cache per GPU (Total Tex Cache) */ 513 .cache_size = 2048, 514 .cache_level = 2, 515 .flags = (CRAT_CACHE_FLAGS_ENABLED | 516 CRAT_CACHE_FLAGS_DATA_CACHE | 517 CRAT_CACHE_FLAGS_SIMD_CACHE), 518 .num_cu_shared = 12, 519 }, 520 }; 521 522 static struct kfd_gpu_cache_info sienna_cichlid_cache_info[] = { 523 { 524 /* TCP L1 Cache per CU */ 525 .cache_size = 16, 526 .cache_level = 1, 527 .flags = (CRAT_CACHE_FLAGS_ENABLED | 528 CRAT_CACHE_FLAGS_DATA_CACHE | 529 CRAT_CACHE_FLAGS_SIMD_CACHE), 530 .num_cu_shared = 1, 531 }, 532 { 533 /* Scalar L1 Instruction Cache per SQC */ 534 .cache_size = 32, 535 .cache_level = 1, 536 .flags = (CRAT_CACHE_FLAGS_ENABLED | 537 CRAT_CACHE_FLAGS_INST_CACHE | 538 CRAT_CACHE_FLAGS_SIMD_CACHE), 539 .num_cu_shared = 2, 540 }, 541 { 542 /* Scalar L1 Data Cache per SQC */ 543 .cache_size = 16, 544 .cache_level = 1, 545 .flags = (CRAT_CACHE_FLAGS_ENABLED | 546 CRAT_CACHE_FLAGS_DATA_CACHE | 547 CRAT_CACHE_FLAGS_SIMD_CACHE), 548 .num_cu_shared = 2, 549 }, 550 { 551 /* GL1 Data Cache per SA */ 552 .cache_size = 128, 553 .cache_level = 1, 554 .flags = (CRAT_CACHE_FLAGS_ENABLED | 555 CRAT_CACHE_FLAGS_DATA_CACHE | 556 CRAT_CACHE_FLAGS_SIMD_CACHE), 557 .num_cu_shared = 
10, 558 }, 559 { 560 /* L2 Data Cache per GPU (Total Tex Cache) */ 561 .cache_size = 4096, 562 .cache_level = 2, 563 .flags = (CRAT_CACHE_FLAGS_ENABLED | 564 CRAT_CACHE_FLAGS_DATA_CACHE | 565 CRAT_CACHE_FLAGS_SIMD_CACHE), 566 .num_cu_shared = 10, 567 }, 568 { 569 /* L3 Data Cache per GPU */ 570 .cache_size = 128*1024, 571 .cache_level = 3, 572 .flags = (CRAT_CACHE_FLAGS_ENABLED | 573 CRAT_CACHE_FLAGS_DATA_CACHE | 574 CRAT_CACHE_FLAGS_SIMD_CACHE), 575 .num_cu_shared = 10, 576 }, 577 }; 578 579 static struct kfd_gpu_cache_info navy_flounder_cache_info[] = { 580 { 581 /* TCP L1 Cache per CU */ 582 .cache_size = 16, 583 .cache_level = 1, 584 .flags = (CRAT_CACHE_FLAGS_ENABLED | 585 CRAT_CACHE_FLAGS_DATA_CACHE | 586 CRAT_CACHE_FLAGS_SIMD_CACHE), 587 .num_cu_shared = 1, 588 }, 589 { 590 /* Scalar L1 Instruction Cache per SQC */ 591 .cache_size = 32, 592 .cache_level = 1, 593 .flags = (CRAT_CACHE_FLAGS_ENABLED | 594 CRAT_CACHE_FLAGS_INST_CACHE | 595 CRAT_CACHE_FLAGS_SIMD_CACHE), 596 .num_cu_shared = 2, 597 }, 598 { 599 /* Scalar L1 Data Cache per SQC */ 600 .cache_size = 16, 601 .cache_level = 1, 602 .flags = (CRAT_CACHE_FLAGS_ENABLED | 603 CRAT_CACHE_FLAGS_DATA_CACHE | 604 CRAT_CACHE_FLAGS_SIMD_CACHE), 605 .num_cu_shared = 2, 606 }, 607 { 608 /* GL1 Data Cache per SA */ 609 .cache_size = 128, 610 .cache_level = 1, 611 .flags = (CRAT_CACHE_FLAGS_ENABLED | 612 CRAT_CACHE_FLAGS_DATA_CACHE | 613 CRAT_CACHE_FLAGS_SIMD_CACHE), 614 .num_cu_shared = 10, 615 }, 616 { 617 /* L2 Data Cache per GPU (Total Tex Cache) */ 618 .cache_size = 3072, 619 .cache_level = 2, 620 .flags = (CRAT_CACHE_FLAGS_ENABLED | 621 CRAT_CACHE_FLAGS_DATA_CACHE | 622 CRAT_CACHE_FLAGS_SIMD_CACHE), 623 .num_cu_shared = 10, 624 }, 625 { 626 /* L3 Data Cache per GPU */ 627 .cache_size = 96*1024, 628 .cache_level = 3, 629 .flags = (CRAT_CACHE_FLAGS_ENABLED | 630 CRAT_CACHE_FLAGS_DATA_CACHE | 631 CRAT_CACHE_FLAGS_SIMD_CACHE), 632 .num_cu_shared = 10, 633 }, 634 }; 635 636 static struct kfd_gpu_cache_info dimgrey_cavefish_cache_info[] = { 637 { 638 /* TCP L1 Cache per CU */ 639 .cache_size = 16, 640 .cache_level = 1, 641 .flags = (CRAT_CACHE_FLAGS_ENABLED | 642 CRAT_CACHE_FLAGS_DATA_CACHE | 643 CRAT_CACHE_FLAGS_SIMD_CACHE), 644 .num_cu_shared = 1, 645 }, 646 { 647 /* Scalar L1 Instruction Cache per SQC */ 648 .cache_size = 32, 649 .cache_level = 1, 650 .flags = (CRAT_CACHE_FLAGS_ENABLED | 651 CRAT_CACHE_FLAGS_INST_CACHE | 652 CRAT_CACHE_FLAGS_SIMD_CACHE), 653 .num_cu_shared = 2, 654 }, 655 { 656 /* Scalar L1 Data Cache per SQC */ 657 .cache_size = 16, 658 .cache_level = 1, 659 .flags = (CRAT_CACHE_FLAGS_ENABLED | 660 CRAT_CACHE_FLAGS_DATA_CACHE | 661 CRAT_CACHE_FLAGS_SIMD_CACHE), 662 .num_cu_shared = 2, 663 }, 664 { 665 /* GL1 Data Cache per SA */ 666 .cache_size = 128, 667 .cache_level = 1, 668 .flags = (CRAT_CACHE_FLAGS_ENABLED | 669 CRAT_CACHE_FLAGS_DATA_CACHE | 670 CRAT_CACHE_FLAGS_SIMD_CACHE), 671 .num_cu_shared = 8, 672 }, 673 { 674 /* L2 Data Cache per GPU (Total Tex Cache) */ 675 .cache_size = 2048, 676 .cache_level = 2, 677 .flags = (CRAT_CACHE_FLAGS_ENABLED | 678 CRAT_CACHE_FLAGS_DATA_CACHE | 679 CRAT_CACHE_FLAGS_SIMD_CACHE), 680 .num_cu_shared = 8, 681 }, 682 { 683 /* L3 Data Cache per GPU */ 684 .cache_size = 32*1024, 685 .cache_level = 3, 686 .flags = (CRAT_CACHE_FLAGS_ENABLED | 687 CRAT_CACHE_FLAGS_DATA_CACHE | 688 CRAT_CACHE_FLAGS_SIMD_CACHE), 689 .num_cu_shared = 8, 690 }, 691 }; 692 693 static struct kfd_gpu_cache_info beige_goby_cache_info[] = { 694 { 695 /* TCP L1 Cache per CU */ 696 .cache_size = 16, 697 .cache_level = 
1, 698 .flags = (CRAT_CACHE_FLAGS_ENABLED | 699 CRAT_CACHE_FLAGS_DATA_CACHE | 700 CRAT_CACHE_FLAGS_SIMD_CACHE), 701 .num_cu_shared = 1, 702 }, 703 { 704 /* Scalar L1 Instruction Cache per SQC */ 705 .cache_size = 32, 706 .cache_level = 1, 707 .flags = (CRAT_CACHE_FLAGS_ENABLED | 708 CRAT_CACHE_FLAGS_INST_CACHE | 709 CRAT_CACHE_FLAGS_SIMD_CACHE), 710 .num_cu_shared = 2, 711 }, 712 { 713 /* Scalar L1 Data Cache per SQC */ 714 .cache_size = 16, 715 .cache_level = 1, 716 .flags = (CRAT_CACHE_FLAGS_ENABLED | 717 CRAT_CACHE_FLAGS_DATA_CACHE | 718 CRAT_CACHE_FLAGS_SIMD_CACHE), 719 .num_cu_shared = 2, 720 }, 721 { 722 /* GL1 Data Cache per SA */ 723 .cache_size = 128, 724 .cache_level = 1, 725 .flags = (CRAT_CACHE_FLAGS_ENABLED | 726 CRAT_CACHE_FLAGS_DATA_CACHE | 727 CRAT_CACHE_FLAGS_SIMD_CACHE), 728 .num_cu_shared = 8, 729 }, 730 { 731 /* L2 Data Cache per GPU (Total Tex Cache) */ 732 .cache_size = 1024, 733 .cache_level = 2, 734 .flags = (CRAT_CACHE_FLAGS_ENABLED | 735 CRAT_CACHE_FLAGS_DATA_CACHE | 736 CRAT_CACHE_FLAGS_SIMD_CACHE), 737 .num_cu_shared = 8, 738 }, 739 { 740 /* L3 Data Cache per GPU */ 741 .cache_size = 16*1024, 742 .cache_level = 3, 743 .flags = (CRAT_CACHE_FLAGS_ENABLED | 744 CRAT_CACHE_FLAGS_DATA_CACHE | 745 CRAT_CACHE_FLAGS_SIMD_CACHE), 746 .num_cu_shared = 8, 747 }, 748 }; 749 750 static struct kfd_gpu_cache_info yellow_carp_cache_info[] = { 751 { 752 /* TCP L1 Cache per CU */ 753 .cache_size = 16, 754 .cache_level = 1, 755 .flags = (CRAT_CACHE_FLAGS_ENABLED | 756 CRAT_CACHE_FLAGS_DATA_CACHE | 757 CRAT_CACHE_FLAGS_SIMD_CACHE), 758 .num_cu_shared = 1, 759 }, 760 { 761 /* Scalar L1 Instruction Cache per SQC */ 762 .cache_size = 32, 763 .cache_level = 1, 764 .flags = (CRAT_CACHE_FLAGS_ENABLED | 765 CRAT_CACHE_FLAGS_INST_CACHE | 766 CRAT_CACHE_FLAGS_SIMD_CACHE), 767 .num_cu_shared = 2, 768 }, 769 { 770 /* Scalar L1 Data Cache per SQC */ 771 .cache_size = 16, 772 .cache_level = 1, 773 .flags = (CRAT_CACHE_FLAGS_ENABLED | 774 CRAT_CACHE_FLAGS_DATA_CACHE | 775 CRAT_CACHE_FLAGS_SIMD_CACHE), 776 .num_cu_shared = 2, 777 }, 778 { 779 /* GL1 Data Cache per SA */ 780 .cache_size = 128, 781 .cache_level = 1, 782 .flags = (CRAT_CACHE_FLAGS_ENABLED | 783 CRAT_CACHE_FLAGS_DATA_CACHE | 784 CRAT_CACHE_FLAGS_SIMD_CACHE), 785 .num_cu_shared = 6, 786 }, 787 { 788 /* L2 Data Cache per GPU (Total Tex Cache) */ 789 .cache_size = 2048, 790 .cache_level = 2, 791 .flags = (CRAT_CACHE_FLAGS_ENABLED | 792 CRAT_CACHE_FLAGS_DATA_CACHE | 793 CRAT_CACHE_FLAGS_SIMD_CACHE), 794 .num_cu_shared = 6, 795 }, 796 }; 797 798 static struct kfd_gpu_cache_info gfx1037_cache_info[] = { 799 { 800 /* TCP L1 Cache per CU */ 801 .cache_size = 16, 802 .cache_level = 1, 803 .flags = (CRAT_CACHE_FLAGS_ENABLED | 804 CRAT_CACHE_FLAGS_DATA_CACHE | 805 CRAT_CACHE_FLAGS_SIMD_CACHE), 806 .num_cu_shared = 1, 807 }, 808 { 809 /* Scalar L1 Instruction Cache per SQC */ 810 .cache_size = 32, 811 .cache_level = 1, 812 .flags = (CRAT_CACHE_FLAGS_ENABLED | 813 CRAT_CACHE_FLAGS_INST_CACHE | 814 CRAT_CACHE_FLAGS_SIMD_CACHE), 815 .num_cu_shared = 2, 816 }, 817 { 818 /* Scalar L1 Data Cache per SQC */ 819 .cache_size = 16, 820 .cache_level = 1, 821 .flags = (CRAT_CACHE_FLAGS_ENABLED | 822 CRAT_CACHE_FLAGS_DATA_CACHE | 823 CRAT_CACHE_FLAGS_SIMD_CACHE), 824 .num_cu_shared = 2, 825 }, 826 { 827 /* GL1 Data Cache per SA */ 828 .cache_size = 128, 829 .cache_level = 1, 830 .flags = (CRAT_CACHE_FLAGS_ENABLED | 831 CRAT_CACHE_FLAGS_DATA_CACHE | 832 CRAT_CACHE_FLAGS_SIMD_CACHE), 833 .num_cu_shared = 2, 834 }, 835 { 836 /* L2 Data Cache per GPU (Total Tex 
Cache) */ 837 .cache_size = 256, 838 .cache_level = 2, 839 .flags = (CRAT_CACHE_FLAGS_ENABLED | 840 CRAT_CACHE_FLAGS_DATA_CACHE | 841 CRAT_CACHE_FLAGS_SIMD_CACHE), 842 .num_cu_shared = 2, 843 }, 844 }; 845 846 static struct kfd_gpu_cache_info gc_10_3_6_cache_info[] = { 847 { 848 /* TCP L1 Cache per CU */ 849 .cache_size = 16, 850 .cache_level = 1, 851 .flags = (CRAT_CACHE_FLAGS_ENABLED | 852 CRAT_CACHE_FLAGS_DATA_CACHE | 853 CRAT_CACHE_FLAGS_SIMD_CACHE), 854 .num_cu_shared = 1, 855 }, 856 { 857 /* Scalar L1 Instruction Cache per SQC */ 858 .cache_size = 32, 859 .cache_level = 1, 860 .flags = (CRAT_CACHE_FLAGS_ENABLED | 861 CRAT_CACHE_FLAGS_INST_CACHE | 862 CRAT_CACHE_FLAGS_SIMD_CACHE), 863 .num_cu_shared = 2, 864 }, 865 { 866 /* Scalar L1 Data Cache per SQC */ 867 .cache_size = 16, 868 .cache_level = 1, 869 .flags = (CRAT_CACHE_FLAGS_ENABLED | 870 CRAT_CACHE_FLAGS_DATA_CACHE | 871 CRAT_CACHE_FLAGS_SIMD_CACHE), 872 .num_cu_shared = 2, 873 }, 874 { 875 /* GL1 Data Cache per SA */ 876 .cache_size = 128, 877 .cache_level = 1, 878 .flags = (CRAT_CACHE_FLAGS_ENABLED | 879 CRAT_CACHE_FLAGS_DATA_CACHE | 880 CRAT_CACHE_FLAGS_SIMD_CACHE), 881 .num_cu_shared = 2, 882 }, 883 { 884 /* L2 Data Cache per GPU (Total Tex Cache) */ 885 .cache_size = 256, 886 .cache_level = 2, 887 .flags = (CRAT_CACHE_FLAGS_ENABLED | 888 CRAT_CACHE_FLAGS_DATA_CACHE | 889 CRAT_CACHE_FLAGS_SIMD_CACHE), 890 .num_cu_shared = 2, 891 }, 892 }; 893 894 static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, 895 struct crat_subtype_computeunit *cu) 896 { 897 dev->node_props.cpu_cores_count = cu->num_cpu_cores; 898 dev->node_props.cpu_core_id_base = cu->processor_id_low; 899 if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT) 900 dev->node_props.capability |= HSA_CAP_ATS_PRESENT; 901 902 pr_debug("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores, 903 cu->processor_id_low); 904 } 905 906 static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev, 907 struct crat_subtype_computeunit *cu) 908 { 909 dev->node_props.simd_id_base = cu->processor_id_low; 910 dev->node_props.simd_count = cu->num_simd_cores; 911 dev->node_props.lds_size_in_kb = cu->lds_size_in_kb; 912 dev->node_props.max_waves_per_simd = cu->max_waves_simd; 913 dev->node_props.wave_front_size = cu->wave_front_size; 914 dev->node_props.array_count = cu->array_count; 915 dev->node_props.cu_per_simd_array = cu->num_cu_per_array; 916 dev->node_props.simd_per_cu = cu->num_simd_per_cu; 917 dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu; 918 if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE) 919 dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE; 920 pr_debug("CU GPU: id_base=%d\n", cu->processor_id_low); 921 } 922 923 /* kfd_parse_subtype_cu - parse compute unit subtypes and attach it to correct 924 * topology device present in the device_list 925 */ 926 static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu, 927 struct list_head *device_list) 928 { 929 struct kfd_topology_device *dev; 930 931 pr_debug("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n", 932 cu->proximity_domain, cu->hsa_capability); 933 list_for_each_entry(dev, device_list, list) { 934 if (cu->proximity_domain == dev->proximity_domain) { 935 if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT) 936 kfd_populated_cu_info_cpu(dev, cu); 937 938 if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT) 939 kfd_populated_cu_info_gpu(dev, cu); 940 break; 941 } 942 } 943 944 return 0; 945 } 946 947 static struct kfd_mem_properties * 948 find_subtype_mem(uint32_t 
heap_type, uint32_t flags, uint32_t width,
		struct kfd_topology_device *dev)
{
	struct kfd_mem_properties *props;

	list_for_each_entry(props, &dev->mem_props, list) {
		if (props->heap_type == heap_type
				&& props->flags == flags
				&& props->width == width)
			return props;
	}

	return NULL;
}
/* kfd_parse_subtype_mem - parse memory subtypes and attach them to the correct
 * topology device present in the device_list
 */
static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem,
		struct list_head *device_list)
{
	struct kfd_mem_properties *props;
	struct kfd_topology_device *dev;
	uint32_t heap_type;
	uint64_t size_in_bytes;
	uint32_t flags = 0;
	uint32_t width;

	pr_debug("Found memory entry in CRAT table with proximity_domain=%d\n",
			mem->proximity_domain);
	list_for_each_entry(dev, device_list, list) {
		if (mem->proximity_domain == dev->proximity_domain) {
			/* We're on GPU node */
			if (dev->node_props.cpu_cores_count == 0) {
				/* APU */
				if (mem->visibility_type == 0)
					heap_type =
						HSA_MEM_HEAP_TYPE_FB_PRIVATE;
				/* dGPU */
				else
					heap_type = mem->visibility_type;
			} else
				heap_type = HSA_MEM_HEAP_TYPE_SYSTEM;

			if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE)
				flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE;
			if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE)
				flags |= HSA_MEM_FLAGS_NON_VOLATILE;

			size_in_bytes =
				((uint64_t)mem->length_high << 32) +
							mem->length_low;
			width = mem->width;

			/* Multiple banks of the same type are aggregated into
			 * one. User mode doesn't care about multiple physical
			 * memory segments. It's managed as a single virtual
			 * heap for user mode.
			 */
			props = find_subtype_mem(heap_type, flags, width, dev);
			if (props) {
				props->size_in_bytes += size_in_bytes;
				break;
			}

			props = kfd_alloc_struct(props);
			if (!props)
				return -ENOMEM;

			props->heap_type = heap_type;
			props->flags = flags;
			props->size_in_bytes = size_in_bytes;
			props->width = width;

			dev->node_props.mem_banks_count++;
			list_add_tail(&props->list, &dev->mem_props);

			break;
		}
	}

	return 0;
}

/* kfd_parse_subtype_cache - parse cache subtypes and attach them to the correct
 * topology device present in the device_list
 */
static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache,
			struct list_head *device_list)
{
	struct kfd_cache_properties *props;
	struct kfd_topology_device *dev;
	uint32_t id;
	uint32_t total_num_of_cu;

	id = cache->processor_id_low;

	pr_debug("Found cache entry in CRAT table with processor_id=%d\n", id);
	list_for_each_entry(dev, device_list, list) {
		total_num_of_cu = (dev->node_props.array_count *
					dev->node_props.cu_per_simd_array);

		/* Cache information in CRAT doesn't have proximity_domain
		 * information as it is associated with a CPU core or GPU
		 * Compute Unit. So map the cache using CPU core Id or SIMD
		 * (GPU) ID.
		 * TODO: This works because currently we can safely assume that
		 *  Compute Units are parsed before caches are parsed. In the
		 *  future, remove this dependency
		 */
		if ((id >= dev->node_props.cpu_core_id_base &&
			id <= dev->node_props.cpu_core_id_base +
				dev->node_props.cpu_cores_count) ||
			(id >= dev->node_props.simd_id_base &&
			id < dev->node_props.simd_id_base +
				total_num_of_cu)) {
			props = kfd_alloc_struct(props);
			if (!props)
				return -ENOMEM;

			props->processor_id_low = id;
			props->cache_level = cache->cache_level;
			props->cache_size = cache->cache_size;
			props->cacheline_size = cache->cache_line_size;
			props->cachelines_per_tag = cache->lines_per_tag;
			props->cache_assoc = cache->associativity;
			props->cache_latency = cache->cache_latency;
			memcpy(props->sibling_map, cache->sibling_map,
					sizeof(props->sibling_map));

			if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE)
				props->cache_type |= HSA_CACHE_TYPE_DATA;
			if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE)
				props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION;
			if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE)
				props->cache_type |= HSA_CACHE_TYPE_CPU;
			if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE)
				props->cache_type |= HSA_CACHE_TYPE_HSACU;

			dev->cache_count++;
			dev->node_props.caches_count++;
			list_add_tail(&props->list, &dev->cache_props);

			break;
		}
	}

	return 0;
}

/* kfd_parse_subtype_iolink - parse iolink subtypes and attach them to the
 * correct topology device present in the device_list
 */
static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink,
					struct list_head *device_list)
{
	struct kfd_iolink_properties *props = NULL, *props2;
	struct kfd_topology_device *dev, *to_dev;
	uint32_t id_from;
	uint32_t id_to;

	id_from = iolink->proximity_domain_from;
	id_to = iolink->proximity_domain_to;

	pr_debug("Found IO link entry in CRAT table with id_from=%d, id_to %d\n",
			id_from, id_to);
	list_for_each_entry(dev, device_list, list) {
		if (id_from == dev->proximity_domain) {
			props = kfd_alloc_struct(props);
			if (!props)
				return -ENOMEM;

			props->node_from = id_from;
			props->node_to = id_to;
			props->ver_maj = iolink->version_major;
			props->ver_min = iolink->version_minor;
			props->iolink_type = iolink->io_interface_type;

			if (props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS)
				props->weight = 20;
			else if (props->iolink_type == CRAT_IOLINK_TYPE_XGMI)
				props->weight = 15 * iolink->num_hops_xgmi;
			else
				props->weight = node_distance(id_from, id_to);

			props->min_latency = iolink->minimum_latency;
			props->max_latency = iolink->maximum_latency;
			props->min_bandwidth = iolink->minimum_bandwidth_mbs;
			props->max_bandwidth = iolink->maximum_bandwidth_mbs;
			props->rec_transfer_size =
					iolink->recommended_transfer_size;

			dev->node_props.io_links_count++;
			list_add_tail(&props->list, &dev->io_link_props);
			break;
		}
	}

	/* CPU topology is created before GPUs are detected, so CPU->GPU
	 * links are not built at that time. If a PCIe type is discovered, it
	 * means a GPU is detected and we are adding GPU->CPU to the topology.
	 * At this time, also add the corresponding CPU->GPU link if the GPU
	 * is large BAR.
	 * For xGMI, we only added the link with one direction in the CRAT
	 * table; add the corresponding reverse-direction link now.
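	 * (Illustrative example: if the CRAT lists only an xGMI link from
	 *  node A to node B, the code below duplicates the link properties
	 *  with kmemdup() and swaps node_from/node_to so that the topology
	 *  also reports the B->A direction.)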
	 */
	if (props && (iolink->flags & CRAT_IOLINK_FLAGS_BI_DIRECTIONAL)) {
		to_dev = kfd_topology_device_by_proximity_domain_no_lock(id_to);
		if (!to_dev)
			return -ENODEV;
		/* same everything but the other direction */
		props2 = kmemdup(props, sizeof(*props2), GFP_KERNEL);
		if (!props2)
			return -ENOMEM;

		props2->node_from = id_to;
		props2->node_to = id_from;
		props2->kobj = NULL;
		to_dev->node_props.io_links_count++;
		list_add_tail(&props2->list, &to_dev->io_link_props);
	}

	return 0;
}

/* kfd_parse_subtype - parse subtypes and attach them to the correct topology
 * device present in the device_list
 * @sub_type_hdr - subtype section of crat_image
 * @device_list - list of topology devices present in this crat_image
 */
static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr,
				struct list_head *device_list)
{
	struct crat_subtype_computeunit *cu;
	struct crat_subtype_memory *mem;
	struct crat_subtype_cache *cache;
	struct crat_subtype_iolink *iolink;
	int ret = 0;

	switch (sub_type_hdr->type) {
	case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY:
		cu = (struct crat_subtype_computeunit *)sub_type_hdr;
		ret = kfd_parse_subtype_cu(cu, device_list);
		break;
	case CRAT_SUBTYPE_MEMORY_AFFINITY:
		mem = (struct crat_subtype_memory *)sub_type_hdr;
		ret = kfd_parse_subtype_mem(mem, device_list);
		break;
	case CRAT_SUBTYPE_CACHE_AFFINITY:
		cache = (struct crat_subtype_cache *)sub_type_hdr;
		ret = kfd_parse_subtype_cache(cache, device_list);
		break;
	case CRAT_SUBTYPE_TLB_AFFINITY:
		/*
		 * For now, nothing to do here
		 */
		pr_debug("Found TLB entry in CRAT table (not processing)\n");
		break;
	case CRAT_SUBTYPE_CCOMPUTE_AFFINITY:
		/*
		 * For now, nothing to do here
		 */
		pr_debug("Found CCOMPUTE entry in CRAT table (not processing)\n");
		break;
	case CRAT_SUBTYPE_IOLINK_AFFINITY:
		iolink = (struct crat_subtype_iolink *)sub_type_hdr;
		ret = kfd_parse_subtype_iolink(iolink, device_list);
		break;
	default:
		pr_warn("Unknown subtype %d in CRAT\n",
				sub_type_hdr->type);
	}

	return ret;
}

/* kfd_parse_crat_table - parse CRAT table. For each node present in CRAT,
 * create a kfd_topology_device and add it to device_list. Also parse
 * CRAT subtypes and attach them to the appropriate kfd_topology_device
 * @crat_image - input image containing CRAT
 * @device_list - [OUT] list of kfd_topology_device generated after
 *		  parsing crat_image
 * @proximity_domain - Proximity domain of the first device in the table
 *
 * Return - 0 if successful else -ve value
 */
int kfd_parse_crat_table(void *crat_image, struct list_head *device_list,
			 uint32_t proximity_domain)
{
	struct kfd_topology_device *top_dev = NULL;
	struct crat_subtype_generic *sub_type_hdr;
	uint16_t node_id;
	int ret = 0;
	struct crat_header *crat_table = (struct crat_header *)crat_image;
	uint16_t num_nodes;
	uint32_t image_len;

	if (!crat_image)
		return -EINVAL;

	if (!list_empty(device_list)) {
		pr_warn("Error device list should be empty\n");
		return -EINVAL;
	}

	num_nodes = crat_table->num_domains;
	image_len = crat_table->length;

	pr_debug("Parsing CRAT table with %d nodes\n", num_nodes);

	for (node_id = 0; node_id < num_nodes; node_id++) {
		top_dev = kfd_create_topology_device(device_list);
		if (!top_dev)
			break;
		top_dev->proximity_domain = proximity_domain++;
	}

	if (!top_dev) {
		ret = -ENOMEM;
		goto err;
	}

	memcpy(top_dev->oem_id, crat_table->oem_id, CRAT_OEMID_LENGTH);
	memcpy(top_dev->oem_table_id, crat_table->oem_table_id,
			CRAT_OEMTABLEID_LENGTH);
	top_dev->oem_revision = crat_table->oem_revision;

	sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1);
	while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) <
			((char *)crat_image) + image_len) {
		if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) {
			ret = kfd_parse_subtype(sub_type_hdr, device_list);
			if (ret)
				break;
		}

		sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
				sub_type_hdr->length);
	}

err:
	if (ret)
		kfd_release_topology_device_list(device_list);

	return ret;
}

/* Helper function. See kfd_fill_gpu_cache_info for parameter description */
static int fill_in_l1_pcache(struct crat_subtype_cache *pcache,
				struct kfd_gpu_cache_info *pcache_info,
				struct kfd_cu_info *cu_info,
				int mem_available,
				int cu_bitmask,
				int cache_type, unsigned int cu_processor_id,
				int cu_block)
{
	unsigned int cu_sibling_map_mask;
	int first_active_cu;

	/* First check if enough memory is available */
	if (sizeof(struct crat_subtype_cache) > mem_available)
		return -ENOMEM;

	cu_sibling_map_mask = cu_bitmask;
	cu_sibling_map_mask >>= cu_block;
	cu_sibling_map_mask &=
		((1 << pcache_info[cache_type].num_cu_shared) - 1);
	first_active_cu = ffs(cu_sibling_map_mask);

	/* CU could be inactive. In case of a shared cache, find the first
	 * active CU; in case of a non-shared cache, check whether the CU is
	 * inactive and, if so, skip it
	 */
	if (first_active_cu) {
		memset(pcache, 0, sizeof(struct crat_subtype_cache));
		pcache->type = CRAT_SUBTYPE_CACHE_AFFINITY;
		pcache->length = sizeof(struct crat_subtype_cache);
		pcache->flags = pcache_info[cache_type].flags;
		pcache->processor_id_low = cu_processor_id
					 + (first_active_cu - 1);
		pcache->cache_level = pcache_info[cache_type].cache_level;
		pcache->cache_size = pcache_info[cache_type].cache_size;

		/* Sibling map is w.r.t processor_id_low, so shift out
		 * inactive CU
		 */
		cu_sibling_map_mask =
			cu_sibling_map_mask >> (first_active_cu - 1);

		pcache->sibling_map[0] = (uint8_t)(cu_sibling_map_mask & 0xFF);
		pcache->sibling_map[1] =
				(uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
		pcache->sibling_map[2] =
				(uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
		pcache->sibling_map[3] =
				(uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
		return 0;
	}
	return 1;
}

/* Helper function. See kfd_fill_gpu_cache_info for parameter description */
static int fill_in_l2_l3_pcache(struct crat_subtype_cache *pcache,
				struct kfd_gpu_cache_info *pcache_info,
				struct kfd_cu_info *cu_info,
				int mem_available,
				int cache_type, unsigned int cu_processor_id)
{
	unsigned int cu_sibling_map_mask;
	int first_active_cu;
	int i, j, k;

	/* First check if enough memory is available */
	if (sizeof(struct crat_subtype_cache) > mem_available)
		return -ENOMEM;

	cu_sibling_map_mask = cu_info->cu_bitmap[0][0];
	cu_sibling_map_mask &=
		((1 << pcache_info[cache_type].num_cu_shared) - 1);
	first_active_cu = ffs(cu_sibling_map_mask);

	/* CU could be inactive. In case of a shared cache, find the first
	 * active CU; in case of a non-shared cache, check whether the CU is
	 * inactive and, if so, skip it
	 */
	if (first_active_cu) {
		memset(pcache, 0, sizeof(struct crat_subtype_cache));
		pcache->type = CRAT_SUBTYPE_CACHE_AFFINITY;
		pcache->length = sizeof(struct crat_subtype_cache);
		pcache->flags = pcache_info[cache_type].flags;
		pcache->processor_id_low = cu_processor_id
					 + (first_active_cu - 1);
		pcache->cache_level = pcache_info[cache_type].cache_level;
		pcache->cache_size = pcache_info[cache_type].cache_size;

		/* Sibling map is w.r.t processor_id_low, so shift out
		 * inactive CU
		 */
		cu_sibling_map_mask =
			cu_sibling_map_mask >> (first_active_cu - 1);
		k = 0;
		for (i = 0; i < cu_info->num_shader_engines; i++) {
			for (j = 0; j < cu_info->num_shader_arrays_per_engine;
				j++) {
				pcache->sibling_map[k] =
				 (uint8_t)(cu_sibling_map_mask & 0xFF);
				pcache->sibling_map[k+1] =
				 (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
				pcache->sibling_map[k+2] =
				 (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
				pcache->sibling_map[k+3] =
				 (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
				k += 4;
				cu_sibling_map_mask =
					cu_info->cu_bitmap[i % 4][j + i / 4];
				cu_sibling_map_mask &= (
				 (1 << pcache_info[cache_type].num_cu_shared)
				 - 1);
			}
		}
		return 0;
	}
	return 1;
}

#define KFD_MAX_CACHE_TYPES 6

static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev,
						   struct kfd_gpu_cache_info *pcache_info)
{
	struct amdgpu_device *adev = kdev->adev;
	int i = 0;

	/* TCP L1 Cache per CU */
	if (adev->gfx.config.gc_tcp_l1_size) {
		pcache_info[i].cache_size = adev->gfx.config.gc_tcp_l1_size;
		pcache_info[i].cache_level = 1;
		pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
					CRAT_CACHE_FLAGS_DATA_CACHE |
					CRAT_CACHE_FLAGS_SIMD_CACHE);
		pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_tcp_per_wpg / 2;
		i++;
	}
	/* Scalar L1 Instruction Cache per SQC */
	if (adev->gfx.config.gc_l1_instruction_cache_size_per_sqc) {
		pcache_info[i].cache_size =
			adev->gfx.config.gc_l1_instruction_cache_size_per_sqc;
		pcache_info[i].cache_level = 1;
		pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
					CRAT_CACHE_FLAGS_INST_CACHE |
					CRAT_CACHE_FLAGS_SIMD_CACHE);
		pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_sqc_per_wgp * 2;
		i++;
	}
	/* Scalar L1 Data Cache per SQC */
	if (adev->gfx.config.gc_l1_data_cache_size_per_sqc) {
		pcache_info[i].cache_size = adev->gfx.config.gc_l1_data_cache_size_per_sqc;
		pcache_info[i].cache_level = 1;
		pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
					CRAT_CACHE_FLAGS_DATA_CACHE |
					CRAT_CACHE_FLAGS_SIMD_CACHE);
		pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_sqc_per_wgp * 2;
		i++;
	}
	/* GL1 Data Cache per SA */
	if (adev->gfx.config.gc_gl1c_per_sa &&
	    adev->gfx.config.gc_gl1c_size_per_instance) {
		pcache_info[i].cache_size = adev->gfx.config.gc_gl1c_per_sa *
			adev->gfx.config.gc_gl1c_size_per_instance;
		pcache_info[i].cache_level = 1;
		pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
					CRAT_CACHE_FLAGS_DATA_CACHE |
					CRAT_CACHE_FLAGS_SIMD_CACHE);
		pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh;
		i++;
	}
	/* L2 Data Cache per GPU (Total Tex Cache) */
	if (adev->gfx.config.gc_gl2c_per_gpu) {
		pcache_info[i].cache_size = adev->gfx.config.gc_gl2c_per_gpu;
1466 pcache_info[i].cache_level = 2; 1467 pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED | 1468 CRAT_CACHE_FLAGS_DATA_CACHE | 1469 CRAT_CACHE_FLAGS_SIMD_CACHE); 1470 pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh; 1471 i++; 1472 } 1473 /* L3 Data Cache per GPU */ 1474 if (adev->gmc.mall_size) { 1475 pcache_info[i].cache_size = adev->gmc.mall_size / 1024; 1476 pcache_info[i].cache_level = 3; 1477 pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED | 1478 CRAT_CACHE_FLAGS_DATA_CACHE | 1479 CRAT_CACHE_FLAGS_SIMD_CACHE); 1480 pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh; 1481 i++; 1482 } 1483 return i; 1484 } 1485 1486 /* kfd_fill_gpu_cache_info - Fill GPU cache info using kfd_gpu_cache_info 1487 * tables 1488 * 1489 * @kdev - [IN] GPU device 1490 * @gpu_processor_id - [IN] GPU processor ID to which these caches 1491 * associate 1492 * @available_size - [IN] Amount of memory available in pcache 1493 * @cu_info - [IN] Compute Unit info obtained from KGD 1494 * @pcache - [OUT] memory into which cache data is to be filled in. 1495 * @size_filled - [OUT] amount of data used up in pcache. 1496 * @num_of_entries - [OUT] number of caches added 1497 */ 1498 static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev, 1499 int gpu_processor_id, 1500 int available_size, 1501 struct kfd_cu_info *cu_info, 1502 struct crat_subtype_cache *pcache, 1503 int *size_filled, 1504 int *num_of_entries) 1505 { 1506 struct kfd_gpu_cache_info *pcache_info; 1507 struct kfd_gpu_cache_info cache_info[KFD_MAX_CACHE_TYPES]; 1508 int num_of_cache_types = 0; 1509 int i, j, k; 1510 int ct = 0; 1511 int mem_available = available_size; 1512 unsigned int cu_processor_id; 1513 int ret; 1514 unsigned int num_cu_shared; 1515 1516 switch (kdev->adev->asic_type) { 1517 case CHIP_KAVERI: 1518 pcache_info = kaveri_cache_info; 1519 num_of_cache_types = ARRAY_SIZE(kaveri_cache_info); 1520 break; 1521 case CHIP_HAWAII: 1522 pcache_info = hawaii_cache_info; 1523 num_of_cache_types = ARRAY_SIZE(hawaii_cache_info); 1524 break; 1525 case CHIP_CARRIZO: 1526 pcache_info = carrizo_cache_info; 1527 num_of_cache_types = ARRAY_SIZE(carrizo_cache_info); 1528 break; 1529 case CHIP_TONGA: 1530 pcache_info = tonga_cache_info; 1531 num_of_cache_types = ARRAY_SIZE(tonga_cache_info); 1532 break; 1533 case CHIP_FIJI: 1534 pcache_info = fiji_cache_info; 1535 num_of_cache_types = ARRAY_SIZE(fiji_cache_info); 1536 break; 1537 case CHIP_POLARIS10: 1538 pcache_info = polaris10_cache_info; 1539 num_of_cache_types = ARRAY_SIZE(polaris10_cache_info); 1540 break; 1541 case CHIP_POLARIS11: 1542 pcache_info = polaris11_cache_info; 1543 num_of_cache_types = ARRAY_SIZE(polaris11_cache_info); 1544 break; 1545 case CHIP_POLARIS12: 1546 pcache_info = polaris12_cache_info; 1547 num_of_cache_types = ARRAY_SIZE(polaris12_cache_info); 1548 break; 1549 case CHIP_VEGAM: 1550 pcache_info = vegam_cache_info; 1551 num_of_cache_types = ARRAY_SIZE(vegam_cache_info); 1552 break; 1553 default: 1554 switch (KFD_GC_VERSION(kdev)) { 1555 case IP_VERSION(9, 0, 1): 1556 pcache_info = vega10_cache_info; 1557 num_of_cache_types = ARRAY_SIZE(vega10_cache_info); 1558 break; 1559 case IP_VERSION(9, 2, 1): 1560 pcache_info = vega12_cache_info; 1561 num_of_cache_types = ARRAY_SIZE(vega12_cache_info); 1562 break; 1563 case IP_VERSION(9, 4, 0): 1564 case IP_VERSION(9, 4, 1): 1565 pcache_info = vega20_cache_info; 1566 num_of_cache_types = ARRAY_SIZE(vega20_cache_info); 1567 break; 1568 case IP_VERSION(9, 4, 2): 1569 pcache_info = aldebaran_cache_info; 1570 
num_of_cache_types = ARRAY_SIZE(aldebaran_cache_info); 1571 break; 1572 case IP_VERSION(9, 1, 0): 1573 case IP_VERSION(9, 2, 2): 1574 pcache_info = raven_cache_info; 1575 num_of_cache_types = ARRAY_SIZE(raven_cache_info); 1576 break; 1577 case IP_VERSION(9, 3, 0): 1578 pcache_info = renoir_cache_info; 1579 num_of_cache_types = ARRAY_SIZE(renoir_cache_info); 1580 break; 1581 case IP_VERSION(10, 1, 10): 1582 case IP_VERSION(10, 1, 2): 1583 case IP_VERSION(10, 1, 3): 1584 case IP_VERSION(10, 1, 4): 1585 pcache_info = navi10_cache_info; 1586 num_of_cache_types = ARRAY_SIZE(navi10_cache_info); 1587 break; 1588 case IP_VERSION(10, 1, 1): 1589 pcache_info = navi14_cache_info; 1590 num_of_cache_types = ARRAY_SIZE(navi14_cache_info); 1591 break; 1592 case IP_VERSION(10, 3, 0): 1593 pcache_info = sienna_cichlid_cache_info; 1594 num_of_cache_types = ARRAY_SIZE(sienna_cichlid_cache_info); 1595 break; 1596 case IP_VERSION(10, 3, 2): 1597 pcache_info = navy_flounder_cache_info; 1598 num_of_cache_types = ARRAY_SIZE(navy_flounder_cache_info); 1599 break; 1600 case IP_VERSION(10, 3, 4): 1601 pcache_info = dimgrey_cavefish_cache_info; 1602 num_of_cache_types = ARRAY_SIZE(dimgrey_cavefish_cache_info); 1603 break; 1604 case IP_VERSION(10, 3, 1): 1605 pcache_info = vangogh_cache_info; 1606 num_of_cache_types = ARRAY_SIZE(vangogh_cache_info); 1607 break; 1608 case IP_VERSION(10, 3, 5): 1609 pcache_info = beige_goby_cache_info; 1610 num_of_cache_types = ARRAY_SIZE(beige_goby_cache_info); 1611 break; 1612 case IP_VERSION(10, 3, 3): 1613 pcache_info = yellow_carp_cache_info; 1614 num_of_cache_types = ARRAY_SIZE(yellow_carp_cache_info); 1615 break; 1616 case IP_VERSION(10, 3, 6): 1617 pcache_info = gc_10_3_6_cache_info; 1618 num_of_cache_types = ARRAY_SIZE(gc_10_3_6_cache_info); 1619 break; 1620 case IP_VERSION(10, 3, 7): 1621 pcache_info = gfx1037_cache_info; 1622 num_of_cache_types = ARRAY_SIZE(gfx1037_cache_info); 1623 break; 1624 case IP_VERSION(11, 0, 0): 1625 case IP_VERSION(11, 0, 1): 1626 case IP_VERSION(11, 0, 2): 1627 case IP_VERSION(11, 0, 3): 1628 pcache_info = cache_info; 1629 num_of_cache_types = 1630 kfd_fill_gpu_cache_info_from_gfx_config(kdev, pcache_info); 1631 break; 1632 default: 1633 return -EINVAL; 1634 } 1635 } 1636 1637 *size_filled = 0; 1638 *num_of_entries = 0; 1639 1640 /* For each type of cache listed in the kfd_gpu_cache_info table, 1641 * go through all available Compute Units. 1642 * The [i,j,k] loop will 1643 * if kfd_gpu_cache_info.num_cu_shared = 1 1644 * will parse through all available CU 1645 * If (kfd_gpu_cache_info.num_cu_shared != 1) 1646 * then it will consider only one CU from 1647 * the shared unit 1648 */ 1649 1650 for (ct = 0; ct < num_of_cache_types; ct++) { 1651 cu_processor_id = gpu_processor_id; 1652 if (pcache_info[ct].cache_level == 1) { 1653 for (i = 0; i < cu_info->num_shader_engines; i++) { 1654 for (j = 0; j < cu_info->num_shader_arrays_per_engine; j++) { 1655 for (k = 0; k < cu_info->num_cu_per_sh; 1656 k += pcache_info[ct].num_cu_shared) { 1657 ret = fill_in_l1_pcache(pcache, 1658 pcache_info, 1659 cu_info, 1660 mem_available, 1661 cu_info->cu_bitmap[i % 4][j + i / 4], 1662 ct, 1663 cu_processor_id, 1664 k); 1665 1666 if (ret < 0) 1667 break; 1668 1669 if (!ret) { 1670 pcache++; 1671 (*num_of_entries)++; 1672 mem_available -= sizeof(*pcache); 1673 (*size_filled) += sizeof(*pcache); 1674 } 1675 1676 /* Move to next CU block */ 1677 num_cu_shared = ((k + pcache_info[ct].num_cu_shared) <= 1678 cu_info->num_cu_per_sh) ? 
1679 pcache_info[ct].num_cu_shared : 1680 (cu_info->num_cu_per_sh - k); 1681 cu_processor_id += num_cu_shared; 1682 } 1683 } 1684 } 1685 } else { 1686 ret = fill_in_l2_l3_pcache(pcache, 1687 pcache_info, 1688 cu_info, 1689 mem_available, 1690 ct, 1691 cu_processor_id); 1692 1693 if (ret < 0) 1694 break; 1695 1696 if (!ret) { 1697 pcache++; 1698 (*num_of_entries)++; 1699 mem_available -= sizeof(*pcache); 1700 (*size_filled) += sizeof(*pcache); 1701 } 1702 } 1703 } 1704 1705 pr_debug("Added [%d] GPU cache entries\n", *num_of_entries); 1706 1707 return 0; 1708 } 1709 1710 static bool kfd_ignore_crat(void) 1711 { 1712 bool ret; 1713 1714 if (ignore_crat) 1715 return true; 1716 1717 #ifndef KFD_SUPPORT_IOMMU_V2 1718 ret = true; 1719 #else 1720 ret = false; 1721 #endif 1722 1723 return ret; 1724 } 1725 1726 /* 1727 * kfd_create_crat_image_acpi - Allocates memory for CRAT image and 1728 * copies CRAT from ACPI (if available). 1729 * NOTE: Call kfd_destroy_crat_image to free CRAT image memory 1730 * 1731 * @crat_image: CRAT read from ACPI. If no CRAT in ACPI then 1732 * crat_image will be NULL 1733 * @size: [OUT] size of crat_image 1734 * 1735 * Return 0 if successful else return error code 1736 */ 1737 int kfd_create_crat_image_acpi(void **crat_image, size_t *size) 1738 { 1739 struct acpi_table_header *crat_table; 1740 acpi_status status; 1741 void *pcrat_image; 1742 int rc = 0; 1743 1744 if (!crat_image) 1745 return -EINVAL; 1746 1747 *crat_image = NULL; 1748 1749 if (kfd_ignore_crat()) { 1750 pr_info("CRAT table disabled by module option\n"); 1751 return -ENODATA; 1752 } 1753 1754 /* Fetch the CRAT table from ACPI */ 1755 status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table); 1756 if (status == AE_NOT_FOUND) { 1757 pr_info("CRAT table not found\n"); 1758 return -ENODATA; 1759 } else if (ACPI_FAILURE(status)) { 1760 const char *err = acpi_format_exception(status); 1761 1762 pr_err("CRAT table error: %s\n", err); 1763 return -EINVAL; 1764 } 1765 1766 pcrat_image = kvmalloc(crat_table->length, GFP_KERNEL); 1767 if (!pcrat_image) { 1768 rc = -ENOMEM; 1769 goto out; 1770 } 1771 1772 memcpy(pcrat_image, crat_table, crat_table->length); 1773 *crat_image = pcrat_image; 1774 *size = crat_table->length; 1775 out: 1776 acpi_put_table(crat_table); 1777 return rc; 1778 } 1779 1780 /* Memory required to create Virtual CRAT. 1781 * Since there is no easy way to predict the amount of memory required, the 1782 * following amount is allocated for GPU Virtual CRAT. This is 1783 * expected to cover all known conditions. But to be safe additional check 1784 * is put in the code to ensure we don't overwrite. 
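 * (The additional check mentioned above is the avail_size/mem_available
 *  bookkeeping in the kfd_fill_* and fill_in_* helpers: each one
 *  decrements the remaining size before writing a subtype and returns
 *  -ENOMEM if it would go negative, so a too-small buffer fails
 *  gracefully instead of being overrun.)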
1785 */ 1786 #define VCRAT_SIZE_FOR_GPU (4 * PAGE_SIZE) 1787 1788 /* kfd_fill_cu_for_cpu - Fill in Compute info for the given CPU NUMA node 1789 * 1790 * @numa_node_id: CPU NUMA node id 1791 * @avail_size: Available size in the memory 1792 * @sub_type_hdr: Memory into which compute info will be filled in 1793 * 1794 * Return 0 if successful else return -ve value 1795 */ 1796 static int kfd_fill_cu_for_cpu(int numa_node_id, int *avail_size, 1797 int proximity_domain, 1798 struct crat_subtype_computeunit *sub_type_hdr) 1799 { 1800 const struct cpumask *cpumask; 1801 1802 *avail_size -= sizeof(struct crat_subtype_computeunit); 1803 if (*avail_size < 0) 1804 return -ENOMEM; 1805 1806 memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit)); 1807 1808 /* Fill in subtype header data */ 1809 sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY; 1810 sub_type_hdr->length = sizeof(struct crat_subtype_computeunit); 1811 sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; 1812 1813 cpumask = cpumask_of_node(numa_node_id); 1814 1815 /* Fill in CU data */ 1816 sub_type_hdr->flags |= CRAT_CU_FLAGS_CPU_PRESENT; 1817 sub_type_hdr->proximity_domain = proximity_domain; 1818 sub_type_hdr->processor_id_low = kfd_numa_node_to_apic_id(numa_node_id); 1819 if (sub_type_hdr->processor_id_low == -1) 1820 return -EINVAL; 1821 1822 sub_type_hdr->num_cpu_cores = cpumask_weight(cpumask); 1823 1824 return 0; 1825 } 1826 1827 /* kfd_fill_mem_info_for_cpu - Fill in Memory info for the given CPU NUMA node 1828 * 1829 * @numa_node_id: CPU NUMA node id 1830 * @avail_size: Available size in the memory 1831 * @sub_type_hdr: Memory into which compute info will be filled in 1832 * 1833 * Return 0 if successful else return -ve value 1834 */ 1835 static int kfd_fill_mem_info_for_cpu(int numa_node_id, int *avail_size, 1836 int proximity_domain, 1837 struct crat_subtype_memory *sub_type_hdr) 1838 { 1839 uint64_t mem_in_bytes = 0; 1840 pg_data_t *pgdat; 1841 int zone_type; 1842 1843 *avail_size -= sizeof(struct crat_subtype_memory); 1844 if (*avail_size < 0) 1845 return -ENOMEM; 1846 1847 memset(sub_type_hdr, 0, sizeof(struct crat_subtype_memory)); 1848 1849 /* Fill in subtype header data */ 1850 sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY; 1851 sub_type_hdr->length = sizeof(struct crat_subtype_memory); 1852 sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; 1853 1854 /* Fill in Memory Subunit data */ 1855 1856 /* Unlike si_meminfo, si_meminfo_node is not exported. 
So 1857 * the following lines are duplicated from si_meminfo_node 1858 * function 1859 */ 1860 pgdat = NODE_DATA(numa_node_id); 1861 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) 1862 mem_in_bytes += zone_managed_pages(&pgdat->node_zones[zone_type]); 1863 mem_in_bytes <<= PAGE_SHIFT; 1864 1865 sub_type_hdr->length_low = lower_32_bits(mem_in_bytes); 1866 sub_type_hdr->length_high = upper_32_bits(mem_in_bytes); 1867 sub_type_hdr->proximity_domain = proximity_domain; 1868 1869 return 0; 1870 } 1871 1872 #ifdef CONFIG_X86_64 1873 static int kfd_fill_iolink_info_for_cpu(int numa_node_id, int *avail_size, 1874 uint32_t *num_entries, 1875 struct crat_subtype_iolink *sub_type_hdr) 1876 { 1877 int nid; 1878 struct cpuinfo_x86 *c = &cpu_data(0); 1879 uint8_t link_type; 1880 1881 if (c->x86_vendor == X86_VENDOR_AMD) 1882 link_type = CRAT_IOLINK_TYPE_HYPERTRANSPORT; 1883 else 1884 link_type = CRAT_IOLINK_TYPE_QPI_1_1; 1885 1886 *num_entries = 0; 1887 1888 /* Create IO links from this node to other CPU nodes */ 1889 for_each_online_node(nid) { 1890 if (nid == numa_node_id) /* node itself */ 1891 continue; 1892 1893 *avail_size -= sizeof(struct crat_subtype_iolink); 1894 if (*avail_size < 0) 1895 return -ENOMEM; 1896 1897 memset(sub_type_hdr, 0, sizeof(struct crat_subtype_iolink)); 1898 1899 /* Fill in subtype header data */ 1900 sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY; 1901 sub_type_hdr->length = sizeof(struct crat_subtype_iolink); 1902 sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; 1903 1904 /* Fill in IO link data */ 1905 sub_type_hdr->proximity_domain_from = numa_node_id; 1906 sub_type_hdr->proximity_domain_to = nid; 1907 sub_type_hdr->io_interface_type = link_type; 1908 1909 (*num_entries)++; 1910 sub_type_hdr++; 1911 } 1912 1913 return 0; 1914 } 1915 #endif 1916 1917 /* kfd_create_vcrat_image_cpu - Create Virtual CRAT for CPU 1918 * 1919 * @pcrat_image: Fill in VCRAT for CPU 1920 * @size: [IN] allocated size of crat_image. 1921 * [OUT] actual size of data filled in crat_image 1922 */ 1923 static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size) 1924 { 1925 struct crat_header *crat_table = (struct crat_header *)pcrat_image; 1926 struct acpi_table_header *acpi_table; 1927 acpi_status status; 1928 struct crat_subtype_generic *sub_type_hdr; 1929 int avail_size = *size; 1930 int numa_node_id; 1931 #ifdef CONFIG_X86_64 1932 uint32_t entries = 0; 1933 #endif 1934 int ret = 0; 1935 1936 if (!pcrat_image) 1937 return -EINVAL; 1938 1939 /* Fill in CRAT Header. 1940 * Modify length and total_entries as subunits are added. 
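	 * (Each subtype appended in the loop below adds its length to
	 *  crat_table->length and bumps total_entries; num_domains is
	 *  incremented once per online NUMA node that reports an APIC id.)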

	/* Fill in CRAT Header.
	 * Modify length and total_entries as subunits are added.
	 */
	avail_size -= sizeof(struct crat_header);
	if (avail_size < 0)
		return -ENOMEM;

	memset(crat_table, 0, sizeof(struct crat_header));
	memcpy(&crat_table->signature, CRAT_SIGNATURE,
			sizeof(crat_table->signature));
	crat_table->length = sizeof(struct crat_header);

	status = acpi_get_table("DSDT", 0, &acpi_table);
	if (status != AE_OK)
		pr_warn("DSDT table not found for OEM information\n");
	else {
		crat_table->oem_revision = acpi_table->revision;
		memcpy(crat_table->oem_id, acpi_table->oem_id,
				CRAT_OEMID_LENGTH);
		memcpy(crat_table->oem_table_id, acpi_table->oem_table_id,
				CRAT_OEMTABLEID_LENGTH);
		acpi_put_table(acpi_table);
	}
	crat_table->total_entries = 0;
	crat_table->num_domains = 0;

	sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1);

	for_each_online_node(numa_node_id) {
		if (kfd_numa_node_to_apic_id(numa_node_id) == -1)
			continue;

		/* Fill in Subtype: Compute Unit */
		ret = kfd_fill_cu_for_cpu(numa_node_id, &avail_size,
			crat_table->num_domains,
			(struct crat_subtype_computeunit *)sub_type_hdr);
		if (ret < 0)
			return ret;
		crat_table->length += sub_type_hdr->length;
		crat_table->total_entries++;

		sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
			sub_type_hdr->length);

		/* Fill in Subtype: Memory */
		ret = kfd_fill_mem_info_for_cpu(numa_node_id, &avail_size,
			crat_table->num_domains,
			(struct crat_subtype_memory *)sub_type_hdr);
		if (ret < 0)
			return ret;
		crat_table->length += sub_type_hdr->length;
		crat_table->total_entries++;

		sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
			sub_type_hdr->length);

		/* Fill in Subtype: IO Link */
#ifdef CONFIG_X86_64
		ret = kfd_fill_iolink_info_for_cpu(numa_node_id, &avail_size,
				&entries,
				(struct crat_subtype_iolink *)sub_type_hdr);
		if (ret < 0)
			return ret;

		if (entries) {
			crat_table->length += (sub_type_hdr->length * entries);
			crat_table->total_entries += entries;

			sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
					sub_type_hdr->length * entries);
		}
#else
		pr_info("IO link not available for non x86 platforms\n");
#endif

		crat_table->num_domains++;
	}

	/* TODO: Add cache Subtype for CPU.
	 * Currently, CPU cache information is available in function
	 * detect_cache_attributes(cpu) defined in the file
	 * ./arch/x86/kernel/cpu/intel_cacheinfo.c. This function is not
	 * exported and to get the same information the code needs to be
	 * duplicated.
	 */

	*size = crat_table->length;
	pr_info("Virtual CRAT table created for CPU\n");

	return 0;
}
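
/* The helpers below fill the individual subtypes that make up a GPU
 * virtual CRAT: local memory affinity, a direct io link from the GPU to
 * its CPU NUMA node, and xGMI links between GPUs that share a hive.
 */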

static int kfd_fill_gpu_memory_affinity(int *avail_size,
		struct kfd_dev *kdev, uint8_t type, uint64_t size,
		struct crat_subtype_memory *sub_type_hdr,
		uint32_t proximity_domain,
		const struct kfd_local_mem_info *local_mem_info)
{
	*avail_size -= sizeof(struct crat_subtype_memory);
	if (*avail_size < 0)
		return -ENOMEM;

	memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_memory));
	sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY;
	sub_type_hdr->length = sizeof(struct crat_subtype_memory);
	sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED;

	sub_type_hdr->proximity_domain = proximity_domain;

	pr_debug("Fill gpu memory affinity - type 0x%x size 0x%llx\n",
			type, size);

	sub_type_hdr->length_low = lower_32_bits(size);
	sub_type_hdr->length_high = upper_32_bits(size);

	sub_type_hdr->width = local_mem_info->vram_width;
	sub_type_hdr->visibility_type = type;

	return 0;
}

#ifdef CONFIG_ACPI_NUMA
static void kfd_find_numa_node_in_srat(struct kfd_dev *kdev)
{
	struct acpi_table_header *table_header = NULL;
	struct acpi_subtable_header *sub_header = NULL;
	unsigned long table_end, subtable_len;
	u32 pci_id = pci_domain_nr(kdev->pdev->bus) << 16 |
			pci_dev_id(kdev->pdev);
	u32 bdf;
	acpi_status status;
	struct acpi_srat_cpu_affinity *cpu;
	struct acpi_srat_generic_affinity *gpu;
	int pxm = 0, max_pxm = 0;
	int numa_node = NUMA_NO_NODE;
	bool found = false;

	/* Fetch the SRAT table from ACPI */
	status = acpi_get_table(ACPI_SIG_SRAT, 0, &table_header);
	if (status == AE_NOT_FOUND) {
		pr_warn("SRAT table not found\n");
		return;
	} else if (ACPI_FAILURE(status)) {
		const char *err = acpi_format_exception(status);
		pr_err("SRAT table error: %s\n", err);
		return;
	}

	table_end = (unsigned long)table_header + table_header->length;

	/* Parse all entries looking for a match. */
	sub_header = (struct acpi_subtable_header *)
			((unsigned long)table_header +
			sizeof(struct acpi_table_srat));
	subtable_len = sub_header->length;

	while (((unsigned long)sub_header) + subtable_len < table_end) {
		/*
		 * If length is 0, break from this loop to avoid
		 * infinite loop.
		 */
		if (subtable_len == 0) {
			pr_err("SRAT invalid zero length\n");
			break;
		}

		switch (sub_header->type) {
		case ACPI_SRAT_TYPE_CPU_AFFINITY:
			cpu = (struct acpi_srat_cpu_affinity *)sub_header;
			pxm = *((u32 *)cpu->proximity_domain_hi) << 8 |
					cpu->proximity_domain_lo;
			if (pxm > max_pxm)
				max_pxm = pxm;
			break;
		case ACPI_SRAT_TYPE_GENERIC_AFFINITY:
			gpu = (struct acpi_srat_generic_affinity *)sub_header;
			bdf = *((u16 *)(&gpu->device_handle[0])) << 16 |
					*((u16 *)(&gpu->device_handle[2]));
			if (bdf == pci_id) {
				found = true;
				numa_node = pxm_to_node(gpu->proximity_domain);
			}
			break;
		default:
			break;
		}

		if (found)
			break;

		sub_header = (struct acpi_subtable_header *)
				((unsigned long)sub_header + subtable_len);
		subtable_len = sub_header->length;
	}

	acpi_put_table(table_header);

	/* Workaround bad cpu-gpu binding case */
	if (found && (numa_node < 0 ||
			numa_node > pxm_to_node(max_pxm)))
		numa_node = 0;

	if (numa_node != NUMA_NO_NODE)
		set_dev_node(&kdev->pdev->dev, numa_node);
}
#endif
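
/* If firmware did not assign a NUMA node to the GPU device,
 * kfd_find_numa_node_in_srat() scans the SRAT generic affinity entries
 * for a matching PCI segment/BDF and, when one is found, binds the
 * device to the node of the matching proximity domain.
 */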

/* kfd_fill_gpu_direct_io_link_to_cpu - Fill in direct io link from GPU
 * to its NUMA node
 *
 *	@avail_size: Available size in the memory
 *	@kdev: [IN] GPU device
 *	@sub_type_hdr: Memory into which io link info will be filled in
 *	@proximity_domain: Proximity domain of the GPU node
 *
 *	Return 0 if successful else return -ve value
 */
static int kfd_fill_gpu_direct_io_link_to_cpu(int *avail_size,
			struct kfd_dev *kdev,
			struct crat_subtype_iolink *sub_type_hdr,
			uint32_t proximity_domain)
{
	*avail_size -= sizeof(struct crat_subtype_iolink);
	if (*avail_size < 0)
		return -ENOMEM;

	memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink));

	/* Fill in subtype header data */
	sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;
	sub_type_hdr->length = sizeof(struct crat_subtype_iolink);
	sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED;
	if (kfd_dev_is_large_bar(kdev))
		sub_type_hdr->flags |= CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;

	/* Fill in IOLINK subtype.
	 * TODO: Fill-in other fields of iolink subtype
	 */
	if (kdev->adev->gmc.xgmi.connected_to_cpu) {
		/*
		 * With a host-GPU xGMI link the host can access GPU memory
		 * whether or not the PCIe BAR type is large, so always
		 * create a bidirectional io link.
		 */
		sub_type_hdr->flags |= CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;
		sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI;
		sub_type_hdr->num_hops_xgmi = 1;
		if (KFD_GC_VERSION(kdev) == IP_VERSION(9, 4, 2)) {
			sub_type_hdr->minimum_bandwidth_mbs =
					amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(
							kdev->adev, NULL, true);
			sub_type_hdr->maximum_bandwidth_mbs =
					sub_type_hdr->minimum_bandwidth_mbs;
		}
	} else {
		sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS;
		sub_type_hdr->minimum_bandwidth_mbs =
				amdgpu_amdkfd_get_pcie_bandwidth_mbytes(kdev->adev, true);
		sub_type_hdr->maximum_bandwidth_mbs =
				amdgpu_amdkfd_get_pcie_bandwidth_mbytes(kdev->adev, false);
	}

	sub_type_hdr->proximity_domain_from = proximity_domain;

#ifdef CONFIG_ACPI_NUMA
	if (kdev->pdev->dev.numa_node == NUMA_NO_NODE)
		kfd_find_numa_node_in_srat(kdev);
#endif
#ifdef CONFIG_NUMA
	if (kdev->pdev->dev.numa_node == NUMA_NO_NODE)
		sub_type_hdr->proximity_domain_to = 0;
	else
		sub_type_hdr->proximity_domain_to = kdev->pdev->dev.numa_node;
#else
	sub_type_hdr->proximity_domain_to = 0;
#endif
	return 0;
}

static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size,
			struct kfd_dev *kdev,
			struct kfd_dev *peer_kdev,
			struct crat_subtype_iolink *sub_type_hdr,
			uint32_t proximity_domain_from,
			uint32_t proximity_domain_to)
{
	*avail_size -= sizeof(struct crat_subtype_iolink);
	if (*avail_size < 0)
		return -ENOMEM;

	memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink));

	sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;
	sub_type_hdr->length = sizeof(struct crat_subtype_iolink);
	sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED |
			       CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;

	sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI;
	sub_type_hdr->proximity_domain_from = proximity_domain_from;
	sub_type_hdr->proximity_domain_to = proximity_domain_to;
	sub_type_hdr->num_hops_xgmi =
		amdgpu_amdkfd_get_xgmi_hops_count(kdev->adev, peer_kdev->adev);
	sub_type_hdr->maximum_bandwidth_mbs =
		amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(kdev->adev, peer_kdev->adev, false);
	sub_type_hdr->minimum_bandwidth_mbs = sub_type_hdr->maximum_bandwidth_mbs ?
		amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(kdev->adev, NULL, true) : 0;

	return 0;
}

/* kfd_create_vcrat_image_gpu - Create Virtual CRAT for GPU
 *
 *	@pcrat_image: Fill in VCRAT for GPU
 *	@size:	[IN] allocated size of crat_image.
 *		[OUT] actual size of data filled in crat_image
 */
static int kfd_create_vcrat_image_gpu(void *pcrat_image,
				      size_t *size, struct kfd_dev *kdev,
				      uint32_t proximity_domain)
{
	struct crat_header *crat_table = (struct crat_header *)pcrat_image;
	struct crat_subtype_generic *sub_type_hdr;
	struct kfd_local_mem_info local_mem_info;
	struct kfd_topology_device *peer_dev;
	struct crat_subtype_computeunit *cu;
	struct kfd_cu_info cu_info;
	int avail_size = *size;
	uint32_t total_num_of_cu;
	int num_of_cache_entries = 0;
	int cache_mem_filled = 0;
	uint32_t nid = 0;
	int ret = 0;

	if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_GPU)
		return -EINVAL;
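
	/* A GPU VCRAT contains, in order: the CRAT header, one compute-unit
	 * subtype, one memory subtype, the GPU cache subtypes, a direct io
	 * link to the CPU NUMA node and, for GPUs in an xGMI hive, one io
	 * link per already-enumerated peer GPU in the same hive.
	 */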

	/* Fill the CRAT Header.
	 * Modify length and total_entries as subunits are added.
	 */
	avail_size -= sizeof(struct crat_header);
	if (avail_size < 0)
		return -ENOMEM;

	memset(crat_table, 0, sizeof(struct crat_header));

	memcpy(&crat_table->signature, CRAT_SIGNATURE,
			sizeof(crat_table->signature));
	/* Change length as we add more subtypes */
	crat_table->length = sizeof(struct crat_header);
	crat_table->num_domains = 1;
	crat_table->total_entries = 0;

	/* Fill in Subtype: Compute Unit
	 * First fill in the sub type header and then sub type data
	 */
	avail_size -= sizeof(struct crat_subtype_computeunit);
	if (avail_size < 0)
		return -ENOMEM;

	sub_type_hdr = (struct crat_subtype_generic *)(crat_table + 1);
	memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit));

	sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY;
	sub_type_hdr->length = sizeof(struct crat_subtype_computeunit);
	sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;

	/* Fill CU subtype data */
	cu = (struct crat_subtype_computeunit *)sub_type_hdr;
	cu->flags |= CRAT_CU_FLAGS_GPU_PRESENT;
	cu->proximity_domain = proximity_domain;

	amdgpu_amdkfd_get_cu_info(kdev->adev, &cu_info);
	cu->num_simd_per_cu = cu_info.simd_per_cu;
	cu->num_simd_cores = cu_info.simd_per_cu * cu_info.cu_active_number;
	cu->max_waves_simd = cu_info.max_waves_per_simd;

	cu->wave_front_size = cu_info.wave_front_size;
	cu->array_count = cu_info.num_shader_arrays_per_engine *
		cu_info.num_shader_engines;
	total_num_of_cu = (cu->array_count * cu_info.num_cu_per_sh);
	cu->processor_id_low = get_and_inc_gpu_processor_id(total_num_of_cu);
	cu->num_cu_per_array = cu_info.num_cu_per_sh;
	cu->max_slots_scatch_cu = cu_info.max_scratch_slots_per_cu;
	cu->num_banks = cu_info.num_shader_engines;
	cu->lds_size_in_kb = cu_info.lds_size;

	cu->hsa_capability = 0;

	/* Check if this node supports IOMMU. During parsing this flag will
	 * translate to HSA_CAP_ATS_PRESENT
	 */
	if (!kfd_iommu_check_device(kdev))
		cu->hsa_capability |= CRAT_CU_FLAGS_IOMMU_PRESENT;

	crat_table->length += sub_type_hdr->length;
	crat_table->total_entries++;

	/* Fill in Subtype: Memory. Only on systems with large BAR (no
	 * private FB), report memory as public. On other systems
	 * report the total FB size (public+private) as a single
	 * private heap.
	 */
	local_mem_info = kdev->local_mem_info;
	sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
			sub_type_hdr->length);

	if (debug_largebar)
		local_mem_info.local_mem_size_private = 0;

	if (local_mem_info.local_mem_size_private == 0)
		ret = kfd_fill_gpu_memory_affinity(&avail_size,
				kdev, HSA_MEM_HEAP_TYPE_FB_PUBLIC,
				local_mem_info.local_mem_size_public,
				(struct crat_subtype_memory *)sub_type_hdr,
				proximity_domain,
				&local_mem_info);
	else
		ret = kfd_fill_gpu_memory_affinity(&avail_size,
				kdev, HSA_MEM_HEAP_TYPE_FB_PRIVATE,
				local_mem_info.local_mem_size_public +
				local_mem_info.local_mem_size_private,
				(struct crat_subtype_memory *)sub_type_hdr,
				proximity_domain,
				&local_mem_info);
	if (ret < 0)
		return ret;

	crat_table->length += sizeof(struct crat_subtype_memory);
	crat_table->total_entries++;
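
	/* Unlike the CU and memory entries above, the cache fill below may
	 * emit several subtypes at once. It reports how many entries it
	 * wrote and how many bytes they occupy (cache_mem_filled), and the
	 * write pointer is later advanced by that byte count rather than by
	 * a single subtype length.
	 */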

	/* TODO: Fill in cache information. This information is NOT readily
	 * available in KGD
	 */
	sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
			sub_type_hdr->length);
	ret = kfd_fill_gpu_cache_info(kdev, cu->processor_id_low,
				avail_size,
				&cu_info,
				(struct crat_subtype_cache *)sub_type_hdr,
				&cache_mem_filled,
				&num_of_cache_entries);

	if (ret < 0)
		return ret;

	crat_table->length += cache_mem_filled;
	crat_table->total_entries += num_of_cache_entries;
	avail_size -= cache_mem_filled;

	/* Fill in Subtype: IO_LINKS
	 * Only direct links are added here, i.e. the link from the GPU to
	 * its NUMA node. Indirect links are added by userspace.
	 */
	sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
		cache_mem_filled);
	ret = kfd_fill_gpu_direct_io_link_to_cpu(&avail_size, kdev,
		(struct crat_subtype_iolink *)sub_type_hdr, proximity_domain);

	if (ret < 0)
		return ret;

	crat_table->length += sub_type_hdr->length;
	crat_table->total_entries++;

	/* Fill in Subtype: IO_LINKS
	 * Direct links from this GPU to other GPUs through xGMI.
	 * We loop over the GPUs that have already been processed (those
	 * with a lower proximity_domain value) and add a link for every GPU
	 * that shares this GPU's hive id. The reverse iolink (from the other
	 * GPU to this one) is added in kfd_parse_subtype_iolink.
	 */
	if (kdev->hive_id) {
		for (nid = 0; nid < proximity_domain; ++nid) {
			peer_dev = kfd_topology_device_by_proximity_domain_no_lock(nid);
			if (!peer_dev->gpu)
				continue;
			if (peer_dev->gpu->hive_id != kdev->hive_id)
				continue;
			sub_type_hdr = (typeof(sub_type_hdr))(
				(char *)sub_type_hdr +
				sizeof(struct crat_subtype_iolink));
			ret = kfd_fill_gpu_xgmi_link_to_gpu(
				&avail_size, kdev, peer_dev->gpu,
				(struct crat_subtype_iolink *)sub_type_hdr,
				proximity_domain, nid);
			if (ret < 0)
				return ret;
			crat_table->length += sub_type_hdr->length;
			crat_table->total_entries++;
		}
	}
	*size = crat_table->length;
	pr_info("Virtual CRAT table created for GPU\n");

	return ret;
}

/* kfd_create_crat_image_virtual - Allocates memory for CRAT image and
 * creates a Virtual CRAT (VCRAT) image
 *
 * NOTE: Call kfd_destroy_crat_image to free CRAT image memory
 *
 *	@crat_image: VCRAT image created because ACPI does not have a
 *		     CRAT for this device
 *	@size: [OUT] size of virtual crat_image
 *	@flags:	COMPUTE_UNIT_CPU - Create VCRAT for CPU device
 *		COMPUTE_UNIT_GPU - Create VCRAT for GPU
 *		(COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU) - Create VCRAT for APU
 *			-- this option is not currently implemented.
 *			The assumption is that all AMD APUs will have CRAT
 *	@kdev: Valid kfd_dev required if flags contain COMPUTE_UNIT_GPU
 *
 *	Return 0 if successful else return -ve value
 */
int kfd_create_crat_image_virtual(void **crat_image, size_t *size,
				  int flags, struct kfd_dev *kdev,
				  uint32_t proximity_domain)
{
	void *pcrat_image = NULL;
	int ret = 0, num_nodes;
	size_t dyn_size;

	if (!crat_image)
		return -EINVAL;

	*crat_image = NULL;
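
	/* For illustration: on a hypothetical two-node system the dynamic
	 * CPU image below works out to sizeof(struct crat_header) +
	 * 2 * (sizeof(struct crat_subtype_computeunit) +
	 * sizeof(struct crat_subtype_memory) +
	 * 1 * sizeof(struct crat_subtype_iolink)), i.e. one CU entry, one
	 * memory entry and one io link to the other node per NUMA node.
	 */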

	/* Allocate the CPU Virtual CRAT size based on the number of online
	 * nodes. Allocate VCRAT_SIZE_FOR_GPU for the GPU virtual CRAT image.
	 * This should cover all the current conditions. A bounds check in
	 * each fill helper ensures the GPU image is never written beyond
	 * its allocated size.
	 */
	switch (flags) {
	case COMPUTE_UNIT_CPU:
		num_nodes = num_online_nodes();
		dyn_size = sizeof(struct crat_header) +
			num_nodes * (sizeof(struct crat_subtype_computeunit) +
			sizeof(struct crat_subtype_memory) +
			(num_nodes - 1) * sizeof(struct crat_subtype_iolink));
		pcrat_image = kvmalloc(dyn_size, GFP_KERNEL);
		if (!pcrat_image)
			return -ENOMEM;
		*size = dyn_size;
		pr_debug("CRAT size is %zu\n", dyn_size);
		ret = kfd_create_vcrat_image_cpu(pcrat_image, size);
		break;
	case COMPUTE_UNIT_GPU:
		if (!kdev)
			return -EINVAL;
		pcrat_image = kvmalloc(VCRAT_SIZE_FOR_GPU, GFP_KERNEL);
		if (!pcrat_image)
			return -ENOMEM;
		*size = VCRAT_SIZE_FOR_GPU;
		ret = kfd_create_vcrat_image_gpu(pcrat_image, size, kdev,
						 proximity_domain);
		break;
	case (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU):
		/* TODO: */
		ret = -EINVAL;
		pr_err("VCRAT not implemented for APU\n");
		break;
	default:
		ret = -EINVAL;
	}

	if (!ret)
		*crat_image = pcrat_image;
	else
		kvfree(pcrat_image);

	return ret;
}

/* kfd_destroy_crat_image
 *
 *	@crat_image: [IN] - crat_image from kfd_create_crat_image_xxx(..)
 *
 */
void kfd_destroy_crat_image(void *crat_image)
{
	kvfree(crat_image);
}
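
/* Typical call pattern (a sketch, not taken verbatim from a caller): the
 * image is allocated by kfd_create_crat_image_virtual() and must be
 * released with kfd_destroy_crat_image() once it has been parsed:
 *
 *	void *crat_image = NULL;
 *	size_t image_size = 0;
 *	int ret;
 *
 *	ret = kfd_create_crat_image_virtual(&crat_image, &image_size,
 *					    COMPUTE_UNIT_CPU, NULL, 0);
 *	if (!ret) {
 *		... parse image_size bytes at crat_image ...
 *		kfd_destroy_crat_image(crat_image);
 *	}
 */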