// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright 2015-2022 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include <linux/pci.h>
#include <linux/acpi.h>
#include "kfd_crat.h"
#include "kfd_priv.h"
#include "kfd_topology.h"
#include "kfd_iommu.h"
#include "amdgpu.h"
#include "amdgpu_amdkfd.h"

/* GPU Processor ID base for dGPUs for which VCRAT needs to be created.
 * GPU processor IDs are expressed with Bit[31]=1.
 * The base is set to 0x8000_0000 + 0x1000 to avoid collision with GPU IDs
 * used in the CRAT.
 */
static uint32_t gpu_processor_id_low = 0x80001000;

/* Return the next available gpu_processor_id and increment it for the
 * next GPU
 * @total_cu_count - Total CUs present in the GPU including ones
 *		     masked off
 */
static inline unsigned int get_and_inc_gpu_processor_id(
				unsigned int total_cu_count)
{
	int current_id = gpu_processor_id_low;

	gpu_processor_id_low += total_cu_count;
	return current_id;
}
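
/* For example (illustrative numbers only): if the first dGPU probed reports
 * 64 CUs in total, it is assigned processor IDs [0x80001000, 0x80001040),
 * and the next GPU's base starts at 0x80001040, so the per-CU processor ID
 * ranges of different GPUs never overlap.
 */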

static struct kfd_gpu_cache_info kaveri_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache (in SQC module) per bank */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache (in SQC module) per bank */
		.cache_size = 8,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},

	/* TODO: Add L2 Cache information */
};

static struct kfd_gpu_cache_info carrizo_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache (in SQC module) per bank */
		.cache_size = 8,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 4,
	},
	{
		/* Scalar L1 Data Cache (in SQC module) per bank. */
		.cache_size = 4,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 4,
	},

	/* TODO: Add L2 Cache information */
};

#define hawaii_cache_info	kaveri_cache_info
#define tonga_cache_info	carrizo_cache_info
#define fiji_cache_info		carrizo_cache_info
#define polaris10_cache_info	carrizo_cache_info
#define polaris11_cache_info	carrizo_cache_info
#define polaris12_cache_info	carrizo_cache_info
#define vegam_cache_info	carrizo_cache_info

/* NOTE: L1 cache information has been updated and L2/L3
 * cache information has been added for Vega10 and
 * newer ASICs. The unit for cache_size is KiB.
 * Going forward, the cache details need to be checked and
 * updated for every new ASIC.
 */

static struct kfd_gpu_cache_info vega10_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 3,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 3,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 4096,
		.cache_level = 2,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 16,
	},
};

static struct kfd_gpu_cache_info raven_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 3,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 3,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 1024,
		.cache_level = 2,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 11,
	},
};

static struct kfd_gpu_cache_info renoir_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 3,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 3,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 1024,
		.cache_level = 2,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 8,
	},
};

static struct kfd_gpu_cache_info vega12_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 3,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 3,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 2048,
		.cache_level = 2,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 5,
	},
};

static struct kfd_gpu_cache_info vega20_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 3,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 3,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 8192,
		.cache_level = 2,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 16,
	},
};

static struct kfd_gpu_cache_info aldebaran_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 8192,
		.cache_level = 2,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 14,
	},
};

static struct kfd_gpu_cache_info navi10_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* GL1 Data Cache per SA */
		.cache_size = 128,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 10,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 4096,
		.cache_level = 2,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 10,
	},
};

static struct kfd_gpu_cache_info vangogh_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* GL1 Data Cache per SA */
		.cache_size = 128,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 8,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 1024,
		.cache_level = 2,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 8,
	},
};

static struct kfd_gpu_cache_info navi14_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* GL1 Data Cache per SA */
		.cache_size = 128,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 12,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 2048,
		.cache_level = 2,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 12,
	},
};

static struct kfd_gpu_cache_info sienna_cichlid_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* GL1 Data Cache per SA */
		.cache_size = 128,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 10,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 4096,
		.cache_level = 2,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 10,
	},
	{
		/* L3 Data Cache per GPU */
		.cache_size = 128*1024,
		.cache_level = 3,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 10,
	},
};

static struct kfd_gpu_cache_info navy_flounder_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* GL1 Data Cache per SA */
		.cache_size = 128,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 10,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 3072,
		.cache_level = 2,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 10,
	},
	{
		/* L3 Data Cache per GPU */
		.cache_size = 96*1024,
		.cache_level = 3,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 10,
	},
};

static struct kfd_gpu_cache_info dimgrey_cavefish_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* GL1 Data Cache per SA */
		.cache_size = 128,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 8,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 2048,
		.cache_level = 2,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 8,
	},
	{
		/* L3 Data Cache per GPU */
		.cache_size = 32*1024,
		.cache_level = 3,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 8,
	},
};

static struct kfd_gpu_cache_info beige_goby_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* GL1 Data Cache per SA */
		.cache_size = 128,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 8,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 1024,
		.cache_level = 2,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 8,
	},
	{
		/* L3 Data Cache per GPU */
		.cache_size = 16*1024,
		.cache_level = 3,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 8,
	},
};

static struct kfd_gpu_cache_info yellow_carp_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* GL1 Data Cache per SA */
		.cache_size = 128,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 6,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 2048,
		.cache_level = 2,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 6,
	},
};

static struct kfd_gpu_cache_info gfx1037_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* GL1 Data Cache per SA */
		.cache_size = 128,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 256,
		.cache_level = 2,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
};

static struct kfd_gpu_cache_info gc_10_3_6_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* GL1 Data Cache per SA */
		.cache_size = 128,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 256,
		.cache_level = 2,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
};

static struct kfd_gpu_cache_info dummy_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* GL1 Data Cache per SA */
		.cache_size = 128,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 6,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 2048,
		.cache_level = 2,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 6,
	},
};
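
/* The static tables above are handed out by kfd_get_gpu_cache_info() below.
 * GFX11 parts are not listed here; their cache topology is derived at
 * runtime by kfd_fill_gpu_cache_info_from_gfx_config().
 */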

static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev,
		struct crat_subtype_computeunit *cu)
{
	dev->node_props.cpu_cores_count = cu->num_cpu_cores;
	dev->node_props.cpu_core_id_base = cu->processor_id_low;
	if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT)
		dev->node_props.capability |= HSA_CAP_ATS_PRESENT;

	pr_debug("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores,
			cu->processor_id_low);
}

static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev,
		struct crat_subtype_computeunit *cu)
{
	dev->node_props.simd_id_base = cu->processor_id_low;
	dev->node_props.simd_count = cu->num_simd_cores;
	dev->node_props.lds_size_in_kb = cu->lds_size_in_kb;
	dev->node_props.max_waves_per_simd = cu->max_waves_simd;
	dev->node_props.wave_front_size = cu->wave_front_size;
	dev->node_props.array_count = cu->array_count;
	dev->node_props.cu_per_simd_array = cu->num_cu_per_array;
	dev->node_props.simd_per_cu = cu->num_simd_per_cu;
	dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu;
	if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE)
		dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE;
	pr_debug("CU GPU: id_base=%d\n", cu->processor_id_low);
}

/* kfd_parse_subtype_cu - parse compute unit subtypes and attach them to the
 * correct topology device present in the device_list
 */
static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu,
		struct list_head *device_list)
{
	struct kfd_topology_device *dev;

	pr_debug("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n",
			cu->proximity_domain, cu->hsa_capability);
	list_for_each_entry(dev, device_list, list) {
		if (cu->proximity_domain == dev->proximity_domain) {
			if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT)
				kfd_populated_cu_info_cpu(dev, cu);

			if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT)
				kfd_populated_cu_info_gpu(dev, cu);
			break;
		}
	}

	return 0;
}

static struct kfd_mem_properties *
find_subtype_mem(uint32_t heap_type, uint32_t flags, uint32_t width,
		struct kfd_topology_device *dev)
{
	struct kfd_mem_properties *props;

	list_for_each_entry(props, &dev->mem_props, list) {
		if (props->heap_type == heap_type
				&& props->flags == flags
				&& props->width == width)
			return props;
	}

	return NULL;
}

/* kfd_parse_subtype_mem - parse memory subtypes and attach them to the
 * correct topology device present in the device_list
 */
static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem,
		struct list_head *device_list)
{
	struct kfd_mem_properties *props;
	struct kfd_topology_device *dev;
	uint32_t heap_type;
	uint64_t size_in_bytes;
	uint32_t flags = 0;
	uint32_t width;

	pr_debug("Found memory entry in CRAT table with proximity_domain=%d\n",
			mem->proximity_domain);
	list_for_each_entry(dev, device_list, list) {
		if (mem->proximity_domain == dev->proximity_domain) {
			/* We're on a GPU node */
			if (dev->node_props.cpu_cores_count == 0) {
				/* APU */
				if (mem->visibility_type == 0)
					heap_type =
						HSA_MEM_HEAP_TYPE_FB_PRIVATE;
				/* dGPU */
				else
					heap_type = mem->visibility_type;
			} else
				heap_type = HSA_MEM_HEAP_TYPE_SYSTEM;

			if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE)
				flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE;
			if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE)
				flags |= HSA_MEM_FLAGS_NON_VOLATILE;

			size_in_bytes =
				((uint64_t)mem->length_high << 32) +
						mem->length_low;
			width = mem->width;

			/* Multiple banks of the same type are aggregated into
			 * one. User mode doesn't care about multiple physical
			 * memory segments. It's managed as a single virtual
			 * heap for user mode.
			 */
			props = find_subtype_mem(heap_type, flags, width, dev);
			if (props) {
				props->size_in_bytes += size_in_bytes;
				break;
			}

			props = kfd_alloc_struct(props);
			if (!props)
				return -ENOMEM;

			props->heap_type = heap_type;
			props->flags = flags;
			props->size_in_bytes = size_in_bytes;
			props->width = width;

			dev->node_props.mem_banks_count++;
			list_add_tail(&props->list, &dev->mem_props);

			break;
		}
	}

	return 0;
}

/* kfd_parse_subtype_cache - parse cache subtypes and attach them to the
 * correct topology device present in the device_list
 */
static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache,
		struct list_head *device_list)
{
	struct kfd_cache_properties *props;
	struct kfd_topology_device *dev;
	uint32_t id;
	uint32_t total_num_of_cu;

	id = cache->processor_id_low;

	pr_debug("Found cache entry in CRAT table with processor_id=%d\n", id);
	list_for_each_entry(dev, device_list, list) {
		total_num_of_cu = (dev->node_props.array_count *
				dev->node_props.cu_per_simd_array);

		/* Cache information in CRAT doesn't have proximity_domain
		 * information as it is associated with a CPU core or GPU
		 * Compute Unit. So map the cache using the CPU core ID or
		 * SIMD (GPU) ID.
		 * TODO: This works because currently we can safely assume
		 * that Compute Units are parsed before caches are parsed.
		 * In the future, remove this dependency.
		 */
		if ((id >= dev->node_props.cpu_core_id_base &&
			id <= dev->node_props.cpu_core_id_base +
				dev->node_props.cpu_cores_count) ||
			(id >= dev->node_props.simd_id_base &&
			id < dev->node_props.simd_id_base +
				total_num_of_cu)) {
			props = kfd_alloc_struct(props);
			if (!props)
				return -ENOMEM;

			props->processor_id_low = id;
			props->cache_level = cache->cache_level;
			props->cache_size = cache->cache_size;
			props->cacheline_size = cache->cache_line_size;
			props->cachelines_per_tag = cache->lines_per_tag;
			props->cache_assoc = cache->associativity;
			props->cache_latency = cache->cache_latency;

			memcpy(props->sibling_map, cache->sibling_map,
					CRAT_SIBLINGMAP_SIZE);

			/* set the sibling_map_size as 32 for CRAT from ACPI */
			props->sibling_map_size = CRAT_SIBLINGMAP_SIZE;

			if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE)
				props->cache_type |= HSA_CACHE_TYPE_DATA;
			if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE)
				props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION;
			if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE)
				props->cache_type |= HSA_CACHE_TYPE_CPU;
			if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE)
				props->cache_type |= HSA_CACHE_TYPE_HSACU;

			dev->node_props.caches_count++;
			list_add_tail(&props->list, &dev->cache_props);

			break;
		}
	}

	return 0;
}

/* kfd_parse_subtype_iolink - parse iolink subtypes and attach them to the
 * correct topology device present in the device_list
 */
static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink,
		struct list_head *device_list)
{
	struct kfd_iolink_properties *props = NULL, *props2;
	struct kfd_topology_device *dev, *to_dev;
	uint32_t id_from;
	uint32_t id_to;

	id_from = iolink->proximity_domain_from;
	id_to = iolink->proximity_domain_to;

	pr_debug("Found IO link entry in CRAT table with id_from=%d, id_to %d\n",
			id_from, id_to);
	list_for_each_entry(dev, device_list, list) {
		if (id_from == dev->proximity_domain) {
			props = kfd_alloc_struct(props);
			if (!props)
				return -ENOMEM;

			props->node_from = id_from;
			props->node_to = id_to;
			props->ver_maj = iolink->version_major;
			props->ver_min = iolink->version_minor;
			props->iolink_type = iolink->io_interface_type;

			if (props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS)
				props->weight = 20;
			else if (props->iolink_type == CRAT_IOLINK_TYPE_XGMI)
				props->weight = iolink->weight_xgmi;
			else
				props->weight = node_distance(id_from, id_to);

			props->min_latency = iolink->minimum_latency;
			props->max_latency = iolink->maximum_latency;
			props->min_bandwidth = iolink->minimum_bandwidth_mbs;
			props->max_bandwidth = iolink->maximum_bandwidth_mbs;
			props->rec_transfer_size =
					iolink->recommended_transfer_size;

			dev->node_props.io_links_count++;
			list_add_tail(&props->list, &dev->io_link_props);
			break;
		}
	}

	/* CPU topology is created before GPUs are detected, so CPU->GPU
	 * links are not built at that time. If a PCIe type is discovered, it
	 * means a GPU is detected and we are adding GPU->CPU to the topology.
	 * At this time, also add the corresponding CPU->GPU link if the GPU
	 * has a large BAR.
	 * For xGMI, only one direction of the link is present in the CRAT
	 * table; add the corresponding reverse-direction link now.
	 */
	if (props && (iolink->flags & CRAT_IOLINK_FLAGS_BI_DIRECTIONAL)) {
		to_dev = kfd_topology_device_by_proximity_domain_no_lock(id_to);
		if (!to_dev)
			return -ENODEV;
		/* same everything but the other direction */
		props2 = kmemdup(props, sizeof(*props2), GFP_KERNEL);
		if (!props2)
			return -ENOMEM;

		props2->node_from = id_to;
		props2->node_to = id_from;
		props2->kobj = NULL;
		to_dev->node_props.io_links_count++;
		list_add_tail(&props2->list, &to_dev->io_link_props);
	}

	return 0;
}

/* kfd_parse_subtype - parse subtypes and attach them to the correct topology
 * device present in the device_list
 * @sub_type_hdr - subtype section of crat_image
 * @device_list - list of topology devices present in this crat_image
 */
static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr,
		struct list_head *device_list)
{
	struct crat_subtype_computeunit *cu;
	struct crat_subtype_memory *mem;
	struct crat_subtype_cache *cache;
	struct crat_subtype_iolink *iolink;
	int ret = 0;

	switch (sub_type_hdr->type) {
	case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY:
		cu = (struct crat_subtype_computeunit *)sub_type_hdr;
		ret = kfd_parse_subtype_cu(cu, device_list);
		break;
	case CRAT_SUBTYPE_MEMORY_AFFINITY:
		mem = (struct crat_subtype_memory *)sub_type_hdr;
		ret = kfd_parse_subtype_mem(mem, device_list);
		break;
	case CRAT_SUBTYPE_CACHE_AFFINITY:
		cache = (struct crat_subtype_cache *)sub_type_hdr;
		ret = kfd_parse_subtype_cache(cache, device_list);
		break;
	case CRAT_SUBTYPE_TLB_AFFINITY:
		/*
		 * For now, nothing to do here
		 */
		pr_debug("Found TLB entry in CRAT table (not processing)\n");
		break;
	case CRAT_SUBTYPE_CCOMPUTE_AFFINITY:
		/*
		 * For now, nothing to do here
		 */
		pr_debug("Found CCOMPUTE entry in CRAT table (not processing)\n");
		break;
	case CRAT_SUBTYPE_IOLINK_AFFINITY:
		iolink = (struct crat_subtype_iolink *)sub_type_hdr;
		ret = kfd_parse_subtype_iolink(iolink, device_list);
		break;
	default:
		pr_warn("Unknown subtype %d in CRAT\n",
				sub_type_hdr->type);
	}

	return ret;
}

/* kfd_parse_crat_table - parse CRAT table. For each node present in CRAT,
 * create a kfd_topology_device and add it to device_list. Also parse
 * CRAT subtypes and attach them to the appropriate kfd_topology_device
 * @crat_image - input image containing CRAT
 * @device_list - [OUT] list of kfd_topology_device generated after
 *		  parsing crat_image
 * @proximity_domain - Proximity domain of the first device in the table
 *
 * Return - 0 if successful else -ve value
 */
int kfd_parse_crat_table(void *crat_image, struct list_head *device_list,
		uint32_t proximity_domain)
{
	struct kfd_topology_device *top_dev = NULL;
	struct crat_subtype_generic *sub_type_hdr;
	uint16_t node_id;
	int ret = 0;
	struct crat_header *crat_table = (struct crat_header *)crat_image;
	uint16_t num_nodes;
	uint32_t image_len;

	if (!crat_image)
		return -EINVAL;

	if (!list_empty(device_list)) {
		pr_warn("Error device list should be empty\n");
		return -EINVAL;
	}

	num_nodes = crat_table->num_domains;
	image_len = crat_table->length;

	pr_debug("Parsing CRAT table with %d nodes\n", num_nodes);

	for (node_id = 0; node_id < num_nodes; node_id++) {
		top_dev = kfd_create_topology_device(device_list);
		if (!top_dev)
			break;
		top_dev->proximity_domain = proximity_domain++;
	}

	if (!top_dev) {
		ret = -ENOMEM;
		goto err;
	}

	memcpy(top_dev->oem_id, crat_table->oem_id, CRAT_OEMID_LENGTH);
	memcpy(top_dev->oem_table_id, crat_table->oem_table_id,
			CRAT_OEMTABLEID_LENGTH);
	top_dev->oem_revision = crat_table->oem_revision;

	sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1);
	while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) <
			((char *)crat_image) + image_len) {
		if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) {
			ret = kfd_parse_subtype(sub_type_hdr, device_list);
			if (ret)
				break;
		}

		sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
				sub_type_hdr->length);
	}

err:
	if (ret)
		kfd_release_topology_device_list(device_list);

	return ret;
}
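
/* Typically the topology code feeds this parser an image obtained from
 * kfd_create_crat_image_acpi() below, or a virtual CRAT built by
 * kfd_create_vcrat_image_cpu()/kfd_create_vcrat_image_gpu() when no usable
 * ACPI CRAT exists.
 */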

static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev,
		struct kfd_gpu_cache_info *pcache_info)
{
	struct amdgpu_device *adev = kdev->adev;
	int i = 0;

	/* TCP L1 Cache per CU */
	if (adev->gfx.config.gc_tcp_l1_size) {
		pcache_info[i].cache_size = adev->gfx.config.gc_tcp_l1_size;
		pcache_info[i].cache_level = 1;
		pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
					CRAT_CACHE_FLAGS_DATA_CACHE |
					CRAT_CACHE_FLAGS_SIMD_CACHE);
		pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_tcp_per_wpg / 2;
		i++;
	}
	/* Scalar L1 Instruction Cache per SQC */
	if (adev->gfx.config.gc_l1_instruction_cache_size_per_sqc) {
		pcache_info[i].cache_size =
			adev->gfx.config.gc_l1_instruction_cache_size_per_sqc;
		pcache_info[i].cache_level = 1;
		pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
					CRAT_CACHE_FLAGS_INST_CACHE |
					CRAT_CACHE_FLAGS_SIMD_CACHE);
		pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_sqc_per_wgp * 2;
		i++;
	}
	/* Scalar L1 Data Cache per SQC */
	if (adev->gfx.config.gc_l1_data_cache_size_per_sqc) {
		pcache_info[i].cache_size = adev->gfx.config.gc_l1_data_cache_size_per_sqc;
		pcache_info[i].cache_level = 1;
		pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
					CRAT_CACHE_FLAGS_DATA_CACHE |
					CRAT_CACHE_FLAGS_SIMD_CACHE);
		pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_sqc_per_wgp * 2;
		i++;
	}
	/* GL1 Data Cache per SA */
	if (adev->gfx.config.gc_gl1c_per_sa &&
	    adev->gfx.config.gc_gl1c_size_per_instance) {
		pcache_info[i].cache_size = adev->gfx.config.gc_gl1c_per_sa *
			adev->gfx.config.gc_gl1c_size_per_instance;
		pcache_info[i].cache_level = 1;
		pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
					CRAT_CACHE_FLAGS_DATA_CACHE |
					CRAT_CACHE_FLAGS_SIMD_CACHE);
		pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh;
		i++;
	}
	/* L2 Data Cache per GPU (Total Tex Cache) */
	if (adev->gfx.config.gc_gl2c_per_gpu) {
		pcache_info[i].cache_size = adev->gfx.config.gc_gl2c_per_gpu;
		pcache_info[i].cache_level = 2;
		pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
					CRAT_CACHE_FLAGS_DATA_CACHE |
					CRAT_CACHE_FLAGS_SIMD_CACHE);
		pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh;
		i++;
	}
	/* L3 Data Cache per GPU */
	if (adev->gmc.mall_size) {
		pcache_info[i].cache_size = adev->gmc.mall_size / 1024;
		pcache_info[i].cache_level = 3;
		pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
					CRAT_CACHE_FLAGS_DATA_CACHE |
					CRAT_CACHE_FLAGS_SIMD_CACHE);
		pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh;
		i++;
	}
	return i;
}
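
/* The function above returns the number of pcache_info[] entries it filled
 * in (at most six with the current set of checks), so callers must provide
 * a buffer with at least that many slots.
 */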

int kfd_get_gpu_cache_info(struct kfd_node *kdev, struct kfd_gpu_cache_info **pcache_info)
{
	int num_of_cache_types = 0;

	switch (kdev->adev->asic_type) {
	case CHIP_KAVERI:
		*pcache_info = kaveri_cache_info;
		num_of_cache_types = ARRAY_SIZE(kaveri_cache_info);
		break;
	case CHIP_HAWAII:
		*pcache_info = hawaii_cache_info;
		num_of_cache_types = ARRAY_SIZE(hawaii_cache_info);
		break;
	case CHIP_CARRIZO:
		*pcache_info = carrizo_cache_info;
		num_of_cache_types = ARRAY_SIZE(carrizo_cache_info);
		break;
	case CHIP_TONGA:
		*pcache_info = tonga_cache_info;
		num_of_cache_types = ARRAY_SIZE(tonga_cache_info);
		break;
	case CHIP_FIJI:
		*pcache_info = fiji_cache_info;
		num_of_cache_types = ARRAY_SIZE(fiji_cache_info);
		break;
	case CHIP_POLARIS10:
		*pcache_info = polaris10_cache_info;
		num_of_cache_types = ARRAY_SIZE(polaris10_cache_info);
		break;
	case CHIP_POLARIS11:
		*pcache_info = polaris11_cache_info;
		num_of_cache_types = ARRAY_SIZE(polaris11_cache_info);
		break;
	case CHIP_POLARIS12:
		*pcache_info = polaris12_cache_info;
		num_of_cache_types = ARRAY_SIZE(polaris12_cache_info);
		break;
	case CHIP_VEGAM:
		*pcache_info = vegam_cache_info;
		num_of_cache_types = ARRAY_SIZE(vegam_cache_info);
		break;
	default:
		switch (KFD_GC_VERSION(kdev)) {
		case IP_VERSION(9, 0, 1):
			*pcache_info = vega10_cache_info;
			num_of_cache_types = ARRAY_SIZE(vega10_cache_info);
			break;
		case IP_VERSION(9, 2, 1):
			*pcache_info = vega12_cache_info;
			num_of_cache_types = ARRAY_SIZE(vega12_cache_info);
			break;
		case IP_VERSION(9, 4, 0):
		case IP_VERSION(9, 4, 1):
			*pcache_info = vega20_cache_info;
			num_of_cache_types = ARRAY_SIZE(vega20_cache_info);
			break;
		case IP_VERSION(9, 4, 2):
		case IP_VERSION(9, 4, 3):
			*pcache_info = aldebaran_cache_info;
			num_of_cache_types = ARRAY_SIZE(aldebaran_cache_info);
			break;
		case IP_VERSION(9, 1, 0):
		case IP_VERSION(9, 2, 2):
			*pcache_info = raven_cache_info;
			num_of_cache_types = ARRAY_SIZE(raven_cache_info);
			break;
		case IP_VERSION(9, 3, 0):
			*pcache_info = renoir_cache_info;
			num_of_cache_types = ARRAY_SIZE(renoir_cache_info);
			break;
		case IP_VERSION(10, 1, 10):
		case IP_VERSION(10, 1, 2):
		case IP_VERSION(10, 1, 3):
		case IP_VERSION(10, 1, 4):
			*pcache_info = navi10_cache_info;
			num_of_cache_types = ARRAY_SIZE(navi10_cache_info);
			break;
		case IP_VERSION(10, 1, 1):
			*pcache_info = navi14_cache_info;
			num_of_cache_types = ARRAY_SIZE(navi14_cache_info);
			break;
		case IP_VERSION(10, 3, 0):
			*pcache_info = sienna_cichlid_cache_info;
			num_of_cache_types = ARRAY_SIZE(sienna_cichlid_cache_info);
			break;
		case IP_VERSION(10, 3, 2):
			*pcache_info = navy_flounder_cache_info;
			num_of_cache_types = ARRAY_SIZE(navy_flounder_cache_info);
			break;
		case IP_VERSION(10, 3, 4):
			*pcache_info = dimgrey_cavefish_cache_info;
			num_of_cache_types = ARRAY_SIZE(dimgrey_cavefish_cache_info);
			break;
		case IP_VERSION(10, 3, 1):
			*pcache_info = vangogh_cache_info;
			num_of_cache_types = ARRAY_SIZE(vangogh_cache_info);
			break;
		case IP_VERSION(10, 3, 5):
			*pcache_info = beige_goby_cache_info;
			num_of_cache_types = ARRAY_SIZE(beige_goby_cache_info);
			break;
		case IP_VERSION(10, 3, 3):
			*pcache_info = yellow_carp_cache_info;
			num_of_cache_types = ARRAY_SIZE(yellow_carp_cache_info);
			break;
		case IP_VERSION(10, 3, 6):
			*pcache_info = gc_10_3_6_cache_info;
			num_of_cache_types = ARRAY_SIZE(gc_10_3_6_cache_info);
			break;
		case IP_VERSION(10, 3, 7):
			*pcache_info = gfx1037_cache_info;
			num_of_cache_types = ARRAY_SIZE(gfx1037_cache_info);
			break;
		case IP_VERSION(11, 0, 0):
		case IP_VERSION(11, 0, 1):
		case IP_VERSION(11, 0, 2):
		case IP_VERSION(11, 0, 3):
		case IP_VERSION(11, 0, 4):
			num_of_cache_types =
				kfd_fill_gpu_cache_info_from_gfx_config(kdev->kfd, *pcache_info);
			break;
		default:
			*pcache_info = dummy_cache_info;
			num_of_cache_types = ARRAY_SIZE(dummy_cache_info);
			pr_warn("dummy cache info is used temporarily and real cache info need update later.\n");
			break;
		}
	}
	return num_of_cache_types;
}
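
/* Note: the GFX11 branch above does not repoint *pcache_info; it fills the
 * buffer *pcache_info already refers to, so callers taking that path must
 * pass a pointer into writable storage large enough for the entries
 * reported back. All other ASICs get a pointer to a static table.
 */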

static bool kfd_ignore_crat(void)
{
	bool ret;

	if (ignore_crat)
		return true;

#ifndef KFD_SUPPORT_IOMMU_V2
	ret = true;
#else
	ret = false;
#endif

	return ret;
}

/*
 * kfd_create_crat_image_acpi - Allocates memory for CRAT image and
 * copies CRAT from ACPI (if available).
 * NOTE: Call kfd_destroy_crat_image to free CRAT image memory
 *
 * @crat_image: CRAT read from ACPI. If there is no CRAT in ACPI,
 *		crat_image will be NULL
 * @size: [OUT] size of crat_image
 *
 * Return 0 if successful else return error code
 */
int kfd_create_crat_image_acpi(void **crat_image, size_t *size)
{
	struct acpi_table_header *crat_table;
	acpi_status status;
	void *pcrat_image;
	int rc = 0;

	if (!crat_image)
		return -EINVAL;

	*crat_image = NULL;

	if (kfd_ignore_crat()) {
		pr_info("CRAT table disabled by module option\n");
		return -ENODATA;
	}

	/* Fetch the CRAT table from ACPI */
	status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table);
	if (status == AE_NOT_FOUND) {
		pr_info("CRAT table not found\n");
		return -ENODATA;
	} else if (ACPI_FAILURE(status)) {
		const char *err = acpi_format_exception(status);

		pr_err("CRAT table error: %s\n", err);
		return -EINVAL;
	}

	pcrat_image = kvmalloc(crat_table->length, GFP_KERNEL);
	if (!pcrat_image) {
		rc = -ENOMEM;
		goto out;
	}

	memcpy(pcrat_image, crat_table, crat_table->length);
	*crat_image = pcrat_image;
	*size = crat_table->length;
out:
	acpi_put_table(crat_table);
	return rc;
}

/* Memory required to create Virtual CRAT.
 * Since there is no easy way to predict the amount of memory required, the
 * following amount is allocated for the GPU Virtual CRAT. This is
 * expected to cover all known conditions. To be safe, an additional check
 * is performed in the code to ensure we don't overwrite.
 */
#define VCRAT_SIZE_FOR_GPU	(4 * PAGE_SIZE)

/* kfd_fill_cu_for_cpu - Fill in Compute info for the given CPU NUMA node
 *
 * @numa_node_id: CPU NUMA node id
 * @avail_size: Available size in the memory
 * @sub_type_hdr: Memory into which compute info will be filled in
 *
 * Return 0 if successful else return -ve value
 */
static int kfd_fill_cu_for_cpu(int numa_node_id, int *avail_size,
				int proximity_domain,
				struct crat_subtype_computeunit *sub_type_hdr)
{
	const struct cpumask *cpumask;

	*avail_size -= sizeof(struct crat_subtype_computeunit);
	if (*avail_size < 0)
		return -ENOMEM;

	memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit));

	/* Fill in subtype header data */
	sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY;
	sub_type_hdr->length = sizeof(struct crat_subtype_computeunit);
	sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;

	cpumask = cpumask_of_node(numa_node_id);

	/* Fill in CU data */
	sub_type_hdr->flags |= CRAT_CU_FLAGS_CPU_PRESENT;
	sub_type_hdr->proximity_domain = proximity_domain;
	sub_type_hdr->processor_id_low = kfd_numa_node_to_apic_id(numa_node_id);
	if (sub_type_hdr->processor_id_low == -1)
		return -EINVAL;

	sub_type_hdr->num_cpu_cores = cpumask_weight(cpumask);

	return 0;
}

/* kfd_fill_mem_info_for_cpu - Fill in Memory info for the given CPU NUMA node
 *
 * @numa_node_id: CPU NUMA node id
 * @avail_size: Available size in the memory
 * @sub_type_hdr: Memory into which memory info will be filled in
 *
 * Return 0 if successful else return -ve value
 */
static int kfd_fill_mem_info_for_cpu(int numa_node_id, int *avail_size,
			int proximity_domain,
			struct crat_subtype_memory *sub_type_hdr)
{
	uint64_t mem_in_bytes = 0;
	pg_data_t *pgdat;
	int zone_type;

	*avail_size -= sizeof(struct crat_subtype_memory);
	if (*avail_size < 0)
		return -ENOMEM;

	memset(sub_type_hdr, 0, sizeof(struct crat_subtype_memory));

	/* Fill in subtype header data */
	sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY;
	sub_type_hdr->length = sizeof(struct crat_subtype_memory);
	sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;

	/* Fill in Memory Subunit data */

	/* Unlike si_meminfo, si_meminfo_node is not exported. So
	 * the following lines are duplicated from the si_meminfo_node
	 * function.
	 */
	pgdat = NODE_DATA(numa_node_id);
	for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
		mem_in_bytes += zone_managed_pages(&pgdat->node_zones[zone_type]);
	mem_in_bytes <<= PAGE_SHIFT;

	sub_type_hdr->length_low = lower_32_bits(mem_in_bytes);
	sub_type_hdr->length_high = upper_32_bits(mem_in_bytes);
	sub_type_hdr->proximity_domain = proximity_domain;

	return 0;
}
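
/* CPU-to-CPU IO link subtypes are only generated on x86_64; on other
 * architectures kfd_create_vcrat_image_cpu() below just logs that IO links
 * are unavailable.
 */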

#ifdef CONFIG_X86_64
static int kfd_fill_iolink_info_for_cpu(int numa_node_id, int *avail_size,
				uint32_t *num_entries,
				struct crat_subtype_iolink *sub_type_hdr)
{
	int nid;
	struct cpuinfo_x86 *c = &cpu_data(0);
	uint8_t link_type;

	if (c->x86_vendor == X86_VENDOR_AMD)
		link_type = CRAT_IOLINK_TYPE_HYPERTRANSPORT;
	else
		link_type = CRAT_IOLINK_TYPE_QPI_1_1;

	*num_entries = 0;

	/* Create IO links from this node to other CPU nodes */
	for_each_online_node(nid) {
		if (nid == numa_node_id) /* node itself */
			continue;

		*avail_size -= sizeof(struct crat_subtype_iolink);
		if (*avail_size < 0)
			return -ENOMEM;

		memset(sub_type_hdr, 0, sizeof(struct crat_subtype_iolink));

		/* Fill in subtype header data */
		sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;
		sub_type_hdr->length = sizeof(struct crat_subtype_iolink);
		sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;

		/* Fill in IO link data */
		sub_type_hdr->proximity_domain_from = numa_node_id;
		sub_type_hdr->proximity_domain_to = nid;
		sub_type_hdr->io_interface_type = link_type;

		(*num_entries)++;
		sub_type_hdr++;
	}

	return 0;
}
#endif

/* kfd_create_vcrat_image_cpu - Create Virtual CRAT for CPU
 *
 * @pcrat_image: Fill in VCRAT for CPU
 * @size: [IN] allocated size of crat_image.
 *	  [OUT] actual size of data filled in crat_image
 */
static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size)
{
	struct crat_header *crat_table = (struct crat_header *)pcrat_image;
	struct acpi_table_header *acpi_table;
	acpi_status status;
	struct crat_subtype_generic *sub_type_hdr;
	int avail_size = *size;
	int numa_node_id;
#ifdef CONFIG_X86_64
	uint32_t entries = 0;
#endif
	int ret = 0;

	if (!pcrat_image)
		return -EINVAL;

	/* Fill in CRAT Header.
	 * Modify length and total_entries as subunits are added.
	 */
	avail_size -= sizeof(struct crat_header);
	if (avail_size < 0)
		return -ENOMEM;

	memset(crat_table, 0, sizeof(struct crat_header));
	memcpy(&crat_table->signature, CRAT_SIGNATURE,
			sizeof(crat_table->signature));
	crat_table->length = sizeof(struct crat_header);

	status = acpi_get_table("DSDT", 0, &acpi_table);
	if (status != AE_OK)
		pr_warn("DSDT table not found for OEM information\n");
	else {
		crat_table->oem_revision = acpi_table->revision;
		memcpy(crat_table->oem_id, acpi_table->oem_id,
				CRAT_OEMID_LENGTH);
		memcpy(crat_table->oem_table_id, acpi_table->oem_table_id,
				CRAT_OEMTABLEID_LENGTH);
		acpi_put_table(acpi_table);
	}
	crat_table->total_entries = 0;
	crat_table->num_domains = 0;

	sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1);

	for_each_online_node(numa_node_id) {
		if (kfd_numa_node_to_apic_id(numa_node_id) == -1)
			continue;

		/* Fill in Subtype: Compute Unit */
		ret = kfd_fill_cu_for_cpu(numa_node_id, &avail_size,
			crat_table->num_domains,
			(struct crat_subtype_computeunit *)sub_type_hdr);
		if (ret < 0)
			return ret;
		crat_table->length += sub_type_hdr->length;
		crat_table->total_entries++;

		sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
				sub_type_hdr->length);

		/* Fill in Subtype: Memory */
		ret = kfd_fill_mem_info_for_cpu(numa_node_id, &avail_size,
			crat_table->num_domains,
			(struct crat_subtype_memory *)sub_type_hdr);
		if (ret < 0)
			return ret;
		crat_table->length += sub_type_hdr->length;
		crat_table->total_entries++;

		sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
				sub_type_hdr->length);

		/* Fill in Subtype: IO Link */
#ifdef CONFIG_X86_64
		ret = kfd_fill_iolink_info_for_cpu(numa_node_id, &avail_size,
				&entries,
				(struct crat_subtype_iolink *)sub_type_hdr);
		if (ret < 0)
			return ret;

		if (entries) {
			crat_table->length += (sub_type_hdr->length * entries);
			crat_table->total_entries += entries;

			sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
					sub_type_hdr->length * entries);
		}
#else
		pr_info("IO link not available for non x86 platforms\n");
#endif

		crat_table->num_domains++;
	}

	/* TODO: Add cache Subtype for CPU.
	 * Currently, CPU cache information is available in the function
	 * detect_cache_attributes(cpu) defined in the file
	 * ./arch/x86/kernel/cpu/intel_cacheinfo.c. This function is not
	 * exported, so the code would need to be duplicated to obtain
	 * the same information.
	 */

	*size = crat_table->length;
	pr_info("Virtual CRAT table created for CPU\n");

	return 0;
}

static int kfd_fill_gpu_memory_affinity(int *avail_size,
		struct kfd_node *kdev, uint8_t type, uint64_t size,
		struct crat_subtype_memory *sub_type_hdr,
		uint32_t proximity_domain,
		const struct kfd_local_mem_info *local_mem_info)
{
	*avail_size -= sizeof(struct crat_subtype_memory);
	if (*avail_size < 0)
		return -ENOMEM;

	memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_memory));
	sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY;
	sub_type_hdr->length = sizeof(struct crat_subtype_memory);
	sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED;

	sub_type_hdr->proximity_domain = proximity_domain;

	pr_debug("Fill gpu memory affinity - type 0x%x size 0x%llx\n",
			type, size);

	sub_type_hdr->length_low = lower_32_bits(size);
	sub_type_hdr->length_high = upper_32_bits(size);

	sub_type_hdr->width = local_mem_info->vram_width;
	sub_type_hdr->visibility_type = type;

	return 0;
}
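
/* Per the ACPI SRAT definition of a PCI device handle, bytes 0-1 of
 * device_handle hold the PCI segment and bytes 2-3 the BDF; the helper
 * below relies on that layout to match a GPU against its SRAT entry and
 * recover the GPU's NUMA node.
 */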
#ifdef CONFIG_ACPI_NUMA
static void kfd_find_numa_node_in_srat(struct kfd_node *kdev)
{
	struct acpi_table_header *table_header = NULL;
	struct acpi_subtable_header *sub_header = NULL;
	unsigned long table_end, subtable_len;
	u32 pci_id = pci_domain_nr(kdev->adev->pdev->bus) << 16 |
			pci_dev_id(kdev->adev->pdev);
	u32 bdf;
	acpi_status status;
	struct acpi_srat_cpu_affinity *cpu;
	struct acpi_srat_generic_affinity *gpu;
	int pxm = 0, max_pxm = 0;
	int numa_node = NUMA_NO_NODE;
	bool found = false;

	/* Fetch the SRAT table from ACPI */
	status = acpi_get_table(ACPI_SIG_SRAT, 0, &table_header);
	if (status == AE_NOT_FOUND) {
		pr_warn("SRAT table not found\n");
		return;
	} else if (ACPI_FAILURE(status)) {
		const char *err = acpi_format_exception(status);

		pr_err("SRAT table error: %s\n", err);
		return;
	}

	table_end = (unsigned long)table_header + table_header->length;

	/* Parse all entries looking for a match. */
	sub_header = (struct acpi_subtable_header *)
			((unsigned long)table_header +
			sizeof(struct acpi_table_srat));
	subtable_len = sub_header->length;

	while (((unsigned long)sub_header) + subtable_len < table_end) {
		/*
		 * If length is 0, break from this loop to avoid
		 * infinite loop.
		 */
		if (subtable_len == 0) {
			pr_err("SRAT invalid zero length\n");
			break;
		}

		switch (sub_header->type) {
		case ACPI_SRAT_TYPE_CPU_AFFINITY:
			cpu = (struct acpi_srat_cpu_affinity *)sub_header;
			pxm = *((u32 *)cpu->proximity_domain_hi) << 8 |
					cpu->proximity_domain_lo;
			if (pxm > max_pxm)
				max_pxm = pxm;
			break;
		case ACPI_SRAT_TYPE_GENERIC_AFFINITY:
			gpu = (struct acpi_srat_generic_affinity *)sub_header;
			bdf = *((u16 *)(&gpu->device_handle[0])) << 16 |
					*((u16 *)(&gpu->device_handle[2]));
			if (bdf == pci_id) {
				found = true;
				numa_node = pxm_to_node(gpu->proximity_domain);
			}
			break;
		default:
			break;
		}

		if (found)
			break;

		sub_header = (struct acpi_subtable_header *)
				((unsigned long)sub_header + subtable_len);
		subtable_len = sub_header->length;
	}

	acpi_put_table(table_header);

	/* Workaround bad cpu-gpu binding case */
	if (found && (numa_node < 0 ||
			numa_node > pxm_to_node(max_pxm)))
		numa_node = 0;

	if (numa_node != NUMA_NO_NODE)
		set_dev_node(&kdev->adev->pdev->dev, numa_node);
}
#endif

#define KFD_CRAT_INTRA_SOCKET_WEIGHT	13
#define KFD_CRAT_XGMI_WEIGHT		15
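/* These weights are relative link costs used by the IO-link fill functions
 * below: a link within one socket costs 13 and an XGMI hop costs 15, so a
 * cross-socket GPU-to-GPU path in kfd_fill_gpu_xgmi_link_to_gpu() is
 * charged 2 * 13 + 15 = 41.
 */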
/* kfd_fill_gpu_direct_io_link_to_cpu - Fill in direct IO link from GPU
 * to its NUMA node
 * @avail_size: Available size in the memory
 * @kdev: [IN] GPU device
 * @sub_type_hdr: Memory into which IO link info will be filled in
 * @proximity_domain: proximity domain of the GPU node
 *
 * Return 0 if successful else return -ve value
 */
static int kfd_fill_gpu_direct_io_link_to_cpu(int *avail_size,
			struct kfd_node *kdev,
			struct crat_subtype_iolink *sub_type_hdr,
			uint32_t proximity_domain)
{
	*avail_size -= sizeof(struct crat_subtype_iolink);
	if (*avail_size < 0)
		return -ENOMEM;

	memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink));

	/* Fill in subtype header data */
	sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;
	sub_type_hdr->length = sizeof(struct crat_subtype_iolink);
	sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED;
	if (kfd_dev_is_large_bar(kdev))
		sub_type_hdr->flags |= CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;

	/* Fill in IOLINK subtype.
	 * TODO: Fill-in other fields of iolink subtype
	 */
	if (kdev->adev->gmc.xgmi.connected_to_cpu ||
	    (KFD_GC_VERSION(kdev) == IP_VERSION(9, 4, 3) &&
	     kdev->adev->smuio.funcs->get_pkg_type(kdev->adev) ==
	     AMDGPU_PKG_TYPE_APU)) {
		bool ext_cpu = KFD_GC_VERSION(kdev) != IP_VERSION(9, 4, 3);
		int mem_bw = 819200, weight = ext_cpu ? KFD_CRAT_XGMI_WEIGHT :
							KFD_CRAT_INTRA_SOCKET_WEIGHT;
		uint32_t bandwidth = ext_cpu ? amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(
							kdev->adev, NULL, true) : mem_bw;

		/*
		 * With a host-GPU XGMI link, the host can access GPU memory
		 * whether or not the PCIe BAR is large, so always create a
		 * bidirectional IO link.
		 */
		sub_type_hdr->flags |= CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;
		sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI;
		sub_type_hdr->weight_xgmi = weight;
		sub_type_hdr->minimum_bandwidth_mbs = bandwidth;
		sub_type_hdr->maximum_bandwidth_mbs = bandwidth;
	} else {
		sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS;
		sub_type_hdr->minimum_bandwidth_mbs =
				amdgpu_amdkfd_get_pcie_bandwidth_mbytes(kdev->adev, true);
		sub_type_hdr->maximum_bandwidth_mbs =
				amdgpu_amdkfd_get_pcie_bandwidth_mbytes(kdev->adev, false);
	}

	sub_type_hdr->proximity_domain_from = proximity_domain;

#ifdef CONFIG_ACPI_NUMA
	if (kdev->adev->pdev->dev.numa_node == NUMA_NO_NODE &&
	    num_possible_nodes() > 1)
		kfd_find_numa_node_in_srat(kdev);
#endif
#ifdef CONFIG_NUMA
	if (kdev->adev->pdev->dev.numa_node == NUMA_NO_NODE)
		sub_type_hdr->proximity_domain_to = 0;
	else
		sub_type_hdr->proximity_domain_to = kdev->adev->pdev->dev.numa_node;
#else
	sub_type_hdr->proximity_domain_to = 0;
#endif
	return 0;
}

static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size,
			struct kfd_node *kdev,
			struct kfd_node *peer_kdev,
			struct crat_subtype_iolink *sub_type_hdr,
			uint32_t proximity_domain_from,
			uint32_t proximity_domain_to)
{
	bool use_ta_info = kdev->kfd->num_nodes == 1;

	*avail_size -= sizeof(struct crat_subtype_iolink);
	if (*avail_size < 0)
		return -ENOMEM;

	memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink));

	sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;
	sub_type_hdr->length = sizeof(struct crat_subtype_iolink);
	sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED |
			       CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;

	sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI;
	sub_type_hdr->proximity_domain_from = proximity_domain_from;
	sub_type_hdr->proximity_domain_to = proximity_domain_to;

	if (use_ta_info) {
		sub_type_hdr->weight_xgmi = KFD_CRAT_XGMI_WEIGHT *
			amdgpu_amdkfd_get_xgmi_hops_count(kdev->adev, peer_kdev->adev);
		sub_type_hdr->maximum_bandwidth_mbs =
			amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(kdev->adev,
							peer_kdev->adev, false);
		sub_type_hdr->minimum_bandwidth_mbs = sub_type_hdr->maximum_bandwidth_mbs ?
			amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(kdev->adev, NULL, true) : 0;
	} else {
		bool is_single_hop = kdev->kfd == peer_kdev->kfd;
		int weight = is_single_hop ? KFD_CRAT_INTRA_SOCKET_WEIGHT :
			(2 * KFD_CRAT_INTRA_SOCKET_WEIGHT) + KFD_CRAT_XGMI_WEIGHT;
		int mem_bw = 819200;

		sub_type_hdr->weight_xgmi = weight;
		sub_type_hdr->maximum_bandwidth_mbs = is_single_hop ? mem_bw : 0;
		sub_type_hdr->minimum_bandwidth_mbs = is_single_hop ? mem_bw : 0;
	}

	return 0;
}
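/* Example of how the function above is driven: for the fourth GPU of a hive
 * (proximity_domain 3), the loop in kfd_create_vcrat_image_gpu() below calls
 * kfd_fill_gpu_xgmi_link_to_gpu() for nid 0, 1 and 2, emitting the 3->0,
 * 3->1 and 3->2 links; the reverse links are created later by
 * kfd_parse_subtype_iolink().
 */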
/* kfd_create_vcrat_image_gpu - Create Virtual CRAT for GPU
 *
 * @pcrat_image: Fill in VCRAT for GPU
 * @size:	[IN] allocated size of crat_image.
 *		[OUT] actual size of data filled in crat_image
 */
static int kfd_create_vcrat_image_gpu(void *pcrat_image,
				      size_t *size, struct kfd_node *kdev,
				      uint32_t proximity_domain)
{
	struct crat_header *crat_table = (struct crat_header *)pcrat_image;
	struct crat_subtype_generic *sub_type_hdr;
	struct kfd_local_mem_info local_mem_info;
	struct kfd_topology_device *peer_dev;
	struct crat_subtype_computeunit *cu;
	struct kfd_cu_info cu_info;
	int avail_size = *size;
	uint32_t total_num_of_cu;
	uint32_t nid = 0;
	int ret = 0;

	if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_GPU)
		return -EINVAL;

	/* Fill the CRAT Header.
	 * Modify length and total_entries as subunits are added.
	 */
	avail_size -= sizeof(struct crat_header);
	if (avail_size < 0)
		return -ENOMEM;

	memset(crat_table, 0, sizeof(struct crat_header));

	memcpy(&crat_table->signature, CRAT_SIGNATURE,
			sizeof(crat_table->signature));
	/* Change length as we add more subtypes */
	crat_table->length = sizeof(struct crat_header);
	crat_table->num_domains = 1;
	crat_table->total_entries = 0;

	/* Fill in Subtype: Compute Unit
	 * First fill in the sub type header and then the sub type data
	 */
	avail_size -= sizeof(struct crat_subtype_computeunit);
	if (avail_size < 0)
		return -ENOMEM;

	sub_type_hdr = (struct crat_subtype_generic *)(crat_table + 1);
	memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit));

	sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY;
	sub_type_hdr->length = sizeof(struct crat_subtype_computeunit);
	sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;

	/* Fill CU subtype data */
	cu = (struct crat_subtype_computeunit *)sub_type_hdr;
	cu->flags |= CRAT_CU_FLAGS_GPU_PRESENT;
	cu->proximity_domain = proximity_domain;

	amdgpu_amdkfd_get_cu_info(kdev->adev, &cu_info);
	cu->num_simd_per_cu = cu_info.simd_per_cu;
	cu->num_simd_cores = cu_info.simd_per_cu * cu_info.cu_active_number;
	cu->max_waves_simd = cu_info.max_waves_per_simd;

	cu->wave_front_size = cu_info.wave_front_size;
	cu->array_count = cu_info.num_shader_arrays_per_engine *
		cu_info.num_shader_engines;
	total_num_of_cu = (cu->array_count * cu_info.num_cu_per_sh);
	cu->processor_id_low = get_and_inc_gpu_processor_id(total_num_of_cu);
	cu->num_cu_per_array = cu_info.num_cu_per_sh;
	cu->max_slots_scatch_cu = cu_info.max_scratch_slots_per_cu;
	cu->num_banks = cu_info.num_shader_engines;
	cu->lds_size_in_kb = cu_info.lds_size;

	cu->hsa_capability = 0;

	/* Check if this node supports IOMMU. During parsing this flag will
	 * translate to HSA_CAP_ATS_PRESENT
	 */
	if (!kfd_iommu_check_device(kdev->kfd))
		cu->hsa_capability |= CRAT_CU_FLAGS_IOMMU_PRESENT;

	crat_table->length += sub_type_hdr->length;
	crat_table->total_entries++;

	/* Fill in Subtype: Memory. Only on systems with large BAR (no
	 * private FB), report memory as public. On other systems
	 * report the total FB size (public+private) as a single
	 * private heap.
	 */
	local_mem_info = kdev->local_mem_info;
	sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
			sub_type_hdr->length);

	if (debug_largebar)
		local_mem_info.local_mem_size_private = 0;

	if (local_mem_info.local_mem_size_private == 0)
		ret = kfd_fill_gpu_memory_affinity(&avail_size,
				kdev, HSA_MEM_HEAP_TYPE_FB_PUBLIC,
				local_mem_info.local_mem_size_public,
				(struct crat_subtype_memory *)sub_type_hdr,
				proximity_domain,
				&local_mem_info);
	else
		ret = kfd_fill_gpu_memory_affinity(&avail_size,
				kdev, HSA_MEM_HEAP_TYPE_FB_PRIVATE,
				local_mem_info.local_mem_size_public +
				local_mem_info.local_mem_size_private,
				(struct crat_subtype_memory *)sub_type_hdr,
				proximity_domain,
				&local_mem_info);
	if (ret < 0)
		return ret;

	crat_table->length += sizeof(struct crat_subtype_memory);
	crat_table->total_entries++;

	/* Fill in Subtype: IO_LINKS
	 * Only direct links are added here, i.e. the link from the GPU to
	 * its NUMA node. Indirect links are added by userspace.
	 */
	sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
		sub_type_hdr->length);
	ret = kfd_fill_gpu_direct_io_link_to_cpu(&avail_size, kdev,
		(struct crat_subtype_iolink *)sub_type_hdr, proximity_domain);

	if (ret < 0)
		return ret;

	crat_table->length += sub_type_hdr->length;
	crat_table->total_entries++;

	/* Fill in Subtype: IO_LINKS
	 * Direct links from this GPU to other GPUs through xGMI.
	 * Loop over the GPUs that have already been processed (those with a
	 * lower proximity_domain value) and add a link for each GPU in the
	 * same hive (from this GPU to the other GPU). The reversed IO link
	 * (from the other GPU to this GPU) will be added
	 * in kfd_parse_subtype_iolink.
	 */
	if (kdev->kfd->hive_id) {
		for (nid = 0; nid < proximity_domain; ++nid) {
			peer_dev = kfd_topology_device_by_proximity_domain_no_lock(nid);
			if (!peer_dev->gpu)
				continue;
			if (peer_dev->gpu->kfd->hive_id != kdev->kfd->hive_id)
				continue;
			sub_type_hdr = (typeof(sub_type_hdr))(
				(char *)sub_type_hdr +
				sizeof(struct crat_subtype_iolink));
			ret = kfd_fill_gpu_xgmi_link_to_gpu(
				&avail_size, kdev, peer_dev->gpu,
				(struct crat_subtype_iolink *)sub_type_hdr,
				proximity_domain, nid);
			if (ret < 0)
				return ret;
			crat_table->length += sub_type_hdr->length;
			crat_table->total_entries++;
		}
	}
	*size = crat_table->length;
	pr_info("Virtual CRAT table created for GPU\n");

	return ret;
}
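/* The image built above is one contiguous buffer:
 *
 *   crat_header | computeunit | memory | iolink (to CPU) | iolink (xGMI) ...
 *
 * crat_table->length grows as each subtype is appended and the final value
 * is returned through *size.
 */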
/* kfd_create_crat_image_virtual - Allocates memory for CRAT image and
 * creates a Virtual CRAT (VCRAT) image
 *
 * NOTE: Call kfd_destroy_crat_image to free CRAT image memory
 *
 * @crat_image: VCRAT image created because ACPI does not have a
 *		CRAT for this device
 * @size: [OUT] size of virtual crat_image
 * @flags:	COMPUTE_UNIT_CPU - Create VCRAT for CPU device
 *		COMPUTE_UNIT_GPU - Create VCRAT for GPU
 *		(COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU) - Create VCRAT for APU
 *			-- this option is not currently implemented.
 *			The assumption is that all AMD APUs will have CRAT
 * @kdev: Valid kfd_node required if flags contain COMPUTE_UNIT_GPU
 *
 * Return 0 if successful else return -ve value
 */
int kfd_create_crat_image_virtual(void **crat_image, size_t *size,
				  int flags, struct kfd_node *kdev,
				  uint32_t proximity_domain)
{
	void *pcrat_image = NULL;
	int ret = 0, num_nodes;
	size_t dyn_size;

	if (!crat_image)
		return -EINVAL;

	*crat_image = NULL;

	/* Allocate the CPU Virtual CRAT image based on the number of online
	 * NUMA nodes. Allocate VCRAT_SIZE_FOR_GPU for the GPU virtual CRAT
	 * image. This should cover all current conditions; a check is in
	 * place so that GPUs do not write beyond the allocated size.
	 */
	switch (flags) {
	case COMPUTE_UNIT_CPU:
		num_nodes = num_online_nodes();
		dyn_size = sizeof(struct crat_header) +
			num_nodes * (sizeof(struct crat_subtype_computeunit) +
			sizeof(struct crat_subtype_memory) +
			(num_nodes - 1) * sizeof(struct crat_subtype_iolink));
		pcrat_image = kvmalloc(dyn_size, GFP_KERNEL);
		if (!pcrat_image)
			return -ENOMEM;
		*size = dyn_size;
		pr_debug("CRAT size is %zu\n", dyn_size);
		ret = kfd_create_vcrat_image_cpu(pcrat_image, size);
		break;
	case COMPUTE_UNIT_GPU:
		if (!kdev)
			return -EINVAL;
		pcrat_image = kvmalloc(VCRAT_SIZE_FOR_GPU, GFP_KERNEL);
		if (!pcrat_image)
			return -ENOMEM;
		*size = VCRAT_SIZE_FOR_GPU;
		ret = kfd_create_vcrat_image_gpu(pcrat_image, size, kdev,
						 proximity_domain);
		break;
	case (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU):
		/* TODO: */
		ret = -EINVAL;
		pr_err("VCRAT not implemented for APU\n");
		break;
	default:
		ret = -EINVAL;
	}

	if (!ret)
		*crat_image = pcrat_image;
	else
		kvfree(pcrat_image);

	return ret;
}
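/* Worked example of the CPU sizing above: on a two-node system,
 * dyn_size = sizeof(crat_header) + 2 * (sizeof(computeunit) +
 * sizeof(memory) + 1 * sizeof(iolink)), i.e. one compute-unit and one
 * memory subtype per node plus one IO link from each node to the other.
 */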
/* kfd_destroy_crat_image
 *
 * @crat_image: [IN] - crat_image from kfd_create_crat_image_xxx(..)
 *
 */
void kfd_destroy_crat_image(void *crat_image)
{
	kvfree(crat_image);
}
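/* Illustrative only: a minimal sketch of the create/parse/destroy lifecycle
 * of the two exported functions above. The function name and the elided
 * parsing step are hypothetical; the real caller is the KFD topology code.
 */
static int __maybe_unused example_vcrat_lifecycle(struct kfd_node *kdev,
						  uint32_t proximity_domain)
{
	void *crat_image = NULL;
	size_t image_size = 0;
	int ret;

	ret = kfd_create_crat_image_virtual(&crat_image, &image_size,
					    COMPUTE_UNIT_GPU, kdev,
					    proximity_domain);
	if (ret)
		return ret;

	/* ... hand crat_image/image_size to the CRAT parser here ... */

	kfd_destroy_crat_image(crat_image);
	return 0;
}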