/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/devcoredump.h>
#include <generated/utsrelease.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_aperture.h>
#include <drm/drm_atomic_helper.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_fb_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86)
#include <asm/intel-family.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
"KABINI", 111 "HAWAII", 112 "MULLINS", 113 "TOPAZ", 114 "TONGA", 115 "FIJI", 116 "CARRIZO", 117 "STONEY", 118 "POLARIS10", 119 "POLARIS11", 120 "POLARIS12", 121 "VEGAM", 122 "VEGA10", 123 "VEGA12", 124 "VEGA20", 125 "RAVEN", 126 "ARCTURUS", 127 "RENOIR", 128 "ALDEBARAN", 129 "NAVI10", 130 "CYAN_SKILLFISH", 131 "NAVI14", 132 "NAVI12", 133 "SIENNA_CICHLID", 134 "NAVY_FLOUNDER", 135 "VANGOGH", 136 "DIMGREY_CAVEFISH", 137 "BEIGE_GOBY", 138 "YELLOW_CARP", 139 "IP DISCOVERY", 140 "LAST", 141 }; 142 143 /** 144 * DOC: pcie_replay_count 145 * 146 * The amdgpu driver provides a sysfs API for reporting the total number 147 * of PCIe replays (NAKs) 148 * The file pcie_replay_count is used for this and returns the total 149 * number of replays as a sum of the NAKs generated and NAKs received 150 */ 151 152 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 153 struct device_attribute *attr, char *buf) 154 { 155 struct drm_device *ddev = dev_get_drvdata(dev); 156 struct amdgpu_device *adev = drm_to_adev(ddev); 157 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 158 159 return sysfs_emit(buf, "%llu\n", cnt); 160 } 161 162 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 163 amdgpu_device_get_pcie_replay_count, NULL); 164 165 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 166 167 /** 168 * DOC: product_name 169 * 170 * The amdgpu driver provides a sysfs API for reporting the product name 171 * for the device 172 * The file product_name is used for this and returns the product name 173 * as returned from the FRU. 174 * NOTE: This is only available for certain server cards 175 */ 176 177 static ssize_t amdgpu_device_get_product_name(struct device *dev, 178 struct device_attribute *attr, char *buf) 179 { 180 struct drm_device *ddev = dev_get_drvdata(dev); 181 struct amdgpu_device *adev = drm_to_adev(ddev); 182 183 return sysfs_emit(buf, "%s\n", adev->product_name); 184 } 185 186 static DEVICE_ATTR(product_name, S_IRUGO, 187 amdgpu_device_get_product_name, NULL); 188 189 /** 190 * DOC: product_number 191 * 192 * The amdgpu driver provides a sysfs API for reporting the part number 193 * for the device 194 * The file product_number is used for this and returns the part number 195 * as returned from the FRU. 196 * NOTE: This is only available for certain server cards 197 */ 198 199 static ssize_t amdgpu_device_get_product_number(struct device *dev, 200 struct device_attribute *attr, char *buf) 201 { 202 struct drm_device *ddev = dev_get_drvdata(dev); 203 struct amdgpu_device *adev = drm_to_adev(ddev); 204 205 return sysfs_emit(buf, "%s\n", adev->product_number); 206 } 207 208 static DEVICE_ATTR(product_number, S_IRUGO, 209 amdgpu_device_get_product_number, NULL); 210 211 /** 212 * DOC: serial_number 213 * 214 * The amdgpu driver provides a sysfs API for reporting the serial number 215 * for the device 216 * The file serial_number is used for this and returns the serial number 217 * as returned from the FRU. 
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_serial_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->serial);
}

static DEVICE_ATTR(serial_number, S_IRUGO,
		amdgpu_device_get_serial_number, NULL);

/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise returns false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the size of @buf must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by the vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the size of @buf must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the size of @buf must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the VRAM aperture to access VRAM first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of VRAM */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write a register either with direct/indirect mmio or with the RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 *
 * This function is invoked only for the debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_mm_rdoorbell - read a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (CIK).
 */
u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (index < adev->doorbell.num_kernel_doorbells) {
		return readl(adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell - write a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (CIK).
 */
void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (index < adev->doorbell.num_kernel_doorbells) {
		writel(v, adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (index < adev->doorbell.num_kernel_doorbells) {
		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (index < adev->doorbell.num_kernel_doorbells) {
		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

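/**
 * amdgpu_device_indirect_rreg_ext - read an indirect register with an
 *				     extended (64-bit) register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from; bits above 31 are
 *	      written to the PCIE index high register when the nbio
 *	      get_pcie_index_hi_offset callback is available
 *
 * Returns the value of indirect register @reg_addr
 */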
u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if (adev->nbio.funcs->get_pcie_index_hi_offset)
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64-bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

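/**
 * amdgpu_device_indirect_wreg_ext - write an indirect register with an
 *				     extended (64-bit) register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset; bits above 31 are written to the
 *	      PCIE index high register when the nbio
 *	      get_pcie_index_hi_offset callback is available
 * @reg_data: indirect register data
 */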
void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if (adev->nbio.funcs->get_pcie_index_hi_offset)
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64-bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL),
	       pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

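/**
 * amdgpu_invalid_rreg_ext - dummy extended reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */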
static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

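/**
 * amdgpu_invalid_wreg_ext - dummy extended reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */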
static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	amdgpu_asic_pre_asic_init(adev);

	if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) ||
	    adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0))
		return amdgpu_atomfirmware_asic_init(adev, true);
	else
		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * GPU doorbell aperture helper functions.
 */
/**
 * amdgpu_device_doorbell_init - Init doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Init doorbell driver information (CIK)
 * Returns 0 on success, error on failure.
 */
static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
{

	/* No doorbell on SI hardware generation */
	if (adev->asic_type < CHIP_BONAIRE) {
		adev->doorbell.base = 0;
		adev->doorbell.size = 0;
		adev->doorbell.num_kernel_doorbells = 0;
		adev->doorbell.ptr = NULL;
		return 0;
	}

	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
		return -EINVAL;

	amdgpu_asic_init_doorbell_index(adev);

	/* doorbell bar mapping */
	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
	adev->doorbell.size = pci_resource_len(adev->pdev, 2);

	if (adev->enable_mes) {
		adev->doorbell.num_kernel_doorbells =
			adev->doorbell.size / sizeof(u32);
	} else {
		adev->doorbell.num_kernel_doorbells =
			min_t(u32, adev->doorbell.size / sizeof(u32),
			      adev->doorbell_index.max_assignment + 1);
		if (adev->doorbell.num_kernel_doorbells == 0)
			return -EINVAL;

		/* For Vega, reserve and map two pages on doorbell BAR since SDMA
		 * paging queue doorbell uses the second page. The
		 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
		 * doorbells are in the first page. So with paging queue enabled,
		 * the max num_kernel_doorbells should + 1 page (0x400 in dword)
		 */
		if (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(4, 0, 0) &&
		    adev->ip_versions[SDMA0_HWIP][0] < IP_VERSION(4, 2, 0))
			adev->doorbell.num_kernel_doorbells += 0x400;
	}

	adev->doorbell.ptr = ioremap(adev->doorbell.base,
				     adev->doorbell.num_kernel_doorbells *
				     sizeof(u32));
	if (adev->doorbell.ptr == NULL)
		return -ENOMEM;

	return 0;
}

/**
 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down doorbell driver information (CIK)
 */
static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
{
	iounmap(adev->doorbell.ptr);
	adev->doorbell.ptr = NULL;
}



/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned i;
	u16 cmd;
	int r;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_device_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_device_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

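/**
 * amdgpu_device_read_bios - check whether the video BIOS should be read
 *
 * @adev: amdgpu_device pointer
 *
 * Returns false for APUs with a non-zero AID mask, true otherwise.
 * Helper used to decide whether the driver should try to fetch a video
 * BIOS image for this device.
 */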
static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) {
		return false;
	}

	return true;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or if a post is needed because a hw reset was performed.
 * Returns true if a post is needed, false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (!amdgpu_device_read_bios(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old smc fw still need driver do vPost otherwise gpu hang, while
		 * those smc fw version above 22.15 doesn't have this flaw, so we force
		 * vpost executed for smc version below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->gmc.xgmi.pending_reset)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	return pcie_aspm_enabled(adev->pdev);
}

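/**
 * amdgpu_device_aspm_support_quirk - check for known ASPM platform quirks
 *
 * Returns false if ASPM should be avoided on this host platform (currently
 * Intel Alder Lake CPUs), true otherwise.
 */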
bool amdgpu_device_aspm_support_quirk(void)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE);
#else
	return true;
#endif
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
		bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines number of bits in page table versus page directory,
 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
 * page table and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

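/**
 * amdgpu_device_check_smu_prv_buffer_size - validate the SMU memory pool size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the amdgpu_smu_memory_pool_size module parameter against the
 * amount of system memory and sets adev->pm.smu_prv_buffer_size accordingly
 * (0 when the requested pool size cannot be honored).
 */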
static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

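/**
 * amdgpu_device_init_apu_flags - set APU identification flags
 *
 * @adev: amdgpu_device pointer
 *
 * Sets the AMD_APU_IS_* flags in adev->apu_flags based on the asic type and
 * PCI device ID so the rest of the driver can tell APU variants apart.
 * Returns 0.
 */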
static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if ((adev->pdev->device == 0x13FE) ||
		    (adev->pdev->device == 0x143F))
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		break;
	}

	return 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
		dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
		amdgpu_reset_method = -1;
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	return 0;
}

/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(pdev, PCI_D0);
		amdgpu_device_load_pci_state(pdev);
		r = pci_enable_device(pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
	} else {
		pr_info("switched off\n");
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(pdev);
		/* Shut down the device */
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u64 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
1931 */ 1932 struct amdgpu_ip_block * 1933 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 1934 enum amd_ip_block_type type) 1935 { 1936 int i; 1937 1938 for (i = 0; i < adev->num_ip_blocks; i++) 1939 if (adev->ip_blocks[i].version->type == type) 1940 return &adev->ip_blocks[i]; 1941 1942 return NULL; 1943 } 1944 1945 /** 1946 * amdgpu_device_ip_block_version_cmp 1947 * 1948 * @adev: amdgpu_device pointer 1949 * @type: enum amd_ip_block_type 1950 * @major: major version 1951 * @minor: minor version 1952 * 1953 * return 0 if equal or greater 1954 * return 1 if smaller or the ip_block doesn't exist 1955 */ 1956 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 1957 enum amd_ip_block_type type, 1958 u32 major, u32 minor) 1959 { 1960 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 1961 1962 if (ip_block && ((ip_block->version->major > major) || 1963 ((ip_block->version->major == major) && 1964 (ip_block->version->minor >= minor)))) 1965 return 0; 1966 1967 return 1; 1968 } 1969 1970 /** 1971 * amdgpu_device_ip_block_add 1972 * 1973 * @adev: amdgpu_device pointer 1974 * @ip_block_version: pointer to the IP to add 1975 * 1976 * Adds the IP block driver information to the collection of IPs 1977 * on the asic. 1978 */ 1979 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 1980 const struct amdgpu_ip_block_version *ip_block_version) 1981 { 1982 if (!ip_block_version) 1983 return -EINVAL; 1984 1985 switch (ip_block_version->type) { 1986 case AMD_IP_BLOCK_TYPE_VCN: 1987 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 1988 return 0; 1989 break; 1990 case AMD_IP_BLOCK_TYPE_JPEG: 1991 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 1992 return 0; 1993 break; 1994 default: 1995 break; 1996 } 1997 1998 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 1999 ip_block_version->funcs->name); 2000 2001 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2002 2003 return 0; 2004 } 2005 2006 /** 2007 * amdgpu_device_enable_virtual_display - enable virtual display feature 2008 * 2009 * @adev: amdgpu_device pointer 2010 * 2011 * Enabled the virtual display feature if the user has enabled it via 2012 * the module parameter virtual_display. This feature provides a virtual 2013 * display hardware on headless boards or in virtualized environments. 2014 * This function parses and validates the configuration string specified by 2015 * the user and configues the virtual display configuration (number of 2016 * virtual connectors, crtcs, etc.) specified. 
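 *
 * Judging from the parsing below, the string is a semicolon-separated list
 * of entries of the form "<pci bus id>[,<crtc count>]", where "all" matches
 * every device and the crtc count is clamped to the range 1-6 (defaulting
 * to 1 when omitted or unparsable). An illustrative value:
 *
 *   virtual_display=0000:01:00.0,2;0000:02:00.0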
2017 */ 2018 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2019 { 2020 adev->enable_virtual_display = false; 2021 2022 if (amdgpu_virtual_display) { 2023 const char *pci_address_name = pci_name(adev->pdev); 2024 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2025 2026 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2027 pciaddstr_tmp = pciaddstr; 2028 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2029 pciaddname = strsep(&pciaddname_tmp, ","); 2030 if (!strcmp("all", pciaddname) 2031 || !strcmp(pci_address_name, pciaddname)) { 2032 long num_crtc; 2033 int res = -1; 2034 2035 adev->enable_virtual_display = true; 2036 2037 if (pciaddname_tmp) 2038 res = kstrtol(pciaddname_tmp, 10, 2039 &num_crtc); 2040 2041 if (!res) { 2042 if (num_crtc < 1) 2043 num_crtc = 1; 2044 if (num_crtc > 6) 2045 num_crtc = 6; 2046 adev->mode_info.num_crtc = num_crtc; 2047 } else { 2048 adev->mode_info.num_crtc = 1; 2049 } 2050 break; 2051 } 2052 } 2053 2054 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2055 amdgpu_virtual_display, pci_address_name, 2056 adev->enable_virtual_display, adev->mode_info.num_crtc); 2057 2058 kfree(pciaddstr); 2059 } 2060 } 2061 2062 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2063 { 2064 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2065 adev->mode_info.num_crtc = 1; 2066 adev->enable_virtual_display = true; 2067 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2068 adev->enable_virtual_display, adev->mode_info.num_crtc); 2069 } 2070 } 2071 2072 /** 2073 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2074 * 2075 * @adev: amdgpu_device pointer 2076 * 2077 * Parses the asic configuration parameters specified in the gpu info 2078 * firmware and makes them availale to the driver for use in configuring 2079 * the asic. 2080 * Returns 0 on success, -EINVAL on failure. 2081 */ 2082 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2083 { 2084 const char *chip_name; 2085 char fw_name[40]; 2086 int err; 2087 const struct gpu_info_firmware_header_v1_0 *hdr; 2088 2089 adev->firmware.gpu_info_fw = NULL; 2090 2091 if (adev->mman.discovery_bin) { 2092 /* 2093 * FIXME: The bounding box is still needed by Navi12, so 2094 * temporarily read it from gpu_info firmware. Should be dropped 2095 * when DAL no longer needs it. 
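		 *
		 * Every other ASIC that carries an IP discovery blob simply
		 * returns early below and never touches the gpu_info firmware.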
2096 */ 2097 if (adev->asic_type != CHIP_NAVI12) 2098 return 0; 2099 } 2100 2101 switch (adev->asic_type) { 2102 default: 2103 return 0; 2104 case CHIP_VEGA10: 2105 chip_name = "vega10"; 2106 break; 2107 case CHIP_VEGA12: 2108 chip_name = "vega12"; 2109 break; 2110 case CHIP_RAVEN: 2111 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2112 chip_name = "raven2"; 2113 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2114 chip_name = "picasso"; 2115 else 2116 chip_name = "raven"; 2117 break; 2118 case CHIP_ARCTURUS: 2119 chip_name = "arcturus"; 2120 break; 2121 case CHIP_NAVI12: 2122 chip_name = "navi12"; 2123 break; 2124 } 2125 2126 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 2127 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name); 2128 if (err) { 2129 dev_err(adev->dev, 2130 "Failed to get gpu_info firmware \"%s\"\n", 2131 fw_name); 2132 goto out; 2133 } 2134 2135 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2136 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2137 2138 switch (hdr->version_major) { 2139 case 1: 2140 { 2141 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2142 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2143 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2144 2145 /* 2146 * Should be droped when DAL no longer needs it. 2147 */ 2148 if (adev->asic_type == CHIP_NAVI12) 2149 goto parse_soc_bounding_box; 2150 2151 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2152 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2153 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2154 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2155 adev->gfx.config.max_texture_channel_caches = 2156 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2157 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2158 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2159 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2160 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2161 adev->gfx.config.double_offchip_lds_buf = 2162 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2163 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2164 adev->gfx.cu_info.max_waves_per_simd = 2165 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2166 adev->gfx.cu_info.max_scratch_slots_per_cu = 2167 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2168 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2169 if (hdr->version_minor >= 1) { 2170 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2171 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2172 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2173 adev->gfx.config.num_sc_per_sh = 2174 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2175 adev->gfx.config.num_packer_per_sc = 2176 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2177 } 2178 2179 parse_soc_bounding_box: 2180 /* 2181 * soc bounding box info is not integrated in disocovery table, 2182 * we always need to parse it from gpu info firmware if needed. 
2183 */ 2184 if (hdr->version_minor == 2) { 2185 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2186 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2187 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2188 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2189 } 2190 break; 2191 } 2192 default: 2193 dev_err(adev->dev, 2194 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2195 err = -EINVAL; 2196 goto out; 2197 } 2198 out: 2199 return err; 2200 } 2201 2202 /** 2203 * amdgpu_device_ip_early_init - run early init for hardware IPs 2204 * 2205 * @adev: amdgpu_device pointer 2206 * 2207 * Early initialization pass for hardware IPs. The hardware IPs that make 2208 * up each asic are discovered each IP's early_init callback is run. This 2209 * is the first stage in initializing the asic. 2210 * Returns 0 on success, negative error code on failure. 2211 */ 2212 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2213 { 2214 struct drm_device *dev = adev_to_drm(adev); 2215 struct pci_dev *parent; 2216 int i, r; 2217 bool total; 2218 2219 amdgpu_device_enable_virtual_display(adev); 2220 2221 if (amdgpu_sriov_vf(adev)) { 2222 r = amdgpu_virt_request_full_gpu(adev, true); 2223 if (r) 2224 return r; 2225 } 2226 2227 switch (adev->asic_type) { 2228 #ifdef CONFIG_DRM_AMDGPU_SI 2229 case CHIP_VERDE: 2230 case CHIP_TAHITI: 2231 case CHIP_PITCAIRN: 2232 case CHIP_OLAND: 2233 case CHIP_HAINAN: 2234 adev->family = AMDGPU_FAMILY_SI; 2235 r = si_set_ip_blocks(adev); 2236 if (r) 2237 return r; 2238 break; 2239 #endif 2240 #ifdef CONFIG_DRM_AMDGPU_CIK 2241 case CHIP_BONAIRE: 2242 case CHIP_HAWAII: 2243 case CHIP_KAVERI: 2244 case CHIP_KABINI: 2245 case CHIP_MULLINS: 2246 if (adev->flags & AMD_IS_APU) 2247 adev->family = AMDGPU_FAMILY_KV; 2248 else 2249 adev->family = AMDGPU_FAMILY_CI; 2250 2251 r = cik_set_ip_blocks(adev); 2252 if (r) 2253 return r; 2254 break; 2255 #endif 2256 case CHIP_TOPAZ: 2257 case CHIP_TONGA: 2258 case CHIP_FIJI: 2259 case CHIP_POLARIS10: 2260 case CHIP_POLARIS11: 2261 case CHIP_POLARIS12: 2262 case CHIP_VEGAM: 2263 case CHIP_CARRIZO: 2264 case CHIP_STONEY: 2265 if (adev->flags & AMD_IS_APU) 2266 adev->family = AMDGPU_FAMILY_CZ; 2267 else 2268 adev->family = AMDGPU_FAMILY_VI; 2269 2270 r = vi_set_ip_blocks(adev); 2271 if (r) 2272 return r; 2273 break; 2274 default: 2275 r = amdgpu_discovery_set_ip_blocks(adev); 2276 if (r) 2277 return r; 2278 break; 2279 } 2280 2281 if (amdgpu_has_atpx() && 2282 (amdgpu_is_atpx_hybrid() || 2283 amdgpu_has_atpx_dgpu_power_cntl()) && 2284 ((adev->flags & AMD_IS_APU) == 0) && 2285 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev))) 2286 adev->flags |= AMD_IS_PX; 2287 2288 if (!(adev->flags & AMD_IS_APU)) { 2289 parent = pci_upstream_bridge(adev->pdev); 2290 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2291 } 2292 2293 2294 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2295 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2296 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2297 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2298 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2299 2300 total = true; 2301 for (i = 0; i < adev->num_ip_blocks; i++) { 2302 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2303 DRM_WARN("disabled ip block: %d <%s>\n", 2304 i, adev->ip_blocks[i].version->funcs->name); 2305 adev->ip_blocks[i].status.valid = false; 2306 } else { 2307 if (adev->ip_blocks[i].version->funcs->early_init) { 2308 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2309 if (r == -ENOENT) { 2310 adev->ip_blocks[i].status.valid = false; 2311 } else if (r) { 2312 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2313 adev->ip_blocks[i].version->funcs->name, r); 2314 total = false; 2315 } else { 2316 adev->ip_blocks[i].status.valid = true; 2317 } 2318 } else { 2319 adev->ip_blocks[i].status.valid = true; 2320 } 2321 } 2322 /* get the vbios after the asic_funcs are set up */ 2323 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2324 r = amdgpu_device_parse_gpu_info_fw(adev); 2325 if (r) 2326 return r; 2327 2328 /* Read BIOS */ 2329 if (amdgpu_device_read_bios(adev)) { 2330 if (!amdgpu_get_bios(adev)) 2331 return -EINVAL; 2332 2333 r = amdgpu_atombios_init(adev); 2334 if (r) { 2335 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2336 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2337 return r; 2338 } 2339 } 2340 2341 /*get pf2vf msg info at it's earliest time*/ 2342 if (amdgpu_sriov_vf(adev)) 2343 amdgpu_virt_init_data_exchange(adev); 2344 2345 } 2346 } 2347 if (!total) 2348 return -ENODEV; 2349 2350 amdgpu_amdkfd_device_probe(adev); 2351 adev->cg_flags &= amdgpu_cg_mask; 2352 adev->pg_flags &= amdgpu_pg_mask; 2353 2354 return 0; 2355 } 2356 2357 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2358 { 2359 int i, r; 2360 2361 for (i = 0; i < adev->num_ip_blocks; i++) { 2362 if (!adev->ip_blocks[i].status.sw) 2363 continue; 2364 if (adev->ip_blocks[i].status.hw) 2365 continue; 2366 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2367 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2368 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2369 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2370 if (r) { 2371 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2372 adev->ip_blocks[i].version->funcs->name, r); 2373 return r; 2374 } 2375 adev->ip_blocks[i].status.hw = true; 2376 } 2377 } 2378 2379 return 0; 2380 } 2381 2382 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2383 { 2384 int i, r; 2385 2386 for (i = 0; i < adev->num_ip_blocks; i++) { 2387 if (!adev->ip_blocks[i].status.sw) 2388 continue; 2389 if (adev->ip_blocks[i].status.hw) 2390 continue; 2391 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2392 if (r) { 2393 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2394 adev->ip_blocks[i].version->funcs->name, r); 2395 return r; 2396 } 2397 adev->ip_blocks[i].status.hw = true; 2398 } 2399 2400 return 0; 2401 } 2402 2403 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2404 { 2405 int r = 0; 2406 int i; 2407 uint32_t smu_version; 2408 2409 if (adev->asic_type >= CHIP_VEGA10) { 2410 for (i = 0; i < adev->num_ip_blocks; i++) { 2411 if 
(adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2412 continue; 2413 2414 if (!adev->ip_blocks[i].status.sw) 2415 continue; 2416 2417 /* no need to do the fw loading again if already done*/ 2418 if (adev->ip_blocks[i].status.hw == true) 2419 break; 2420 2421 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2422 r = adev->ip_blocks[i].version->funcs->resume(adev); 2423 if (r) { 2424 DRM_ERROR("resume of IP block <%s> failed %d\n", 2425 adev->ip_blocks[i].version->funcs->name, r); 2426 return r; 2427 } 2428 } else { 2429 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2430 if (r) { 2431 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2432 adev->ip_blocks[i].version->funcs->name, r); 2433 return r; 2434 } 2435 } 2436 2437 adev->ip_blocks[i].status.hw = true; 2438 break; 2439 } 2440 } 2441 2442 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2443 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2444 2445 return r; 2446 } 2447 2448 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2449 { 2450 long timeout; 2451 int r, i; 2452 2453 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2454 struct amdgpu_ring *ring = adev->rings[i]; 2455 2456 /* No need to setup the GPU scheduler for rings that don't need it */ 2457 if (!ring || ring->no_scheduler) 2458 continue; 2459 2460 switch (ring->funcs->type) { 2461 case AMDGPU_RING_TYPE_GFX: 2462 timeout = adev->gfx_timeout; 2463 break; 2464 case AMDGPU_RING_TYPE_COMPUTE: 2465 timeout = adev->compute_timeout; 2466 break; 2467 case AMDGPU_RING_TYPE_SDMA: 2468 timeout = adev->sdma_timeout; 2469 break; 2470 default: 2471 timeout = adev->video_timeout; 2472 break; 2473 } 2474 2475 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2476 ring->num_hw_submission, 0, 2477 timeout, adev->reset_domain->wq, 2478 ring->sched_score, ring->name, 2479 adev->dev); 2480 if (r) { 2481 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2482 ring->name); 2483 return r; 2484 } 2485 } 2486 2487 amdgpu_xcp_update_partition_sched_list(adev); 2488 2489 return 0; 2490 } 2491 2492 2493 /** 2494 * amdgpu_device_ip_init - run init for hardware IPs 2495 * 2496 * @adev: amdgpu_device pointer 2497 * 2498 * Main initialization pass for hardware IPs. The list of all the hardware 2499 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2500 * are run. sw_init initializes the software state associated with each IP 2501 * and hw_init initializes the hardware associated with each IP. 2502 * Returns 0 on success, negative error code on failure. 
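 *
 * The ordering below is deliberate: sw_init runs for every block first, the
 * COMMON and GMC blocks get their hw_init early (so scratch/writeback memory
 * and, when MCBP is enabled, the CSA can be allocated), and the remaining
 * hw_init work is split into two phases with firmware loading in between.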
2503 */ 2504 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2505 { 2506 int i, r; 2507 2508 r = amdgpu_ras_init(adev); 2509 if (r) 2510 return r; 2511 2512 for (i = 0; i < adev->num_ip_blocks; i++) { 2513 if (!adev->ip_blocks[i].status.valid) 2514 continue; 2515 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2516 if (r) { 2517 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2518 adev->ip_blocks[i].version->funcs->name, r); 2519 goto init_failed; 2520 } 2521 adev->ip_blocks[i].status.sw = true; 2522 2523 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2524 /* need to do common hw init early so everything is set up for gmc */ 2525 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2526 if (r) { 2527 DRM_ERROR("hw_init %d failed %d\n", i, r); 2528 goto init_failed; 2529 } 2530 adev->ip_blocks[i].status.hw = true; 2531 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2532 /* need to do gmc hw init early so we can allocate gpu mem */ 2533 /* Try to reserve bad pages early */ 2534 if (amdgpu_sriov_vf(adev)) 2535 amdgpu_virt_exchange_data(adev); 2536 2537 r = amdgpu_device_mem_scratch_init(adev); 2538 if (r) { 2539 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2540 goto init_failed; 2541 } 2542 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2543 if (r) { 2544 DRM_ERROR("hw_init %d failed %d\n", i, r); 2545 goto init_failed; 2546 } 2547 r = amdgpu_device_wb_init(adev); 2548 if (r) { 2549 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2550 goto init_failed; 2551 } 2552 adev->ip_blocks[i].status.hw = true; 2553 2554 /* right after GMC hw init, we create CSA */ 2555 if (adev->gfx.mcbp) { 2556 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2557 AMDGPU_GEM_DOMAIN_VRAM | 2558 AMDGPU_GEM_DOMAIN_GTT, 2559 AMDGPU_CSA_SIZE); 2560 if (r) { 2561 DRM_ERROR("allocate CSA failed %d\n", r); 2562 goto init_failed; 2563 } 2564 } 2565 } 2566 } 2567 2568 if (amdgpu_sriov_vf(adev)) 2569 amdgpu_virt_init_data_exchange(adev); 2570 2571 r = amdgpu_ib_pool_init(adev); 2572 if (r) { 2573 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2574 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2575 goto init_failed; 2576 } 2577 2578 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2579 if (r) 2580 goto init_failed; 2581 2582 r = amdgpu_device_ip_hw_init_phase1(adev); 2583 if (r) 2584 goto init_failed; 2585 2586 r = amdgpu_device_fw_loading(adev); 2587 if (r) 2588 goto init_failed; 2589 2590 r = amdgpu_device_ip_hw_init_phase2(adev); 2591 if (r) 2592 goto init_failed; 2593 2594 /* 2595 * retired pages will be loaded from eeprom and reserved here, 2596 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2597 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2598 * for I2C communication which only true at this point. 2599 * 2600 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2601 * failure from bad gpu situation and stop amdgpu init process 2602 * accordingly. For other failed cases, it will still release all 2603 * the resource and print error message, rather than returning one 2604 * negative value to upper level. 
2605 * 2606 * Note: theoretically, this should be called before all vram allocations 2607 * to protect retired page from abusing 2608 */ 2609 r = amdgpu_ras_recovery_init(adev); 2610 if (r) 2611 goto init_failed; 2612 2613 /** 2614 * In case of XGMI grab extra reference for reset domain for this device 2615 */ 2616 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2617 if (amdgpu_xgmi_add_device(adev) == 0) { 2618 if (!amdgpu_sriov_vf(adev)) { 2619 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2620 2621 if (WARN_ON(!hive)) { 2622 r = -ENOENT; 2623 goto init_failed; 2624 } 2625 2626 if (!hive->reset_domain || 2627 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2628 r = -ENOENT; 2629 amdgpu_put_xgmi_hive(hive); 2630 goto init_failed; 2631 } 2632 2633 /* Drop the early temporary reset domain we created for device */ 2634 amdgpu_reset_put_reset_domain(adev->reset_domain); 2635 adev->reset_domain = hive->reset_domain; 2636 amdgpu_put_xgmi_hive(hive); 2637 } 2638 } 2639 } 2640 2641 r = amdgpu_device_init_schedulers(adev); 2642 if (r) 2643 goto init_failed; 2644 2645 /* Don't init kfd if whole hive need to be reset during init */ 2646 if (!adev->gmc.xgmi.pending_reset) { 2647 kgd2kfd_init_zone_device(adev); 2648 amdgpu_amdkfd_device_init(adev); 2649 } 2650 2651 amdgpu_fru_get_product_info(adev); 2652 2653 init_failed: 2654 2655 return r; 2656 } 2657 2658 /** 2659 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2660 * 2661 * @adev: amdgpu_device pointer 2662 * 2663 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2664 * this function before a GPU reset. If the value is retained after a 2665 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 2666 */ 2667 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2668 { 2669 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2670 } 2671 2672 /** 2673 * amdgpu_device_check_vram_lost - check if vram is valid 2674 * 2675 * @adev: amdgpu_device pointer 2676 * 2677 * Checks the reset magic value written to the gart pointer in VRAM. 2678 * The driver calls this after a GPU reset to see if the contents of 2679 * VRAM is lost or now. 2680 * returns true if vram is lost, false if not. 2681 */ 2682 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2683 { 2684 if (memcmp(adev->gart.ptr, adev->reset_magic, 2685 AMDGPU_RESET_MAGIC_NUM)) 2686 return true; 2687 2688 if (!amdgpu_in_reset(adev)) 2689 return false; 2690 2691 /* 2692 * For all ASICs with baco/mode1 reset, the VRAM is 2693 * always assumed to be lost. 2694 */ 2695 switch (amdgpu_asic_reset_method(adev)) { 2696 case AMD_RESET_METHOD_BACO: 2697 case AMD_RESET_METHOD_MODE1: 2698 return true; 2699 default: 2700 return false; 2701 } 2702 } 2703 2704 /** 2705 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2706 * 2707 * @adev: amdgpu_device pointer 2708 * @state: clockgating state (gate or ungate) 2709 * 2710 * The list of all the hardware IPs that make up the asic is walked and the 2711 * set_clockgating_state callbacks are run. 2712 * Late initialization pass enabling clockgating for hardware IPs. 2713 * Fini or suspend, pass disabling clockgating for hardware IPs. 2714 * Returns 0 on success, negative error code on failure. 
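 *
 * For reference, late init calls this with AMD_CG_STATE_GATE, while the
 * early-fini and suspend paths call it with AMD_CG_STATE_UNGATE before the
 * blocks are torn down.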
2715 */ 2716 2717 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2718 enum amd_clockgating_state state) 2719 { 2720 int i, j, r; 2721 2722 if (amdgpu_emu_mode == 1) 2723 return 0; 2724 2725 for (j = 0; j < adev->num_ip_blocks; j++) { 2726 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2727 if (!adev->ip_blocks[i].status.late_initialized) 2728 continue; 2729 /* skip CG for GFX, SDMA on S0ix */ 2730 if (adev->in_s0ix && 2731 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2732 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2733 continue; 2734 /* skip CG for VCE/UVD, it's handled specially */ 2735 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2736 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2737 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2738 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2739 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2740 /* enable clockgating to save power */ 2741 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2742 state); 2743 if (r) { 2744 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2745 adev->ip_blocks[i].version->funcs->name, r); 2746 return r; 2747 } 2748 } 2749 } 2750 2751 return 0; 2752 } 2753 2754 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2755 enum amd_powergating_state state) 2756 { 2757 int i, j, r; 2758 2759 if (amdgpu_emu_mode == 1) 2760 return 0; 2761 2762 for (j = 0; j < adev->num_ip_blocks; j++) { 2763 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2764 if (!adev->ip_blocks[i].status.late_initialized) 2765 continue; 2766 /* skip PG for GFX, SDMA on S0ix */ 2767 if (adev->in_s0ix && 2768 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2769 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2770 continue; 2771 /* skip CG for VCE/UVD, it's handled specially */ 2772 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2773 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2774 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2775 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2776 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2777 /* enable powergating to save power */ 2778 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2779 state); 2780 if (r) { 2781 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2782 adev->ip_blocks[i].version->funcs->name, r); 2783 return r; 2784 } 2785 } 2786 } 2787 return 0; 2788 } 2789 2790 static int amdgpu_device_enable_mgpu_fan_boost(void) 2791 { 2792 struct amdgpu_gpu_instance *gpu_ins; 2793 struct amdgpu_device *adev; 2794 int i, ret = 0; 2795 2796 mutex_lock(&mgpu_info.mutex); 2797 2798 /* 2799 * MGPU fan boost feature should be enabled 2800 * only when there are two or more dGPUs in 2801 * the system 2802 */ 2803 if (mgpu_info.num_dgpu < 2) 2804 goto out; 2805 2806 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2807 gpu_ins = &(mgpu_info.gpu_ins[i]); 2808 adev = gpu_ins->adev; 2809 if (!(adev->flags & AMD_IS_APU) && 2810 !gpu_ins->mgpu_fan_enabled) { 2811 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2812 if (ret) 2813 break; 2814 2815 gpu_ins->mgpu_fan_enabled = 1; 2816 } 2817 } 2818 2819 out: 2820 mutex_unlock(&mgpu_info.mutex); 2821 2822 return ret; 2823 } 2824 2825 /** 2826 * amdgpu_device_ip_late_init - run late init for hardware IPs 2827 * 2828 * @adev: 
amdgpu_device pointer 2829 * 2830 * Late initialization pass for hardware IPs. The list of all the hardware 2831 * IPs that make up the asic is walked and the late_init callbacks are run. 2832 * late_init covers any special initialization that an IP requires 2833 * after all of the have been initialized or something that needs to happen 2834 * late in the init process. 2835 * Returns 0 on success, negative error code on failure. 2836 */ 2837 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2838 { 2839 struct amdgpu_gpu_instance *gpu_instance; 2840 int i = 0, r; 2841 2842 for (i = 0; i < adev->num_ip_blocks; i++) { 2843 if (!adev->ip_blocks[i].status.hw) 2844 continue; 2845 if (adev->ip_blocks[i].version->funcs->late_init) { 2846 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2847 if (r) { 2848 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2849 adev->ip_blocks[i].version->funcs->name, r); 2850 return r; 2851 } 2852 } 2853 adev->ip_blocks[i].status.late_initialized = true; 2854 } 2855 2856 r = amdgpu_ras_late_init(adev); 2857 if (r) { 2858 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 2859 return r; 2860 } 2861 2862 amdgpu_ras_set_error_query_ready(adev, true); 2863 2864 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2865 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2866 2867 amdgpu_device_fill_reset_magic(adev); 2868 2869 r = amdgpu_device_enable_mgpu_fan_boost(); 2870 if (r) 2871 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2872 2873 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */ 2874 if (amdgpu_passthrough(adev) && 2875 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 2876 adev->asic_type == CHIP_ALDEBARAN)) 2877 amdgpu_dpm_handle_passthrough_sbr(adev, true); 2878 2879 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2880 mutex_lock(&mgpu_info.mutex); 2881 2882 /* 2883 * Reset device p-state to low as this was booted with high. 2884 * 2885 * This should be performed only after all devices from the same 2886 * hive get initialized. 2887 * 2888 * However, it's unknown how many device in the hive in advance. 2889 * As this is counted one by one during devices initializations. 2890 * 2891 * So, we wait for all XGMI interlinked devices initialized. 2892 * This may bring some delays as those devices may come from 2893 * different hives. But that should be OK. 
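		 *
		 * Concretely, the p-state is only lowered once the number of
		 * registered dGPUs matches the number of physical XGMI nodes
		 * reported for this device's hive (the check right below).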
2894 */ 2895 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2896 for (i = 0; i < mgpu_info.num_gpu; i++) { 2897 gpu_instance = &(mgpu_info.gpu_ins[i]); 2898 if (gpu_instance->adev->flags & AMD_IS_APU) 2899 continue; 2900 2901 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2902 AMDGPU_XGMI_PSTATE_MIN); 2903 if (r) { 2904 DRM_ERROR("pstate setting failed (%d).\n", r); 2905 break; 2906 } 2907 } 2908 } 2909 2910 mutex_unlock(&mgpu_info.mutex); 2911 } 2912 2913 return 0; 2914 } 2915 2916 /** 2917 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2918 * 2919 * @adev: amdgpu_device pointer 2920 * 2921 * For ASICs need to disable SMC first 2922 */ 2923 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2924 { 2925 int i, r; 2926 2927 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) 2928 return; 2929 2930 for (i = 0; i < adev->num_ip_blocks; i++) { 2931 if (!adev->ip_blocks[i].status.hw) 2932 continue; 2933 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2934 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2935 /* XXX handle errors */ 2936 if (r) { 2937 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2938 adev->ip_blocks[i].version->funcs->name, r); 2939 } 2940 adev->ip_blocks[i].status.hw = false; 2941 break; 2942 } 2943 } 2944 } 2945 2946 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2947 { 2948 int i, r; 2949 2950 for (i = 0; i < adev->num_ip_blocks; i++) { 2951 if (!adev->ip_blocks[i].version->funcs->early_fini) 2952 continue; 2953 2954 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2955 if (r) { 2956 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2957 adev->ip_blocks[i].version->funcs->name, r); 2958 } 2959 } 2960 2961 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2962 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2963 2964 amdgpu_amdkfd_suspend(adev, false); 2965 2966 /* Workaroud for ASICs need to disable SMC first */ 2967 amdgpu_device_smu_fini_early(adev); 2968 2969 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2970 if (!adev->ip_blocks[i].status.hw) 2971 continue; 2972 2973 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2974 /* XXX handle errors */ 2975 if (r) { 2976 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2977 adev->ip_blocks[i].version->funcs->name, r); 2978 } 2979 2980 adev->ip_blocks[i].status.hw = false; 2981 } 2982 2983 if (amdgpu_sriov_vf(adev)) { 2984 if (amdgpu_virt_release_full_gpu(adev, false)) 2985 DRM_ERROR("failed to release exclusive mode on fini\n"); 2986 } 2987 2988 return 0; 2989 } 2990 2991 /** 2992 * amdgpu_device_ip_fini - run fini for hardware IPs 2993 * 2994 * @adev: amdgpu_device pointer 2995 * 2996 * Main teardown pass for hardware IPs. The list of all the hardware 2997 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2998 * are run. hw_fini tears down the hardware associated with each IP 2999 * and sw_fini tears down any software state associated with each IP. 3000 * Returns 0 on success, negative error code on failure. 
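 *
 * Teardown walks the IP list in reverse order; when the GMC block is
 * reached, the buffers created right after GMC init (ucode BO, static CSA,
 * writeback and scratch memory, IB pool) are released first.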
3001 */ 3002 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3003 { 3004 int i, r; 3005 3006 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3007 amdgpu_virt_release_ras_err_handler_data(adev); 3008 3009 if (adev->gmc.xgmi.num_physical_nodes > 1) 3010 amdgpu_xgmi_remove_device(adev); 3011 3012 amdgpu_amdkfd_device_fini_sw(adev); 3013 3014 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3015 if (!adev->ip_blocks[i].status.sw) 3016 continue; 3017 3018 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3019 amdgpu_ucode_free_bo(adev); 3020 amdgpu_free_static_csa(&adev->virt.csa_obj); 3021 amdgpu_device_wb_fini(adev); 3022 amdgpu_device_mem_scratch_fini(adev); 3023 amdgpu_ib_pool_fini(adev); 3024 } 3025 3026 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 3027 /* XXX handle errors */ 3028 if (r) { 3029 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3030 adev->ip_blocks[i].version->funcs->name, r); 3031 } 3032 adev->ip_blocks[i].status.sw = false; 3033 adev->ip_blocks[i].status.valid = false; 3034 } 3035 3036 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3037 if (!adev->ip_blocks[i].status.late_initialized) 3038 continue; 3039 if (adev->ip_blocks[i].version->funcs->late_fini) 3040 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 3041 adev->ip_blocks[i].status.late_initialized = false; 3042 } 3043 3044 amdgpu_ras_fini(adev); 3045 3046 return 0; 3047 } 3048 3049 /** 3050 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3051 * 3052 * @work: work_struct. 3053 */ 3054 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3055 { 3056 struct amdgpu_device *adev = 3057 container_of(work, struct amdgpu_device, delayed_init_work.work); 3058 int r; 3059 3060 r = amdgpu_ib_ring_tests(adev); 3061 if (r) 3062 DRM_ERROR("ib ring test failed (%d).\n", r); 3063 } 3064 3065 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3066 { 3067 struct amdgpu_device *adev = 3068 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3069 3070 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3071 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3072 3073 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 3074 adev->gfx.gfx_off_state = true; 3075 } 3076 3077 /** 3078 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3079 * 3080 * @adev: amdgpu_device pointer 3081 * 3082 * Main suspend function for hardware IPs. The list of all the hardware 3083 * IPs that make up the asic is walked, clockgating is disabled and the 3084 * suspend callbacks are run. suspend puts the hardware and software state 3085 * in each IP into a state suitable for suspend. 3086 * Returns 0 on success, negative error code on failure. 3087 */ 3088 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3089 { 3090 int i, r; 3091 3092 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3093 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3094 3095 /* 3096 * Per PMFW team's suggestion, driver needs to handle gfxoff 3097 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3098 * scenario. Add the missing df cstate disablement here. 
3099 */ 3100 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3101 dev_warn(adev->dev, "Failed to disallow df cstate"); 3102 3103 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3104 if (!adev->ip_blocks[i].status.valid) 3105 continue; 3106 3107 /* displays are handled separately */ 3108 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3109 continue; 3110 3111 /* XXX handle errors */ 3112 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3113 /* XXX handle errors */ 3114 if (r) { 3115 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3116 adev->ip_blocks[i].version->funcs->name, r); 3117 return r; 3118 } 3119 3120 adev->ip_blocks[i].status.hw = false; 3121 } 3122 3123 return 0; 3124 } 3125 3126 /** 3127 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3128 * 3129 * @adev: amdgpu_device pointer 3130 * 3131 * Main suspend function for hardware IPs. The list of all the hardware 3132 * IPs that make up the asic is walked, clockgating is disabled and the 3133 * suspend callbacks are run. suspend puts the hardware and software state 3134 * in each IP into a state suitable for suspend. 3135 * Returns 0 on success, negative error code on failure. 3136 */ 3137 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3138 { 3139 int i, r; 3140 3141 if (adev->in_s0ix) 3142 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3143 3144 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3145 if (!adev->ip_blocks[i].status.valid) 3146 continue; 3147 /* displays are handled in phase1 */ 3148 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3149 continue; 3150 /* PSP lost connection when err_event_athub occurs */ 3151 if (amdgpu_ras_intr_triggered() && 3152 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3153 adev->ip_blocks[i].status.hw = false; 3154 continue; 3155 } 3156 3157 /* skip unnecessary suspend if we do not initialize them yet */ 3158 if (adev->gmc.xgmi.pending_reset && 3159 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3160 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3161 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3162 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3163 adev->ip_blocks[i].status.hw = false; 3164 continue; 3165 } 3166 3167 /* skip suspend of gfx/mes and psp for S0ix 3168 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3169 * like at runtime. PSP is also part of the always on hardware 3170 * so no need to suspend it. 3171 */ 3172 if (adev->in_s0ix && 3173 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3174 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3175 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3176 continue; 3177 3178 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3179 if (adev->in_s0ix && 3180 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) && 3181 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3182 continue; 3183 3184 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3185 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3186 * from this location and RLC Autoload automatically also gets loaded 3187 * from here based on PMFW -> PSP message during re-init sequence. 3188 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3189 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3190 */ 3191 if (amdgpu_in_reset(adev) && 3192 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3193 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3194 continue; 3195 3196 /* XXX handle errors */ 3197 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3198 /* XXX handle errors */ 3199 if (r) { 3200 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3201 adev->ip_blocks[i].version->funcs->name, r); 3202 } 3203 adev->ip_blocks[i].status.hw = false; 3204 /* handle putting the SMC in the appropriate state */ 3205 if (!amdgpu_sriov_vf(adev)) { 3206 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3207 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3208 if (r) { 3209 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3210 adev->mp1_state, r); 3211 return r; 3212 } 3213 } 3214 } 3215 } 3216 3217 return 0; 3218 } 3219 3220 /** 3221 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3222 * 3223 * @adev: amdgpu_device pointer 3224 * 3225 * Main suspend function for hardware IPs. The list of all the hardware 3226 * IPs that make up the asic is walked, clockgating is disabled and the 3227 * suspend callbacks are run. suspend puts the hardware and software state 3228 * in each IP into a state suitable for suspend. 3229 * Returns 0 on success, negative error code on failure. 3230 */ 3231 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3232 { 3233 int r; 3234 3235 if (amdgpu_sriov_vf(adev)) { 3236 amdgpu_virt_fini_data_exchange(adev); 3237 amdgpu_virt_request_full_gpu(adev, false); 3238 } 3239 3240 r = amdgpu_device_ip_suspend_phase1(adev); 3241 if (r) 3242 return r; 3243 r = amdgpu_device_ip_suspend_phase2(adev); 3244 3245 if (amdgpu_sriov_vf(adev)) 3246 amdgpu_virt_release_full_gpu(adev, false); 3247 3248 return r; 3249 } 3250 3251 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3252 { 3253 int i, r; 3254 3255 static enum amd_ip_block_type ip_order[] = { 3256 AMD_IP_BLOCK_TYPE_COMMON, 3257 AMD_IP_BLOCK_TYPE_GMC, 3258 AMD_IP_BLOCK_TYPE_PSP, 3259 AMD_IP_BLOCK_TYPE_IH, 3260 }; 3261 3262 for (i = 0; i < adev->num_ip_blocks; i++) { 3263 int j; 3264 struct amdgpu_ip_block *block; 3265 3266 block = &adev->ip_blocks[i]; 3267 block->status.hw = false; 3268 3269 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3270 3271 if (block->version->type != ip_order[j] || 3272 !block->status.valid) 3273 continue; 3274 3275 r = block->version->funcs->hw_init(adev); 3276 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3277 if (r) 3278 return r; 3279 block->status.hw = true; 3280 } 3281 } 3282 3283 return 0; 3284 } 3285 3286 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3287 { 3288 int i, r; 3289 3290 static enum amd_ip_block_type ip_order[] = { 3291 AMD_IP_BLOCK_TYPE_SMC, 3292 AMD_IP_BLOCK_TYPE_DCE, 3293 AMD_IP_BLOCK_TYPE_GFX, 3294 AMD_IP_BLOCK_TYPE_SDMA, 3295 AMD_IP_BLOCK_TYPE_MES, 3296 AMD_IP_BLOCK_TYPE_UVD, 3297 AMD_IP_BLOCK_TYPE_VCE, 3298 AMD_IP_BLOCK_TYPE_VCN, 3299 AMD_IP_BLOCK_TYPE_JPEG 3300 }; 3301 3302 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3303 int j; 3304 struct amdgpu_ip_block *block; 3305 3306 for (j = 0; j < adev->num_ip_blocks; j++) { 3307 block = &adev->ip_blocks[j]; 3308 3309 if (block->version->type != ip_order[i] || 3310 !block->status.valid || 3311 block->status.hw) 3312 continue; 3313 3314 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3315 r = block->version->funcs->resume(adev); 3316 else 3317 r = block->version->funcs->hw_init(adev); 3318 3319 
DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3320 if (r) 3321 return r; 3322 block->status.hw = true; 3323 } 3324 } 3325 3326 return 0; 3327 } 3328 3329 /** 3330 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3331 * 3332 * @adev: amdgpu_device pointer 3333 * 3334 * First resume function for hardware IPs. The list of all the hardware 3335 * IPs that make up the asic is walked and the resume callbacks are run for 3336 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3337 * after a suspend and updates the software state as necessary. This 3338 * function is also used for restoring the GPU after a GPU reset. 3339 * Returns 0 on success, negative error code on failure. 3340 */ 3341 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3342 { 3343 int i, r; 3344 3345 for (i = 0; i < adev->num_ip_blocks; i++) { 3346 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3347 continue; 3348 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3349 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3350 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3351 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3352 3353 r = adev->ip_blocks[i].version->funcs->resume(adev); 3354 if (r) { 3355 DRM_ERROR("resume of IP block <%s> failed %d\n", 3356 adev->ip_blocks[i].version->funcs->name, r); 3357 return r; 3358 } 3359 adev->ip_blocks[i].status.hw = true; 3360 } 3361 } 3362 3363 return 0; 3364 } 3365 3366 /** 3367 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3368 * 3369 * @adev: amdgpu_device pointer 3370 * 3371 * First resume function for hardware IPs. The list of all the hardware 3372 * IPs that make up the asic is walked and the resume callbacks are run for 3373 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3374 * functional state after a suspend and updates the software state as 3375 * necessary. This function is also used for restoring the GPU after a GPU 3376 * reset. 3377 * Returns 0 on success, negative error code on failure. 3378 */ 3379 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3380 { 3381 int i, r; 3382 3383 for (i = 0; i < adev->num_ip_blocks; i++) { 3384 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3385 continue; 3386 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3387 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3388 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3389 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3390 continue; 3391 r = adev->ip_blocks[i].version->funcs->resume(adev); 3392 if (r) { 3393 DRM_ERROR("resume of IP block <%s> failed %d\n", 3394 adev->ip_blocks[i].version->funcs->name, r); 3395 return r; 3396 } 3397 adev->ip_blocks[i].status.hw = true; 3398 } 3399 3400 return 0; 3401 } 3402 3403 /** 3404 * amdgpu_device_ip_resume - run resume for hardware IPs 3405 * 3406 * @adev: amdgpu_device pointer 3407 * 3408 * Main resume function for hardware IPs. The hardware IPs 3409 * are split into two resume functions because they are 3410 * are also used in in recovering from a GPU reset and some additional 3411 * steps need to be take between them. In this case (S3/S4) they are 3412 * run sequentially. 3413 * Returns 0 on success, negative error code on failure. 
3414 */ 3415 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3416 { 3417 int r; 3418 3419 if (!adev->in_s0ix) { 3420 r = amdgpu_amdkfd_resume_iommu(adev); 3421 if (r) 3422 return r; 3423 } 3424 3425 r = amdgpu_device_ip_resume_phase1(adev); 3426 if (r) 3427 return r; 3428 3429 r = amdgpu_device_fw_loading(adev); 3430 if (r) 3431 return r; 3432 3433 r = amdgpu_device_ip_resume_phase2(adev); 3434 3435 return r; 3436 } 3437 3438 /** 3439 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3440 * 3441 * @adev: amdgpu_device pointer 3442 * 3443 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3444 */ 3445 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3446 { 3447 if (amdgpu_sriov_vf(adev)) { 3448 if (adev->is_atom_fw) { 3449 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3450 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3451 } else { 3452 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3453 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3454 } 3455 3456 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3457 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3458 } 3459 } 3460 3461 /** 3462 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3463 * 3464 * @asic_type: AMD asic type 3465 * 3466 * Check if there is DC (new modesetting infrastructre) support for an asic. 3467 * returns true if DC has support, false if not. 3468 */ 3469 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3470 { 3471 switch (asic_type) { 3472 #ifdef CONFIG_DRM_AMDGPU_SI 3473 case CHIP_HAINAN: 3474 #endif 3475 case CHIP_TOPAZ: 3476 /* chips with no display hardware */ 3477 return false; 3478 #if defined(CONFIG_DRM_AMD_DC) 3479 case CHIP_TAHITI: 3480 case CHIP_PITCAIRN: 3481 case CHIP_VERDE: 3482 case CHIP_OLAND: 3483 /* 3484 * We have systems in the wild with these ASICs that require 3485 * LVDS and VGA support which is not supported with DC. 3486 * 3487 * Fallback to the non-DC driver here by default so as not to 3488 * cause regressions. 3489 */ 3490 #if defined(CONFIG_DRM_AMD_DC_SI) 3491 return amdgpu_dc > 0; 3492 #else 3493 return false; 3494 #endif 3495 case CHIP_BONAIRE: 3496 case CHIP_KAVERI: 3497 case CHIP_KABINI: 3498 case CHIP_MULLINS: 3499 /* 3500 * We have systems in the wild with these ASICs that require 3501 * VGA support which is not supported with DC. 3502 * 3503 * Fallback to the non-DC driver here by default so as not to 3504 * cause regressions. 
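	 *
	 * DC can still be opted into on these parts by setting amdgpu_dc
	 * (the dc module parameter) to a positive value, which is what the
	 * "amdgpu_dc > 0" check below honors.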
3505 */ 3506 return amdgpu_dc > 0; 3507 default: 3508 return amdgpu_dc != 0; 3509 #else 3510 default: 3511 if (amdgpu_dc > 0) 3512 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3513 "but isn't supported by ASIC, ignoring\n"); 3514 return false; 3515 #endif 3516 } 3517 } 3518 3519 /** 3520 * amdgpu_device_has_dc_support - check if dc is supported 3521 * 3522 * @adev: amdgpu_device pointer 3523 * 3524 * Returns true for supported, false for not supported 3525 */ 3526 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3527 { 3528 if (adev->enable_virtual_display || 3529 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3530 return false; 3531 3532 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3533 } 3534 3535 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3536 { 3537 struct amdgpu_device *adev = 3538 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3539 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3540 3541 /* It's a bug to not have a hive within this function */ 3542 if (WARN_ON(!hive)) 3543 return; 3544 3545 /* 3546 * Use task barrier to synchronize all xgmi reset works across the 3547 * hive. task_barrier_enter and task_barrier_exit will block 3548 * until all the threads running the xgmi reset works reach 3549 * those points. task_barrier_full will do both blocks. 3550 */ 3551 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3552 3553 task_barrier_enter(&hive->tb); 3554 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3555 3556 if (adev->asic_reset_res) 3557 goto fail; 3558 3559 task_barrier_exit(&hive->tb); 3560 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3561 3562 if (adev->asic_reset_res) 3563 goto fail; 3564 3565 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3566 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3567 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3568 } else { 3569 3570 task_barrier_full(&hive->tb); 3571 adev->asic_reset_res = amdgpu_asic_reset(adev); 3572 } 3573 3574 fail: 3575 if (adev->asic_reset_res) 3576 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3577 adev->asic_reset_res, adev_to_drm(adev)->unique); 3578 amdgpu_put_xgmi_hive(hive); 3579 } 3580 3581 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3582 { 3583 char *input = amdgpu_lockup_timeout; 3584 char *timeout_setting = NULL; 3585 int index = 0; 3586 long timeout; 3587 int ret = 0; 3588 3589 /* 3590 * By default timeout for non compute jobs is 10000 3591 * and 60000 for compute jobs. 3592 * In SR-IOV or passthrough mode, timeout for compute 3593 * jobs are 60000 by default. 3594 */ 3595 adev->gfx_timeout = msecs_to_jiffies(10000); 3596 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3597 if (amdgpu_sriov_vf(adev)) 3598 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3599 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3600 else 3601 adev->compute_timeout = msecs_to_jiffies(60000); 3602 3603 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3604 while ((timeout_setting = strsep(&input, ",")) && 3605 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3606 ret = kstrtol(timeout_setting, 0, &timeout); 3607 if (ret) 3608 return ret; 3609 3610 if (timeout == 0) { 3611 index++; 3612 continue; 3613 } else if (timeout < 0) { 3614 timeout = MAX_SCHEDULE_TIMEOUT; 3615 dev_warn(adev->dev, "lockup timeout disabled"); 3616 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3617 } else { 3618 timeout = msecs_to_jiffies(timeout); 3619 } 3620 3621 switch (index++) { 3622 case 0: 3623 adev->gfx_timeout = timeout; 3624 break; 3625 case 1: 3626 adev->compute_timeout = timeout; 3627 break; 3628 case 2: 3629 adev->sdma_timeout = timeout; 3630 break; 3631 case 3: 3632 adev->video_timeout = timeout; 3633 break; 3634 default: 3635 break; 3636 } 3637 } 3638 /* 3639 * There is only one value specified and 3640 * it should apply to all non-compute jobs. 3641 */ 3642 if (index == 1) { 3643 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3644 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3645 adev->compute_timeout = adev->gfx_timeout; 3646 } 3647 } 3648 3649 return ret; 3650 } 3651 3652 /** 3653 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3654 * 3655 * @adev: amdgpu_device pointer 3656 * 3657 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3658 */ 3659 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3660 { 3661 struct iommu_domain *domain; 3662 3663 domain = iommu_get_domain_for_dev(adev->dev); 3664 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3665 adev->ram_is_direct_mapped = true; 3666 } 3667 3668 static const struct attribute *amdgpu_dev_attributes[] = { 3669 &dev_attr_product_name.attr, 3670 &dev_attr_product_number.attr, 3671 &dev_attr_serial_number.attr, 3672 &dev_attr_pcie_replay_count.attr, 3673 NULL 3674 }; 3675 3676 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 3677 { 3678 if (amdgpu_mcbp == 1) 3679 adev->gfx.mcbp = true; 3680 3681 if ((adev->ip_versions[GC_HWIP][0] >= IP_VERSION(9, 0, 0)) && 3682 (adev->ip_versions[GC_HWIP][0] < IP_VERSION(10, 0, 0)) && 3683 adev->gfx.num_gfx_rings) 3684 adev->gfx.mcbp = true; 3685 3686 if (amdgpu_sriov_vf(adev)) 3687 adev->gfx.mcbp = true; 3688 3689 if (adev->gfx.mcbp) 3690 DRM_INFO("MCBP is enabled\n"); 3691 } 3692 3693 /** 3694 * amdgpu_device_init - initialize the driver 3695 * 3696 * @adev: amdgpu_device pointer 3697 * @flags: driver flags 3698 * 3699 * Initializes the driver info and hw (all asics). 3700 * Returns 0 for success or an error on failure. 3701 * Called at driver startup. 
3702 */ 3703 int amdgpu_device_init(struct amdgpu_device *adev, 3704 uint32_t flags) 3705 { 3706 struct drm_device *ddev = adev_to_drm(adev); 3707 struct pci_dev *pdev = adev->pdev; 3708 int r, i; 3709 bool px = false; 3710 u32 max_MBps; 3711 int tmp; 3712 3713 adev->shutdown = false; 3714 adev->flags = flags; 3715 3716 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3717 adev->asic_type = amdgpu_force_asic_type; 3718 else 3719 adev->asic_type = flags & AMD_ASIC_MASK; 3720 3721 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3722 if (amdgpu_emu_mode == 1) 3723 adev->usec_timeout *= 10; 3724 adev->gmc.gart_size = 512 * 1024 * 1024; 3725 adev->accel_working = false; 3726 adev->num_rings = 0; 3727 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3728 adev->mman.buffer_funcs = NULL; 3729 adev->mman.buffer_funcs_ring = NULL; 3730 adev->vm_manager.vm_pte_funcs = NULL; 3731 adev->vm_manager.vm_pte_num_scheds = 0; 3732 adev->gmc.gmc_funcs = NULL; 3733 adev->harvest_ip_mask = 0x0; 3734 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3735 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3736 3737 adev->smc_rreg = &amdgpu_invalid_rreg; 3738 adev->smc_wreg = &amdgpu_invalid_wreg; 3739 adev->pcie_rreg = &amdgpu_invalid_rreg; 3740 adev->pcie_wreg = &amdgpu_invalid_wreg; 3741 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 3742 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 3743 adev->pciep_rreg = &amdgpu_invalid_rreg; 3744 adev->pciep_wreg = &amdgpu_invalid_wreg; 3745 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3746 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3747 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3748 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3749 adev->didt_rreg = &amdgpu_invalid_rreg; 3750 adev->didt_wreg = &amdgpu_invalid_wreg; 3751 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3752 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3753 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3754 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3755 3756 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3757 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3758 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3759 3760 /* mutex initialization are all done here so we 3761 * can recall function without having locking issues */ 3762 mutex_init(&adev->firmware.mutex); 3763 mutex_init(&adev->pm.mutex); 3764 mutex_init(&adev->gfx.gpu_clock_mutex); 3765 mutex_init(&adev->srbm_mutex); 3766 mutex_init(&adev->gfx.pipe_reserve_mutex); 3767 mutex_init(&adev->gfx.gfx_off_mutex); 3768 mutex_init(&adev->gfx.partition_mutex); 3769 mutex_init(&adev->grbm_idx_mutex); 3770 mutex_init(&adev->mn_lock); 3771 mutex_init(&adev->virt.vf_errors.lock); 3772 hash_init(adev->mn_hash); 3773 mutex_init(&adev->psp.mutex); 3774 mutex_init(&adev->notifier_lock); 3775 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3776 mutex_init(&adev->benchmark_mutex); 3777 3778 amdgpu_device_init_apu_flags(adev); 3779 3780 r = amdgpu_device_check_arguments(adev); 3781 if (r) 3782 return r; 3783 3784 spin_lock_init(&adev->mmio_idx_lock); 3785 spin_lock_init(&adev->smc_idx_lock); 3786 spin_lock_init(&adev->pcie_idx_lock); 3787 spin_lock_init(&adev->uvd_ctx_idx_lock); 3788 spin_lock_init(&adev->didt_idx_lock); 3789 spin_lock_init(&adev->gc_cac_idx_lock); 3790 spin_lock_init(&adev->se_cac_idx_lock); 3791 spin_lock_init(&adev->audio_endpt_idx_lock); 3792 spin_lock_init(&adev->mm_stats.lock); 3793 3794 
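	/*
	 * Lists and deferred work set up here are consumed later by the
	 * init, reset and RAS paths (shadow BOs, reset contexts, RAS blocks,
	 * the delayed IB tests and the delayed GFXOFF enable).
	 */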
INIT_LIST_HEAD(&adev->shadow_list); 3795 mutex_init(&adev->shadow_list_lock); 3796 3797 INIT_LIST_HEAD(&adev->reset_list); 3798 3799 INIT_LIST_HEAD(&adev->ras_list); 3800 3801 INIT_DELAYED_WORK(&adev->delayed_init_work, 3802 amdgpu_device_delayed_init_work_handler); 3803 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3804 amdgpu_device_delay_enable_gfx_off); 3805 3806 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3807 3808 adev->gfx.gfx_off_req_count = 1; 3809 adev->gfx.gfx_off_residency = 0; 3810 adev->gfx.gfx_off_entrycount = 0; 3811 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3812 3813 atomic_set(&adev->throttling_logging_enabled, 1); 3814 /* 3815 * If throttling continues, logging will be performed every minute 3816 * to avoid log flooding. "-1" is subtracted since the thermal 3817 * throttling interrupt comes every second. Thus, the total logging 3818 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 3819 * for the throttling interrupt) = 60 seconds. 3820 */ 3821 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3822 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3823 3824 /* Registers mapping */ 3825 /* TODO: block userspace mapping of io register */ 3826 if (adev->asic_type >= CHIP_BONAIRE) { 3827 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3828 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3829 } else { 3830 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3831 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3832 } 3833 3834 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 3835 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 3836 3837 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3838 if (adev->rmmio == NULL) { 3839 return -ENOMEM; 3840 } 3841 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3842 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3843 3844 /* 3845 * The reset domain needs to be present early, before the XGMI hive 3846 * (if any) is discovered and initialized, so the reset sem and in_gpu_reset 3847 * flag can be used early during init and before any call to RREG32.
3848 */ 3849 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3850 if (!adev->reset_domain) 3851 return -ENOMEM; 3852 3853 /* detect hw virtualization here */ 3854 amdgpu_detect_virtualization(adev); 3855 3856 amdgpu_device_get_pcie_info(adev); 3857 3858 r = amdgpu_device_get_job_timeout_settings(adev); 3859 if (r) { 3860 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3861 return r; 3862 } 3863 3864 /* early init functions */ 3865 r = amdgpu_device_ip_early_init(adev); 3866 if (r) 3867 return r; 3868 3869 amdgpu_device_set_mcbp(adev); 3870 3871 /* Get rid of things like offb */ 3872 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 3873 if (r) 3874 return r; 3875 3876 /* Enable TMZ based on IP_VERSION */ 3877 amdgpu_gmc_tmz_set(adev); 3878 3879 amdgpu_gmc_noretry_set(adev); 3880 /* Need to get xgmi info early to decide the reset behavior*/ 3881 if (adev->gmc.xgmi.supported) { 3882 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3883 if (r) 3884 return r; 3885 } 3886 3887 /* enable PCIE atomic ops */ 3888 if (amdgpu_sriov_vf(adev)) { 3889 if (adev->virt.fw_reserve.p_pf2vf) 3890 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3891 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3892 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3893 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a 3894 * internal path natively support atomics, set have_atomics_support to true. 3895 */ 3896 } else if ((adev->flags & AMD_IS_APU) && 3897 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) { 3898 adev->have_atomics_support = true; 3899 } else { 3900 adev->have_atomics_support = 3901 !pci_enable_atomic_ops_to_root(adev->pdev, 3902 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3903 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3904 } 3905 3906 if (!adev->have_atomics_support) 3907 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3908 3909 /* doorbell bar mapping and doorbell index init*/ 3910 amdgpu_device_doorbell_init(adev); 3911 3912 if (amdgpu_emu_mode == 1) { 3913 /* post the asic on emulation mode */ 3914 emu_soc_asic_init(adev); 3915 goto fence_driver_init; 3916 } 3917 3918 amdgpu_reset_init(adev); 3919 3920 /* detect if we are with an SRIOV vbios */ 3921 if (adev->bios) 3922 amdgpu_device_detect_sriov_bios(adev); 3923 3924 /* check if we need to reset the asic 3925 * E.g., driver was not cleanly unloaded previously, etc. 3926 */ 3927 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3928 if (adev->gmc.xgmi.num_physical_nodes) { 3929 dev_info(adev->dev, "Pending hive reset.\n"); 3930 adev->gmc.xgmi.pending_reset = true; 3931 /* Only need to init necessary block for SMU to handle the reset */ 3932 for (i = 0; i < adev->num_ip_blocks; i++) { 3933 if (!adev->ip_blocks[i].status.valid) 3934 continue; 3935 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3936 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3937 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3938 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3939 DRM_DEBUG("IP %s disabled for hw_init.\n", 3940 adev->ip_blocks[i].version->funcs->name); 3941 adev->ip_blocks[i].status.hw = true; 3942 } 3943 } 3944 } else { 3945 tmp = amdgpu_reset_method; 3946 /* It should do a default reset when loading or reloading the driver, 3947 * regardless of the module parameter reset_method. 
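			 * (Setting amdgpu_reset_method to AMD_RESET_METHOD_NONE below
			 * effectively lets amdgpu_asic_reset_method() pick the ASIC's
			 * default; the user's chosen method is restored right after
			 * the reset.)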
3948 */ 3949 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 3950 r = amdgpu_asic_reset(adev); 3951 amdgpu_reset_method = tmp; 3952 if (r) { 3953 dev_err(adev->dev, "asic reset on init failed\n"); 3954 goto failed; 3955 } 3956 } 3957 } 3958 3959 /* Post card if necessary */ 3960 if (amdgpu_device_need_post(adev)) { 3961 if (!adev->bios) { 3962 dev_err(adev->dev, "no vBIOS found\n"); 3963 r = -EINVAL; 3964 goto failed; 3965 } 3966 DRM_INFO("GPU posting now...\n"); 3967 r = amdgpu_device_asic_init(adev); 3968 if (r) { 3969 dev_err(adev->dev, "gpu post error!\n"); 3970 goto failed; 3971 } 3972 } 3973 3974 if (adev->bios) { 3975 if (adev->is_atom_fw) { 3976 /* Initialize clocks */ 3977 r = amdgpu_atomfirmware_get_clock_info(adev); 3978 if (r) { 3979 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3980 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3981 goto failed; 3982 } 3983 } else { 3984 /* Initialize clocks */ 3985 r = amdgpu_atombios_get_clock_info(adev); 3986 if (r) { 3987 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3988 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3989 goto failed; 3990 } 3991 /* init i2c buses */ 3992 if (!amdgpu_device_has_dc_support(adev)) 3993 amdgpu_atombios_i2c_init(adev); 3994 } 3995 } 3996 3997 fence_driver_init: 3998 /* Fence driver */ 3999 r = amdgpu_fence_driver_sw_init(adev); 4000 if (r) { 4001 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4002 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4003 goto failed; 4004 } 4005 4006 /* init the mode config */ 4007 drm_mode_config_init(adev_to_drm(adev)); 4008 4009 r = amdgpu_device_ip_init(adev); 4010 if (r) { 4011 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4012 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4013 goto release_ras_con; 4014 } 4015 4016 amdgpu_fence_driver_hw_init(adev); 4017 4018 dev_info(adev->dev, 4019 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4020 adev->gfx.config.max_shader_engines, 4021 adev->gfx.config.max_sh_per_se, 4022 adev->gfx.config.max_cu_per_sh, 4023 adev->gfx.cu_info.number); 4024 4025 adev->accel_working = true; 4026 4027 amdgpu_vm_check_compute_bug(adev); 4028 4029 /* Initialize the buffer migration limit. */ 4030 if (amdgpu_moverate >= 0) 4031 max_MBps = amdgpu_moverate; 4032 else 4033 max_MBps = 8; /* Allow 8 MB/s. */ 4034 /* Get a log2 for easy divisions. */ 4035 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4036 4037 r = amdgpu_atombios_sysfs_init(adev); 4038 if (r) 4039 drm_err(&adev->ddev, 4040 "registering atombios sysfs failed (%d).\n", r); 4041 4042 r = amdgpu_pm_sysfs_init(adev); 4043 if (r) 4044 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4045 4046 r = amdgpu_ucode_sysfs_init(adev); 4047 if (r) { 4048 adev->ucode_sysfs_en = false; 4049 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4050 } else 4051 adev->ucode_sysfs_en = true; 4052 4053 r = amdgpu_psp_sysfs_init(adev); 4054 if (r) { 4055 adev->psp_sysfs_en = false; 4056 if (!amdgpu_sriov_vf(adev)) 4057 DRM_ERROR("Creating psp sysfs failed\n"); 4058 } else 4059 adev->psp_sysfs_en = true; 4060 4061 /* 4062 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4063 * Otherwise the mgpu fan boost feature will be skipped due to the 4064 * gpu instance is counted less. 4065 */ 4066 amdgpu_register_gpu_instance(adev); 4067 4068 /* enable clockgating, etc. after ib tests, etc. 
since some blocks require 4069 * explicit gating rather than handling it automatically. 4070 */ 4071 if (!adev->gmc.xgmi.pending_reset) { 4072 r = amdgpu_device_ip_late_init(adev); 4073 if (r) { 4074 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4075 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4076 goto release_ras_con; 4077 } 4078 /* must succeed. */ 4079 amdgpu_ras_resume(adev); 4080 queue_delayed_work(system_wq, &adev->delayed_init_work, 4081 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4082 } 4083 4084 if (amdgpu_sriov_vf(adev)) { 4085 amdgpu_virt_release_full_gpu(adev, true); 4086 flush_delayed_work(&adev->delayed_init_work); 4087 } 4088 4089 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4090 if (r) 4091 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4092 4093 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4094 r = amdgpu_pmu_init(adev); 4095 if (r) 4096 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4097 4098 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4099 if (amdgpu_device_cache_pci_state(adev->pdev)) 4100 pci_restore_state(pdev); 4101 4102 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4103 /* this will fail for cards that aren't VGA class devices, just 4104 * ignore it */ 4105 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4106 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4107 4108 px = amdgpu_device_supports_px(ddev); 4109 4110 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 4111 apple_gmux_detect(NULL, NULL))) 4112 vga_switcheroo_register_client(adev->pdev, 4113 &amdgpu_switcheroo_ops, px); 4114 4115 if (px) 4116 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4117 4118 if (adev->gmc.xgmi.pending_reset) 4119 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 4120 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4121 4122 amdgpu_device_check_iommu_direct_map(adev); 4123 4124 return 0; 4125 4126 release_ras_con: 4127 if (amdgpu_sriov_vf(adev)) 4128 amdgpu_virt_release_full_gpu(adev, true); 4129 4130 /* failed in exclusive mode due to timeout */ 4131 if (amdgpu_sriov_vf(adev) && 4132 !amdgpu_sriov_runtime(adev) && 4133 amdgpu_virt_mmio_blocked(adev) && 4134 !amdgpu_virt_wait_reset(adev)) { 4135 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4136 /* Don't send request since VF is inactive. */ 4137 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4138 adev->virt.ops = NULL; 4139 r = -EAGAIN; 4140 } 4141 amdgpu_release_ras_context(adev); 4142 4143 failed: 4144 amdgpu_vf_error_trans_all(adev); 4145 4146 return r; 4147 } 4148 4149 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4150 { 4151 4152 /* Clear all CPU mappings pointing to this device */ 4153 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4154 4155 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4156 amdgpu_device_doorbell_fini(adev); 4157 4158 iounmap(adev->rmmio); 4159 adev->rmmio = NULL; 4160 if (adev->mman.aper_base_kaddr) 4161 iounmap(adev->mman.aper_base_kaddr); 4162 adev->mman.aper_base_kaddr = NULL; 4163 4164 /* Memory manager related */ 4165 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4166 arch_phys_wc_del(adev->gmc.vram_mtrr); 4167 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4168 } 4169 } 4170 4171 /** 4172 * amdgpu_device_fini_hw - tear down the driver 4173 * 4174 * @adev: amdgpu_device pointer 4175 * 4176 * Tear down the driver info (all asics). 
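 * The hardware teardown is split from amdgpu_device_fini_sw() below so the
 * hardware side can be quiesced early (e.g. on shutdown or unplug) while the
 * software state stays valid until the drm device is finally released.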
4177 * Called at driver shutdown. 4178 */ 4179 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4180 { 4181 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4182 flush_delayed_work(&adev->delayed_init_work); 4183 adev->shutdown = true; 4184 4185 /* make sure IB test finished before entering exclusive mode 4186 * to avoid preemption on IB test 4187 * */ 4188 if (amdgpu_sriov_vf(adev)) { 4189 amdgpu_virt_request_full_gpu(adev, false); 4190 amdgpu_virt_fini_data_exchange(adev); 4191 } 4192 4193 /* disable all interrupts */ 4194 amdgpu_irq_disable_all(adev); 4195 if (adev->mode_info.mode_config_initialized) { 4196 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4197 drm_helper_force_disable_all(adev_to_drm(adev)); 4198 else 4199 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4200 } 4201 amdgpu_fence_driver_hw_fini(adev); 4202 4203 if (adev->mman.initialized) 4204 drain_workqueue(adev->mman.bdev.wq); 4205 4206 if (adev->pm.sysfs_initialized) 4207 amdgpu_pm_sysfs_fini(adev); 4208 if (adev->ucode_sysfs_en) 4209 amdgpu_ucode_sysfs_fini(adev); 4210 if (adev->psp_sysfs_en) 4211 amdgpu_psp_sysfs_fini(adev); 4212 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4213 4214 /* disable ras feature must before hw fini */ 4215 amdgpu_ras_pre_fini(adev); 4216 4217 amdgpu_device_ip_fini_early(adev); 4218 4219 amdgpu_irq_fini_hw(adev); 4220 4221 if (adev->mman.initialized) 4222 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4223 4224 amdgpu_gart_dummy_page_fini(adev); 4225 4226 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4227 amdgpu_device_unmap_mmio(adev); 4228 4229 } 4230 4231 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4232 { 4233 int idx; 4234 bool px; 4235 4236 amdgpu_fence_driver_sw_fini(adev); 4237 amdgpu_device_ip_fini(adev); 4238 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4239 adev->accel_working = false; 4240 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4241 4242 amdgpu_reset_fini(adev); 4243 4244 /* free i2c buses */ 4245 if (!amdgpu_device_has_dc_support(adev)) 4246 amdgpu_i2c_fini(adev); 4247 4248 if (amdgpu_emu_mode != 1) 4249 amdgpu_atombios_fini(adev); 4250 4251 kfree(adev->bios); 4252 adev->bios = NULL; 4253 4254 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4255 4256 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 4257 apple_gmux_detect(NULL, NULL))) 4258 vga_switcheroo_unregister_client(adev->pdev); 4259 4260 if (px) 4261 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4262 4263 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4264 vga_client_unregister(adev->pdev); 4265 4266 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4267 4268 iounmap(adev->rmmio); 4269 adev->rmmio = NULL; 4270 amdgpu_device_doorbell_fini(adev); 4271 drm_dev_exit(idx); 4272 } 4273 4274 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4275 amdgpu_pmu_fini(adev); 4276 if (adev->mman.discovery_bin) 4277 amdgpu_discovery_fini(adev); 4278 4279 amdgpu_reset_put_reset_domain(adev->reset_domain); 4280 adev->reset_domain = NULL; 4281 4282 kfree(adev->pci_state); 4283 4284 } 4285 4286 /** 4287 * amdgpu_device_evict_resources - evict device resources 4288 * @adev: amdgpu device object 4289 * 4290 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4291 * of the vram memory type. Mainly used for evicting device resources 4292 * at suspend time. 
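 *
 * Illustrative use in the suspend path (amdgpu_device_suspend() below calls
 * this twice, before and after phase-1 IP suspend):
 *
 *   r = amdgpu_device_evict_resources(adev);
 *   if (r)
 *           return r;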
4293 * 4294 */ 4295 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4296 { 4297 int ret; 4298 4299 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4300 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4301 return 0; 4302 4303 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4304 if (ret) 4305 DRM_WARN("evicting device resources failed\n"); 4306 return ret; 4307 } 4308 4309 /* 4310 * Suspend & resume. 4311 */ 4312 /** 4313 * amdgpu_device_suspend - initiate device suspend 4314 * 4315 * @dev: drm dev pointer 4316 * @fbcon : notify the fbdev of suspend 4317 * 4318 * Puts the hw in the suspend state (all asics). 4319 * Returns 0 for success or an error on failure. 4320 * Called at driver suspend. 4321 */ 4322 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4323 { 4324 struct amdgpu_device *adev = drm_to_adev(dev); 4325 int r = 0; 4326 4327 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4328 return 0; 4329 4330 adev->in_suspend = true; 4331 4332 /* Evict the majority of BOs before grabbing the full access */ 4333 r = amdgpu_device_evict_resources(adev); 4334 if (r) 4335 return r; 4336 4337 if (amdgpu_sriov_vf(adev)) { 4338 amdgpu_virt_fini_data_exchange(adev); 4339 r = amdgpu_virt_request_full_gpu(adev, false); 4340 if (r) 4341 return r; 4342 } 4343 4344 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4345 DRM_WARN("smart shift update failed\n"); 4346 4347 if (fbcon) 4348 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4349 4350 cancel_delayed_work_sync(&adev->delayed_init_work); 4351 4352 amdgpu_ras_suspend(adev); 4353 4354 amdgpu_device_ip_suspend_phase1(adev); 4355 4356 if (!adev->in_s0ix) 4357 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4358 4359 r = amdgpu_device_evict_resources(adev); 4360 if (r) 4361 return r; 4362 4363 amdgpu_fence_driver_hw_fini(adev); 4364 4365 amdgpu_device_ip_suspend_phase2(adev); 4366 4367 if (amdgpu_sriov_vf(adev)) 4368 amdgpu_virt_release_full_gpu(adev, false); 4369 4370 return 0; 4371 } 4372 4373 /** 4374 * amdgpu_device_resume - initiate device resume 4375 * 4376 * @dev: drm dev pointer 4377 * @fbcon : notify the fbdev of resume 4378 * 4379 * Bring the hw back to operating state (all asics). 4380 * Returns 0 for success or an error on failure. 4381 * Called at driver resume. 
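 *
 * A minimal caller sketch (illustrative; the real callers are the
 * dev_pm_ops / runtime-pm hooks in amdgpu_drv.c):
 *
 *   r = amdgpu_device_resume(drm_dev, true);
 *   if (r)
 *           return r;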
4382 */ 4383 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4384 { 4385 struct amdgpu_device *adev = drm_to_adev(dev); 4386 int r = 0; 4387 4388 if (amdgpu_sriov_vf(adev)) { 4389 r = amdgpu_virt_request_full_gpu(adev, true); 4390 if (r) 4391 return r; 4392 } 4393 4394 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4395 return 0; 4396 4397 if (adev->in_s0ix) 4398 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4399 4400 /* post card */ 4401 if (amdgpu_device_need_post(adev)) { 4402 r = amdgpu_device_asic_init(adev); 4403 if (r) 4404 dev_err(adev->dev, "amdgpu asic init failed\n"); 4405 } 4406 4407 r = amdgpu_device_ip_resume(adev); 4408 4409 if (r) { 4410 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4411 goto exit; 4412 } 4413 amdgpu_fence_driver_hw_init(adev); 4414 4415 r = amdgpu_device_ip_late_init(adev); 4416 if (r) 4417 goto exit; 4418 4419 queue_delayed_work(system_wq, &adev->delayed_init_work, 4420 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4421 4422 if (!adev->in_s0ix) { 4423 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4424 if (r) 4425 goto exit; 4426 } 4427 4428 exit: 4429 if (amdgpu_sriov_vf(adev)) { 4430 amdgpu_virt_init_data_exchange(adev); 4431 amdgpu_virt_release_full_gpu(adev, true); 4432 } 4433 4434 if (r) 4435 return r; 4436 4437 /* Make sure IB tests flushed */ 4438 flush_delayed_work(&adev->delayed_init_work); 4439 4440 if (fbcon) 4441 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4442 4443 amdgpu_ras_resume(adev); 4444 4445 if (adev->mode_info.num_crtc) { 4446 /* 4447 * Most of the connector probing functions try to acquire runtime pm 4448 * refs to ensure that the GPU is powered on when connector polling is 4449 * performed. Since we're calling this from a runtime PM callback, 4450 * trying to acquire rpm refs will cause us to deadlock. 4451 * 4452 * Since we're guaranteed to be holding the rpm lock, it's safe to 4453 * temporarily disable the rpm helpers so this doesn't deadlock us. 4454 */ 4455 #ifdef CONFIG_PM 4456 dev->dev->power.disable_depth++; 4457 #endif 4458 if (!adev->dc_enabled) 4459 drm_helper_hpd_irq_event(dev); 4460 else 4461 drm_kms_helper_hotplug_event(dev); 4462 #ifdef CONFIG_PM 4463 dev->dev->power.disable_depth--; 4464 #endif 4465 } 4466 adev->in_suspend = false; 4467 4468 if (adev->enable_mes) 4469 amdgpu_mes_self_test(adev); 4470 4471 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4472 DRM_WARN("smart shift update failed\n"); 4473 4474 return 0; 4475 } 4476 4477 /** 4478 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4479 * 4480 * @adev: amdgpu_device pointer 4481 * 4482 * The list of all the hardware IPs that make up the asic is walked and 4483 * the check_soft_reset callbacks are run. check_soft_reset determines 4484 * if the asic is still hung or not. 4485 * Returns true if any of the IPs are still in a hung state, false if not. 
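 * As a side effect, each block's status.hang flag is updated; the
 * pre/soft/post soft-reset walkers below rely on that flag.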
4486 */ 4487 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4488 { 4489 int i; 4490 bool asic_hang = false; 4491 4492 if (amdgpu_sriov_vf(adev)) 4493 return true; 4494 4495 if (amdgpu_asic_need_full_reset(adev)) 4496 return true; 4497 4498 for (i = 0; i < adev->num_ip_blocks; i++) { 4499 if (!adev->ip_blocks[i].status.valid) 4500 continue; 4501 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4502 adev->ip_blocks[i].status.hang = 4503 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4504 if (adev->ip_blocks[i].status.hang) { 4505 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4506 asic_hang = true; 4507 } 4508 } 4509 return asic_hang; 4510 } 4511 4512 /** 4513 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4514 * 4515 * @adev: amdgpu_device pointer 4516 * 4517 * The list of all the hardware IPs that make up the asic is walked and the 4518 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4519 * handles any IP specific hardware or software state changes that are 4520 * necessary for a soft reset to succeed. 4521 * Returns 0 on success, negative error code on failure. 4522 */ 4523 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4524 { 4525 int i, r = 0; 4526 4527 for (i = 0; i < adev->num_ip_blocks; i++) { 4528 if (!adev->ip_blocks[i].status.valid) 4529 continue; 4530 if (adev->ip_blocks[i].status.hang && 4531 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4532 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4533 if (r) 4534 return r; 4535 } 4536 } 4537 4538 return 0; 4539 } 4540 4541 /** 4542 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4543 * 4544 * @adev: amdgpu_device pointer 4545 * 4546 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4547 * reset is necessary to recover. 4548 * Returns true if a full asic reset is required, false if not. 4549 */ 4550 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4551 { 4552 int i; 4553 4554 if (amdgpu_asic_need_full_reset(adev)) 4555 return true; 4556 4557 for (i = 0; i < adev->num_ip_blocks; i++) { 4558 if (!adev->ip_blocks[i].status.valid) 4559 continue; 4560 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4561 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4562 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4563 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4564 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4565 if (adev->ip_blocks[i].status.hang) { 4566 dev_info(adev->dev, "Some block need full reset!\n"); 4567 return true; 4568 } 4569 } 4570 } 4571 return false; 4572 } 4573 4574 /** 4575 * amdgpu_device_ip_soft_reset - do a soft reset 4576 * 4577 * @adev: amdgpu_device pointer 4578 * 4579 * The list of all the hardware IPs that make up the asic is walked and the 4580 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4581 * IP specific hardware or software state changes that are necessary to soft 4582 * reset the IP. 4583 * Returns 0 on success, negative error code on failure. 
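 *
 * Together with the helpers around it, the usual sequence (see
 * amdgpu_device_pre_asic_reset() below) is roughly:
 *
 *   if (amdgpu_device_ip_check_soft_reset(adev)) {
 *           amdgpu_device_ip_pre_soft_reset(adev);
 *           amdgpu_device_ip_soft_reset(adev);
 *           amdgpu_device_ip_post_soft_reset(adev);
 *   }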
4584 */ 4585 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4586 { 4587 int i, r = 0; 4588 4589 for (i = 0; i < adev->num_ip_blocks; i++) { 4590 if (!adev->ip_blocks[i].status.valid) 4591 continue; 4592 if (adev->ip_blocks[i].status.hang && 4593 adev->ip_blocks[i].version->funcs->soft_reset) { 4594 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4595 if (r) 4596 return r; 4597 } 4598 } 4599 4600 return 0; 4601 } 4602 4603 /** 4604 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4605 * 4606 * @adev: amdgpu_device pointer 4607 * 4608 * The list of all the hardware IPs that make up the asic is walked and the 4609 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4610 * handles any IP specific hardware or software state changes that are 4611 * necessary after the IP has been soft reset. 4612 * Returns 0 on success, negative error code on failure. 4613 */ 4614 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4615 { 4616 int i, r = 0; 4617 4618 for (i = 0; i < adev->num_ip_blocks; i++) { 4619 if (!adev->ip_blocks[i].status.valid) 4620 continue; 4621 if (adev->ip_blocks[i].status.hang && 4622 adev->ip_blocks[i].version->funcs->post_soft_reset) 4623 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4624 if (r) 4625 return r; 4626 } 4627 4628 return 0; 4629 } 4630 4631 /** 4632 * amdgpu_device_recover_vram - Recover some VRAM contents 4633 * 4634 * @adev: amdgpu_device pointer 4635 * 4636 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4637 * restore things like GPUVM page tables after a GPU reset where 4638 * the contents of VRAM might be lost. 4639 * 4640 * Returns: 4641 * 0 on success, negative error code on failure. 4642 */ 4643 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4644 { 4645 struct dma_fence *fence = NULL, *next = NULL; 4646 struct amdgpu_bo *shadow; 4647 struct amdgpu_bo_vm *vmbo; 4648 long r = 1, tmo; 4649 4650 if (amdgpu_sriov_runtime(adev)) 4651 tmo = msecs_to_jiffies(8000); 4652 else 4653 tmo = msecs_to_jiffies(100); 4654 4655 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4656 mutex_lock(&adev->shadow_list_lock); 4657 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4658 /* If vm is compute context or adev is APU, shadow will be NULL */ 4659 if (!vmbo->shadow) 4660 continue; 4661 shadow = vmbo->shadow; 4662 4663 /* No need to recover an evicted BO */ 4664 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4665 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4666 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4667 continue; 4668 4669 r = amdgpu_bo_restore_shadow(shadow, &next); 4670 if (r) 4671 break; 4672 4673 if (fence) { 4674 tmo = dma_fence_wait_timeout(fence, false, tmo); 4675 dma_fence_put(fence); 4676 fence = next; 4677 if (tmo == 0) { 4678 r = -ETIMEDOUT; 4679 break; 4680 } else if (tmo < 0) { 4681 r = tmo; 4682 break; 4683 } 4684 } else { 4685 fence = next; 4686 } 4687 } 4688 mutex_unlock(&adev->shadow_list_lock); 4689 4690 if (fence) 4691 tmo = dma_fence_wait_timeout(fence, false, tmo); 4692 dma_fence_put(fence); 4693 4694 if (r < 0 || tmo <= 0) { 4695 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4696 return -EIO; 4697 } 4698 4699 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4700 return 0; 4701 } 4702 4703 4704 /** 4705 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4706 * 4707 * @adev: amdgpu_device pointer 4708 * 
@from_hypervisor: request from hypervisor 4709 * 4710 * do VF FLR and reinitialize Asic 4711 * return 0 means succeeded otherwise failed 4712 */ 4713 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4714 bool from_hypervisor) 4715 { 4716 int r; 4717 struct amdgpu_hive_info *hive = NULL; 4718 int retry_limit = 0; 4719 4720 retry: 4721 amdgpu_amdkfd_pre_reset(adev); 4722 4723 if (from_hypervisor) 4724 r = amdgpu_virt_request_full_gpu(adev, true); 4725 else 4726 r = amdgpu_virt_reset_gpu(adev); 4727 if (r) 4728 return r; 4729 4730 /* Resume IP prior to SMC */ 4731 r = amdgpu_device_ip_reinit_early_sriov(adev); 4732 if (r) 4733 goto error; 4734 4735 amdgpu_virt_init_data_exchange(adev); 4736 4737 r = amdgpu_device_fw_loading(adev); 4738 if (r) 4739 return r; 4740 4741 /* now we are okay to resume SMC/CP/SDMA */ 4742 r = amdgpu_device_ip_reinit_late_sriov(adev); 4743 if (r) 4744 goto error; 4745 4746 hive = amdgpu_get_xgmi_hive(adev); 4747 /* Update PSP FW topology after reset */ 4748 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4749 r = amdgpu_xgmi_update_topology(hive, adev); 4750 4751 if (hive) 4752 amdgpu_put_xgmi_hive(hive); 4753 4754 if (!r) { 4755 amdgpu_irq_gpu_reset_resume_helper(adev); 4756 r = amdgpu_ib_ring_tests(adev); 4757 4758 amdgpu_amdkfd_post_reset(adev); 4759 } 4760 4761 error: 4762 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4763 amdgpu_inc_vram_lost(adev); 4764 r = amdgpu_device_recover_vram(adev); 4765 } 4766 amdgpu_virt_release_full_gpu(adev, true); 4767 4768 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4769 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4770 retry_limit++; 4771 goto retry; 4772 } else 4773 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4774 } 4775 4776 return r; 4777 } 4778 4779 /** 4780 * amdgpu_device_has_job_running - check if there is any job in mirror list 4781 * 4782 * @adev: amdgpu_device pointer 4783 * 4784 * check if there is any job in mirror list 4785 */ 4786 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4787 { 4788 int i; 4789 struct drm_sched_job *job; 4790 4791 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4792 struct amdgpu_ring *ring = adev->rings[i]; 4793 4794 if (!ring || !ring->sched.thread) 4795 continue; 4796 4797 spin_lock(&ring->sched.job_list_lock); 4798 job = list_first_entry_or_null(&ring->sched.pending_list, 4799 struct drm_sched_job, list); 4800 spin_unlock(&ring->sched.job_list_lock); 4801 if (job) 4802 return true; 4803 } 4804 return false; 4805 } 4806 4807 /** 4808 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4809 * 4810 * @adev: amdgpu_device pointer 4811 * 4812 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4813 * a hung GPU. 
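 * (amdgpu_gpu_recovery module parameter: 0 disables recovery, -1 leaves the
 * decision to the per-ASIC defaults handled below, any other value enables it.)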
4814 */ 4815 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4816 { 4817 4818 if (amdgpu_gpu_recovery == 0) 4819 goto disabled; 4820 4821 /* Skip soft reset check in fatal error mode */ 4822 if (!amdgpu_ras_is_poison_mode_supported(adev)) 4823 return true; 4824 4825 if (amdgpu_sriov_vf(adev)) 4826 return true; 4827 4828 if (amdgpu_gpu_recovery == -1) { 4829 switch (adev->asic_type) { 4830 #ifdef CONFIG_DRM_AMDGPU_SI 4831 case CHIP_VERDE: 4832 case CHIP_TAHITI: 4833 case CHIP_PITCAIRN: 4834 case CHIP_OLAND: 4835 case CHIP_HAINAN: 4836 #endif 4837 #ifdef CONFIG_DRM_AMDGPU_CIK 4838 case CHIP_KAVERI: 4839 case CHIP_KABINI: 4840 case CHIP_MULLINS: 4841 #endif 4842 case CHIP_CARRIZO: 4843 case CHIP_STONEY: 4844 case CHIP_CYAN_SKILLFISH: 4845 goto disabled; 4846 default: 4847 break; 4848 } 4849 } 4850 4851 return true; 4852 4853 disabled: 4854 dev_info(adev->dev, "GPU recovery disabled.\n"); 4855 return false; 4856 } 4857 4858 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4859 { 4860 u32 i; 4861 int ret = 0; 4862 4863 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4864 4865 dev_info(adev->dev, "GPU mode1 reset\n"); 4866 4867 /* disable BM */ 4868 pci_clear_master(adev->pdev); 4869 4870 amdgpu_device_cache_pci_state(adev->pdev); 4871 4872 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4873 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4874 ret = amdgpu_dpm_mode1_reset(adev); 4875 } else { 4876 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4877 ret = psp_gpu_reset(adev); 4878 } 4879 4880 if (ret) 4881 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4882 4883 amdgpu_device_load_pci_state(adev->pdev); 4884 4885 /* wait for asic to come out of reset */ 4886 for (i = 0; i < adev->usec_timeout; i++) { 4887 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4888 4889 if (memsize != 0xffffffff) 4890 break; 4891 udelay(1); 4892 } 4893 4894 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4895 return ret; 4896 } 4897 4898 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4899 struct amdgpu_reset_context *reset_context) 4900 { 4901 int i, r = 0; 4902 struct amdgpu_job *job = NULL; 4903 bool need_full_reset = 4904 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4905 4906 if (reset_context->reset_req_dev == adev) 4907 job = reset_context->job; 4908 4909 if (amdgpu_sriov_vf(adev)) { 4910 /* stop the data exchange thread */ 4911 amdgpu_virt_fini_data_exchange(adev); 4912 } 4913 4914 amdgpu_fence_driver_isr_toggle(adev, true); 4915 4916 /* block all schedulers and reset given job's ring */ 4917 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4918 struct amdgpu_ring *ring = adev->rings[i]; 4919 4920 if (!ring || !ring->sched.thread) 4921 continue; 4922 4923 /*clear job fence from fence drv to avoid force_completion 4924 *leave NULL and vm flush fence in fence drv */ 4925 amdgpu_fence_driver_clear_job_fences(ring); 4926 4927 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4928 amdgpu_fence_driver_force_completion(ring); 4929 } 4930 4931 amdgpu_fence_driver_isr_toggle(adev, false); 4932 4933 if (job && job->vm) 4934 drm_sched_increase_karma(&job->base); 4935 4936 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4937 /* If reset handler not implemented, continue; otherwise return */ 4938 if (r == -ENOSYS) 4939 r = 0; 4940 else 4941 return r; 4942 4943 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4944 if (!amdgpu_sriov_vf(adev)) { 4945 4946 if (!need_full_reset) 4947 need_full_reset = 
amdgpu_device_ip_need_full_reset(adev); 4948 4949 if (!need_full_reset && amdgpu_gpu_recovery && 4950 amdgpu_device_ip_check_soft_reset(adev)) { 4951 amdgpu_device_ip_pre_soft_reset(adev); 4952 r = amdgpu_device_ip_soft_reset(adev); 4953 amdgpu_device_ip_post_soft_reset(adev); 4954 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4955 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4956 need_full_reset = true; 4957 } 4958 } 4959 4960 if (need_full_reset) 4961 r = amdgpu_device_ip_suspend(adev); 4962 if (need_full_reset) 4963 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4964 else 4965 clear_bit(AMDGPU_NEED_FULL_RESET, 4966 &reset_context->flags); 4967 } 4968 4969 return r; 4970 } 4971 4972 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 4973 { 4974 int i; 4975 4976 lockdep_assert_held(&adev->reset_domain->sem); 4977 4978 for (i = 0; i < adev->num_regs; i++) { 4979 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); 4980 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], 4981 adev->reset_dump_reg_value[i]); 4982 } 4983 4984 return 0; 4985 } 4986 4987 #ifdef CONFIG_DEV_COREDUMP 4988 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, 4989 size_t count, void *data, size_t datalen) 4990 { 4991 struct drm_printer p; 4992 struct amdgpu_device *adev = data; 4993 struct drm_print_iterator iter; 4994 int i; 4995 4996 iter.data = buffer; 4997 iter.offset = 0; 4998 iter.start = offset; 4999 iter.remain = count; 5000 5001 p = drm_coredump_printer(&iter); 5002 5003 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 5004 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 5005 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 5006 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec); 5007 if (adev->reset_task_info.pid) 5008 drm_printf(&p, "process_name: %s PID: %d\n", 5009 adev->reset_task_info.process_name, 5010 adev->reset_task_info.pid); 5011 5012 if (adev->reset_vram_lost) 5013 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 5014 if (adev->num_regs) { 5015 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); 5016 5017 for (i = 0; i < adev->num_regs; i++) 5018 drm_printf(&p, "0x%08x: 0x%08x\n", 5019 adev->reset_dump_reg_list[i], 5020 adev->reset_dump_reg_value[i]); 5021 } 5022 5023 return count - iter.remain; 5024 } 5025 5026 static void amdgpu_devcoredump_free(void *data) 5027 { 5028 } 5029 5030 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) 5031 { 5032 struct drm_device *dev = adev_to_drm(adev); 5033 5034 ktime_get_ts64(&adev->reset_time); 5035 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL, 5036 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 5037 } 5038 #endif 5039 5040 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5041 struct amdgpu_reset_context *reset_context) 5042 { 5043 struct amdgpu_device *tmp_adev = NULL; 5044 bool need_full_reset, skip_hw_reset, vram_lost = false; 5045 int r = 0; 5046 bool gpu_reset_for_dev_remove = 0; 5047 5048 /* Try reset handler method first */ 5049 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5050 reset_list); 5051 amdgpu_reset_reg_dumps(tmp_adev); 5052 5053 reset_context->reset_device_list = device_list_handle; 5054 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5055 /* If reset handler not implemented, continue; otherwise return */ 5056 if (r == -ENOSYS) 5057 r = 0; 5058 else 5059 return r; 5060 5061 /* Reset handler not implemented, use the 
default method */ 5062 need_full_reset = 5063 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5064 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5065 5066 gpu_reset_for_dev_remove = 5067 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5068 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5069 5070 /* 5071 * ASIC reset has to be done on all XGMI hive nodes ASAP 5072 * to allow proper links negotiation in FW (within 1 sec) 5073 */ 5074 if (!skip_hw_reset && need_full_reset) { 5075 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5076 /* For XGMI run all resets in parallel to speed up the process */ 5077 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5078 tmp_adev->gmc.xgmi.pending_reset = false; 5079 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 5080 r = -EALREADY; 5081 } else 5082 r = amdgpu_asic_reset(tmp_adev); 5083 5084 if (r) { 5085 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 5086 r, adev_to_drm(tmp_adev)->unique); 5087 break; 5088 } 5089 } 5090 5091 /* For XGMI wait for all resets to complete before proceed */ 5092 if (!r) { 5093 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5094 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5095 flush_work(&tmp_adev->xgmi_reset_work); 5096 r = tmp_adev->asic_reset_res; 5097 if (r) 5098 break; 5099 } 5100 } 5101 } 5102 } 5103 5104 if (!r && amdgpu_ras_intr_triggered()) { 5105 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5106 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && 5107 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 5108 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 5109 } 5110 5111 amdgpu_ras_intr_cleared(); 5112 } 5113 5114 /* Since the mode1 reset affects base ip blocks, the 5115 * phase1 ip blocks need to be resumed. Otherwise there 5116 * will be a BIOS signature error and the psp bootloader 5117 * can't load kdb on the next amdgpu install. 
5118 */ 5119 if (gpu_reset_for_dev_remove) { 5120 list_for_each_entry(tmp_adev, device_list_handle, reset_list) 5121 amdgpu_device_ip_resume_phase1(tmp_adev); 5122 5123 goto end; 5124 } 5125 5126 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5127 if (need_full_reset) { 5128 /* post card */ 5129 r = amdgpu_device_asic_init(tmp_adev); 5130 if (r) { 5131 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5132 } else { 5133 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5134 r = amdgpu_amdkfd_resume_iommu(tmp_adev); 5135 if (r) 5136 goto out; 5137 5138 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5139 if (r) 5140 goto out; 5141 5142 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5143 #ifdef CONFIG_DEV_COREDUMP 5144 tmp_adev->reset_vram_lost = vram_lost; 5145 memset(&tmp_adev->reset_task_info, 0, 5146 sizeof(tmp_adev->reset_task_info)); 5147 if (reset_context->job && reset_context->job->vm) 5148 tmp_adev->reset_task_info = 5149 reset_context->job->vm->task_info; 5150 amdgpu_reset_capture_coredumpm(tmp_adev); 5151 #endif 5152 if (vram_lost) { 5153 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5154 amdgpu_inc_vram_lost(tmp_adev); 5155 } 5156 5157 r = amdgpu_device_fw_loading(tmp_adev); 5158 if (r) 5159 return r; 5160 5161 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5162 if (r) 5163 goto out; 5164 5165 if (vram_lost) 5166 amdgpu_device_fill_reset_magic(tmp_adev); 5167 5168 /* 5169 * Add this ASIC as tracked as reset was already 5170 * complete successfully. 5171 */ 5172 amdgpu_register_gpu_instance(tmp_adev); 5173 5174 if (!reset_context->hive && 5175 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5176 amdgpu_xgmi_add_device(tmp_adev); 5177 5178 r = amdgpu_device_ip_late_init(tmp_adev); 5179 if (r) 5180 goto out; 5181 5182 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5183 5184 /* 5185 * The GPU enters bad state once faulty pages 5186 * by ECC has reached the threshold, and ras 5187 * recovery is scheduled next. So add one check 5188 * here to break recovery if it indeed exceeds 5189 * bad page threshold, and remind user to 5190 * retire this GPU or setting one bigger 5191 * bad_page_threshold value to fix this once 5192 * probing driver again. 5193 */ 5194 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5195 /* must succeed. 
*/ 5196 amdgpu_ras_resume(tmp_adev); 5197 } else { 5198 r = -EINVAL; 5199 goto out; 5200 } 5201 5202 /* Update PSP FW topology after reset */ 5203 if (reset_context->hive && 5204 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5205 r = amdgpu_xgmi_update_topology( 5206 reset_context->hive, tmp_adev); 5207 } 5208 } 5209 5210 out: 5211 if (!r) { 5212 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5213 r = amdgpu_ib_ring_tests(tmp_adev); 5214 if (r) { 5215 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5216 need_full_reset = true; 5217 r = -EAGAIN; 5218 goto end; 5219 } 5220 } 5221 5222 if (!r) 5223 r = amdgpu_device_recover_vram(tmp_adev); 5224 else 5225 tmp_adev->asic_reset_res = r; 5226 } 5227 5228 end: 5229 if (need_full_reset) 5230 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5231 else 5232 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5233 return r; 5234 } 5235 5236 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5237 { 5238 5239 switch (amdgpu_asic_reset_method(adev)) { 5240 case AMD_RESET_METHOD_MODE1: 5241 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5242 break; 5243 case AMD_RESET_METHOD_MODE2: 5244 adev->mp1_state = PP_MP1_STATE_RESET; 5245 break; 5246 default: 5247 adev->mp1_state = PP_MP1_STATE_NONE; 5248 break; 5249 } 5250 } 5251 5252 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5253 { 5254 amdgpu_vf_error_trans_all(adev); 5255 adev->mp1_state = PP_MP1_STATE_NONE; 5256 } 5257 5258 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5259 { 5260 struct pci_dev *p = NULL; 5261 5262 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5263 adev->pdev->bus->number, 1); 5264 if (p) { 5265 pm_runtime_enable(&(p->dev)); 5266 pm_runtime_resume(&(p->dev)); 5267 } 5268 5269 pci_dev_put(p); 5270 } 5271 5272 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5273 { 5274 enum amd_reset_method reset_method; 5275 struct pci_dev *p = NULL; 5276 u64 expires; 5277 5278 /* 5279 * For now, only BACO and mode1 reset are confirmed 5280 * to suffer the audio issue without proper suspended. 5281 */ 5282 reset_method = amdgpu_asic_reset_method(adev); 5283 if ((reset_method != AMD_RESET_METHOD_BACO) && 5284 (reset_method != AMD_RESET_METHOD_MODE1)) 5285 return -EINVAL; 5286 5287 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5288 adev->pdev->bus->number, 1); 5289 if (!p) 5290 return -ENODEV; 5291 5292 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5293 if (!expires) 5294 /* 5295 * If we cannot get the audio device autosuspend delay, 5296 * a fixed 4S interval will be used. Considering 3S is 5297 * the audio controller default autosuspend delay setting. 5298 * 4S used here is guaranteed to cover that. 5299 */ 5300 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5301 5302 while (!pm_runtime_status_suspended(&(p->dev))) { 5303 if (!pm_runtime_suspend(&(p->dev))) 5304 break; 5305 5306 if (expires < ktime_get_mono_fast_ns()) { 5307 dev_warn(adev->dev, "failed to suspend display audio\n"); 5308 pci_dev_put(p); 5309 /* TODO: abort the succeeding gpu reset? 
*/ 5310 return -ETIMEDOUT; 5311 } 5312 } 5313 5314 pm_runtime_disable(&(p->dev)); 5315 5316 pci_dev_put(p); 5317 return 0; 5318 } 5319 5320 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5321 { 5322 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5323 5324 #if defined(CONFIG_DEBUG_FS) 5325 if (!amdgpu_sriov_vf(adev)) 5326 cancel_work(&adev->reset_work); 5327 #endif 5328 5329 if (adev->kfd.dev) 5330 cancel_work(&adev->kfd.reset_work); 5331 5332 if (amdgpu_sriov_vf(adev)) 5333 cancel_work(&adev->virt.flr_work); 5334 5335 if (con && adev->ras_enabled) 5336 cancel_work(&con->recovery_work); 5337 5338 } 5339 5340 /** 5341 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5342 * 5343 * @adev: amdgpu_device pointer 5344 * @job: which job trigger hang 5345 * @reset_context: amdgpu reset context pointer 5346 * 5347 * Attempt to reset the GPU if it has hung (all asics). 5348 * Attempt to do soft-reset or full-reset and reinitialize Asic 5349 * Returns 0 for success or an error on failure. 5350 */ 5351 5352 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5353 struct amdgpu_job *job, 5354 struct amdgpu_reset_context *reset_context) 5355 { 5356 struct list_head device_list, *device_list_handle = NULL; 5357 bool job_signaled = false; 5358 struct amdgpu_hive_info *hive = NULL; 5359 struct amdgpu_device *tmp_adev = NULL; 5360 int i, r = 0; 5361 bool need_emergency_restart = false; 5362 bool audio_suspended = false; 5363 bool gpu_reset_for_dev_remove = false; 5364 5365 gpu_reset_for_dev_remove = 5366 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5367 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5368 5369 /* 5370 * Special case: RAS triggered and full reset isn't supported 5371 */ 5372 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5373 5374 /* 5375 * Flush RAM to disk so that after reboot 5376 * the user can read log and see why the system rebooted. 5377 */ 5378 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5379 DRM_WARN("Emergency reboot."); 5380 5381 ksys_sync_helper(); 5382 emergency_restart(); 5383 } 5384 5385 dev_info(adev->dev, "GPU %s begin!\n", 5386 need_emergency_restart ? "jobs stop":"reset"); 5387 5388 if (!amdgpu_sriov_vf(adev)) 5389 hive = amdgpu_get_xgmi_hive(adev); 5390 if (hive) 5391 mutex_lock(&hive->hive_lock); 5392 5393 reset_context->job = job; 5394 reset_context->hive = hive; 5395 /* 5396 * Build list of devices to reset. 5397 * In case we are in XGMI hive mode, resort the device list 5398 * to put adev in the 1st position. 
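	 * (list_rotate_to_front() below makes the device that triggered the
	 * reset the first entry, and that first entry is the one used to
	 * lock and later unlock the reset domain.)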
5399 */ 5400 INIT_LIST_HEAD(&device_list); 5401 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5402 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5403 list_add_tail(&tmp_adev->reset_list, &device_list); 5404 if (gpu_reset_for_dev_remove && adev->shutdown) 5405 tmp_adev->shutdown = true; 5406 } 5407 if (!list_is_first(&adev->reset_list, &device_list)) 5408 list_rotate_to_front(&adev->reset_list, &device_list); 5409 device_list_handle = &device_list; 5410 } else { 5411 list_add_tail(&adev->reset_list, &device_list); 5412 device_list_handle = &device_list; 5413 } 5414 5415 /* We need to lock reset domain only once both for XGMI and single device */ 5416 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5417 reset_list); 5418 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5419 5420 /* block all schedulers and reset given job's ring */ 5421 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5422 5423 amdgpu_device_set_mp1_state(tmp_adev); 5424 5425 /* 5426 * Try to put the audio codec into suspend state 5427 * before gpu reset started. 5428 * 5429 * Due to the power domain of the graphics device 5430 * is shared with AZ power domain. Without this, 5431 * we may change the audio hardware from behind 5432 * the audio driver's back. That will trigger 5433 * some audio codec errors. 5434 */ 5435 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5436 audio_suspended = true; 5437 5438 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5439 5440 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5441 5442 if (!amdgpu_sriov_vf(tmp_adev)) 5443 amdgpu_amdkfd_pre_reset(tmp_adev); 5444 5445 /* 5446 * Mark these ASICs to be reseted as untracked first 5447 * And add them back after reset completed 5448 */ 5449 amdgpu_unregister_gpu_instance(tmp_adev); 5450 5451 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5452 5453 /* disable ras on ALL IPs */ 5454 if (!need_emergency_restart && 5455 amdgpu_device_ip_need_full_reset(tmp_adev)) 5456 amdgpu_ras_suspend(tmp_adev); 5457 5458 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5459 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5460 5461 if (!ring || !ring->sched.thread) 5462 continue; 5463 5464 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5465 5466 if (need_emergency_restart) 5467 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5468 } 5469 atomic_inc(&tmp_adev->gpu_reset_counter); 5470 } 5471 5472 if (need_emergency_restart) 5473 goto skip_sched_resume; 5474 5475 /* 5476 * Must check guilty signal here since after this point all old 5477 * HW fences are force signaled. 5478 * 5479 * job->base holds a reference to parent fence 5480 */ 5481 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5482 job_signaled = true; 5483 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5484 goto skip_hw_reset; 5485 } 5486 5487 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5488 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5489 if (gpu_reset_for_dev_remove) { 5490 /* Workaroud for ASICs need to disable SMC first */ 5491 amdgpu_device_smu_fini_early(tmp_adev); 5492 } 5493 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5494 /*TODO Should we stop ?*/ 5495 if (r) { 5496 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5497 r, adev_to_drm(tmp_adev)->unique); 5498 tmp_adev->asic_reset_res = r; 5499 } 5500 5501 /* 5502 * Drop all pending non scheduler resets. 
Scheduler resets 5503 * were already dropped during drm_sched_stop 5504 */ 5505 amdgpu_device_stop_pending_resets(tmp_adev); 5506 } 5507 5508 /* Actual ASIC resets if needed. */ 5509 /* Host driver will handle XGMI hive reset for SRIOV */ 5510 if (amdgpu_sriov_vf(adev)) { 5511 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5512 if (r) 5513 adev->asic_reset_res = r; 5514 5515 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so ras needs to be resumed during reset */ 5516 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) || 5517 adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3)) 5518 amdgpu_ras_resume(adev); 5519 } else { 5520 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5521 if (r && r == -EAGAIN) 5522 goto retry; 5523 5524 if (!r && gpu_reset_for_dev_remove) 5525 goto recover_end; 5526 } 5527 5528 skip_hw_reset: 5529 5530 /* Post ASIC reset for all devs. */ 5531 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5532 5533 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5534 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5535 5536 if (!ring || !ring->sched.thread) 5537 continue; 5538 5539 drm_sched_start(&ring->sched, true); 5540 } 5541 5542 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3)) 5543 amdgpu_mes_self_test(tmp_adev); 5544 5545 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) { 5546 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5547 } 5548 5549 if (tmp_adev->asic_reset_res) 5550 r = tmp_adev->asic_reset_res; 5551 5552 tmp_adev->asic_reset_res = 0; 5553 5554 if (r) { 5555 /* bad news, how to tell it to userspace? */ 5556 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5557 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5558 } else { 5559 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5560 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5561 DRM_WARN("smart shift update failed\n"); 5562 } 5563 } 5564 5565 skip_sched_resume: 5566 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5567 /* unlock kfd: SRIOV would do it separately */ 5568 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5569 amdgpu_amdkfd_post_reset(tmp_adev); 5570 5571 /* kfd_post_reset will do nothing if the kfd device is not initialized, 5572 * so bring up kfd here if it was not initialized before 5573 */ 5574 if (!adev->kfd.init_complete) 5575 amdgpu_amdkfd_device_init(adev); 5576 5577 if (audio_suspended) 5578 amdgpu_device_resume_display_audio(tmp_adev); 5579 5580 amdgpu_device_unset_mp1_state(tmp_adev); 5581 5582 amdgpu_ras_set_error_query_ready(tmp_adev, true); 5583 } 5584 5585 recover_end: 5586 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5587 reset_list); 5588 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5589 5590 if (hive) { 5591 mutex_unlock(&hive->hive_lock); 5592 amdgpu_put_xgmi_hive(hive); 5593 } 5594 5595 if (r) 5596 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5597 5598 atomic_set(&adev->reset_domain->reset_res, r); 5599 return r; 5600 } 5601 5602 /** 5603 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot 5604 * 5605 * @adev: amdgpu_device pointer 5606 * 5607 * Fetches and stores in the driver the PCIE capabilities (gen speed 5608 * and lanes) of the slot the device is in. Handles APUs and 5609 * virtualized environments where PCIE config space may not be available.
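 *
 * Note: the amdgpu_pcie_gen_cap and amdgpu_pcie_lane_cap module parameters,
 * when non-zero, override whatever is detected here (see the checks at the
 * top of the function).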
5610 */ 5611 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5612 { 5613 struct pci_dev *pdev; 5614 enum pci_bus_speed speed_cap, platform_speed_cap; 5615 enum pcie_link_width platform_link_width; 5616 5617 if (amdgpu_pcie_gen_cap) 5618 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5619 5620 if (amdgpu_pcie_lane_cap) 5621 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5622 5623 /* covers APUs as well */ 5624 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 5625 if (adev->pm.pcie_gen_mask == 0) 5626 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5627 if (adev->pm.pcie_mlw_mask == 0) 5628 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5629 return; 5630 } 5631 5632 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5633 return; 5634 5635 pcie_bandwidth_available(adev->pdev, NULL, 5636 &platform_speed_cap, &platform_link_width); 5637 5638 if (adev->pm.pcie_gen_mask == 0) { 5639 /* asic caps */ 5640 pdev = adev->pdev; 5641 speed_cap = pcie_get_speed_cap(pdev); 5642 if (speed_cap == PCI_SPEED_UNKNOWN) { 5643 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5644 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5645 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5646 } else { 5647 if (speed_cap == PCIE_SPEED_32_0GT) 5648 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5649 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5650 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5651 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5652 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5653 else if (speed_cap == PCIE_SPEED_16_0GT) 5654 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5655 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5656 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5657 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5658 else if (speed_cap == PCIE_SPEED_8_0GT) 5659 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5660 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5661 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5662 else if (speed_cap == PCIE_SPEED_5_0GT) 5663 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5664 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5665 else 5666 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5667 } 5668 /* platform caps */ 5669 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5670 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5671 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5672 } else { 5673 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5674 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5675 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5676 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5677 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5678 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5679 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5680 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5681 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5682 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5683 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5684 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5685 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5686 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5687 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5688 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5689 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5690 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5691 else 5692 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5693 5694 } 5695 } 5696 if (adev->pm.pcie_mlw_mask == 0) { 5697 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) 
/**
 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
 *
 * @adev: amdgpu_device pointer
 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
 *
 * Return true if @peer_adev can access (DMA) @adev through the PCIe
 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
 * @peer_adev.
 */
bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
				      struct amdgpu_device *peer_adev)
{
#ifdef CONFIG_HSA_AMD_P2P
	uint64_t address_mask = peer_adev->dev->dma_mask ?
		~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
	resource_size_t aper_limit =
		adev->gmc.aper_base + adev->gmc.aper_size - 1;
	bool p2p_access =
		!adev->gmc.xgmi.connected_to_cpu &&
		!(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);

	return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
		adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
		!(adev->gmc.aper_base & address_mask ||
		  aper_limit & address_mask));
#else
	return false;
#endif
}
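/*
 * Illustrative sketch (not part of the driver): a caller that wants one GPU
 * to DMA directly into another GPU's VRAM would typically gate the attempt
 * on this check and otherwise fall back to staging through system memory.
 * The snippet below is hypothetical usage only:
 *
 *	if (amdgpu_device_is_peer_accessible(adev, peer_adev)) {
 *		// peer_adev can reach adev's VRAM through the visible BAR
 *	} else {
 *		// fall back to copies through system memory
 *	}
 */
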
int amdgpu_device_baco_enter(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

	return amdgpu_dpm_baco_enter(adev);
}

int amdgpu_device_baco_exit(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	ret = amdgpu_dpm_baco_exit(adev);
	if (ret)
		return ret;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

	if (amdgpu_passthrough(adev) &&
	    adev->nbio.funcs->clear_doorbell_interrupt)
		adev->nbio.funcs->clear_doorbell_interrupt(adev);

	return 0;
}
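/*
 * Illustrative sketch (not part of this file): BACO entry/exit is expected
 * to be driven from power-management paths, paired so that every successful
 * amdgpu_device_baco_enter() is matched by an amdgpu_device_baco_exit() on
 * the way back up. Hypothetical usage:
 *
 *	r = amdgpu_device_baco_enter(dev);	// -ENOTSUPP if BACO is unsupported
 *	if (r)
 *		return r;
 *	// ... chip is powered off while the bus stays active ...
 *	r = amdgpu_device_baco_exit(dev);
 */
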
/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called when a PCI error is detected.
 *
 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		DRM_WARN("No support for XGMI hive yet...");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Locking adev->reset_domain->sem will prevent any external access
		 * to GPU during PCI error recovery
		 */
		amdgpu_device_lock_reset_domain(adev->reset_domain);
		amdgpu_device_set_mp1_state(adev);

		/*
		 * Block any work scheduling as we do for regular GPU reset
		 * for the duration of the recovery
		 */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, NULL);
		}
		atomic_inc(&adev->gpu_reset_counter);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{
	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/* This is called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, no need to reset slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}
/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the pci error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for asic to come out of reset */
	msleep(500);

	/* Restore PCI config space */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d\n", r);
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's
 * OK to resume normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		drm_sched_start(&ring->sched, true);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
}
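/*
 * Illustrative sketch (not part of this file): the four callbacks above are
 * meant to be wired into the PCI error-recovery core through a
 * struct pci_error_handlers hung off the driver's struct pci_driver. The
 * field names below are the generic PCI ones; the actual instance lives in
 * the driver registration code, not here:
 *
 *	static const struct pci_error_handlers example_pci_err_handlers = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};
 */
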
bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}
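/*
 * Illustrative sketch (not part of the driver): the two helpers above form a
 * save/restore pair. The config space is cached while the device is known
 * good and loaded back ahead of error recovery, as amdgpu_pci_slot_reset()
 * does above (amdgpu_device_load_pci_state() already applies
 * pci_restore_state() internally). Hypothetical usage:
 *
 *	if (amdgpu_device_cache_pci_state(adev->pdev))
 *		dev_info(adev->dev, "PCI config space cached\n");
 *	// ... later, after the slot has been reset ...
 *	if (!amdgpu_device_load_pci_state(adev->pdev))
 *		dev_warn(adev->dev, "no cached PCI state to restore\n");
 */
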
void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. This helps preserve the error context when an error occurs.
 * Compared to a simple hang, the system remains stable, at least for SSH
 * access, so it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc.),
 *    clears all CPU mappings to the device, disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}
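/*
 * Illustrative sketch (not part of the driver): amdgpu_device_halt() is a
 * terminal operation. A hypothetical fatal-error path that wants to freeze
 * the hardware for post-mortem inspection, rather than attempt a reset,
 * could do:
 *
 *	dev_err(adev->dev, "unrecoverable error, halting device\n");
 *	amdgpu_device_halt(adev);
 *	// from here on the device must not be touched again
 */
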
u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!adev->ip_versions[DCE_HWIP][0] ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
		uint32_t inst, uint32_t reg_addr, char reg_name[],
		uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				  inst, reg_name, (uint32_t)expected_value,
				  (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}
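/*
 * Illustrative sketch (not part of the driver): amdgpu_device_wait_on_rreg()
 * polls a register until (value & mask) == expected_value, and warns and
 * returns -ETIMEDOUT once the value has stayed unchanged for
 * adev->usec_timeout polls. Hypothetical usage (EXAMPLE_STATUS_REG and
 * EXAMPLE_READY_BIT are made-up names):
 *
 *	if (amdgpu_device_wait_on_rreg(adev, 0, EXAMPLE_STATUS_REG, "STATUS",
 *				       EXAMPLE_READY_BIT, EXAMPLE_READY_BIT))
 *		DRM_WARN("engine did not report ready\n");
 */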