1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 #include <linux/iommu.h> 34 #include <linux/pci.h> 35 #include <linux/devcoredump.h> 36 #include <generated/utsrelease.h> 37 #include <linux/pci-p2pdma.h> 38 #include <linux/apple-gmux.h> 39 40 #include <drm/drm_aperture.h> 41 #include <drm/drm_atomic_helper.h> 42 #include <drm/drm_crtc_helper.h> 43 #include <drm/drm_fb_helper.h> 44 #include <drm/drm_probe_helper.h> 45 #include <drm/amdgpu_drm.h> 46 #include <linux/vgaarb.h> 47 #include <linux/vga_switcheroo.h> 48 #include <linux/efi.h> 49 #include "amdgpu.h" 50 #include "amdgpu_trace.h" 51 #include "amdgpu_i2c.h" 52 #include "atom.h" 53 #include "amdgpu_atombios.h" 54 #include "amdgpu_atomfirmware.h" 55 #include "amd_pcie.h" 56 #ifdef CONFIG_DRM_AMDGPU_SI 57 #include "si.h" 58 #endif 59 #ifdef CONFIG_DRM_AMDGPU_CIK 60 #include "cik.h" 61 #endif 62 #include "vi.h" 63 #include "soc15.h" 64 #include "nv.h" 65 #include "bif/bif_4_1_d.h" 66 #include <linux/firmware.h> 67 #include "amdgpu_vf_error.h" 68 69 #include "amdgpu_amdkfd.h" 70 #include "amdgpu_pm.h" 71 72 #include "amdgpu_xgmi.h" 73 #include "amdgpu_ras.h" 74 #include "amdgpu_pmu.h" 75 #include "amdgpu_fru_eeprom.h" 76 #include "amdgpu_reset.h" 77 78 #include <linux/suspend.h> 79 #include <drm/task_barrier.h> 80 #include <linux/pm_runtime.h> 81 82 #include <drm/drm_drv.h> 83 84 #if IS_ENABLED(CONFIG_X86) 85 #include <asm/intel-family.h> 86 #endif 87 88 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 89 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 90 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 91 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 95 96 #define AMDGPU_RESUME_MS 2000 97 #define AMDGPU_MAX_RETRY_LIMIT 2 98 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 99 100 static const struct drm_driver amdgpu_kms_driver; 101 102 const char *amdgpu_asic_name[] = { 103 "TAHITI", 104 "PITCAIRN", 105 "VERDE", 106 "OLAND", 107 "HAINAN", 108 "BONAIRE", 109 "KAVERI", 110 
"KABINI", 111 "HAWAII", 112 "MULLINS", 113 "TOPAZ", 114 "TONGA", 115 "FIJI", 116 "CARRIZO", 117 "STONEY", 118 "POLARIS10", 119 "POLARIS11", 120 "POLARIS12", 121 "VEGAM", 122 "VEGA10", 123 "VEGA12", 124 "VEGA20", 125 "RAVEN", 126 "ARCTURUS", 127 "RENOIR", 128 "ALDEBARAN", 129 "NAVI10", 130 "CYAN_SKILLFISH", 131 "NAVI14", 132 "NAVI12", 133 "SIENNA_CICHLID", 134 "NAVY_FLOUNDER", 135 "VANGOGH", 136 "DIMGREY_CAVEFISH", 137 "BEIGE_GOBY", 138 "YELLOW_CARP", 139 "IP DISCOVERY", 140 "LAST", 141 }; 142 143 /** 144 * DOC: pcie_replay_count 145 * 146 * The amdgpu driver provides a sysfs API for reporting the total number 147 * of PCIe replays (NAKs) 148 * The file pcie_replay_count is used for this and returns the total 149 * number of replays as a sum of the NAKs generated and NAKs received 150 */ 151 152 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 153 struct device_attribute *attr, char *buf) 154 { 155 struct drm_device *ddev = dev_get_drvdata(dev); 156 struct amdgpu_device *adev = drm_to_adev(ddev); 157 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 158 159 return sysfs_emit(buf, "%llu\n", cnt); 160 } 161 162 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 163 amdgpu_device_get_pcie_replay_count, NULL); 164 165 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 166 167 /** 168 * DOC: product_name 169 * 170 * The amdgpu driver provides a sysfs API for reporting the product name 171 * for the device 172 * The file product_name is used for this and returns the product name 173 * as returned from the FRU. 174 * NOTE: This is only available for certain server cards 175 */ 176 177 static ssize_t amdgpu_device_get_product_name(struct device *dev, 178 struct device_attribute *attr, char *buf) 179 { 180 struct drm_device *ddev = dev_get_drvdata(dev); 181 struct amdgpu_device *adev = drm_to_adev(ddev); 182 183 return sysfs_emit(buf, "%s\n", adev->product_name); 184 } 185 186 static DEVICE_ATTR(product_name, S_IRUGO, 187 amdgpu_device_get_product_name, NULL); 188 189 /** 190 * DOC: product_number 191 * 192 * The amdgpu driver provides a sysfs API for reporting the part number 193 * for the device 194 * The file product_number is used for this and returns the part number 195 * as returned from the FRU. 196 * NOTE: This is only available for certain server cards 197 */ 198 199 static ssize_t amdgpu_device_get_product_number(struct device *dev, 200 struct device_attribute *attr, char *buf) 201 { 202 struct drm_device *ddev = dev_get_drvdata(dev); 203 struct amdgpu_device *adev = drm_to_adev(ddev); 204 205 return sysfs_emit(buf, "%s\n", adev->product_number); 206 } 207 208 static DEVICE_ATTR(product_number, S_IRUGO, 209 amdgpu_device_get_product_number, NULL); 210 211 /** 212 * DOC: serial_number 213 * 214 * The amdgpu driver provides a sysfs API for reporting the serial number 215 * for the device 216 * The file serial_number is used for this and returns the serial number 217 * as returned from the FRU. 
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_serial_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->serial);
}

static DEVICE_ATTR(serial_number, S_IRUGO,
		amdgpu_device_get_serial_number, NULL);

/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise returns false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * Smart Shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}
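/*
 * Example (illustrative only, not part of the driver): a caller choosing a
 * runtime power-management strategy could branch on the helpers above,
 * roughly along these lines; the real policy lives elsewhere in the driver.
 *
 *	if (amdgpu_device_supports_px(ddev))
 *		; // ATPX: the platform powers the dGPU up and down
 *	else if (amdgpu_device_supports_boco(ddev))
 *		; // BOCO/PR3: ACPI power resources control the device
 *	else if (amdgpu_device_supports_baco(ddev))
 *		; // BACO: the chip powers off while the bus stays active
 */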
/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by the vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes that have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the VRAM aperture to access VRAM first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of VRAM */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
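 *
 * Example (illustrative) of the VRAM helpers above: reading one dword out
 * of VRAM at byte offset pos. amdgpu_device_vram_access() first tries the
 * CPU-visible aperture and falls back to the MM_INDEX/MM_DATA window:
 *
 *	u32 val;
 *
 *	amdgpu_device_vram_access(adev, pos, &val, sizeof(val), false);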
416 */ 417 418 /* Check if hw access should be skipped because of hotplug or device error */ 419 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev) 420 { 421 if (adev->no_hw_access) 422 return true; 423 424 #ifdef CONFIG_LOCKDEP 425 /* 426 * This is a bit complicated to understand, so worth a comment. What we assert 427 * here is that the GPU reset is not running on another thread in parallel. 428 * 429 * For this we trylock the read side of the reset semaphore, if that succeeds 430 * we know that the reset is not running in paralell. 431 * 432 * If the trylock fails we assert that we are either already holding the read 433 * side of the lock or are the reset thread itself and hold the write side of 434 * the lock. 435 */ 436 if (in_task()) { 437 if (down_read_trylock(&adev->reset_domain->sem)) 438 up_read(&adev->reset_domain->sem); 439 else 440 lockdep_assert_held(&adev->reset_domain->sem); 441 } 442 #endif 443 return false; 444 } 445 446 /** 447 * amdgpu_device_rreg - read a memory mapped IO or indirect register 448 * 449 * @adev: amdgpu_device pointer 450 * @reg: dword aligned register offset 451 * @acc_flags: access flags which require special behavior 452 * 453 * Returns the 32 bit value from the offset specified. 454 */ 455 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, 456 uint32_t reg, uint32_t acc_flags) 457 { 458 uint32_t ret; 459 460 if (amdgpu_device_skip_hw_access(adev)) 461 return 0; 462 463 if ((reg * 4) < adev->rmmio_size) { 464 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 465 amdgpu_sriov_runtime(adev) && 466 down_read_trylock(&adev->reset_domain->sem)) { 467 ret = amdgpu_kiq_rreg(adev, reg); 468 up_read(&adev->reset_domain->sem); 469 } else { 470 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 471 } 472 } else { 473 ret = adev->pcie_rreg(adev, reg * 4); 474 } 475 476 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret); 477 478 return ret; 479 } 480 481 /* 482 * MMIO register read with bytes helper functions 483 * @offset:bytes offset from MMIO start 484 * 485 */ 486 487 /** 488 * amdgpu_mm_rreg8 - read a memory mapped IO register 489 * 490 * @adev: amdgpu_device pointer 491 * @offset: byte aligned register offset 492 * 493 * Returns the 8 bit value from the offset specified. 494 */ 495 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) 496 { 497 if (amdgpu_device_skip_hw_access(adev)) 498 return 0; 499 500 if (offset < adev->rmmio_size) 501 return (readb(adev->rmmio + offset)); 502 BUG(); 503 } 504 505 /* 506 * MMIO register write with bytes helper functions 507 * @offset:bytes offset from MMIO start 508 * @value: the value want to be written to the register 509 * 510 */ 511 /** 512 * amdgpu_mm_wreg8 - read a memory mapped IO register 513 * 514 * @adev: amdgpu_device pointer 515 * @offset: byte aligned register offset 516 * @value: 8 bit value to write 517 * 518 * Writes the value specified to the offset specified. 519 */ 520 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 521 { 522 if (amdgpu_device_skip_hw_access(adev)) 523 return; 524 525 if (offset < adev->rmmio_size) 526 writeb(value, adev->rmmio + offset); 527 else 528 BUG(); 529 } 530 531 /** 532 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 533 * 534 * @adev: amdgpu_device pointer 535 * @reg: dword aligned register offset 536 * @v: 32 bit value to write to the register 537 * @acc_flags: access flags which require special behavior 538 * 539 * Writes the value specified to the offset specified. 
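 *
 * Example (illustrative): most callers do not use amdgpu_device_rreg()/
 * amdgpu_device_wreg() directly but go through wrapper macros such as
 * RREG32()/WREG32(), which pass acc_flags = 0, roughly:
 *
 *	u32 tmp = RREG32(reg);
 *
 *	WREG32(reg, tmp | BIT(0));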
540 */ 541 void amdgpu_device_wreg(struct amdgpu_device *adev, 542 uint32_t reg, uint32_t v, 543 uint32_t acc_flags) 544 { 545 if (amdgpu_device_skip_hw_access(adev)) 546 return; 547 548 if ((reg * 4) < adev->rmmio_size) { 549 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 550 amdgpu_sriov_runtime(adev) && 551 down_read_trylock(&adev->reset_domain->sem)) { 552 amdgpu_kiq_wreg(adev, reg, v); 553 up_read(&adev->reset_domain->sem); 554 } else { 555 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 556 } 557 } else { 558 adev->pcie_wreg(adev, reg * 4, v); 559 } 560 561 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 562 } 563 564 /** 565 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 566 * 567 * @adev: amdgpu_device pointer 568 * @reg: mmio/rlc register 569 * @v: value to write 570 * 571 * this function is invoked only for the debugfs register access 572 */ 573 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 574 uint32_t reg, uint32_t v) 575 { 576 if (amdgpu_device_skip_hw_access(adev)) 577 return; 578 579 if (amdgpu_sriov_fullaccess(adev) && 580 adev->gfx.rlc.funcs && 581 adev->gfx.rlc.funcs->is_rlcg_access_range) { 582 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 583 return amdgpu_sriov_wreg(adev, reg, v, 0, 0); 584 } else if ((reg * 4) >= adev->rmmio_size) { 585 adev->pcie_wreg(adev, reg * 4, v); 586 } else { 587 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 588 } 589 } 590 591 /** 592 * amdgpu_mm_rdoorbell - read a doorbell dword 593 * 594 * @adev: amdgpu_device pointer 595 * @index: doorbell index 596 * 597 * Returns the value in the doorbell aperture at the 598 * requested doorbell index (CIK). 599 */ 600 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) 601 { 602 if (amdgpu_device_skip_hw_access(adev)) 603 return 0; 604 605 if (index < adev->doorbell.num_kernel_doorbells) { 606 return readl(adev->doorbell.ptr + index); 607 } else { 608 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 609 return 0; 610 } 611 } 612 613 /** 614 * amdgpu_mm_wdoorbell - write a doorbell dword 615 * 616 * @adev: amdgpu_device pointer 617 * @index: doorbell index 618 * @v: value to write 619 * 620 * Writes @v to the doorbell aperture at the 621 * requested doorbell index (CIK). 622 */ 623 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) 624 { 625 if (amdgpu_device_skip_hw_access(adev)) 626 return; 627 628 if (index < adev->doorbell.num_kernel_doorbells) { 629 writel(v, adev->doorbell.ptr + index); 630 } else { 631 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 632 } 633 } 634 635 /** 636 * amdgpu_mm_rdoorbell64 - read a doorbell Qword 637 * 638 * @adev: amdgpu_device pointer 639 * @index: doorbell index 640 * 641 * Returns the value in the doorbell aperture at the 642 * requested doorbell index (VEGA10+). 643 */ 644 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) 645 { 646 if (amdgpu_device_skip_hw_access(adev)) 647 return 0; 648 649 if (index < adev->doorbell.num_kernel_doorbells) { 650 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); 651 } else { 652 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 653 return 0; 654 } 655 } 656 657 /** 658 * amdgpu_mm_wdoorbell64 - write a doorbell Qword 659 * 660 * @adev: amdgpu_device pointer 661 * @index: doorbell index 662 * @v: value to write 663 * 664 * Writes @v to the doorbell aperture at the 665 * requested doorbell index (VEGA10+). 
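 *
 * Example (illustrative): ring code usually rings a doorbell through
 * wrapper macros such as WDOORBELL64()/RDOORBELL64() rather than calling
 * this helper directly, roughly:
 *
 *	WDOORBELL64(ring->doorbell_index, ring->wptr);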
666 */ 667 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) 668 { 669 if (amdgpu_device_skip_hw_access(adev)) 670 return; 671 672 if (index < adev->doorbell.num_kernel_doorbells) { 673 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); 674 } else { 675 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 676 } 677 } 678 679 /** 680 * amdgpu_device_indirect_rreg - read an indirect register 681 * 682 * @adev: amdgpu_device pointer 683 * @reg_addr: indirect register address to read from 684 * 685 * Returns the value of indirect register @reg_addr 686 */ 687 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 688 u32 reg_addr) 689 { 690 unsigned long flags, pcie_index, pcie_data; 691 void __iomem *pcie_index_offset; 692 void __iomem *pcie_data_offset; 693 u32 r; 694 695 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 696 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 697 698 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 699 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 700 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 701 702 writel(reg_addr, pcie_index_offset); 703 readl(pcie_index_offset); 704 r = readl(pcie_data_offset); 705 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 706 707 return r; 708 } 709 710 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 711 u64 reg_addr) 712 { 713 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 714 u32 r; 715 void __iomem *pcie_index_offset; 716 void __iomem *pcie_index_hi_offset; 717 void __iomem *pcie_data_offset; 718 719 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 720 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 721 if (adev->nbio.funcs->get_pcie_index_hi_offset) 722 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 723 else 724 pcie_index_hi = 0; 725 726 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 727 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 728 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 729 if (pcie_index_hi != 0) 730 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 731 pcie_index_hi * 4; 732 733 writel(reg_addr, pcie_index_offset); 734 readl(pcie_index_offset); 735 if (pcie_index_hi != 0) { 736 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 737 readl(pcie_index_hi_offset); 738 } 739 r = readl(pcie_data_offset); 740 741 /* clear the high bits */ 742 if (pcie_index_hi != 0) { 743 writel(0, pcie_index_hi_offset); 744 readl(pcie_index_hi_offset); 745 } 746 747 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 748 749 return r; 750 } 751 752 /** 753 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 754 * 755 * @adev: amdgpu_device pointer 756 * @reg_addr: indirect register address to read from 757 * 758 * Returns the value of indirect register @reg_addr 759 */ 760 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 761 u32 reg_addr) 762 { 763 unsigned long flags, pcie_index, pcie_data; 764 void __iomem *pcie_index_offset; 765 void __iomem *pcie_data_offset; 766 u64 r; 767 768 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 769 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 770 771 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 772 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 773 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 774 775 /* read low 32 bits */ 776 writel(reg_addr, pcie_index_offset); 777 
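	/*
	 * Reading the index register back flushes the posted write above
	 * before the data port is accessed.
	 */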
readl(pcie_index_offset); 778 r = readl(pcie_data_offset); 779 /* read high 32 bits */ 780 writel(reg_addr + 4, pcie_index_offset); 781 readl(pcie_index_offset); 782 r |= ((u64)readl(pcie_data_offset) << 32); 783 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 784 785 return r; 786 } 787 788 /** 789 * amdgpu_device_indirect_wreg - write an indirect register address 790 * 791 * @adev: amdgpu_device pointer 792 * @pcie_index: mmio register offset 793 * @pcie_data: mmio register offset 794 * @reg_addr: indirect register offset 795 * @reg_data: indirect register data 796 * 797 */ 798 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 799 u32 reg_addr, u32 reg_data) 800 { 801 unsigned long flags, pcie_index, pcie_data; 802 void __iomem *pcie_index_offset; 803 void __iomem *pcie_data_offset; 804 805 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 806 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 807 808 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 809 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 810 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 811 812 writel(reg_addr, pcie_index_offset); 813 readl(pcie_index_offset); 814 writel(reg_data, pcie_data_offset); 815 readl(pcie_data_offset); 816 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 817 } 818 819 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 820 u64 reg_addr, u32 reg_data) 821 { 822 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 823 void __iomem *pcie_index_offset; 824 void __iomem *pcie_index_hi_offset; 825 void __iomem *pcie_data_offset; 826 827 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 828 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 829 if (adev->nbio.funcs->get_pcie_index_hi_offset) 830 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 831 else 832 pcie_index_hi = 0; 833 834 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 835 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 836 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 837 if (pcie_index_hi != 0) 838 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 839 pcie_index_hi * 4; 840 841 writel(reg_addr, pcie_index_offset); 842 readl(pcie_index_offset); 843 if (pcie_index_hi != 0) { 844 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 845 readl(pcie_index_hi_offset); 846 } 847 writel(reg_data, pcie_data_offset); 848 readl(pcie_data_offset); 849 850 /* clear the high bits */ 851 if (pcie_index_hi != 0) { 852 writel(0, pcie_index_hi_offset); 853 readl(pcie_index_hi_offset); 854 } 855 856 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 857 } 858 859 /** 860 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 861 * 862 * @adev: amdgpu_device pointer 863 * @pcie_index: mmio register offset 864 * @pcie_data: mmio register offset 865 * @reg_addr: indirect register offset 866 * @reg_data: indirect register data 867 * 868 */ 869 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 870 u32 reg_addr, u64 reg_data) 871 { 872 unsigned long flags, pcie_index, pcie_data; 873 void __iomem *pcie_index_offset; 874 void __iomem *pcie_data_offset; 875 876 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 877 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 878 879 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 880 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 881 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 882 
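	/*
	 * The 64-bit value is written as two 32-bit index/data transactions;
	 * pcie_idx_lock is held so the low and high halves stay paired.
	 */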
883 /* write low 32 bits */ 884 writel(reg_addr, pcie_index_offset); 885 readl(pcie_index_offset); 886 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 887 readl(pcie_data_offset); 888 /* write high 32 bits */ 889 writel(reg_addr + 4, pcie_index_offset); 890 readl(pcie_index_offset); 891 writel((u32)(reg_data >> 32), pcie_data_offset); 892 readl(pcie_data_offset); 893 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 894 } 895 896 /** 897 * amdgpu_device_get_rev_id - query device rev_id 898 * 899 * @adev: amdgpu_device pointer 900 * 901 * Return device rev_id 902 */ 903 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 904 { 905 return adev->nbio.funcs->get_rev_id(adev); 906 } 907 908 /** 909 * amdgpu_invalid_rreg - dummy reg read function 910 * 911 * @adev: amdgpu_device pointer 912 * @reg: offset of register 913 * 914 * Dummy register read function. Used for register blocks 915 * that certain asics don't have (all asics). 916 * Returns the value in the register. 917 */ 918 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 919 { 920 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 921 BUG(); 922 return 0; 923 } 924 925 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) 926 { 927 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 928 BUG(); 929 return 0; 930 } 931 932 /** 933 * amdgpu_invalid_wreg - dummy reg write function 934 * 935 * @adev: amdgpu_device pointer 936 * @reg: offset of register 937 * @v: value to write to the register 938 * 939 * Dummy register read function. Used for register blocks 940 * that certain asics don't have (all asics). 941 */ 942 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 943 { 944 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 945 reg, v); 946 BUG(); 947 } 948 949 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) 950 { 951 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n", 952 reg, v); 953 BUG(); 954 } 955 956 /** 957 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 958 * 959 * @adev: amdgpu_device pointer 960 * @reg: offset of register 961 * 962 * Dummy register read function. Used for register blocks 963 * that certain asics don't have (all asics). 964 * Returns the value in the register. 965 */ 966 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 967 { 968 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 969 BUG(); 970 return 0; 971 } 972 973 /** 974 * amdgpu_invalid_wreg64 - dummy reg write function 975 * 976 * @adev: amdgpu_device pointer 977 * @reg: offset of register 978 * @v: value to write to the register 979 * 980 * Dummy register read function. Used for register blocks 981 * that certain asics don't have (all asics). 982 */ 983 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 984 { 985 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 986 reg, v); 987 BUG(); 988 } 989 990 /** 991 * amdgpu_block_invalid_rreg - dummy reg read function 992 * 993 * @adev: amdgpu_device pointer 994 * @block: offset of instance 995 * @reg: offset of register 996 * 997 * Dummy register read function. Used for register blocks 998 * that certain asics don't have (all asics). 999 * Returns the value in the register. 
1000 */ 1001 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 1002 uint32_t block, uint32_t reg) 1003 { 1004 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 1005 reg, block); 1006 BUG(); 1007 return 0; 1008 } 1009 1010 /** 1011 * amdgpu_block_invalid_wreg - dummy reg write function 1012 * 1013 * @adev: amdgpu_device pointer 1014 * @block: offset of instance 1015 * @reg: offset of register 1016 * @v: value to write to the register 1017 * 1018 * Dummy register read function. Used for register blocks 1019 * that certain asics don't have (all asics). 1020 */ 1021 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 1022 uint32_t block, 1023 uint32_t reg, uint32_t v) 1024 { 1025 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 1026 reg, block, v); 1027 BUG(); 1028 } 1029 1030 /** 1031 * amdgpu_device_asic_init - Wrapper for atom asic_init 1032 * 1033 * @adev: amdgpu_device pointer 1034 * 1035 * Does any asic specific work and then calls atom asic init. 1036 */ 1037 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 1038 { 1039 amdgpu_asic_pre_asic_init(adev); 1040 1041 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) || 1042 adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) 1043 return amdgpu_atomfirmware_asic_init(adev, true); 1044 else 1045 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 1046 } 1047 1048 /** 1049 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page 1050 * 1051 * @adev: amdgpu_device pointer 1052 * 1053 * Allocates a scratch page of VRAM for use by various things in the 1054 * driver. 1055 */ 1056 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev) 1057 { 1058 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE, 1059 AMDGPU_GEM_DOMAIN_VRAM | 1060 AMDGPU_GEM_DOMAIN_GTT, 1061 &adev->mem_scratch.robj, 1062 &adev->mem_scratch.gpu_addr, 1063 (void **)&adev->mem_scratch.ptr); 1064 } 1065 1066 /** 1067 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page 1068 * 1069 * @adev: amdgpu_device pointer 1070 * 1071 * Frees the VRAM scratch page. 1072 */ 1073 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev) 1074 { 1075 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL); 1076 } 1077 1078 /** 1079 * amdgpu_device_program_register_sequence - program an array of registers. 1080 * 1081 * @adev: amdgpu_device pointer 1082 * @registers: pointer to the register array 1083 * @array_size: size of the register array 1084 * 1085 * Programs an array or registers with and and or masks. 1086 * This is a helper for setting golden registers. 1087 */ 1088 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 1089 const u32 *registers, 1090 const u32 array_size) 1091 { 1092 u32 tmp, reg, and_mask, or_mask; 1093 int i; 1094 1095 if (array_size % 3) 1096 return; 1097 1098 for (i = 0; i < array_size; i += 3) { 1099 reg = registers[i + 0]; 1100 and_mask = registers[i + 1]; 1101 or_mask = registers[i + 2]; 1102 1103 if (and_mask == 0xffffffff) { 1104 tmp = or_mask; 1105 } else { 1106 tmp = RREG32(reg); 1107 tmp &= ~and_mask; 1108 if (adev->family >= AMDGPU_FAMILY_AI) 1109 tmp |= (or_mask & and_mask); 1110 else 1111 tmp |= or_mask; 1112 } 1113 WREG32(reg, tmp); 1114 } 1115 } 1116 1117 /** 1118 * amdgpu_device_pci_config_reset - reset the GPU 1119 * 1120 * @adev: amdgpu_device pointer 1121 * 1122 * Resets the GPU using the pci config reset sequence. 
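 *
 * Example (illustrative) for amdgpu_device_program_register_sequence()
 * above: a golden-register table is a flat array of {offset, and_mask,
 * or_mask} triples (the offset below is made up):
 *
 *	static const u32 golden_settings[] = {
 *		0x1234, 0xffffffff, 0x00000001,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, golden_settings,
 *						ARRAY_SIZE(golden_settings));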
1123 * Only applicable to asics prior to vega10. 1124 */ 1125 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 1126 { 1127 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 1128 } 1129 1130 /** 1131 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 1132 * 1133 * @adev: amdgpu_device pointer 1134 * 1135 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 1136 */ 1137 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1138 { 1139 return pci_reset_function(adev->pdev); 1140 } 1141 1142 /* 1143 * GPU doorbell aperture helpers function. 1144 */ 1145 /** 1146 * amdgpu_device_doorbell_init - Init doorbell driver information. 1147 * 1148 * @adev: amdgpu_device pointer 1149 * 1150 * Init doorbell driver information (CIK) 1151 * Returns 0 on success, error on failure. 1152 */ 1153 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) 1154 { 1155 1156 /* No doorbell on SI hardware generation */ 1157 if (adev->asic_type < CHIP_BONAIRE) { 1158 adev->doorbell.base = 0; 1159 adev->doorbell.size = 0; 1160 adev->doorbell.num_kernel_doorbells = 0; 1161 adev->doorbell.ptr = NULL; 1162 return 0; 1163 } 1164 1165 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) 1166 return -EINVAL; 1167 1168 amdgpu_asic_init_doorbell_index(adev); 1169 1170 /* doorbell bar mapping */ 1171 adev->doorbell.base = pci_resource_start(adev->pdev, 2); 1172 adev->doorbell.size = pci_resource_len(adev->pdev, 2); 1173 1174 if (adev->enable_mes) { 1175 adev->doorbell.num_kernel_doorbells = 1176 adev->doorbell.size / sizeof(u32); 1177 } else { 1178 adev->doorbell.num_kernel_doorbells = 1179 min_t(u32, adev->doorbell.size / sizeof(u32), 1180 adev->doorbell_index.max_assignment+1); 1181 if (adev->doorbell.num_kernel_doorbells == 0) 1182 return -EINVAL; 1183 1184 /* For Vega, reserve and map two pages on doorbell BAR since SDMA 1185 * paging queue doorbell use the second page. The 1186 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the 1187 * doorbells are in the first page. So with paging queue enabled, 1188 * the max num_kernel_doorbells should + 1 page (0x400 in dword) 1189 */ 1190 if (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(4, 0, 0) && 1191 adev->ip_versions[SDMA0_HWIP][0] < IP_VERSION(4, 2, 0)) 1192 adev->doorbell.num_kernel_doorbells += 0x400; 1193 } 1194 1195 adev->doorbell.ptr = ioremap(adev->doorbell.base, 1196 adev->doorbell.num_kernel_doorbells * 1197 sizeof(u32)); 1198 if (adev->doorbell.ptr == NULL) 1199 return -ENOMEM; 1200 1201 return 0; 1202 } 1203 1204 /** 1205 * amdgpu_device_doorbell_fini - Tear down doorbell driver information. 1206 * 1207 * @adev: amdgpu_device pointer 1208 * 1209 * Tear down doorbell driver information (CIK) 1210 */ 1211 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) 1212 { 1213 iounmap(adev->doorbell.ptr); 1214 adev->doorbell.ptr = NULL; 1215 } 1216 1217 1218 1219 /* 1220 * amdgpu_device_wb_*() 1221 * Writeback is the method by which the GPU updates special pages in memory 1222 * with the status of certain GPU events (fences, ring pointers,etc.). 1223 */ 1224 1225 /** 1226 * amdgpu_device_wb_fini - Disable Writeback and free memory 1227 * 1228 * @adev: amdgpu_device pointer 1229 * 1230 * Disables Writeback and frees the Writeback memory (all asics). 1231 * Used at driver shutdown. 
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
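 *
 * Example (illustrative) of the writeback helpers above: a ring that needs
 * a CPU-visible status dword might do roughly the following; the returned
 * index is a dword offset into the shared writeback buffer:
 *
 *	u32 offs;
 *
 *	if (!amdgpu_device_wb_get(adev, &offs)) {
 *		u64 gpu_addr = adev->wb.gpu_addr + offs * 4;
 *		u32 val = adev->wb.wb[offs];
 *		...
 *		amdgpu_device_wb_free(adev, offs);
 *	}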
1322 */ 1323 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1324 { 1325 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1326 struct pci_bus *root; 1327 struct resource *res; 1328 unsigned i; 1329 u16 cmd; 1330 int r; 1331 1332 /* Bypass for VF */ 1333 if (amdgpu_sriov_vf(adev)) 1334 return 0; 1335 1336 /* skip if the bios has already enabled large BAR */ 1337 if (adev->gmc.real_vram_size && 1338 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1339 return 0; 1340 1341 /* Check if the root BUS has 64bit memory resources */ 1342 root = adev->pdev->bus; 1343 while (root->parent) 1344 root = root->parent; 1345 1346 pci_bus_for_each_resource(root, res, i) { 1347 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1348 res->start > 0x100000000ull) 1349 break; 1350 } 1351 1352 /* Trying to resize is pointless without a root hub window above 4GB */ 1353 if (!res) 1354 return 0; 1355 1356 /* Limit the BAR size to what is available */ 1357 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1358 rbar_size); 1359 1360 /* Disable memory decoding while we change the BAR addresses and size */ 1361 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1362 pci_write_config_word(adev->pdev, PCI_COMMAND, 1363 cmd & ~PCI_COMMAND_MEMORY); 1364 1365 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1366 amdgpu_device_doorbell_fini(adev); 1367 if (adev->asic_type >= CHIP_BONAIRE) 1368 pci_release_resource(adev->pdev, 2); 1369 1370 pci_release_resource(adev->pdev, 0); 1371 1372 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1373 if (r == -ENOSPC) 1374 DRM_INFO("Not enough PCI address space for a large BAR."); 1375 else if (r && r != -ENOTSUPP) 1376 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1377 1378 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1379 1380 /* When the doorbell or fb BAR isn't available we have no chance of 1381 * using the device. 1382 */ 1383 r = amdgpu_device_doorbell_init(adev); 1384 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1385 return -ENODEV; 1386 1387 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1388 1389 return 0; 1390 } 1391 1392 /* 1393 * GPU helpers function. 1394 */ 1395 /** 1396 * amdgpu_device_need_post - check if the hw need post or not 1397 * 1398 * @adev: amdgpu_device pointer 1399 * 1400 * Check if the asic has been initialized (all asics) at driver startup 1401 * or post is needed if hw reset is performed. 1402 * Returns true if need or false if not. 
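 *
 * Example (illustrative): early init code typically pairs this check with
 * the atom post, roughly:
 *
 *	if (amdgpu_device_need_post(adev))
 *		r = amdgpu_device_asic_init(adev);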
1403 */ 1404 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1405 { 1406 uint32_t reg; 1407 1408 if (amdgpu_sriov_vf(adev)) 1409 return false; 1410 1411 if (amdgpu_passthrough(adev)) { 1412 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1413 * some old smc fw still need driver do vPost otherwise gpu hang, while 1414 * those smc fw version above 22.15 doesn't have this flaw, so we force 1415 * vpost executed for smc version below 22.15 1416 */ 1417 if (adev->asic_type == CHIP_FIJI) { 1418 int err; 1419 uint32_t fw_ver; 1420 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1421 /* force vPost if error occured */ 1422 if (err) 1423 return true; 1424 1425 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1426 if (fw_ver < 0x00160e00) 1427 return true; 1428 } 1429 } 1430 1431 /* Don't post if we need to reset whole hive on init */ 1432 if (adev->gmc.xgmi.pending_reset) 1433 return false; 1434 1435 if (adev->has_hw_reset) { 1436 adev->has_hw_reset = false; 1437 return true; 1438 } 1439 1440 /* bios scratch used on CIK+ */ 1441 if (adev->asic_type >= CHIP_BONAIRE) 1442 return amdgpu_atombios_scratch_need_asic_init(adev); 1443 1444 /* check MEM_SIZE for older asics */ 1445 reg = amdgpu_asic_get_config_memsize(adev); 1446 1447 if ((reg != 0) && (reg != 0xffffffff)) 1448 return false; 1449 1450 return true; 1451 } 1452 1453 /** 1454 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1455 * 1456 * @adev: amdgpu_device pointer 1457 * 1458 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1459 * be set for this device. 1460 * 1461 * Returns true if it should be used or false if not. 1462 */ 1463 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1464 { 1465 switch (amdgpu_aspm) { 1466 case -1: 1467 break; 1468 case 0: 1469 return false; 1470 case 1: 1471 return true; 1472 default: 1473 return false; 1474 } 1475 return pcie_aspm_enabled(adev->pdev); 1476 } 1477 1478 bool amdgpu_device_aspm_support_quirk(void) 1479 { 1480 #if IS_ENABLED(CONFIG_X86) 1481 struct cpuinfo_x86 *c = &cpu_data(0); 1482 1483 return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE); 1484 #else 1485 return true; 1486 #endif 1487 } 1488 1489 /* if we get transitioned to only one device, take VGA back */ 1490 /** 1491 * amdgpu_device_vga_set_decode - enable/disable vga decode 1492 * 1493 * @pdev: PCI device pointer 1494 * @state: enable/disable vga decode 1495 * 1496 * Enable/disable vga decode (all asics). 1497 * Returns VGA resource flags. 1498 */ 1499 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1500 bool state) 1501 { 1502 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1503 amdgpu_asic_set_vga_state(adev, state); 1504 if (state) 1505 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1506 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1507 else 1508 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1509 } 1510 1511 /** 1512 * amdgpu_device_check_block_size - validate the vm block size 1513 * 1514 * @adev: amdgpu_device pointer 1515 * 1516 * Validates the vm block size specified via module parameter. 1517 * The vm block size defines number of bits in page table versus page directory, 1518 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1519 * page table and the remaining bits are in the page directory. 
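 *
 * Example: with 4KB pages there are 12 offset bits, so the minimum block
 * size of 9 makes each page-table block cover 2^(9+12) bytes = 2MB of
 * address space; the remaining address bits are resolved by the page
 * directory.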
1520 */ 1521 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1522 { 1523 /* defines number of bits in page table versus page directory, 1524 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1525 * page table and the remaining bits are in the page directory */ 1526 if (amdgpu_vm_block_size == -1) 1527 return; 1528 1529 if (amdgpu_vm_block_size < 9) { 1530 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1531 amdgpu_vm_block_size); 1532 amdgpu_vm_block_size = -1; 1533 } 1534 } 1535 1536 /** 1537 * amdgpu_device_check_vm_size - validate the vm size 1538 * 1539 * @adev: amdgpu_device pointer 1540 * 1541 * Validates the vm size in GB specified via module parameter. 1542 * The VM size is the size of the GPU virtual memory space in GB. 1543 */ 1544 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1545 { 1546 /* no need to check the default value */ 1547 if (amdgpu_vm_size == -1) 1548 return; 1549 1550 if (amdgpu_vm_size < 1) { 1551 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1552 amdgpu_vm_size); 1553 amdgpu_vm_size = -1; 1554 } 1555 } 1556 1557 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1558 { 1559 struct sysinfo si; 1560 bool is_os_64 = (sizeof(void *) == 8); 1561 uint64_t total_memory; 1562 uint64_t dram_size_seven_GB = 0x1B8000000; 1563 uint64_t dram_size_three_GB = 0xB8000000; 1564 1565 if (amdgpu_smu_memory_pool_size == 0) 1566 return; 1567 1568 if (!is_os_64) { 1569 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1570 goto def_value; 1571 } 1572 si_meminfo(&si); 1573 total_memory = (uint64_t)si.totalram * si.mem_unit; 1574 1575 if ((amdgpu_smu_memory_pool_size == 1) || 1576 (amdgpu_smu_memory_pool_size == 2)) { 1577 if (total_memory < dram_size_three_GB) 1578 goto def_value1; 1579 } else if ((amdgpu_smu_memory_pool_size == 4) || 1580 (amdgpu_smu_memory_pool_size == 8)) { 1581 if (total_memory < dram_size_seven_GB) 1582 goto def_value1; 1583 } else { 1584 DRM_WARN("Smu memory pool size not supported\n"); 1585 goto def_value; 1586 } 1587 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1588 1589 return; 1590 1591 def_value1: 1592 DRM_WARN("No enough system memory\n"); 1593 def_value: 1594 adev->pm.smu_prv_buffer_size = 0; 1595 } 1596 1597 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1598 { 1599 if (!(adev->flags & AMD_IS_APU) || 1600 adev->asic_type < CHIP_RAVEN) 1601 return 0; 1602 1603 switch (adev->asic_type) { 1604 case CHIP_RAVEN: 1605 if (adev->pdev->device == 0x15dd) 1606 adev->apu_flags |= AMD_APU_IS_RAVEN; 1607 if (adev->pdev->device == 0x15d8) 1608 adev->apu_flags |= AMD_APU_IS_PICASSO; 1609 break; 1610 case CHIP_RENOIR: 1611 if ((adev->pdev->device == 0x1636) || 1612 (adev->pdev->device == 0x164c)) 1613 adev->apu_flags |= AMD_APU_IS_RENOIR; 1614 else 1615 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1616 break; 1617 case CHIP_VANGOGH: 1618 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1619 break; 1620 case CHIP_YELLOW_CARP: 1621 break; 1622 case CHIP_CYAN_SKILLFISH: 1623 if ((adev->pdev->device == 0x13FE) || 1624 (adev->pdev->device == 0x143F)) 1625 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1626 break; 1627 default: 1628 break; 1629 } 1630 1631 return 0; 1632 } 1633 1634 /** 1635 * amdgpu_device_check_arguments - validate module params 1636 * 1637 * @adev: amdgpu_device pointer 1638 * 1639 * Validates certain module parameters and updates 1640 * the associated values used by the driver (all asics). 
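 *
 * Example: invalid values are clamped or reset rather than failing probe,
 * e.g. amdgpu_sched_jobs=5 is rounded up to the next power of two (8), and
 * a gart size below 32MB falls back to the default (-1).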
1641 */ 1642 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1643 { 1644 if (amdgpu_sched_jobs < 4) { 1645 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1646 amdgpu_sched_jobs); 1647 amdgpu_sched_jobs = 4; 1648 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 1649 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1650 amdgpu_sched_jobs); 1651 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1652 } 1653 1654 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1655 /* gart size must be greater or equal to 32M */ 1656 dev_warn(adev->dev, "gart size (%d) too small\n", 1657 amdgpu_gart_size); 1658 amdgpu_gart_size = -1; 1659 } 1660 1661 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1662 /* gtt size must be greater or equal to 32M */ 1663 dev_warn(adev->dev, "gtt size (%d) too small\n", 1664 amdgpu_gtt_size); 1665 amdgpu_gtt_size = -1; 1666 } 1667 1668 /* valid range is between 4 and 9 inclusive */ 1669 if (amdgpu_vm_fragment_size != -1 && 1670 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1671 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1672 amdgpu_vm_fragment_size = -1; 1673 } 1674 1675 if (amdgpu_sched_hw_submission < 2) { 1676 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1677 amdgpu_sched_hw_submission); 1678 amdgpu_sched_hw_submission = 2; 1679 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1680 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1681 amdgpu_sched_hw_submission); 1682 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1683 } 1684 1685 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 1686 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 1687 amdgpu_reset_method = -1; 1688 } 1689 1690 amdgpu_device_check_smu_prv_buffer_size(adev); 1691 1692 amdgpu_device_check_vm_size(adev); 1693 1694 amdgpu_device_check_block_size(adev); 1695 1696 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1697 1698 return 0; 1699 } 1700 1701 /** 1702 * amdgpu_switcheroo_set_state - set switcheroo state 1703 * 1704 * @pdev: pci dev pointer 1705 * @state: vga_switcheroo state 1706 * 1707 * Callback for the switcheroo driver. Suspends or resumes 1708 * the asics before or after it is powered up using ACPI methods. 
1709 */ 1710 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1711 enum vga_switcheroo_state state) 1712 { 1713 struct drm_device *dev = pci_get_drvdata(pdev); 1714 int r; 1715 1716 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1717 return; 1718 1719 if (state == VGA_SWITCHEROO_ON) { 1720 pr_info("switched on\n"); 1721 /* don't suspend or resume card normally */ 1722 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1723 1724 pci_set_power_state(pdev, PCI_D0); 1725 amdgpu_device_load_pci_state(pdev); 1726 r = pci_enable_device(pdev); 1727 if (r) 1728 DRM_WARN("pci_enable_device failed (%d)\n", r); 1729 amdgpu_device_resume(dev, true); 1730 1731 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1732 } else { 1733 pr_info("switched off\n"); 1734 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1735 amdgpu_device_suspend(dev, true); 1736 amdgpu_device_cache_pci_state(pdev); 1737 /* Shut down the device */ 1738 pci_disable_device(pdev); 1739 pci_set_power_state(pdev, PCI_D3cold); 1740 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1741 } 1742 } 1743 1744 /** 1745 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1746 * 1747 * @pdev: pci dev pointer 1748 * 1749 * Callback for the switcheroo driver. Check of the switcheroo 1750 * state can be changed. 1751 * Returns true if the state can be changed, false if not. 1752 */ 1753 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1754 { 1755 struct drm_device *dev = pci_get_drvdata(pdev); 1756 1757 /* 1758 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1759 * locking inversion with the driver load path. And the access here is 1760 * completely racy anyway. So don't bother with locking for now. 1761 */ 1762 return atomic_read(&dev->open_count) == 0; 1763 } 1764 1765 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1766 .set_gpu_state = amdgpu_switcheroo_set_state, 1767 .reprobe = NULL, 1768 .can_switch = amdgpu_switcheroo_can_switch, 1769 }; 1770 1771 /** 1772 * amdgpu_device_ip_set_clockgating_state - set the CG state 1773 * 1774 * @dev: amdgpu_device pointer 1775 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1776 * @state: clockgating state (gate or ungate) 1777 * 1778 * Sets the requested clockgating state for all instances of 1779 * the hardware IP specified. 1780 * Returns the error code from the last instance. 1781 */ 1782 int amdgpu_device_ip_set_clockgating_state(void *dev, 1783 enum amd_ip_block_type block_type, 1784 enum amd_clockgating_state state) 1785 { 1786 struct amdgpu_device *adev = dev; 1787 int i, r = 0; 1788 1789 for (i = 0; i < adev->num_ip_blocks; i++) { 1790 if (!adev->ip_blocks[i].status.valid) 1791 continue; 1792 if (adev->ip_blocks[i].version->type != block_type) 1793 continue; 1794 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1795 continue; 1796 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1797 (void *)adev, state); 1798 if (r) 1799 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1800 adev->ip_blocks[i].version->funcs->name, r); 1801 } 1802 return r; 1803 } 1804 1805 /** 1806 * amdgpu_device_ip_set_powergating_state - set the PG state 1807 * 1808 * @dev: amdgpu_device pointer 1809 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1810 * @state: powergating state (gate or ungate) 1811 * 1812 * Sets the requested powergating state for all instances of 1813 * the hardware IP specified. 1814 * Returns the error code from the last instance. 
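 *
 * Example (illustrative): IP code gates a block with the clockgating and
 * powergating setters, roughly:
 *
 *	amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *					       AMD_CG_STATE_GATE);
 *	amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *					       AMD_PG_STATE_GATE);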
1815 */ 1816 int amdgpu_device_ip_set_powergating_state(void *dev, 1817 enum amd_ip_block_type block_type, 1818 enum amd_powergating_state state) 1819 { 1820 struct amdgpu_device *adev = dev; 1821 int i, r = 0; 1822 1823 for (i = 0; i < adev->num_ip_blocks; i++) { 1824 if (!adev->ip_blocks[i].status.valid) 1825 continue; 1826 if (adev->ip_blocks[i].version->type != block_type) 1827 continue; 1828 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 1829 continue; 1830 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 1831 (void *)adev, state); 1832 if (r) 1833 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 1834 adev->ip_blocks[i].version->funcs->name, r); 1835 } 1836 return r; 1837 } 1838 1839 /** 1840 * amdgpu_device_ip_get_clockgating_state - get the CG state 1841 * 1842 * @adev: amdgpu_device pointer 1843 * @flags: clockgating feature flags 1844 * 1845 * Walks the list of IPs on the device and updates the clockgating 1846 * flags for each IP. 1847 * Updates @flags with the feature flags for each hardware IP where 1848 * clockgating is enabled. 1849 */ 1850 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 1851 u64 *flags) 1852 { 1853 int i; 1854 1855 for (i = 0; i < adev->num_ip_blocks; i++) { 1856 if (!adev->ip_blocks[i].status.valid) 1857 continue; 1858 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 1859 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 1860 } 1861 } 1862 1863 /** 1864 * amdgpu_device_ip_wait_for_idle - wait for idle 1865 * 1866 * @adev: amdgpu_device pointer 1867 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1868 * 1869 * Waits for the request hardware IP to be idle. 1870 * Returns 0 for success or a negative error code on failure. 1871 */ 1872 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 1873 enum amd_ip_block_type block_type) 1874 { 1875 int i, r; 1876 1877 for (i = 0; i < adev->num_ip_blocks; i++) { 1878 if (!adev->ip_blocks[i].status.valid) 1879 continue; 1880 if (adev->ip_blocks[i].version->type == block_type) { 1881 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 1882 if (r) 1883 return r; 1884 break; 1885 } 1886 } 1887 return 0; 1888 1889 } 1890 1891 /** 1892 * amdgpu_device_ip_is_idle - is the hardware IP idle 1893 * 1894 * @adev: amdgpu_device pointer 1895 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1896 * 1897 * Check if the hardware IP is idle or not. 1898 * Returns true if it the IP is idle, false if not. 1899 */ 1900 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 1901 enum amd_ip_block_type block_type) 1902 { 1903 int i; 1904 1905 for (i = 0; i < adev->num_ip_blocks; i++) { 1906 if (!adev->ip_blocks[i].status.valid) 1907 continue; 1908 if (adev->ip_blocks[i].version->type == block_type) 1909 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 1910 } 1911 return true; 1912 1913 } 1914 1915 /** 1916 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 1917 * 1918 * @adev: amdgpu_device pointer 1919 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 1920 * 1921 * Returns a pointer to the hardware IP block structure 1922 * if it exists for the asic, otherwise NULL. 
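 *
 * Example (illustrative):
 *
 *	struct amdgpu_ip_block *ip =
 *		amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *
 *	if (ip && ip->version->major >= 10)
 *		...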
1923 */ 1924 struct amdgpu_ip_block * 1925 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 1926 enum amd_ip_block_type type) 1927 { 1928 int i; 1929 1930 for (i = 0; i < adev->num_ip_blocks; i++) 1931 if (adev->ip_blocks[i].version->type == type) 1932 return &adev->ip_blocks[i]; 1933 1934 return NULL; 1935 } 1936 1937 /** 1938 * amdgpu_device_ip_block_version_cmp 1939 * 1940 * @adev: amdgpu_device pointer 1941 * @type: enum amd_ip_block_type 1942 * @major: major version 1943 * @minor: minor version 1944 * 1945 * return 0 if equal or greater 1946 * return 1 if smaller or the ip_block doesn't exist 1947 */ 1948 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 1949 enum amd_ip_block_type type, 1950 u32 major, u32 minor) 1951 { 1952 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 1953 1954 if (ip_block && ((ip_block->version->major > major) || 1955 ((ip_block->version->major == major) && 1956 (ip_block->version->minor >= minor)))) 1957 return 0; 1958 1959 return 1; 1960 } 1961 1962 /** 1963 * amdgpu_device_ip_block_add 1964 * 1965 * @adev: amdgpu_device pointer 1966 * @ip_block_version: pointer to the IP to add 1967 * 1968 * Adds the IP block driver information to the collection of IPs 1969 * on the asic. 1970 */ 1971 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 1972 const struct amdgpu_ip_block_version *ip_block_version) 1973 { 1974 if (!ip_block_version) 1975 return -EINVAL; 1976 1977 switch (ip_block_version->type) { 1978 case AMD_IP_BLOCK_TYPE_VCN: 1979 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 1980 return 0; 1981 break; 1982 case AMD_IP_BLOCK_TYPE_JPEG: 1983 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 1984 return 0; 1985 break; 1986 default: 1987 break; 1988 } 1989 1990 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 1991 ip_block_version->funcs->name); 1992 1993 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 1994 1995 return 0; 1996 } 1997 1998 /** 1999 * amdgpu_device_enable_virtual_display - enable virtual display feature 2000 * 2001 * @adev: amdgpu_device pointer 2002 * 2003 * Enabled the virtual display feature if the user has enabled it via 2004 * the module parameter virtual_display. This feature provides a virtual 2005 * display hardware on headless boards or in virtualized environments. 2006 * This function parses and validates the configuration string specified by 2007 * the user and configues the virtual display configuration (number of 2008 * virtual connectors, crtcs, etc.) specified. 
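 *
 * The accepted string format follows from the parser below: entries are
 * separated by ';', and each entry is a PCI address (or the keyword "all")
 * optionally followed by ',' and a CRTC count that is clamped to the
 * range 1-6. For example (addresses are placeholders):
 *
 *   amdgpu.virtual_display=0000:04:00.0,2
 *   amdgpu.virtual_display=all,1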
2009 */ 2010 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2011 { 2012 adev->enable_virtual_display = false; 2013 2014 if (amdgpu_virtual_display) { 2015 const char *pci_address_name = pci_name(adev->pdev); 2016 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2017 2018 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2019 pciaddstr_tmp = pciaddstr; 2020 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2021 pciaddname = strsep(&pciaddname_tmp, ","); 2022 if (!strcmp("all", pciaddname) 2023 || !strcmp(pci_address_name, pciaddname)) { 2024 long num_crtc; 2025 int res = -1; 2026 2027 adev->enable_virtual_display = true; 2028 2029 if (pciaddname_tmp) 2030 res = kstrtol(pciaddname_tmp, 10, 2031 &num_crtc); 2032 2033 if (!res) { 2034 if (num_crtc < 1) 2035 num_crtc = 1; 2036 if (num_crtc > 6) 2037 num_crtc = 6; 2038 adev->mode_info.num_crtc = num_crtc; 2039 } else { 2040 adev->mode_info.num_crtc = 1; 2041 } 2042 break; 2043 } 2044 } 2045 2046 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2047 amdgpu_virtual_display, pci_address_name, 2048 adev->enable_virtual_display, adev->mode_info.num_crtc); 2049 2050 kfree(pciaddstr); 2051 } 2052 } 2053 2054 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2055 { 2056 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2057 adev->mode_info.num_crtc = 1; 2058 adev->enable_virtual_display = true; 2059 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2060 adev->enable_virtual_display, adev->mode_info.num_crtc); 2061 } 2062 } 2063 2064 /** 2065 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2066 * 2067 * @adev: amdgpu_device pointer 2068 * 2069 * Parses the asic configuration parameters specified in the gpu info 2070 * firmware and makes them availale to the driver for use in configuring 2071 * the asic. 2072 * Returns 0 on success, -EINVAL on failure. 2073 */ 2074 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2075 { 2076 const char *chip_name; 2077 char fw_name[40]; 2078 int err; 2079 const struct gpu_info_firmware_header_v1_0 *hdr; 2080 2081 adev->firmware.gpu_info_fw = NULL; 2082 2083 if (adev->mman.discovery_bin) { 2084 /* 2085 * FIXME: The bounding box is still needed by Navi12, so 2086 * temporarily read it from gpu_info firmware. Should be dropped 2087 * when DAL no longer needs it. 
2088 */ 2089 if (adev->asic_type != CHIP_NAVI12) 2090 return 0; 2091 } 2092 2093 switch (adev->asic_type) { 2094 default: 2095 return 0; 2096 case CHIP_VEGA10: 2097 chip_name = "vega10"; 2098 break; 2099 case CHIP_VEGA12: 2100 chip_name = "vega12"; 2101 break; 2102 case CHIP_RAVEN: 2103 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2104 chip_name = "raven2"; 2105 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2106 chip_name = "picasso"; 2107 else 2108 chip_name = "raven"; 2109 break; 2110 case CHIP_ARCTURUS: 2111 chip_name = "arcturus"; 2112 break; 2113 case CHIP_NAVI12: 2114 chip_name = "navi12"; 2115 break; 2116 } 2117 2118 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 2119 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name); 2120 if (err) { 2121 dev_err(adev->dev, 2122 "Failed to get gpu_info firmware \"%s\"\n", 2123 fw_name); 2124 goto out; 2125 } 2126 2127 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2128 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2129 2130 switch (hdr->version_major) { 2131 case 1: 2132 { 2133 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2134 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2135 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2136 2137 /* 2138 * Should be droped when DAL no longer needs it. 2139 */ 2140 if (adev->asic_type == CHIP_NAVI12) 2141 goto parse_soc_bounding_box; 2142 2143 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2144 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2145 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2146 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2147 adev->gfx.config.max_texture_channel_caches = 2148 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2149 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2150 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2151 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2152 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2153 adev->gfx.config.double_offchip_lds_buf = 2154 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2155 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2156 adev->gfx.cu_info.max_waves_per_simd = 2157 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2158 adev->gfx.cu_info.max_scratch_slots_per_cu = 2159 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2160 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2161 if (hdr->version_minor >= 1) { 2162 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2163 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2164 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2165 adev->gfx.config.num_sc_per_sh = 2166 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2167 adev->gfx.config.num_packer_per_sc = 2168 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2169 } 2170 2171 parse_soc_bounding_box: 2172 /* 2173 * soc bounding box info is not integrated in disocovery table, 2174 * we always need to parse it from gpu info firmware if needed. 
2175 */ 2176 if (hdr->version_minor == 2) { 2177 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2178 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2179 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2180 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2181 } 2182 break; 2183 } 2184 default: 2185 dev_err(adev->dev, 2186 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2187 err = -EINVAL; 2188 goto out; 2189 } 2190 out: 2191 return err; 2192 } 2193 2194 /** 2195 * amdgpu_device_ip_early_init - run early init for hardware IPs 2196 * 2197 * @adev: amdgpu_device pointer 2198 * 2199 * Early initialization pass for hardware IPs. The hardware IPs that make 2200 * up each asic are discovered each IP's early_init callback is run. This 2201 * is the first stage in initializing the asic. 2202 * Returns 0 on success, negative error code on failure. 2203 */ 2204 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2205 { 2206 struct drm_device *dev = adev_to_drm(adev); 2207 struct pci_dev *parent; 2208 int i, r; 2209 bool total; 2210 2211 amdgpu_device_enable_virtual_display(adev); 2212 2213 if (amdgpu_sriov_vf(adev)) { 2214 r = amdgpu_virt_request_full_gpu(adev, true); 2215 if (r) 2216 return r; 2217 } 2218 2219 switch (adev->asic_type) { 2220 #ifdef CONFIG_DRM_AMDGPU_SI 2221 case CHIP_VERDE: 2222 case CHIP_TAHITI: 2223 case CHIP_PITCAIRN: 2224 case CHIP_OLAND: 2225 case CHIP_HAINAN: 2226 adev->family = AMDGPU_FAMILY_SI; 2227 r = si_set_ip_blocks(adev); 2228 if (r) 2229 return r; 2230 break; 2231 #endif 2232 #ifdef CONFIG_DRM_AMDGPU_CIK 2233 case CHIP_BONAIRE: 2234 case CHIP_HAWAII: 2235 case CHIP_KAVERI: 2236 case CHIP_KABINI: 2237 case CHIP_MULLINS: 2238 if (adev->flags & AMD_IS_APU) 2239 adev->family = AMDGPU_FAMILY_KV; 2240 else 2241 adev->family = AMDGPU_FAMILY_CI; 2242 2243 r = cik_set_ip_blocks(adev); 2244 if (r) 2245 return r; 2246 break; 2247 #endif 2248 case CHIP_TOPAZ: 2249 case CHIP_TONGA: 2250 case CHIP_FIJI: 2251 case CHIP_POLARIS10: 2252 case CHIP_POLARIS11: 2253 case CHIP_POLARIS12: 2254 case CHIP_VEGAM: 2255 case CHIP_CARRIZO: 2256 case CHIP_STONEY: 2257 if (adev->flags & AMD_IS_APU) 2258 adev->family = AMDGPU_FAMILY_CZ; 2259 else 2260 adev->family = AMDGPU_FAMILY_VI; 2261 2262 r = vi_set_ip_blocks(adev); 2263 if (r) 2264 return r; 2265 break; 2266 default: 2267 r = amdgpu_discovery_set_ip_blocks(adev); 2268 if (r) 2269 return r; 2270 break; 2271 } 2272 2273 if (amdgpu_has_atpx() && 2274 (amdgpu_is_atpx_hybrid() || 2275 amdgpu_has_atpx_dgpu_power_cntl()) && 2276 ((adev->flags & AMD_IS_APU) == 0) && 2277 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev))) 2278 adev->flags |= AMD_IS_PX; 2279 2280 if (!(adev->flags & AMD_IS_APU)) { 2281 parent = pci_upstream_bridge(adev->pdev); 2282 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2283 } 2284 2285 2286 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2287 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2288 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2289 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2290 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2291 2292 total = true; 2293 for (i = 0; i < adev->num_ip_blocks; i++) { 2294 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2295 DRM_WARN("disabled ip block: %d <%s>\n", 2296 i, adev->ip_blocks[i].version->funcs->name); 2297 adev->ip_blocks[i].status.valid = false; 2298 } else { 2299 if (adev->ip_blocks[i].version->funcs->early_init) { 2300 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2301 if (r == -ENOENT) { 2302 adev->ip_blocks[i].status.valid = false; 2303 } else if (r) { 2304 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2305 adev->ip_blocks[i].version->funcs->name, r); 2306 total = false; 2307 } else { 2308 adev->ip_blocks[i].status.valid = true; 2309 } 2310 } else { 2311 adev->ip_blocks[i].status.valid = true; 2312 } 2313 } 2314 /* get the vbios after the asic_funcs are set up */ 2315 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2316 r = amdgpu_device_parse_gpu_info_fw(adev); 2317 if (r) 2318 return r; 2319 2320 /* Read BIOS */ 2321 if (!amdgpu_get_bios(adev)) 2322 return -EINVAL; 2323 2324 r = amdgpu_atombios_init(adev); 2325 if (r) { 2326 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2327 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2328 return r; 2329 } 2330 2331 /*get pf2vf msg info at it's earliest time*/ 2332 if (amdgpu_sriov_vf(adev)) 2333 amdgpu_virt_init_data_exchange(adev); 2334 2335 } 2336 } 2337 if (!total) 2338 return -ENODEV; 2339 2340 amdgpu_amdkfd_device_probe(adev); 2341 adev->cg_flags &= amdgpu_cg_mask; 2342 adev->pg_flags &= amdgpu_pg_mask; 2343 2344 return 0; 2345 } 2346 2347 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2348 { 2349 int i, r; 2350 2351 for (i = 0; i < adev->num_ip_blocks; i++) { 2352 if (!adev->ip_blocks[i].status.sw) 2353 continue; 2354 if (adev->ip_blocks[i].status.hw) 2355 continue; 2356 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2357 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2358 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2359 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2360 if (r) { 2361 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2362 adev->ip_blocks[i].version->funcs->name, r); 2363 return r; 2364 } 2365 adev->ip_blocks[i].status.hw = true; 2366 } 2367 } 2368 2369 return 0; 2370 } 2371 2372 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2373 { 2374 int i, r; 2375 2376 for (i = 0; i < adev->num_ip_blocks; i++) { 2377 if (!adev->ip_blocks[i].status.sw) 2378 continue; 2379 if (adev->ip_blocks[i].status.hw) 2380 continue; 2381 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2382 if (r) { 2383 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2384 adev->ip_blocks[i].version->funcs->name, r); 2385 return r; 2386 } 2387 adev->ip_blocks[i].status.hw = true; 2388 } 2389 2390 return 0; 2391 } 2392 2393 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2394 { 2395 int r = 0; 2396 int i; 2397 uint32_t smu_version; 2398 2399 if (adev->asic_type >= CHIP_VEGA10) { 2400 for (i = 0; i < adev->num_ip_blocks; i++) { 2401 if (adev->ip_blocks[i].version->type != 
AMD_IP_BLOCK_TYPE_PSP) 2402 continue; 2403 2404 if (!adev->ip_blocks[i].status.sw) 2405 continue; 2406 2407 /* no need to do the fw loading again if already done*/ 2408 if (adev->ip_blocks[i].status.hw == true) 2409 break; 2410 2411 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2412 r = adev->ip_blocks[i].version->funcs->resume(adev); 2413 if (r) { 2414 DRM_ERROR("resume of IP block <%s> failed %d\n", 2415 adev->ip_blocks[i].version->funcs->name, r); 2416 return r; 2417 } 2418 } else { 2419 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2420 if (r) { 2421 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2422 adev->ip_blocks[i].version->funcs->name, r); 2423 return r; 2424 } 2425 } 2426 2427 adev->ip_blocks[i].status.hw = true; 2428 break; 2429 } 2430 } 2431 2432 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2433 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2434 2435 return r; 2436 } 2437 2438 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2439 { 2440 long timeout; 2441 int r, i; 2442 2443 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2444 struct amdgpu_ring *ring = adev->rings[i]; 2445 2446 /* No need to setup the GPU scheduler for rings that don't need it */ 2447 if (!ring || ring->no_scheduler) 2448 continue; 2449 2450 switch (ring->funcs->type) { 2451 case AMDGPU_RING_TYPE_GFX: 2452 timeout = adev->gfx_timeout; 2453 break; 2454 case AMDGPU_RING_TYPE_COMPUTE: 2455 timeout = adev->compute_timeout; 2456 break; 2457 case AMDGPU_RING_TYPE_SDMA: 2458 timeout = adev->sdma_timeout; 2459 break; 2460 default: 2461 timeout = adev->video_timeout; 2462 break; 2463 } 2464 2465 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2466 ring->num_hw_submission, 0, 2467 timeout, adev->reset_domain->wq, 2468 ring->sched_score, ring->name, 2469 adev->dev); 2470 if (r) { 2471 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2472 ring->name); 2473 return r; 2474 } 2475 } 2476 2477 amdgpu_xcp_update_partition_sched_list(adev); 2478 2479 return 0; 2480 } 2481 2482 2483 /** 2484 * amdgpu_device_ip_init - run init for hardware IPs 2485 * 2486 * @adev: amdgpu_device pointer 2487 * 2488 * Main initialization pass for hardware IPs. The list of all the hardware 2489 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2490 * are run. sw_init initializes the software state associated with each IP 2491 * and hw_init initializes the hardware associated with each IP. 2492 * Returns 0 on success, negative error code on failure. 
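 *
 * A rough outline of the sequence implemented below:
 *
 *   amdgpu_ras_init()
 *   for each IP block: sw_init(), with COMMON and GMC also hw_init()ed
 *       early (the GMC pass covers scratch memory, writeback and CSA)
 *   amdgpu_ib_pool_init() and amdgpu_ucode_create_bo()
 *   amdgpu_device_ip_hw_init_phase1() -> firmware loading -> phase2
 *   RAS recovery init, XGMI reset-domain handling, scheduler init, KFD init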
2493 */ 2494 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2495 { 2496 int i, r; 2497 2498 r = amdgpu_ras_init(adev); 2499 if (r) 2500 return r; 2501 2502 for (i = 0; i < adev->num_ip_blocks; i++) { 2503 if (!adev->ip_blocks[i].status.valid) 2504 continue; 2505 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2506 if (r) { 2507 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2508 adev->ip_blocks[i].version->funcs->name, r); 2509 goto init_failed; 2510 } 2511 adev->ip_blocks[i].status.sw = true; 2512 2513 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2514 /* need to do common hw init early so everything is set up for gmc */ 2515 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2516 if (r) { 2517 DRM_ERROR("hw_init %d failed %d\n", i, r); 2518 goto init_failed; 2519 } 2520 adev->ip_blocks[i].status.hw = true; 2521 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2522 /* need to do gmc hw init early so we can allocate gpu mem */ 2523 /* Try to reserve bad pages early */ 2524 if (amdgpu_sriov_vf(adev)) 2525 amdgpu_virt_exchange_data(adev); 2526 2527 r = amdgpu_device_mem_scratch_init(adev); 2528 if (r) { 2529 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2530 goto init_failed; 2531 } 2532 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2533 if (r) { 2534 DRM_ERROR("hw_init %d failed %d\n", i, r); 2535 goto init_failed; 2536 } 2537 r = amdgpu_device_wb_init(adev); 2538 if (r) { 2539 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2540 goto init_failed; 2541 } 2542 adev->ip_blocks[i].status.hw = true; 2543 2544 /* right after GMC hw init, we create CSA */ 2545 if (amdgpu_mcbp) { 2546 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2547 AMDGPU_GEM_DOMAIN_VRAM | 2548 AMDGPU_GEM_DOMAIN_GTT, 2549 AMDGPU_CSA_SIZE); 2550 if (r) { 2551 DRM_ERROR("allocate CSA failed %d\n", r); 2552 goto init_failed; 2553 } 2554 } 2555 } 2556 } 2557 2558 if (amdgpu_sriov_vf(adev)) 2559 amdgpu_virt_init_data_exchange(adev); 2560 2561 r = amdgpu_ib_pool_init(adev); 2562 if (r) { 2563 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2564 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2565 goto init_failed; 2566 } 2567 2568 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2569 if (r) 2570 goto init_failed; 2571 2572 r = amdgpu_device_ip_hw_init_phase1(adev); 2573 if (r) 2574 goto init_failed; 2575 2576 r = amdgpu_device_fw_loading(adev); 2577 if (r) 2578 goto init_failed; 2579 2580 r = amdgpu_device_ip_hw_init_phase2(adev); 2581 if (r) 2582 goto init_failed; 2583 2584 /* 2585 * retired pages will be loaded from eeprom and reserved here, 2586 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2587 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2588 * for I2C communication which only true at this point. 2589 * 2590 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2591 * failure from bad gpu situation and stop amdgpu init process 2592 * accordingly. For other failed cases, it will still release all 2593 * the resource and print error message, rather than returning one 2594 * negative value to upper level. 
2595 * 2596 * Note: theoretically, this should be called before all vram allocations 2597 * to protect retired page from abusing 2598 */ 2599 r = amdgpu_ras_recovery_init(adev); 2600 if (r) 2601 goto init_failed; 2602 2603 /** 2604 * In case of XGMI grab extra reference for reset domain for this device 2605 */ 2606 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2607 if (amdgpu_xgmi_add_device(adev) == 0) { 2608 if (!amdgpu_sriov_vf(adev)) { 2609 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2610 2611 if (WARN_ON(!hive)) { 2612 r = -ENOENT; 2613 goto init_failed; 2614 } 2615 2616 if (!hive->reset_domain || 2617 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2618 r = -ENOENT; 2619 amdgpu_put_xgmi_hive(hive); 2620 goto init_failed; 2621 } 2622 2623 /* Drop the early temporary reset domain we created for device */ 2624 amdgpu_reset_put_reset_domain(adev->reset_domain); 2625 adev->reset_domain = hive->reset_domain; 2626 amdgpu_put_xgmi_hive(hive); 2627 } 2628 } 2629 } 2630 2631 r = amdgpu_device_init_schedulers(adev); 2632 if (r) 2633 goto init_failed; 2634 2635 /* Don't init kfd if whole hive need to be reset during init */ 2636 if (!adev->gmc.xgmi.pending_reset) { 2637 kgd2kfd_init_zone_device(adev); 2638 amdgpu_amdkfd_device_init(adev); 2639 } 2640 2641 amdgpu_fru_get_product_info(adev); 2642 2643 init_failed: 2644 2645 return r; 2646 } 2647 2648 /** 2649 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2650 * 2651 * @adev: amdgpu_device pointer 2652 * 2653 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2654 * this function before a GPU reset. If the value is retained after a 2655 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 2656 */ 2657 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2658 { 2659 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2660 } 2661 2662 /** 2663 * amdgpu_device_check_vram_lost - check if vram is valid 2664 * 2665 * @adev: amdgpu_device pointer 2666 * 2667 * Checks the reset magic value written to the gart pointer in VRAM. 2668 * The driver calls this after a GPU reset to see if the contents of 2669 * VRAM is lost or now. 2670 * returns true if vram is lost, false if not. 2671 */ 2672 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2673 { 2674 if (memcmp(adev->gart.ptr, adev->reset_magic, 2675 AMDGPU_RESET_MAGIC_NUM)) 2676 return true; 2677 2678 if (!amdgpu_in_reset(adev)) 2679 return false; 2680 2681 /* 2682 * For all ASICs with baco/mode1 reset, the VRAM is 2683 * always assumed to be lost. 2684 */ 2685 switch (amdgpu_asic_reset_method(adev)) { 2686 case AMD_RESET_METHOD_BACO: 2687 case AMD_RESET_METHOD_MODE1: 2688 return true; 2689 default: 2690 return false; 2691 } 2692 } 2693 2694 /** 2695 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2696 * 2697 * @adev: amdgpu_device pointer 2698 * @state: clockgating state (gate or ungate) 2699 * 2700 * The list of all the hardware IPs that make up the asic is walked and the 2701 * set_clockgating_state callbacks are run. 2702 * Late initialization pass enabling clockgating for hardware IPs. 2703 * Fini or suspend, pass disabling clockgating for hardware IPs. 2704 * Returns 0 on success, negative error code on failure. 
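 *
 * Typical callers in this file mirror the late-init and fini paths below:
 *
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);    (late init)
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);  (fini/suspend)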
2705 */ 2706 2707 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2708 enum amd_clockgating_state state) 2709 { 2710 int i, j, r; 2711 2712 if (amdgpu_emu_mode == 1) 2713 return 0; 2714 2715 for (j = 0; j < adev->num_ip_blocks; j++) { 2716 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2717 if (!adev->ip_blocks[i].status.late_initialized) 2718 continue; 2719 /* skip CG for GFX, SDMA on S0ix */ 2720 if (adev->in_s0ix && 2721 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2722 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2723 continue; 2724 /* skip CG for VCE/UVD, it's handled specially */ 2725 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2726 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2727 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2728 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2729 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2730 /* enable clockgating to save power */ 2731 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2732 state); 2733 if (r) { 2734 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2735 adev->ip_blocks[i].version->funcs->name, r); 2736 return r; 2737 } 2738 } 2739 } 2740 2741 return 0; 2742 } 2743 2744 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2745 enum amd_powergating_state state) 2746 { 2747 int i, j, r; 2748 2749 if (amdgpu_emu_mode == 1) 2750 return 0; 2751 2752 for (j = 0; j < adev->num_ip_blocks; j++) { 2753 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2754 if (!adev->ip_blocks[i].status.late_initialized) 2755 continue; 2756 /* skip PG for GFX, SDMA on S0ix */ 2757 if (adev->in_s0ix && 2758 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2759 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2760 continue; 2761 /* skip CG for VCE/UVD, it's handled specially */ 2762 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2763 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2764 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2765 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2766 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2767 /* enable powergating to save power */ 2768 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2769 state); 2770 if (r) { 2771 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2772 adev->ip_blocks[i].version->funcs->name, r); 2773 return r; 2774 } 2775 } 2776 } 2777 return 0; 2778 } 2779 2780 static int amdgpu_device_enable_mgpu_fan_boost(void) 2781 { 2782 struct amdgpu_gpu_instance *gpu_ins; 2783 struct amdgpu_device *adev; 2784 int i, ret = 0; 2785 2786 mutex_lock(&mgpu_info.mutex); 2787 2788 /* 2789 * MGPU fan boost feature should be enabled 2790 * only when there are two or more dGPUs in 2791 * the system 2792 */ 2793 if (mgpu_info.num_dgpu < 2) 2794 goto out; 2795 2796 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2797 gpu_ins = &(mgpu_info.gpu_ins[i]); 2798 adev = gpu_ins->adev; 2799 if (!(adev->flags & AMD_IS_APU) && 2800 !gpu_ins->mgpu_fan_enabled) { 2801 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2802 if (ret) 2803 break; 2804 2805 gpu_ins->mgpu_fan_enabled = 1; 2806 } 2807 } 2808 2809 out: 2810 mutex_unlock(&mgpu_info.mutex); 2811 2812 return ret; 2813 } 2814 2815 /** 2816 * amdgpu_device_ip_late_init - run late init for hardware IPs 2817 * 2818 * @adev: 
amdgpu_device pointer 2819 * 2820 * Late initialization pass for hardware IPs. The list of all the hardware 2821 * IPs that make up the asic is walked and the late_init callbacks are run. 2822 * late_init covers any special initialization that an IP requires 2823 * after all of the have been initialized or something that needs to happen 2824 * late in the init process. 2825 * Returns 0 on success, negative error code on failure. 2826 */ 2827 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2828 { 2829 struct amdgpu_gpu_instance *gpu_instance; 2830 int i = 0, r; 2831 2832 for (i = 0; i < adev->num_ip_blocks; i++) { 2833 if (!adev->ip_blocks[i].status.hw) 2834 continue; 2835 if (adev->ip_blocks[i].version->funcs->late_init) { 2836 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2837 if (r) { 2838 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2839 adev->ip_blocks[i].version->funcs->name, r); 2840 return r; 2841 } 2842 } 2843 adev->ip_blocks[i].status.late_initialized = true; 2844 } 2845 2846 r = amdgpu_ras_late_init(adev); 2847 if (r) { 2848 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 2849 return r; 2850 } 2851 2852 amdgpu_ras_set_error_query_ready(adev, true); 2853 2854 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2855 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2856 2857 amdgpu_device_fill_reset_magic(adev); 2858 2859 r = amdgpu_device_enable_mgpu_fan_boost(); 2860 if (r) 2861 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2862 2863 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */ 2864 if (amdgpu_passthrough(adev) && 2865 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 2866 adev->asic_type == CHIP_ALDEBARAN)) 2867 amdgpu_dpm_handle_passthrough_sbr(adev, true); 2868 2869 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2870 mutex_lock(&mgpu_info.mutex); 2871 2872 /* 2873 * Reset device p-state to low as this was booted with high. 2874 * 2875 * This should be performed only after all devices from the same 2876 * hive get initialized. 2877 * 2878 * However, it's unknown how many device in the hive in advance. 2879 * As this is counted one by one during devices initializations. 2880 * 2881 * So, we wait for all XGMI interlinked devices initialized. 2882 * This may bring some delays as those devices may come from 2883 * different hives. But that should be OK. 
2884 */ 2885 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2886 for (i = 0; i < mgpu_info.num_gpu; i++) { 2887 gpu_instance = &(mgpu_info.gpu_ins[i]); 2888 if (gpu_instance->adev->flags & AMD_IS_APU) 2889 continue; 2890 2891 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2892 AMDGPU_XGMI_PSTATE_MIN); 2893 if (r) { 2894 DRM_ERROR("pstate setting failed (%d).\n", r); 2895 break; 2896 } 2897 } 2898 } 2899 2900 mutex_unlock(&mgpu_info.mutex); 2901 } 2902 2903 return 0; 2904 } 2905 2906 /** 2907 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2908 * 2909 * @adev: amdgpu_device pointer 2910 * 2911 * For ASICs need to disable SMC first 2912 */ 2913 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2914 { 2915 int i, r; 2916 2917 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) 2918 return; 2919 2920 for (i = 0; i < adev->num_ip_blocks; i++) { 2921 if (!adev->ip_blocks[i].status.hw) 2922 continue; 2923 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2924 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2925 /* XXX handle errors */ 2926 if (r) { 2927 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2928 adev->ip_blocks[i].version->funcs->name, r); 2929 } 2930 adev->ip_blocks[i].status.hw = false; 2931 break; 2932 } 2933 } 2934 } 2935 2936 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2937 { 2938 int i, r; 2939 2940 for (i = 0; i < adev->num_ip_blocks; i++) { 2941 if (!adev->ip_blocks[i].version->funcs->early_fini) 2942 continue; 2943 2944 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2945 if (r) { 2946 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2947 adev->ip_blocks[i].version->funcs->name, r); 2948 } 2949 } 2950 2951 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2952 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2953 2954 amdgpu_amdkfd_suspend(adev, false); 2955 2956 /* Workaroud for ASICs need to disable SMC first */ 2957 amdgpu_device_smu_fini_early(adev); 2958 2959 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2960 if (!adev->ip_blocks[i].status.hw) 2961 continue; 2962 2963 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2964 /* XXX handle errors */ 2965 if (r) { 2966 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2967 adev->ip_blocks[i].version->funcs->name, r); 2968 } 2969 2970 adev->ip_blocks[i].status.hw = false; 2971 } 2972 2973 if (amdgpu_sriov_vf(adev)) { 2974 if (amdgpu_virt_release_full_gpu(adev, false)) 2975 DRM_ERROR("failed to release exclusive mode on fini\n"); 2976 } 2977 2978 return 0; 2979 } 2980 2981 /** 2982 * amdgpu_device_ip_fini - run fini for hardware IPs 2983 * 2984 * @adev: amdgpu_device pointer 2985 * 2986 * Main teardown pass for hardware IPs. The list of all the hardware 2987 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2988 * are run. hw_fini tears down the hardware associated with each IP 2989 * and sw_fini tears down any software state associated with each IP. 2990 * Returns 0 on success, negative error code on failure. 
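 *
 * Teardown runs in the reverse order of init: KFD and XGMI are detached
 * first, then sw_fini() is called for each block from last to first (the
 * GMC pass also frees the ucode BO, static CSA, writeback and scratch
 * memory and the IB pool), and finally the late_fini() callbacks and
 * amdgpu_ras_fini() run.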
2991 */ 2992 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2993 { 2994 int i, r; 2995 2996 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2997 amdgpu_virt_release_ras_err_handler_data(adev); 2998 2999 if (adev->gmc.xgmi.num_physical_nodes > 1) 3000 amdgpu_xgmi_remove_device(adev); 3001 3002 amdgpu_amdkfd_device_fini_sw(adev); 3003 3004 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3005 if (!adev->ip_blocks[i].status.sw) 3006 continue; 3007 3008 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3009 amdgpu_ucode_free_bo(adev); 3010 amdgpu_free_static_csa(&adev->virt.csa_obj); 3011 amdgpu_device_wb_fini(adev); 3012 amdgpu_device_mem_scratch_fini(adev); 3013 amdgpu_ib_pool_fini(adev); 3014 } 3015 3016 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 3017 /* XXX handle errors */ 3018 if (r) { 3019 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3020 adev->ip_blocks[i].version->funcs->name, r); 3021 } 3022 adev->ip_blocks[i].status.sw = false; 3023 adev->ip_blocks[i].status.valid = false; 3024 } 3025 3026 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3027 if (!adev->ip_blocks[i].status.late_initialized) 3028 continue; 3029 if (adev->ip_blocks[i].version->funcs->late_fini) 3030 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 3031 adev->ip_blocks[i].status.late_initialized = false; 3032 } 3033 3034 amdgpu_ras_fini(adev); 3035 3036 return 0; 3037 } 3038 3039 /** 3040 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3041 * 3042 * @work: work_struct. 3043 */ 3044 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3045 { 3046 struct amdgpu_device *adev = 3047 container_of(work, struct amdgpu_device, delayed_init_work.work); 3048 int r; 3049 3050 r = amdgpu_ib_ring_tests(adev); 3051 if (r) 3052 DRM_ERROR("ib ring test failed (%d).\n", r); 3053 } 3054 3055 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3056 { 3057 struct amdgpu_device *adev = 3058 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3059 3060 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3061 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3062 3063 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 3064 adev->gfx.gfx_off_state = true; 3065 } 3066 3067 /** 3068 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3069 * 3070 * @adev: amdgpu_device pointer 3071 * 3072 * Main suspend function for hardware IPs. The list of all the hardware 3073 * IPs that make up the asic is walked, clockgating is disabled and the 3074 * suspend callbacks are run. suspend puts the hardware and software state 3075 * in each IP into a state suitable for suspend. 3076 * Returns 0 on success, negative error code on failure. 3077 */ 3078 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3079 { 3080 int i, r; 3081 3082 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3083 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3084 3085 /* 3086 * Per PMFW team's suggestion, driver needs to handle gfxoff 3087 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3088 * scenario. Add the missing df cstate disablement here. 
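 * Only the disallow side is handled here; the matching re-allow
 * (DF_CSTATE_ALLOW) is expected to be issued from the reset/resume path
 * once the ASIC is back up.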
3089 */ 3090 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3091 dev_warn(adev->dev, "Failed to disallow df cstate"); 3092 3093 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3094 if (!adev->ip_blocks[i].status.valid) 3095 continue; 3096 3097 /* displays are handled separately */ 3098 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3099 continue; 3100 3101 /* XXX handle errors */ 3102 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3103 /* XXX handle errors */ 3104 if (r) { 3105 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3106 adev->ip_blocks[i].version->funcs->name, r); 3107 return r; 3108 } 3109 3110 adev->ip_blocks[i].status.hw = false; 3111 } 3112 3113 return 0; 3114 } 3115 3116 /** 3117 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3118 * 3119 * @adev: amdgpu_device pointer 3120 * 3121 * Main suspend function for hardware IPs. The list of all the hardware 3122 * IPs that make up the asic is walked, clockgating is disabled and the 3123 * suspend callbacks are run. suspend puts the hardware and software state 3124 * in each IP into a state suitable for suspend. 3125 * Returns 0 on success, negative error code on failure. 3126 */ 3127 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3128 { 3129 int i, r; 3130 3131 if (adev->in_s0ix) 3132 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3133 3134 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3135 if (!adev->ip_blocks[i].status.valid) 3136 continue; 3137 /* displays are handled in phase1 */ 3138 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3139 continue; 3140 /* PSP lost connection when err_event_athub occurs */ 3141 if (amdgpu_ras_intr_triggered() && 3142 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3143 adev->ip_blocks[i].status.hw = false; 3144 continue; 3145 } 3146 3147 /* skip unnecessary suspend if we do not initialize them yet */ 3148 if (adev->gmc.xgmi.pending_reset && 3149 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3150 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3151 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3152 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3153 adev->ip_blocks[i].status.hw = false; 3154 continue; 3155 } 3156 3157 /* skip suspend of gfx/mes and psp for S0ix 3158 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3159 * like at runtime. PSP is also part of the always on hardware 3160 * so no need to suspend it. 3161 */ 3162 if (adev->in_s0ix && 3163 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3164 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3165 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3166 continue; 3167 3168 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3169 if (adev->in_s0ix && 3170 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) && 3171 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3172 continue; 3173 3174 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3175 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3176 * from this location and RLC Autoload automatically also gets loaded 3177 * from here based on PMFW -> PSP message during re-init sequence. 3178 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3179 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3180 */ 3181 if (amdgpu_in_reset(adev) && 3182 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3183 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3184 continue; 3185 3186 /* XXX handle errors */ 3187 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3188 /* XXX handle errors */ 3189 if (r) { 3190 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3191 adev->ip_blocks[i].version->funcs->name, r); 3192 } 3193 adev->ip_blocks[i].status.hw = false; 3194 /* handle putting the SMC in the appropriate state */ 3195 if (!amdgpu_sriov_vf(adev)) { 3196 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3197 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3198 if (r) { 3199 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3200 adev->mp1_state, r); 3201 return r; 3202 } 3203 } 3204 } 3205 } 3206 3207 return 0; 3208 } 3209 3210 /** 3211 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3212 * 3213 * @adev: amdgpu_device pointer 3214 * 3215 * Main suspend function for hardware IPs. The list of all the hardware 3216 * IPs that make up the asic is walked, clockgating is disabled and the 3217 * suspend callbacks are run. suspend puts the hardware and software state 3218 * in each IP into a state suitable for suspend. 3219 * Returns 0 on success, negative error code on failure. 3220 */ 3221 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3222 { 3223 int r; 3224 3225 if (amdgpu_sriov_vf(adev)) { 3226 amdgpu_virt_fini_data_exchange(adev); 3227 amdgpu_virt_request_full_gpu(adev, false); 3228 } 3229 3230 r = amdgpu_device_ip_suspend_phase1(adev); 3231 if (r) 3232 return r; 3233 r = amdgpu_device_ip_suspend_phase2(adev); 3234 3235 if (amdgpu_sriov_vf(adev)) 3236 amdgpu_virt_release_full_gpu(adev, false); 3237 3238 return r; 3239 } 3240 3241 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3242 { 3243 int i, r; 3244 3245 static enum amd_ip_block_type ip_order[] = { 3246 AMD_IP_BLOCK_TYPE_COMMON, 3247 AMD_IP_BLOCK_TYPE_GMC, 3248 AMD_IP_BLOCK_TYPE_PSP, 3249 AMD_IP_BLOCK_TYPE_IH, 3250 }; 3251 3252 for (i = 0; i < adev->num_ip_blocks; i++) { 3253 int j; 3254 struct amdgpu_ip_block *block; 3255 3256 block = &adev->ip_blocks[i]; 3257 block->status.hw = false; 3258 3259 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3260 3261 if (block->version->type != ip_order[j] || 3262 !block->status.valid) 3263 continue; 3264 3265 r = block->version->funcs->hw_init(adev); 3266 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3267 if (r) 3268 return r; 3269 block->status.hw = true; 3270 } 3271 } 3272 3273 return 0; 3274 } 3275 3276 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3277 { 3278 int i, r; 3279 3280 static enum amd_ip_block_type ip_order[] = { 3281 AMD_IP_BLOCK_TYPE_SMC, 3282 AMD_IP_BLOCK_TYPE_DCE, 3283 AMD_IP_BLOCK_TYPE_GFX, 3284 AMD_IP_BLOCK_TYPE_SDMA, 3285 AMD_IP_BLOCK_TYPE_MES, 3286 AMD_IP_BLOCK_TYPE_UVD, 3287 AMD_IP_BLOCK_TYPE_VCE, 3288 AMD_IP_BLOCK_TYPE_VCN, 3289 AMD_IP_BLOCK_TYPE_JPEG 3290 }; 3291 3292 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3293 int j; 3294 struct amdgpu_ip_block *block; 3295 3296 for (j = 0; j < adev->num_ip_blocks; j++) { 3297 block = &adev->ip_blocks[j]; 3298 3299 if (block->version->type != ip_order[i] || 3300 !block->status.valid || 3301 block->status.hw) 3302 continue; 3303 3304 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3305 r = block->version->funcs->resume(adev); 3306 else 3307 r = block->version->funcs->hw_init(adev); 3308 3309 
DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3310 if (r) 3311 return r; 3312 block->status.hw = true; 3313 } 3314 } 3315 3316 return 0; 3317 } 3318 3319 /** 3320 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3321 * 3322 * @adev: amdgpu_device pointer 3323 * 3324 * First resume function for hardware IPs. The list of all the hardware 3325 * IPs that make up the asic is walked and the resume callbacks are run for 3326 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3327 * after a suspend and updates the software state as necessary. This 3328 * function is also used for restoring the GPU after a GPU reset. 3329 * Returns 0 on success, negative error code on failure. 3330 */ 3331 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3332 { 3333 int i, r; 3334 3335 for (i = 0; i < adev->num_ip_blocks; i++) { 3336 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3337 continue; 3338 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3339 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3340 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3341 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3342 3343 r = adev->ip_blocks[i].version->funcs->resume(adev); 3344 if (r) { 3345 DRM_ERROR("resume of IP block <%s> failed %d\n", 3346 adev->ip_blocks[i].version->funcs->name, r); 3347 return r; 3348 } 3349 adev->ip_blocks[i].status.hw = true; 3350 } 3351 } 3352 3353 return 0; 3354 } 3355 3356 /** 3357 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3358 * 3359 * @adev: amdgpu_device pointer 3360 * 3361 * First resume function for hardware IPs. The list of all the hardware 3362 * IPs that make up the asic is walked and the resume callbacks are run for 3363 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3364 * functional state after a suspend and updates the software state as 3365 * necessary. This function is also used for restoring the GPU after a GPU 3366 * reset. 3367 * Returns 0 on success, negative error code on failure. 3368 */ 3369 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3370 { 3371 int i, r; 3372 3373 for (i = 0; i < adev->num_ip_blocks; i++) { 3374 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3375 continue; 3376 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3377 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3378 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3379 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3380 continue; 3381 r = adev->ip_blocks[i].version->funcs->resume(adev); 3382 if (r) { 3383 DRM_ERROR("resume of IP block <%s> failed %d\n", 3384 adev->ip_blocks[i].version->funcs->name, r); 3385 return r; 3386 } 3387 adev->ip_blocks[i].status.hw = true; 3388 } 3389 3390 return 0; 3391 } 3392 3393 /** 3394 * amdgpu_device_ip_resume - run resume for hardware IPs 3395 * 3396 * @adev: amdgpu_device pointer 3397 * 3398 * Main resume function for hardware IPs. The hardware IPs 3399 * are split into two resume functions because they are 3400 * are also used in in recovering from a GPU reset and some additional 3401 * steps need to be take between them. In this case (S3/S4) they are 3402 * run sequentially. 3403 * Returns 0 on success, negative error code on failure. 
3404 */ 3405 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3406 { 3407 int r; 3408 3409 if (!adev->in_s0ix) { 3410 r = amdgpu_amdkfd_resume_iommu(adev); 3411 if (r) 3412 return r; 3413 } 3414 3415 r = amdgpu_device_ip_resume_phase1(adev); 3416 if (r) 3417 return r; 3418 3419 r = amdgpu_device_fw_loading(adev); 3420 if (r) 3421 return r; 3422 3423 r = amdgpu_device_ip_resume_phase2(adev); 3424 3425 return r; 3426 } 3427 3428 /** 3429 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3430 * 3431 * @adev: amdgpu_device pointer 3432 * 3433 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3434 */ 3435 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3436 { 3437 if (amdgpu_sriov_vf(adev)) { 3438 if (adev->is_atom_fw) { 3439 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3440 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3441 } else { 3442 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3443 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3444 } 3445 3446 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3447 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3448 } 3449 } 3450 3451 /** 3452 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3453 * 3454 * @asic_type: AMD asic type 3455 * 3456 * Check if there is DC (new modesetting infrastructre) support for an asic. 3457 * returns true if DC has support, false if not. 3458 */ 3459 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3460 { 3461 switch (asic_type) { 3462 #ifdef CONFIG_DRM_AMDGPU_SI 3463 case CHIP_HAINAN: 3464 #endif 3465 case CHIP_TOPAZ: 3466 /* chips with no display hardware */ 3467 return false; 3468 #if defined(CONFIG_DRM_AMD_DC) 3469 case CHIP_TAHITI: 3470 case CHIP_PITCAIRN: 3471 case CHIP_VERDE: 3472 case CHIP_OLAND: 3473 /* 3474 * We have systems in the wild with these ASICs that require 3475 * LVDS and VGA support which is not supported with DC. 3476 * 3477 * Fallback to the non-DC driver here by default so as not to 3478 * cause regressions. 3479 */ 3480 #if defined(CONFIG_DRM_AMD_DC_SI) 3481 return amdgpu_dc > 0; 3482 #else 3483 return false; 3484 #endif 3485 case CHIP_BONAIRE: 3486 case CHIP_KAVERI: 3487 case CHIP_KABINI: 3488 case CHIP_MULLINS: 3489 /* 3490 * We have systems in the wild with these ASICs that require 3491 * VGA support which is not supported with DC. 3492 * 3493 * Fallback to the non-DC driver here by default so as not to 3494 * cause regressions. 
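 * DC can still be explicitly opted into on these parts with the
 * amdgpu.dc=1 module parameter, which is what the amdgpu_dc > 0 check
 * below honors.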
3495 */ 3496 return amdgpu_dc > 0; 3497 default: 3498 return amdgpu_dc != 0; 3499 #else 3500 default: 3501 if (amdgpu_dc > 0) 3502 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3503 "but isn't supported by ASIC, ignoring\n"); 3504 return false; 3505 #endif 3506 } 3507 } 3508 3509 /** 3510 * amdgpu_device_has_dc_support - check if dc is supported 3511 * 3512 * @adev: amdgpu_device pointer 3513 * 3514 * Returns true for supported, false for not supported 3515 */ 3516 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3517 { 3518 if (adev->enable_virtual_display || 3519 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3520 return false; 3521 3522 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3523 } 3524 3525 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3526 { 3527 struct amdgpu_device *adev = 3528 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3529 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3530 3531 /* It's a bug to not have a hive within this function */ 3532 if (WARN_ON(!hive)) 3533 return; 3534 3535 /* 3536 * Use task barrier to synchronize all xgmi reset works across the 3537 * hive. task_barrier_enter and task_barrier_exit will block 3538 * until all the threads running the xgmi reset works reach 3539 * those points. task_barrier_full will do both blocks. 3540 */ 3541 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3542 3543 task_barrier_enter(&hive->tb); 3544 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3545 3546 if (adev->asic_reset_res) 3547 goto fail; 3548 3549 task_barrier_exit(&hive->tb); 3550 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3551 3552 if (adev->asic_reset_res) 3553 goto fail; 3554 3555 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3556 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3557 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3558 } else { 3559 3560 task_barrier_full(&hive->tb); 3561 adev->asic_reset_res = amdgpu_asic_reset(adev); 3562 } 3563 3564 fail: 3565 if (adev->asic_reset_res) 3566 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3567 adev->asic_reset_res, adev_to_drm(adev)->unique); 3568 amdgpu_put_xgmi_hive(hive); 3569 } 3570 3571 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3572 { 3573 char *input = amdgpu_lockup_timeout; 3574 char *timeout_setting = NULL; 3575 int index = 0; 3576 long timeout; 3577 int ret = 0; 3578 3579 /* 3580 * By default timeout for non compute jobs is 10000 3581 * and 60000 for compute jobs. 3582 * In SR-IOV or passthrough mode, timeout for compute 3583 * jobs are 60000 by default. 3584 */ 3585 adev->gfx_timeout = msecs_to_jiffies(10000); 3586 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3587 if (amdgpu_sriov_vf(adev)) 3588 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3589 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3590 else 3591 adev->compute_timeout = msecs_to_jiffies(60000); 3592 3593 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3594 while ((timeout_setting = strsep(&input, ",")) && 3595 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3596 ret = kstrtol(timeout_setting, 0, &timeout); 3597 if (ret) 3598 return ret; 3599 3600 if (timeout == 0) { 3601 index++; 3602 continue; 3603 } else if (timeout < 0) { 3604 timeout = MAX_SCHEDULE_TIMEOUT; 3605 dev_warn(adev->dev, "lockup timeout disabled"); 3606 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3607 } else { 3608 timeout = msecs_to_jiffies(timeout); 3609 } 3610 3611 switch (index++) { 3612 case 0: 3613 adev->gfx_timeout = timeout; 3614 break; 3615 case 1: 3616 adev->compute_timeout = timeout; 3617 break; 3618 case 2: 3619 adev->sdma_timeout = timeout; 3620 break; 3621 case 3: 3622 adev->video_timeout = timeout; 3623 break; 3624 default: 3625 break; 3626 } 3627 } 3628 /* 3629 * There is only one value specified and 3630 * it should apply to all non-compute jobs. 3631 */ 3632 if (index == 1) { 3633 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3634 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3635 adev->compute_timeout = adev->gfx_timeout; 3636 } 3637 } 3638 3639 return ret; 3640 } 3641 3642 /** 3643 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3644 * 3645 * @adev: amdgpu_device pointer 3646 * 3647 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3648 */ 3649 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3650 { 3651 struct iommu_domain *domain; 3652 3653 domain = iommu_get_domain_for_dev(adev->dev); 3654 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3655 adev->ram_is_direct_mapped = true; 3656 } 3657 3658 static const struct attribute *amdgpu_dev_attributes[] = { 3659 &dev_attr_product_name.attr, 3660 &dev_attr_product_number.attr, 3661 &dev_attr_serial_number.attr, 3662 &dev_attr_pcie_replay_count.attr, 3663 NULL 3664 }; 3665 3666 /** 3667 * amdgpu_device_init - initialize the driver 3668 * 3669 * @adev: amdgpu_device pointer 3670 * @flags: driver flags 3671 * 3672 * Initializes the driver info and hw (all asics). 3673 * Returns 0 for success or an error on failure. 3674 * Called at driver startup. 
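 *
 * Called once from the driver load path; a minimal sketch of a caller
 * (error handling elided, flags as provided by the probe code):
 *
 *   r = amdgpu_device_init(adev, flags);
 *   if (r)
 *       return r;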
3675 */ 3676 int amdgpu_device_init(struct amdgpu_device *adev, 3677 uint32_t flags) 3678 { 3679 struct drm_device *ddev = adev_to_drm(adev); 3680 struct pci_dev *pdev = adev->pdev; 3681 int r, i; 3682 bool px = false; 3683 u32 max_MBps; 3684 int tmp; 3685 3686 adev->shutdown = false; 3687 adev->flags = flags; 3688 3689 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3690 adev->asic_type = amdgpu_force_asic_type; 3691 else 3692 adev->asic_type = flags & AMD_ASIC_MASK; 3693 3694 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3695 if (amdgpu_emu_mode == 1) 3696 adev->usec_timeout *= 10; 3697 adev->gmc.gart_size = 512 * 1024 * 1024; 3698 adev->accel_working = false; 3699 adev->num_rings = 0; 3700 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3701 adev->mman.buffer_funcs = NULL; 3702 adev->mman.buffer_funcs_ring = NULL; 3703 adev->vm_manager.vm_pte_funcs = NULL; 3704 adev->vm_manager.vm_pte_num_scheds = 0; 3705 adev->gmc.gmc_funcs = NULL; 3706 adev->harvest_ip_mask = 0x0; 3707 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3708 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3709 3710 adev->smc_rreg = &amdgpu_invalid_rreg; 3711 adev->smc_wreg = &amdgpu_invalid_wreg; 3712 adev->pcie_rreg = &amdgpu_invalid_rreg; 3713 adev->pcie_wreg = &amdgpu_invalid_wreg; 3714 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 3715 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 3716 adev->pciep_rreg = &amdgpu_invalid_rreg; 3717 adev->pciep_wreg = &amdgpu_invalid_wreg; 3718 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3719 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3720 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3721 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3722 adev->didt_rreg = &amdgpu_invalid_rreg; 3723 adev->didt_wreg = &amdgpu_invalid_wreg; 3724 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3725 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3726 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3727 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3728 3729 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3730 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3731 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3732 3733 /* mutex initialization are all done here so we 3734 * can recall function without having locking issues */ 3735 mutex_init(&adev->firmware.mutex); 3736 mutex_init(&adev->pm.mutex); 3737 mutex_init(&adev->gfx.gpu_clock_mutex); 3738 mutex_init(&adev->srbm_mutex); 3739 mutex_init(&adev->gfx.pipe_reserve_mutex); 3740 mutex_init(&adev->gfx.gfx_off_mutex); 3741 mutex_init(&adev->gfx.partition_mutex); 3742 mutex_init(&adev->grbm_idx_mutex); 3743 mutex_init(&adev->mn_lock); 3744 mutex_init(&adev->virt.vf_errors.lock); 3745 hash_init(adev->mn_hash); 3746 mutex_init(&adev->psp.mutex); 3747 mutex_init(&adev->notifier_lock); 3748 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3749 mutex_init(&adev->benchmark_mutex); 3750 3751 amdgpu_device_init_apu_flags(adev); 3752 3753 r = amdgpu_device_check_arguments(adev); 3754 if (r) 3755 return r; 3756 3757 spin_lock_init(&adev->mmio_idx_lock); 3758 spin_lock_init(&adev->smc_idx_lock); 3759 spin_lock_init(&adev->pcie_idx_lock); 3760 spin_lock_init(&adev->uvd_ctx_idx_lock); 3761 spin_lock_init(&adev->didt_idx_lock); 3762 spin_lock_init(&adev->gc_cac_idx_lock); 3763 spin_lock_init(&adev->se_cac_idx_lock); 3764 spin_lock_init(&adev->audio_endpt_idx_lock); 3765 spin_lock_init(&adev->mm_stats.lock); 3766 3767 
INIT_LIST_HEAD(&adev->shadow_list);
3768 mutex_init(&adev->shadow_list_lock);
3769
3770 INIT_LIST_HEAD(&adev->reset_list);
3771
3772 INIT_LIST_HEAD(&adev->ras_list);
3773
3774 INIT_DELAYED_WORK(&adev->delayed_init_work,
3775 amdgpu_device_delayed_init_work_handler);
3776 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3777 amdgpu_device_delay_enable_gfx_off);
3778
3779 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3780
3781 adev->gfx.gfx_off_req_count = 1;
3782 adev->gfx.gfx_off_residency = 0;
3783 adev->gfx.gfx_off_entrycount = 0;
3784 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3785
3786 atomic_set(&adev->throttling_logging_enabled, 1);
3787 /*
3788 * If throttling continues, logging will be performed every minute
3789 * to avoid log flooding. "-1" is subtracted since the thermal
3790 * throttling interrupt comes every second. Thus, the total logging
3791 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3792 * for the throttling interrupt) = 60 seconds.
3793 */
3794 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3795 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3796
3797 /* Registers mapping */
3798 /* TODO: block userspace mapping of io register */
3799 if (adev->asic_type >= CHIP_BONAIRE) {
3800 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3801 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3802 } else {
3803 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3804 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3805 }
3806
3807 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
3808 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
3809
3810 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3811 if (adev->rmmio == NULL) {
3812 return -ENOMEM;
3813 }
3814 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3815 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3816
3817 if (amdgpu_mcbp)
3818 DRM_INFO("MCBP is enabled\n");
3819
3820 /*
3821 * Reset domain needs to be present early, before the XGMI hive is discovered
3822 * (if any) and initialized, to use the reset sem and in_gpu reset flag
3823 * early on during init and before calling RREG32.
3824 */ 3825 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3826 if (!adev->reset_domain) 3827 return -ENOMEM; 3828 3829 /* detect hw virtualization here */ 3830 amdgpu_detect_virtualization(adev); 3831 3832 amdgpu_device_get_pcie_info(adev); 3833 3834 r = amdgpu_device_get_job_timeout_settings(adev); 3835 if (r) { 3836 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3837 return r; 3838 } 3839 3840 /* early init functions */ 3841 r = amdgpu_device_ip_early_init(adev); 3842 if (r) 3843 return r; 3844 3845 /* Get rid of things like offb */ 3846 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 3847 if (r) 3848 return r; 3849 3850 /* Enable TMZ based on IP_VERSION */ 3851 amdgpu_gmc_tmz_set(adev); 3852 3853 amdgpu_gmc_noretry_set(adev); 3854 /* Need to get xgmi info early to decide the reset behavior*/ 3855 if (adev->gmc.xgmi.supported) { 3856 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3857 if (r) 3858 return r; 3859 } 3860 3861 /* enable PCIE atomic ops */ 3862 if (amdgpu_sriov_vf(adev)) { 3863 if (adev->virt.fw_reserve.p_pf2vf) 3864 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3865 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3866 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3867 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a 3868 * internal path natively support atomics, set have_atomics_support to true. 3869 */ 3870 } else if ((adev->flags & AMD_IS_APU) && 3871 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) { 3872 adev->have_atomics_support = true; 3873 } else { 3874 adev->have_atomics_support = 3875 !pci_enable_atomic_ops_to_root(adev->pdev, 3876 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3877 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3878 } 3879 3880 if (!adev->have_atomics_support) 3881 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3882 3883 /* doorbell bar mapping and doorbell index init*/ 3884 amdgpu_device_doorbell_init(adev); 3885 3886 if (amdgpu_emu_mode == 1) { 3887 /* post the asic on emulation mode */ 3888 emu_soc_asic_init(adev); 3889 goto fence_driver_init; 3890 } 3891 3892 amdgpu_reset_init(adev); 3893 3894 /* detect if we are with an SRIOV vbios */ 3895 if (adev->bios) 3896 amdgpu_device_detect_sriov_bios(adev); 3897 3898 /* check if we need to reset the asic 3899 * E.g., driver was not cleanly unloaded previously, etc. 3900 */ 3901 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3902 if (adev->gmc.xgmi.num_physical_nodes) { 3903 dev_info(adev->dev, "Pending hive reset.\n"); 3904 adev->gmc.xgmi.pending_reset = true; 3905 /* Only need to init necessary block for SMU to handle the reset */ 3906 for (i = 0; i < adev->num_ip_blocks; i++) { 3907 if (!adev->ip_blocks[i].status.valid) 3908 continue; 3909 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3910 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3911 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3912 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3913 DRM_DEBUG("IP %s disabled for hw_init.\n", 3914 adev->ip_blocks[i].version->funcs->name); 3915 adev->ip_blocks[i].status.hw = true; 3916 } 3917 } 3918 } else { 3919 tmp = amdgpu_reset_method; 3920 /* It should do a default reset when loading or reloading the driver, 3921 * regardless of the module parameter reset_method. 
3922 */ 3923 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 3924 r = amdgpu_asic_reset(adev); 3925 amdgpu_reset_method = tmp; 3926 if (r) { 3927 dev_err(adev->dev, "asic reset on init failed\n"); 3928 goto failed; 3929 } 3930 } 3931 } 3932 3933 /* Post card if necessary */ 3934 if (amdgpu_device_need_post(adev)) { 3935 if (!adev->bios) { 3936 dev_err(adev->dev, "no vBIOS found\n"); 3937 r = -EINVAL; 3938 goto failed; 3939 } 3940 DRM_INFO("GPU posting now...\n"); 3941 r = amdgpu_device_asic_init(adev); 3942 if (r) { 3943 dev_err(adev->dev, "gpu post error!\n"); 3944 goto failed; 3945 } 3946 } 3947 3948 if (adev->is_atom_fw) { 3949 /* Initialize clocks */ 3950 r = amdgpu_atomfirmware_get_clock_info(adev); 3951 if (r) { 3952 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3953 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3954 goto failed; 3955 } 3956 } else { 3957 /* Initialize clocks */ 3958 r = amdgpu_atombios_get_clock_info(adev); 3959 if (r) { 3960 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3961 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3962 goto failed; 3963 } 3964 /* init i2c buses */ 3965 if (!amdgpu_device_has_dc_support(adev)) 3966 amdgpu_atombios_i2c_init(adev); 3967 } 3968 3969 fence_driver_init: 3970 /* Fence driver */ 3971 r = amdgpu_fence_driver_sw_init(adev); 3972 if (r) { 3973 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3974 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3975 goto failed; 3976 } 3977 3978 /* init the mode config */ 3979 drm_mode_config_init(adev_to_drm(adev)); 3980 3981 r = amdgpu_device_ip_init(adev); 3982 if (r) { 3983 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3984 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3985 goto release_ras_con; 3986 } 3987 3988 amdgpu_fence_driver_hw_init(adev); 3989 3990 dev_info(adev->dev, 3991 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3992 adev->gfx.config.max_shader_engines, 3993 adev->gfx.config.max_sh_per_se, 3994 adev->gfx.config.max_cu_per_sh, 3995 adev->gfx.cu_info.number); 3996 3997 adev->accel_working = true; 3998 3999 amdgpu_vm_check_compute_bug(adev); 4000 4001 /* Initialize the buffer migration limit. */ 4002 if (amdgpu_moverate >= 0) 4003 max_MBps = amdgpu_moverate; 4004 else 4005 max_MBps = 8; /* Allow 8 MB/s. */ 4006 /* Get a log2 for easy divisions. */ 4007 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4008 4009 r = amdgpu_pm_sysfs_init(adev); 4010 if (r) 4011 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4012 4013 r = amdgpu_ucode_sysfs_init(adev); 4014 if (r) { 4015 adev->ucode_sysfs_en = false; 4016 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4017 } else 4018 adev->ucode_sysfs_en = true; 4019 4020 r = amdgpu_psp_sysfs_init(adev); 4021 if (r) { 4022 adev->psp_sysfs_en = false; 4023 if (!amdgpu_sriov_vf(adev)) 4024 DRM_ERROR("Creating psp sysfs failed\n"); 4025 } else 4026 adev->psp_sysfs_en = true; 4027 4028 /* 4029 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4030 * Otherwise the mgpu fan boost feature will be skipped due to the 4031 * gpu instance is counted less. 4032 */ 4033 amdgpu_register_gpu_instance(adev); 4034 4035 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4036 * explicit gating rather than handling it automatically. 
4037 */ 4038 if (!adev->gmc.xgmi.pending_reset) { 4039 r = amdgpu_device_ip_late_init(adev); 4040 if (r) { 4041 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4042 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4043 goto release_ras_con; 4044 } 4045 /* must succeed. */ 4046 amdgpu_ras_resume(adev); 4047 queue_delayed_work(system_wq, &adev->delayed_init_work, 4048 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4049 } 4050 4051 if (amdgpu_sriov_vf(adev)) { 4052 amdgpu_virt_release_full_gpu(adev, true); 4053 flush_delayed_work(&adev->delayed_init_work); 4054 } 4055 4056 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4057 if (r) 4058 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4059 4060 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4061 r = amdgpu_pmu_init(adev); 4062 if (r) 4063 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4064 4065 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4066 if (amdgpu_device_cache_pci_state(adev->pdev)) 4067 pci_restore_state(pdev); 4068 4069 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4070 /* this will fail for cards that aren't VGA class devices, just 4071 * ignore it */ 4072 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4073 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4074 4075 px = amdgpu_device_supports_px(ddev); 4076 4077 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 4078 apple_gmux_detect(NULL, NULL))) 4079 vga_switcheroo_register_client(adev->pdev, 4080 &amdgpu_switcheroo_ops, px); 4081 4082 if (px) 4083 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4084 4085 if (adev->gmc.xgmi.pending_reset) 4086 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 4087 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4088 4089 amdgpu_device_check_iommu_direct_map(adev); 4090 4091 return 0; 4092 4093 release_ras_con: 4094 if (amdgpu_sriov_vf(adev)) 4095 amdgpu_virt_release_full_gpu(adev, true); 4096 4097 /* failed in exclusive mode due to timeout */ 4098 if (amdgpu_sriov_vf(adev) && 4099 !amdgpu_sriov_runtime(adev) && 4100 amdgpu_virt_mmio_blocked(adev) && 4101 !amdgpu_virt_wait_reset(adev)) { 4102 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4103 /* Don't send request since VF is inactive. */ 4104 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4105 adev->virt.ops = NULL; 4106 r = -EAGAIN; 4107 } 4108 amdgpu_release_ras_context(adev); 4109 4110 failed: 4111 amdgpu_vf_error_trans_all(adev); 4112 4113 return r; 4114 } 4115 4116 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4117 { 4118 4119 /* Clear all CPU mappings pointing to this device */ 4120 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4121 4122 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4123 amdgpu_device_doorbell_fini(adev); 4124 4125 iounmap(adev->rmmio); 4126 adev->rmmio = NULL; 4127 if (adev->mman.aper_base_kaddr) 4128 iounmap(adev->mman.aper_base_kaddr); 4129 adev->mman.aper_base_kaddr = NULL; 4130 4131 /* Memory manager related */ 4132 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4133 arch_phys_wc_del(adev->gmc.vram_mtrr); 4134 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4135 } 4136 } 4137 4138 /** 4139 * amdgpu_device_fini_hw - tear down the driver 4140 * 4141 * @adev: amdgpu_device pointer 4142 * 4143 * Tear down the driver info (all asics). 4144 * Called at driver shutdown. 
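*
* Teardown is split in two stages. A hedged sketch of the expected ordering in
* the unload/release paths (exact call sites are not shown here):
*
*	amdgpu_device_fini_hw(adev);	- stop hw: irqs, schedulers, fences
*	...				- DRM core teardown happens in between
*	amdgpu_device_fini_sw(adev);	- free sw state, unmap MMIO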
4145 */ 4146 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4147 { 4148 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4149 flush_delayed_work(&adev->delayed_init_work); 4150 adev->shutdown = true; 4151 4152 /* make sure IB test finished before entering exclusive mode 4153 * to avoid preemption on IB test 4154 * */ 4155 if (amdgpu_sriov_vf(adev)) { 4156 amdgpu_virt_request_full_gpu(adev, false); 4157 amdgpu_virt_fini_data_exchange(adev); 4158 } 4159 4160 /* disable all interrupts */ 4161 amdgpu_irq_disable_all(adev); 4162 if (adev->mode_info.mode_config_initialized) { 4163 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4164 drm_helper_force_disable_all(adev_to_drm(adev)); 4165 else 4166 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4167 } 4168 amdgpu_fence_driver_hw_fini(adev); 4169 4170 if (adev->mman.initialized) 4171 drain_workqueue(adev->mman.bdev.wq); 4172 4173 if (adev->pm.sysfs_initialized) 4174 amdgpu_pm_sysfs_fini(adev); 4175 if (adev->ucode_sysfs_en) 4176 amdgpu_ucode_sysfs_fini(adev); 4177 if (adev->psp_sysfs_en) 4178 amdgpu_psp_sysfs_fini(adev); 4179 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4180 4181 /* disable ras feature must before hw fini */ 4182 amdgpu_ras_pre_fini(adev); 4183 4184 amdgpu_device_ip_fini_early(adev); 4185 4186 amdgpu_irq_fini_hw(adev); 4187 4188 if (adev->mman.initialized) 4189 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4190 4191 amdgpu_gart_dummy_page_fini(adev); 4192 4193 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4194 amdgpu_device_unmap_mmio(adev); 4195 4196 } 4197 4198 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4199 { 4200 int idx; 4201 bool px; 4202 4203 amdgpu_fence_driver_sw_fini(adev); 4204 amdgpu_device_ip_fini(adev); 4205 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4206 adev->accel_working = false; 4207 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4208 4209 amdgpu_reset_fini(adev); 4210 4211 /* free i2c buses */ 4212 if (!amdgpu_device_has_dc_support(adev)) 4213 amdgpu_i2c_fini(adev); 4214 4215 if (amdgpu_emu_mode != 1) 4216 amdgpu_atombios_fini(adev); 4217 4218 kfree(adev->bios); 4219 adev->bios = NULL; 4220 4221 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4222 4223 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 4224 apple_gmux_detect(NULL, NULL))) 4225 vga_switcheroo_unregister_client(adev->pdev); 4226 4227 if (px) 4228 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4229 4230 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4231 vga_client_unregister(adev->pdev); 4232 4233 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4234 4235 iounmap(adev->rmmio); 4236 adev->rmmio = NULL; 4237 amdgpu_device_doorbell_fini(adev); 4238 drm_dev_exit(idx); 4239 } 4240 4241 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4242 amdgpu_pmu_fini(adev); 4243 if (adev->mman.discovery_bin) 4244 amdgpu_discovery_fini(adev); 4245 4246 amdgpu_reset_put_reset_domain(adev->reset_domain); 4247 adev->reset_domain = NULL; 4248 4249 kfree(adev->pci_state); 4250 4251 } 4252 4253 /** 4254 * amdgpu_device_evict_resources - evict device resources 4255 * @adev: amdgpu device object 4256 * 4257 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4258 * of the vram memory type. Mainly used for evicting device resources 4259 * at suspend time. 
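*
* Note: the suspend path below calls this twice: once up front to move the
* bulk of the BOs out of VRAM, and once more after phase1 suspend to catch
* anything that is still resident.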
4260 * 4261 */ 4262 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4263 { 4264 int ret; 4265 4266 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4267 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4268 return 0; 4269 4270 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4271 if (ret) 4272 DRM_WARN("evicting device resources failed\n"); 4273 return ret; 4274 } 4275 4276 /* 4277 * Suspend & resume. 4278 */ 4279 /** 4280 * amdgpu_device_suspend - initiate device suspend 4281 * 4282 * @dev: drm dev pointer 4283 * @fbcon : notify the fbdev of suspend 4284 * 4285 * Puts the hw in the suspend state (all asics). 4286 * Returns 0 for success or an error on failure. 4287 * Called at driver suspend. 4288 */ 4289 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4290 { 4291 struct amdgpu_device *adev = drm_to_adev(dev); 4292 int r = 0; 4293 4294 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4295 return 0; 4296 4297 adev->in_suspend = true; 4298 4299 /* Evict the majority of BOs before grabbing the full access */ 4300 r = amdgpu_device_evict_resources(adev); 4301 if (r) 4302 return r; 4303 4304 if (amdgpu_sriov_vf(adev)) { 4305 amdgpu_virt_fini_data_exchange(adev); 4306 r = amdgpu_virt_request_full_gpu(adev, false); 4307 if (r) 4308 return r; 4309 } 4310 4311 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4312 DRM_WARN("smart shift update failed\n"); 4313 4314 if (fbcon) 4315 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4316 4317 cancel_delayed_work_sync(&adev->delayed_init_work); 4318 4319 amdgpu_ras_suspend(adev); 4320 4321 amdgpu_device_ip_suspend_phase1(adev); 4322 4323 if (!adev->in_s0ix) 4324 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4325 4326 r = amdgpu_device_evict_resources(adev); 4327 if (r) 4328 return r; 4329 4330 amdgpu_fence_driver_hw_fini(adev); 4331 4332 amdgpu_device_ip_suspend_phase2(adev); 4333 4334 if (amdgpu_sriov_vf(adev)) 4335 amdgpu_virt_release_full_gpu(adev, false); 4336 4337 return 0; 4338 } 4339 4340 /** 4341 * amdgpu_device_resume - initiate device resume 4342 * 4343 * @dev: drm dev pointer 4344 * @fbcon : notify the fbdev of resume 4345 * 4346 * Bring the hw back to operating state (all asics). 4347 * Returns 0 for success or an error on failure. 4348 * Called at driver resume. 
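*
* A hedged usage sketch (not taken from this file) of how a PM callback is
* expected to pair this with the resume path:
*
*	r = amdgpu_device_suspend(drm_dev, true);
*	...
*	r = amdgpu_device_resume(drm_dev, true);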
4349 */ 4350 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4351 { 4352 struct amdgpu_device *adev = drm_to_adev(dev); 4353 int r = 0; 4354 4355 if (amdgpu_sriov_vf(adev)) { 4356 r = amdgpu_virt_request_full_gpu(adev, true); 4357 if (r) 4358 return r; 4359 } 4360 4361 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4362 return 0; 4363 4364 if (adev->in_s0ix) 4365 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4366 4367 /* post card */ 4368 if (amdgpu_device_need_post(adev)) { 4369 r = amdgpu_device_asic_init(adev); 4370 if (r) 4371 dev_err(adev->dev, "amdgpu asic init failed\n"); 4372 } 4373 4374 r = amdgpu_device_ip_resume(adev); 4375 4376 if (r) { 4377 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4378 goto exit; 4379 } 4380 amdgpu_fence_driver_hw_init(adev); 4381 4382 r = amdgpu_device_ip_late_init(adev); 4383 if (r) 4384 goto exit; 4385 4386 queue_delayed_work(system_wq, &adev->delayed_init_work, 4387 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4388 4389 if (!adev->in_s0ix) { 4390 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4391 if (r) 4392 goto exit; 4393 } 4394 4395 exit: 4396 if (amdgpu_sriov_vf(adev)) { 4397 amdgpu_virt_init_data_exchange(adev); 4398 amdgpu_virt_release_full_gpu(adev, true); 4399 } 4400 4401 if (r) 4402 return r; 4403 4404 /* Make sure IB tests flushed */ 4405 flush_delayed_work(&adev->delayed_init_work); 4406 4407 if (fbcon) 4408 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4409 4410 amdgpu_ras_resume(adev); 4411 4412 if (adev->mode_info.num_crtc) { 4413 /* 4414 * Most of the connector probing functions try to acquire runtime pm 4415 * refs to ensure that the GPU is powered on when connector polling is 4416 * performed. Since we're calling this from a runtime PM callback, 4417 * trying to acquire rpm refs will cause us to deadlock. 4418 * 4419 * Since we're guaranteed to be holding the rpm lock, it's safe to 4420 * temporarily disable the rpm helpers so this doesn't deadlock us. 4421 */ 4422 #ifdef CONFIG_PM 4423 dev->dev->power.disable_depth++; 4424 #endif 4425 if (!adev->dc_enabled) 4426 drm_helper_hpd_irq_event(dev); 4427 else 4428 drm_kms_helper_hotplug_event(dev); 4429 #ifdef CONFIG_PM 4430 dev->dev->power.disable_depth--; 4431 #endif 4432 } 4433 adev->in_suspend = false; 4434 4435 if (adev->enable_mes) 4436 amdgpu_mes_self_test(adev); 4437 4438 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4439 DRM_WARN("smart shift update failed\n"); 4440 4441 return 0; 4442 } 4443 4444 /** 4445 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4446 * 4447 * @adev: amdgpu_device pointer 4448 * 4449 * The list of all the hardware IPs that make up the asic is walked and 4450 * the check_soft_reset callbacks are run. check_soft_reset determines 4451 * if the asic is still hung or not. 4452 * Returns true if any of the IPs are still in a hung state, false if not. 
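* The result feeds the pre-reset path below: if a block still reports a
* hang after a soft reset attempt, a full ASIC reset is requested instead.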
4453 */ 4454 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4455 { 4456 int i; 4457 bool asic_hang = false; 4458 4459 if (amdgpu_sriov_vf(adev)) 4460 return true; 4461 4462 if (amdgpu_asic_need_full_reset(adev)) 4463 return true; 4464 4465 for (i = 0; i < adev->num_ip_blocks; i++) { 4466 if (!adev->ip_blocks[i].status.valid) 4467 continue; 4468 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4469 adev->ip_blocks[i].status.hang = 4470 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4471 if (adev->ip_blocks[i].status.hang) { 4472 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4473 asic_hang = true; 4474 } 4475 } 4476 return asic_hang; 4477 } 4478 4479 /** 4480 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4481 * 4482 * @adev: amdgpu_device pointer 4483 * 4484 * The list of all the hardware IPs that make up the asic is walked and the 4485 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4486 * handles any IP specific hardware or software state changes that are 4487 * necessary for a soft reset to succeed. 4488 * Returns 0 on success, negative error code on failure. 4489 */ 4490 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4491 { 4492 int i, r = 0; 4493 4494 for (i = 0; i < adev->num_ip_blocks; i++) { 4495 if (!adev->ip_blocks[i].status.valid) 4496 continue; 4497 if (adev->ip_blocks[i].status.hang && 4498 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4499 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4500 if (r) 4501 return r; 4502 } 4503 } 4504 4505 return 0; 4506 } 4507 4508 /** 4509 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4510 * 4511 * @adev: amdgpu_device pointer 4512 * 4513 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4514 * reset is necessary to recover. 4515 * Returns true if a full asic reset is required, false if not. 4516 */ 4517 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4518 { 4519 int i; 4520 4521 if (amdgpu_asic_need_full_reset(adev)) 4522 return true; 4523 4524 for (i = 0; i < adev->num_ip_blocks; i++) { 4525 if (!adev->ip_blocks[i].status.valid) 4526 continue; 4527 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4528 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4529 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4530 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4531 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4532 if (adev->ip_blocks[i].status.hang) { 4533 dev_info(adev->dev, "Some block need full reset!\n"); 4534 return true; 4535 } 4536 } 4537 } 4538 return false; 4539 } 4540 4541 /** 4542 * amdgpu_device_ip_soft_reset - do a soft reset 4543 * 4544 * @adev: amdgpu_device pointer 4545 * 4546 * The list of all the hardware IPs that make up the asic is walked and the 4547 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4548 * IP specific hardware or software state changes that are necessary to soft 4549 * reset the IP. 4550 * Returns 0 on success, negative error code on failure. 
4551 */ 4552 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4553 { 4554 int i, r = 0; 4555 4556 for (i = 0; i < adev->num_ip_blocks; i++) { 4557 if (!adev->ip_blocks[i].status.valid) 4558 continue; 4559 if (adev->ip_blocks[i].status.hang && 4560 adev->ip_blocks[i].version->funcs->soft_reset) { 4561 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4562 if (r) 4563 return r; 4564 } 4565 } 4566 4567 return 0; 4568 } 4569 4570 /** 4571 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4572 * 4573 * @adev: amdgpu_device pointer 4574 * 4575 * The list of all the hardware IPs that make up the asic is walked and the 4576 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4577 * handles any IP specific hardware or software state changes that are 4578 * necessary after the IP has been soft reset. 4579 * Returns 0 on success, negative error code on failure. 4580 */ 4581 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4582 { 4583 int i, r = 0; 4584 4585 for (i = 0; i < adev->num_ip_blocks; i++) { 4586 if (!adev->ip_blocks[i].status.valid) 4587 continue; 4588 if (adev->ip_blocks[i].status.hang && 4589 adev->ip_blocks[i].version->funcs->post_soft_reset) 4590 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4591 if (r) 4592 return r; 4593 } 4594 4595 return 0; 4596 } 4597 4598 /** 4599 * amdgpu_device_recover_vram - Recover some VRAM contents 4600 * 4601 * @adev: amdgpu_device pointer 4602 * 4603 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4604 * restore things like GPUVM page tables after a GPU reset where 4605 * the contents of VRAM might be lost. 4606 * 4607 * Returns: 4608 * 0 on success, negative error code on failure. 4609 */ 4610 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4611 { 4612 struct dma_fence *fence = NULL, *next = NULL; 4613 struct amdgpu_bo *shadow; 4614 struct amdgpu_bo_vm *vmbo; 4615 long r = 1, tmo; 4616 4617 if (amdgpu_sriov_runtime(adev)) 4618 tmo = msecs_to_jiffies(8000); 4619 else 4620 tmo = msecs_to_jiffies(100); 4621 4622 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4623 mutex_lock(&adev->shadow_list_lock); 4624 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4625 /* If vm is compute context or adev is APU, shadow will be NULL */ 4626 if (!vmbo->shadow) 4627 continue; 4628 shadow = vmbo->shadow; 4629 4630 /* No need to recover an evicted BO */ 4631 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4632 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4633 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4634 continue; 4635 4636 r = amdgpu_bo_restore_shadow(shadow, &next); 4637 if (r) 4638 break; 4639 4640 if (fence) { 4641 tmo = dma_fence_wait_timeout(fence, false, tmo); 4642 dma_fence_put(fence); 4643 fence = next; 4644 if (tmo == 0) { 4645 r = -ETIMEDOUT; 4646 break; 4647 } else if (tmo < 0) { 4648 r = tmo; 4649 break; 4650 } 4651 } else { 4652 fence = next; 4653 } 4654 } 4655 mutex_unlock(&adev->shadow_list_lock); 4656 4657 if (fence) 4658 tmo = dma_fence_wait_timeout(fence, false, tmo); 4659 dma_fence_put(fence); 4660 4661 if (r < 0 || tmo <= 0) { 4662 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4663 return -EIO; 4664 } 4665 4666 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4667 return 0; 4668 } 4669 4670 4671 /** 4672 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4673 * 4674 * @adev: amdgpu_device pointer 4675 * 
@from_hypervisor: request from hypervisor 4676 * 4677 * do VF FLR and reinitialize Asic 4678 * return 0 means succeeded otherwise failed 4679 */ 4680 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4681 bool from_hypervisor) 4682 { 4683 int r; 4684 struct amdgpu_hive_info *hive = NULL; 4685 int retry_limit = 0; 4686 4687 retry: 4688 amdgpu_amdkfd_pre_reset(adev); 4689 4690 if (from_hypervisor) 4691 r = amdgpu_virt_request_full_gpu(adev, true); 4692 else 4693 r = amdgpu_virt_reset_gpu(adev); 4694 if (r) 4695 return r; 4696 4697 /* Resume IP prior to SMC */ 4698 r = amdgpu_device_ip_reinit_early_sriov(adev); 4699 if (r) 4700 goto error; 4701 4702 amdgpu_virt_init_data_exchange(adev); 4703 4704 r = amdgpu_device_fw_loading(adev); 4705 if (r) 4706 return r; 4707 4708 /* now we are okay to resume SMC/CP/SDMA */ 4709 r = amdgpu_device_ip_reinit_late_sriov(adev); 4710 if (r) 4711 goto error; 4712 4713 hive = amdgpu_get_xgmi_hive(adev); 4714 /* Update PSP FW topology after reset */ 4715 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4716 r = amdgpu_xgmi_update_topology(hive, adev); 4717 4718 if (hive) 4719 amdgpu_put_xgmi_hive(hive); 4720 4721 if (!r) { 4722 amdgpu_irq_gpu_reset_resume_helper(adev); 4723 r = amdgpu_ib_ring_tests(adev); 4724 4725 amdgpu_amdkfd_post_reset(adev); 4726 } 4727 4728 error: 4729 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4730 amdgpu_inc_vram_lost(adev); 4731 r = amdgpu_device_recover_vram(adev); 4732 } 4733 amdgpu_virt_release_full_gpu(adev, true); 4734 4735 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4736 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4737 retry_limit++; 4738 goto retry; 4739 } else 4740 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4741 } 4742 4743 return r; 4744 } 4745 4746 /** 4747 * amdgpu_device_has_job_running - check if there is any job in mirror list 4748 * 4749 * @adev: amdgpu_device pointer 4750 * 4751 * check if there is any job in mirror list 4752 */ 4753 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4754 { 4755 int i; 4756 struct drm_sched_job *job; 4757 4758 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4759 struct amdgpu_ring *ring = adev->rings[i]; 4760 4761 if (!ring || !ring->sched.thread) 4762 continue; 4763 4764 spin_lock(&ring->sched.job_list_lock); 4765 job = list_first_entry_or_null(&ring->sched.pending_list, 4766 struct drm_sched_job, list); 4767 spin_unlock(&ring->sched.job_list_lock); 4768 if (job) 4769 return true; 4770 } 4771 return false; 4772 } 4773 4774 /** 4775 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4776 * 4777 * @adev: amdgpu_device pointer 4778 * 4779 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4780 * a hung GPU. 
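* As used below, amdgpu_gpu_recovery == 0 disables recovery, -1 means "auto"
* (recovery is disabled only for the legacy ASICs listed in the switch), and
* any other value leaves recovery enabled.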
4781 */ 4782 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4783 { 4784 4785 if (amdgpu_gpu_recovery == 0) 4786 goto disabled; 4787 4788 /* Skip soft reset check in fatal error mode */ 4789 if (!amdgpu_ras_is_poison_mode_supported(adev)) 4790 return true; 4791 4792 if (amdgpu_sriov_vf(adev)) 4793 return true; 4794 4795 if (amdgpu_gpu_recovery == -1) { 4796 switch (adev->asic_type) { 4797 #ifdef CONFIG_DRM_AMDGPU_SI 4798 case CHIP_VERDE: 4799 case CHIP_TAHITI: 4800 case CHIP_PITCAIRN: 4801 case CHIP_OLAND: 4802 case CHIP_HAINAN: 4803 #endif 4804 #ifdef CONFIG_DRM_AMDGPU_CIK 4805 case CHIP_KAVERI: 4806 case CHIP_KABINI: 4807 case CHIP_MULLINS: 4808 #endif 4809 case CHIP_CARRIZO: 4810 case CHIP_STONEY: 4811 case CHIP_CYAN_SKILLFISH: 4812 goto disabled; 4813 default: 4814 break; 4815 } 4816 } 4817 4818 return true; 4819 4820 disabled: 4821 dev_info(adev->dev, "GPU recovery disabled.\n"); 4822 return false; 4823 } 4824 4825 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4826 { 4827 u32 i; 4828 int ret = 0; 4829 4830 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4831 4832 dev_info(adev->dev, "GPU mode1 reset\n"); 4833 4834 /* disable BM */ 4835 pci_clear_master(adev->pdev); 4836 4837 amdgpu_device_cache_pci_state(adev->pdev); 4838 4839 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4840 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4841 ret = amdgpu_dpm_mode1_reset(adev); 4842 } else { 4843 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4844 ret = psp_gpu_reset(adev); 4845 } 4846 4847 if (ret) 4848 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4849 4850 amdgpu_device_load_pci_state(adev->pdev); 4851 4852 /* wait for asic to come out of reset */ 4853 for (i = 0; i < adev->usec_timeout; i++) { 4854 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4855 4856 if (memsize != 0xffffffff) 4857 break; 4858 udelay(1); 4859 } 4860 4861 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4862 return ret; 4863 } 4864 4865 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4866 struct amdgpu_reset_context *reset_context) 4867 { 4868 int i, r = 0; 4869 struct amdgpu_job *job = NULL; 4870 bool need_full_reset = 4871 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4872 4873 if (reset_context->reset_req_dev == adev) 4874 job = reset_context->job; 4875 4876 if (amdgpu_sriov_vf(adev)) { 4877 /* stop the data exchange thread */ 4878 amdgpu_virt_fini_data_exchange(adev); 4879 } 4880 4881 amdgpu_fence_driver_isr_toggle(adev, true); 4882 4883 /* block all schedulers and reset given job's ring */ 4884 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4885 struct amdgpu_ring *ring = adev->rings[i]; 4886 4887 if (!ring || !ring->sched.thread) 4888 continue; 4889 4890 /*clear job fence from fence drv to avoid force_completion 4891 *leave NULL and vm flush fence in fence drv */ 4892 amdgpu_fence_driver_clear_job_fences(ring); 4893 4894 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4895 amdgpu_fence_driver_force_completion(ring); 4896 } 4897 4898 amdgpu_fence_driver_isr_toggle(adev, false); 4899 4900 if (job && job->vm) 4901 drm_sched_increase_karma(&job->base); 4902 4903 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4904 /* If reset handler not implemented, continue; otherwise return */ 4905 if (r == -ENOSYS) 4906 r = 0; 4907 else 4908 return r; 4909 4910 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4911 if (!amdgpu_sriov_vf(adev)) { 4912 4913 if (!need_full_reset) 4914 need_full_reset = 
amdgpu_device_ip_need_full_reset(adev); 4915 4916 if (!need_full_reset && amdgpu_gpu_recovery && 4917 amdgpu_device_ip_check_soft_reset(adev)) { 4918 amdgpu_device_ip_pre_soft_reset(adev); 4919 r = amdgpu_device_ip_soft_reset(adev); 4920 amdgpu_device_ip_post_soft_reset(adev); 4921 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4922 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4923 need_full_reset = true; 4924 } 4925 } 4926 4927 if (need_full_reset) 4928 r = amdgpu_device_ip_suspend(adev); 4929 if (need_full_reset) 4930 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4931 else 4932 clear_bit(AMDGPU_NEED_FULL_RESET, 4933 &reset_context->flags); 4934 } 4935 4936 return r; 4937 } 4938 4939 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 4940 { 4941 int i; 4942 4943 lockdep_assert_held(&adev->reset_domain->sem); 4944 4945 for (i = 0; i < adev->num_regs; i++) { 4946 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); 4947 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], 4948 adev->reset_dump_reg_value[i]); 4949 } 4950 4951 return 0; 4952 } 4953 4954 #ifdef CONFIG_DEV_COREDUMP 4955 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, 4956 size_t count, void *data, size_t datalen) 4957 { 4958 struct drm_printer p; 4959 struct amdgpu_device *adev = data; 4960 struct drm_print_iterator iter; 4961 int i; 4962 4963 iter.data = buffer; 4964 iter.offset = 0; 4965 iter.start = offset; 4966 iter.remain = count; 4967 4968 p = drm_coredump_printer(&iter); 4969 4970 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 4971 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 4972 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 4973 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec); 4974 if (adev->reset_task_info.pid) 4975 drm_printf(&p, "process_name: %s PID: %d\n", 4976 adev->reset_task_info.process_name, 4977 adev->reset_task_info.pid); 4978 4979 if (adev->reset_vram_lost) 4980 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 4981 if (adev->num_regs) { 4982 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); 4983 4984 for (i = 0; i < adev->num_regs; i++) 4985 drm_printf(&p, "0x%08x: 0x%08x\n", 4986 adev->reset_dump_reg_list[i], 4987 adev->reset_dump_reg_value[i]); 4988 } 4989 4990 return count - iter.remain; 4991 } 4992 4993 static void amdgpu_devcoredump_free(void *data) 4994 { 4995 } 4996 4997 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) 4998 { 4999 struct drm_device *dev = adev_to_drm(adev); 5000 5001 ktime_get_ts64(&adev->reset_time); 5002 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL, 5003 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 5004 } 5005 #endif 5006 5007 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5008 struct amdgpu_reset_context *reset_context) 5009 { 5010 struct amdgpu_device *tmp_adev = NULL; 5011 bool need_full_reset, skip_hw_reset, vram_lost = false; 5012 int r = 0; 5013 bool gpu_reset_for_dev_remove = 0; 5014 5015 /* Try reset handler method first */ 5016 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5017 reset_list); 5018 amdgpu_reset_reg_dumps(tmp_adev); 5019 5020 reset_context->reset_device_list = device_list_handle; 5021 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5022 /* If reset handler not implemented, continue; otherwise return */ 5023 if (r == -ENOSYS) 5024 r = 0; 5025 else 5026 return r; 5027 5028 /* Reset handler not implemented, use the 
default method */ 5029 need_full_reset = 5030 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5031 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5032 5033 gpu_reset_for_dev_remove = 5034 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5035 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5036 5037 /* 5038 * ASIC reset has to be done on all XGMI hive nodes ASAP 5039 * to allow proper links negotiation in FW (within 1 sec) 5040 */ 5041 if (!skip_hw_reset && need_full_reset) { 5042 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5043 /* For XGMI run all resets in parallel to speed up the process */ 5044 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5045 tmp_adev->gmc.xgmi.pending_reset = false; 5046 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 5047 r = -EALREADY; 5048 } else 5049 r = amdgpu_asic_reset(tmp_adev); 5050 5051 if (r) { 5052 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 5053 r, adev_to_drm(tmp_adev)->unique); 5054 break; 5055 } 5056 } 5057 5058 /* For XGMI wait for all resets to complete before proceed */ 5059 if (!r) { 5060 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5061 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5062 flush_work(&tmp_adev->xgmi_reset_work); 5063 r = tmp_adev->asic_reset_res; 5064 if (r) 5065 break; 5066 } 5067 } 5068 } 5069 } 5070 5071 if (!r && amdgpu_ras_intr_triggered()) { 5072 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5073 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && 5074 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 5075 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 5076 } 5077 5078 amdgpu_ras_intr_cleared(); 5079 } 5080 5081 /* Since the mode1 reset affects base ip blocks, the 5082 * phase1 ip blocks need to be resumed. Otherwise there 5083 * will be a BIOS signature error and the psp bootloader 5084 * can't load kdb on the next amdgpu install. 
5085 */ 5086 if (gpu_reset_for_dev_remove) { 5087 list_for_each_entry(tmp_adev, device_list_handle, reset_list) 5088 amdgpu_device_ip_resume_phase1(tmp_adev); 5089 5090 goto end; 5091 } 5092 5093 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5094 if (need_full_reset) { 5095 /* post card */ 5096 r = amdgpu_device_asic_init(tmp_adev); 5097 if (r) { 5098 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5099 } else { 5100 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5101 r = amdgpu_amdkfd_resume_iommu(tmp_adev); 5102 if (r) 5103 goto out; 5104 5105 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5106 if (r) 5107 goto out; 5108 5109 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5110 #ifdef CONFIG_DEV_COREDUMP 5111 tmp_adev->reset_vram_lost = vram_lost; 5112 memset(&tmp_adev->reset_task_info, 0, 5113 sizeof(tmp_adev->reset_task_info)); 5114 if (reset_context->job && reset_context->job->vm) 5115 tmp_adev->reset_task_info = 5116 reset_context->job->vm->task_info; 5117 amdgpu_reset_capture_coredumpm(tmp_adev); 5118 #endif 5119 if (vram_lost) { 5120 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5121 amdgpu_inc_vram_lost(tmp_adev); 5122 } 5123 5124 r = amdgpu_device_fw_loading(tmp_adev); 5125 if (r) 5126 return r; 5127 5128 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5129 if (r) 5130 goto out; 5131 5132 if (vram_lost) 5133 amdgpu_device_fill_reset_magic(tmp_adev); 5134 5135 /* 5136 * Add this ASIC as tracked as reset was already 5137 * complete successfully. 5138 */ 5139 amdgpu_register_gpu_instance(tmp_adev); 5140 5141 if (!reset_context->hive && 5142 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5143 amdgpu_xgmi_add_device(tmp_adev); 5144 5145 r = amdgpu_device_ip_late_init(tmp_adev); 5146 if (r) 5147 goto out; 5148 5149 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5150 5151 /* 5152 * The GPU enters bad state once faulty pages 5153 * by ECC has reached the threshold, and ras 5154 * recovery is scheduled next. So add one check 5155 * here to break recovery if it indeed exceeds 5156 * bad page threshold, and remind user to 5157 * retire this GPU or setting one bigger 5158 * bad_page_threshold value to fix this once 5159 * probing driver again. 5160 */ 5161 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5162 /* must succeed. 
*/ 5163 amdgpu_ras_resume(tmp_adev); 5164 } else { 5165 r = -EINVAL; 5166 goto out; 5167 } 5168 5169 /* Update PSP FW topology after reset */ 5170 if (reset_context->hive && 5171 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5172 r = amdgpu_xgmi_update_topology( 5173 reset_context->hive, tmp_adev); 5174 } 5175 } 5176 5177 out: 5178 if (!r) { 5179 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5180 r = amdgpu_ib_ring_tests(tmp_adev); 5181 if (r) { 5182 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5183 need_full_reset = true; 5184 r = -EAGAIN; 5185 goto end; 5186 } 5187 } 5188 5189 if (!r) 5190 r = amdgpu_device_recover_vram(tmp_adev); 5191 else 5192 tmp_adev->asic_reset_res = r; 5193 } 5194 5195 end: 5196 if (need_full_reset) 5197 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5198 else 5199 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5200 return r; 5201 } 5202 5203 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5204 { 5205 5206 switch (amdgpu_asic_reset_method(adev)) { 5207 case AMD_RESET_METHOD_MODE1: 5208 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5209 break; 5210 case AMD_RESET_METHOD_MODE2: 5211 adev->mp1_state = PP_MP1_STATE_RESET; 5212 break; 5213 default: 5214 adev->mp1_state = PP_MP1_STATE_NONE; 5215 break; 5216 } 5217 } 5218 5219 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5220 { 5221 amdgpu_vf_error_trans_all(adev); 5222 adev->mp1_state = PP_MP1_STATE_NONE; 5223 } 5224 5225 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5226 { 5227 struct pci_dev *p = NULL; 5228 5229 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5230 adev->pdev->bus->number, 1); 5231 if (p) { 5232 pm_runtime_enable(&(p->dev)); 5233 pm_runtime_resume(&(p->dev)); 5234 } 5235 5236 pci_dev_put(p); 5237 } 5238 5239 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5240 { 5241 enum amd_reset_method reset_method; 5242 struct pci_dev *p = NULL; 5243 u64 expires; 5244 5245 /* 5246 * For now, only BACO and mode1 reset are confirmed 5247 * to suffer the audio issue without proper suspended. 5248 */ 5249 reset_method = amdgpu_asic_reset_method(adev); 5250 if ((reset_method != AMD_RESET_METHOD_BACO) && 5251 (reset_method != AMD_RESET_METHOD_MODE1)) 5252 return -EINVAL; 5253 5254 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5255 adev->pdev->bus->number, 1); 5256 if (!p) 5257 return -ENODEV; 5258 5259 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5260 if (!expires) 5261 /* 5262 * If we cannot get the audio device autosuspend delay, 5263 * a fixed 4S interval will be used. Considering 3S is 5264 * the audio controller default autosuspend delay setting. 5265 * 4S used here is guaranteed to cover that. 5266 */ 5267 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5268 5269 while (!pm_runtime_status_suspended(&(p->dev))) { 5270 if (!pm_runtime_suspend(&(p->dev))) 5271 break; 5272 5273 if (expires < ktime_get_mono_fast_ns()) { 5274 dev_warn(adev->dev, "failed to suspend display audio\n"); 5275 pci_dev_put(p); 5276 /* TODO: abort the succeeding gpu reset? 
*/ 5277 return -ETIMEDOUT; 5278 } 5279 } 5280 5281 pm_runtime_disable(&(p->dev)); 5282 5283 pci_dev_put(p); 5284 return 0; 5285 } 5286 5287 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5288 { 5289 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5290 5291 #if defined(CONFIG_DEBUG_FS) 5292 if (!amdgpu_sriov_vf(adev)) 5293 cancel_work(&adev->reset_work); 5294 #endif 5295 5296 if (adev->kfd.dev) 5297 cancel_work(&adev->kfd.reset_work); 5298 5299 if (amdgpu_sriov_vf(adev)) 5300 cancel_work(&adev->virt.flr_work); 5301 5302 if (con && adev->ras_enabled) 5303 cancel_work(&con->recovery_work); 5304 5305 } 5306 5307 /** 5308 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5309 * 5310 * @adev: amdgpu_device pointer 5311 * @job: which job trigger hang 5312 * @reset_context: amdgpu reset context pointer 5313 * 5314 * Attempt to reset the GPU if it has hung (all asics). 5315 * Attempt to do soft-reset or full-reset and reinitialize Asic 5316 * Returns 0 for success or an error on failure. 5317 */ 5318 5319 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5320 struct amdgpu_job *job, 5321 struct amdgpu_reset_context *reset_context) 5322 { 5323 struct list_head device_list, *device_list_handle = NULL; 5324 bool job_signaled = false; 5325 struct amdgpu_hive_info *hive = NULL; 5326 struct amdgpu_device *tmp_adev = NULL; 5327 int i, r = 0; 5328 bool need_emergency_restart = false; 5329 bool audio_suspended = false; 5330 bool gpu_reset_for_dev_remove = false; 5331 5332 gpu_reset_for_dev_remove = 5333 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5334 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5335 5336 /* 5337 * Special case: RAS triggered and full reset isn't supported 5338 */ 5339 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5340 5341 /* 5342 * Flush RAM to disk so that after reboot 5343 * the user can read log and see why the system rebooted. 5344 */ 5345 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5346 DRM_WARN("Emergency reboot."); 5347 5348 ksys_sync_helper(); 5349 emergency_restart(); 5350 } 5351 5352 dev_info(adev->dev, "GPU %s begin!\n", 5353 need_emergency_restart ? "jobs stop":"reset"); 5354 5355 if (!amdgpu_sriov_vf(adev)) 5356 hive = amdgpu_get_xgmi_hive(adev); 5357 if (hive) 5358 mutex_lock(&hive->hive_lock); 5359 5360 reset_context->job = job; 5361 reset_context->hive = hive; 5362 /* 5363 * Build list of devices to reset. 5364 * In case we are in XGMI hive mode, resort the device list 5365 * to put adev in the 1st position. 
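* (list_rotate_to_front() below is what moves adev to the head of the list.)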
5366 */
5367 INIT_LIST_HEAD(&device_list);
5368 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
5369 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
5370 list_add_tail(&tmp_adev->reset_list, &device_list);
5371 if (gpu_reset_for_dev_remove && adev->shutdown)
5372 tmp_adev->shutdown = true;
5373 }
5374 if (!list_is_first(&adev->reset_list, &device_list))
5375 list_rotate_to_front(&adev->reset_list, &device_list);
5376 device_list_handle = &device_list;
5377 } else {
5378 list_add_tail(&adev->reset_list, &device_list);
5379 device_list_handle = &device_list;
5380 }
5381
5382 /* We need to lock reset domain only once both for XGMI and single device */
5383 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5384 reset_list);
5385 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
5386
5387 /* block all schedulers and reset given job's ring */
5388 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5389
5390 amdgpu_device_set_mp1_state(tmp_adev);
5391
5392 /*
5393 * Try to put the audio codec into suspend state
5394 * before the gpu reset starts.
5395 *
5396 * The power domain of the graphics device is
5397 * shared with the AZ power domain. Without this,
5398 * we may change the audio hardware from behind
5399 * the audio driver's back. That will trigger
5400 * some audio codec errors.
5401 */
5402 if (!amdgpu_device_suspend_display_audio(tmp_adev))
5403 audio_suspended = true;
5404
5405 amdgpu_ras_set_error_query_ready(tmp_adev, false);
5406
5407 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5408
5409 if (!amdgpu_sriov_vf(tmp_adev))
5410 amdgpu_amdkfd_pre_reset(tmp_adev);
5411
5412 /*
5413 * Mark the ASICs to be reset as untracked first,
5414 * and add them back after the reset completes.
5415 */
5416 amdgpu_unregister_gpu_instance(tmp_adev);
5417
5418 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);
5419
5420 /* disable ras on ALL IPs */
5421 if (!need_emergency_restart &&
5422 amdgpu_device_ip_need_full_reset(tmp_adev))
5423 amdgpu_ras_suspend(tmp_adev);
5424
5425 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5426 struct amdgpu_ring *ring = tmp_adev->rings[i];
5427
5428 if (!ring || !ring->sched.thread)
5429 continue;
5430
5431 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5432
5433 if (need_emergency_restart)
5434 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5435 }
5436 atomic_inc(&tmp_adev->gpu_reset_counter);
5437 }
5438
5439 if (need_emergency_restart)
5440 goto skip_sched_resume;
5441
5442 /*
5443 * Must check guilty signal here since after this point all old
5444 * HW fences are force signaled.
5445 *
5446 * job->base holds a reference to parent fence
5447 */
5448 if (job && dma_fence_is_signaled(&job->hw_fence)) {
5449 job_signaled = true;
5450 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5451 goto skip_hw_reset;
5452 }
5453
5454 retry: /* Rest of adevs pre asic reset from XGMI hive. */
5455 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5456 if (gpu_reset_for_dev_remove) {
5457 /* Workaround for ASICs that need to disable SMC first */
5458 amdgpu_device_smu_fini_early(tmp_adev);
5459 }
5460 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
5461 /* TODO: Should we stop? */
5462 if (r) {
5463 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
5464 r, adev_to_drm(tmp_adev)->unique);
5465 tmp_adev->asic_reset_res = r;
5466 }
5467
5468 /*
5469 * Drop all pending non scheduler resets. Scheduler resets
5470 * were already dropped during drm_sched_stop.
5471 */
5472 amdgpu_device_stop_pending_resets(tmp_adev);
5473 }
5474
5475 /* Actual ASIC resets if needed. */
5476 /* Host driver will handle XGMI hive reset for SRIOV */
5477 if (amdgpu_sriov_vf(adev)) {
5478 r = amdgpu_device_reset_sriov(adev, job ? false : true);
5479 if (r)
5480 adev->asic_reset_res = r;
5481
5482 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so we need to resume ras during reset */
5483 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) ||
5484 adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3))
5485 amdgpu_ras_resume(adev);
5486 } else {
5487 r = amdgpu_do_asic_reset(device_list_handle, reset_context);
5488 if (r && r == -EAGAIN)
5489 goto retry;
5490
5491 if (!r && gpu_reset_for_dev_remove)
5492 goto recover_end;
5493 }
5494
5495 skip_hw_reset:
5496
5497 /* Post ASIC reset for all devs. */
5498 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5499
5500 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5501 struct amdgpu_ring *ring = tmp_adev->rings[i];
5502
5503 if (!ring || !ring->sched.thread)
5504 continue;
5505
5506 drm_sched_start(&ring->sched, true);
5507 }
5508
5509 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3))
5510 amdgpu_mes_self_test(tmp_adev);
5511
5512 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) {
5513 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
5514 }
5515
5516 if (tmp_adev->asic_reset_res)
5517 r = tmp_adev->asic_reset_res;
5518
5519 tmp_adev->asic_reset_res = 0;
5520
5521 if (r) {
5522 /* bad news, how to tell it to userspace? */
5523 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
5524 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5525 } else {
5526 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
5527 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5528 DRM_WARN("smart shift update failed\n");
5529 }
5530 }
5531
5532 skip_sched_resume:
5533 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5534 /* unlock kfd: SRIOV would do it separately */
5535 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
5536 amdgpu_amdkfd_post_reset(tmp_adev);
5537
5538 /* kfd_post_reset will do nothing if the kfd device is not initialized,
5539 * so bring up kfd here if it wasn't initialized before
5540 */
5541 if (!adev->kfd.init_complete)
5542 amdgpu_amdkfd_device_init(adev);
5543
5544 if (audio_suspended)
5545 amdgpu_device_resume_display_audio(tmp_adev);
5546
5547 amdgpu_device_unset_mp1_state(tmp_adev);
5548
5549 amdgpu_ras_set_error_query_ready(tmp_adev, true);
5550 }
5551
5552 recover_end:
5553 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5554 reset_list);
5555 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
5556
5557 if (hive) {
5558 mutex_unlock(&hive->hive_lock);
5559 amdgpu_put_xgmi_hive(hive);
5560 }
5561
5562 if (r)
5563 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
5564
5565 atomic_set(&adev->reset_domain->reset_res, r);
5566 return r;
5567 }
5568
5569 /**
5570 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
5571 *
5572 * @adev: amdgpu_device pointer
5573 *
5574 * Fetches and stores in the driver the PCIE capabilities (gen speed
5575 * and lanes) of the slot the device is in. Handles APUs and
5576 * virtualized environments where PCIE config space may not be available.
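*
* The amdgpu_pcie_gen_cap and amdgpu_pcie_lane_cap overrides checked below take
* precedence: when set, the value is used as the mask directly, and the probing
* here only fills in whichever mask is still zero.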
5577 */ 5578 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5579 { 5580 struct pci_dev *pdev; 5581 enum pci_bus_speed speed_cap, platform_speed_cap; 5582 enum pcie_link_width platform_link_width; 5583 5584 if (amdgpu_pcie_gen_cap) 5585 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5586 5587 if (amdgpu_pcie_lane_cap) 5588 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5589 5590 /* covers APUs as well */ 5591 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 5592 if (adev->pm.pcie_gen_mask == 0) 5593 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5594 if (adev->pm.pcie_mlw_mask == 0) 5595 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5596 return; 5597 } 5598 5599 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5600 return; 5601 5602 pcie_bandwidth_available(adev->pdev, NULL, 5603 &platform_speed_cap, &platform_link_width); 5604 5605 if (adev->pm.pcie_gen_mask == 0) { 5606 /* asic caps */ 5607 pdev = adev->pdev; 5608 speed_cap = pcie_get_speed_cap(pdev); 5609 if (speed_cap == PCI_SPEED_UNKNOWN) { 5610 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5611 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5612 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5613 } else { 5614 if (speed_cap == PCIE_SPEED_32_0GT) 5615 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5616 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5617 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5618 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5619 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5620 else if (speed_cap == PCIE_SPEED_16_0GT) 5621 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5622 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5623 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5624 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5625 else if (speed_cap == PCIE_SPEED_8_0GT) 5626 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5627 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5628 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5629 else if (speed_cap == PCIE_SPEED_5_0GT) 5630 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5631 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5632 else 5633 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5634 } 5635 /* platform caps */ 5636 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5637 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5638 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5639 } else { 5640 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5641 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5642 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5643 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5644 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5645 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5646 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5647 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5648 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5649 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5650 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5651 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5652 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5653 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5654 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5655 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5656 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5657 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5658 else 5659 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5660 5661 } 5662 } 5663 if (adev->pm.pcie_mlw_mask == 0) { 5664 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) 

/**
 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
 *
 * @adev: amdgpu_device pointer
 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
 *
 * Return true if @peer_adev can access (DMA) @adev through the PCIe
 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
 * @peer_adev.
 */
bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
				      struct amdgpu_device *peer_adev)
{
#ifdef CONFIG_HSA_AMD_P2P
	uint64_t address_mask = peer_adev->dev->dma_mask ?
		~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
	resource_size_t aper_limit =
		adev->gmc.aper_base + adev->gmc.aper_size - 1;
	bool p2p_access =
		!adev->gmc.xgmi.connected_to_cpu &&
		!(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);

	return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
		adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
		!(adev->gmc.aper_base & address_mask ||
		  aper_limit & address_mask));
#else
	return false;
#endif
}
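
/*
 * Illustrative usage sketch (editorial addition, not the actual caller):
 * code that wants to DMA directly into another device's VRAM is expected to
 * gate the fast path on this check and otherwise stage through system
 * memory, e.g.:
 *
 *	if (amdgpu_device_is_peer_accessible(vram_owner, importer))
 *		map vram_owner's VRAM through its PCIe BAR for importer;
 *	else
 *		bounce the transfer through GTT / system memory;
 *
 * "vram_owner" and "importer" are hypothetical names for the exporting and
 * importing amdgpu_device instances.
 */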

int amdgpu_device_baco_enter(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

	return amdgpu_dpm_baco_enter(adev);
}

int amdgpu_device_baco_exit(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	ret = amdgpu_dpm_baco_exit(adev);
	if (ret)
		return ret;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

	if (amdgpu_passthrough(adev) &&
	    adev->nbio.funcs->clear_doorbell_interrupt)
		adev->nbio.funcs->clear_doorbell_interrupt(adev);

	return 0;
}
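
/*
 * Editorial note: BACO (Bus Active, Chip Off) entry and exit are meant to be
 * balanced around a low-power period, for example (sketch only) in a
 * runtime-PM suspend/resume path:
 *
 *	r = amdgpu_device_baco_enter(drm_dev);	(runtime suspend)
 *	...
 *	r = amdgpu_device_baco_exit(drm_dev);	(runtime resume)
 *
 * Both helpers return -ENOTSUPP when the device cannot do BACO, so a caller
 * is expected to check amdgpu_device_supports_baco() up front and fall back
 * to another suspend method if it is not available.
 */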

/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called when a PCI error is detected.
 *
 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		DRM_WARN("No support for XGMI hive yet...");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Locking adev->reset_domain->sem will prevent any external access
		 * to GPU during PCI error recovery
		 */
		amdgpu_device_lock_reset_domain(adev->reset_domain);
		amdgpu_device_set_mp1_state(adev);

		/*
		 * Block any work scheduling as we do for regular GPU reset
		 * for the duration of the recovery
		 */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, NULL);
		}
		atomic_inc(&adev->gpu_reset_counter);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
}
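
/*
 * Editorial note on ordering (not in the original source): for a frozen
 * channel the PCI error recovery core calls amdgpu_pci_error_detected()
 * first (which returns PCI_ERS_RESULT_NEED_RESET above), then
 * amdgpu_pci_slot_reset() once the link has been reset, and finally
 * amdgpu_pci_resume(). For the pci_channel_io_normal case the callback
 * returns PCI_ERS_RESULT_CAN_RECOVER and only amdgpu_pci_mmio_enabled()
 * below runs before normal operation continues.
 */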

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{
	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/* This is called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, no need to reset slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the PCI error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for asic to come out of reset */
	msleep(500);

	/* Restore PCI config space */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d\n", r);
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's
 * OK to resume normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		drm_sched_start(&ring->sched, true);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
}
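
/*
 * Sketch (editorial addition) of how the four callbacks above are typically
 * wired into the PCI core; the actual registration lives in the driver's
 * pci_driver setup (e.g. amdgpu_drv.c) and may differ in detail:
 *
 *	static struct pci_error_handlers amdgpu_pci_err_handler = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};
 *
 * with the struct referenced from the driver's struct pci_driver
 * ".err_handler" member.
 */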

bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state\n");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}
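
/*
 * Editorial note: these two helpers form a save/restore pair. The config
 * space is cached while the device is known to be healthy (and re-cached in
 * amdgpu_pci_slot_reset() above after a successful recovery), and
 * amdgpu_device_load_pci_state() replays the cached copy before the ASIC is
 * brought back up, roughly:
 *
 *	amdgpu_device_cache_pci_state(adev->pdev);	(device healthy)
 *	... GPU reset or PCI error recovery ...
 *	amdgpu_device_load_pci_state(adev->pdev);	(restore config space)
 */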

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It helps preserve the error context when an error occurs.
 * Compared to a simple hang, the system stays stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc.),
 *    clears all CPU mappings to the device, and disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}
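
/*
 * Illustrative usage sketch (editorial addition, not the actual submission
 * code): a caller trying to make "new_gang" the active gang must treat a
 * non-NULL return value as "the previous gang is still running" and order
 * its work behind that fence rather than treating it as an error, e.g.:
 *
 *	old = amdgpu_device_switch_gang(adev, new_gang);
 *	if (old) {
 *		add old as a dependency of the new submission;
 *		dma_fence_put(old);
 *	}
 *
 * "new_gang" and the dependency handling are placeholders; the real caller
 * lives in the command-submission path.
 */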

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!adev->ip_versions[DCE_HWIP][0] ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
		uint32_t inst, uint32_t reg_addr, char reg_name[],
		uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				 inst, reg_name, (uint32_t)expected_value,
				 (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}
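
/*
 * Illustrative usage sketch (editorial addition, hypothetical register
 * names): waiting for a ready bit with the helper above looks like
 *
 *	r = amdgpu_device_wait_on_rreg(adev, 0, mmSOME_STATUS, "SOME_STATUS",
 *				       SOME_STATUS__READY_MASK,
 *				       SOME_STATUS__READY_MASK);
 *	if (r)
 *		return r;	(timed out; the register never matched)
 *
 * mmSOME_STATUS and SOME_STATUS__READY_MASK are made-up identifiers used
 * only to show the argument order: instance, register offset, register name
 * for the warning, expected value, and mask.
 */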