1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 #include <linux/iommu.h> 34 #include <linux/pci.h> 35 #include <linux/devcoredump.h> 36 #include <generated/utsrelease.h> 37 #include <linux/pci-p2pdma.h> 38 #include <linux/apple-gmux.h> 39 40 #include <drm/drm_aperture.h> 41 #include <drm/drm_atomic_helper.h> 42 #include <drm/drm_crtc_helper.h> 43 #include <drm/drm_fb_helper.h> 44 #include <drm/drm_probe_helper.h> 45 #include <drm/amdgpu_drm.h> 46 #include <linux/vgaarb.h> 47 #include <linux/vga_switcheroo.h> 48 #include <linux/efi.h> 49 #include "amdgpu.h" 50 #include "amdgpu_trace.h" 51 #include "amdgpu_i2c.h" 52 #include "atom.h" 53 #include "amdgpu_atombios.h" 54 #include "amdgpu_atomfirmware.h" 55 #include "amd_pcie.h" 56 #ifdef CONFIG_DRM_AMDGPU_SI 57 #include "si.h" 58 #endif 59 #ifdef CONFIG_DRM_AMDGPU_CIK 60 #include "cik.h" 61 #endif 62 #include "vi.h" 63 #include "soc15.h" 64 #include "nv.h" 65 #include "bif/bif_4_1_d.h" 66 #include <linux/firmware.h> 67 #include "amdgpu_vf_error.h" 68 69 #include "amdgpu_amdkfd.h" 70 #include "amdgpu_pm.h" 71 72 #include "amdgpu_xgmi.h" 73 #include "amdgpu_ras.h" 74 #include "amdgpu_pmu.h" 75 #include "amdgpu_fru_eeprom.h" 76 #include "amdgpu_reset.h" 77 78 #include <linux/suspend.h> 79 #include <drm/task_barrier.h> 80 #include <linux/pm_runtime.h> 81 82 #include <drm/drm_drv.h> 83 84 #if IS_ENABLED(CONFIG_X86) 85 #include <asm/intel-family.h> 86 #endif 87 88 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 89 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 90 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 91 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 95 96 #define AMDGPU_RESUME_MS 2000 97 #define AMDGPU_MAX_RETRY_LIMIT 2 98 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 99 100 static const struct drm_driver amdgpu_kms_driver; 101 102 const char *amdgpu_asic_name[] = { 103 "TAHITI", 104 "PITCAIRN", 105 "VERDE", 106 "OLAND", 107 "HAINAN", 108 "BONAIRE", 109 "KAVERI", 110 
"KABINI", 111 "HAWAII", 112 "MULLINS", 113 "TOPAZ", 114 "TONGA", 115 "FIJI", 116 "CARRIZO", 117 "STONEY", 118 "POLARIS10", 119 "POLARIS11", 120 "POLARIS12", 121 "VEGAM", 122 "VEGA10", 123 "VEGA12", 124 "VEGA20", 125 "RAVEN", 126 "ARCTURUS", 127 "RENOIR", 128 "ALDEBARAN", 129 "NAVI10", 130 "CYAN_SKILLFISH", 131 "NAVI14", 132 "NAVI12", 133 "SIENNA_CICHLID", 134 "NAVY_FLOUNDER", 135 "VANGOGH", 136 "DIMGREY_CAVEFISH", 137 "BEIGE_GOBY", 138 "YELLOW_CARP", 139 "IP DISCOVERY", 140 "LAST", 141 }; 142 143 /** 144 * DOC: pcie_replay_count 145 * 146 * The amdgpu driver provides a sysfs API for reporting the total number 147 * of PCIe replays (NAKs) 148 * The file pcie_replay_count is used for this and returns the total 149 * number of replays as a sum of the NAKs generated and NAKs received 150 */ 151 152 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 153 struct device_attribute *attr, char *buf) 154 { 155 struct drm_device *ddev = dev_get_drvdata(dev); 156 struct amdgpu_device *adev = drm_to_adev(ddev); 157 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 158 159 return sysfs_emit(buf, "%llu\n", cnt); 160 } 161 162 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 163 amdgpu_device_get_pcie_replay_count, NULL); 164 165 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 166 167 /** 168 * DOC: product_name 169 * 170 * The amdgpu driver provides a sysfs API for reporting the product name 171 * for the device 172 * The file product_name is used for this and returns the product name 173 * as returned from the FRU. 174 * NOTE: This is only available for certain server cards 175 */ 176 177 static ssize_t amdgpu_device_get_product_name(struct device *dev, 178 struct device_attribute *attr, char *buf) 179 { 180 struct drm_device *ddev = dev_get_drvdata(dev); 181 struct amdgpu_device *adev = drm_to_adev(ddev); 182 183 return sysfs_emit(buf, "%s\n", adev->product_name); 184 } 185 186 static DEVICE_ATTR(product_name, S_IRUGO, 187 amdgpu_device_get_product_name, NULL); 188 189 /** 190 * DOC: product_number 191 * 192 * The amdgpu driver provides a sysfs API for reporting the part number 193 * for the device 194 * The file product_number is used for this and returns the part number 195 * as returned from the FRU. 196 * NOTE: This is only available for certain server cards 197 */ 198 199 static ssize_t amdgpu_device_get_product_number(struct device *dev, 200 struct device_attribute *attr, char *buf) 201 { 202 struct drm_device *ddev = dev_get_drvdata(dev); 203 struct amdgpu_device *adev = drm_to_adev(ddev); 204 205 return sysfs_emit(buf, "%s\n", adev->product_number); 206 } 207 208 static DEVICE_ATTR(product_number, S_IRUGO, 209 amdgpu_device_get_product_number, NULL); 210 211 /** 212 * DOC: serial_number 213 * 214 * The amdgpu driver provides a sysfs API for reporting the serial number 215 * for the device 216 * The file serial_number is used for this and returns the serial number 217 * as returned from the FRU. 
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_serial_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->serial);
}

static DEVICE_ATTR(serial_number, S_IRUGO,
		amdgpu_device_get_serial_number, NULL);

/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise returns false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by the vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes that have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
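 *
 * Example use of the VRAM access helpers above (a sketch only; the offset and
 * buffer are arbitrary, and both the offset and size must be dword aligned
 * for the MM_INDEX fallback path):
 *
 *	u32 data[4];
 *
 *	// read 16 bytes at VRAM offset 0x1000 into system memory
 *	amdgpu_device_vram_access(adev, 0x1000, data, sizeof(data), false);
 *
 *	// patch the first dword and write it back
 *	data[0] |= 0x1;
 *	amdgpu_device_vram_access(adev, 0x1000, data, sizeof(data[0]), true);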
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore. If that succeeds,
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with byte offset helper function
 * @offset: byte offset from MMIO start
 *
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with byte offset helper function
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 *
 */
/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
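 *
 * Together with amdgpu_device_rreg() this gives the usual read-modify-write
 * pattern. A sketch (the offset and mask below are made up for illustration;
 * most code goes through the RREG32/WREG32 style macros instead of calling
 * these helpers directly):
 *
 *	u32 tmp;
 *
 *	tmp = amdgpu_device_rreg(adev, reg_offset, 0);
 *	tmp &= ~mask;
 *	tmp |= val & mask;
 *	amdgpu_device_wreg(adev, reg_offset, tmp, 0);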
540 */ 541 void amdgpu_device_wreg(struct amdgpu_device *adev, 542 uint32_t reg, uint32_t v, 543 uint32_t acc_flags) 544 { 545 if (amdgpu_device_skip_hw_access(adev)) 546 return; 547 548 if ((reg * 4) < adev->rmmio_size) { 549 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 550 amdgpu_sriov_runtime(adev) && 551 down_read_trylock(&adev->reset_domain->sem)) { 552 amdgpu_kiq_wreg(adev, reg, v); 553 up_read(&adev->reset_domain->sem); 554 } else { 555 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 556 } 557 } else { 558 adev->pcie_wreg(adev, reg * 4, v); 559 } 560 561 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 562 } 563 564 /** 565 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 566 * 567 * @adev: amdgpu_device pointer 568 * @reg: mmio/rlc register 569 * @v: value to write 570 * 571 * this function is invoked only for the debugfs register access 572 */ 573 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 574 uint32_t reg, uint32_t v) 575 { 576 if (amdgpu_device_skip_hw_access(adev)) 577 return; 578 579 if (amdgpu_sriov_fullaccess(adev) && 580 adev->gfx.rlc.funcs && 581 adev->gfx.rlc.funcs->is_rlcg_access_range) { 582 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 583 return amdgpu_sriov_wreg(adev, reg, v, 0, 0); 584 } else if ((reg * 4) >= adev->rmmio_size) { 585 adev->pcie_wreg(adev, reg * 4, v); 586 } else { 587 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 588 } 589 } 590 591 /** 592 * amdgpu_mm_rdoorbell - read a doorbell dword 593 * 594 * @adev: amdgpu_device pointer 595 * @index: doorbell index 596 * 597 * Returns the value in the doorbell aperture at the 598 * requested doorbell index (CIK). 599 */ 600 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) 601 { 602 if (amdgpu_device_skip_hw_access(adev)) 603 return 0; 604 605 if (index < adev->doorbell.num_kernel_doorbells) { 606 return readl(adev->doorbell.ptr + index); 607 } else { 608 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 609 return 0; 610 } 611 } 612 613 /** 614 * amdgpu_mm_wdoorbell - write a doorbell dword 615 * 616 * @adev: amdgpu_device pointer 617 * @index: doorbell index 618 * @v: value to write 619 * 620 * Writes @v to the doorbell aperture at the 621 * requested doorbell index (CIK). 622 */ 623 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) 624 { 625 if (amdgpu_device_skip_hw_access(adev)) 626 return; 627 628 if (index < adev->doorbell.num_kernel_doorbells) { 629 writel(v, adev->doorbell.ptr + index); 630 } else { 631 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 632 } 633 } 634 635 /** 636 * amdgpu_mm_rdoorbell64 - read a doorbell Qword 637 * 638 * @adev: amdgpu_device pointer 639 * @index: doorbell index 640 * 641 * Returns the value in the doorbell aperture at the 642 * requested doorbell index (VEGA10+). 643 */ 644 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) 645 { 646 if (amdgpu_device_skip_hw_access(adev)) 647 return 0; 648 649 if (index < adev->doorbell.num_kernel_doorbells) { 650 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); 651 } else { 652 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 653 return 0; 654 } 655 } 656 657 /** 658 * amdgpu_mm_wdoorbell64 - write a doorbell Qword 659 * 660 * @adev: amdgpu_device pointer 661 * @index: doorbell index 662 * @v: value to write 663 * 664 * Writes @v to the doorbell aperture at the 665 * requested doorbell index (VEGA10+). 
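 *
 * A sketch of the typical use (the doorbell index and write pointer below are
 * illustrative; real ring code keeps both in its ring structure):
 *
 *	// publish the new ring write pointer to the hardware
 *	amdgpu_mm_wdoorbell64(adev, ring_doorbell_index, ring_wptr);
 *
 *	// and the matching read back
 *	u64 wptr = amdgpu_mm_rdoorbell64(adev, ring_doorbell_index);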
666 */ 667 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) 668 { 669 if (amdgpu_device_skip_hw_access(adev)) 670 return; 671 672 if (index < adev->doorbell.num_kernel_doorbells) { 673 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); 674 } else { 675 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 676 } 677 } 678 679 /** 680 * amdgpu_device_indirect_rreg - read an indirect register 681 * 682 * @adev: amdgpu_device pointer 683 * @reg_addr: indirect register address to read from 684 * 685 * Returns the value of indirect register @reg_addr 686 */ 687 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 688 u32 reg_addr) 689 { 690 unsigned long flags, pcie_index, pcie_data; 691 void __iomem *pcie_index_offset; 692 void __iomem *pcie_data_offset; 693 u32 r; 694 695 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 696 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 697 698 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 699 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 700 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 701 702 writel(reg_addr, pcie_index_offset); 703 readl(pcie_index_offset); 704 r = readl(pcie_data_offset); 705 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 706 707 return r; 708 } 709 710 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 711 u64 reg_addr) 712 { 713 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 714 u32 r; 715 void __iomem *pcie_index_offset; 716 void __iomem *pcie_index_hi_offset; 717 void __iomem *pcie_data_offset; 718 719 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 720 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 721 if (adev->nbio.funcs->get_pcie_index_hi_offset) 722 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 723 else 724 pcie_index_hi = 0; 725 726 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 727 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 728 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 729 if (pcie_index_hi != 0) 730 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 731 pcie_index_hi * 4; 732 733 writel(reg_addr, pcie_index_offset); 734 readl(pcie_index_offset); 735 if (pcie_index_hi != 0) { 736 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 737 readl(pcie_index_hi_offset); 738 } 739 r = readl(pcie_data_offset); 740 741 /* clear the high bits */ 742 if (pcie_index_hi != 0) { 743 writel(0, pcie_index_hi_offset); 744 readl(pcie_index_hi_offset); 745 } 746 747 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 748 749 return r; 750 } 751 752 /** 753 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 754 * 755 * @adev: amdgpu_device pointer 756 * @reg_addr: indirect register address to read from 757 * 758 * Returns the value of indirect register @reg_addr 759 */ 760 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 761 u32 reg_addr) 762 { 763 unsigned long flags, pcie_index, pcie_data; 764 void __iomem *pcie_index_offset; 765 void __iomem *pcie_data_offset; 766 u64 r; 767 768 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 769 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 770 771 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 772 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 773 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 774 775 /* read low 32 bits */ 776 writel(reg_addr, pcie_index_offset); 777 
readl(pcie_index_offset); 778 r = readl(pcie_data_offset); 779 /* read high 32 bits */ 780 writel(reg_addr + 4, pcie_index_offset); 781 readl(pcie_index_offset); 782 r |= ((u64)readl(pcie_data_offset) << 32); 783 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 784 785 return r; 786 } 787 788 /** 789 * amdgpu_device_indirect_wreg - write an indirect register address 790 * 791 * @adev: amdgpu_device pointer 792 * @reg_addr: indirect register offset 793 * @reg_data: indirect register data 794 * 795 */ 796 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 797 u32 reg_addr, u32 reg_data) 798 { 799 unsigned long flags, pcie_index, pcie_data; 800 void __iomem *pcie_index_offset; 801 void __iomem *pcie_data_offset; 802 803 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 804 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 805 806 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 807 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 808 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 809 810 writel(reg_addr, pcie_index_offset); 811 readl(pcie_index_offset); 812 writel(reg_data, pcie_data_offset); 813 readl(pcie_data_offset); 814 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 815 } 816 817 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 818 u64 reg_addr, u32 reg_data) 819 { 820 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 821 void __iomem *pcie_index_offset; 822 void __iomem *pcie_index_hi_offset; 823 void __iomem *pcie_data_offset; 824 825 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 826 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 827 if (adev->nbio.funcs->get_pcie_index_hi_offset) 828 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 829 else 830 pcie_index_hi = 0; 831 832 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 833 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 834 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 835 if (pcie_index_hi != 0) 836 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 837 pcie_index_hi * 4; 838 839 writel(reg_addr, pcie_index_offset); 840 readl(pcie_index_offset); 841 if (pcie_index_hi != 0) { 842 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 843 readl(pcie_index_hi_offset); 844 } 845 writel(reg_data, pcie_data_offset); 846 readl(pcie_data_offset); 847 848 /* clear the high bits */ 849 if (pcie_index_hi != 0) { 850 writel(0, pcie_index_hi_offset); 851 readl(pcie_index_hi_offset); 852 } 853 854 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 855 } 856 857 /** 858 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 859 * 860 * @adev: amdgpu_device pointer 861 * @reg_addr: indirect register offset 862 * @reg_data: indirect register data 863 * 864 */ 865 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 866 u32 reg_addr, u64 reg_data) 867 { 868 unsigned long flags, pcie_index, pcie_data; 869 void __iomem *pcie_index_offset; 870 void __iomem *pcie_data_offset; 871 872 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 873 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 874 875 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 876 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 877 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 878 879 /* write low 32 bits */ 880 writel(reg_addr, pcie_index_offset); 881 readl(pcie_index_offset); 882 writel((u32)(reg_data & 0xffffffffULL), 
pcie_data_offset); 883 readl(pcie_data_offset); 884 /* write high 32 bits */ 885 writel(reg_addr + 4, pcie_index_offset); 886 readl(pcie_index_offset); 887 writel((u32)(reg_data >> 32), pcie_data_offset); 888 readl(pcie_data_offset); 889 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 890 } 891 892 /** 893 * amdgpu_device_get_rev_id - query device rev_id 894 * 895 * @adev: amdgpu_device pointer 896 * 897 * Return device rev_id 898 */ 899 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 900 { 901 return adev->nbio.funcs->get_rev_id(adev); 902 } 903 904 /** 905 * amdgpu_invalid_rreg - dummy reg read function 906 * 907 * @adev: amdgpu_device pointer 908 * @reg: offset of register 909 * 910 * Dummy register read function. Used for register blocks 911 * that certain asics don't have (all asics). 912 * Returns the value in the register. 913 */ 914 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 915 { 916 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 917 BUG(); 918 return 0; 919 } 920 921 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) 922 { 923 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 924 BUG(); 925 return 0; 926 } 927 928 /** 929 * amdgpu_invalid_wreg - dummy reg write function 930 * 931 * @adev: amdgpu_device pointer 932 * @reg: offset of register 933 * @v: value to write to the register 934 * 935 * Dummy register read function. Used for register blocks 936 * that certain asics don't have (all asics). 937 */ 938 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 939 { 940 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 941 reg, v); 942 BUG(); 943 } 944 945 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) 946 { 947 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n", 948 reg, v); 949 BUG(); 950 } 951 952 /** 953 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 954 * 955 * @adev: amdgpu_device pointer 956 * @reg: offset of register 957 * 958 * Dummy register read function. Used for register blocks 959 * that certain asics don't have (all asics). 960 * Returns the value in the register. 961 */ 962 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 963 { 964 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 965 BUG(); 966 return 0; 967 } 968 969 /** 970 * amdgpu_invalid_wreg64 - dummy reg write function 971 * 972 * @adev: amdgpu_device pointer 973 * @reg: offset of register 974 * @v: value to write to the register 975 * 976 * Dummy register read function. Used for register blocks 977 * that certain asics don't have (all asics). 978 */ 979 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 980 { 981 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 982 reg, v); 983 BUG(); 984 } 985 986 /** 987 * amdgpu_block_invalid_rreg - dummy reg read function 988 * 989 * @adev: amdgpu_device pointer 990 * @block: offset of instance 991 * @reg: offset of register 992 * 993 * Dummy register read function. Used for register blocks 994 * that certain asics don't have (all asics). 995 * Returns the value in the register. 
996 */ 997 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 998 uint32_t block, uint32_t reg) 999 { 1000 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 1001 reg, block); 1002 BUG(); 1003 return 0; 1004 } 1005 1006 /** 1007 * amdgpu_block_invalid_wreg - dummy reg write function 1008 * 1009 * @adev: amdgpu_device pointer 1010 * @block: offset of instance 1011 * @reg: offset of register 1012 * @v: value to write to the register 1013 * 1014 * Dummy register read function. Used for register blocks 1015 * that certain asics don't have (all asics). 1016 */ 1017 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 1018 uint32_t block, 1019 uint32_t reg, uint32_t v) 1020 { 1021 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 1022 reg, block, v); 1023 BUG(); 1024 } 1025 1026 /** 1027 * amdgpu_device_asic_init - Wrapper for atom asic_init 1028 * 1029 * @adev: amdgpu_device pointer 1030 * 1031 * Does any asic specific work and then calls atom asic init. 1032 */ 1033 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 1034 { 1035 amdgpu_asic_pre_asic_init(adev); 1036 1037 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) || 1038 adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) 1039 return amdgpu_atomfirmware_asic_init(adev, true); 1040 else 1041 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 1042 } 1043 1044 /** 1045 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page 1046 * 1047 * @adev: amdgpu_device pointer 1048 * 1049 * Allocates a scratch page of VRAM for use by various things in the 1050 * driver. 1051 */ 1052 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev) 1053 { 1054 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE, 1055 AMDGPU_GEM_DOMAIN_VRAM | 1056 AMDGPU_GEM_DOMAIN_GTT, 1057 &adev->mem_scratch.robj, 1058 &adev->mem_scratch.gpu_addr, 1059 (void **)&adev->mem_scratch.ptr); 1060 } 1061 1062 /** 1063 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page 1064 * 1065 * @adev: amdgpu_device pointer 1066 * 1067 * Frees the VRAM scratch page. 1068 */ 1069 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev) 1070 { 1071 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL); 1072 } 1073 1074 /** 1075 * amdgpu_device_program_register_sequence - program an array of registers. 1076 * 1077 * @adev: amdgpu_device pointer 1078 * @registers: pointer to the register array 1079 * @array_size: size of the register array 1080 * 1081 * Programs an array or registers with and and or masks. 1082 * This is a helper for setting golden registers. 1083 */ 1084 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 1085 const u32 *registers, 1086 const u32 array_size) 1087 { 1088 u32 tmp, reg, and_mask, or_mask; 1089 int i; 1090 1091 if (array_size % 3) 1092 return; 1093 1094 for (i = 0; i < array_size; i += 3) { 1095 reg = registers[i + 0]; 1096 and_mask = registers[i + 1]; 1097 or_mask = registers[i + 2]; 1098 1099 if (and_mask == 0xffffffff) { 1100 tmp = or_mask; 1101 } else { 1102 tmp = RREG32(reg); 1103 tmp &= ~and_mask; 1104 if (adev->family >= AMDGPU_FAMILY_AI) 1105 tmp |= (or_mask & and_mask); 1106 else 1107 tmp |= or_mask; 1108 } 1109 WREG32(reg, tmp); 1110 } 1111 } 1112 1113 /** 1114 * amdgpu_device_pci_config_reset - reset the GPU 1115 * 1116 * @adev: amdgpu_device pointer 1117 * 1118 * Resets the GPU using the pci config reset sequence. 
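 *
 * (Example for amdgpu_device_program_register_sequence() above; the offsets
 * and masks are invented purely for illustration.) Golden register tables are
 * flat arrays of {offset, and_mask, or_mask} triplets:
 *
 *	static const u32 fake_golden_regs[] = {
 *		// offset    and_mask    or_mask
 *		0x00001234, 0x0000ff00, 0x00002a00,
 *		0x00005678, 0xffffffff, 0x00000001, // and_mask of ~0 writes or_mask as-is
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, fake_golden_regs,
 *						ARRAY_SIZE(fake_golden_regs));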
1119 * Only applicable to asics prior to vega10. 1120 */ 1121 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 1122 { 1123 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 1124 } 1125 1126 /** 1127 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 1128 * 1129 * @adev: amdgpu_device pointer 1130 * 1131 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 1132 */ 1133 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1134 { 1135 return pci_reset_function(adev->pdev); 1136 } 1137 1138 /* 1139 * GPU doorbell aperture helpers function. 1140 */ 1141 /** 1142 * amdgpu_device_doorbell_init - Init doorbell driver information. 1143 * 1144 * @adev: amdgpu_device pointer 1145 * 1146 * Init doorbell driver information (CIK) 1147 * Returns 0 on success, error on failure. 1148 */ 1149 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) 1150 { 1151 1152 /* No doorbell on SI hardware generation */ 1153 if (adev->asic_type < CHIP_BONAIRE) { 1154 adev->doorbell.base = 0; 1155 adev->doorbell.size = 0; 1156 adev->doorbell.num_kernel_doorbells = 0; 1157 adev->doorbell.ptr = NULL; 1158 return 0; 1159 } 1160 1161 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) 1162 return -EINVAL; 1163 1164 amdgpu_asic_init_doorbell_index(adev); 1165 1166 /* doorbell bar mapping */ 1167 adev->doorbell.base = pci_resource_start(adev->pdev, 2); 1168 adev->doorbell.size = pci_resource_len(adev->pdev, 2); 1169 1170 if (adev->enable_mes) { 1171 adev->doorbell.num_kernel_doorbells = 1172 adev->doorbell.size / sizeof(u32); 1173 } else { 1174 adev->doorbell.num_kernel_doorbells = 1175 min_t(u32, adev->doorbell.size / sizeof(u32), 1176 adev->doorbell_index.max_assignment+1); 1177 if (adev->doorbell.num_kernel_doorbells == 0) 1178 return -EINVAL; 1179 1180 /* For Vega, reserve and map two pages on doorbell BAR since SDMA 1181 * paging queue doorbell use the second page. The 1182 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the 1183 * doorbells are in the first page. So with paging queue enabled, 1184 * the max num_kernel_doorbells should + 1 page (0x400 in dword) 1185 */ 1186 if (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(4, 0, 0) && 1187 adev->ip_versions[SDMA0_HWIP][0] < IP_VERSION(4, 2, 0)) 1188 adev->doorbell.num_kernel_doorbells += 0x400; 1189 } 1190 1191 adev->doorbell.ptr = ioremap(adev->doorbell.base, 1192 adev->doorbell.num_kernel_doorbells * 1193 sizeof(u32)); 1194 if (adev->doorbell.ptr == NULL) 1195 return -ENOMEM; 1196 1197 return 0; 1198 } 1199 1200 /** 1201 * amdgpu_device_doorbell_fini - Tear down doorbell driver information. 1202 * 1203 * @adev: amdgpu_device pointer 1204 * 1205 * Tear down doorbell driver information (CIK) 1206 */ 1207 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) 1208 { 1209 iounmap(adev->doorbell.ptr); 1210 adev->doorbell.ptr = NULL; 1211 } 1212 1213 1214 1215 /* 1216 * amdgpu_device_wb_*() 1217 * Writeback is the method by which the GPU updates special pages in memory 1218 * with the status of certain GPU events (fences, ring pointers,etc.). 1219 */ 1220 1221 /** 1222 * amdgpu_device_wb_fini - Disable Writeback and free memory 1223 * 1224 * @adev: amdgpu_device pointer 1225 * 1226 * Disables Writeback and frees the Writeback memory (all asics). 1227 * Used at driver shutdown. 
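 *
 * A sketch of how the writeback slots described above are normally used (the
 * surrounding error handling and the consumer of the slot are illustrative):
 *
 *	u32 wb;		// dword index into the writeback page
 *	u64 wb_gpu;	// GPU address the hardware writes to
 *	u32 seq;
 *	int r;
 *
 *	r = amdgpu_device_wb_get(adev, &wb);
 *	if (r)
 *		return r;
 *
 *	wb_gpu = adev->wb.gpu_addr + wb * 4;	// handed to the hardware block
 *	seq = adev->wb.wb[wb];			// CPU side reads what the HW wrote
 *
 *	amdgpu_device_wb_free(adev, wb);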
1228 */ 1229 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1230 { 1231 if (adev->wb.wb_obj) { 1232 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1233 &adev->wb.gpu_addr, 1234 (void **)&adev->wb.wb); 1235 adev->wb.wb_obj = NULL; 1236 } 1237 } 1238 1239 /** 1240 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1241 * 1242 * @adev: amdgpu_device pointer 1243 * 1244 * Initializes writeback and allocates writeback memory (all asics). 1245 * Used at driver startup. 1246 * Returns 0 on success or an -error on failure. 1247 */ 1248 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1249 { 1250 int r; 1251 1252 if (adev->wb.wb_obj == NULL) { 1253 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1254 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1255 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1256 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1257 (void **)&adev->wb.wb); 1258 if (r) { 1259 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1260 return r; 1261 } 1262 1263 adev->wb.num_wb = AMDGPU_MAX_WB; 1264 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1265 1266 /* clear wb memory */ 1267 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1268 } 1269 1270 return 0; 1271 } 1272 1273 /** 1274 * amdgpu_device_wb_get - Allocate a wb entry 1275 * 1276 * @adev: amdgpu_device pointer 1277 * @wb: wb index 1278 * 1279 * Allocate a wb slot for use by the driver (all asics). 1280 * Returns 0 on success or -EINVAL on failure. 1281 */ 1282 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1283 { 1284 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1285 1286 if (offset < adev->wb.num_wb) { 1287 __set_bit(offset, adev->wb.used); 1288 *wb = offset << 3; /* convert to dw offset */ 1289 return 0; 1290 } else { 1291 return -EINVAL; 1292 } 1293 } 1294 1295 /** 1296 * amdgpu_device_wb_free - Free a wb entry 1297 * 1298 * @adev: amdgpu_device pointer 1299 * @wb: wb index 1300 * 1301 * Free a wb slot allocated for use by the driver (all asics) 1302 */ 1303 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1304 { 1305 wb >>= 3; 1306 if (wb < adev->wb.num_wb) 1307 __clear_bit(wb, adev->wb.used); 1308 } 1309 1310 /** 1311 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1312 * 1313 * @adev: amdgpu_device pointer 1314 * 1315 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1316 * to fail, but if any of the BARs is not accessible after the size we abort 1317 * driver loading by returning -ENODEV. 
1318 */ 1319 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1320 { 1321 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1322 struct pci_bus *root; 1323 struct resource *res; 1324 unsigned i; 1325 u16 cmd; 1326 int r; 1327 1328 /* Bypass for VF */ 1329 if (amdgpu_sriov_vf(adev)) 1330 return 0; 1331 1332 /* skip if the bios has already enabled large BAR */ 1333 if (adev->gmc.real_vram_size && 1334 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1335 return 0; 1336 1337 /* Check if the root BUS has 64bit memory resources */ 1338 root = adev->pdev->bus; 1339 while (root->parent) 1340 root = root->parent; 1341 1342 pci_bus_for_each_resource(root, res, i) { 1343 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1344 res->start > 0x100000000ull) 1345 break; 1346 } 1347 1348 /* Trying to resize is pointless without a root hub window above 4GB */ 1349 if (!res) 1350 return 0; 1351 1352 /* Limit the BAR size to what is available */ 1353 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1354 rbar_size); 1355 1356 /* Disable memory decoding while we change the BAR addresses and size */ 1357 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1358 pci_write_config_word(adev->pdev, PCI_COMMAND, 1359 cmd & ~PCI_COMMAND_MEMORY); 1360 1361 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1362 amdgpu_device_doorbell_fini(adev); 1363 if (adev->asic_type >= CHIP_BONAIRE) 1364 pci_release_resource(adev->pdev, 2); 1365 1366 pci_release_resource(adev->pdev, 0); 1367 1368 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1369 if (r == -ENOSPC) 1370 DRM_INFO("Not enough PCI address space for a large BAR."); 1371 else if (r && r != -ENOTSUPP) 1372 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1373 1374 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1375 1376 /* When the doorbell or fb BAR isn't available we have no chance of 1377 * using the device. 1378 */ 1379 r = amdgpu_device_doorbell_init(adev); 1380 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1381 return -ENODEV; 1382 1383 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1384 1385 return 0; 1386 } 1387 1388 static bool amdgpu_device_read_bios(struct amdgpu_device *adev) 1389 { 1390 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) { 1391 return false; 1392 } 1393 1394 return true; 1395 } 1396 1397 /* 1398 * GPU helpers function. 1399 */ 1400 /** 1401 * amdgpu_device_need_post - check if the hw need post or not 1402 * 1403 * @adev: amdgpu_device pointer 1404 * 1405 * Check if the asic has been initialized (all asics) at driver startup 1406 * or post is needed if hw reset is performed. 1407 * Returns true if need or false if not. 
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (!amdgpu_device_read_bios(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In the whole GPU pass-through virtualization case, after VM
		 * reboot some old SMC firmware still needs the driver to do a vPost,
		 * otherwise the GPU hangs. SMC firmware versions above 22.15 don't have
		 * this flaw, so force vPost for SMC versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if an error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->gmc.xgmi.pending_reset)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * On APUs with >= 64GB of RAM, white flickering has been observed with S/G
 * enabled. Disable S/G on such systems until we have a proper fix.
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2354
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2735
 */
bool amdgpu_sg_display_supported(struct amdgpu_device *adev)
{
	switch (amdgpu_sg_display) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	if ((totalram_pages() << (PAGE_SHIFT - 10)) +
	    (adev->gmc.real_vram_size / 1024) >= 64000000) {
		DRM_WARN("Disabling S/G due to >=64GB RAM\n");
		return false;
	}
	return true;
}

/*
 * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic
 * speed switching. Until we have confirmation from Intel that a specific host
 * supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
bool amdgpu_device_pcie_dynamic_switching_supported(void)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
1515 */ 1516 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1517 { 1518 switch (amdgpu_aspm) { 1519 case -1: 1520 break; 1521 case 0: 1522 return false; 1523 case 1: 1524 return true; 1525 default: 1526 return false; 1527 } 1528 return pcie_aspm_enabled(adev->pdev); 1529 } 1530 1531 bool amdgpu_device_aspm_support_quirk(void) 1532 { 1533 #if IS_ENABLED(CONFIG_X86) 1534 struct cpuinfo_x86 *c = &cpu_data(0); 1535 1536 return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE); 1537 #else 1538 return true; 1539 #endif 1540 } 1541 1542 /* if we get transitioned to only one device, take VGA back */ 1543 /** 1544 * amdgpu_device_vga_set_decode - enable/disable vga decode 1545 * 1546 * @pdev: PCI device pointer 1547 * @state: enable/disable vga decode 1548 * 1549 * Enable/disable vga decode (all asics). 1550 * Returns VGA resource flags. 1551 */ 1552 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1553 bool state) 1554 { 1555 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1556 amdgpu_asic_set_vga_state(adev, state); 1557 if (state) 1558 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1559 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1560 else 1561 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1562 } 1563 1564 /** 1565 * amdgpu_device_check_block_size - validate the vm block size 1566 * 1567 * @adev: amdgpu_device pointer 1568 * 1569 * Validates the vm block size specified via module parameter. 1570 * The vm block size defines number of bits in page table versus page directory, 1571 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1572 * page table and the remaining bits are in the page directory. 1573 */ 1574 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1575 { 1576 /* defines number of bits in page table versus page directory, 1577 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1578 * page table and the remaining bits are in the page directory */ 1579 if (amdgpu_vm_block_size == -1) 1580 return; 1581 1582 if (amdgpu_vm_block_size < 9) { 1583 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1584 amdgpu_vm_block_size); 1585 amdgpu_vm_block_size = -1; 1586 } 1587 } 1588 1589 /** 1590 * amdgpu_device_check_vm_size - validate the vm size 1591 * 1592 * @adev: amdgpu_device pointer 1593 * 1594 * Validates the vm size in GB specified via module parameter. 1595 * The VM size is the size of the GPU virtual memory space in GB. 
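 *
 * (Worked example for amdgpu_device_check_block_size() above.) With 4KB pages
 * there is a 12-bit in-page offset, so the minimum block size of 9 gives
 * 2^9 = 512 entries per page table, and each page directory entry then covers
 * 2^(12 + 9) = 2MB of address space; a larger block size means larger page
 * tables but a smaller page directory.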
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("SMU memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if ((adev->pdev->device == 0x13FE) ||
		    (adev->pdev->device == 0x143F))
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		break;
	}

	return 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
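 *
 * (Example for amdgpu_device_check_smu_prv_buffer_size() above.) The pool
 * size parameter is shifted left by 28 bits, i.e. multiplied by 256MB, so the
 * accepted values map to:
 *
 *	1 -> 256MB, 2 -> 512MB (needs roughly 3GB of system RAM)
 *	4 -> 1GB,   8 -> 2GB   (needs roughly 7GB of system RAM)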
1694 */ 1695 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1696 { 1697 if (amdgpu_sched_jobs < 4) { 1698 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1699 amdgpu_sched_jobs); 1700 amdgpu_sched_jobs = 4; 1701 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 1702 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1703 amdgpu_sched_jobs); 1704 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1705 } 1706 1707 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1708 /* gart size must be greater or equal to 32M */ 1709 dev_warn(adev->dev, "gart size (%d) too small\n", 1710 amdgpu_gart_size); 1711 amdgpu_gart_size = -1; 1712 } 1713 1714 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1715 /* gtt size must be greater or equal to 32M */ 1716 dev_warn(adev->dev, "gtt size (%d) too small\n", 1717 amdgpu_gtt_size); 1718 amdgpu_gtt_size = -1; 1719 } 1720 1721 /* valid range is between 4 and 9 inclusive */ 1722 if (amdgpu_vm_fragment_size != -1 && 1723 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1724 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1725 amdgpu_vm_fragment_size = -1; 1726 } 1727 1728 if (amdgpu_sched_hw_submission < 2) { 1729 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1730 amdgpu_sched_hw_submission); 1731 amdgpu_sched_hw_submission = 2; 1732 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1733 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1734 amdgpu_sched_hw_submission); 1735 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1736 } 1737 1738 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 1739 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 1740 amdgpu_reset_method = -1; 1741 } 1742 1743 amdgpu_device_check_smu_prv_buffer_size(adev); 1744 1745 amdgpu_device_check_vm_size(adev); 1746 1747 amdgpu_device_check_block_size(adev); 1748 1749 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1750 1751 return 0; 1752 } 1753 1754 /** 1755 * amdgpu_switcheroo_set_state - set switcheroo state 1756 * 1757 * @pdev: pci dev pointer 1758 * @state: vga_switcheroo state 1759 * 1760 * Callback for the switcheroo driver. Suspends or resumes 1761 * the asics before or after it is powered up using ACPI methods. 
1762 */ 1763 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1764 enum vga_switcheroo_state state) 1765 { 1766 struct drm_device *dev = pci_get_drvdata(pdev); 1767 int r; 1768 1769 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1770 return; 1771 1772 if (state == VGA_SWITCHEROO_ON) { 1773 pr_info("switched on\n"); 1774 /* don't suspend or resume card normally */ 1775 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1776 1777 pci_set_power_state(pdev, PCI_D0); 1778 amdgpu_device_load_pci_state(pdev); 1779 r = pci_enable_device(pdev); 1780 if (r) 1781 DRM_WARN("pci_enable_device failed (%d)\n", r); 1782 amdgpu_device_resume(dev, true); 1783 1784 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1785 } else { 1786 pr_info("switched off\n"); 1787 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1788 amdgpu_device_suspend(dev, true); 1789 amdgpu_device_cache_pci_state(pdev); 1790 /* Shut down the device */ 1791 pci_disable_device(pdev); 1792 pci_set_power_state(pdev, PCI_D3cold); 1793 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1794 } 1795 } 1796 1797 /** 1798 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1799 * 1800 * @pdev: pci dev pointer 1801 * 1802 * Callback for the switcheroo driver. Check of the switcheroo 1803 * state can be changed. 1804 * Returns true if the state can be changed, false if not. 1805 */ 1806 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1807 { 1808 struct drm_device *dev = pci_get_drvdata(pdev); 1809 1810 /* 1811 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1812 * locking inversion with the driver load path. And the access here is 1813 * completely racy anyway. So don't bother with locking for now. 1814 */ 1815 return atomic_read(&dev->open_count) == 0; 1816 } 1817 1818 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1819 .set_gpu_state = amdgpu_switcheroo_set_state, 1820 .reprobe = NULL, 1821 .can_switch = amdgpu_switcheroo_can_switch, 1822 }; 1823 1824 /** 1825 * amdgpu_device_ip_set_clockgating_state - set the CG state 1826 * 1827 * @dev: amdgpu_device pointer 1828 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1829 * @state: clockgating state (gate or ungate) 1830 * 1831 * Sets the requested clockgating state for all instances of 1832 * the hardware IP specified. 1833 * Returns the error code from the last instance. 1834 */ 1835 int amdgpu_device_ip_set_clockgating_state(void *dev, 1836 enum amd_ip_block_type block_type, 1837 enum amd_clockgating_state state) 1838 { 1839 struct amdgpu_device *adev = dev; 1840 int i, r = 0; 1841 1842 for (i = 0; i < adev->num_ip_blocks; i++) { 1843 if (!adev->ip_blocks[i].status.valid) 1844 continue; 1845 if (adev->ip_blocks[i].version->type != block_type) 1846 continue; 1847 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1848 continue; 1849 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1850 (void *)adev, state); 1851 if (r) 1852 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1853 adev->ip_blocks[i].version->funcs->name, r); 1854 } 1855 return r; 1856 } 1857 1858 /** 1859 * amdgpu_device_ip_set_powergating_state - set the PG state 1860 * 1861 * @dev: amdgpu_device pointer 1862 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1863 * @state: powergating state (gate or ungate) 1864 * 1865 * Sets the requested powergating state for all instances of 1866 * the hardware IP specified. 1867 * Returns the error code from the last instance. 
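 *
 * A sketch of a typical call for both gating helpers (gating the GFX block is
 * just an example; most callers are the IP blocks themselves or the PM code):
 *
 *	r = amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *						   AMD_CG_STATE_GATE);
 *	if (r)
 *		dev_warn(adev->dev, "failed to gate GFX clocks (%d)\n", r);
 *
 *	r = amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *						   AMD_PG_STATE_GATE);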
1868 */ 1869 int amdgpu_device_ip_set_powergating_state(void *dev, 1870 enum amd_ip_block_type block_type, 1871 enum amd_powergating_state state) 1872 { 1873 struct amdgpu_device *adev = dev; 1874 int i, r = 0; 1875 1876 for (i = 0; i < adev->num_ip_blocks; i++) { 1877 if (!adev->ip_blocks[i].status.valid) 1878 continue; 1879 if (adev->ip_blocks[i].version->type != block_type) 1880 continue; 1881 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 1882 continue; 1883 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 1884 (void *)adev, state); 1885 if (r) 1886 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 1887 adev->ip_blocks[i].version->funcs->name, r); 1888 } 1889 return r; 1890 } 1891 1892 /** 1893 * amdgpu_device_ip_get_clockgating_state - get the CG state 1894 * 1895 * @adev: amdgpu_device pointer 1896 * @flags: clockgating feature flags 1897 * 1898 * Walks the list of IPs on the device and updates the clockgating 1899 * flags for each IP. 1900 * Updates @flags with the feature flags for each hardware IP where 1901 * clockgating is enabled. 1902 */ 1903 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 1904 u64 *flags) 1905 { 1906 int i; 1907 1908 for (i = 0; i < adev->num_ip_blocks; i++) { 1909 if (!adev->ip_blocks[i].status.valid) 1910 continue; 1911 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 1912 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 1913 } 1914 } 1915 1916 /** 1917 * amdgpu_device_ip_wait_for_idle - wait for idle 1918 * 1919 * @adev: amdgpu_device pointer 1920 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1921 * 1922 * Waits for the request hardware IP to be idle. 1923 * Returns 0 for success or a negative error code on failure. 1924 */ 1925 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 1926 enum amd_ip_block_type block_type) 1927 { 1928 int i, r; 1929 1930 for (i = 0; i < adev->num_ip_blocks; i++) { 1931 if (!adev->ip_blocks[i].status.valid) 1932 continue; 1933 if (adev->ip_blocks[i].version->type == block_type) { 1934 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 1935 if (r) 1936 return r; 1937 break; 1938 } 1939 } 1940 return 0; 1941 1942 } 1943 1944 /** 1945 * amdgpu_device_ip_is_idle - is the hardware IP idle 1946 * 1947 * @adev: amdgpu_device pointer 1948 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1949 * 1950 * Check if the hardware IP is idle or not. 1951 * Returns true if it the IP is idle, false if not. 1952 */ 1953 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 1954 enum amd_ip_block_type block_type) 1955 { 1956 int i; 1957 1958 for (i = 0; i < adev->num_ip_blocks; i++) { 1959 if (!adev->ip_blocks[i].status.valid) 1960 continue; 1961 if (adev->ip_blocks[i].version->type == block_type) 1962 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 1963 } 1964 return true; 1965 1966 } 1967 1968 /** 1969 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 1970 * 1971 * @adev: amdgpu_device pointer 1972 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 1973 * 1974 * Returns a pointer to the hardware IP block structure 1975 * if it exists for the asic, otherwise NULL. 
1976 */ 1977 struct amdgpu_ip_block * 1978 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 1979 enum amd_ip_block_type type) 1980 { 1981 int i; 1982 1983 for (i = 0; i < adev->num_ip_blocks; i++) 1984 if (adev->ip_blocks[i].version->type == type) 1985 return &adev->ip_blocks[i]; 1986 1987 return NULL; 1988 } 1989 1990 /** 1991 * amdgpu_device_ip_block_version_cmp 1992 * 1993 * @adev: amdgpu_device pointer 1994 * @type: enum amd_ip_block_type 1995 * @major: major version 1996 * @minor: minor version 1997 * 1998 * return 0 if equal or greater 1999 * return 1 if smaller or the ip_block doesn't exist 2000 */ 2001 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2002 enum amd_ip_block_type type, 2003 u32 major, u32 minor) 2004 { 2005 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2006 2007 if (ip_block && ((ip_block->version->major > major) || 2008 ((ip_block->version->major == major) && 2009 (ip_block->version->minor >= minor)))) 2010 return 0; 2011 2012 return 1; 2013 } 2014 2015 /** 2016 * amdgpu_device_ip_block_add 2017 * 2018 * @adev: amdgpu_device pointer 2019 * @ip_block_version: pointer to the IP to add 2020 * 2021 * Adds the IP block driver information to the collection of IPs 2022 * on the asic. 2023 */ 2024 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2025 const struct amdgpu_ip_block_version *ip_block_version) 2026 { 2027 if (!ip_block_version) 2028 return -EINVAL; 2029 2030 switch (ip_block_version->type) { 2031 case AMD_IP_BLOCK_TYPE_VCN: 2032 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2033 return 0; 2034 break; 2035 case AMD_IP_BLOCK_TYPE_JPEG: 2036 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2037 return 0; 2038 break; 2039 default: 2040 break; 2041 } 2042 2043 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 2044 ip_block_version->funcs->name); 2045 2046 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2047 2048 return 0; 2049 } 2050 2051 /** 2052 * amdgpu_device_enable_virtual_display - enable virtual display feature 2053 * 2054 * @adev: amdgpu_device pointer 2055 * 2056 * Enabled the virtual display feature if the user has enabled it via 2057 * the module parameter virtual_display. This feature provides a virtual 2058 * display hardware on headless boards or in virtualized environments. 2059 * This function parses and validates the configuration string specified by 2060 * the user and configues the virtual display configuration (number of 2061 * virtual connectors, crtcs, etc.) specified. 
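 *
 * Example strings handled by the parser below (illustrative values):
 *
 *   virtual_display=all,2             all GPUs, two virtual CRTCs each
 *   virtual_display=0000:01:00.0,4    one device by PCI address, four CRTCs
 *
 * Entries are separated by ';'; the CRTC count is clamped to 1..6 and
 * defaults to 1 when omitted.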
2062 */ 2063 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2064 { 2065 adev->enable_virtual_display = false; 2066 2067 if (amdgpu_virtual_display) { 2068 const char *pci_address_name = pci_name(adev->pdev); 2069 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2070 2071 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2072 pciaddstr_tmp = pciaddstr; 2073 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2074 pciaddname = strsep(&pciaddname_tmp, ","); 2075 if (!strcmp("all", pciaddname) 2076 || !strcmp(pci_address_name, pciaddname)) { 2077 long num_crtc; 2078 int res = -1; 2079 2080 adev->enable_virtual_display = true; 2081 2082 if (pciaddname_tmp) 2083 res = kstrtol(pciaddname_tmp, 10, 2084 &num_crtc); 2085 2086 if (!res) { 2087 if (num_crtc < 1) 2088 num_crtc = 1; 2089 if (num_crtc > 6) 2090 num_crtc = 6; 2091 adev->mode_info.num_crtc = num_crtc; 2092 } else { 2093 adev->mode_info.num_crtc = 1; 2094 } 2095 break; 2096 } 2097 } 2098 2099 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2100 amdgpu_virtual_display, pci_address_name, 2101 adev->enable_virtual_display, adev->mode_info.num_crtc); 2102 2103 kfree(pciaddstr); 2104 } 2105 } 2106 2107 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2108 { 2109 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2110 adev->mode_info.num_crtc = 1; 2111 adev->enable_virtual_display = true; 2112 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2113 adev->enable_virtual_display, adev->mode_info.num_crtc); 2114 } 2115 } 2116 2117 /** 2118 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2119 * 2120 * @adev: amdgpu_device pointer 2121 * 2122 * Parses the asic configuration parameters specified in the gpu info 2123 * firmware and makes them availale to the driver for use in configuring 2124 * the asic. 2125 * Returns 0 on success, -EINVAL on failure. 2126 */ 2127 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2128 { 2129 const char *chip_name; 2130 char fw_name[40]; 2131 int err; 2132 const struct gpu_info_firmware_header_v1_0 *hdr; 2133 2134 adev->firmware.gpu_info_fw = NULL; 2135 2136 if (adev->mman.discovery_bin) { 2137 /* 2138 * FIXME: The bounding box is still needed by Navi12, so 2139 * temporarily read it from gpu_info firmware. Should be dropped 2140 * when DAL no longer needs it. 
2141 */ 2142 if (adev->asic_type != CHIP_NAVI12) 2143 return 0; 2144 } 2145 2146 switch (adev->asic_type) { 2147 default: 2148 return 0; 2149 case CHIP_VEGA10: 2150 chip_name = "vega10"; 2151 break; 2152 case CHIP_VEGA12: 2153 chip_name = "vega12"; 2154 break; 2155 case CHIP_RAVEN: 2156 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2157 chip_name = "raven2"; 2158 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2159 chip_name = "picasso"; 2160 else 2161 chip_name = "raven"; 2162 break; 2163 case CHIP_ARCTURUS: 2164 chip_name = "arcturus"; 2165 break; 2166 case CHIP_NAVI12: 2167 chip_name = "navi12"; 2168 break; 2169 } 2170 2171 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 2172 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name); 2173 if (err) { 2174 dev_err(adev->dev, 2175 "Failed to get gpu_info firmware \"%s\"\n", 2176 fw_name); 2177 goto out; 2178 } 2179 2180 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2181 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2182 2183 switch (hdr->version_major) { 2184 case 1: 2185 { 2186 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2187 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2188 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2189 2190 /* 2191 * Should be droped when DAL no longer needs it. 2192 */ 2193 if (adev->asic_type == CHIP_NAVI12) 2194 goto parse_soc_bounding_box; 2195 2196 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2197 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2198 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2199 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2200 adev->gfx.config.max_texture_channel_caches = 2201 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2202 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2203 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2204 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2205 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2206 adev->gfx.config.double_offchip_lds_buf = 2207 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2208 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2209 adev->gfx.cu_info.max_waves_per_simd = 2210 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2211 adev->gfx.cu_info.max_scratch_slots_per_cu = 2212 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2213 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2214 if (hdr->version_minor >= 1) { 2215 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2216 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2217 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2218 adev->gfx.config.num_sc_per_sh = 2219 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2220 adev->gfx.config.num_packer_per_sc = 2221 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2222 } 2223 2224 parse_soc_bounding_box: 2225 /* 2226 * soc bounding box info is not integrated in disocovery table, 2227 * we always need to parse it from gpu info firmware if needed. 
2228 */ 2229 if (hdr->version_minor == 2) { 2230 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2231 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2232 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2233 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2234 } 2235 break; 2236 } 2237 default: 2238 dev_err(adev->dev, 2239 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2240 err = -EINVAL; 2241 goto out; 2242 } 2243 out: 2244 return err; 2245 } 2246 2247 /** 2248 * amdgpu_device_ip_early_init - run early init for hardware IPs 2249 * 2250 * @adev: amdgpu_device pointer 2251 * 2252 * Early initialization pass for hardware IPs. The hardware IPs that make 2253 * up each asic are discovered each IP's early_init callback is run. This 2254 * is the first stage in initializing the asic. 2255 * Returns 0 on success, negative error code on failure. 2256 */ 2257 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2258 { 2259 struct drm_device *dev = adev_to_drm(adev); 2260 struct pci_dev *parent; 2261 int i, r; 2262 bool total; 2263 2264 amdgpu_device_enable_virtual_display(adev); 2265 2266 if (amdgpu_sriov_vf(adev)) { 2267 r = amdgpu_virt_request_full_gpu(adev, true); 2268 if (r) 2269 return r; 2270 } 2271 2272 switch (adev->asic_type) { 2273 #ifdef CONFIG_DRM_AMDGPU_SI 2274 case CHIP_VERDE: 2275 case CHIP_TAHITI: 2276 case CHIP_PITCAIRN: 2277 case CHIP_OLAND: 2278 case CHIP_HAINAN: 2279 adev->family = AMDGPU_FAMILY_SI; 2280 r = si_set_ip_blocks(adev); 2281 if (r) 2282 return r; 2283 break; 2284 #endif 2285 #ifdef CONFIG_DRM_AMDGPU_CIK 2286 case CHIP_BONAIRE: 2287 case CHIP_HAWAII: 2288 case CHIP_KAVERI: 2289 case CHIP_KABINI: 2290 case CHIP_MULLINS: 2291 if (adev->flags & AMD_IS_APU) 2292 adev->family = AMDGPU_FAMILY_KV; 2293 else 2294 adev->family = AMDGPU_FAMILY_CI; 2295 2296 r = cik_set_ip_blocks(adev); 2297 if (r) 2298 return r; 2299 break; 2300 #endif 2301 case CHIP_TOPAZ: 2302 case CHIP_TONGA: 2303 case CHIP_FIJI: 2304 case CHIP_POLARIS10: 2305 case CHIP_POLARIS11: 2306 case CHIP_POLARIS12: 2307 case CHIP_VEGAM: 2308 case CHIP_CARRIZO: 2309 case CHIP_STONEY: 2310 if (adev->flags & AMD_IS_APU) 2311 adev->family = AMDGPU_FAMILY_CZ; 2312 else 2313 adev->family = AMDGPU_FAMILY_VI; 2314 2315 r = vi_set_ip_blocks(adev); 2316 if (r) 2317 return r; 2318 break; 2319 default: 2320 r = amdgpu_discovery_set_ip_blocks(adev); 2321 if (r) 2322 return r; 2323 break; 2324 } 2325 2326 if (amdgpu_has_atpx() && 2327 (amdgpu_is_atpx_hybrid() || 2328 amdgpu_has_atpx_dgpu_power_cntl()) && 2329 ((adev->flags & AMD_IS_APU) == 0) && 2330 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev))) 2331 adev->flags |= AMD_IS_PX; 2332 2333 if (!(adev->flags & AMD_IS_APU)) { 2334 parent = pci_upstream_bridge(adev->pdev); 2335 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2336 } 2337 2338 2339 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2340 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2341 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2342 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2343 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2344 2345 total = true; 2346 for (i = 0; i < adev->num_ip_blocks; i++) { 2347 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2348 DRM_WARN("disabled ip block: %d <%s>\n", 2349 i, adev->ip_blocks[i].version->funcs->name); 2350 adev->ip_blocks[i].status.valid = false; 2351 } else { 2352 if (adev->ip_blocks[i].version->funcs->early_init) { 2353 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2354 if (r == -ENOENT) { 2355 adev->ip_blocks[i].status.valid = false; 2356 } else if (r) { 2357 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2358 adev->ip_blocks[i].version->funcs->name, r); 2359 total = false; 2360 } else { 2361 adev->ip_blocks[i].status.valid = true; 2362 } 2363 } else { 2364 adev->ip_blocks[i].status.valid = true; 2365 } 2366 } 2367 /* get the vbios after the asic_funcs are set up */ 2368 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2369 r = amdgpu_device_parse_gpu_info_fw(adev); 2370 if (r) 2371 return r; 2372 2373 /* Read BIOS */ 2374 if (amdgpu_device_read_bios(adev)) { 2375 if (!amdgpu_get_bios(adev)) 2376 return -EINVAL; 2377 2378 r = amdgpu_atombios_init(adev); 2379 if (r) { 2380 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2381 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2382 return r; 2383 } 2384 } 2385 2386 /*get pf2vf msg info at it's earliest time*/ 2387 if (amdgpu_sriov_vf(adev)) 2388 amdgpu_virt_init_data_exchange(adev); 2389 2390 } 2391 } 2392 if (!total) 2393 return -ENODEV; 2394 2395 amdgpu_amdkfd_device_probe(adev); 2396 adev->cg_flags &= amdgpu_cg_mask; 2397 adev->pg_flags &= amdgpu_pg_mask; 2398 2399 return 0; 2400 } 2401 2402 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2403 { 2404 int i, r; 2405 2406 for (i = 0; i < adev->num_ip_blocks; i++) { 2407 if (!adev->ip_blocks[i].status.sw) 2408 continue; 2409 if (adev->ip_blocks[i].status.hw) 2410 continue; 2411 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2412 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2413 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2414 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2415 if (r) { 2416 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2417 adev->ip_blocks[i].version->funcs->name, r); 2418 return r; 2419 } 2420 adev->ip_blocks[i].status.hw = true; 2421 } 2422 } 2423 2424 return 0; 2425 } 2426 2427 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2428 { 2429 int i, r; 2430 2431 for (i = 0; i < adev->num_ip_blocks; i++) { 2432 if (!adev->ip_blocks[i].status.sw) 2433 continue; 2434 if (adev->ip_blocks[i].status.hw) 2435 continue; 2436 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2437 if (r) { 2438 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2439 adev->ip_blocks[i].version->funcs->name, r); 2440 return r; 2441 } 2442 adev->ip_blocks[i].status.hw = true; 2443 } 2444 2445 return 0; 2446 } 2447 2448 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2449 { 2450 int r = 0; 2451 int i; 2452 uint32_t smu_version; 2453 2454 if (adev->asic_type >= CHIP_VEGA10) { 2455 for (i = 0; i < adev->num_ip_blocks; i++) { 2456 if 
(adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2457 continue; 2458 2459 if (!adev->ip_blocks[i].status.sw) 2460 continue; 2461 2462 /* no need to do the fw loading again if already done*/ 2463 if (adev->ip_blocks[i].status.hw == true) 2464 break; 2465 2466 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2467 r = adev->ip_blocks[i].version->funcs->resume(adev); 2468 if (r) { 2469 DRM_ERROR("resume of IP block <%s> failed %d\n", 2470 adev->ip_blocks[i].version->funcs->name, r); 2471 return r; 2472 } 2473 } else { 2474 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2475 if (r) { 2476 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2477 adev->ip_blocks[i].version->funcs->name, r); 2478 return r; 2479 } 2480 } 2481 2482 adev->ip_blocks[i].status.hw = true; 2483 break; 2484 } 2485 } 2486 2487 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2488 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2489 2490 return r; 2491 } 2492 2493 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2494 { 2495 long timeout; 2496 int r, i; 2497 2498 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2499 struct amdgpu_ring *ring = adev->rings[i]; 2500 2501 /* No need to setup the GPU scheduler for rings that don't need it */ 2502 if (!ring || ring->no_scheduler) 2503 continue; 2504 2505 switch (ring->funcs->type) { 2506 case AMDGPU_RING_TYPE_GFX: 2507 timeout = adev->gfx_timeout; 2508 break; 2509 case AMDGPU_RING_TYPE_COMPUTE: 2510 timeout = adev->compute_timeout; 2511 break; 2512 case AMDGPU_RING_TYPE_SDMA: 2513 timeout = adev->sdma_timeout; 2514 break; 2515 default: 2516 timeout = adev->video_timeout; 2517 break; 2518 } 2519 2520 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2521 ring->num_hw_submission, 0, 2522 timeout, adev->reset_domain->wq, 2523 ring->sched_score, ring->name, 2524 adev->dev); 2525 if (r) { 2526 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2527 ring->name); 2528 return r; 2529 } 2530 } 2531 2532 amdgpu_xcp_update_partition_sched_list(adev); 2533 2534 return 0; 2535 } 2536 2537 2538 /** 2539 * amdgpu_device_ip_init - run init for hardware IPs 2540 * 2541 * @adev: amdgpu_device pointer 2542 * 2543 * Main initialization pass for hardware IPs. The list of all the hardware 2544 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2545 * are run. sw_init initializes the software state associated with each IP 2546 * and hw_init initializes the hardware associated with each IP. 2547 * Returns 0 on success, negative error code on failure. 
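 *
 * The ordering implemented below is roughly: sw_init for every valid IP
 * (COMMON and GMC also get their hw_init early so GPU memory can be
 * allocated), then IB pool and ucode BO creation, hw_init phase 1
 * (COMMON/IH, plus PSP under SR-IOV), firmware loading, and finally
 * hw_init phase 2 for the remaining blocks.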
2548 */ 2549 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2550 { 2551 int i, r; 2552 2553 r = amdgpu_ras_init(adev); 2554 if (r) 2555 return r; 2556 2557 for (i = 0; i < adev->num_ip_blocks; i++) { 2558 if (!adev->ip_blocks[i].status.valid) 2559 continue; 2560 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2561 if (r) { 2562 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2563 adev->ip_blocks[i].version->funcs->name, r); 2564 goto init_failed; 2565 } 2566 adev->ip_blocks[i].status.sw = true; 2567 2568 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2569 /* need to do common hw init early so everything is set up for gmc */ 2570 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2571 if (r) { 2572 DRM_ERROR("hw_init %d failed %d\n", i, r); 2573 goto init_failed; 2574 } 2575 adev->ip_blocks[i].status.hw = true; 2576 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2577 /* need to do gmc hw init early so we can allocate gpu mem */ 2578 /* Try to reserve bad pages early */ 2579 if (amdgpu_sriov_vf(adev)) 2580 amdgpu_virt_exchange_data(adev); 2581 2582 r = amdgpu_device_mem_scratch_init(adev); 2583 if (r) { 2584 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2585 goto init_failed; 2586 } 2587 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2588 if (r) { 2589 DRM_ERROR("hw_init %d failed %d\n", i, r); 2590 goto init_failed; 2591 } 2592 r = amdgpu_device_wb_init(adev); 2593 if (r) { 2594 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2595 goto init_failed; 2596 } 2597 adev->ip_blocks[i].status.hw = true; 2598 2599 /* right after GMC hw init, we create CSA */ 2600 if (adev->gfx.mcbp) { 2601 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2602 AMDGPU_GEM_DOMAIN_VRAM | 2603 AMDGPU_GEM_DOMAIN_GTT, 2604 AMDGPU_CSA_SIZE); 2605 if (r) { 2606 DRM_ERROR("allocate CSA failed %d\n", r); 2607 goto init_failed; 2608 } 2609 } 2610 } 2611 } 2612 2613 if (amdgpu_sriov_vf(adev)) 2614 amdgpu_virt_init_data_exchange(adev); 2615 2616 r = amdgpu_ib_pool_init(adev); 2617 if (r) { 2618 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2619 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2620 goto init_failed; 2621 } 2622 2623 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2624 if (r) 2625 goto init_failed; 2626 2627 r = amdgpu_device_ip_hw_init_phase1(adev); 2628 if (r) 2629 goto init_failed; 2630 2631 r = amdgpu_device_fw_loading(adev); 2632 if (r) 2633 goto init_failed; 2634 2635 r = amdgpu_device_ip_hw_init_phase2(adev); 2636 if (r) 2637 goto init_failed; 2638 2639 /* 2640 * retired pages will be loaded from eeprom and reserved here, 2641 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2642 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2643 * for I2C communication which only true at this point. 2644 * 2645 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2646 * failure from bad gpu situation and stop amdgpu init process 2647 * accordingly. For other failed cases, it will still release all 2648 * the resource and print error message, rather than returning one 2649 * negative value to upper level. 
2650 * 2651 * Note: theoretically, this should be called before all vram allocations 2652 * to protect retired page from abusing 2653 */ 2654 r = amdgpu_ras_recovery_init(adev); 2655 if (r) 2656 goto init_failed; 2657 2658 /** 2659 * In case of XGMI grab extra reference for reset domain for this device 2660 */ 2661 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2662 if (amdgpu_xgmi_add_device(adev) == 0) { 2663 if (!amdgpu_sriov_vf(adev)) { 2664 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2665 2666 if (WARN_ON(!hive)) { 2667 r = -ENOENT; 2668 goto init_failed; 2669 } 2670 2671 if (!hive->reset_domain || 2672 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2673 r = -ENOENT; 2674 amdgpu_put_xgmi_hive(hive); 2675 goto init_failed; 2676 } 2677 2678 /* Drop the early temporary reset domain we created for device */ 2679 amdgpu_reset_put_reset_domain(adev->reset_domain); 2680 adev->reset_domain = hive->reset_domain; 2681 amdgpu_put_xgmi_hive(hive); 2682 } 2683 } 2684 } 2685 2686 r = amdgpu_device_init_schedulers(adev); 2687 if (r) 2688 goto init_failed; 2689 2690 /* Don't init kfd if whole hive need to be reset during init */ 2691 if (!adev->gmc.xgmi.pending_reset) { 2692 kgd2kfd_init_zone_device(adev); 2693 amdgpu_amdkfd_device_init(adev); 2694 } 2695 2696 amdgpu_fru_get_product_info(adev); 2697 2698 init_failed: 2699 2700 return r; 2701 } 2702 2703 /** 2704 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2705 * 2706 * @adev: amdgpu_device pointer 2707 * 2708 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2709 * this function before a GPU reset. If the value is retained after a 2710 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 2711 */ 2712 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2713 { 2714 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2715 } 2716 2717 /** 2718 * amdgpu_device_check_vram_lost - check if vram is valid 2719 * 2720 * @adev: amdgpu_device pointer 2721 * 2722 * Checks the reset magic value written to the gart pointer in VRAM. 2723 * The driver calls this after a GPU reset to see if the contents of 2724 * VRAM is lost or now. 2725 * returns true if vram is lost, false if not. 2726 */ 2727 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2728 { 2729 if (memcmp(adev->gart.ptr, adev->reset_magic, 2730 AMDGPU_RESET_MAGIC_NUM)) 2731 return true; 2732 2733 if (!amdgpu_in_reset(adev)) 2734 return false; 2735 2736 /* 2737 * For all ASICs with baco/mode1 reset, the VRAM is 2738 * always assumed to be lost. 2739 */ 2740 switch (amdgpu_asic_reset_method(adev)) { 2741 case AMD_RESET_METHOD_BACO: 2742 case AMD_RESET_METHOD_MODE1: 2743 return true; 2744 default: 2745 return false; 2746 } 2747 } 2748 2749 /** 2750 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2751 * 2752 * @adev: amdgpu_device pointer 2753 * @state: clockgating state (gate or ungate) 2754 * 2755 * The list of all the hardware IPs that make up the asic is walked and the 2756 * set_clockgating_state callbacks are run. 2757 * Late initialization pass enabling clockgating for hardware IPs. 2758 * Fini or suspend, pass disabling clockgating for hardware IPs. 2759 * Returns 0 on success, negative error code on failure. 
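 *
 * Typical calls, as used elsewhere in this file:
 *
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);     (late init)
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);   (fini/suspend)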
2760 */ 2761 2762 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2763 enum amd_clockgating_state state) 2764 { 2765 int i, j, r; 2766 2767 if (amdgpu_emu_mode == 1) 2768 return 0; 2769 2770 for (j = 0; j < adev->num_ip_blocks; j++) { 2771 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2772 if (!adev->ip_blocks[i].status.late_initialized) 2773 continue; 2774 /* skip CG for GFX, SDMA on S0ix */ 2775 if (adev->in_s0ix && 2776 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2777 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2778 continue; 2779 /* skip CG for VCE/UVD, it's handled specially */ 2780 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2781 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2782 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2783 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2784 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2785 /* enable clockgating to save power */ 2786 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2787 state); 2788 if (r) { 2789 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2790 adev->ip_blocks[i].version->funcs->name, r); 2791 return r; 2792 } 2793 } 2794 } 2795 2796 return 0; 2797 } 2798 2799 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2800 enum amd_powergating_state state) 2801 { 2802 int i, j, r; 2803 2804 if (amdgpu_emu_mode == 1) 2805 return 0; 2806 2807 for (j = 0; j < adev->num_ip_blocks; j++) { 2808 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2809 if (!adev->ip_blocks[i].status.late_initialized) 2810 continue; 2811 /* skip PG for GFX, SDMA on S0ix */ 2812 if (adev->in_s0ix && 2813 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2814 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2815 continue; 2816 /* skip CG for VCE/UVD, it's handled specially */ 2817 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2818 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2819 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2820 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2821 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2822 /* enable powergating to save power */ 2823 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2824 state); 2825 if (r) { 2826 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2827 adev->ip_blocks[i].version->funcs->name, r); 2828 return r; 2829 } 2830 } 2831 } 2832 return 0; 2833 } 2834 2835 static int amdgpu_device_enable_mgpu_fan_boost(void) 2836 { 2837 struct amdgpu_gpu_instance *gpu_ins; 2838 struct amdgpu_device *adev; 2839 int i, ret = 0; 2840 2841 mutex_lock(&mgpu_info.mutex); 2842 2843 /* 2844 * MGPU fan boost feature should be enabled 2845 * only when there are two or more dGPUs in 2846 * the system 2847 */ 2848 if (mgpu_info.num_dgpu < 2) 2849 goto out; 2850 2851 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2852 gpu_ins = &(mgpu_info.gpu_ins[i]); 2853 adev = gpu_ins->adev; 2854 if (!(adev->flags & AMD_IS_APU) && 2855 !gpu_ins->mgpu_fan_enabled) { 2856 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2857 if (ret) 2858 break; 2859 2860 gpu_ins->mgpu_fan_enabled = 1; 2861 } 2862 } 2863 2864 out: 2865 mutex_unlock(&mgpu_info.mutex); 2866 2867 return ret; 2868 } 2869 2870 /** 2871 * amdgpu_device_ip_late_init - run late init for hardware IPs 2872 * 2873 * @adev: 
amdgpu_device pointer 2874 * 2875 * Late initialization pass for hardware IPs. The list of all the hardware 2876 * IPs that make up the asic is walked and the late_init callbacks are run. 2877 * late_init covers any special initialization that an IP requires 2878 * after all of them have been initialized or something that needs to happen 2879 * late in the init process. 2880 * Returns 0 on success, negative error code on failure. 2881 */ 2882 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2883 { 2884 struct amdgpu_gpu_instance *gpu_instance; 2885 int i = 0, r; 2886 2887 for (i = 0; i < adev->num_ip_blocks; i++) { 2888 if (!adev->ip_blocks[i].status.hw) 2889 continue; 2890 if (adev->ip_blocks[i].version->funcs->late_init) { 2891 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2892 if (r) { 2893 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2894 adev->ip_blocks[i].version->funcs->name, r); 2895 return r; 2896 } 2897 } 2898 adev->ip_blocks[i].status.late_initialized = true; 2899 } 2900 2901 r = amdgpu_ras_late_init(adev); 2902 if (r) { 2903 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 2904 return r; 2905 } 2906 2907 amdgpu_ras_set_error_query_ready(adev, true); 2908 2909 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2910 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2911 2912 amdgpu_device_fill_reset_magic(adev); 2913 2914 r = amdgpu_device_enable_mgpu_fan_boost(); 2915 if (r) 2916 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2917 2918 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */ 2919 if (amdgpu_passthrough(adev) && 2920 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 2921 adev->asic_type == CHIP_ALDEBARAN)) 2922 amdgpu_dpm_handle_passthrough_sbr(adev, true); 2923 2924 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2925 mutex_lock(&mgpu_info.mutex); 2926 2927 /* 2928 * Reset device p-state to low as this was booted with high. 2929 * 2930 * This should be performed only after all devices from the same 2931 * hive get initialized. 2932 * 2933 * However, the number of devices in the hive is not known in advance, 2934 * as it is counted one by one during device initialization. 2935 * 2936 * So, we wait for all XGMI interlinked devices to be initialized. 2937 * This may bring some delays as those devices may come from 2938 * different hives. But that should be OK.
2939 */ 2940 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2941 for (i = 0; i < mgpu_info.num_gpu; i++) { 2942 gpu_instance = &(mgpu_info.gpu_ins[i]); 2943 if (gpu_instance->adev->flags & AMD_IS_APU) 2944 continue; 2945 2946 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2947 AMDGPU_XGMI_PSTATE_MIN); 2948 if (r) { 2949 DRM_ERROR("pstate setting failed (%d).\n", r); 2950 break; 2951 } 2952 } 2953 } 2954 2955 mutex_unlock(&mgpu_info.mutex); 2956 } 2957 2958 return 0; 2959 } 2960 2961 /** 2962 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2963 * 2964 * @adev: amdgpu_device pointer 2965 * 2966 * For ASICs need to disable SMC first 2967 */ 2968 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2969 { 2970 int i, r; 2971 2972 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) 2973 return; 2974 2975 for (i = 0; i < adev->num_ip_blocks; i++) { 2976 if (!adev->ip_blocks[i].status.hw) 2977 continue; 2978 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2979 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2980 /* XXX handle errors */ 2981 if (r) { 2982 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2983 adev->ip_blocks[i].version->funcs->name, r); 2984 } 2985 adev->ip_blocks[i].status.hw = false; 2986 break; 2987 } 2988 } 2989 } 2990 2991 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2992 { 2993 int i, r; 2994 2995 for (i = 0; i < adev->num_ip_blocks; i++) { 2996 if (!adev->ip_blocks[i].version->funcs->early_fini) 2997 continue; 2998 2999 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 3000 if (r) { 3001 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3002 adev->ip_blocks[i].version->funcs->name, r); 3003 } 3004 } 3005 3006 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3007 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3008 3009 amdgpu_amdkfd_suspend(adev, false); 3010 3011 /* Workaroud for ASICs need to disable SMC first */ 3012 amdgpu_device_smu_fini_early(adev); 3013 3014 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3015 if (!adev->ip_blocks[i].status.hw) 3016 continue; 3017 3018 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 3019 /* XXX handle errors */ 3020 if (r) { 3021 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3022 adev->ip_blocks[i].version->funcs->name, r); 3023 } 3024 3025 adev->ip_blocks[i].status.hw = false; 3026 } 3027 3028 if (amdgpu_sriov_vf(adev)) { 3029 if (amdgpu_virt_release_full_gpu(adev, false)) 3030 DRM_ERROR("failed to release exclusive mode on fini\n"); 3031 } 3032 3033 return 0; 3034 } 3035 3036 /** 3037 * amdgpu_device_ip_fini - run fini for hardware IPs 3038 * 3039 * @adev: amdgpu_device pointer 3040 * 3041 * Main teardown pass for hardware IPs. The list of all the hardware 3042 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3043 * are run. hw_fini tears down the hardware associated with each IP 3044 * and sw_fini tears down any software state associated with each IP. 3045 * Returns 0 on success, negative error code on failure. 
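 *
 * Teardown below runs in reverse IP order: sw_fini for each block (the
 * GMC block additionally releases the ucode BO, static CSA, writeback,
 * scratch memory and the IB pool), followed by late_fini for every block
 * that completed late init.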
3046 */ 3047 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3048 { 3049 int i, r; 3050 3051 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3052 amdgpu_virt_release_ras_err_handler_data(adev); 3053 3054 if (adev->gmc.xgmi.num_physical_nodes > 1) 3055 amdgpu_xgmi_remove_device(adev); 3056 3057 amdgpu_amdkfd_device_fini_sw(adev); 3058 3059 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3060 if (!adev->ip_blocks[i].status.sw) 3061 continue; 3062 3063 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3064 amdgpu_ucode_free_bo(adev); 3065 amdgpu_free_static_csa(&adev->virt.csa_obj); 3066 amdgpu_device_wb_fini(adev); 3067 amdgpu_device_mem_scratch_fini(adev); 3068 amdgpu_ib_pool_fini(adev); 3069 } 3070 3071 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 3072 /* XXX handle errors */ 3073 if (r) { 3074 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3075 adev->ip_blocks[i].version->funcs->name, r); 3076 } 3077 adev->ip_blocks[i].status.sw = false; 3078 adev->ip_blocks[i].status.valid = false; 3079 } 3080 3081 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3082 if (!adev->ip_blocks[i].status.late_initialized) 3083 continue; 3084 if (adev->ip_blocks[i].version->funcs->late_fini) 3085 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 3086 adev->ip_blocks[i].status.late_initialized = false; 3087 } 3088 3089 amdgpu_ras_fini(adev); 3090 3091 return 0; 3092 } 3093 3094 /** 3095 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3096 * 3097 * @work: work_struct. 3098 */ 3099 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3100 { 3101 struct amdgpu_device *adev = 3102 container_of(work, struct amdgpu_device, delayed_init_work.work); 3103 int r; 3104 3105 r = amdgpu_ib_ring_tests(adev); 3106 if (r) 3107 DRM_ERROR("ib ring test failed (%d).\n", r); 3108 } 3109 3110 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3111 { 3112 struct amdgpu_device *adev = 3113 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3114 3115 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3116 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3117 3118 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 3119 adev->gfx.gfx_off_state = true; 3120 } 3121 3122 /** 3123 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3124 * 3125 * @adev: amdgpu_device pointer 3126 * 3127 * Main suspend function for hardware IPs. The list of all the hardware 3128 * IPs that make up the asic is walked, clockgating is disabled and the 3129 * suspend callbacks are run. suspend puts the hardware and software state 3130 * in each IP into a state suitable for suspend. 3131 * Returns 0 on success, negative error code on failure. 3132 */ 3133 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3134 { 3135 int i, r; 3136 3137 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3138 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3139 3140 /* 3141 * Per PMFW team's suggestion, driver needs to handle gfxoff 3142 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3143 * scenario. Add the missing df cstate disablement here. 
3144 */ 3145 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3146 dev_warn(adev->dev, "Failed to disallow df cstate"); 3147 3148 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3149 if (!adev->ip_blocks[i].status.valid) 3150 continue; 3151 3152 /* displays are handled separately */ 3153 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3154 continue; 3155 3156 /* XXX handle errors */ 3157 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3158 /* XXX handle errors */ 3159 if (r) { 3160 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3161 adev->ip_blocks[i].version->funcs->name, r); 3162 return r; 3163 } 3164 3165 adev->ip_blocks[i].status.hw = false; 3166 } 3167 3168 return 0; 3169 } 3170 3171 /** 3172 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3173 * 3174 * @adev: amdgpu_device pointer 3175 * 3176 * Main suspend function for hardware IPs. The list of all the hardware 3177 * IPs that make up the asic is walked, clockgating is disabled and the 3178 * suspend callbacks are run. suspend puts the hardware and software state 3179 * in each IP into a state suitable for suspend. 3180 * Returns 0 on success, negative error code on failure. 3181 */ 3182 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3183 { 3184 int i, r; 3185 3186 if (adev->in_s0ix) 3187 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3188 3189 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3190 if (!adev->ip_blocks[i].status.valid) 3191 continue; 3192 /* displays are handled in phase1 */ 3193 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3194 continue; 3195 /* PSP lost connection when err_event_athub occurs */ 3196 if (amdgpu_ras_intr_triggered() && 3197 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3198 adev->ip_blocks[i].status.hw = false; 3199 continue; 3200 } 3201 3202 /* skip unnecessary suspend if we do not initialize them yet */ 3203 if (adev->gmc.xgmi.pending_reset && 3204 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3205 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3206 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3207 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3208 adev->ip_blocks[i].status.hw = false; 3209 continue; 3210 } 3211 3212 /* skip suspend of gfx/mes and psp for S0ix 3213 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3214 * like at runtime. PSP is also part of the always on hardware 3215 * so no need to suspend it. 3216 */ 3217 if (adev->in_s0ix && 3218 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3219 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3220 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3221 continue; 3222 3223 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3224 if (adev->in_s0ix && 3225 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) && 3226 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3227 continue; 3228 3229 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3230 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3231 * from this location and RLC Autoload automatically also gets loaded 3232 * from here based on PMFW -> PSP message during re-init sequence. 3233 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3234 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3235 */ 3236 if (amdgpu_in_reset(adev) && 3237 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3238 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3239 continue; 3240 3241 /* XXX handle errors */ 3242 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3243 /* XXX handle errors */ 3244 if (r) { 3245 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3246 adev->ip_blocks[i].version->funcs->name, r); 3247 } 3248 adev->ip_blocks[i].status.hw = false; 3249 /* handle putting the SMC in the appropriate state */ 3250 if (!amdgpu_sriov_vf(adev)) { 3251 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3252 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3253 if (r) { 3254 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3255 adev->mp1_state, r); 3256 return r; 3257 } 3258 } 3259 } 3260 } 3261 3262 return 0; 3263 } 3264 3265 /** 3266 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3267 * 3268 * @adev: amdgpu_device pointer 3269 * 3270 * Main suspend function for hardware IPs. The list of all the hardware 3271 * IPs that make up the asic is walked, clockgating is disabled and the 3272 * suspend callbacks are run. suspend puts the hardware and software state 3273 * in each IP into a state suitable for suspend. 3274 * Returns 0 on success, negative error code on failure. 3275 */ 3276 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3277 { 3278 int r; 3279 3280 if (amdgpu_sriov_vf(adev)) { 3281 amdgpu_virt_fini_data_exchange(adev); 3282 amdgpu_virt_request_full_gpu(adev, false); 3283 } 3284 3285 r = amdgpu_device_ip_suspend_phase1(adev); 3286 if (r) 3287 return r; 3288 r = amdgpu_device_ip_suspend_phase2(adev); 3289 3290 if (amdgpu_sriov_vf(adev)) 3291 amdgpu_virt_release_full_gpu(adev, false); 3292 3293 return r; 3294 } 3295 3296 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3297 { 3298 int i, r; 3299 3300 static enum amd_ip_block_type ip_order[] = { 3301 AMD_IP_BLOCK_TYPE_COMMON, 3302 AMD_IP_BLOCK_TYPE_GMC, 3303 AMD_IP_BLOCK_TYPE_PSP, 3304 AMD_IP_BLOCK_TYPE_IH, 3305 }; 3306 3307 for (i = 0; i < adev->num_ip_blocks; i++) { 3308 int j; 3309 struct amdgpu_ip_block *block; 3310 3311 block = &adev->ip_blocks[i]; 3312 block->status.hw = false; 3313 3314 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3315 3316 if (block->version->type != ip_order[j] || 3317 !block->status.valid) 3318 continue; 3319 3320 r = block->version->funcs->hw_init(adev); 3321 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3322 if (r) 3323 return r; 3324 block->status.hw = true; 3325 } 3326 } 3327 3328 return 0; 3329 } 3330 3331 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3332 { 3333 int i, r; 3334 3335 static enum amd_ip_block_type ip_order[] = { 3336 AMD_IP_BLOCK_TYPE_SMC, 3337 AMD_IP_BLOCK_TYPE_DCE, 3338 AMD_IP_BLOCK_TYPE_GFX, 3339 AMD_IP_BLOCK_TYPE_SDMA, 3340 AMD_IP_BLOCK_TYPE_MES, 3341 AMD_IP_BLOCK_TYPE_UVD, 3342 AMD_IP_BLOCK_TYPE_VCE, 3343 AMD_IP_BLOCK_TYPE_VCN, 3344 AMD_IP_BLOCK_TYPE_JPEG 3345 }; 3346 3347 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3348 int j; 3349 struct amdgpu_ip_block *block; 3350 3351 for (j = 0; j < adev->num_ip_blocks; j++) { 3352 block = &adev->ip_blocks[j]; 3353 3354 if (block->version->type != ip_order[i] || 3355 !block->status.valid || 3356 block->status.hw) 3357 continue; 3358 3359 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3360 r = block->version->funcs->resume(adev); 3361 else 3362 r = block->version->funcs->hw_init(adev); 3363 3364 
DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3365 if (r) 3366 return r; 3367 block->status.hw = true; 3368 } 3369 } 3370 3371 return 0; 3372 } 3373 3374 /** 3375 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3376 * 3377 * @adev: amdgpu_device pointer 3378 * 3379 * First resume function for hardware IPs. The list of all the hardware 3380 * IPs that make up the asic is walked and the resume callbacks are run for 3381 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3382 * after a suspend and updates the software state as necessary. This 3383 * function is also used for restoring the GPU after a GPU reset. 3384 * Returns 0 on success, negative error code on failure. 3385 */ 3386 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3387 { 3388 int i, r; 3389 3390 for (i = 0; i < adev->num_ip_blocks; i++) { 3391 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3392 continue; 3393 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3394 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3395 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3396 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3397 3398 r = adev->ip_blocks[i].version->funcs->resume(adev); 3399 if (r) { 3400 DRM_ERROR("resume of IP block <%s> failed %d\n", 3401 adev->ip_blocks[i].version->funcs->name, r); 3402 return r; 3403 } 3404 adev->ip_blocks[i].status.hw = true; 3405 } 3406 } 3407 3408 return 0; 3409 } 3410 3411 /** 3412 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3413 * 3414 * @adev: amdgpu_device pointer 3415 * 3416 * First resume function for hardware IPs. The list of all the hardware 3417 * IPs that make up the asic is walked and the resume callbacks are run for 3418 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3419 * functional state after a suspend and updates the software state as 3420 * necessary. This function is also used for restoring the GPU after a GPU 3421 * reset. 3422 * Returns 0 on success, negative error code on failure. 3423 */ 3424 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3425 { 3426 int i, r; 3427 3428 for (i = 0; i < adev->num_ip_blocks; i++) { 3429 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3430 continue; 3431 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3432 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3433 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3434 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3435 continue; 3436 r = adev->ip_blocks[i].version->funcs->resume(adev); 3437 if (r) { 3438 DRM_ERROR("resume of IP block <%s> failed %d\n", 3439 adev->ip_blocks[i].version->funcs->name, r); 3440 return r; 3441 } 3442 adev->ip_blocks[i].status.hw = true; 3443 } 3444 3445 return 0; 3446 } 3447 3448 /** 3449 * amdgpu_device_ip_resume - run resume for hardware IPs 3450 * 3451 * @adev: amdgpu_device pointer 3452 * 3453 * Main resume function for hardware IPs. The hardware IPs 3454 * are split into two resume functions because they are 3455 * are also used in in recovering from a GPU reset and some additional 3456 * steps need to be take between them. In this case (S3/S4) they are 3457 * run sequentially. 3458 * Returns 0 on success, negative error code on failure. 
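 *
 * Concretely (see below): phase 1 resumes COMMON, GMC and IH (plus PSP
 * under SR-IOV), firmware is then reloaded, and phase 2 resumes the
 * remaining IP blocks.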
3459 */ 3460 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3461 { 3462 int r; 3463 3464 if (!adev->in_s0ix) { 3465 r = amdgpu_amdkfd_resume_iommu(adev); 3466 if (r) 3467 return r; 3468 } 3469 3470 r = amdgpu_device_ip_resume_phase1(adev); 3471 if (r) 3472 return r; 3473 3474 r = amdgpu_device_fw_loading(adev); 3475 if (r) 3476 return r; 3477 3478 r = amdgpu_device_ip_resume_phase2(adev); 3479 3480 return r; 3481 } 3482 3483 /** 3484 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3485 * 3486 * @adev: amdgpu_device pointer 3487 * 3488 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3489 */ 3490 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3491 { 3492 if (amdgpu_sriov_vf(adev)) { 3493 if (adev->is_atom_fw) { 3494 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3495 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3496 } else { 3497 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3498 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3499 } 3500 3501 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3502 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3503 } 3504 } 3505 3506 /** 3507 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3508 * 3509 * @asic_type: AMD asic type 3510 * 3511 * Check if there is DC (new modesetting infrastructre) support for an asic. 3512 * returns true if DC has support, false if not. 3513 */ 3514 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3515 { 3516 switch (asic_type) { 3517 #ifdef CONFIG_DRM_AMDGPU_SI 3518 case CHIP_HAINAN: 3519 #endif 3520 case CHIP_TOPAZ: 3521 /* chips with no display hardware */ 3522 return false; 3523 #if defined(CONFIG_DRM_AMD_DC) 3524 case CHIP_TAHITI: 3525 case CHIP_PITCAIRN: 3526 case CHIP_VERDE: 3527 case CHIP_OLAND: 3528 /* 3529 * We have systems in the wild with these ASICs that require 3530 * LVDS and VGA support which is not supported with DC. 3531 * 3532 * Fallback to the non-DC driver here by default so as not to 3533 * cause regressions. 3534 */ 3535 #if defined(CONFIG_DRM_AMD_DC_SI) 3536 return amdgpu_dc > 0; 3537 #else 3538 return false; 3539 #endif 3540 case CHIP_BONAIRE: 3541 case CHIP_KAVERI: 3542 case CHIP_KABINI: 3543 case CHIP_MULLINS: 3544 /* 3545 * We have systems in the wild with these ASICs that require 3546 * VGA support which is not supported with DC. 3547 * 3548 * Fallback to the non-DC driver here by default so as not to 3549 * cause regressions. 
3550 */ 3551 return amdgpu_dc > 0; 3552 default: 3553 return amdgpu_dc != 0; 3554 #else 3555 default: 3556 if (amdgpu_dc > 0) 3557 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3558 "but isn't supported by ASIC, ignoring\n"); 3559 return false; 3560 #endif 3561 } 3562 } 3563 3564 /** 3565 * amdgpu_device_has_dc_support - check if dc is supported 3566 * 3567 * @adev: amdgpu_device pointer 3568 * 3569 * Returns true for supported, false for not supported 3570 */ 3571 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3572 { 3573 if (adev->enable_virtual_display || 3574 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3575 return false; 3576 3577 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3578 } 3579 3580 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3581 { 3582 struct amdgpu_device *adev = 3583 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3584 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3585 3586 /* It's a bug to not have a hive within this function */ 3587 if (WARN_ON(!hive)) 3588 return; 3589 3590 /* 3591 * Use task barrier to synchronize all xgmi reset works across the 3592 * hive. task_barrier_enter and task_barrier_exit will block 3593 * until all the threads running the xgmi reset works reach 3594 * those points. task_barrier_full will do both blocks. 3595 */ 3596 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3597 3598 task_barrier_enter(&hive->tb); 3599 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3600 3601 if (adev->asic_reset_res) 3602 goto fail; 3603 3604 task_barrier_exit(&hive->tb); 3605 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3606 3607 if (adev->asic_reset_res) 3608 goto fail; 3609 3610 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3611 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3612 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3613 } else { 3614 3615 task_barrier_full(&hive->tb); 3616 adev->asic_reset_res = amdgpu_asic_reset(adev); 3617 } 3618 3619 fail: 3620 if (adev->asic_reset_res) 3621 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3622 adev->asic_reset_res, adev_to_drm(adev)->unique); 3623 amdgpu_put_xgmi_hive(hive); 3624 } 3625 3626 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3627 { 3628 char *input = amdgpu_lockup_timeout; 3629 char *timeout_setting = NULL; 3630 int index = 0; 3631 long timeout; 3632 int ret = 0; 3633 3634 /* 3635 * By default timeout for non compute jobs is 10000 3636 * and 60000 for compute jobs. 3637 * In SR-IOV or passthrough mode, timeout for compute 3638 * jobs are 60000 by default. 3639 */ 3640 adev->gfx_timeout = msecs_to_jiffies(10000); 3641 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3642 if (amdgpu_sriov_vf(adev)) 3643 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3644 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3645 else 3646 adev->compute_timeout = msecs_to_jiffies(60000); 3647 3648 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3649 while ((timeout_setting = strsep(&input, ",")) && 3650 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3651 ret = kstrtol(timeout_setting, 0, &timeout); 3652 if (ret) 3653 return ret; 3654 3655 if (timeout == 0) { 3656 index++; 3657 continue; 3658 } else if (timeout < 0) { 3659 timeout = MAX_SCHEDULE_TIMEOUT; 3660 dev_warn(adev->dev, "lockup timeout disabled"); 3661 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3662 } else { 3663 timeout = msecs_to_jiffies(timeout); 3664 } 3665 3666 switch (index++) { 3667 case 0: 3668 adev->gfx_timeout = timeout; 3669 break; 3670 case 1: 3671 adev->compute_timeout = timeout; 3672 break; 3673 case 2: 3674 adev->sdma_timeout = timeout; 3675 break; 3676 case 3: 3677 adev->video_timeout = timeout; 3678 break; 3679 default: 3680 break; 3681 } 3682 } 3683 /* 3684 * There is only one value specified and 3685 * it should apply to all non-compute jobs. 3686 */ 3687 if (index == 1) { 3688 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3689 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3690 adev->compute_timeout = adev->gfx_timeout; 3691 } 3692 } 3693 3694 return ret; 3695 } 3696 3697 /** 3698 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3699 * 3700 * @adev: amdgpu_device pointer 3701 * 3702 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3703 */ 3704 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3705 { 3706 struct iommu_domain *domain; 3707 3708 domain = iommu_get_domain_for_dev(adev->dev); 3709 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3710 adev->ram_is_direct_mapped = true; 3711 } 3712 3713 static const struct attribute *amdgpu_dev_attributes[] = { 3714 &dev_attr_product_name.attr, 3715 &dev_attr_product_number.attr, 3716 &dev_attr_serial_number.attr, 3717 &dev_attr_pcie_replay_count.attr, 3718 NULL 3719 }; 3720 3721 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 3722 { 3723 if (amdgpu_mcbp == 1) 3724 adev->gfx.mcbp = true; 3725 3726 if ((adev->ip_versions[GC_HWIP][0] >= IP_VERSION(9, 0, 0)) && 3727 (adev->ip_versions[GC_HWIP][0] < IP_VERSION(10, 0, 0)) && 3728 adev->gfx.num_gfx_rings) 3729 adev->gfx.mcbp = true; 3730 3731 if (amdgpu_sriov_vf(adev)) 3732 adev->gfx.mcbp = true; 3733 3734 if (adev->gfx.mcbp) 3735 DRM_INFO("MCBP is enabled\n"); 3736 } 3737 3738 /** 3739 * amdgpu_device_init - initialize the driver 3740 * 3741 * @adev: amdgpu_device pointer 3742 * @flags: driver flags 3743 * 3744 * Initializes the driver info and hw (all asics). 3745 * Returns 0 for success or an error on failure. 3746 * Called at driver startup. 
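 *
 * Illustrative call from a probe path (a sketch; the real caller and the
 * origin of @flags live outside this file):
 *
 *   r = amdgpu_device_init(adev, flags);
 *   if (r)
 *           return r;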
3747 */ 3748 int amdgpu_device_init(struct amdgpu_device *adev, 3749 uint32_t flags) 3750 { 3751 struct drm_device *ddev = adev_to_drm(adev); 3752 struct pci_dev *pdev = adev->pdev; 3753 int r, i; 3754 bool px = false; 3755 u32 max_MBps; 3756 int tmp; 3757 3758 adev->shutdown = false; 3759 adev->flags = flags; 3760 3761 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3762 adev->asic_type = amdgpu_force_asic_type; 3763 else 3764 adev->asic_type = flags & AMD_ASIC_MASK; 3765 3766 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3767 if (amdgpu_emu_mode == 1) 3768 adev->usec_timeout *= 10; 3769 adev->gmc.gart_size = 512 * 1024 * 1024; 3770 adev->accel_working = false; 3771 adev->num_rings = 0; 3772 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3773 adev->mman.buffer_funcs = NULL; 3774 adev->mman.buffer_funcs_ring = NULL; 3775 adev->vm_manager.vm_pte_funcs = NULL; 3776 adev->vm_manager.vm_pte_num_scheds = 0; 3777 adev->gmc.gmc_funcs = NULL; 3778 adev->harvest_ip_mask = 0x0; 3779 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3780 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3781 3782 adev->smc_rreg = &amdgpu_invalid_rreg; 3783 adev->smc_wreg = &amdgpu_invalid_wreg; 3784 adev->pcie_rreg = &amdgpu_invalid_rreg; 3785 adev->pcie_wreg = &amdgpu_invalid_wreg; 3786 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 3787 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 3788 adev->pciep_rreg = &amdgpu_invalid_rreg; 3789 adev->pciep_wreg = &amdgpu_invalid_wreg; 3790 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3791 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3792 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3793 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3794 adev->didt_rreg = &amdgpu_invalid_rreg; 3795 adev->didt_wreg = &amdgpu_invalid_wreg; 3796 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3797 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3798 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3799 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3800 3801 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3802 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3803 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3804 3805 /* mutex initialization are all done here so we 3806 * can recall function without having locking issues */ 3807 mutex_init(&adev->firmware.mutex); 3808 mutex_init(&adev->pm.mutex); 3809 mutex_init(&adev->gfx.gpu_clock_mutex); 3810 mutex_init(&adev->srbm_mutex); 3811 mutex_init(&adev->gfx.pipe_reserve_mutex); 3812 mutex_init(&adev->gfx.gfx_off_mutex); 3813 mutex_init(&adev->gfx.partition_mutex); 3814 mutex_init(&adev->grbm_idx_mutex); 3815 mutex_init(&adev->mn_lock); 3816 mutex_init(&adev->virt.vf_errors.lock); 3817 hash_init(adev->mn_hash); 3818 mutex_init(&adev->psp.mutex); 3819 mutex_init(&adev->notifier_lock); 3820 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3821 mutex_init(&adev->benchmark_mutex); 3822 3823 amdgpu_device_init_apu_flags(adev); 3824 3825 r = amdgpu_device_check_arguments(adev); 3826 if (r) 3827 return r; 3828 3829 spin_lock_init(&adev->mmio_idx_lock); 3830 spin_lock_init(&adev->smc_idx_lock); 3831 spin_lock_init(&adev->pcie_idx_lock); 3832 spin_lock_init(&adev->uvd_ctx_idx_lock); 3833 spin_lock_init(&adev->didt_idx_lock); 3834 spin_lock_init(&adev->gc_cac_idx_lock); 3835 spin_lock_init(&adev->se_cac_idx_lock); 3836 spin_lock_init(&adev->audio_endpt_idx_lock); 3837 spin_lock_init(&adev->mm_stats.lock); 3838 3839 
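/* lists, delayed work and the XGMI reset work used over the device lifetime */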
INIT_LIST_HEAD(&adev->shadow_list); 3840 mutex_init(&adev->shadow_list_lock); 3841 3842 INIT_LIST_HEAD(&adev->reset_list); 3843 3844 INIT_LIST_HEAD(&adev->ras_list); 3845 3846 INIT_DELAYED_WORK(&adev->delayed_init_work, 3847 amdgpu_device_delayed_init_work_handler); 3848 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3849 amdgpu_device_delay_enable_gfx_off); 3850 3851 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3852 3853 adev->gfx.gfx_off_req_count = 1; 3854 adev->gfx.gfx_off_residency = 0; 3855 adev->gfx.gfx_off_entrycount = 0; 3856 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3857 3858 atomic_set(&adev->throttling_logging_enabled, 1); 3859 /* 3860 * If throttling continues, logging will be performed every minute 3861 * to avoid log flooding. "-1" is subtracted since the thermal 3862 * throttling interrupt comes every second. Thus, the total logging 3863 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 3864 * for the throttling interrupt) = 60 seconds. 3865 */ 3866 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3867 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3868 3869 /* Registers mapping */ 3870 /* TODO: block userspace mapping of io register */ 3871 if (adev->asic_type >= CHIP_BONAIRE) { 3872 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3873 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3874 } else { 3875 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3876 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3877 } 3878 3879 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 3880 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 3881 3882 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3883 if (adev->rmmio == NULL) { 3884 return -ENOMEM; 3885 } 3886 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3887 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3888 3889 /* 3890 * The reset domain needs to be present early, before the XGMI hive is 3891 * discovered (if any) and initialized, so the reset semaphore and the 3892 * in_gpu_reset flag can be used early during init and before any RREG32 call.
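 * A single-device reset domain is created first; if the device later joins
 * an XGMI hive, it may be replaced by the hive's shared reset domain.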
3893 */ 3894 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3895 if (!adev->reset_domain) 3896 return -ENOMEM; 3897 3898 /* detect hw virtualization here */ 3899 amdgpu_detect_virtualization(adev); 3900 3901 amdgpu_device_get_pcie_info(adev); 3902 3903 r = amdgpu_device_get_job_timeout_settings(adev); 3904 if (r) { 3905 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3906 return r; 3907 } 3908 3909 /* early init functions */ 3910 r = amdgpu_device_ip_early_init(adev); 3911 if (r) 3912 return r; 3913 3914 amdgpu_device_set_mcbp(adev); 3915 3916 /* Get rid of things like offb */ 3917 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 3918 if (r) 3919 return r; 3920 3921 /* Enable TMZ based on IP_VERSION */ 3922 amdgpu_gmc_tmz_set(adev); 3923 3924 amdgpu_gmc_noretry_set(adev); 3925 /* Need to get xgmi info early to decide the reset behavior */ 3926 if (adev->gmc.xgmi.supported) { 3927 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3928 if (r) 3929 return r; 3930 } 3931 3932 /* enable PCIE atomic ops */ 3933 if (amdgpu_sriov_vf(adev)) { 3934 if (adev->virt.fw_reserve.p_pf2vf) 3935 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3936 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3937 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3938 /* APUs with gfx9 onwards don't rely on PCIe atomics; an 3939 * internal path natively supports atomics, so set have_atomics_support to true. 3940 */ 3941 } else if ((adev->flags & AMD_IS_APU) && 3942 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) { 3943 adev->have_atomics_support = true; 3944 } else { 3945 adev->have_atomics_support = 3946 !pci_enable_atomic_ops_to_root(adev->pdev, 3947 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3948 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3949 } 3950 3951 if (!adev->have_atomics_support) 3952 dev_info(adev->dev, "PCIE atomic ops are not supported\n"); 3953 3954 /* doorbell bar mapping and doorbell index init */ 3955 amdgpu_device_doorbell_init(adev); 3956 3957 if (amdgpu_emu_mode == 1) { 3958 /* post the asic on emulation mode */ 3959 emu_soc_asic_init(adev); 3960 goto fence_driver_init; 3961 } 3962 3963 amdgpu_reset_init(adev); 3964 3965 /* detect if we are with an SRIOV vbios */ 3966 if (adev->bios) 3967 amdgpu_device_detect_sriov_bios(adev); 3968 3969 /* check if we need to reset the asic 3970 * E.g., driver was not cleanly unloaded previously, etc. 3971 */ 3972 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3973 if (adev->gmc.xgmi.num_physical_nodes) { 3974 dev_info(adev->dev, "Pending hive reset.\n"); 3975 adev->gmc.xgmi.pending_reset = true; 3976 /* Only need to init the necessary blocks for SMU to handle the reset */ 3977 for (i = 0; i < adev->num_ip_blocks; i++) { 3978 if (!adev->ip_blocks[i].status.valid) 3979 continue; 3980 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3981 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3982 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3983 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3984 DRM_DEBUG("IP %s disabled for hw_init.\n", 3985 adev->ip_blocks[i].version->funcs->name); 3986 adev->ip_blocks[i].status.hw = true; 3987 } 3988 } 3989 } else { 3990 tmp = amdgpu_reset_method; 3991 /* It should do a default reset when loading or reloading the driver, 3992 * regardless of the module parameter reset_method.
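 * The current value is saved in 'tmp' above and restored right after the
 * reset below completes.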
3993 */ 3994 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 3995 r = amdgpu_asic_reset(adev); 3996 amdgpu_reset_method = tmp; 3997 if (r) { 3998 dev_err(adev->dev, "asic reset on init failed\n"); 3999 goto failed; 4000 } 4001 } 4002 } 4003 4004 /* Post card if necessary */ 4005 if (amdgpu_device_need_post(adev)) { 4006 if (!adev->bios) { 4007 dev_err(adev->dev, "no vBIOS found\n"); 4008 r = -EINVAL; 4009 goto failed; 4010 } 4011 DRM_INFO("GPU posting now...\n"); 4012 r = amdgpu_device_asic_init(adev); 4013 if (r) { 4014 dev_err(adev->dev, "gpu post error!\n"); 4015 goto failed; 4016 } 4017 } 4018 4019 if (adev->bios) { 4020 if (adev->is_atom_fw) { 4021 /* Initialize clocks */ 4022 r = amdgpu_atomfirmware_get_clock_info(adev); 4023 if (r) { 4024 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4025 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4026 goto failed; 4027 } 4028 } else { 4029 /* Initialize clocks */ 4030 r = amdgpu_atombios_get_clock_info(adev); 4031 if (r) { 4032 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4033 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4034 goto failed; 4035 } 4036 /* init i2c buses */ 4037 if (!amdgpu_device_has_dc_support(adev)) 4038 amdgpu_atombios_i2c_init(adev); 4039 } 4040 } 4041 4042 fence_driver_init: 4043 /* Fence driver */ 4044 r = amdgpu_fence_driver_sw_init(adev); 4045 if (r) { 4046 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4047 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4048 goto failed; 4049 } 4050 4051 /* init the mode config */ 4052 drm_mode_config_init(adev_to_drm(adev)); 4053 4054 r = amdgpu_device_ip_init(adev); 4055 if (r) { 4056 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4057 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4058 goto release_ras_con; 4059 } 4060 4061 amdgpu_fence_driver_hw_init(adev); 4062 4063 dev_info(adev->dev, 4064 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4065 adev->gfx.config.max_shader_engines, 4066 adev->gfx.config.max_sh_per_se, 4067 adev->gfx.config.max_cu_per_sh, 4068 adev->gfx.cu_info.number); 4069 4070 adev->accel_working = true; 4071 4072 amdgpu_vm_check_compute_bug(adev); 4073 4074 /* Initialize the buffer migration limit. */ 4075 if (amdgpu_moverate >= 0) 4076 max_MBps = amdgpu_moverate; 4077 else 4078 max_MBps = 8; /* Allow 8 MB/s. */ 4079 /* Get a log2 for easy divisions. */ 4080 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4081 4082 r = amdgpu_atombios_sysfs_init(adev); 4083 if (r) 4084 drm_err(&adev->ddev, 4085 "registering atombios sysfs failed (%d).\n", r); 4086 4087 r = amdgpu_pm_sysfs_init(adev); 4088 if (r) 4089 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4090 4091 r = amdgpu_ucode_sysfs_init(adev); 4092 if (r) { 4093 adev->ucode_sysfs_en = false; 4094 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4095 } else 4096 adev->ucode_sysfs_en = true; 4097 4098 r = amdgpu_psp_sysfs_init(adev); 4099 if (r) { 4100 adev->psp_sysfs_en = false; 4101 if (!amdgpu_sriov_vf(adev)) 4102 DRM_ERROR("Creating psp sysfs failed\n"); 4103 } else 4104 adev->psp_sysfs_en = true; 4105 4106 /* 4107 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4108 * Otherwise the mgpu fan boost feature will be skipped due to the 4109 * gpu instance is counted less. 4110 */ 4111 amdgpu_register_gpu_instance(adev); 4112 4113 /* enable clockgating, etc. after ib tests, etc. 
since some blocks require 4114 * explicit gating rather than handling it automatically. 4115 */ 4116 if (!adev->gmc.xgmi.pending_reset) { 4117 r = amdgpu_device_ip_late_init(adev); 4118 if (r) { 4119 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4120 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4121 goto release_ras_con; 4122 } 4123 /* must succeed. */ 4124 amdgpu_ras_resume(adev); 4125 queue_delayed_work(system_wq, &adev->delayed_init_work, 4126 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4127 } 4128 4129 if (amdgpu_sriov_vf(adev)) { 4130 amdgpu_virt_release_full_gpu(adev, true); 4131 flush_delayed_work(&adev->delayed_init_work); 4132 } 4133 4134 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4135 if (r) 4136 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4137 4138 if (IS_ENABLED(CONFIG_PERF_EVENTS)) { 4139 r = amdgpu_pmu_init(adev); 4140 if (r) 4141 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4142 } 4143 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4144 if (amdgpu_device_cache_pci_state(adev->pdev)) 4145 pci_restore_state(pdev); 4146 4147 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4148 /* this will fail for cards that aren't VGA class devices, just 4149 * ignore it */ 4150 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4151 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4152 4153 px = amdgpu_device_supports_px(ddev); 4154 4155 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 4156 apple_gmux_detect(NULL, NULL))) 4157 vga_switcheroo_register_client(adev->pdev, 4158 &amdgpu_switcheroo_ops, px); 4159 4160 if (px) 4161 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4162 4163 if (adev->gmc.xgmi.pending_reset) 4164 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 4165 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4166 4167 amdgpu_device_check_iommu_direct_map(adev); 4168 4169 return 0; 4170 4171 release_ras_con: 4172 if (amdgpu_sriov_vf(adev)) 4173 amdgpu_virt_release_full_gpu(adev, true); 4174 4175 /* failed in exclusive mode due to timeout */ 4176 if (amdgpu_sriov_vf(adev) && 4177 !amdgpu_sriov_runtime(adev) && 4178 amdgpu_virt_mmio_blocked(adev) && 4179 !amdgpu_virt_wait_reset(adev)) { 4180 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4181 /* Don't send request since VF is inactive. */ 4182 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4183 adev->virt.ops = NULL; 4184 r = -EAGAIN; 4185 } 4186 amdgpu_release_ras_context(adev); 4187 4188 failed: 4189 amdgpu_vf_error_trans_all(adev); 4190 4191 return r; 4192 } 4193 4194 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4195 { 4196 4197 /* Clear all CPU mappings pointing to this device */ 4198 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4199 4200 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4201 amdgpu_device_doorbell_fini(adev); 4202 4203 iounmap(adev->rmmio); 4204 adev->rmmio = NULL; 4205 if (adev->mman.aper_base_kaddr) 4206 iounmap(adev->mman.aper_base_kaddr); 4207 adev->mman.aper_base_kaddr = NULL; 4208 4209 /* Memory manager related */ 4210 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4211 arch_phys_wc_del(adev->gmc.vram_mtrr); 4212 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4213 } 4214 } 4215 4216 /** 4217 * amdgpu_device_fini_hw - tear down the driver 4218 * 4219 * @adev: amdgpu_device pointer 4220 * 4221 * Tear down the driver info (all asics).
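 * Hardware-facing teardown only: interrupts, display state, fences and MMIO
 * mappings; the remaining software state is released in amdgpu_device_fini_sw().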
4222 * Called at driver shutdown. 4223 */ 4224 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4225 { 4226 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4227 flush_delayed_work(&adev->delayed_init_work); 4228 adev->shutdown = true; 4229 4230 /* make sure IB test finished before entering exclusive mode 4231 * to avoid preemption on IB test 4232 * */ 4233 if (amdgpu_sriov_vf(adev)) { 4234 amdgpu_virt_request_full_gpu(adev, false); 4235 amdgpu_virt_fini_data_exchange(adev); 4236 } 4237 4238 /* disable all interrupts */ 4239 amdgpu_irq_disable_all(adev); 4240 if (adev->mode_info.mode_config_initialized) { 4241 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4242 drm_helper_force_disable_all(adev_to_drm(adev)); 4243 else 4244 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4245 } 4246 amdgpu_fence_driver_hw_fini(adev); 4247 4248 if (adev->mman.initialized) 4249 drain_workqueue(adev->mman.bdev.wq); 4250 4251 if (adev->pm.sysfs_initialized) 4252 amdgpu_pm_sysfs_fini(adev); 4253 if (adev->ucode_sysfs_en) 4254 amdgpu_ucode_sysfs_fini(adev); 4255 if (adev->psp_sysfs_en) 4256 amdgpu_psp_sysfs_fini(adev); 4257 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4258 4259 /* disable ras feature must before hw fini */ 4260 amdgpu_ras_pre_fini(adev); 4261 4262 amdgpu_device_ip_fini_early(adev); 4263 4264 amdgpu_irq_fini_hw(adev); 4265 4266 if (adev->mman.initialized) 4267 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4268 4269 amdgpu_gart_dummy_page_fini(adev); 4270 4271 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4272 amdgpu_device_unmap_mmio(adev); 4273 4274 } 4275 4276 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4277 { 4278 int idx; 4279 bool px; 4280 4281 amdgpu_fence_driver_sw_fini(adev); 4282 amdgpu_device_ip_fini(adev); 4283 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4284 adev->accel_working = false; 4285 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4286 4287 amdgpu_reset_fini(adev); 4288 4289 /* free i2c buses */ 4290 if (!amdgpu_device_has_dc_support(adev)) 4291 amdgpu_i2c_fini(adev); 4292 4293 if (amdgpu_emu_mode != 1) 4294 amdgpu_atombios_fini(adev); 4295 4296 kfree(adev->bios); 4297 adev->bios = NULL; 4298 4299 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4300 4301 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 4302 apple_gmux_detect(NULL, NULL))) 4303 vga_switcheroo_unregister_client(adev->pdev); 4304 4305 if (px) 4306 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4307 4308 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4309 vga_client_unregister(adev->pdev); 4310 4311 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4312 4313 iounmap(adev->rmmio); 4314 adev->rmmio = NULL; 4315 amdgpu_device_doorbell_fini(adev); 4316 drm_dev_exit(idx); 4317 } 4318 4319 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4320 amdgpu_pmu_fini(adev); 4321 if (adev->mman.discovery_bin) 4322 amdgpu_discovery_fini(adev); 4323 4324 amdgpu_reset_put_reset_domain(adev->reset_domain); 4325 adev->reset_domain = NULL; 4326 4327 kfree(adev->pci_state); 4328 4329 } 4330 4331 /** 4332 * amdgpu_device_evict_resources - evict device resources 4333 * @adev: amdgpu device object 4334 * 4335 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4336 * of the vram memory type. Mainly used for evicting device resources 4337 * at suspend time. 
4338 * 4339 */ 4340 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4341 { 4342 int ret; 4343 4344 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4345 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4346 return 0; 4347 4348 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4349 if (ret) 4350 DRM_WARN("evicting device resources failed\n"); 4351 return ret; 4352 } 4353 4354 /* 4355 * Suspend & resume. 4356 */ 4357 /** 4358 * amdgpu_device_suspend - initiate device suspend 4359 * 4360 * @dev: drm dev pointer 4361 * @fbcon : notify the fbdev of suspend 4362 * 4363 * Puts the hw in the suspend state (all asics). 4364 * Returns 0 for success or an error on failure. 4365 * Called at driver suspend. 4366 */ 4367 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4368 { 4369 struct amdgpu_device *adev = drm_to_adev(dev); 4370 int r = 0; 4371 4372 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4373 return 0; 4374 4375 adev->in_suspend = true; 4376 4377 /* Evict the majority of BOs before grabbing the full access */ 4378 r = amdgpu_device_evict_resources(adev); 4379 if (r) 4380 return r; 4381 4382 if (amdgpu_sriov_vf(adev)) { 4383 amdgpu_virt_fini_data_exchange(adev); 4384 r = amdgpu_virt_request_full_gpu(adev, false); 4385 if (r) 4386 return r; 4387 } 4388 4389 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4390 DRM_WARN("smart shift update failed\n"); 4391 4392 if (fbcon) 4393 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4394 4395 cancel_delayed_work_sync(&adev->delayed_init_work); 4396 4397 amdgpu_ras_suspend(adev); 4398 4399 amdgpu_device_ip_suspend_phase1(adev); 4400 4401 if (!adev->in_s0ix) 4402 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4403 4404 r = amdgpu_device_evict_resources(adev); 4405 if (r) 4406 return r; 4407 4408 amdgpu_fence_driver_hw_fini(adev); 4409 4410 amdgpu_device_ip_suspend_phase2(adev); 4411 4412 if (amdgpu_sriov_vf(adev)) 4413 amdgpu_virt_release_full_gpu(adev, false); 4414 4415 return 0; 4416 } 4417 4418 /** 4419 * amdgpu_device_resume - initiate device resume 4420 * 4421 * @dev: drm dev pointer 4422 * @fbcon : notify the fbdev of resume 4423 * 4424 * Bring the hw back to operating state (all asics). 4425 * Returns 0 for success or an error on failure. 4426 * Called at driver resume. 
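 * Roughly mirrors amdgpu_device_suspend() in reverse: the ASIC is re-posted
 * if needed, hardware IPs and fences are resumed first, then KFD, RAS and
 * the fbdev/display state are restored.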
4427 */ 4428 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4429 { 4430 struct amdgpu_device *adev = drm_to_adev(dev); 4431 int r = 0; 4432 4433 if (amdgpu_sriov_vf(adev)) { 4434 r = amdgpu_virt_request_full_gpu(adev, true); 4435 if (r) 4436 return r; 4437 } 4438 4439 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4440 return 0; 4441 4442 if (adev->in_s0ix) 4443 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4444 4445 /* post card */ 4446 if (amdgpu_device_need_post(adev)) { 4447 r = amdgpu_device_asic_init(adev); 4448 if (r) 4449 dev_err(adev->dev, "amdgpu asic init failed\n"); 4450 } 4451 4452 r = amdgpu_device_ip_resume(adev); 4453 4454 if (r) { 4455 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4456 goto exit; 4457 } 4458 amdgpu_fence_driver_hw_init(adev); 4459 4460 r = amdgpu_device_ip_late_init(adev); 4461 if (r) 4462 goto exit; 4463 4464 queue_delayed_work(system_wq, &adev->delayed_init_work, 4465 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4466 4467 if (!adev->in_s0ix) { 4468 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4469 if (r) 4470 goto exit; 4471 } 4472 4473 exit: 4474 if (amdgpu_sriov_vf(adev)) { 4475 amdgpu_virt_init_data_exchange(adev); 4476 amdgpu_virt_release_full_gpu(adev, true); 4477 } 4478 4479 if (r) 4480 return r; 4481 4482 /* Make sure IB tests flushed */ 4483 flush_delayed_work(&adev->delayed_init_work); 4484 4485 if (fbcon) 4486 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4487 4488 amdgpu_ras_resume(adev); 4489 4490 if (adev->mode_info.num_crtc) { 4491 /* 4492 * Most of the connector probing functions try to acquire runtime pm 4493 * refs to ensure that the GPU is powered on when connector polling is 4494 * performed. Since we're calling this from a runtime PM callback, 4495 * trying to acquire rpm refs will cause us to deadlock. 4496 * 4497 * Since we're guaranteed to be holding the rpm lock, it's safe to 4498 * temporarily disable the rpm helpers so this doesn't deadlock us. 4499 */ 4500 #ifdef CONFIG_PM 4501 dev->dev->power.disable_depth++; 4502 #endif 4503 if (!adev->dc_enabled) 4504 drm_helper_hpd_irq_event(dev); 4505 else 4506 drm_kms_helper_hotplug_event(dev); 4507 #ifdef CONFIG_PM 4508 dev->dev->power.disable_depth--; 4509 #endif 4510 } 4511 adev->in_suspend = false; 4512 4513 if (adev->enable_mes) 4514 amdgpu_mes_self_test(adev); 4515 4516 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4517 DRM_WARN("smart shift update failed\n"); 4518 4519 return 0; 4520 } 4521 4522 /** 4523 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4524 * 4525 * @adev: amdgpu_device pointer 4526 * 4527 * The list of all the hardware IPs that make up the asic is walked and 4528 * the check_soft_reset callbacks are run. check_soft_reset determines 4529 * if the asic is still hung or not. 4530 * Returns true if any of the IPs are still in a hung state, false if not. 
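 * For SR-IOV and for ASICs that require a full reset this reports a hang
 * unconditionally so that the full reset path is taken.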
4531 */ 4532 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4533 { 4534 int i; 4535 bool asic_hang = false; 4536 4537 if (amdgpu_sriov_vf(adev)) 4538 return true; 4539 4540 if (amdgpu_asic_need_full_reset(adev)) 4541 return true; 4542 4543 for (i = 0; i < adev->num_ip_blocks; i++) { 4544 if (!adev->ip_blocks[i].status.valid) 4545 continue; 4546 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4547 adev->ip_blocks[i].status.hang = 4548 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4549 if (adev->ip_blocks[i].status.hang) { 4550 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4551 asic_hang = true; 4552 } 4553 } 4554 return asic_hang; 4555 } 4556 4557 /** 4558 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4559 * 4560 * @adev: amdgpu_device pointer 4561 * 4562 * The list of all the hardware IPs that make up the asic is walked and the 4563 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4564 * handles any IP specific hardware or software state changes that are 4565 * necessary for a soft reset to succeed. 4566 * Returns 0 on success, negative error code on failure. 4567 */ 4568 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4569 { 4570 int i, r = 0; 4571 4572 for (i = 0; i < adev->num_ip_blocks; i++) { 4573 if (!adev->ip_blocks[i].status.valid) 4574 continue; 4575 if (adev->ip_blocks[i].status.hang && 4576 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4577 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4578 if (r) 4579 return r; 4580 } 4581 } 4582 4583 return 0; 4584 } 4585 4586 /** 4587 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4588 * 4589 * @adev: amdgpu_device pointer 4590 * 4591 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4592 * reset is necessary to recover. 4593 * Returns true if a full asic reset is required, false if not. 4594 */ 4595 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4596 { 4597 int i; 4598 4599 if (amdgpu_asic_need_full_reset(adev)) 4600 return true; 4601 4602 for (i = 0; i < adev->num_ip_blocks; i++) { 4603 if (!adev->ip_blocks[i].status.valid) 4604 continue; 4605 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4606 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4607 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4608 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4609 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4610 if (adev->ip_blocks[i].status.hang) { 4611 dev_info(adev->dev, "Some block need full reset!\n"); 4612 return true; 4613 } 4614 } 4615 } 4616 return false; 4617 } 4618 4619 /** 4620 * amdgpu_device_ip_soft_reset - do a soft reset 4621 * 4622 * @adev: amdgpu_device pointer 4623 * 4624 * The list of all the hardware IPs that make up the asic is walked and the 4625 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4626 * IP specific hardware or software state changes that are necessary to soft 4627 * reset the IP. 4628 * Returns 0 on success, negative error code on failure. 
4629 */ 4630 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4631 { 4632 int i, r = 0; 4633 4634 for (i = 0; i < adev->num_ip_blocks; i++) { 4635 if (!adev->ip_blocks[i].status.valid) 4636 continue; 4637 if (adev->ip_blocks[i].status.hang && 4638 adev->ip_blocks[i].version->funcs->soft_reset) { 4639 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4640 if (r) 4641 return r; 4642 } 4643 } 4644 4645 return 0; 4646 } 4647 4648 /** 4649 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4650 * 4651 * @adev: amdgpu_device pointer 4652 * 4653 * The list of all the hardware IPs that make up the asic is walked and the 4654 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4655 * handles any IP specific hardware or software state changes that are 4656 * necessary after the IP has been soft reset. 4657 * Returns 0 on success, negative error code on failure. 4658 */ 4659 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4660 { 4661 int i, r = 0; 4662 4663 for (i = 0; i < adev->num_ip_blocks; i++) { 4664 if (!adev->ip_blocks[i].status.valid) 4665 continue; 4666 if (adev->ip_blocks[i].status.hang && 4667 adev->ip_blocks[i].version->funcs->post_soft_reset) 4668 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4669 if (r) 4670 return r; 4671 } 4672 4673 return 0; 4674 } 4675 4676 /** 4677 * amdgpu_device_recover_vram - Recover some VRAM contents 4678 * 4679 * @adev: amdgpu_device pointer 4680 * 4681 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4682 * restore things like GPUVM page tables after a GPU reset where 4683 * the contents of VRAM might be lost. 4684 * 4685 * Returns: 4686 * 0 on success, negative error code on failure. 4687 */ 4688 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4689 { 4690 struct dma_fence *fence = NULL, *next = NULL; 4691 struct amdgpu_bo *shadow; 4692 struct amdgpu_bo_vm *vmbo; 4693 long r = 1, tmo; 4694 4695 if (amdgpu_sriov_runtime(adev)) 4696 tmo = msecs_to_jiffies(8000); 4697 else 4698 tmo = msecs_to_jiffies(100); 4699 4700 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4701 mutex_lock(&adev->shadow_list_lock); 4702 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4703 /* If vm is compute context or adev is APU, shadow will be NULL */ 4704 if (!vmbo->shadow) 4705 continue; 4706 shadow = vmbo->shadow; 4707 4708 /* No need to recover an evicted BO */ 4709 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4710 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4711 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4712 continue; 4713 4714 r = amdgpu_bo_restore_shadow(shadow, &next); 4715 if (r) 4716 break; 4717 4718 if (fence) { 4719 tmo = dma_fence_wait_timeout(fence, false, tmo); 4720 dma_fence_put(fence); 4721 fence = next; 4722 if (tmo == 0) { 4723 r = -ETIMEDOUT; 4724 break; 4725 } else if (tmo < 0) { 4726 r = tmo; 4727 break; 4728 } 4729 } else { 4730 fence = next; 4731 } 4732 } 4733 mutex_unlock(&adev->shadow_list_lock); 4734 4735 if (fence) 4736 tmo = dma_fence_wait_timeout(fence, false, tmo); 4737 dma_fence_put(fence); 4738 4739 if (r < 0 || tmo <= 0) { 4740 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4741 return -EIO; 4742 } 4743 4744 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4745 return 0; 4746 } 4747 4748 4749 /** 4750 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4751 * 4752 * @adev: amdgpu_device pointer 4753 * 
@from_hypervisor: request from hypervisor 4754 * 4755 * do VF FLR and reinitialize Asic 4756 * return 0 means succeeded otherwise failed 4757 */ 4758 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4759 bool from_hypervisor) 4760 { 4761 int r; 4762 struct amdgpu_hive_info *hive = NULL; 4763 int retry_limit = 0; 4764 4765 retry: 4766 amdgpu_amdkfd_pre_reset(adev); 4767 4768 if (from_hypervisor) 4769 r = amdgpu_virt_request_full_gpu(adev, true); 4770 else 4771 r = amdgpu_virt_reset_gpu(adev); 4772 if (r) 4773 return r; 4774 4775 /* Resume IP prior to SMC */ 4776 r = amdgpu_device_ip_reinit_early_sriov(adev); 4777 if (r) 4778 goto error; 4779 4780 amdgpu_virt_init_data_exchange(adev); 4781 4782 r = amdgpu_device_fw_loading(adev); 4783 if (r) 4784 return r; 4785 4786 /* now we are okay to resume SMC/CP/SDMA */ 4787 r = amdgpu_device_ip_reinit_late_sriov(adev); 4788 if (r) 4789 goto error; 4790 4791 hive = amdgpu_get_xgmi_hive(adev); 4792 /* Update PSP FW topology after reset */ 4793 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4794 r = amdgpu_xgmi_update_topology(hive, adev); 4795 4796 if (hive) 4797 amdgpu_put_xgmi_hive(hive); 4798 4799 if (!r) { 4800 amdgpu_irq_gpu_reset_resume_helper(adev); 4801 r = amdgpu_ib_ring_tests(adev); 4802 4803 amdgpu_amdkfd_post_reset(adev); 4804 } 4805 4806 error: 4807 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4808 amdgpu_inc_vram_lost(adev); 4809 r = amdgpu_device_recover_vram(adev); 4810 } 4811 amdgpu_virt_release_full_gpu(adev, true); 4812 4813 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4814 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4815 retry_limit++; 4816 goto retry; 4817 } else 4818 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4819 } 4820 4821 return r; 4822 } 4823 4824 /** 4825 * amdgpu_device_has_job_running - check if there is any job in mirror list 4826 * 4827 * @adev: amdgpu_device pointer 4828 * 4829 * check if there is any job in mirror list 4830 */ 4831 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4832 { 4833 int i; 4834 struct drm_sched_job *job; 4835 4836 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4837 struct amdgpu_ring *ring = adev->rings[i]; 4838 4839 if (!ring || !ring->sched.thread) 4840 continue; 4841 4842 spin_lock(&ring->sched.job_list_lock); 4843 job = list_first_entry_or_null(&ring->sched.pending_list, 4844 struct drm_sched_job, list); 4845 spin_unlock(&ring->sched.job_list_lock); 4846 if (job) 4847 return true; 4848 } 4849 return false; 4850 } 4851 4852 /** 4853 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4854 * 4855 * @adev: amdgpu_device pointer 4856 * 4857 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4858 * a hung GPU. 
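 * Unless recovery is explicitly disabled (amdgpu_gpu_recovery=0), it is
 * reported as supported for SR-IOV and for RAS fatal-error mode; with the
 * default of -1 (auto) it is additionally disabled on the older ASICs
 * listed below.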
4859 */ 4860 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4861 { 4862 4863 if (amdgpu_gpu_recovery == 0) 4864 goto disabled; 4865 4866 /* Skip soft reset check in fatal error mode */ 4867 if (!amdgpu_ras_is_poison_mode_supported(adev)) 4868 return true; 4869 4870 if (amdgpu_sriov_vf(adev)) 4871 return true; 4872 4873 if (amdgpu_gpu_recovery == -1) { 4874 switch (adev->asic_type) { 4875 #ifdef CONFIG_DRM_AMDGPU_SI 4876 case CHIP_VERDE: 4877 case CHIP_TAHITI: 4878 case CHIP_PITCAIRN: 4879 case CHIP_OLAND: 4880 case CHIP_HAINAN: 4881 #endif 4882 #ifdef CONFIG_DRM_AMDGPU_CIK 4883 case CHIP_KAVERI: 4884 case CHIP_KABINI: 4885 case CHIP_MULLINS: 4886 #endif 4887 case CHIP_CARRIZO: 4888 case CHIP_STONEY: 4889 case CHIP_CYAN_SKILLFISH: 4890 goto disabled; 4891 default: 4892 break; 4893 } 4894 } 4895 4896 return true; 4897 4898 disabled: 4899 dev_info(adev->dev, "GPU recovery disabled.\n"); 4900 return false; 4901 } 4902 4903 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4904 { 4905 u32 i; 4906 int ret = 0; 4907 4908 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4909 4910 dev_info(adev->dev, "GPU mode1 reset\n"); 4911 4912 /* disable BM */ 4913 pci_clear_master(adev->pdev); 4914 4915 amdgpu_device_cache_pci_state(adev->pdev); 4916 4917 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4918 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4919 ret = amdgpu_dpm_mode1_reset(adev); 4920 } else { 4921 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4922 ret = psp_gpu_reset(adev); 4923 } 4924 4925 if (ret) 4926 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4927 4928 amdgpu_device_load_pci_state(adev->pdev); 4929 4930 /* wait for asic to come out of reset */ 4931 for (i = 0; i < adev->usec_timeout; i++) { 4932 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4933 4934 if (memsize != 0xffffffff) 4935 break; 4936 udelay(1); 4937 } 4938 4939 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4940 return ret; 4941 } 4942 4943 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4944 struct amdgpu_reset_context *reset_context) 4945 { 4946 int i, r = 0; 4947 struct amdgpu_job *job = NULL; 4948 bool need_full_reset = 4949 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4950 4951 if (reset_context->reset_req_dev == adev) 4952 job = reset_context->job; 4953 4954 if (amdgpu_sriov_vf(adev)) { 4955 /* stop the data exchange thread */ 4956 amdgpu_virt_fini_data_exchange(adev); 4957 } 4958 4959 amdgpu_fence_driver_isr_toggle(adev, true); 4960 4961 /* block all schedulers and reset given job's ring */ 4962 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4963 struct amdgpu_ring *ring = adev->rings[i]; 4964 4965 if (!ring || !ring->sched.thread) 4966 continue; 4967 4968 /*clear job fence from fence drv to avoid force_completion 4969 *leave NULL and vm flush fence in fence drv */ 4970 amdgpu_fence_driver_clear_job_fences(ring); 4971 4972 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4973 amdgpu_fence_driver_force_completion(ring); 4974 } 4975 4976 amdgpu_fence_driver_isr_toggle(adev, false); 4977 4978 if (job && job->vm) 4979 drm_sched_increase_karma(&job->base); 4980 4981 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4982 /* If reset handler not implemented, continue; otherwise return */ 4983 if (r == -ENOSYS) 4984 r = 0; 4985 else 4986 return r; 4987 4988 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4989 if (!amdgpu_sriov_vf(adev)) { 4990 4991 if (!need_full_reset) 4992 need_full_reset = 
amdgpu_device_ip_need_full_reset(adev); 4993 4994 if (!need_full_reset && amdgpu_gpu_recovery && 4995 amdgpu_device_ip_check_soft_reset(adev)) { 4996 amdgpu_device_ip_pre_soft_reset(adev); 4997 r = amdgpu_device_ip_soft_reset(adev); 4998 amdgpu_device_ip_post_soft_reset(adev); 4999 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5000 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5001 need_full_reset = true; 5002 } 5003 } 5004 5005 if (need_full_reset) 5006 r = amdgpu_device_ip_suspend(adev); 5007 if (need_full_reset) 5008 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5009 else 5010 clear_bit(AMDGPU_NEED_FULL_RESET, 5011 &reset_context->flags); 5012 } 5013 5014 return r; 5015 } 5016 5017 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 5018 { 5019 int i; 5020 5021 lockdep_assert_held(&adev->reset_domain->sem); 5022 5023 for (i = 0; i < adev->num_regs; i++) { 5024 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); 5025 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], 5026 adev->reset_dump_reg_value[i]); 5027 } 5028 5029 return 0; 5030 } 5031 5032 #ifdef CONFIG_DEV_COREDUMP 5033 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, 5034 size_t count, void *data, size_t datalen) 5035 { 5036 struct drm_printer p; 5037 struct amdgpu_device *adev = data; 5038 struct drm_print_iterator iter; 5039 int i; 5040 5041 iter.data = buffer; 5042 iter.offset = 0; 5043 iter.start = offset; 5044 iter.remain = count; 5045 5046 p = drm_coredump_printer(&iter); 5047 5048 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 5049 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 5050 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 5051 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec); 5052 if (adev->reset_task_info.pid) 5053 drm_printf(&p, "process_name: %s PID: %d\n", 5054 adev->reset_task_info.process_name, 5055 adev->reset_task_info.pid); 5056 5057 if (adev->reset_vram_lost) 5058 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 5059 if (adev->num_regs) { 5060 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); 5061 5062 for (i = 0; i < adev->num_regs; i++) 5063 drm_printf(&p, "0x%08x: 0x%08x\n", 5064 adev->reset_dump_reg_list[i], 5065 adev->reset_dump_reg_value[i]); 5066 } 5067 5068 return count - iter.remain; 5069 } 5070 5071 static void amdgpu_devcoredump_free(void *data) 5072 { 5073 } 5074 5075 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) 5076 { 5077 struct drm_device *dev = adev_to_drm(adev); 5078 5079 ktime_get_ts64(&adev->reset_time); 5080 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL, 5081 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 5082 } 5083 #endif 5084 5085 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5086 struct amdgpu_reset_context *reset_context) 5087 { 5088 struct amdgpu_device *tmp_adev = NULL; 5089 bool need_full_reset, skip_hw_reset, vram_lost = false; 5090 int r = 0; 5091 bool gpu_reset_for_dev_remove = 0; 5092 5093 /* Try reset handler method first */ 5094 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5095 reset_list); 5096 amdgpu_reset_reg_dumps(tmp_adev); 5097 5098 reset_context->reset_device_list = device_list_handle; 5099 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5100 /* If reset handler not implemented, continue; otherwise return */ 5101 if (r == -ENOSYS) 5102 r = 0; 5103 else 5104 return r; 5105 5106 /* Reset handler not implemented, use the 
default method */ 5107 need_full_reset = 5108 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5109 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5110 5111 gpu_reset_for_dev_remove = 5112 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5113 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5114 5115 /* 5116 * ASIC reset has to be done on all XGMI hive nodes ASAP 5117 * to allow proper links negotiation in FW (within 1 sec) 5118 */ 5119 if (!skip_hw_reset && need_full_reset) { 5120 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5121 /* For XGMI run all resets in parallel to speed up the process */ 5122 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5123 tmp_adev->gmc.xgmi.pending_reset = false; 5124 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 5125 r = -EALREADY; 5126 } else 5127 r = amdgpu_asic_reset(tmp_adev); 5128 5129 if (r) { 5130 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 5131 r, adev_to_drm(tmp_adev)->unique); 5132 break; 5133 } 5134 } 5135 5136 /* For XGMI wait for all resets to complete before proceed */ 5137 if (!r) { 5138 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5139 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5140 flush_work(&tmp_adev->xgmi_reset_work); 5141 r = tmp_adev->asic_reset_res; 5142 if (r) 5143 break; 5144 } 5145 } 5146 } 5147 } 5148 5149 if (!r && amdgpu_ras_intr_triggered()) { 5150 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5151 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && 5152 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 5153 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 5154 } 5155 5156 amdgpu_ras_intr_cleared(); 5157 } 5158 5159 /* Since the mode1 reset affects base ip blocks, the 5160 * phase1 ip blocks need to be resumed. Otherwise there 5161 * will be a BIOS signature error and the psp bootloader 5162 * can't load kdb on the next amdgpu install. 
5163 */ 5164 if (gpu_reset_for_dev_remove) { 5165 list_for_each_entry(tmp_adev, device_list_handle, reset_list) 5166 amdgpu_device_ip_resume_phase1(tmp_adev); 5167 5168 goto end; 5169 } 5170 5171 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5172 if (need_full_reset) { 5173 /* post card */ 5174 r = amdgpu_device_asic_init(tmp_adev); 5175 if (r) { 5176 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5177 } else { 5178 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5179 r = amdgpu_amdkfd_resume_iommu(tmp_adev); 5180 if (r) 5181 goto out; 5182 5183 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5184 if (r) 5185 goto out; 5186 5187 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5188 #ifdef CONFIG_DEV_COREDUMP 5189 tmp_adev->reset_vram_lost = vram_lost; 5190 memset(&tmp_adev->reset_task_info, 0, 5191 sizeof(tmp_adev->reset_task_info)); 5192 if (reset_context->job && reset_context->job->vm) 5193 tmp_adev->reset_task_info = 5194 reset_context->job->vm->task_info; 5195 amdgpu_reset_capture_coredumpm(tmp_adev); 5196 #endif 5197 if (vram_lost) { 5198 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5199 amdgpu_inc_vram_lost(tmp_adev); 5200 } 5201 5202 r = amdgpu_device_fw_loading(tmp_adev); 5203 if (r) 5204 return r; 5205 5206 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5207 if (r) 5208 goto out; 5209 5210 if (vram_lost) 5211 amdgpu_device_fill_reset_magic(tmp_adev); 5212 5213 /* 5214 * Add this ASIC as tracked as reset was already 5215 * complete successfully. 5216 */ 5217 amdgpu_register_gpu_instance(tmp_adev); 5218 5219 if (!reset_context->hive && 5220 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5221 amdgpu_xgmi_add_device(tmp_adev); 5222 5223 r = amdgpu_device_ip_late_init(tmp_adev); 5224 if (r) 5225 goto out; 5226 5227 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5228 5229 /* 5230 * The GPU enters bad state once faulty pages 5231 * by ECC has reached the threshold, and ras 5232 * recovery is scheduled next. So add one check 5233 * here to break recovery if it indeed exceeds 5234 * bad page threshold, and remind user to 5235 * retire this GPU or setting one bigger 5236 * bad_page_threshold value to fix this once 5237 * probing driver again. 5238 */ 5239 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5240 /* must succeed. 
*/ 5241 amdgpu_ras_resume(tmp_adev); 5242 } else { 5243 r = -EINVAL; 5244 goto out; 5245 } 5246 5247 /* Update PSP FW topology after reset */ 5248 if (reset_context->hive && 5249 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5250 r = amdgpu_xgmi_update_topology( 5251 reset_context->hive, tmp_adev); 5252 } 5253 } 5254 5255 out: 5256 if (!r) { 5257 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5258 r = amdgpu_ib_ring_tests(tmp_adev); 5259 if (r) { 5260 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5261 need_full_reset = true; 5262 r = -EAGAIN; 5263 goto end; 5264 } 5265 } 5266 5267 if (!r) 5268 r = amdgpu_device_recover_vram(tmp_adev); 5269 else 5270 tmp_adev->asic_reset_res = r; 5271 } 5272 5273 end: 5274 if (need_full_reset) 5275 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5276 else 5277 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5278 return r; 5279 } 5280 5281 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5282 { 5283 5284 switch (amdgpu_asic_reset_method(adev)) { 5285 case AMD_RESET_METHOD_MODE1: 5286 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5287 break; 5288 case AMD_RESET_METHOD_MODE2: 5289 adev->mp1_state = PP_MP1_STATE_RESET; 5290 break; 5291 default: 5292 adev->mp1_state = PP_MP1_STATE_NONE; 5293 break; 5294 } 5295 } 5296 5297 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5298 { 5299 amdgpu_vf_error_trans_all(adev); 5300 adev->mp1_state = PP_MP1_STATE_NONE; 5301 } 5302 5303 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5304 { 5305 struct pci_dev *p = NULL; 5306 5307 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5308 adev->pdev->bus->number, 1); 5309 if (p) { 5310 pm_runtime_enable(&(p->dev)); 5311 pm_runtime_resume(&(p->dev)); 5312 } 5313 5314 pci_dev_put(p); 5315 } 5316 5317 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5318 { 5319 enum amd_reset_method reset_method; 5320 struct pci_dev *p = NULL; 5321 u64 expires; 5322 5323 /* 5324 * For now, only BACO and mode1 reset are confirmed 5325 * to suffer the audio issue without proper suspended. 5326 */ 5327 reset_method = amdgpu_asic_reset_method(adev); 5328 if ((reset_method != AMD_RESET_METHOD_BACO) && 5329 (reset_method != AMD_RESET_METHOD_MODE1)) 5330 return -EINVAL; 5331 5332 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5333 adev->pdev->bus->number, 1); 5334 if (!p) 5335 return -ENODEV; 5336 5337 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5338 if (!expires) 5339 /* 5340 * If we cannot get the audio device autosuspend delay, 5341 * a fixed 4S interval will be used. Considering 3S is 5342 * the audio controller default autosuspend delay setting. 5343 * 4S used here is guaranteed to cover that. 5344 */ 5345 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5346 5347 while (!pm_runtime_status_suspended(&(p->dev))) { 5348 if (!pm_runtime_suspend(&(p->dev))) 5349 break; 5350 5351 if (expires < ktime_get_mono_fast_ns()) { 5352 dev_warn(adev->dev, "failed to suspend display audio\n"); 5353 pci_dev_put(p); 5354 /* TODO: abort the succeeding gpu reset? 
*/ 5355 return -ETIMEDOUT; 5356 } 5357 } 5358 5359 pm_runtime_disable(&(p->dev)); 5360 5361 pci_dev_put(p); 5362 return 0; 5363 } 5364 5365 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5366 { 5367 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5368 5369 #if defined(CONFIG_DEBUG_FS) 5370 if (!amdgpu_sriov_vf(adev)) 5371 cancel_work(&adev->reset_work); 5372 #endif 5373 5374 if (adev->kfd.dev) 5375 cancel_work(&adev->kfd.reset_work); 5376 5377 if (amdgpu_sriov_vf(adev)) 5378 cancel_work(&adev->virt.flr_work); 5379 5380 if (con && adev->ras_enabled) 5381 cancel_work(&con->recovery_work); 5382 5383 } 5384 5385 /** 5386 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5387 * 5388 * @adev: amdgpu_device pointer 5389 * @job: which job trigger hang 5390 * @reset_context: amdgpu reset context pointer 5391 * 5392 * Attempt to reset the GPU if it has hung (all asics). 5393 * Attempt to do soft-reset or full-reset and reinitialize Asic 5394 * Returns 0 for success or an error on failure. 5395 */ 5396 5397 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5398 struct amdgpu_job *job, 5399 struct amdgpu_reset_context *reset_context) 5400 { 5401 struct list_head device_list, *device_list_handle = NULL; 5402 bool job_signaled = false; 5403 struct amdgpu_hive_info *hive = NULL; 5404 struct amdgpu_device *tmp_adev = NULL; 5405 int i, r = 0; 5406 bool need_emergency_restart = false; 5407 bool audio_suspended = false; 5408 bool gpu_reset_for_dev_remove = false; 5409 5410 gpu_reset_for_dev_remove = 5411 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5412 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5413 5414 /* 5415 * Special case: RAS triggered and full reset isn't supported 5416 */ 5417 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5418 5419 /* 5420 * Flush RAM to disk so that after reboot 5421 * the user can read log and see why the system rebooted. 5422 */ 5423 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5424 DRM_WARN("Emergency reboot."); 5425 5426 ksys_sync_helper(); 5427 emergency_restart(); 5428 } 5429 5430 dev_info(adev->dev, "GPU %s begin!\n", 5431 need_emergency_restart ? "jobs stop":"reset"); 5432 5433 if (!amdgpu_sriov_vf(adev)) 5434 hive = amdgpu_get_xgmi_hive(adev); 5435 if (hive) 5436 mutex_lock(&hive->hive_lock); 5437 5438 reset_context->job = job; 5439 reset_context->hive = hive; 5440 /* 5441 * Build list of devices to reset. 5442 * In case we are in XGMI hive mode, resort the device list 5443 * to put adev in the 1st position. 
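 * The device that triggered the reset is rotated to the front so it is
 * handled first by the reset and resume loops below.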
5444 */ 5445 INIT_LIST_HEAD(&device_list); 5446 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5447 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5448 list_add_tail(&tmp_adev->reset_list, &device_list); 5449 if (gpu_reset_for_dev_remove && adev->shutdown) 5450 tmp_adev->shutdown = true; 5451 } 5452 if (!list_is_first(&adev->reset_list, &device_list)) 5453 list_rotate_to_front(&adev->reset_list, &device_list); 5454 device_list_handle = &device_list; 5455 } else { 5456 list_add_tail(&adev->reset_list, &device_list); 5457 device_list_handle = &device_list; 5458 } 5459 5460 /* We need to lock reset domain only once both for XGMI and single device */ 5461 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5462 reset_list); 5463 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5464 5465 /* block all schedulers and reset given job's ring */ 5466 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5467 5468 amdgpu_device_set_mp1_state(tmp_adev); 5469 5470 /* 5471 * Try to put the audio codec into suspend state 5472 * before the gpu reset starts. 5473 * 5474 * The power domain of the graphics device is shared 5475 * with the AZ power domain. Without this, 5476 * we may change the audio hardware from behind 5477 * the audio driver's back. That will trigger 5478 * some audio codec errors. 5479 */ 5480 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5481 audio_suspended = true; 5482 5483 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5484 5485 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5486 5487 if (!amdgpu_sriov_vf(tmp_adev)) 5488 amdgpu_amdkfd_pre_reset(tmp_adev); 5489 5490 /* 5491 * Mark these ASICs as untracked first, 5492 * and add them back after the reset has completed 5493 */ 5494 amdgpu_unregister_gpu_instance(tmp_adev); 5495 5496 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5497 5498 /* disable ras on ALL IPs */ 5499 if (!need_emergency_restart && 5500 amdgpu_device_ip_need_full_reset(tmp_adev)) 5501 amdgpu_ras_suspend(tmp_adev); 5502 5503 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5504 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5505 5506 if (!ring || !ring->sched.thread) 5507 continue; 5508 5509 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5510 5511 if (need_emergency_restart) 5512 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5513 } 5514 atomic_inc(&tmp_adev->gpu_reset_counter); 5515 } 5516 5517 if (need_emergency_restart) 5518 goto skip_sched_resume; 5519 5520 /* 5521 * Must check the guilty signal here since after this point all old 5522 * HW fences are force signaled. 5523 * 5524 * job->base holds a reference to the parent fence 5525 */ 5526 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5527 job_signaled = true; 5528 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5529 goto skip_hw_reset; 5530 } 5531 5532 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5533 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5534 if (gpu_reset_for_dev_remove) { 5535 /* Workaround for ASICs that need to disable SMC first */ 5536 amdgpu_device_smu_fini_early(tmp_adev); 5537 } 5538 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5539 /* TODO: Should we stop? */ 5540 if (r) { 5541 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5542 r, adev_to_drm(tmp_adev)->unique); 5543 tmp_adev->asic_reset_res = r; 5544 } 5545 5546 /* 5547 * Drop all pending non-scheduler resets.
Scheduler resets 5548 * were already dropped during drm_sched_stop. 5549 */ 5550 amdgpu_device_stop_pending_resets(tmp_adev); 5551 } 5552 5553 /* Actual ASIC resets if needed. */ 5554 /* Host driver will handle XGMI hive reset for SRIOV */ 5555 if (amdgpu_sriov_vf(adev)) { 5556 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5557 if (r) 5558 adev->asic_reset_res = r; 5559 5560 /* Aldebaran and gfx_11_0_3 support RAS in SRIOV, so RAS needs to be resumed during reset */ 5561 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) || 5562 adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3)) 5563 amdgpu_ras_resume(adev); 5564 } else { 5565 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5566 if (r && r == -EAGAIN) 5567 goto retry; 5568 5569 if (!r && gpu_reset_for_dev_remove) 5570 goto recover_end; 5571 } 5572 5573 skip_hw_reset: 5574 5575 /* Post ASIC reset for all devs. */ 5576 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5577 5578 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5579 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5580 5581 if (!ring || !ring->sched.thread) 5582 continue; 5583 5584 drm_sched_start(&ring->sched, true); 5585 } 5586 5587 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3)) 5588 amdgpu_mes_self_test(tmp_adev); 5589 5590 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) { 5591 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5592 } 5593 5594 if (tmp_adev->asic_reset_res) 5595 r = tmp_adev->asic_reset_res; 5596 5597 tmp_adev->asic_reset_res = 0; 5598 5599 if (r) { 5600 /* bad news, how to tell it to userspace? */ 5601 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5602 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5603 } else { 5604 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5605 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5606 DRM_WARN("smart shift update failed\n"); 5607 } 5608 } 5609 5610 skip_sched_resume: 5611 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5612 /* unlock kfd: SRIOV would do it separately */ 5613 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5614 amdgpu_amdkfd_post_reset(tmp_adev); 5615 5616 /* kfd_post_reset will do nothing if the kfd device is not initialized, 5617 * need to bring up kfd here if it was not initialized before 5618 */ 5619 if (!adev->kfd.init_complete) 5620 amdgpu_amdkfd_device_init(adev); 5621 5622 if (audio_suspended) 5623 amdgpu_device_resume_display_audio(tmp_adev); 5624 5625 amdgpu_device_unset_mp1_state(tmp_adev); 5626 5627 amdgpu_ras_set_error_query_ready(tmp_adev, true); 5628 } 5629 5630 recover_end: 5631 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5632 reset_list); 5633 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5634 5635 if (hive) { 5636 mutex_unlock(&hive->hive_lock); 5637 amdgpu_put_xgmi_hive(hive); 5638 } 5639 5640 if (r) 5641 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5642 5643 atomic_set(&adev->reset_domain->reset_res, r); 5644 return r; 5645 } 5646 5647 /** 5648 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot 5649 * 5650 * @adev: amdgpu_device pointer 5651 * 5652 * Fetches and stores in the driver the PCIE capabilities (gen speed 5653 * and lanes) of the slot the device is in. Handles APUs and 5654 * virtualized environments where PCIE config space may not be available.
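 * The amdgpu_pcie_gen_cap and amdgpu_pcie_lane_cap module parameters, when
 * set, override the values probed here.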
5655 */ 5656 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5657 { 5658 struct pci_dev *pdev; 5659 enum pci_bus_speed speed_cap, platform_speed_cap; 5660 enum pcie_link_width platform_link_width; 5661 5662 if (amdgpu_pcie_gen_cap) 5663 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5664 5665 if (amdgpu_pcie_lane_cap) 5666 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5667 5668 /* covers APUs as well */ 5669 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 5670 if (adev->pm.pcie_gen_mask == 0) 5671 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5672 if (adev->pm.pcie_mlw_mask == 0) 5673 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5674 return; 5675 } 5676 5677 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5678 return; 5679 5680 pcie_bandwidth_available(adev->pdev, NULL, 5681 &platform_speed_cap, &platform_link_width); 5682 5683 if (adev->pm.pcie_gen_mask == 0) { 5684 /* asic caps */ 5685 pdev = adev->pdev; 5686 speed_cap = pcie_get_speed_cap(pdev); 5687 if (speed_cap == PCI_SPEED_UNKNOWN) { 5688 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5689 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5690 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5691 } else { 5692 if (speed_cap == PCIE_SPEED_32_0GT) 5693 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5694 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5695 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5696 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5697 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5698 else if (speed_cap == PCIE_SPEED_16_0GT) 5699 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5700 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5701 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5702 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5703 else if (speed_cap == PCIE_SPEED_8_0GT) 5704 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5705 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5706 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5707 else if (speed_cap == PCIE_SPEED_5_0GT) 5708 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5709 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5710 else 5711 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5712 } 5713 /* platform caps */ 5714 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5715 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5716 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5717 } else { 5718 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5719 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5720 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5721 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5722 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5723 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5724 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5725 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5726 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5727 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5728 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5729 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5730 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5731 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5732 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5733 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5734 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5735 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5736 else 5737 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5738 5739 } 5740 } 5741 if (adev->pm.pcie_mlw_mask == 0) { 5742 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) 

        if (adev->pm.pcie_mlw_mask == 0) {
                if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
                        adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
                } else {
                        switch (platform_link_width) {
                        case PCIE_LNK_X32:
                                adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
                                                CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
                                                CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
                                                CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
                                                CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
                                                CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
                                                CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
                                break;
                        case PCIE_LNK_X16:
                                adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
                                                CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
                                                CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
                                                CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
                                                CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
                                                CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
                                break;
                        case PCIE_LNK_X12:
                                adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
                                                CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
                                                CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
                                                CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
                                                CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
                                break;
                        case PCIE_LNK_X8:
                                adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
                                                CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
                                                CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
                                                CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
                                break;
                        case PCIE_LNK_X4:
                                adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
                                                CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
                                                CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
                                break;
                        case PCIE_LNK_X2:
                                adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
                                                CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
                                break;
                        case PCIE_LNK_X1:
                                adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
                                break;
                        default:
                                break;
                        }
                }
        }
}
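
/*
 * Informational sketch (not part of the original driver): both masks can be
 * forced from user space through the existing pcie_gen_cap/pcie_lane_cap
 * module parameters consumed at the top of amdgpu_device_get_pcie_info().
 * A hypothetical consumer that only cares about the highest platform gen
 * could decode the mask along these lines:
 *
 *	static int example_max_platform_gen(struct amdgpu_device *adev)
 *	{
 *		u32 mask = adev->pm.pcie_gen_mask;
 *
 *		if (mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5)
 *			return 5;
 *		if (mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4)
 *			return 4;
 *		if (mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3)
 *			return 3;
 *		if (mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2)
 *			return 2;
 *		return 1;
 *	}
 *
 * example_max_platform_gen() is an illustrative name only; the power
 * management code consumes the raw mask bits directly.
 */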

/**
 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
 *
 * @adev: amdgpu_device pointer
 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
 *
 * Return true if @peer_adev can access (DMA) @adev through the PCIe
 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
 * @peer_adev.
 */
bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
                                      struct amdgpu_device *peer_adev)
{
#ifdef CONFIG_HSA_AMD_P2P
        uint64_t address_mask = peer_adev->dev->dma_mask ?
                ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
        resource_size_t aper_limit =
                adev->gmc.aper_base + adev->gmc.aper_size - 1;
        bool p2p_access =
                !adev->gmc.xgmi.connected_to_cpu &&
                !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);

        return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
                adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
                !(adev->gmc.aper_base & address_mask ||
                  aper_limit & address_mask));
#else
        return false;
#endif
}

int amdgpu_device_baco_enter(struct drm_device *dev)
{
        struct amdgpu_device *adev = drm_to_adev(dev);
        struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

        if (!amdgpu_device_supports_baco(dev))
                return -ENOTSUPP;

        if (ras && adev->ras_enabled &&
            adev->nbio.funcs->enable_doorbell_interrupt)
                adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

        return amdgpu_dpm_baco_enter(adev);
}

int amdgpu_device_baco_exit(struct drm_device *dev)
{
        struct amdgpu_device *adev = drm_to_adev(dev);
        struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
        int ret = 0;

        if (!amdgpu_device_supports_baco(dev))
                return -ENOTSUPP;

        ret = amdgpu_dpm_baco_exit(adev);
        if (ret)
                return ret;

        if (ras && adev->ras_enabled &&
            adev->nbio.funcs->enable_doorbell_interrupt)
                adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

        if (amdgpu_passthrough(adev) &&
            adev->nbio.funcs->clear_doorbell_interrupt)
                adev->nbio.funcs->clear_doorbell_interrupt(adev);

        return 0;
}
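
/*
 * Usage sketch (illustrative only, assuming a caller that already holds a
 * valid drm_device): the two helpers above are meant to be used as a
 * balanced pair around a period where the GPU can be powered down:
 *
 *	int r;
 *
 *	if (!amdgpu_device_supports_baco(dev))
 *		return -ENOTSUPP;
 *
 *	r = amdgpu_device_baco_enter(dev);
 *	if (r)
 *		return r;
 *
 *	(GPU is in BACO here, e.g. for the duration of a runtime suspend)
 *
 *	r = amdgpu_device_baco_exit(dev);
 *
 * The actual runtime-PM integration lives elsewhere in the driver; this is
 * only the expected call pattern.
 */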

/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called when a PCI error is detected.
 *
 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
        struct drm_device *dev = pci_get_drvdata(pdev);
        struct amdgpu_device *adev = drm_to_adev(dev);
        int i;

        DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

        if (adev->gmc.xgmi.num_physical_nodes > 1) {
                DRM_WARN("No support for XGMI hive yet...");
                return PCI_ERS_RESULT_DISCONNECT;
        }

        adev->pci_channel_state = state;

        switch (state) {
        case pci_channel_io_normal:
                return PCI_ERS_RESULT_CAN_RECOVER;
        /* Fatal error, prepare for slot reset */
        case pci_channel_io_frozen:
                /*
                 * Locking adev->reset_domain->sem will prevent any external access
                 * to GPU during PCI error recovery
                 */
                amdgpu_device_lock_reset_domain(adev->reset_domain);
                amdgpu_device_set_mp1_state(adev);

                /*
                 * Block any work scheduling as we do for regular GPU reset
                 * for the duration of the recovery
                 */
                for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
                        struct amdgpu_ring *ring = adev->rings[i];

                        if (!ring || !ring->sched.thread)
                                continue;

                        drm_sched_stop(&ring->sched, NULL);
                }
                atomic_inc(&adev->gpu_reset_counter);
                return PCI_ERS_RESULT_NEED_RESET;
        case pci_channel_io_perm_failure:
                /* Permanent error, prepare for device removal */
                return PCI_ERS_RESULT_DISCONNECT;
        }

        return PCI_ERS_RESULT_NEED_RESET;
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{
        DRM_INFO("PCI error: mmio enabled callback!!\n");

        /* TODO - dump whatever for debugging purposes */

        /* This is called only if amdgpu_pci_error_detected returns
         * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
         * works, no need to reset slot.
         */

        return PCI_ERS_RESULT_RECOVERED;
}
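
/*
 * Informational sketch for the TODO above (not part of the original code):
 * if debug dumping were added to amdgpu_pci_mmio_enabled(), a minimal,
 * non-invasive liveness check could reuse the same config-space probe that
 * amdgpu_pci_slot_reset() below relies on:
 *
 *	struct drm_device *dev = pci_get_drvdata(pdev);
 *	struct amdgpu_device *adev = drm_to_adev(dev);
 *	u32 memsize = amdgpu_asic_get_config_memsize(adev);
 *
 *	if (memsize == 0xffffffff)
 *		DRM_WARN("device not responding after MMIO re-enable\n");
 *
 * The warning text is illustrative; anything heavier (full register dumps)
 * would have to respect adev->no_hw_access.
 */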

/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the pci error recovery
 * code after the PCI slot has been reset, just before we
 * resume normal operations.
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
        struct drm_device *dev = pci_get_drvdata(pdev);
        struct amdgpu_device *adev = drm_to_adev(dev);
        int r, i;
        struct amdgpu_reset_context reset_context;
        u32 memsize;
        struct list_head device_list;

        DRM_INFO("PCI error: slot reset callback!!\n");

        memset(&reset_context, 0, sizeof(reset_context));

        INIT_LIST_HEAD(&device_list);
        list_add_tail(&adev->reset_list, &device_list);

        /* wait for asic to come out of reset */
        msleep(500);

        /* Restore PCI config space */
        amdgpu_device_load_pci_state(pdev);

        /* confirm ASIC came out of reset */
        for (i = 0; i < adev->usec_timeout; i++) {
                memsize = amdgpu_asic_get_config_memsize(adev);

                if (memsize != 0xffffffff)
                        break;
                udelay(1);
        }
        if (memsize == 0xffffffff) {
                r = -ETIME;
                goto out;
        }

        reset_context.method = AMD_RESET_METHOD_NONE;
        reset_context.reset_req_dev = adev;
        set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
        set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

        adev->no_hw_access = true;
        r = amdgpu_device_pre_asic_reset(adev, &reset_context);
        adev->no_hw_access = false;
        if (r)
                goto out;

        r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
        if (!r) {
                if (amdgpu_device_cache_pci_state(adev->pdev))
                        pci_restore_state(adev->pdev);

                DRM_INFO("PCIe error recovery succeeded\n");
        } else {
                DRM_ERROR("PCIe error recovery failed, err:%d\n", r);
                amdgpu_device_unset_mp1_state(adev);
                amdgpu_device_unlock_reset_domain(adev->reset_domain);
        }

        return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}
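
/*
 * For reference (the wiring lives outside this file): these callbacks are
 * hooked into the PCI core through a struct pci_error_handlers in the
 * driver registration code, roughly:
 *
 *	static const struct pci_error_handlers amdgpu_pci_err_handler = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};
 *
 * On a frozen channel the PCI error recovery core calls error_detected(),
 * then slot_reset(), then resume(); on a recoverable (io_normal) error it
 * calls error_detected(), mmio_enabled() and resume() instead. See
 * Documentation/PCI/pci-error-recovery.rst for the exact state machine.
 */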

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's
 * OK to resume normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
        struct drm_device *dev = pci_get_drvdata(pdev);
        struct amdgpu_device *adev = drm_to_adev(dev);
        int i;

        DRM_INFO("PCI error: resume callback!!\n");

        /* Only continue execution for the case of pci_channel_io_frozen */
        if (adev->pci_channel_state != pci_channel_io_frozen)
                return;

        for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
                struct amdgpu_ring *ring = adev->rings[i];

                if (!ring || !ring->sched.thread)
                        continue;

                drm_sched_start(&ring->sched, true);
        }

        amdgpu_device_unset_mp1_state(adev);
        amdgpu_device_unlock_reset_domain(adev->reset_domain);
}

bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
        struct drm_device *dev = pci_get_drvdata(pdev);
        struct amdgpu_device *adev = drm_to_adev(dev);
        int r;

        r = pci_save_state(pdev);
        if (!r) {
                kfree(adev->pci_state);

                adev->pci_state = pci_store_saved_state(pdev);

                if (!adev->pci_state) {
                        DRM_ERROR("Failed to store PCI saved state");
                        return false;
                }
        } else {
                DRM_WARN("Failed to save PCI state, err:%d\n", r);
                return false;
        }

        return true;
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
        struct drm_device *dev = pci_get_drvdata(pdev);
        struct amdgpu_device *adev = drm_to_adev(dev);
        int r;

        if (!adev->pci_state)
                return false;

        r = pci_load_saved_state(pdev, adev->pci_state);

        if (!r) {
                pci_restore_state(pdev);
        } else {
                DRM_WARN("Failed to load PCI state, err:%d\n", r);
                return false;
        }

        return true;
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
                struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
        if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
                return;
#endif
        if (adev->gmc.xgmi.connected_to_cpu)
                return;

        if (ring && ring->funcs->emit_hdp_flush)
                amdgpu_ring_emit_hdp_flush(ring);
        else
                amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
                struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
        if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
                return;
#endif
        if (adev->gmc.xgmi.connected_to_cpu)
                return;

        amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
        return atomic_read(&adev->reset_domain->in_gpu_reset);
}
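
/*
 * Usage sketch (illustrative, hypothetical caller): amdgpu_in_reset() is the
 * check used to keep code paths off the hardware while a reset owned by the
 * reset domain is in flight, typically something like:
 *
 *	if (amdgpu_in_reset(adev) || adev->in_suspend)
 *		return -EBUSY;
 *
 * The actual callers and error codes vary; the point is that
 * adev->reset_domain->in_gpu_reset is the single source of truth.
 */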

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It helps to preserve the error context when an error occurs.
 * Compared to a simple hang, the system stays stable, at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc.),
 *    clears all CPU mappings to the device, and disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in-flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
        struct pci_dev *pdev = adev->pdev;
        struct drm_device *ddev = adev_to_drm(adev);

        amdgpu_xcp_dev_unplug(adev);
        drm_dev_unplug(ddev);

        amdgpu_irq_disable_all(adev);

        amdgpu_fence_driver_hw_fini(adev);

        adev->no_hw_access = true;

        amdgpu_device_unmap_mmio(adev);

        pci_disable_device(pdev);
        pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
                u32 reg)
{
        unsigned long flags, address, data;
        u32 r;

        address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
        data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

        spin_lock_irqsave(&adev->pcie_idx_lock, flags);
        WREG32(address, reg * 4);
        (void)RREG32(address);
        r = RREG32(data);
        spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
        return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
                u32 reg, u32 v)
{
        unsigned long flags, address, data;

        address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
        data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

        spin_lock_irqsave(&adev->pcie_idx_lock, flags);
        WREG32(address, reg * 4);
        (void)RREG32(address);
        WREG32(data, v);
        (void)RREG32(data);
        spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
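
/*
 * Usage sketch (illustrative, register names hypothetical): the two helpers
 * above implement the usual index/data pair for PCIe port registers, with a
 * read-back after each WREG32() so the write has posted before the lock is
 * dropped. A read-modify-write through them looks like:
 *
 *	u32 val;
 *
 *	val = amdgpu_device_pcie_port_rreg(adev, example_port_reg);
 *	val |= EXAMPLE_PORT_REG__SOME_FIELD_MASK;
 *	amdgpu_device_pcie_port_wreg(adev, example_port_reg, val);
 *
 * example_port_reg and EXAMPLE_PORT_REG__SOME_FIELD_MASK stand in for real
 * NBIO register definitions.
 */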

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
                                            struct dma_fence *gang)
{
        struct dma_fence *old = NULL;

        do {
                dma_fence_put(old);
                rcu_read_lock();
                old = dma_fence_get_rcu_safe(&adev->gang_submit);
                rcu_read_unlock();

                if (old == gang)
                        break;

                if (!dma_fence_is_signaled(old))
                        return old;

        } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
                         old, gang) != old);

        dma_fence_put(old);
        return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
        switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
        case CHIP_HAINAN:
#endif
        case CHIP_TOPAZ:
                /* chips with no display hardware */
                return false;
#ifdef CONFIG_DRM_AMDGPU_SI
        case CHIP_TAHITI:
        case CHIP_PITCAIRN:
        case CHIP_VERDE:
        case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
        case CHIP_BONAIRE:
        case CHIP_HAWAII:
        case CHIP_KAVERI:
        case CHIP_KABINI:
        case CHIP_MULLINS:
#endif
        case CHIP_TONGA:
        case CHIP_FIJI:
        case CHIP_POLARIS10:
        case CHIP_POLARIS11:
        case CHIP_POLARIS12:
        case CHIP_VEGAM:
        case CHIP_CARRIZO:
        case CHIP_STONEY:
                /* chips with display hardware */
                return true;
        default:
                /* IP discovery */
                if (!adev->ip_versions[DCE_HWIP][0] ||
                    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
                        return false;
                return true;
        }
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
                uint32_t inst, uint32_t reg_addr, char reg_name[],
                uint32_t expected_value, uint32_t mask)
{
        uint32_t ret = 0;
        uint32_t old_ = 0;
        uint32_t tmp_ = RREG32(reg_addr);
        uint32_t loop = adev->usec_timeout;

        while ((tmp_ & (mask)) != (expected_value)) {
                if (old_ != tmp_) {
                        loop = adev->usec_timeout;
                        old_ = tmp_;
                } else
                        udelay(1);
                tmp_ = RREG32(reg_addr);
                loop--;
                if (!loop) {
                        DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
                                 inst, reg_name, (uint32_t)expected_value,
                                 (uint32_t)(tmp_ & (mask)));
                        ret = -ETIMEDOUT;
                        break;
                }
        }
        return ret;
}
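
/*
 * Usage sketch (illustrative, register/field names hypothetical): callers use
 * amdgpu_device_wait_on_rreg() to poll a register until a masked field
 * reaches an expected value, re-arming the timeout whenever the value changes
 * and giving up after adev->usec_timeout unchanged reads:
 *
 *	uint32_t r;
 *
 *	r = amdgpu_device_wait_on_rreg(adev, 0, example_status_offset,
 *				       "EXAMPLE_STATUS", EXAMPLE_READY_VALUE,
 *				       EXAMPLE_READY_MASK);
 *	if (r)
 *		return -ETIMEDOUT;
 *
 * example_status_offset and the EXAMPLE_* values stand in for real register
 * definitions.
 */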