1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 #include <linux/iommu.h> 34 #include <linux/pci.h> 35 #include <linux/devcoredump.h> 36 #include <generated/utsrelease.h> 37 #include <linux/pci-p2pdma.h> 38 #include <linux/apple-gmux.h> 39 40 #include <drm/drm_aperture.h> 41 #include <drm/drm_atomic_helper.h> 42 #include <drm/drm_crtc_helper.h> 43 #include <drm/drm_fb_helper.h> 44 #include <drm/drm_probe_helper.h> 45 #include <drm/amdgpu_drm.h> 46 #include <linux/vgaarb.h> 47 #include <linux/vga_switcheroo.h> 48 #include <linux/efi.h> 49 #include "amdgpu.h" 50 #include "amdgpu_trace.h" 51 #include "amdgpu_i2c.h" 52 #include "atom.h" 53 #include "amdgpu_atombios.h" 54 #include "amdgpu_atomfirmware.h" 55 #include "amd_pcie.h" 56 #ifdef CONFIG_DRM_AMDGPU_SI 57 #include "si.h" 58 #endif 59 #ifdef CONFIG_DRM_AMDGPU_CIK 60 #include "cik.h" 61 #endif 62 #include "vi.h" 63 #include "soc15.h" 64 #include "nv.h" 65 #include "bif/bif_4_1_d.h" 66 #include <linux/firmware.h> 67 #include "amdgpu_vf_error.h" 68 69 #include "amdgpu_amdkfd.h" 70 #include "amdgpu_pm.h" 71 72 #include "amdgpu_xgmi.h" 73 #include "amdgpu_ras.h" 74 #include "amdgpu_pmu.h" 75 #include "amdgpu_fru_eeprom.h" 76 #include "amdgpu_reset.h" 77 78 #include <linux/suspend.h> 79 #include <drm/task_barrier.h> 80 #include <linux/pm_runtime.h> 81 82 #include <drm/drm_drv.h> 83 84 #if IS_ENABLED(CONFIG_X86) 85 #include <asm/intel-family.h> 86 #endif 87 88 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 89 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 90 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 91 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 95 96 #define AMDGPU_RESUME_MS 2000 97 #define AMDGPU_MAX_RETRY_LIMIT 2 98 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 99 100 static const struct drm_driver amdgpu_kms_driver; 101 102 const char *amdgpu_asic_name[] = { 103 "TAHITI", 104 "PITCAIRN", 105 "VERDE", 106 "OLAND", 107 "HAINAN", 108 "BONAIRE", 109 "KAVERI", 110 
"KABINI", 111 "HAWAII", 112 "MULLINS", 113 "TOPAZ", 114 "TONGA", 115 "FIJI", 116 "CARRIZO", 117 "STONEY", 118 "POLARIS10", 119 "POLARIS11", 120 "POLARIS12", 121 "VEGAM", 122 "VEGA10", 123 "VEGA12", 124 "VEGA20", 125 "RAVEN", 126 "ARCTURUS", 127 "RENOIR", 128 "ALDEBARAN", 129 "NAVI10", 130 "CYAN_SKILLFISH", 131 "NAVI14", 132 "NAVI12", 133 "SIENNA_CICHLID", 134 "NAVY_FLOUNDER", 135 "VANGOGH", 136 "DIMGREY_CAVEFISH", 137 "BEIGE_GOBY", 138 "YELLOW_CARP", 139 "IP DISCOVERY", 140 "LAST", 141 }; 142 143 /** 144 * DOC: pcie_replay_count 145 * 146 * The amdgpu driver provides a sysfs API for reporting the total number 147 * of PCIe replays (NAKs) 148 * The file pcie_replay_count is used for this and returns the total 149 * number of replays as a sum of the NAKs generated and NAKs received 150 */ 151 152 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 153 struct device_attribute *attr, char *buf) 154 { 155 struct drm_device *ddev = dev_get_drvdata(dev); 156 struct amdgpu_device *adev = drm_to_adev(ddev); 157 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 158 159 return sysfs_emit(buf, "%llu\n", cnt); 160 } 161 162 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 163 amdgpu_device_get_pcie_replay_count, NULL); 164 165 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 166 167 /** 168 * DOC: product_name 169 * 170 * The amdgpu driver provides a sysfs API for reporting the product name 171 * for the device 172 * The file product_name is used for this and returns the product name 173 * as returned from the FRU. 174 * NOTE: This is only available for certain server cards 175 */ 176 177 static ssize_t amdgpu_device_get_product_name(struct device *dev, 178 struct device_attribute *attr, char *buf) 179 { 180 struct drm_device *ddev = dev_get_drvdata(dev); 181 struct amdgpu_device *adev = drm_to_adev(ddev); 182 183 return sysfs_emit(buf, "%s\n", adev->product_name); 184 } 185 186 static DEVICE_ATTR(product_name, S_IRUGO, 187 amdgpu_device_get_product_name, NULL); 188 189 /** 190 * DOC: product_number 191 * 192 * The amdgpu driver provides a sysfs API for reporting the part number 193 * for the device 194 * The file product_number is used for this and returns the part number 195 * as returned from the FRU. 196 * NOTE: This is only available for certain server cards 197 */ 198 199 static ssize_t amdgpu_device_get_product_number(struct device *dev, 200 struct device_attribute *attr, char *buf) 201 { 202 struct drm_device *ddev = dev_get_drvdata(dev); 203 struct amdgpu_device *adev = drm_to_adev(ddev); 204 205 return sysfs_emit(buf, "%s\n", adev->product_number); 206 } 207 208 static DEVICE_ATTR(product_number, S_IRUGO, 209 amdgpu_device_get_product_number, NULL); 210 211 /** 212 * DOC: serial_number 213 * 214 * The amdgpu driver provides a sysfs API for reporting the serial number 215 * for the device 216 * The file serial_number is used for this and returns the serial number 217 * as returned from the FRU. 
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_serial_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->serial);
}

static DEVICE_ATTR(serial_number, S_IRUGO,
		amdgpu_device_get_serial_number, NULL);

/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise return false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

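/*
 * Illustrative note (not part of the upstream file): a caller deciding which
 * runtime power-off scheme a dGPU should use might check the helpers above in
 * order of preference, for example:
 *
 *	if (amdgpu_device_supports_px(ddev))
 *		use legacy ATPX dGPU power control;
 *	else if (amdgpu_device_supports_boco(ddev))
 *		use ACPI BOCO/PR3 power resources;
 *	else if (amdgpu_device_supports_baco(ddev))
 *		use BACO;
 */
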
/**
 * amdgpu_device_supports_smart_shift - Is the device dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram via the vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

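/*
 * Illustrative sketch (not part of the upstream file): reading a single
 * dword at VRAM offset pos with the helper above might look like:
 *
 *	uint32_t val;
 *
 *	amdgpu_device_vram_access(adev, pos, &val, sizeof(val), false);
 *
 * The helper prefers the CPU-visible aperture and transparently falls back
 * to the MM_INDEX/MM_DATA path for the portion that is not CPU accessible.
 */
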
/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
540 */ 541 void amdgpu_device_wreg(struct amdgpu_device *adev, 542 uint32_t reg, uint32_t v, 543 uint32_t acc_flags) 544 { 545 if (amdgpu_device_skip_hw_access(adev)) 546 return; 547 548 if ((reg * 4) < adev->rmmio_size) { 549 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 550 amdgpu_sriov_runtime(adev) && 551 down_read_trylock(&adev->reset_domain->sem)) { 552 amdgpu_kiq_wreg(adev, reg, v); 553 up_read(&adev->reset_domain->sem); 554 } else { 555 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 556 } 557 } else { 558 adev->pcie_wreg(adev, reg * 4, v); 559 } 560 561 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 562 } 563 564 /** 565 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 566 * 567 * @adev: amdgpu_device pointer 568 * @reg: mmio/rlc register 569 * @v: value to write 570 * 571 * this function is invoked only for the debugfs register access 572 */ 573 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 574 uint32_t reg, uint32_t v) 575 { 576 if (amdgpu_device_skip_hw_access(adev)) 577 return; 578 579 if (amdgpu_sriov_fullaccess(adev) && 580 adev->gfx.rlc.funcs && 581 adev->gfx.rlc.funcs->is_rlcg_access_range) { 582 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 583 return amdgpu_sriov_wreg(adev, reg, v, 0, 0); 584 } else if ((reg * 4) >= adev->rmmio_size) { 585 adev->pcie_wreg(adev, reg * 4, v); 586 } else { 587 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 588 } 589 } 590 591 /** 592 * amdgpu_mm_rdoorbell - read a doorbell dword 593 * 594 * @adev: amdgpu_device pointer 595 * @index: doorbell index 596 * 597 * Returns the value in the doorbell aperture at the 598 * requested doorbell index (CIK). 599 */ 600 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) 601 { 602 if (amdgpu_device_skip_hw_access(adev)) 603 return 0; 604 605 if (index < adev->doorbell.num_kernel_doorbells) { 606 return readl(adev->doorbell.ptr + index); 607 } else { 608 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 609 return 0; 610 } 611 } 612 613 /** 614 * amdgpu_mm_wdoorbell - write a doorbell dword 615 * 616 * @adev: amdgpu_device pointer 617 * @index: doorbell index 618 * @v: value to write 619 * 620 * Writes @v to the doorbell aperture at the 621 * requested doorbell index (CIK). 622 */ 623 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) 624 { 625 if (amdgpu_device_skip_hw_access(adev)) 626 return; 627 628 if (index < adev->doorbell.num_kernel_doorbells) { 629 writel(v, adev->doorbell.ptr + index); 630 } else { 631 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 632 } 633 } 634 635 /** 636 * amdgpu_mm_rdoorbell64 - read a doorbell Qword 637 * 638 * @adev: amdgpu_device pointer 639 * @index: doorbell index 640 * 641 * Returns the value in the doorbell aperture at the 642 * requested doorbell index (VEGA10+). 643 */ 644 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) 645 { 646 if (amdgpu_device_skip_hw_access(adev)) 647 return 0; 648 649 if (index < adev->doorbell.num_kernel_doorbells) { 650 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); 651 } else { 652 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 653 return 0; 654 } 655 } 656 657 /** 658 * amdgpu_mm_wdoorbell64 - write a doorbell Qword 659 * 660 * @adev: amdgpu_device pointer 661 * @index: doorbell index 662 * @v: value to write 663 * 664 * Writes @v to the doorbell aperture at the 665 * requested doorbell index (VEGA10+). 
 */
void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (index < adev->doorbell.num_kernel_doorbells) {
		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64 bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

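/*
 * Illustrative sketch (not part of the upstream file): a read-modify-write
 * of an indirect register using the helpers above could be written as:
 *
 *	u32 val;
 *
 *	val = amdgpu_device_indirect_rreg(adev, reg_addr);
 *	val = (val & ~mask) | (bits & mask);
 *	amdgpu_device_indirect_wreg(adev, reg_addr, val);
 *
 * Note that the index/data pair is only locked per access, so such a
 * sequence is not atomic with respect to other users of the same register.
 */
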
/**
 * amdgpu_device_indirect_wreg64 - write a 64 bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy 64 bit reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	amdgpu_asic_pre_asic_init(adev);

	if (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0))
		return amdgpu_atomfirmware_asic_init(adev, true);
	else
		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
990 */ 991 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 992 const u32 *registers, 993 const u32 array_size) 994 { 995 u32 tmp, reg, and_mask, or_mask; 996 int i; 997 998 if (array_size % 3) 999 return; 1000 1001 for (i = 0; i < array_size; i += 3) { 1002 reg = registers[i + 0]; 1003 and_mask = registers[i + 1]; 1004 or_mask = registers[i + 2]; 1005 1006 if (and_mask == 0xffffffff) { 1007 tmp = or_mask; 1008 } else { 1009 tmp = RREG32(reg); 1010 tmp &= ~and_mask; 1011 if (adev->family >= AMDGPU_FAMILY_AI) 1012 tmp |= (or_mask & and_mask); 1013 else 1014 tmp |= or_mask; 1015 } 1016 WREG32(reg, tmp); 1017 } 1018 } 1019 1020 /** 1021 * amdgpu_device_pci_config_reset - reset the GPU 1022 * 1023 * @adev: amdgpu_device pointer 1024 * 1025 * Resets the GPU using the pci config reset sequence. 1026 * Only applicable to asics prior to vega10. 1027 */ 1028 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 1029 { 1030 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 1031 } 1032 1033 /** 1034 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 1035 * 1036 * @adev: amdgpu_device pointer 1037 * 1038 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 1039 */ 1040 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1041 { 1042 return pci_reset_function(adev->pdev); 1043 } 1044 1045 /* 1046 * GPU doorbell aperture helpers function. 1047 */ 1048 /** 1049 * amdgpu_device_doorbell_init - Init doorbell driver information. 1050 * 1051 * @adev: amdgpu_device pointer 1052 * 1053 * Init doorbell driver information (CIK) 1054 * Returns 0 on success, error on failure. 1055 */ 1056 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) 1057 { 1058 1059 /* No doorbell on SI hardware generation */ 1060 if (adev->asic_type < CHIP_BONAIRE) { 1061 adev->doorbell.base = 0; 1062 adev->doorbell.size = 0; 1063 adev->doorbell.num_kernel_doorbells = 0; 1064 adev->doorbell.ptr = NULL; 1065 return 0; 1066 } 1067 1068 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) 1069 return -EINVAL; 1070 1071 amdgpu_asic_init_doorbell_index(adev); 1072 1073 /* doorbell bar mapping */ 1074 adev->doorbell.base = pci_resource_start(adev->pdev, 2); 1075 adev->doorbell.size = pci_resource_len(adev->pdev, 2); 1076 1077 if (adev->enable_mes) { 1078 adev->doorbell.num_kernel_doorbells = 1079 adev->doorbell.size / sizeof(u32); 1080 } else { 1081 adev->doorbell.num_kernel_doorbells = 1082 min_t(u32, adev->doorbell.size / sizeof(u32), 1083 adev->doorbell_index.max_assignment+1); 1084 if (adev->doorbell.num_kernel_doorbells == 0) 1085 return -EINVAL; 1086 1087 /* For Vega, reserve and map two pages on doorbell BAR since SDMA 1088 * paging queue doorbell use the second page. The 1089 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the 1090 * doorbells are in the first page. So with paging queue enabled, 1091 * the max num_kernel_doorbells should + 1 page (0x400 in dword) 1092 */ 1093 if (adev->asic_type >= CHIP_VEGA10) 1094 adev->doorbell.num_kernel_doorbells += 0x400; 1095 } 1096 1097 adev->doorbell.ptr = ioremap(adev->doorbell.base, 1098 adev->doorbell.num_kernel_doorbells * 1099 sizeof(u32)); 1100 if (adev->doorbell.ptr == NULL) 1101 return -ENOMEM; 1102 1103 return 0; 1104 } 1105 1106 /** 1107 * amdgpu_device_doorbell_fini - Tear down doorbell driver information. 
1108 * 1109 * @adev: amdgpu_device pointer 1110 * 1111 * Tear down doorbell driver information (CIK) 1112 */ 1113 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) 1114 { 1115 iounmap(adev->doorbell.ptr); 1116 adev->doorbell.ptr = NULL; 1117 } 1118 1119 1120 1121 /* 1122 * amdgpu_device_wb_*() 1123 * Writeback is the method by which the GPU updates special pages in memory 1124 * with the status of certain GPU events (fences, ring pointers,etc.). 1125 */ 1126 1127 /** 1128 * amdgpu_device_wb_fini - Disable Writeback and free memory 1129 * 1130 * @adev: amdgpu_device pointer 1131 * 1132 * Disables Writeback and frees the Writeback memory (all asics). 1133 * Used at driver shutdown. 1134 */ 1135 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1136 { 1137 if (adev->wb.wb_obj) { 1138 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1139 &adev->wb.gpu_addr, 1140 (void **)&adev->wb.wb); 1141 adev->wb.wb_obj = NULL; 1142 } 1143 } 1144 1145 /** 1146 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1147 * 1148 * @adev: amdgpu_device pointer 1149 * 1150 * Initializes writeback and allocates writeback memory (all asics). 1151 * Used at driver startup. 1152 * Returns 0 on success or an -error on failure. 1153 */ 1154 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1155 { 1156 int r; 1157 1158 if (adev->wb.wb_obj == NULL) { 1159 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1160 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1161 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1162 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1163 (void **)&adev->wb.wb); 1164 if (r) { 1165 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1166 return r; 1167 } 1168 1169 adev->wb.num_wb = AMDGPU_MAX_WB; 1170 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1171 1172 /* clear wb memory */ 1173 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1174 } 1175 1176 return 0; 1177 } 1178 1179 /** 1180 * amdgpu_device_wb_get - Allocate a wb entry 1181 * 1182 * @adev: amdgpu_device pointer 1183 * @wb: wb index 1184 * 1185 * Allocate a wb slot for use by the driver (all asics). 1186 * Returns 0 on success or -EINVAL on failure. 1187 */ 1188 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1189 { 1190 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1191 1192 if (offset < adev->wb.num_wb) { 1193 __set_bit(offset, adev->wb.used); 1194 *wb = offset << 3; /* convert to dw offset */ 1195 return 0; 1196 } else { 1197 return -EINVAL; 1198 } 1199 } 1200 1201 /** 1202 * amdgpu_device_wb_free - Free a wb entry 1203 * 1204 * @adev: amdgpu_device pointer 1205 * @wb: wb index 1206 * 1207 * Free a wb slot allocated for use by the driver (all asics) 1208 */ 1209 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1210 { 1211 wb >>= 3; 1212 if (wb < adev->wb.num_wb) 1213 __clear_bit(wb, adev->wb.used); 1214 } 1215 1216 /** 1217 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1218 * 1219 * @adev: amdgpu_device pointer 1220 * 1221 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1222 * to fail, but if any of the BARs is not accessible after the size we abort 1223 * driver loading by returning -ENODEV. 
1224 */ 1225 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1226 { 1227 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1228 struct pci_bus *root; 1229 struct resource *res; 1230 unsigned i; 1231 u16 cmd; 1232 int r; 1233 1234 /* Bypass for VF */ 1235 if (amdgpu_sriov_vf(adev)) 1236 return 0; 1237 1238 /* skip if the bios has already enabled large BAR */ 1239 if (adev->gmc.real_vram_size && 1240 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1241 return 0; 1242 1243 /* Check if the root BUS has 64bit memory resources */ 1244 root = adev->pdev->bus; 1245 while (root->parent) 1246 root = root->parent; 1247 1248 pci_bus_for_each_resource(root, res, i) { 1249 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1250 res->start > 0x100000000ull) 1251 break; 1252 } 1253 1254 /* Trying to resize is pointless without a root hub window above 4GB */ 1255 if (!res) 1256 return 0; 1257 1258 /* Limit the BAR size to what is available */ 1259 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1260 rbar_size); 1261 1262 /* Disable memory decoding while we change the BAR addresses and size */ 1263 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1264 pci_write_config_word(adev->pdev, PCI_COMMAND, 1265 cmd & ~PCI_COMMAND_MEMORY); 1266 1267 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1268 amdgpu_device_doorbell_fini(adev); 1269 if (adev->asic_type >= CHIP_BONAIRE) 1270 pci_release_resource(adev->pdev, 2); 1271 1272 pci_release_resource(adev->pdev, 0); 1273 1274 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1275 if (r == -ENOSPC) 1276 DRM_INFO("Not enough PCI address space for a large BAR."); 1277 else if (r && r != -ENOTSUPP) 1278 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1279 1280 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1281 1282 /* When the doorbell or fb BAR isn't available we have no chance of 1283 * using the device. 1284 */ 1285 r = amdgpu_device_doorbell_init(adev); 1286 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1287 return -ENODEV; 1288 1289 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1290 1291 return 0; 1292 } 1293 1294 /* 1295 * GPU helpers function. 1296 */ 1297 /** 1298 * amdgpu_device_need_post - check if the hw need post or not 1299 * 1300 * @adev: amdgpu_device pointer 1301 * 1302 * Check if the asic has been initialized (all asics) at driver startup 1303 * or post is needed if hw reset is performed. 1304 * Returns true if need or false if not. 
1305 */ 1306 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1307 { 1308 uint32_t reg; 1309 1310 if (amdgpu_sriov_vf(adev)) 1311 return false; 1312 1313 if (amdgpu_passthrough(adev)) { 1314 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1315 * some old smc fw still need driver do vPost otherwise gpu hang, while 1316 * those smc fw version above 22.15 doesn't have this flaw, so we force 1317 * vpost executed for smc version below 22.15 1318 */ 1319 if (adev->asic_type == CHIP_FIJI) { 1320 int err; 1321 uint32_t fw_ver; 1322 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1323 /* force vPost if error occured */ 1324 if (err) 1325 return true; 1326 1327 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1328 if (fw_ver < 0x00160e00) 1329 return true; 1330 } 1331 } 1332 1333 /* Don't post if we need to reset whole hive on init */ 1334 if (adev->gmc.xgmi.pending_reset) 1335 return false; 1336 1337 if (adev->has_hw_reset) { 1338 adev->has_hw_reset = false; 1339 return true; 1340 } 1341 1342 /* bios scratch used on CIK+ */ 1343 if (adev->asic_type >= CHIP_BONAIRE) 1344 return amdgpu_atombios_scratch_need_asic_init(adev); 1345 1346 /* check MEM_SIZE for older asics */ 1347 reg = amdgpu_asic_get_config_memsize(adev); 1348 1349 if ((reg != 0) && (reg != 0xffffffff)) 1350 return false; 1351 1352 return true; 1353 } 1354 1355 /** 1356 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1357 * 1358 * @adev: amdgpu_device pointer 1359 * 1360 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1361 * be set for this device. 1362 * 1363 * Returns true if it should be used or false if not. 1364 */ 1365 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1366 { 1367 switch (amdgpu_aspm) { 1368 case -1: 1369 break; 1370 case 0: 1371 return false; 1372 case 1: 1373 return true; 1374 default: 1375 return false; 1376 } 1377 return pcie_aspm_enabled(adev->pdev); 1378 } 1379 1380 bool amdgpu_device_aspm_support_quirk(void) 1381 { 1382 #if IS_ENABLED(CONFIG_X86) 1383 struct cpuinfo_x86 *c = &cpu_data(0); 1384 1385 return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE); 1386 #else 1387 return true; 1388 #endif 1389 } 1390 1391 /* if we get transitioned to only one device, take VGA back */ 1392 /** 1393 * amdgpu_device_vga_set_decode - enable/disable vga decode 1394 * 1395 * @pdev: PCI device pointer 1396 * @state: enable/disable vga decode 1397 * 1398 * Enable/disable vga decode (all asics). 1399 * Returns VGA resource flags. 1400 */ 1401 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1402 bool state) 1403 { 1404 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1405 amdgpu_asic_set_vga_state(adev, state); 1406 if (state) 1407 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1408 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1409 else 1410 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1411 } 1412 1413 /** 1414 * amdgpu_device_check_block_size - validate the vm block size 1415 * 1416 * @adev: amdgpu_device pointer 1417 * 1418 * Validates the vm block size specified via module parameter. 1419 * The vm block size defines number of bits in page table versus page directory, 1420 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1421 * page table and the remaining bits are in the page directory. 
1422 */ 1423 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1424 { 1425 /* defines number of bits in page table versus page directory, 1426 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1427 * page table and the remaining bits are in the page directory */ 1428 if (amdgpu_vm_block_size == -1) 1429 return; 1430 1431 if (amdgpu_vm_block_size < 9) { 1432 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1433 amdgpu_vm_block_size); 1434 amdgpu_vm_block_size = -1; 1435 } 1436 } 1437 1438 /** 1439 * amdgpu_device_check_vm_size - validate the vm size 1440 * 1441 * @adev: amdgpu_device pointer 1442 * 1443 * Validates the vm size in GB specified via module parameter. 1444 * The VM size is the size of the GPU virtual memory space in GB. 1445 */ 1446 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1447 { 1448 /* no need to check the default value */ 1449 if (amdgpu_vm_size == -1) 1450 return; 1451 1452 if (amdgpu_vm_size < 1) { 1453 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1454 amdgpu_vm_size); 1455 amdgpu_vm_size = -1; 1456 } 1457 } 1458 1459 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1460 { 1461 struct sysinfo si; 1462 bool is_os_64 = (sizeof(void *) == 8); 1463 uint64_t total_memory; 1464 uint64_t dram_size_seven_GB = 0x1B8000000; 1465 uint64_t dram_size_three_GB = 0xB8000000; 1466 1467 if (amdgpu_smu_memory_pool_size == 0) 1468 return; 1469 1470 if (!is_os_64) { 1471 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1472 goto def_value; 1473 } 1474 si_meminfo(&si); 1475 total_memory = (uint64_t)si.totalram * si.mem_unit; 1476 1477 if ((amdgpu_smu_memory_pool_size == 1) || 1478 (amdgpu_smu_memory_pool_size == 2)) { 1479 if (total_memory < dram_size_three_GB) 1480 goto def_value1; 1481 } else if ((amdgpu_smu_memory_pool_size == 4) || 1482 (amdgpu_smu_memory_pool_size == 8)) { 1483 if (total_memory < dram_size_seven_GB) 1484 goto def_value1; 1485 } else { 1486 DRM_WARN("Smu memory pool size not supported\n"); 1487 goto def_value; 1488 } 1489 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1490 1491 return; 1492 1493 def_value1: 1494 DRM_WARN("No enough system memory\n"); 1495 def_value: 1496 adev->pm.smu_prv_buffer_size = 0; 1497 } 1498 1499 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1500 { 1501 if (!(adev->flags & AMD_IS_APU) || 1502 adev->asic_type < CHIP_RAVEN) 1503 return 0; 1504 1505 switch (adev->asic_type) { 1506 case CHIP_RAVEN: 1507 if (adev->pdev->device == 0x15dd) 1508 adev->apu_flags |= AMD_APU_IS_RAVEN; 1509 if (adev->pdev->device == 0x15d8) 1510 adev->apu_flags |= AMD_APU_IS_PICASSO; 1511 break; 1512 case CHIP_RENOIR: 1513 if ((adev->pdev->device == 0x1636) || 1514 (adev->pdev->device == 0x164c)) 1515 adev->apu_flags |= AMD_APU_IS_RENOIR; 1516 else 1517 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1518 break; 1519 case CHIP_VANGOGH: 1520 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1521 break; 1522 case CHIP_YELLOW_CARP: 1523 break; 1524 case CHIP_CYAN_SKILLFISH: 1525 if ((adev->pdev->device == 0x13FE) || 1526 (adev->pdev->device == 0x143F)) 1527 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1528 break; 1529 default: 1530 break; 1531 } 1532 1533 return 0; 1534 } 1535 1536 /** 1537 * amdgpu_device_check_arguments - validate module params 1538 * 1539 * @adev: amdgpu_device pointer 1540 * 1541 * Validates certain module parameters and updates 1542 * the associated values used by the driver (all asics). 
1543 */ 1544 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1545 { 1546 if (amdgpu_sched_jobs < 4) { 1547 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1548 amdgpu_sched_jobs); 1549 amdgpu_sched_jobs = 4; 1550 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 1551 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1552 amdgpu_sched_jobs); 1553 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1554 } 1555 1556 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1557 /* gart size must be greater or equal to 32M */ 1558 dev_warn(adev->dev, "gart size (%d) too small\n", 1559 amdgpu_gart_size); 1560 amdgpu_gart_size = -1; 1561 } 1562 1563 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1564 /* gtt size must be greater or equal to 32M */ 1565 dev_warn(adev->dev, "gtt size (%d) too small\n", 1566 amdgpu_gtt_size); 1567 amdgpu_gtt_size = -1; 1568 } 1569 1570 /* valid range is between 4 and 9 inclusive */ 1571 if (amdgpu_vm_fragment_size != -1 && 1572 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1573 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1574 amdgpu_vm_fragment_size = -1; 1575 } 1576 1577 if (amdgpu_sched_hw_submission < 2) { 1578 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1579 amdgpu_sched_hw_submission); 1580 amdgpu_sched_hw_submission = 2; 1581 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1582 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1583 amdgpu_sched_hw_submission); 1584 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1585 } 1586 1587 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 1588 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 1589 amdgpu_reset_method = -1; 1590 } 1591 1592 amdgpu_device_check_smu_prv_buffer_size(adev); 1593 1594 amdgpu_device_check_vm_size(adev); 1595 1596 amdgpu_device_check_block_size(adev); 1597 1598 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1599 1600 return 0; 1601 } 1602 1603 /** 1604 * amdgpu_switcheroo_set_state - set switcheroo state 1605 * 1606 * @pdev: pci dev pointer 1607 * @state: vga_switcheroo state 1608 * 1609 * Callback for the switcheroo driver. Suspends or resumes 1610 * the asics before or after it is powered up using ACPI methods. 
1611 */ 1612 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1613 enum vga_switcheroo_state state) 1614 { 1615 struct drm_device *dev = pci_get_drvdata(pdev); 1616 int r; 1617 1618 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1619 return; 1620 1621 if (state == VGA_SWITCHEROO_ON) { 1622 pr_info("switched on\n"); 1623 /* don't suspend or resume card normally */ 1624 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1625 1626 pci_set_power_state(pdev, PCI_D0); 1627 amdgpu_device_load_pci_state(pdev); 1628 r = pci_enable_device(pdev); 1629 if (r) 1630 DRM_WARN("pci_enable_device failed (%d)\n", r); 1631 amdgpu_device_resume(dev, true); 1632 1633 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1634 } else { 1635 pr_info("switched off\n"); 1636 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1637 amdgpu_device_suspend(dev, true); 1638 amdgpu_device_cache_pci_state(pdev); 1639 /* Shut down the device */ 1640 pci_disable_device(pdev); 1641 pci_set_power_state(pdev, PCI_D3cold); 1642 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1643 } 1644 } 1645 1646 /** 1647 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1648 * 1649 * @pdev: pci dev pointer 1650 * 1651 * Callback for the switcheroo driver. Check of the switcheroo 1652 * state can be changed. 1653 * Returns true if the state can be changed, false if not. 1654 */ 1655 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1656 { 1657 struct drm_device *dev = pci_get_drvdata(pdev); 1658 1659 /* 1660 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1661 * locking inversion with the driver load path. And the access here is 1662 * completely racy anyway. So don't bother with locking for now. 1663 */ 1664 return atomic_read(&dev->open_count) == 0; 1665 } 1666 1667 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1668 .set_gpu_state = amdgpu_switcheroo_set_state, 1669 .reprobe = NULL, 1670 .can_switch = amdgpu_switcheroo_can_switch, 1671 }; 1672 1673 /** 1674 * amdgpu_device_ip_set_clockgating_state - set the CG state 1675 * 1676 * @dev: amdgpu_device pointer 1677 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1678 * @state: clockgating state (gate or ungate) 1679 * 1680 * Sets the requested clockgating state for all instances of 1681 * the hardware IP specified. 1682 * Returns the error code from the last instance. 1683 */ 1684 int amdgpu_device_ip_set_clockgating_state(void *dev, 1685 enum amd_ip_block_type block_type, 1686 enum amd_clockgating_state state) 1687 { 1688 struct amdgpu_device *adev = dev; 1689 int i, r = 0; 1690 1691 for (i = 0; i < adev->num_ip_blocks; i++) { 1692 if (!adev->ip_blocks[i].status.valid) 1693 continue; 1694 if (adev->ip_blocks[i].version->type != block_type) 1695 continue; 1696 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1697 continue; 1698 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1699 (void *)adev, state); 1700 if (r) 1701 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1702 adev->ip_blocks[i].version->funcs->name, r); 1703 } 1704 return r; 1705 } 1706 1707 /** 1708 * amdgpu_device_ip_set_powergating_state - set the PG state 1709 * 1710 * @dev: amdgpu_device pointer 1711 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1712 * @state: powergating state (gate or ungate) 1713 * 1714 * Sets the requested powergating state for all instances of 1715 * the hardware IP specified. 1716 * Returns the error code from the last instance. 
1717 */ 1718 int amdgpu_device_ip_set_powergating_state(void *dev, 1719 enum amd_ip_block_type block_type, 1720 enum amd_powergating_state state) 1721 { 1722 struct amdgpu_device *adev = dev; 1723 int i, r = 0; 1724 1725 for (i = 0; i < adev->num_ip_blocks; i++) { 1726 if (!adev->ip_blocks[i].status.valid) 1727 continue; 1728 if (adev->ip_blocks[i].version->type != block_type) 1729 continue; 1730 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 1731 continue; 1732 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 1733 (void *)adev, state); 1734 if (r) 1735 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 1736 adev->ip_blocks[i].version->funcs->name, r); 1737 } 1738 return r; 1739 } 1740 1741 /** 1742 * amdgpu_device_ip_get_clockgating_state - get the CG state 1743 * 1744 * @adev: amdgpu_device pointer 1745 * @flags: clockgating feature flags 1746 * 1747 * Walks the list of IPs on the device and updates the clockgating 1748 * flags for each IP. 1749 * Updates @flags with the feature flags for each hardware IP where 1750 * clockgating is enabled. 1751 */ 1752 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 1753 u64 *flags) 1754 { 1755 int i; 1756 1757 for (i = 0; i < adev->num_ip_blocks; i++) { 1758 if (!adev->ip_blocks[i].status.valid) 1759 continue; 1760 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 1761 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 1762 } 1763 } 1764 1765 /** 1766 * amdgpu_device_ip_wait_for_idle - wait for idle 1767 * 1768 * @adev: amdgpu_device pointer 1769 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1770 * 1771 * Waits for the request hardware IP to be idle. 1772 * Returns 0 for success or a negative error code on failure. 1773 */ 1774 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 1775 enum amd_ip_block_type block_type) 1776 { 1777 int i, r; 1778 1779 for (i = 0; i < adev->num_ip_blocks; i++) { 1780 if (!adev->ip_blocks[i].status.valid) 1781 continue; 1782 if (adev->ip_blocks[i].version->type == block_type) { 1783 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 1784 if (r) 1785 return r; 1786 break; 1787 } 1788 } 1789 return 0; 1790 1791 } 1792 1793 /** 1794 * amdgpu_device_ip_is_idle - is the hardware IP idle 1795 * 1796 * @adev: amdgpu_device pointer 1797 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1798 * 1799 * Check if the hardware IP is idle or not. 1800 * Returns true if it the IP is idle, false if not. 1801 */ 1802 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 1803 enum amd_ip_block_type block_type) 1804 { 1805 int i; 1806 1807 for (i = 0; i < adev->num_ip_blocks; i++) { 1808 if (!adev->ip_blocks[i].status.valid) 1809 continue; 1810 if (adev->ip_blocks[i].version->type == block_type) 1811 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 1812 } 1813 return true; 1814 1815 } 1816 1817 /** 1818 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 1819 * 1820 * @adev: amdgpu_device pointer 1821 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 1822 * 1823 * Returns a pointer to the hardware IP block structure 1824 * if it exists for the asic, otherwise NULL. 
1825 */ 1826 struct amdgpu_ip_block * 1827 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 1828 enum amd_ip_block_type type) 1829 { 1830 int i; 1831 1832 for (i = 0; i < adev->num_ip_blocks; i++) 1833 if (adev->ip_blocks[i].version->type == type) 1834 return &adev->ip_blocks[i]; 1835 1836 return NULL; 1837 } 1838 1839 /** 1840 * amdgpu_device_ip_block_version_cmp 1841 * 1842 * @adev: amdgpu_device pointer 1843 * @type: enum amd_ip_block_type 1844 * @major: major version 1845 * @minor: minor version 1846 * 1847 * return 0 if equal or greater 1848 * return 1 if smaller or the ip_block doesn't exist 1849 */ 1850 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 1851 enum amd_ip_block_type type, 1852 u32 major, u32 minor) 1853 { 1854 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 1855 1856 if (ip_block && ((ip_block->version->major > major) || 1857 ((ip_block->version->major == major) && 1858 (ip_block->version->minor >= minor)))) 1859 return 0; 1860 1861 return 1; 1862 } 1863 1864 /** 1865 * amdgpu_device_ip_block_add 1866 * 1867 * @adev: amdgpu_device pointer 1868 * @ip_block_version: pointer to the IP to add 1869 * 1870 * Adds the IP block driver information to the collection of IPs 1871 * on the asic. 1872 */ 1873 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 1874 const struct amdgpu_ip_block_version *ip_block_version) 1875 { 1876 if (!ip_block_version) 1877 return -EINVAL; 1878 1879 switch (ip_block_version->type) { 1880 case AMD_IP_BLOCK_TYPE_VCN: 1881 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 1882 return 0; 1883 break; 1884 case AMD_IP_BLOCK_TYPE_JPEG: 1885 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 1886 return 0; 1887 break; 1888 default: 1889 break; 1890 } 1891 1892 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 1893 ip_block_version->funcs->name); 1894 1895 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 1896 1897 return 0; 1898 } 1899 1900 /** 1901 * amdgpu_device_enable_virtual_display - enable virtual display feature 1902 * 1903 * @adev: amdgpu_device pointer 1904 * 1905 * Enabled the virtual display feature if the user has enabled it via 1906 * the module parameter virtual_display. This feature provides a virtual 1907 * display hardware on headless boards or in virtualized environments. 1908 * This function parses and validates the configuration string specified by 1909 * the user and configues the virtual display configuration (number of 1910 * virtual connectors, crtcs, etc.) specified. 
1911 */ 1912 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 1913 { 1914 adev->enable_virtual_display = false; 1915 1916 if (amdgpu_virtual_display) { 1917 const char *pci_address_name = pci_name(adev->pdev); 1918 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 1919 1920 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 1921 pciaddstr_tmp = pciaddstr; 1922 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 1923 pciaddname = strsep(&pciaddname_tmp, ","); 1924 if (!strcmp("all", pciaddname) 1925 || !strcmp(pci_address_name, pciaddname)) { 1926 long num_crtc; 1927 int res = -1; 1928 1929 adev->enable_virtual_display = true; 1930 1931 if (pciaddname_tmp) 1932 res = kstrtol(pciaddname_tmp, 10, 1933 &num_crtc); 1934 1935 if (!res) { 1936 if (num_crtc < 1) 1937 num_crtc = 1; 1938 if (num_crtc > 6) 1939 num_crtc = 6; 1940 adev->mode_info.num_crtc = num_crtc; 1941 } else { 1942 adev->mode_info.num_crtc = 1; 1943 } 1944 break; 1945 } 1946 } 1947 1948 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 1949 amdgpu_virtual_display, pci_address_name, 1950 adev->enable_virtual_display, adev->mode_info.num_crtc); 1951 1952 kfree(pciaddstr); 1953 } 1954 } 1955 1956 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 1957 { 1958 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 1959 adev->mode_info.num_crtc = 1; 1960 adev->enable_virtual_display = true; 1961 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 1962 adev->enable_virtual_display, adev->mode_info.num_crtc); 1963 } 1964 } 1965 1966 /** 1967 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 1968 * 1969 * @adev: amdgpu_device pointer 1970 * 1971 * Parses the asic configuration parameters specified in the gpu info 1972 * firmware and makes them availale to the driver for use in configuring 1973 * the asic. 1974 * Returns 0 on success, -EINVAL on failure. 1975 */ 1976 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 1977 { 1978 const char *chip_name; 1979 char fw_name[40]; 1980 int err; 1981 const struct gpu_info_firmware_header_v1_0 *hdr; 1982 1983 adev->firmware.gpu_info_fw = NULL; 1984 1985 if (adev->mman.discovery_bin) { 1986 /* 1987 * FIXME: The bounding box is still needed by Navi12, so 1988 * temporarily read it from gpu_info firmware. Should be dropped 1989 * when DAL no longer needs it. 
1990 */ 1991 if (adev->asic_type != CHIP_NAVI12) 1992 return 0; 1993 } 1994 1995 switch (adev->asic_type) { 1996 default: 1997 return 0; 1998 case CHIP_VEGA10: 1999 chip_name = "vega10"; 2000 break; 2001 case CHIP_VEGA12: 2002 chip_name = "vega12"; 2003 break; 2004 case CHIP_RAVEN: 2005 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2006 chip_name = "raven2"; 2007 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2008 chip_name = "picasso"; 2009 else 2010 chip_name = "raven"; 2011 break; 2012 case CHIP_ARCTURUS: 2013 chip_name = "arcturus"; 2014 break; 2015 case CHIP_NAVI12: 2016 chip_name = "navi12"; 2017 break; 2018 } 2019 2020 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 2021 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name); 2022 if (err) { 2023 dev_err(adev->dev, 2024 "Failed to get gpu_info firmware \"%s\"\n", 2025 fw_name); 2026 goto out; 2027 } 2028 2029 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2030 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2031 2032 switch (hdr->version_major) { 2033 case 1: 2034 { 2035 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2036 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2037 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2038 2039 /* 2040 * Should be droped when DAL no longer needs it. 2041 */ 2042 if (adev->asic_type == CHIP_NAVI12) 2043 goto parse_soc_bounding_box; 2044 2045 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2046 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2047 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2048 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2049 adev->gfx.config.max_texture_channel_caches = 2050 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2051 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2052 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2053 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2054 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2055 adev->gfx.config.double_offchip_lds_buf = 2056 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2057 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2058 adev->gfx.cu_info.max_waves_per_simd = 2059 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2060 adev->gfx.cu_info.max_scratch_slots_per_cu = 2061 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2062 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2063 if (hdr->version_minor >= 1) { 2064 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2065 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2066 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2067 adev->gfx.config.num_sc_per_sh = 2068 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2069 adev->gfx.config.num_packer_per_sc = 2070 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2071 } 2072 2073 parse_soc_bounding_box: 2074 /* 2075 * soc bounding box info is not integrated in disocovery table, 2076 * we always need to parse it from gpu info firmware if needed. 
2077 */ 2078 if (hdr->version_minor == 2) { 2079 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2080 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2081 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2082 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2083 } 2084 break; 2085 } 2086 default: 2087 dev_err(adev->dev, 2088 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2089 err = -EINVAL; 2090 goto out; 2091 } 2092 out: 2093 return err; 2094 } 2095 2096 /** 2097 * amdgpu_device_ip_early_init - run early init for hardware IPs 2098 * 2099 * @adev: amdgpu_device pointer 2100 * 2101 * Early initialization pass for hardware IPs. The hardware IPs that make 2102 * up each asic are discovered each IP's early_init callback is run. This 2103 * is the first stage in initializing the asic. 2104 * Returns 0 on success, negative error code on failure. 2105 */ 2106 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2107 { 2108 struct drm_device *dev = adev_to_drm(adev); 2109 struct pci_dev *parent; 2110 int i, r; 2111 bool total; 2112 2113 amdgpu_device_enable_virtual_display(adev); 2114 2115 if (amdgpu_sriov_vf(adev)) { 2116 r = amdgpu_virt_request_full_gpu(adev, true); 2117 if (r) 2118 return r; 2119 } 2120 2121 switch (adev->asic_type) { 2122 #ifdef CONFIG_DRM_AMDGPU_SI 2123 case CHIP_VERDE: 2124 case CHIP_TAHITI: 2125 case CHIP_PITCAIRN: 2126 case CHIP_OLAND: 2127 case CHIP_HAINAN: 2128 adev->family = AMDGPU_FAMILY_SI; 2129 r = si_set_ip_blocks(adev); 2130 if (r) 2131 return r; 2132 break; 2133 #endif 2134 #ifdef CONFIG_DRM_AMDGPU_CIK 2135 case CHIP_BONAIRE: 2136 case CHIP_HAWAII: 2137 case CHIP_KAVERI: 2138 case CHIP_KABINI: 2139 case CHIP_MULLINS: 2140 if (adev->flags & AMD_IS_APU) 2141 adev->family = AMDGPU_FAMILY_KV; 2142 else 2143 adev->family = AMDGPU_FAMILY_CI; 2144 2145 r = cik_set_ip_blocks(adev); 2146 if (r) 2147 return r; 2148 break; 2149 #endif 2150 case CHIP_TOPAZ: 2151 case CHIP_TONGA: 2152 case CHIP_FIJI: 2153 case CHIP_POLARIS10: 2154 case CHIP_POLARIS11: 2155 case CHIP_POLARIS12: 2156 case CHIP_VEGAM: 2157 case CHIP_CARRIZO: 2158 case CHIP_STONEY: 2159 if (adev->flags & AMD_IS_APU) 2160 adev->family = AMDGPU_FAMILY_CZ; 2161 else 2162 adev->family = AMDGPU_FAMILY_VI; 2163 2164 r = vi_set_ip_blocks(adev); 2165 if (r) 2166 return r; 2167 break; 2168 default: 2169 r = amdgpu_discovery_set_ip_blocks(adev); 2170 if (r) 2171 return r; 2172 break; 2173 } 2174 2175 if (amdgpu_has_atpx() && 2176 (amdgpu_is_atpx_hybrid() || 2177 amdgpu_has_atpx_dgpu_power_cntl()) && 2178 ((adev->flags & AMD_IS_APU) == 0) && 2179 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev))) 2180 adev->flags |= AMD_IS_PX; 2181 2182 if (!(adev->flags & AMD_IS_APU)) { 2183 parent = pci_upstream_bridge(adev->pdev); 2184 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2185 } 2186 2187 2188 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2189 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2190 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2191 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2192 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2193 2194 total = true; 2195 for (i = 0; i < adev->num_ip_blocks; i++) { 2196 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2197 DRM_ERROR("disabled ip block: %d <%s>\n", 2198 i, adev->ip_blocks[i].version->funcs->name); 2199 adev->ip_blocks[i].status.valid = false; 2200 } else { 2201 if (adev->ip_blocks[i].version->funcs->early_init) { 2202 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2203 if (r == -ENOENT) { 2204 adev->ip_blocks[i].status.valid = false; 2205 } else if (r) { 2206 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2207 adev->ip_blocks[i].version->funcs->name, r); 2208 total = false; 2209 } else { 2210 adev->ip_blocks[i].status.valid = true; 2211 } 2212 } else { 2213 adev->ip_blocks[i].status.valid = true; 2214 } 2215 } 2216 /* get the vbios after the asic_funcs are set up */ 2217 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2218 r = amdgpu_device_parse_gpu_info_fw(adev); 2219 if (r) 2220 return r; 2221 2222 /* Read BIOS */ 2223 if (!amdgpu_get_bios(adev)) 2224 return -EINVAL; 2225 2226 r = amdgpu_atombios_init(adev); 2227 if (r) { 2228 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2229 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2230 return r; 2231 } 2232 2233 /*get pf2vf msg info at it's earliest time*/ 2234 if (amdgpu_sriov_vf(adev)) 2235 amdgpu_virt_init_data_exchange(adev); 2236 2237 } 2238 } 2239 if (!total) 2240 return -ENODEV; 2241 2242 amdgpu_amdkfd_device_probe(adev); 2243 adev->cg_flags &= amdgpu_cg_mask; 2244 adev->pg_flags &= amdgpu_pg_mask; 2245 2246 return 0; 2247 } 2248 2249 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2250 { 2251 int i, r; 2252 2253 for (i = 0; i < adev->num_ip_blocks; i++) { 2254 if (!adev->ip_blocks[i].status.sw) 2255 continue; 2256 if (adev->ip_blocks[i].status.hw) 2257 continue; 2258 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2259 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2260 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2261 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2262 if (r) { 2263 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2264 adev->ip_blocks[i].version->funcs->name, r); 2265 return r; 2266 } 2267 adev->ip_blocks[i].status.hw = true; 2268 } 2269 } 2270 2271 return 0; 2272 } 2273 2274 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2275 { 2276 int i, r; 2277 2278 for (i = 0; i < adev->num_ip_blocks; i++) { 2279 if (!adev->ip_blocks[i].status.sw) 2280 continue; 2281 if (adev->ip_blocks[i].status.hw) 2282 continue; 2283 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2284 if (r) { 2285 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2286 adev->ip_blocks[i].version->funcs->name, r); 2287 return r; 2288 } 2289 adev->ip_blocks[i].status.hw = true; 2290 } 2291 2292 return 0; 2293 } 2294 2295 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2296 { 2297 int r = 0; 2298 int i; 2299 uint32_t smu_version; 2300 2301 if (adev->asic_type >= CHIP_VEGA10) { 2302 for (i = 0; i < adev->num_ip_blocks; i++) { 2303 if (adev->ip_blocks[i].version->type != 
AMD_IP_BLOCK_TYPE_PSP) 2304 continue; 2305 2306 if (!adev->ip_blocks[i].status.sw) 2307 continue; 2308 2309 /* no need to do the fw loading again if already done*/ 2310 if (adev->ip_blocks[i].status.hw == true) 2311 break; 2312 2313 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2314 r = adev->ip_blocks[i].version->funcs->resume(adev); 2315 if (r) { 2316 DRM_ERROR("resume of IP block <%s> failed %d\n", 2317 adev->ip_blocks[i].version->funcs->name, r); 2318 return r; 2319 } 2320 } else { 2321 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2322 if (r) { 2323 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2324 adev->ip_blocks[i].version->funcs->name, r); 2325 return r; 2326 } 2327 } 2328 2329 adev->ip_blocks[i].status.hw = true; 2330 break; 2331 } 2332 } 2333 2334 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2335 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2336 2337 return r; 2338 } 2339 2340 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2341 { 2342 long timeout; 2343 int r, i; 2344 2345 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2346 struct amdgpu_ring *ring = adev->rings[i]; 2347 2348 /* No need to setup the GPU scheduler for rings that don't need it */ 2349 if (!ring || ring->no_scheduler) 2350 continue; 2351 2352 switch (ring->funcs->type) { 2353 case AMDGPU_RING_TYPE_GFX: 2354 timeout = adev->gfx_timeout; 2355 break; 2356 case AMDGPU_RING_TYPE_COMPUTE: 2357 timeout = adev->compute_timeout; 2358 break; 2359 case AMDGPU_RING_TYPE_SDMA: 2360 timeout = adev->sdma_timeout; 2361 break; 2362 default: 2363 timeout = adev->video_timeout; 2364 break; 2365 } 2366 2367 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2368 ring->num_hw_submission, 0, 2369 timeout, adev->reset_domain->wq, 2370 ring->sched_score, ring->name, 2371 adev->dev); 2372 if (r) { 2373 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2374 ring->name); 2375 return r; 2376 } 2377 } 2378 2379 return 0; 2380 } 2381 2382 2383 /** 2384 * amdgpu_device_ip_init - run init for hardware IPs 2385 * 2386 * @adev: amdgpu_device pointer 2387 * 2388 * Main initialization pass for hardware IPs. The list of all the hardware 2389 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2390 * are run. sw_init initializes the software state associated with each IP 2391 * and hw_init initializes the hardware associated with each IP. 2392 * Returns 0 on success, negative error code on failure. 
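 *
 * Rough ordering implemented below (editor's summary of this function,
 * not additional behaviour):
 *   1. sw_init for every valid IP block, with an early hw_init for the
 *      COMMON and GMC blocks so scratch/writeback memory and the CSA can
 *      be set up.
 *   2. IB pool and firmware BO creation, hw_init phase 1 (COMMON/IH, plus
 *      PSP for SR-IOV), firmware loading, then hw_init phase 2 for the
 *      remaining blocks.
 *   3. RAS recovery init, XGMI reset-domain handover, scheduler setup and,
 *      unless a hive reset is pending, KFD init.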
2393 */ 2394 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2395 { 2396 int i, r; 2397 2398 r = amdgpu_ras_init(adev); 2399 if (r) 2400 return r; 2401 2402 for (i = 0; i < adev->num_ip_blocks; i++) { 2403 if (!adev->ip_blocks[i].status.valid) 2404 continue; 2405 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2406 if (r) { 2407 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2408 adev->ip_blocks[i].version->funcs->name, r); 2409 goto init_failed; 2410 } 2411 adev->ip_blocks[i].status.sw = true; 2412 2413 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2414 /* need to do common hw init early so everything is set up for gmc */ 2415 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2416 if (r) { 2417 DRM_ERROR("hw_init %d failed %d\n", i, r); 2418 goto init_failed; 2419 } 2420 adev->ip_blocks[i].status.hw = true; 2421 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2422 /* need to do gmc hw init early so we can allocate gpu mem */ 2423 /* Try to reserve bad pages early */ 2424 if (amdgpu_sriov_vf(adev)) 2425 amdgpu_virt_exchange_data(adev); 2426 2427 r = amdgpu_device_mem_scratch_init(adev); 2428 if (r) { 2429 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2430 goto init_failed; 2431 } 2432 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2433 if (r) { 2434 DRM_ERROR("hw_init %d failed %d\n", i, r); 2435 goto init_failed; 2436 } 2437 r = amdgpu_device_wb_init(adev); 2438 if (r) { 2439 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2440 goto init_failed; 2441 } 2442 adev->ip_blocks[i].status.hw = true; 2443 2444 /* right after GMC hw init, we create CSA */ 2445 if (amdgpu_mcbp) { 2446 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2447 AMDGPU_GEM_DOMAIN_VRAM | 2448 AMDGPU_GEM_DOMAIN_GTT, 2449 AMDGPU_CSA_SIZE); 2450 if (r) { 2451 DRM_ERROR("allocate CSA failed %d\n", r); 2452 goto init_failed; 2453 } 2454 } 2455 } 2456 } 2457 2458 if (amdgpu_sriov_vf(adev)) 2459 amdgpu_virt_init_data_exchange(adev); 2460 2461 r = amdgpu_ib_pool_init(adev); 2462 if (r) { 2463 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2464 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2465 goto init_failed; 2466 } 2467 2468 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2469 if (r) 2470 goto init_failed; 2471 2472 r = amdgpu_device_ip_hw_init_phase1(adev); 2473 if (r) 2474 goto init_failed; 2475 2476 r = amdgpu_device_fw_loading(adev); 2477 if (r) 2478 goto init_failed; 2479 2480 r = amdgpu_device_ip_hw_init_phase2(adev); 2481 if (r) 2482 goto init_failed; 2483 2484 /* 2485 * retired pages will be loaded from eeprom and reserved here, 2486 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2487 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2488 * for I2C communication which only true at this point. 2489 * 2490 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2491 * failure from bad gpu situation and stop amdgpu init process 2492 * accordingly. For other failed cases, it will still release all 2493 * the resource and print error message, rather than returning one 2494 * negative value to upper level. 
2495 * 2496 * Note: theoretically, this should be called before all vram allocations 2497 * to protect retired page from abusing 2498 */ 2499 r = amdgpu_ras_recovery_init(adev); 2500 if (r) 2501 goto init_failed; 2502 2503 /** 2504 * In case of XGMI grab extra reference for reset domain for this device 2505 */ 2506 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2507 if (amdgpu_xgmi_add_device(adev) == 0) { 2508 if (!amdgpu_sriov_vf(adev)) { 2509 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2510 2511 if (WARN_ON(!hive)) { 2512 r = -ENOENT; 2513 goto init_failed; 2514 } 2515 2516 if (!hive->reset_domain || 2517 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2518 r = -ENOENT; 2519 amdgpu_put_xgmi_hive(hive); 2520 goto init_failed; 2521 } 2522 2523 /* Drop the early temporary reset domain we created for device */ 2524 amdgpu_reset_put_reset_domain(adev->reset_domain); 2525 adev->reset_domain = hive->reset_domain; 2526 amdgpu_put_xgmi_hive(hive); 2527 } 2528 } 2529 } 2530 2531 r = amdgpu_device_init_schedulers(adev); 2532 if (r) 2533 goto init_failed; 2534 2535 /* Don't init kfd if whole hive need to be reset during init */ 2536 if (!adev->gmc.xgmi.pending_reset) 2537 amdgpu_amdkfd_device_init(adev); 2538 2539 amdgpu_fru_get_product_info(adev); 2540 2541 init_failed: 2542 2543 return r; 2544 } 2545 2546 /** 2547 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2548 * 2549 * @adev: amdgpu_device pointer 2550 * 2551 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2552 * this function before a GPU reset. If the value is retained after a 2553 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 2554 */ 2555 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2556 { 2557 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2558 } 2559 2560 /** 2561 * amdgpu_device_check_vram_lost - check if vram is valid 2562 * 2563 * @adev: amdgpu_device pointer 2564 * 2565 * Checks the reset magic value written to the gart pointer in VRAM. 2566 * The driver calls this after a GPU reset to see if the contents of 2567 * VRAM is lost or now. 2568 * returns true if vram is lost, false if not. 2569 */ 2570 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2571 { 2572 if (memcmp(adev->gart.ptr, adev->reset_magic, 2573 AMDGPU_RESET_MAGIC_NUM)) 2574 return true; 2575 2576 if (!amdgpu_in_reset(adev)) 2577 return false; 2578 2579 /* 2580 * For all ASICs with baco/mode1 reset, the VRAM is 2581 * always assumed to be lost. 2582 */ 2583 switch (amdgpu_asic_reset_method(adev)) { 2584 case AMD_RESET_METHOD_BACO: 2585 case AMD_RESET_METHOD_MODE1: 2586 return true; 2587 default: 2588 return false; 2589 } 2590 } 2591 2592 /** 2593 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2594 * 2595 * @adev: amdgpu_device pointer 2596 * @state: clockgating state (gate or ungate) 2597 * 2598 * The list of all the hardware IPs that make up the asic is walked and the 2599 * set_clockgating_state callbacks are run. 2600 * Late initialization pass enabling clockgating for hardware IPs. 2601 * Fini or suspend, pass disabling clockgating for hardware IPs. 2602 * Returns 0 on success, negative error code on failure. 
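 *
 * Typical calls elsewhere in this file (illustrative, see the late init and
 * fini/suspend paths):
 *
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
 *
 * The walk direction depends on @state: gating iterates the IP list front
 * to back, ungating back to front (see the index calculation below).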
2603 */ 2604 2605 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2606 enum amd_clockgating_state state) 2607 { 2608 int i, j, r; 2609 2610 if (amdgpu_emu_mode == 1) 2611 return 0; 2612 2613 for (j = 0; j < adev->num_ip_blocks; j++) { 2614 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2615 if (!adev->ip_blocks[i].status.late_initialized) 2616 continue; 2617 /* skip CG for GFX, SDMA on S0ix */ 2618 if (adev->in_s0ix && 2619 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2620 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2621 continue; 2622 /* skip CG for VCE/UVD, it's handled specially */ 2623 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2624 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2625 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2626 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2627 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2628 /* enable clockgating to save power */ 2629 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2630 state); 2631 if (r) { 2632 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2633 adev->ip_blocks[i].version->funcs->name, r); 2634 return r; 2635 } 2636 } 2637 } 2638 2639 return 0; 2640 } 2641 2642 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2643 enum amd_powergating_state state) 2644 { 2645 int i, j, r; 2646 2647 if (amdgpu_emu_mode == 1) 2648 return 0; 2649 2650 for (j = 0; j < adev->num_ip_blocks; j++) { 2651 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2652 if (!adev->ip_blocks[i].status.late_initialized) 2653 continue; 2654 /* skip PG for GFX, SDMA on S0ix */ 2655 if (adev->in_s0ix && 2656 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2657 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2658 continue; 2659 /* skip CG for VCE/UVD, it's handled specially */ 2660 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2661 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2662 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2663 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2664 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2665 /* enable powergating to save power */ 2666 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2667 state); 2668 if (r) { 2669 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2670 adev->ip_blocks[i].version->funcs->name, r); 2671 return r; 2672 } 2673 } 2674 } 2675 return 0; 2676 } 2677 2678 static int amdgpu_device_enable_mgpu_fan_boost(void) 2679 { 2680 struct amdgpu_gpu_instance *gpu_ins; 2681 struct amdgpu_device *adev; 2682 int i, ret = 0; 2683 2684 mutex_lock(&mgpu_info.mutex); 2685 2686 /* 2687 * MGPU fan boost feature should be enabled 2688 * only when there are two or more dGPUs in 2689 * the system 2690 */ 2691 if (mgpu_info.num_dgpu < 2) 2692 goto out; 2693 2694 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2695 gpu_ins = &(mgpu_info.gpu_ins[i]); 2696 adev = gpu_ins->adev; 2697 if (!(adev->flags & AMD_IS_APU) && 2698 !gpu_ins->mgpu_fan_enabled) { 2699 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2700 if (ret) 2701 break; 2702 2703 gpu_ins->mgpu_fan_enabled = 1; 2704 } 2705 } 2706 2707 out: 2708 mutex_unlock(&mgpu_info.mutex); 2709 2710 return ret; 2711 } 2712 2713 /** 2714 * amdgpu_device_ip_late_init - run late init for hardware IPs 2715 * 2716 * @adev: 
amdgpu_device pointer 2717 * 2718 * Late initialization pass for hardware IPs. The list of all the hardware 2719 * IPs that make up the asic is walked and the late_init callbacks are run. 2720 * late_init covers any special initialization that an IP requires 2721 * after all of the IP blocks have been initialized or something that needs to happen 2722 * late in the init process. 2723 * Returns 0 on success, negative error code on failure. 2724 */ 2725 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2726 { 2727 struct amdgpu_gpu_instance *gpu_instance; 2728 int i = 0, r; 2729 2730 for (i = 0; i < adev->num_ip_blocks; i++) { 2731 if (!adev->ip_blocks[i].status.hw) 2732 continue; 2733 if (adev->ip_blocks[i].version->funcs->late_init) { 2734 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2735 if (r) { 2736 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2737 adev->ip_blocks[i].version->funcs->name, r); 2738 return r; 2739 } 2740 } 2741 adev->ip_blocks[i].status.late_initialized = true; 2742 } 2743 2744 r = amdgpu_ras_late_init(adev); 2745 if (r) { 2746 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 2747 return r; 2748 } 2749 2750 amdgpu_ras_set_error_query_ready(adev, true); 2751 2752 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2753 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2754 2755 amdgpu_device_fill_reset_magic(adev); 2756 2757 r = amdgpu_device_enable_mgpu_fan_boost(); 2758 if (r) 2759 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2760 2761 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */ 2762 if (amdgpu_passthrough(adev) && 2763 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 2764 adev->asic_type == CHIP_ALDEBARAN)) 2765 amdgpu_dpm_handle_passthrough_sbr(adev, true); 2766 2767 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2768 mutex_lock(&mgpu_info.mutex); 2769 2770 /* 2771 * Reset device p-state to low as this was booted with high. 2772 * 2773 * This should be performed only after all devices from the same 2774 * hive get initialized. 2775 * 2776 * However, the number of devices in the hive is not known in advance; 2777 * it is counted one by one as each device is initialized. 2778 * 2779 * So we wait until all XGMI interlinked devices are initialized. 2780 * This may bring some delays as those devices may come from 2781 * different hives. But that should be OK.
2782 */ 2783 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2784 for (i = 0; i < mgpu_info.num_gpu; i++) { 2785 gpu_instance = &(mgpu_info.gpu_ins[i]); 2786 if (gpu_instance->adev->flags & AMD_IS_APU) 2787 continue; 2788 2789 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2790 AMDGPU_XGMI_PSTATE_MIN); 2791 if (r) { 2792 DRM_ERROR("pstate setting failed (%d).\n", r); 2793 break; 2794 } 2795 } 2796 } 2797 2798 mutex_unlock(&mgpu_info.mutex); 2799 } 2800 2801 return 0; 2802 } 2803 2804 /** 2805 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2806 * 2807 * @adev: amdgpu_device pointer 2808 * 2809 * For ASICs need to disable SMC first 2810 */ 2811 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2812 { 2813 int i, r; 2814 2815 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) 2816 return; 2817 2818 for (i = 0; i < adev->num_ip_blocks; i++) { 2819 if (!adev->ip_blocks[i].status.hw) 2820 continue; 2821 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2822 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2823 /* XXX handle errors */ 2824 if (r) { 2825 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2826 adev->ip_blocks[i].version->funcs->name, r); 2827 } 2828 adev->ip_blocks[i].status.hw = false; 2829 break; 2830 } 2831 } 2832 } 2833 2834 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2835 { 2836 int i, r; 2837 2838 for (i = 0; i < adev->num_ip_blocks; i++) { 2839 if (!adev->ip_blocks[i].version->funcs->early_fini) 2840 continue; 2841 2842 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2843 if (r) { 2844 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2845 adev->ip_blocks[i].version->funcs->name, r); 2846 } 2847 } 2848 2849 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2850 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2851 2852 amdgpu_amdkfd_suspend(adev, false); 2853 2854 /* Workaroud for ASICs need to disable SMC first */ 2855 amdgpu_device_smu_fini_early(adev); 2856 2857 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2858 if (!adev->ip_blocks[i].status.hw) 2859 continue; 2860 2861 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2862 /* XXX handle errors */ 2863 if (r) { 2864 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2865 adev->ip_blocks[i].version->funcs->name, r); 2866 } 2867 2868 adev->ip_blocks[i].status.hw = false; 2869 } 2870 2871 if (amdgpu_sriov_vf(adev)) { 2872 if (amdgpu_virt_release_full_gpu(adev, false)) 2873 DRM_ERROR("failed to release exclusive mode on fini\n"); 2874 } 2875 2876 return 0; 2877 } 2878 2879 /** 2880 * amdgpu_device_ip_fini - run fini for hardware IPs 2881 * 2882 * @adev: amdgpu_device pointer 2883 * 2884 * Main teardown pass for hardware IPs. The list of all the hardware 2885 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2886 * are run. hw_fini tears down the hardware associated with each IP 2887 * and sw_fini tears down any software state associated with each IP. 2888 * Returns 0 on success, negative error code on failure. 
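 *
 * Teardown order (editor's summary of the body below): XGMI device removal
 * and KFD software fini run first, then sw_fini is called on the IP blocks
 * in reverse order (the ucode BO, static CSA, writeback and scratch memory
 * and the IB pool are released when the GMC block is reached), followed by
 * late_fini in reverse order and finally amdgpu_ras_fini().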
2889 */ 2890 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2891 { 2892 int i, r; 2893 2894 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2895 amdgpu_virt_release_ras_err_handler_data(adev); 2896 2897 if (adev->gmc.xgmi.num_physical_nodes > 1) 2898 amdgpu_xgmi_remove_device(adev); 2899 2900 amdgpu_amdkfd_device_fini_sw(adev); 2901 2902 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2903 if (!adev->ip_blocks[i].status.sw) 2904 continue; 2905 2906 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2907 amdgpu_ucode_free_bo(adev); 2908 amdgpu_free_static_csa(&adev->virt.csa_obj); 2909 amdgpu_device_wb_fini(adev); 2910 amdgpu_device_mem_scratch_fini(adev); 2911 amdgpu_ib_pool_fini(adev); 2912 } 2913 2914 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2915 /* XXX handle errors */ 2916 if (r) { 2917 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2918 adev->ip_blocks[i].version->funcs->name, r); 2919 } 2920 adev->ip_blocks[i].status.sw = false; 2921 adev->ip_blocks[i].status.valid = false; 2922 } 2923 2924 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2925 if (!adev->ip_blocks[i].status.late_initialized) 2926 continue; 2927 if (adev->ip_blocks[i].version->funcs->late_fini) 2928 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2929 adev->ip_blocks[i].status.late_initialized = false; 2930 } 2931 2932 amdgpu_ras_fini(adev); 2933 2934 return 0; 2935 } 2936 2937 /** 2938 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2939 * 2940 * @work: work_struct. 2941 */ 2942 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2943 { 2944 struct amdgpu_device *adev = 2945 container_of(work, struct amdgpu_device, delayed_init_work.work); 2946 int r; 2947 2948 r = amdgpu_ib_ring_tests(adev); 2949 if (r) 2950 DRM_ERROR("ib ring test failed (%d).\n", r); 2951 } 2952 2953 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2954 { 2955 struct amdgpu_device *adev = 2956 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2957 2958 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2959 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2960 2961 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2962 adev->gfx.gfx_off_state = true; 2963 } 2964 2965 /** 2966 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2967 * 2968 * @adev: amdgpu_device pointer 2969 * 2970 * Main suspend function for hardware IPs. The list of all the hardware 2971 * IPs that make up the asic is walked, clockgating is disabled and the 2972 * suspend callbacks are run. suspend puts the hardware and software state 2973 * in each IP into a state suitable for suspend. 2974 * Returns 0 on success, negative error code on failure. 2975 */ 2976 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2977 { 2978 int i, r; 2979 2980 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2981 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2982 2983 /* 2984 * Per PMFW team's suggestion, driver needs to handle gfxoff 2985 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 2986 * scenario. Add the missing df cstate disablement here. 
2987 */ 2988 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 2989 dev_warn(adev->dev, "Failed to disallow df cstate"); 2990 2991 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2992 if (!adev->ip_blocks[i].status.valid) 2993 continue; 2994 2995 /* displays are handled separately */ 2996 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2997 continue; 2998 2999 /* XXX handle errors */ 3000 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3001 /* XXX handle errors */ 3002 if (r) { 3003 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3004 adev->ip_blocks[i].version->funcs->name, r); 3005 return r; 3006 } 3007 3008 adev->ip_blocks[i].status.hw = false; 3009 } 3010 3011 return 0; 3012 } 3013 3014 /** 3015 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3016 * 3017 * @adev: amdgpu_device pointer 3018 * 3019 * Main suspend function for hardware IPs. The list of all the hardware 3020 * IPs that make up the asic is walked, clockgating is disabled and the 3021 * suspend callbacks are run. suspend puts the hardware and software state 3022 * in each IP into a state suitable for suspend. 3023 * Returns 0 on success, negative error code on failure. 3024 */ 3025 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3026 { 3027 int i, r; 3028 3029 if (adev->in_s0ix) 3030 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3031 3032 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3033 if (!adev->ip_blocks[i].status.valid) 3034 continue; 3035 /* displays are handled in phase1 */ 3036 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3037 continue; 3038 /* PSP lost connection when err_event_athub occurs */ 3039 if (amdgpu_ras_intr_triggered() && 3040 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3041 adev->ip_blocks[i].status.hw = false; 3042 continue; 3043 } 3044 3045 /* skip unnecessary suspend if we do not initialize them yet */ 3046 if (adev->gmc.xgmi.pending_reset && 3047 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3048 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3049 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3050 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3051 adev->ip_blocks[i].status.hw = false; 3052 continue; 3053 } 3054 3055 /* skip suspend of gfx/mes and psp for S0ix 3056 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3057 * like at runtime. PSP is also part of the always on hardware 3058 * so no need to suspend it. 3059 */ 3060 if (adev->in_s0ix && 3061 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3062 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3063 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3064 continue; 3065 3066 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3067 if (adev->in_s0ix && 3068 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) && 3069 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3070 continue; 3071 3072 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3073 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3074 * from this location and RLC Autoload automatically also gets loaded 3075 * from here based on PMFW -> PSP message during re-init sequence. 3076 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3077 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3078 */ 3079 if (amdgpu_in_reset(adev) && 3080 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3081 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3082 continue; 3083 3084 /* XXX handle errors */ 3085 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3086 /* XXX handle errors */ 3087 if (r) { 3088 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3089 adev->ip_blocks[i].version->funcs->name, r); 3090 } 3091 adev->ip_blocks[i].status.hw = false; 3092 /* handle putting the SMC in the appropriate state */ 3093 if (!amdgpu_sriov_vf(adev)) { 3094 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3095 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3096 if (r) { 3097 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3098 adev->mp1_state, r); 3099 return r; 3100 } 3101 } 3102 } 3103 } 3104 3105 return 0; 3106 } 3107 3108 /** 3109 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3110 * 3111 * @adev: amdgpu_device pointer 3112 * 3113 * Main suspend function for hardware IPs. The list of all the hardware 3114 * IPs that make up the asic is walked, clockgating is disabled and the 3115 * suspend callbacks are run. suspend puts the hardware and software state 3116 * in each IP into a state suitable for suspend. 3117 * Returns 0 on success, negative error code on failure. 3118 */ 3119 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3120 { 3121 int r; 3122 3123 if (amdgpu_sriov_vf(adev)) { 3124 amdgpu_virt_fini_data_exchange(adev); 3125 amdgpu_virt_request_full_gpu(adev, false); 3126 } 3127 3128 r = amdgpu_device_ip_suspend_phase1(adev); 3129 if (r) 3130 return r; 3131 r = amdgpu_device_ip_suspend_phase2(adev); 3132 3133 if (amdgpu_sriov_vf(adev)) 3134 amdgpu_virt_release_full_gpu(adev, false); 3135 3136 return r; 3137 } 3138 3139 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3140 { 3141 int i, r; 3142 3143 static enum amd_ip_block_type ip_order[] = { 3144 AMD_IP_BLOCK_TYPE_COMMON, 3145 AMD_IP_BLOCK_TYPE_GMC, 3146 AMD_IP_BLOCK_TYPE_PSP, 3147 AMD_IP_BLOCK_TYPE_IH, 3148 }; 3149 3150 for (i = 0; i < adev->num_ip_blocks; i++) { 3151 int j; 3152 struct amdgpu_ip_block *block; 3153 3154 block = &adev->ip_blocks[i]; 3155 block->status.hw = false; 3156 3157 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3158 3159 if (block->version->type != ip_order[j] || 3160 !block->status.valid) 3161 continue; 3162 3163 r = block->version->funcs->hw_init(adev); 3164 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3165 if (r) 3166 return r; 3167 block->status.hw = true; 3168 } 3169 } 3170 3171 return 0; 3172 } 3173 3174 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3175 { 3176 int i, r; 3177 3178 static enum amd_ip_block_type ip_order[] = { 3179 AMD_IP_BLOCK_TYPE_SMC, 3180 AMD_IP_BLOCK_TYPE_DCE, 3181 AMD_IP_BLOCK_TYPE_GFX, 3182 AMD_IP_BLOCK_TYPE_SDMA, 3183 AMD_IP_BLOCK_TYPE_MES, 3184 AMD_IP_BLOCK_TYPE_UVD, 3185 AMD_IP_BLOCK_TYPE_VCE, 3186 AMD_IP_BLOCK_TYPE_VCN, 3187 AMD_IP_BLOCK_TYPE_JPEG 3188 }; 3189 3190 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3191 int j; 3192 struct amdgpu_ip_block *block; 3193 3194 for (j = 0; j < adev->num_ip_blocks; j++) { 3195 block = &adev->ip_blocks[j]; 3196 3197 if (block->version->type != ip_order[i] || 3198 !block->status.valid || 3199 block->status.hw) 3200 continue; 3201 3202 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3203 r = block->version->funcs->resume(adev); 3204 else 3205 r = block->version->funcs->hw_init(adev); 3206 3207 
DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3208 if (r) 3209 return r; 3210 block->status.hw = true; 3211 } 3212 } 3213 3214 return 0; 3215 } 3216 3217 /** 3218 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3219 * 3220 * @adev: amdgpu_device pointer 3221 * 3222 * First resume function for hardware IPs. The list of all the hardware 3223 * IPs that make up the asic is walked and the resume callbacks are run for 3224 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3225 * after a suspend and updates the software state as necessary. This 3226 * function is also used for restoring the GPU after a GPU reset. 3227 * Returns 0 on success, negative error code on failure. 3228 */ 3229 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3230 { 3231 int i, r; 3232 3233 for (i = 0; i < adev->num_ip_blocks; i++) { 3234 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3235 continue; 3236 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3237 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3238 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3239 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3240 3241 r = adev->ip_blocks[i].version->funcs->resume(adev); 3242 if (r) { 3243 DRM_ERROR("resume of IP block <%s> failed %d\n", 3244 adev->ip_blocks[i].version->funcs->name, r); 3245 return r; 3246 } 3247 adev->ip_blocks[i].status.hw = true; 3248 } 3249 } 3250 3251 return 0; 3252 } 3253 3254 /** 3255 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3256 * 3257 * @adev: amdgpu_device pointer 3258 * 3259 * First resume function for hardware IPs. The list of all the hardware 3260 * IPs that make up the asic is walked and the resume callbacks are run for 3261 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3262 * functional state after a suspend and updates the software state as 3263 * necessary. This function is also used for restoring the GPU after a GPU 3264 * reset. 3265 * Returns 0 on success, negative error code on failure. 3266 */ 3267 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3268 { 3269 int i, r; 3270 3271 for (i = 0; i < adev->num_ip_blocks; i++) { 3272 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3273 continue; 3274 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3275 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3276 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3277 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3278 continue; 3279 r = adev->ip_blocks[i].version->funcs->resume(adev); 3280 if (r) { 3281 DRM_ERROR("resume of IP block <%s> failed %d\n", 3282 adev->ip_blocks[i].version->funcs->name, r); 3283 return r; 3284 } 3285 adev->ip_blocks[i].status.hw = true; 3286 } 3287 3288 return 0; 3289 } 3290 3291 /** 3292 * amdgpu_device_ip_resume - run resume for hardware IPs 3293 * 3294 * @adev: amdgpu_device pointer 3295 * 3296 * Main resume function for hardware IPs. The hardware IPs 3297 * are split into two resume functions because they are 3298 * are also used in in recovering from a GPU reset and some additional 3299 * steps need to be take between them. In this case (S3/S4) they are 3300 * run sequentially. 3301 * Returns 0 on success, negative error code on failure. 
3302 */ 3303 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3304 { 3305 int r; 3306 3307 if (!adev->in_s0ix) { 3308 r = amdgpu_amdkfd_resume_iommu(adev); 3309 if (r) 3310 return r; 3311 } 3312 3313 r = amdgpu_device_ip_resume_phase1(adev); 3314 if (r) 3315 return r; 3316 3317 r = amdgpu_device_fw_loading(adev); 3318 if (r) 3319 return r; 3320 3321 r = amdgpu_device_ip_resume_phase2(adev); 3322 3323 return r; 3324 } 3325 3326 /** 3327 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3328 * 3329 * @adev: amdgpu_device pointer 3330 * 3331 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3332 */ 3333 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3334 { 3335 if (amdgpu_sriov_vf(adev)) { 3336 if (adev->is_atom_fw) { 3337 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3338 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3339 } else { 3340 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3341 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3342 } 3343 3344 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3345 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3346 } 3347 } 3348 3349 /** 3350 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3351 * 3352 * @asic_type: AMD asic type 3353 * 3354 * Check if there is DC (new modesetting infrastructre) support for an asic. 3355 * returns true if DC has support, false if not. 3356 */ 3357 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3358 { 3359 switch (asic_type) { 3360 #ifdef CONFIG_DRM_AMDGPU_SI 3361 case CHIP_HAINAN: 3362 #endif 3363 case CHIP_TOPAZ: 3364 /* chips with no display hardware */ 3365 return false; 3366 #if defined(CONFIG_DRM_AMD_DC) 3367 case CHIP_TAHITI: 3368 case CHIP_PITCAIRN: 3369 case CHIP_VERDE: 3370 case CHIP_OLAND: 3371 /* 3372 * We have systems in the wild with these ASICs that require 3373 * LVDS and VGA support which is not supported with DC. 3374 * 3375 * Fallback to the non-DC driver here by default so as not to 3376 * cause regressions. 3377 */ 3378 #if defined(CONFIG_DRM_AMD_DC_SI) 3379 return amdgpu_dc > 0; 3380 #else 3381 return false; 3382 #endif 3383 case CHIP_BONAIRE: 3384 case CHIP_KAVERI: 3385 case CHIP_KABINI: 3386 case CHIP_MULLINS: 3387 /* 3388 * We have systems in the wild with these ASICs that require 3389 * VGA support which is not supported with DC. 3390 * 3391 * Fallback to the non-DC driver here by default so as not to 3392 * cause regressions. 
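 *
 * (Editor's note: users who explicitly set the amdgpu.dc=1 module
 * parameter still get DC on these parts; that opt-in is what the
 * "amdgpu_dc > 0" check below implements.)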
3393 */ 3394 return amdgpu_dc > 0; 3395 default: 3396 return amdgpu_dc != 0; 3397 #else 3398 default: 3399 if (amdgpu_dc > 0) 3400 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3401 "but isn't supported by ASIC, ignoring\n"); 3402 return false; 3403 #endif 3404 } 3405 } 3406 3407 /** 3408 * amdgpu_device_has_dc_support - check if dc is supported 3409 * 3410 * @adev: amdgpu_device pointer 3411 * 3412 * Returns true for supported, false for not supported 3413 */ 3414 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3415 { 3416 if (adev->enable_virtual_display || 3417 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3418 return false; 3419 3420 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3421 } 3422 3423 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3424 { 3425 struct amdgpu_device *adev = 3426 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3427 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3428 3429 /* It's a bug to not have a hive within this function */ 3430 if (WARN_ON(!hive)) 3431 return; 3432 3433 /* 3434 * Use task barrier to synchronize all xgmi reset works across the 3435 * hive. task_barrier_enter and task_barrier_exit will block 3436 * until all the threads running the xgmi reset works reach 3437 * those points. task_barrier_full will do both blocks. 3438 */ 3439 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3440 3441 task_barrier_enter(&hive->tb); 3442 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3443 3444 if (adev->asic_reset_res) 3445 goto fail; 3446 3447 task_barrier_exit(&hive->tb); 3448 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3449 3450 if (adev->asic_reset_res) 3451 goto fail; 3452 3453 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3454 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3455 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3456 } else { 3457 3458 task_barrier_full(&hive->tb); 3459 adev->asic_reset_res = amdgpu_asic_reset(adev); 3460 } 3461 3462 fail: 3463 if (adev->asic_reset_res) 3464 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3465 adev->asic_reset_res, adev_to_drm(adev)->unique); 3466 amdgpu_put_xgmi_hive(hive); 3467 } 3468 3469 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3470 { 3471 char *input = amdgpu_lockup_timeout; 3472 char *timeout_setting = NULL; 3473 int index = 0; 3474 long timeout; 3475 int ret = 0; 3476 3477 /* 3478 * By default timeout for non compute jobs is 10000 3479 * and 60000 for compute jobs. 3480 * In SR-IOV or passthrough mode, timeout for compute 3481 * jobs are 60000 by default. 3482 */ 3483 adev->gfx_timeout = msecs_to_jiffies(10000); 3484 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3485 if (amdgpu_sriov_vf(adev)) 3486 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3487 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3488 else 3489 adev->compute_timeout = msecs_to_jiffies(60000); 3490 3491 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3492 while ((timeout_setting = strsep(&input, ",")) && 3493 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3494 ret = kstrtol(timeout_setting, 0, &timeout); 3495 if (ret) 3496 return ret; 3497 3498 if (timeout == 0) { 3499 index++; 3500 continue; 3501 } else if (timeout < 0) { 3502 timeout = MAX_SCHEDULE_TIMEOUT; 3503 dev_warn(adev->dev, "lockup timeout disabled"); 3504 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3505 } else { 3506 timeout = msecs_to_jiffies(timeout); 3507 } 3508 3509 switch (index++) { 3510 case 0: 3511 adev->gfx_timeout = timeout; 3512 break; 3513 case 1: 3514 adev->compute_timeout = timeout; 3515 break; 3516 case 2: 3517 adev->sdma_timeout = timeout; 3518 break; 3519 case 3: 3520 adev->video_timeout = timeout; 3521 break; 3522 default: 3523 break; 3524 } 3525 } 3526 /* 3527 * There is only one value specified and 3528 * it should apply to all non-compute jobs. 3529 */ 3530 if (index == 1) { 3531 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3532 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3533 adev->compute_timeout = adev->gfx_timeout; 3534 } 3535 } 3536 3537 return ret; 3538 } 3539 3540 /** 3541 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3542 * 3543 * @adev: amdgpu_device pointer 3544 * 3545 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3546 */ 3547 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3548 { 3549 struct iommu_domain *domain; 3550 3551 domain = iommu_get_domain_for_dev(adev->dev); 3552 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3553 adev->ram_is_direct_mapped = true; 3554 } 3555 3556 static const struct attribute *amdgpu_dev_attributes[] = { 3557 &dev_attr_product_name.attr, 3558 &dev_attr_product_number.attr, 3559 &dev_attr_serial_number.attr, 3560 &dev_attr_pcie_replay_count.attr, 3561 NULL 3562 }; 3563 3564 /** 3565 * amdgpu_device_init - initialize the driver 3566 * 3567 * @adev: amdgpu_device pointer 3568 * @flags: driver flags 3569 * 3570 * Initializes the driver info and hw (all asics). 3571 * Returns 0 for success or an error on failure. 3572 * Called at driver startup. 
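 *
 * Editor's note (assumption, not verified from this file alone): this is
 * normally reached from the PCI probe / DRM load path, roughly
 *
 *   amdgpu_pci_probe() -> amdgpu_driver_load_kms() -> amdgpu_device_init(adev, flags)
 *
 * with @flags coming from the PCI ID table's driver_data.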
3573 */ 3574 int amdgpu_device_init(struct amdgpu_device *adev, 3575 uint32_t flags) 3576 { 3577 struct drm_device *ddev = adev_to_drm(adev); 3578 struct pci_dev *pdev = adev->pdev; 3579 int r, i; 3580 bool px = false; 3581 u32 max_MBps; 3582 int tmp; 3583 3584 adev->shutdown = false; 3585 adev->flags = flags; 3586 3587 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3588 adev->asic_type = amdgpu_force_asic_type; 3589 else 3590 adev->asic_type = flags & AMD_ASIC_MASK; 3591 3592 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3593 if (amdgpu_emu_mode == 1) 3594 adev->usec_timeout *= 10; 3595 adev->gmc.gart_size = 512 * 1024 * 1024; 3596 adev->accel_working = false; 3597 adev->num_rings = 0; 3598 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3599 adev->mman.buffer_funcs = NULL; 3600 adev->mman.buffer_funcs_ring = NULL; 3601 adev->vm_manager.vm_pte_funcs = NULL; 3602 adev->vm_manager.vm_pte_num_scheds = 0; 3603 adev->gmc.gmc_funcs = NULL; 3604 adev->harvest_ip_mask = 0x0; 3605 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3606 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3607 3608 adev->smc_rreg = &amdgpu_invalid_rreg; 3609 adev->smc_wreg = &amdgpu_invalid_wreg; 3610 adev->pcie_rreg = &amdgpu_invalid_rreg; 3611 adev->pcie_wreg = &amdgpu_invalid_wreg; 3612 adev->pciep_rreg = &amdgpu_invalid_rreg; 3613 adev->pciep_wreg = &amdgpu_invalid_wreg; 3614 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3615 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3616 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3617 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3618 adev->didt_rreg = &amdgpu_invalid_rreg; 3619 adev->didt_wreg = &amdgpu_invalid_wreg; 3620 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3621 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3622 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3623 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3624 3625 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3626 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3627 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3628 3629 /* mutex initialization are all done here so we 3630 * can recall function without having locking issues */ 3631 mutex_init(&adev->firmware.mutex); 3632 mutex_init(&adev->pm.mutex); 3633 mutex_init(&adev->gfx.gpu_clock_mutex); 3634 mutex_init(&adev->srbm_mutex); 3635 mutex_init(&adev->gfx.pipe_reserve_mutex); 3636 mutex_init(&adev->gfx.gfx_off_mutex); 3637 mutex_init(&adev->grbm_idx_mutex); 3638 mutex_init(&adev->mn_lock); 3639 mutex_init(&adev->virt.vf_errors.lock); 3640 hash_init(adev->mn_hash); 3641 mutex_init(&adev->psp.mutex); 3642 mutex_init(&adev->notifier_lock); 3643 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3644 mutex_init(&adev->benchmark_mutex); 3645 3646 amdgpu_device_init_apu_flags(adev); 3647 3648 r = amdgpu_device_check_arguments(adev); 3649 if (r) 3650 return r; 3651 3652 spin_lock_init(&adev->mmio_idx_lock); 3653 spin_lock_init(&adev->smc_idx_lock); 3654 spin_lock_init(&adev->pcie_idx_lock); 3655 spin_lock_init(&adev->uvd_ctx_idx_lock); 3656 spin_lock_init(&adev->didt_idx_lock); 3657 spin_lock_init(&adev->gc_cac_idx_lock); 3658 spin_lock_init(&adev->se_cac_idx_lock); 3659 spin_lock_init(&adev->audio_endpt_idx_lock); 3660 spin_lock_init(&adev->mm_stats.lock); 3661 3662 INIT_LIST_HEAD(&adev->shadow_list); 3663 mutex_init(&adev->shadow_list_lock); 3664 3665 INIT_LIST_HEAD(&adev->reset_list); 3666 3667 INIT_LIST_HEAD(&adev->ras_list); 
3668 3669 INIT_DELAYED_WORK(&adev->delayed_init_work, 3670 amdgpu_device_delayed_init_work_handler); 3671 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3672 amdgpu_device_delay_enable_gfx_off); 3673 3674 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3675 3676 adev->gfx.gfx_off_req_count = 1; 3677 adev->gfx.gfx_off_residency = 0; 3678 adev->gfx.gfx_off_entrycount = 0; 3679 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3680 3681 atomic_set(&adev->throttling_logging_enabled, 1); 3682 /* 3683 * If throttling continues, logging will be performed every minute 3684 * to avoid log flooding. "-1" is subtracted since the thermal 3685 * throttling interrupt comes every second. Thus, the total logging 3686 * interval is 59 seconds(retelimited printk interval) + 1(waiting 3687 * for throttling interrupt) = 60 seconds. 3688 */ 3689 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3690 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3691 3692 /* Registers mapping */ 3693 /* TODO: block userspace mapping of io register */ 3694 if (adev->asic_type >= CHIP_BONAIRE) { 3695 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3696 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3697 } else { 3698 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3699 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3700 } 3701 3702 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 3703 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 3704 3705 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3706 if (adev->rmmio == NULL) { 3707 return -ENOMEM; 3708 } 3709 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3710 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3711 3712 amdgpu_device_get_pcie_info(adev); 3713 3714 if (amdgpu_mcbp) 3715 DRM_INFO("MCBP is enabled\n"); 3716 3717 /* 3718 * Reset domain needs to be present early, before XGMI hive discovered 3719 * (if any) and intitialized to use reset sem and in_gpu reset flag 3720 * early on during init and before calling to RREG32. 3721 */ 3722 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3723 if (!adev->reset_domain) 3724 return -ENOMEM; 3725 3726 /* detect hw virtualization here */ 3727 amdgpu_detect_virtualization(adev); 3728 3729 r = amdgpu_device_get_job_timeout_settings(adev); 3730 if (r) { 3731 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3732 return r; 3733 } 3734 3735 /* early init functions */ 3736 r = amdgpu_device_ip_early_init(adev); 3737 if (r) 3738 return r; 3739 3740 /* Get rid of things like offb */ 3741 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 3742 if (r) 3743 return r; 3744 3745 /* Enable TMZ based on IP_VERSION */ 3746 amdgpu_gmc_tmz_set(adev); 3747 3748 amdgpu_gmc_noretry_set(adev); 3749 /* Need to get xgmi info early to decide the reset behavior*/ 3750 if (adev->gmc.xgmi.supported) { 3751 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3752 if (r) 3753 return r; 3754 } 3755 3756 /* enable PCIE atomic ops */ 3757 if (amdgpu_sriov_vf(adev)) 3758 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3759 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3760 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3761 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a 3762 * internal path natively support atomics, set have_atomics_support to true. 
3763 */ 3764 else if ((adev->flags & AMD_IS_APU) && 3765 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) 3766 adev->have_atomics_support = true; 3767 else 3768 adev->have_atomics_support = 3769 !pci_enable_atomic_ops_to_root(adev->pdev, 3770 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3771 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3772 if (!adev->have_atomics_support) 3773 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3774 3775 /* doorbell bar mapping and doorbell index init*/ 3776 amdgpu_device_doorbell_init(adev); 3777 3778 if (amdgpu_emu_mode == 1) { 3779 /* post the asic on emulation mode */ 3780 emu_soc_asic_init(adev); 3781 goto fence_driver_init; 3782 } 3783 3784 amdgpu_reset_init(adev); 3785 3786 /* detect if we are with an SRIOV vbios */ 3787 amdgpu_device_detect_sriov_bios(adev); 3788 3789 /* check if we need to reset the asic 3790 * E.g., driver was not cleanly unloaded previously, etc. 3791 */ 3792 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3793 if (adev->gmc.xgmi.num_physical_nodes) { 3794 dev_info(adev->dev, "Pending hive reset.\n"); 3795 adev->gmc.xgmi.pending_reset = true; 3796 /* Only need to init necessary block for SMU to handle the reset */ 3797 for (i = 0; i < adev->num_ip_blocks; i++) { 3798 if (!adev->ip_blocks[i].status.valid) 3799 continue; 3800 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3801 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3802 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3803 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3804 DRM_DEBUG("IP %s disabled for hw_init.\n", 3805 adev->ip_blocks[i].version->funcs->name); 3806 adev->ip_blocks[i].status.hw = true; 3807 } 3808 } 3809 } else { 3810 tmp = amdgpu_reset_method; 3811 /* It should do a default reset when loading or reloading the driver, 3812 * regardless of the module parameter reset_method. 
3813 */ 3814 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 3815 r = amdgpu_asic_reset(adev); 3816 amdgpu_reset_method = tmp; 3817 if (r) { 3818 dev_err(adev->dev, "asic reset on init failed\n"); 3819 goto failed; 3820 } 3821 } 3822 } 3823 3824 /* Post card if necessary */ 3825 if (amdgpu_device_need_post(adev)) { 3826 if (!adev->bios) { 3827 dev_err(adev->dev, "no vBIOS found\n"); 3828 r = -EINVAL; 3829 goto failed; 3830 } 3831 DRM_INFO("GPU posting now...\n"); 3832 r = amdgpu_device_asic_init(adev); 3833 if (r) { 3834 dev_err(adev->dev, "gpu post error!\n"); 3835 goto failed; 3836 } 3837 } 3838 3839 if (adev->is_atom_fw) { 3840 /* Initialize clocks */ 3841 r = amdgpu_atomfirmware_get_clock_info(adev); 3842 if (r) { 3843 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3844 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3845 goto failed; 3846 } 3847 } else { 3848 /* Initialize clocks */ 3849 r = amdgpu_atombios_get_clock_info(adev); 3850 if (r) { 3851 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3852 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3853 goto failed; 3854 } 3855 /* init i2c buses */ 3856 if (!amdgpu_device_has_dc_support(adev)) 3857 amdgpu_atombios_i2c_init(adev); 3858 } 3859 3860 fence_driver_init: 3861 /* Fence driver */ 3862 r = amdgpu_fence_driver_sw_init(adev); 3863 if (r) { 3864 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3865 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3866 goto failed; 3867 } 3868 3869 /* init the mode config */ 3870 drm_mode_config_init(adev_to_drm(adev)); 3871 3872 r = amdgpu_device_ip_init(adev); 3873 if (r) { 3874 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3875 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3876 goto release_ras_con; 3877 } 3878 3879 amdgpu_fence_driver_hw_init(adev); 3880 3881 dev_info(adev->dev, 3882 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3883 adev->gfx.config.max_shader_engines, 3884 adev->gfx.config.max_sh_per_se, 3885 adev->gfx.config.max_cu_per_sh, 3886 adev->gfx.cu_info.number); 3887 3888 adev->accel_working = true; 3889 3890 amdgpu_vm_check_compute_bug(adev); 3891 3892 /* Initialize the buffer migration limit. */ 3893 if (amdgpu_moverate >= 0) 3894 max_MBps = amdgpu_moverate; 3895 else 3896 max_MBps = 8; /* Allow 8 MB/s. */ 3897 /* Get a log2 for easy divisions. */ 3898 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3899 3900 r = amdgpu_pm_sysfs_init(adev); 3901 if (r) 3902 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 3903 3904 r = amdgpu_ucode_sysfs_init(adev); 3905 if (r) { 3906 adev->ucode_sysfs_en = false; 3907 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3908 } else 3909 adev->ucode_sysfs_en = true; 3910 3911 r = amdgpu_psp_sysfs_init(adev); 3912 if (r) { 3913 adev->psp_sysfs_en = false; 3914 if (!amdgpu_sriov_vf(adev)) 3915 DRM_ERROR("Creating psp sysfs failed\n"); 3916 } else 3917 adev->psp_sysfs_en = true; 3918 3919 /* 3920 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3921 * Otherwise the mgpu fan boost feature will be skipped due to the 3922 * gpu instance is counted less. 3923 */ 3924 amdgpu_register_gpu_instance(adev); 3925 3926 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3927 * explicit gating rather than handling it automatically. 
3928 */ 3929 if (!adev->gmc.xgmi.pending_reset) { 3930 r = amdgpu_device_ip_late_init(adev); 3931 if (r) { 3932 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3933 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3934 goto release_ras_con; 3935 } 3936 /* must succeed. */ 3937 amdgpu_ras_resume(adev); 3938 queue_delayed_work(system_wq, &adev->delayed_init_work, 3939 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3940 } 3941 3942 if (amdgpu_sriov_vf(adev)) { 3943 amdgpu_virt_release_full_gpu(adev, true); 3944 flush_delayed_work(&adev->delayed_init_work); 3945 } 3946 3947 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3948 if (r) 3949 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3950 3951 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3952 r = amdgpu_pmu_init(adev); 3953 if (r) 3954 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3955 3956 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3957 if (amdgpu_device_cache_pci_state(adev->pdev)) 3958 pci_restore_state(pdev); 3959 3960 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3961 /* this will fail for cards that aren't VGA class devices, just 3962 * ignore it */ 3963 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3964 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 3965 3966 px = amdgpu_device_supports_px(ddev); 3967 3968 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 3969 apple_gmux_detect(NULL, NULL))) 3970 vga_switcheroo_register_client(adev->pdev, 3971 &amdgpu_switcheroo_ops, px); 3972 3973 if (px) 3974 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3975 3976 if (adev->gmc.xgmi.pending_reset) 3977 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3978 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3979 3980 amdgpu_device_check_iommu_direct_map(adev); 3981 3982 return 0; 3983 3984 release_ras_con: 3985 if (amdgpu_sriov_vf(adev)) 3986 amdgpu_virt_release_full_gpu(adev, true); 3987 3988 /* failed in exclusive mode due to timeout */ 3989 if (amdgpu_sriov_vf(adev) && 3990 !amdgpu_sriov_runtime(adev) && 3991 amdgpu_virt_mmio_blocked(adev) && 3992 !amdgpu_virt_wait_reset(adev)) { 3993 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3994 /* Don't send request since VF is inactive. */ 3995 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3996 adev->virt.ops = NULL; 3997 r = -EAGAIN; 3998 } 3999 amdgpu_release_ras_context(adev); 4000 4001 failed: 4002 amdgpu_vf_error_trans_all(adev); 4003 4004 return r; 4005 } 4006 4007 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4008 { 4009 4010 /* Clear all CPU mappings pointing to this device */ 4011 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4012 4013 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4014 amdgpu_device_doorbell_fini(adev); 4015 4016 iounmap(adev->rmmio); 4017 adev->rmmio = NULL; 4018 if (adev->mman.aper_base_kaddr) 4019 iounmap(adev->mman.aper_base_kaddr); 4020 adev->mman.aper_base_kaddr = NULL; 4021 4022 /* Memory manager related */ 4023 if (!adev->gmc.xgmi.connected_to_cpu) { 4024 arch_phys_wc_del(adev->gmc.vram_mtrr); 4025 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4026 } 4027 } 4028 4029 /** 4030 * amdgpu_device_fini_hw - tear down the driver 4031 * 4032 * @adev: amdgpu_device pointer 4033 * 4034 * Tear down the driver info (all asics). 4035 * Called at driver shutdown. 
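 * Stops pending work, disables interrupts, shuts down the display and releases the hardware side of the device; the software side is torn down in amdgpu_device_fini_sw().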
4036 */ 4037 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4038 { 4039 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4040 flush_delayed_work(&adev->delayed_init_work); 4041 adev->shutdown = true; 4042 4043 /* make sure IB test finished before entering exclusive mode 4044 * to avoid preemption on IB test 4045 * */ 4046 if (amdgpu_sriov_vf(adev)) { 4047 amdgpu_virt_request_full_gpu(adev, false); 4048 amdgpu_virt_fini_data_exchange(adev); 4049 } 4050 4051 /* disable all interrupts */ 4052 amdgpu_irq_disable_all(adev); 4053 if (adev->mode_info.mode_config_initialized) { 4054 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4055 drm_helper_force_disable_all(adev_to_drm(adev)); 4056 else 4057 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4058 } 4059 amdgpu_fence_driver_hw_fini(adev); 4060 4061 if (adev->mman.initialized) 4062 drain_workqueue(adev->mman.bdev.wq); 4063 4064 if (adev->pm.sysfs_initialized) 4065 amdgpu_pm_sysfs_fini(adev); 4066 if (adev->ucode_sysfs_en) 4067 amdgpu_ucode_sysfs_fini(adev); 4068 if (adev->psp_sysfs_en) 4069 amdgpu_psp_sysfs_fini(adev); 4070 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4071 4072 /* disable ras feature must before hw fini */ 4073 amdgpu_ras_pre_fini(adev); 4074 4075 amdgpu_device_ip_fini_early(adev); 4076 4077 amdgpu_irq_fini_hw(adev); 4078 4079 if (adev->mman.initialized) 4080 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4081 4082 amdgpu_gart_dummy_page_fini(adev); 4083 4084 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4085 amdgpu_device_unmap_mmio(adev); 4086 4087 } 4088 4089 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4090 { 4091 int idx; 4092 bool px; 4093 4094 amdgpu_fence_driver_sw_fini(adev); 4095 amdgpu_device_ip_fini(adev); 4096 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4097 adev->accel_working = false; 4098 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4099 4100 amdgpu_reset_fini(adev); 4101 4102 /* free i2c buses */ 4103 if (!amdgpu_device_has_dc_support(adev)) 4104 amdgpu_i2c_fini(adev); 4105 4106 if (amdgpu_emu_mode != 1) 4107 amdgpu_atombios_fini(adev); 4108 4109 kfree(adev->bios); 4110 adev->bios = NULL; 4111 4112 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4113 4114 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 4115 apple_gmux_detect(NULL, NULL))) 4116 vga_switcheroo_unregister_client(adev->pdev); 4117 4118 if (px) 4119 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4120 4121 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4122 vga_client_unregister(adev->pdev); 4123 4124 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4125 4126 iounmap(adev->rmmio); 4127 adev->rmmio = NULL; 4128 amdgpu_device_doorbell_fini(adev); 4129 drm_dev_exit(idx); 4130 } 4131 4132 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4133 amdgpu_pmu_fini(adev); 4134 if (adev->mman.discovery_bin) 4135 amdgpu_discovery_fini(adev); 4136 4137 amdgpu_reset_put_reset_domain(adev->reset_domain); 4138 adev->reset_domain = NULL; 4139 4140 kfree(adev->pci_state); 4141 4142 } 4143 4144 /** 4145 * amdgpu_device_evict_resources - evict device resources 4146 * @adev: amdgpu device object 4147 * 4148 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4149 * of the vram memory type. Mainly used for evicting device resources 4150 * at suspend time. 
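 * Returns 0 on success (eviction is skipped for APUs during S3/s2idle), otherwise the eviction error code.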
4151 * 4152 */ 4153 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4154 { 4155 int ret; 4156 4157 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4158 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4159 return 0; 4160 4161 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4162 if (ret) 4163 DRM_WARN("evicting device resources failed\n"); 4164 return ret; 4165 } 4166 4167 /* 4168 * Suspend & resume. 4169 */ 4170 /** 4171 * amdgpu_device_suspend - initiate device suspend 4172 * 4173 * @dev: drm dev pointer 4174 * @fbcon : notify the fbdev of suspend 4175 * 4176 * Puts the hw in the suspend state (all asics). 4177 * Returns 0 for success or an error on failure. 4178 * Called at driver suspend. 4179 */ 4180 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4181 { 4182 struct amdgpu_device *adev = drm_to_adev(dev); 4183 int r = 0; 4184 4185 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4186 return 0; 4187 4188 adev->in_suspend = true; 4189 4190 /* Evict the majority of BOs before grabbing the full access */ 4191 r = amdgpu_device_evict_resources(adev); 4192 if (r) 4193 return r; 4194 4195 if (amdgpu_sriov_vf(adev)) { 4196 amdgpu_virt_fini_data_exchange(adev); 4197 r = amdgpu_virt_request_full_gpu(adev, false); 4198 if (r) 4199 return r; 4200 } 4201 4202 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4203 DRM_WARN("smart shift update failed\n"); 4204 4205 if (fbcon) 4206 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4207 4208 cancel_delayed_work_sync(&adev->delayed_init_work); 4209 4210 amdgpu_ras_suspend(adev); 4211 4212 amdgpu_device_ip_suspend_phase1(adev); 4213 4214 if (!adev->in_s0ix) 4215 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4216 4217 r = amdgpu_device_evict_resources(adev); 4218 if (r) 4219 return r; 4220 4221 amdgpu_fence_driver_hw_fini(adev); 4222 4223 amdgpu_device_ip_suspend_phase2(adev); 4224 4225 if (amdgpu_sriov_vf(adev)) 4226 amdgpu_virt_release_full_gpu(adev, false); 4227 4228 return 0; 4229 } 4230 4231 /** 4232 * amdgpu_device_resume - initiate device resume 4233 * 4234 * @dev: drm dev pointer 4235 * @fbcon : notify the fbdev of resume 4236 * 4237 * Bring the hw back to operating state (all asics). 4238 * Returns 0 for success or an error on failure. 4239 * Called at driver resume. 
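 * For SR-IOV, full GPU access is requested from the host before the hardware is re-initialized and released again once resume completes.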
4240 */ 4241 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4242 { 4243 struct amdgpu_device *adev = drm_to_adev(dev); 4244 int r = 0; 4245 4246 if (amdgpu_sriov_vf(adev)) { 4247 r = amdgpu_virt_request_full_gpu(adev, true); 4248 if (r) 4249 return r; 4250 } 4251 4252 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4253 return 0; 4254 4255 if (adev->in_s0ix) 4256 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4257 4258 /* post card */ 4259 if (amdgpu_device_need_post(adev)) { 4260 r = amdgpu_device_asic_init(adev); 4261 if (r) 4262 dev_err(adev->dev, "amdgpu asic init failed\n"); 4263 } 4264 4265 r = amdgpu_device_ip_resume(adev); 4266 4267 if (r) { 4268 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4269 goto exit; 4270 } 4271 amdgpu_fence_driver_hw_init(adev); 4272 4273 r = amdgpu_device_ip_late_init(adev); 4274 if (r) 4275 goto exit; 4276 4277 queue_delayed_work(system_wq, &adev->delayed_init_work, 4278 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4279 4280 if (!adev->in_s0ix) { 4281 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4282 if (r) 4283 goto exit; 4284 } 4285 4286 exit: 4287 if (amdgpu_sriov_vf(adev)) { 4288 amdgpu_virt_init_data_exchange(adev); 4289 amdgpu_virt_release_full_gpu(adev, true); 4290 } 4291 4292 if (r) 4293 return r; 4294 4295 /* Make sure IB tests flushed */ 4296 flush_delayed_work(&adev->delayed_init_work); 4297 4298 if (fbcon) 4299 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4300 4301 amdgpu_ras_resume(adev); 4302 4303 if (adev->mode_info.num_crtc) { 4304 /* 4305 * Most of the connector probing functions try to acquire runtime pm 4306 * refs to ensure that the GPU is powered on when connector polling is 4307 * performed. Since we're calling this from a runtime PM callback, 4308 * trying to acquire rpm refs will cause us to deadlock. 4309 * 4310 * Since we're guaranteed to be holding the rpm lock, it's safe to 4311 * temporarily disable the rpm helpers so this doesn't deadlock us. 4312 */ 4313 #ifdef CONFIG_PM 4314 dev->dev->power.disable_depth++; 4315 #endif 4316 if (!adev->dc_enabled) 4317 drm_helper_hpd_irq_event(dev); 4318 else 4319 drm_kms_helper_hotplug_event(dev); 4320 #ifdef CONFIG_PM 4321 dev->dev->power.disable_depth--; 4322 #endif 4323 } 4324 adev->in_suspend = false; 4325 4326 if (adev->enable_mes) 4327 amdgpu_mes_self_test(adev); 4328 4329 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4330 DRM_WARN("smart shift update failed\n"); 4331 4332 return 0; 4333 } 4334 4335 /** 4336 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4337 * 4338 * @adev: amdgpu_device pointer 4339 * 4340 * The list of all the hardware IPs that make up the asic is walked and 4341 * the check_soft_reset callbacks are run. check_soft_reset determines 4342 * if the asic is still hung or not. 4343 * Returns true if any of the IPs are still in a hung state, false if not. 
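 * SR-IOV VFs and ASICs that require a full reset always report a hang here.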
4344 */ 4345 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4346 { 4347 int i; 4348 bool asic_hang = false; 4349 4350 if (amdgpu_sriov_vf(adev)) 4351 return true; 4352 4353 if (amdgpu_asic_need_full_reset(adev)) 4354 return true; 4355 4356 for (i = 0; i < adev->num_ip_blocks; i++) { 4357 if (!adev->ip_blocks[i].status.valid) 4358 continue; 4359 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4360 adev->ip_blocks[i].status.hang = 4361 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4362 if (adev->ip_blocks[i].status.hang) { 4363 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4364 asic_hang = true; 4365 } 4366 } 4367 return asic_hang; 4368 } 4369 4370 /** 4371 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4372 * 4373 * @adev: amdgpu_device pointer 4374 * 4375 * The list of all the hardware IPs that make up the asic is walked and the 4376 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4377 * handles any IP specific hardware or software state changes that are 4378 * necessary for a soft reset to succeed. 4379 * Returns 0 on success, negative error code on failure. 4380 */ 4381 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4382 { 4383 int i, r = 0; 4384 4385 for (i = 0; i < adev->num_ip_blocks; i++) { 4386 if (!adev->ip_blocks[i].status.valid) 4387 continue; 4388 if (adev->ip_blocks[i].status.hang && 4389 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4390 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4391 if (r) 4392 return r; 4393 } 4394 } 4395 4396 return 0; 4397 } 4398 4399 /** 4400 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4401 * 4402 * @adev: amdgpu_device pointer 4403 * 4404 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4405 * reset is necessary to recover. 4406 * Returns true if a full asic reset is required, false if not. 4407 */ 4408 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4409 { 4410 int i; 4411 4412 if (amdgpu_asic_need_full_reset(adev)) 4413 return true; 4414 4415 for (i = 0; i < adev->num_ip_blocks; i++) { 4416 if (!adev->ip_blocks[i].status.valid) 4417 continue; 4418 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4419 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4420 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4421 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4422 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4423 if (adev->ip_blocks[i].status.hang) { 4424 dev_info(adev->dev, "Some block need full reset!\n"); 4425 return true; 4426 } 4427 } 4428 } 4429 return false; 4430 } 4431 4432 /** 4433 * amdgpu_device_ip_soft_reset - do a soft reset 4434 * 4435 * @adev: amdgpu_device pointer 4436 * 4437 * The list of all the hardware IPs that make up the asic is walked and the 4438 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4439 * IP specific hardware or software state changes that are necessary to soft 4440 * reset the IP. 4441 * Returns 0 on success, negative error code on failure. 
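 * The walk stops at the first callback that returns an error.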
4442 */ 4443 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4444 { 4445 int i, r = 0; 4446 4447 for (i = 0; i < adev->num_ip_blocks; i++) { 4448 if (!adev->ip_blocks[i].status.valid) 4449 continue; 4450 if (adev->ip_blocks[i].status.hang && 4451 adev->ip_blocks[i].version->funcs->soft_reset) { 4452 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4453 if (r) 4454 return r; 4455 } 4456 } 4457 4458 return 0; 4459 } 4460 4461 /** 4462 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4463 * 4464 * @adev: amdgpu_device pointer 4465 * 4466 * The list of all the hardware IPs that make up the asic is walked and the 4467 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4468 * handles any IP specific hardware or software state changes that are 4469 * necessary after the IP has been soft reset. 4470 * Returns 0 on success, negative error code on failure. 4471 */ 4472 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4473 { 4474 int i, r = 0; 4475 4476 for (i = 0; i < adev->num_ip_blocks; i++) { 4477 if (!adev->ip_blocks[i].status.valid) 4478 continue; 4479 if (adev->ip_blocks[i].status.hang && 4480 adev->ip_blocks[i].version->funcs->post_soft_reset) 4481 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4482 if (r) 4483 return r; 4484 } 4485 4486 return 0; 4487 } 4488 4489 /** 4490 * amdgpu_device_recover_vram - Recover some VRAM contents 4491 * 4492 * @adev: amdgpu_device pointer 4493 * 4494 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4495 * restore things like GPUVM page tables after a GPU reset where 4496 * the contents of VRAM might be lost. 4497 * 4498 * Returns: 4499 * 0 on success, negative error code on failure. 4500 */ 4501 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4502 { 4503 struct dma_fence *fence = NULL, *next = NULL; 4504 struct amdgpu_bo *shadow; 4505 struct amdgpu_bo_vm *vmbo; 4506 long r = 1, tmo; 4507 4508 if (amdgpu_sriov_runtime(adev)) 4509 tmo = msecs_to_jiffies(8000); 4510 else 4511 tmo = msecs_to_jiffies(100); 4512 4513 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4514 mutex_lock(&adev->shadow_list_lock); 4515 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4516 /* If vm is compute context or adev is APU, shadow will be NULL */ 4517 if (!vmbo->shadow) 4518 continue; 4519 shadow = vmbo->shadow; 4520 4521 /* No need to recover an evicted BO */ 4522 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4523 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4524 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4525 continue; 4526 4527 r = amdgpu_bo_restore_shadow(shadow, &next); 4528 if (r) 4529 break; 4530 4531 if (fence) { 4532 tmo = dma_fence_wait_timeout(fence, false, tmo); 4533 dma_fence_put(fence); 4534 fence = next; 4535 if (tmo == 0) { 4536 r = -ETIMEDOUT; 4537 break; 4538 } else if (tmo < 0) { 4539 r = tmo; 4540 break; 4541 } 4542 } else { 4543 fence = next; 4544 } 4545 } 4546 mutex_unlock(&adev->shadow_list_lock); 4547 4548 if (fence) 4549 tmo = dma_fence_wait_timeout(fence, false, tmo); 4550 dma_fence_put(fence); 4551 4552 if (r < 0 || tmo <= 0) { 4553 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4554 return -EIO; 4555 } 4556 4557 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4558 return 0; 4559 } 4560 4561 4562 /** 4563 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4564 * 4565 * @adev: amdgpu_device pointer 4566 * 
@from_hypervisor: request from hypervisor 4567 * 4568 * Do a VF FLR and reinitialize the ASIC. 4569 * Returns 0 on success, negative error code on failure. 4570 */ 4571 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4572 bool from_hypervisor) 4573 { 4574 int r; 4575 struct amdgpu_hive_info *hive = NULL; 4576 int retry_limit = 0; 4577 4578 retry: 4579 amdgpu_amdkfd_pre_reset(adev); 4580 4581 if (from_hypervisor) 4582 r = amdgpu_virt_request_full_gpu(adev, true); 4583 else 4584 r = amdgpu_virt_reset_gpu(adev); 4585 if (r) 4586 return r; 4587 4588 /* Resume IP prior to SMC */ 4589 r = amdgpu_device_ip_reinit_early_sriov(adev); 4590 if (r) 4591 goto error; 4592 4593 amdgpu_virt_init_data_exchange(adev); 4594 4595 r = amdgpu_device_fw_loading(adev); 4596 if (r) 4597 return r; 4598 4599 /* now we are okay to resume SMC/CP/SDMA */ 4600 r = amdgpu_device_ip_reinit_late_sriov(adev); 4601 if (r) 4602 goto error; 4603 4604 hive = amdgpu_get_xgmi_hive(adev); 4605 /* Update PSP FW topology after reset */ 4606 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4607 r = amdgpu_xgmi_update_topology(hive, adev); 4608 4609 if (hive) 4610 amdgpu_put_xgmi_hive(hive); 4611 4612 if (!r) { 4613 amdgpu_irq_gpu_reset_resume_helper(adev); 4614 r = amdgpu_ib_ring_tests(adev); 4615 4616 amdgpu_amdkfd_post_reset(adev); 4617 } 4618 4619 error: 4620 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4621 amdgpu_inc_vram_lost(adev); 4622 r = amdgpu_device_recover_vram(adev); 4623 } 4624 amdgpu_virt_release_full_gpu(adev, true); 4625 4626 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4627 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4628 retry_limit++; 4629 goto retry; 4630 } else 4631 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4632 } 4633 4634 return r; 4635 } 4636 4637 /** 4638 * amdgpu_device_has_job_running - check if there is any job in the pending list 4639 * 4640 * @adev: amdgpu_device pointer 4641 * 4642 * Check if any ring's scheduler still has a job in its pending list. 4643 */ 4644 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4645 { 4646 int i; 4647 struct drm_sched_job *job; 4648 4649 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4650 struct amdgpu_ring *ring = adev->rings[i]; 4651 4652 if (!ring || !ring->sched.thread) 4653 continue; 4654 4655 spin_lock(&ring->sched.job_list_lock); 4656 job = list_first_entry_or_null(&ring->sched.pending_list, 4657 struct drm_sched_job, list); 4658 spin_unlock(&ring->sched.job_list_lock); 4659 if (job) 4660 return true; 4661 } 4662 return false; 4663 } 4664 4665 /** 4666 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4667 * 4668 * @adev: amdgpu_device pointer 4669 * 4670 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4671 * a hung GPU.
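 * Returns true if recovery should be attempted, false if recovery is disabled.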
4672 */ 4673 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4674 { 4675 4676 if (amdgpu_gpu_recovery == 0) 4677 goto disabled; 4678 4679 /* Skip soft reset check in fatal error mode */ 4680 if (!amdgpu_ras_is_poison_mode_supported(adev)) 4681 return true; 4682 4683 if (amdgpu_sriov_vf(adev)) 4684 return true; 4685 4686 if (amdgpu_gpu_recovery == -1) { 4687 switch (adev->asic_type) { 4688 #ifdef CONFIG_DRM_AMDGPU_SI 4689 case CHIP_VERDE: 4690 case CHIP_TAHITI: 4691 case CHIP_PITCAIRN: 4692 case CHIP_OLAND: 4693 case CHIP_HAINAN: 4694 #endif 4695 #ifdef CONFIG_DRM_AMDGPU_CIK 4696 case CHIP_KAVERI: 4697 case CHIP_KABINI: 4698 case CHIP_MULLINS: 4699 #endif 4700 case CHIP_CARRIZO: 4701 case CHIP_STONEY: 4702 case CHIP_CYAN_SKILLFISH: 4703 goto disabled; 4704 default: 4705 break; 4706 } 4707 } 4708 4709 return true; 4710 4711 disabled: 4712 dev_info(adev->dev, "GPU recovery disabled.\n"); 4713 return false; 4714 } 4715 4716 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4717 { 4718 u32 i; 4719 int ret = 0; 4720 4721 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4722 4723 dev_info(adev->dev, "GPU mode1 reset\n"); 4724 4725 /* disable BM */ 4726 pci_clear_master(adev->pdev); 4727 4728 amdgpu_device_cache_pci_state(adev->pdev); 4729 4730 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4731 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4732 ret = amdgpu_dpm_mode1_reset(adev); 4733 } else { 4734 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4735 ret = psp_gpu_reset(adev); 4736 } 4737 4738 if (ret) 4739 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4740 4741 amdgpu_device_load_pci_state(adev->pdev); 4742 4743 /* wait for asic to come out of reset */ 4744 for (i = 0; i < adev->usec_timeout; i++) { 4745 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4746 4747 if (memsize != 0xffffffff) 4748 break; 4749 udelay(1); 4750 } 4751 4752 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4753 return ret; 4754 } 4755 4756 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4757 struct amdgpu_reset_context *reset_context) 4758 { 4759 int i, r = 0; 4760 struct amdgpu_job *job = NULL; 4761 bool need_full_reset = 4762 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4763 4764 if (reset_context->reset_req_dev == adev) 4765 job = reset_context->job; 4766 4767 if (amdgpu_sriov_vf(adev)) { 4768 /* stop the data exchange thread */ 4769 amdgpu_virt_fini_data_exchange(adev); 4770 } 4771 4772 amdgpu_fence_driver_isr_toggle(adev, true); 4773 4774 /* block all schedulers and reset given job's ring */ 4775 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4776 struct amdgpu_ring *ring = adev->rings[i]; 4777 4778 if (!ring || !ring->sched.thread) 4779 continue; 4780 4781 /*clear job fence from fence drv to avoid force_completion 4782 *leave NULL and vm flush fence in fence drv */ 4783 amdgpu_fence_driver_clear_job_fences(ring); 4784 4785 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4786 amdgpu_fence_driver_force_completion(ring); 4787 } 4788 4789 amdgpu_fence_driver_isr_toggle(adev, false); 4790 4791 if (job && job->vm) 4792 drm_sched_increase_karma(&job->base); 4793 4794 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4795 /* If reset handler not implemented, continue; otherwise return */ 4796 if (r == -ENOSYS) 4797 r = 0; 4798 else 4799 return r; 4800 4801 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4802 if (!amdgpu_sriov_vf(adev)) { 4803 4804 if (!need_full_reset) 4805 need_full_reset = 
amdgpu_device_ip_need_full_reset(adev); 4806 4807 if (!need_full_reset && amdgpu_gpu_recovery && 4808 amdgpu_device_ip_check_soft_reset(adev)) { 4809 amdgpu_device_ip_pre_soft_reset(adev); 4810 r = amdgpu_device_ip_soft_reset(adev); 4811 amdgpu_device_ip_post_soft_reset(adev); 4812 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4813 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4814 need_full_reset = true; 4815 } 4816 } 4817 4818 if (need_full_reset) 4819 r = amdgpu_device_ip_suspend(adev); 4820 if (need_full_reset) 4821 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4822 else 4823 clear_bit(AMDGPU_NEED_FULL_RESET, 4824 &reset_context->flags); 4825 } 4826 4827 return r; 4828 } 4829 4830 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 4831 { 4832 int i; 4833 4834 lockdep_assert_held(&adev->reset_domain->sem); 4835 4836 for (i = 0; i < adev->num_regs; i++) { 4837 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); 4838 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], 4839 adev->reset_dump_reg_value[i]); 4840 } 4841 4842 return 0; 4843 } 4844 4845 #ifdef CONFIG_DEV_COREDUMP 4846 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, 4847 size_t count, void *data, size_t datalen) 4848 { 4849 struct drm_printer p; 4850 struct amdgpu_device *adev = data; 4851 struct drm_print_iterator iter; 4852 int i; 4853 4854 iter.data = buffer; 4855 iter.offset = 0; 4856 iter.start = offset; 4857 iter.remain = count; 4858 4859 p = drm_coredump_printer(&iter); 4860 4861 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 4862 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 4863 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 4864 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec); 4865 if (adev->reset_task_info.pid) 4866 drm_printf(&p, "process_name: %s PID: %d\n", 4867 adev->reset_task_info.process_name, 4868 adev->reset_task_info.pid); 4869 4870 if (adev->reset_vram_lost) 4871 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 4872 if (adev->num_regs) { 4873 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); 4874 4875 for (i = 0; i < adev->num_regs; i++) 4876 drm_printf(&p, "0x%08x: 0x%08x\n", 4877 adev->reset_dump_reg_list[i], 4878 adev->reset_dump_reg_value[i]); 4879 } 4880 4881 return count - iter.remain; 4882 } 4883 4884 static void amdgpu_devcoredump_free(void *data) 4885 { 4886 } 4887 4888 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) 4889 { 4890 struct drm_device *dev = adev_to_drm(adev); 4891 4892 ktime_get_ts64(&adev->reset_time); 4893 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL, 4894 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 4895 } 4896 #endif 4897 4898 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4899 struct amdgpu_reset_context *reset_context) 4900 { 4901 struct amdgpu_device *tmp_adev = NULL; 4902 bool need_full_reset, skip_hw_reset, vram_lost = false; 4903 int r = 0; 4904 bool gpu_reset_for_dev_remove = 0; 4905 4906 /* Try reset handler method first */ 4907 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4908 reset_list); 4909 amdgpu_reset_reg_dumps(tmp_adev); 4910 4911 reset_context->reset_device_list = device_list_handle; 4912 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4913 /* If reset handler not implemented, continue; otherwise return */ 4914 if (r == -ENOSYS) 4915 r = 0; 4916 else 4917 return r; 4918 4919 /* Reset handler not implemented, use the 
default method */ 4920 need_full_reset = 4921 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4922 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4923 4924 gpu_reset_for_dev_remove = 4925 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 4926 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4927 4928 /* 4929 * ASIC reset has to be done on all XGMI hive nodes ASAP 4930 * to allow proper links negotiation in FW (within 1 sec) 4931 */ 4932 if (!skip_hw_reset && need_full_reset) { 4933 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4934 /* For XGMI run all resets in parallel to speed up the process */ 4935 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4936 tmp_adev->gmc.xgmi.pending_reset = false; 4937 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4938 r = -EALREADY; 4939 } else 4940 r = amdgpu_asic_reset(tmp_adev); 4941 4942 if (r) { 4943 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4944 r, adev_to_drm(tmp_adev)->unique); 4945 break; 4946 } 4947 } 4948 4949 /* For XGMI wait for all resets to complete before proceed */ 4950 if (!r) { 4951 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4952 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4953 flush_work(&tmp_adev->xgmi_reset_work); 4954 r = tmp_adev->asic_reset_res; 4955 if (r) 4956 break; 4957 } 4958 } 4959 } 4960 } 4961 4962 if (!r && amdgpu_ras_intr_triggered()) { 4963 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4964 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && 4965 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 4966 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 4967 } 4968 4969 amdgpu_ras_intr_cleared(); 4970 } 4971 4972 /* Since the mode1 reset affects base ip blocks, the 4973 * phase1 ip blocks need to be resumed. Otherwise there 4974 * will be a BIOS signature error and the psp bootloader 4975 * can't load kdb on the next amdgpu install. 
4976 */ 4977 if (gpu_reset_for_dev_remove) { 4978 list_for_each_entry(tmp_adev, device_list_handle, reset_list) 4979 amdgpu_device_ip_resume_phase1(tmp_adev); 4980 4981 goto end; 4982 } 4983 4984 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4985 if (need_full_reset) { 4986 /* post card */ 4987 r = amdgpu_device_asic_init(tmp_adev); 4988 if (r) { 4989 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4990 } else { 4991 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4992 r = amdgpu_amdkfd_resume_iommu(tmp_adev); 4993 if (r) 4994 goto out; 4995 4996 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4997 if (r) 4998 goto out; 4999 5000 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5001 #ifdef CONFIG_DEV_COREDUMP 5002 tmp_adev->reset_vram_lost = vram_lost; 5003 memset(&tmp_adev->reset_task_info, 0, 5004 sizeof(tmp_adev->reset_task_info)); 5005 if (reset_context->job && reset_context->job->vm) 5006 tmp_adev->reset_task_info = 5007 reset_context->job->vm->task_info; 5008 amdgpu_reset_capture_coredumpm(tmp_adev); 5009 #endif 5010 if (vram_lost) { 5011 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5012 amdgpu_inc_vram_lost(tmp_adev); 5013 } 5014 5015 r = amdgpu_device_fw_loading(tmp_adev); 5016 if (r) 5017 return r; 5018 5019 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5020 if (r) 5021 goto out; 5022 5023 if (vram_lost) 5024 amdgpu_device_fill_reset_magic(tmp_adev); 5025 5026 /* 5027 * Add this ASIC as tracked as reset was already 5028 * complete successfully. 5029 */ 5030 amdgpu_register_gpu_instance(tmp_adev); 5031 5032 if (!reset_context->hive && 5033 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5034 amdgpu_xgmi_add_device(tmp_adev); 5035 5036 r = amdgpu_device_ip_late_init(tmp_adev); 5037 if (r) 5038 goto out; 5039 5040 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5041 5042 /* 5043 * The GPU enters bad state once faulty pages 5044 * by ECC has reached the threshold, and ras 5045 * recovery is scheduled next. So add one check 5046 * here to break recovery if it indeed exceeds 5047 * bad page threshold, and remind user to 5048 * retire this GPU or setting one bigger 5049 * bad_page_threshold value to fix this once 5050 * probing driver again. 5051 */ 5052 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5053 /* must succeed. 
*/ 5054 amdgpu_ras_resume(tmp_adev); 5055 } else { 5056 r = -EINVAL; 5057 goto out; 5058 } 5059 5060 /* Update PSP FW topology after reset */ 5061 if (reset_context->hive && 5062 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5063 r = amdgpu_xgmi_update_topology( 5064 reset_context->hive, tmp_adev); 5065 } 5066 } 5067 5068 out: 5069 if (!r) { 5070 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5071 r = amdgpu_ib_ring_tests(tmp_adev); 5072 if (r) { 5073 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5074 need_full_reset = true; 5075 r = -EAGAIN; 5076 goto end; 5077 } 5078 } 5079 5080 if (!r) 5081 r = amdgpu_device_recover_vram(tmp_adev); 5082 else 5083 tmp_adev->asic_reset_res = r; 5084 } 5085 5086 end: 5087 if (need_full_reset) 5088 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5089 else 5090 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5091 return r; 5092 } 5093 5094 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5095 { 5096 5097 switch (amdgpu_asic_reset_method(adev)) { 5098 case AMD_RESET_METHOD_MODE1: 5099 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5100 break; 5101 case AMD_RESET_METHOD_MODE2: 5102 adev->mp1_state = PP_MP1_STATE_RESET; 5103 break; 5104 default: 5105 adev->mp1_state = PP_MP1_STATE_NONE; 5106 break; 5107 } 5108 } 5109 5110 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5111 { 5112 amdgpu_vf_error_trans_all(adev); 5113 adev->mp1_state = PP_MP1_STATE_NONE; 5114 } 5115 5116 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5117 { 5118 struct pci_dev *p = NULL; 5119 5120 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5121 adev->pdev->bus->number, 1); 5122 if (p) { 5123 pm_runtime_enable(&(p->dev)); 5124 pm_runtime_resume(&(p->dev)); 5125 } 5126 5127 pci_dev_put(p); 5128 } 5129 5130 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5131 { 5132 enum amd_reset_method reset_method; 5133 struct pci_dev *p = NULL; 5134 u64 expires; 5135 5136 /* 5137 * For now, only BACO and mode1 reset are confirmed 5138 * to suffer the audio issue without proper suspended. 5139 */ 5140 reset_method = amdgpu_asic_reset_method(adev); 5141 if ((reset_method != AMD_RESET_METHOD_BACO) && 5142 (reset_method != AMD_RESET_METHOD_MODE1)) 5143 return -EINVAL; 5144 5145 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5146 adev->pdev->bus->number, 1); 5147 if (!p) 5148 return -ENODEV; 5149 5150 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5151 if (!expires) 5152 /* 5153 * If we cannot get the audio device autosuspend delay, 5154 * a fixed 4S interval will be used. Considering 3S is 5155 * the audio controller default autosuspend delay setting. 5156 * 4S used here is guaranteed to cover that. 5157 */ 5158 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5159 5160 while (!pm_runtime_status_suspended(&(p->dev))) { 5161 if (!pm_runtime_suspend(&(p->dev))) 5162 break; 5163 5164 if (expires < ktime_get_mono_fast_ns()) { 5165 dev_warn(adev->dev, "failed to suspend display audio\n"); 5166 pci_dev_put(p); 5167 /* TODO: abort the succeeding gpu reset? 
*/ 5168 return -ETIMEDOUT; 5169 } 5170 } 5171 5172 pm_runtime_disable(&(p->dev)); 5173 5174 pci_dev_put(p); 5175 return 0; 5176 } 5177 5178 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5179 { 5180 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5181 5182 #if defined(CONFIG_DEBUG_FS) 5183 if (!amdgpu_sriov_vf(adev)) 5184 cancel_work(&adev->reset_work); 5185 #endif 5186 5187 if (adev->kfd.dev) 5188 cancel_work(&adev->kfd.reset_work); 5189 5190 if (amdgpu_sriov_vf(adev)) 5191 cancel_work(&adev->virt.flr_work); 5192 5193 if (con && adev->ras_enabled) 5194 cancel_work(&con->recovery_work); 5195 5196 } 5197 5198 /** 5199 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5200 * 5201 * @adev: amdgpu_device pointer 5202 * @job: which job trigger hang 5203 * @reset_context: amdgpu reset context pointer 5204 * 5205 * Attempt to reset the GPU if it has hung (all asics). 5206 * Attempt to do soft-reset or full-reset and reinitialize Asic 5207 * Returns 0 for success or an error on failure. 5208 */ 5209 5210 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5211 struct amdgpu_job *job, 5212 struct amdgpu_reset_context *reset_context) 5213 { 5214 struct list_head device_list, *device_list_handle = NULL; 5215 bool job_signaled = false; 5216 struct amdgpu_hive_info *hive = NULL; 5217 struct amdgpu_device *tmp_adev = NULL; 5218 int i, r = 0; 5219 bool need_emergency_restart = false; 5220 bool audio_suspended = false; 5221 bool gpu_reset_for_dev_remove = false; 5222 5223 gpu_reset_for_dev_remove = 5224 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5225 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5226 5227 /* 5228 * Special case: RAS triggered and full reset isn't supported 5229 */ 5230 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5231 5232 /* 5233 * Flush RAM to disk so that after reboot 5234 * the user can read log and see why the system rebooted. 5235 */ 5236 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5237 DRM_WARN("Emergency reboot."); 5238 5239 ksys_sync_helper(); 5240 emergency_restart(); 5241 } 5242 5243 dev_info(adev->dev, "GPU %s begin!\n", 5244 need_emergency_restart ? "jobs stop":"reset"); 5245 5246 if (!amdgpu_sriov_vf(adev)) 5247 hive = amdgpu_get_xgmi_hive(adev); 5248 if (hive) 5249 mutex_lock(&hive->hive_lock); 5250 5251 reset_context->job = job; 5252 reset_context->hive = hive; 5253 /* 5254 * Build list of devices to reset. 5255 * In case we are in XGMI hive mode, resort the device list 5256 * to put adev in the 1st position. 
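 * All devices in the list are then reset under a single reset domain lock.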
5257 */ 5258 INIT_LIST_HEAD(&device_list); 5259 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5260 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5261 list_add_tail(&tmp_adev->reset_list, &device_list); 5262 if (gpu_reset_for_dev_remove && adev->shutdown) 5263 tmp_adev->shutdown = true; 5264 } 5265 if (!list_is_first(&adev->reset_list, &device_list)) 5266 list_rotate_to_front(&adev->reset_list, &device_list); 5267 device_list_handle = &device_list; 5268 } else { 5269 list_add_tail(&adev->reset_list, &device_list); 5270 device_list_handle = &device_list; 5271 } 5272 5273 /* We need to lock the reset domain only once, both for XGMI and single device */ 5274 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5275 reset_list); 5276 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5277 5278 /* block all schedulers and reset given job's ring */ 5279 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5280 5281 amdgpu_device_set_mp1_state(tmp_adev); 5282 5283 /* 5284 * Try to put the audio codec into the suspend state 5285 * before the gpu reset starts. 5286 * 5287 * The graphics device shares a power domain with the 5288 * AZ (audio) power domain. Without this, we may 5289 * change the audio hardware from behind the audio 5290 * driver's back and trigger some audio codec 5291 * errors. 5292 */ 5293 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5294 audio_suspended = true; 5295 5296 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5297 5298 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5299 5300 if (!amdgpu_sriov_vf(tmp_adev)) 5301 amdgpu_amdkfd_pre_reset(tmp_adev); 5302 5303 /* 5304 * Mark these ASICs to be reset as untracked first, 5305 * and add them back after the reset has completed. 5306 */ 5307 amdgpu_unregister_gpu_instance(tmp_adev); 5308 5309 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5310 5311 /* disable ras on ALL IPs */ 5312 if (!need_emergency_restart && 5313 amdgpu_device_ip_need_full_reset(tmp_adev)) 5314 amdgpu_ras_suspend(tmp_adev); 5315 5316 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5317 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5318 5319 if (!ring || !ring->sched.thread) 5320 continue; 5321 5322 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5323 5324 if (need_emergency_restart) 5325 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5326 } 5327 atomic_inc(&tmp_adev->gpu_reset_counter); 5328 } 5329 5330 if (need_emergency_restart) 5331 goto skip_sched_resume; 5332 5333 /* 5334 * Must check guilty signal here since after this point all old 5335 * HW fences are force signaled. 5336 * 5337 * job->base holds a reference to parent fence 5338 */ 5339 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5340 job_signaled = true; 5341 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5342 goto skip_hw_reset; 5343 } 5344 5345 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5346 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5347 if (gpu_reset_for_dev_remove) { 5348 /* Workaround for ASICs that need to disable SMC first */ 5349 amdgpu_device_smu_fini_early(tmp_adev); 5350 } 5351 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5352 /* TODO: should we stop here on error? */ 5353 if (r) { 5354 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5355 r, adev_to_drm(tmp_adev)->unique); 5356 tmp_adev->asic_reset_res = r; 5357 } 5358 5359 /* 5360 * Drop all pending non scheduler resets.
Scheduler resets 5361 * were already dropped during drm_sched_stop 5362 */ 5363 amdgpu_device_stop_pending_resets(tmp_adev); 5364 } 5365 5366 /* Actual ASIC resets if needed. */ 5367 /* Host driver will handle XGMI hive reset for SRIOV */ 5368 if (amdgpu_sriov_vf(adev)) { 5369 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5370 if (r) 5371 adev->asic_reset_res = r; 5372 5373 /* Aldebaran and gfx_11_0_3 support RAS in SR-IOV, so RAS needs to be resumed during reset */ 5374 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) || 5375 adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3)) 5376 amdgpu_ras_resume(adev); 5377 } else { 5378 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5379 if (r && r == -EAGAIN) 5380 goto retry; 5381 5382 if (!r && gpu_reset_for_dev_remove) 5383 goto recover_end; 5384 } 5385 5386 skip_hw_reset: 5387 5388 /* Post ASIC reset for all devices. */ 5389 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5390 5391 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5392 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5393 5394 if (!ring || !ring->sched.thread) 5395 continue; 5396 5397 drm_sched_start(&ring->sched, true); 5398 } 5399 5400 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3)) 5401 amdgpu_mes_self_test(tmp_adev); 5402 5403 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) { 5404 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5405 } 5406 5407 if (tmp_adev->asic_reset_res) 5408 r = tmp_adev->asic_reset_res; 5409 5410 tmp_adev->asic_reset_res = 0; 5411 5412 if (r) { 5413 /* bad news, how do we tell userspace about this? */ 5414 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5415 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5416 } else { 5417 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5418 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5419 DRM_WARN("smart shift update failed\n"); 5420 } 5421 } 5422 5423 skip_sched_resume: 5424 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5425 /* unlock kfd: SRIOV would do it separately */ 5426 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5427 amdgpu_amdkfd_post_reset(tmp_adev); 5428 5429 /* kfd_post_reset will do nothing if the kfd device is not initialized, 5430 * so bring up kfd here if it was not initialized before 5431 */ 5432 if (!adev->kfd.init_complete) 5433 amdgpu_amdkfd_device_init(adev); 5434 5435 if (audio_suspended) 5436 amdgpu_device_resume_display_audio(tmp_adev); 5437 5438 amdgpu_device_unset_mp1_state(tmp_adev); 5439 5440 amdgpu_ras_set_error_query_ready(tmp_adev, true); 5441 } 5442 5443 recover_end: 5444 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5445 reset_list); 5446 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5447 5448 if (hive) { 5449 mutex_unlock(&hive->hive_lock); 5450 amdgpu_put_xgmi_hive(hive); 5451 } 5452 5453 if (r) 5454 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5455 5456 atomic_set(&adev->reset_domain->reset_res, r); 5457 return r; 5458 } 5459 5460 /** 5461 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot 5462 * 5463 * @adev: amdgpu_device pointer 5464 * 5465 * Fetches and stores in the driver the PCIE capabilities (gen speed 5466 * and lanes) of the slot the device is in. Handles APUs and 5467 * virtualized environments where PCIE config space may not be available.
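 * The amdgpu_pcie_gen_cap and amdgpu_pcie_lane_cap module parameters override the detected values when set.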
5468 */ 5469 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5470 { 5471 struct pci_dev *pdev; 5472 enum pci_bus_speed speed_cap, platform_speed_cap; 5473 enum pcie_link_width platform_link_width; 5474 5475 if (amdgpu_pcie_gen_cap) 5476 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5477 5478 if (amdgpu_pcie_lane_cap) 5479 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5480 5481 /* covers APUs as well */ 5482 if (pci_is_root_bus(adev->pdev->bus)) { 5483 if (adev->pm.pcie_gen_mask == 0) 5484 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5485 if (adev->pm.pcie_mlw_mask == 0) 5486 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5487 return; 5488 } 5489 5490 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5491 return; 5492 5493 pcie_bandwidth_available(adev->pdev, NULL, 5494 &platform_speed_cap, &platform_link_width); 5495 5496 if (adev->pm.pcie_gen_mask == 0) { 5497 /* asic caps */ 5498 pdev = adev->pdev; 5499 speed_cap = pcie_get_speed_cap(pdev); 5500 if (speed_cap == PCI_SPEED_UNKNOWN) { 5501 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5502 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5503 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5504 } else { 5505 if (speed_cap == PCIE_SPEED_32_0GT) 5506 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5507 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5508 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5509 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5510 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5511 else if (speed_cap == PCIE_SPEED_16_0GT) 5512 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5513 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5514 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5515 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5516 else if (speed_cap == PCIE_SPEED_8_0GT) 5517 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5518 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5519 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5520 else if (speed_cap == PCIE_SPEED_5_0GT) 5521 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5522 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5523 else 5524 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5525 } 5526 /* platform caps */ 5527 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5528 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5529 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5530 } else { 5531 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5532 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5533 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5534 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5535 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5536 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5537 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5538 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5539 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5540 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5541 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5542 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5543 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5544 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5545 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5546 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5547 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5548 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5549 else 5550 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5551 5552 } 5553 } 5554 if (adev->pm.pcie_mlw_mask == 0) { 5555 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 5556 adev->pm.pcie_mlw_mask 
|= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5557 } else { 5558 switch (platform_link_width) { 5559 case PCIE_LNK_X32: 5560 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5561 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5562 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5563 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5564 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5565 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5566 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5567 break; 5568 case PCIE_LNK_X16: 5569 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5570 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5571 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5572 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5573 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5574 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5575 break; 5576 case PCIE_LNK_X12: 5577 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5578 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5579 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5580 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5581 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5582 break; 5583 case PCIE_LNK_X8: 5584 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5585 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5586 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5587 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5588 break; 5589 case PCIE_LNK_X4: 5590 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5591 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5592 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5593 break; 5594 case PCIE_LNK_X2: 5595 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5596 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5597 break; 5598 case PCIE_LNK_X1: 5599 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5600 break; 5601 default: 5602 break; 5603 } 5604 } 5605 } 5606 } 5607 5608 /** 5609 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 5610 * 5611 * @adev: amdgpu_device pointer 5612 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 5613 * 5614 * Return true if @peer_adev can access (DMA) @adev through the PCIe 5615 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 5616 * @peer_adev. 5617 */ 5618 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 5619 struct amdgpu_device *peer_adev) 5620 { 5621 #ifdef CONFIG_HSA_AMD_P2P 5622 uint64_t address_mask = peer_adev->dev->dma_mask ? 

/**
 * amdgpu_device_baco_enter - enter BACO (Bus Active, Chip Off)
 * @dev: drm_device pointer
 *
 * Disable the doorbell interrupt if RAS is enabled and ask the SMU to put
 * the ASIC into BACO. Returns 0 on success or a negative error code.
 */
int amdgpu_device_baco_enter(struct drm_device *dev)
{
    struct amdgpu_device *adev = drm_to_adev(dev);
    struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

    if (!amdgpu_device_supports_baco(dev))
        return -ENOTSUPP;

    if (ras && adev->ras_enabled &&
        adev->nbio.funcs->enable_doorbell_interrupt)
        adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

    return amdgpu_dpm_baco_enter(adev);
}

/**
 * amdgpu_device_baco_exit - leave BACO (Bus Active, Chip Off)
 * @dev: drm_device pointer
 *
 * Bring the ASIC back out of BACO and re-enable the doorbell interrupt.
 * Returns 0 on success or a negative error code.
 */
int amdgpu_device_baco_exit(struct drm_device *dev)
{
    struct amdgpu_device *adev = drm_to_adev(dev);
    struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
    int ret = 0;

    if (!amdgpu_device_supports_baco(dev))
        return -ENOTSUPP;

    ret = amdgpu_dpm_baco_exit(adev);
    if (ret)
        return ret;

    if (ras && adev->ras_enabled &&
        adev->nbio.funcs->enable_doorbell_interrupt)
        adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

    if (amdgpu_passthrough(adev) &&
        adev->nbio.funcs->clear_doorbell_interrupt)
        adev->nbio.funcs->clear_doorbell_interrupt(adev);

    return 0;
}
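
/*
 * Illustrative sketch, not part of the driver: BACO entry and exit are
 * expected to be used symmetrically, e.g. around a runtime-suspend window.
 * The helper name below is hypothetical and error handling is reduced to
 * the minimum.
 */
static int __maybe_unused amdgpu_example_baco_cycle(struct drm_device *dev)
{
    int r;

    r = amdgpu_device_baco_enter(dev);
    if (r)
        return r;

    /* ... the ASIC is now in BACO (bus active, chip off) ... */

    return amdgpu_device_baco_exit(dev);
}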

/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called when a PCI error is detected.
 *
 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
    struct drm_device *dev = pci_get_drvdata(pdev);
    struct amdgpu_device *adev = drm_to_adev(dev);
    int i;

    DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

    if (adev->gmc.xgmi.num_physical_nodes > 1) {
        DRM_WARN("No support for XGMI hive yet...");
        return PCI_ERS_RESULT_DISCONNECT;
    }

    adev->pci_channel_state = state;

    switch (state) {
    case pci_channel_io_normal:
        return PCI_ERS_RESULT_CAN_RECOVER;
    /* Fatal error, prepare for slot reset */
    case pci_channel_io_frozen:
        /*
         * Locking adev->reset_domain->sem will prevent any external access
         * to GPU during PCI error recovery
         */
        amdgpu_device_lock_reset_domain(adev->reset_domain);
        amdgpu_device_set_mp1_state(adev);

        /*
         * Block any work scheduling as we do for regular GPU reset
         * for the duration of the recovery
         */
        for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
            struct amdgpu_ring *ring = adev->rings[i];

            if (!ring || !ring->sched.thread)
                continue;

            drm_sched_stop(&ring->sched, NULL);
        }
        atomic_inc(&adev->gpu_reset_counter);
        return PCI_ERS_RESULT_NEED_RESET;
    case pci_channel_io_perm_failure:
        /* Permanent error, prepare for device removal */
        return PCI_ERS_RESULT_DISCONNECT;
    }

    return PCI_ERS_RESULT_NEED_RESET;
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{
    DRM_INFO("PCI error: mmio enabled callback!!\n");

    /* TODO - dump whatever for debugging purposes */

    /* This is called only if amdgpu_pci_error_detected returns
     * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
     * works, no need to reset slot.
     */

    return PCI_ERS_RESULT_RECOVERED;
}
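
/*
 * Illustrative sketch, not part of this file: the AER callbacks in this
 * file are hooked into the PCI core through a struct pci_error_handlers
 * attached to the pci_driver (this wiring lives in amdgpu_drv.c); assuming
 * the prototypes from amdgpu.h are visible, it looks roughly like this.
 * The variable name below is hypothetical.
 */
static const struct pci_error_handlers amdgpu_example_pci_err_handler = {
    .error_detected = amdgpu_pci_error_detected,
    .mmio_enabled   = amdgpu_pci_mmio_enabled,
    .slot_reset     = amdgpu_pci_slot_reset,
    .resume         = amdgpu_pci_resume,
};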

/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the pci error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
    struct drm_device *dev = pci_get_drvdata(pdev);
    struct amdgpu_device *adev = drm_to_adev(dev);
    int r, i;
    struct amdgpu_reset_context reset_context;
    u32 memsize;
    struct list_head device_list;

    DRM_INFO("PCI error: slot reset callback!!\n");

    memset(&reset_context, 0, sizeof(reset_context));

    INIT_LIST_HEAD(&device_list);
    list_add_tail(&adev->reset_list, &device_list);

    /* wait for asic to come out of reset */
    msleep(500);

    /* Restore PCI config space */
    amdgpu_device_load_pci_state(pdev);

    /* confirm ASIC came out of reset */
    for (i = 0; i < adev->usec_timeout; i++) {
        memsize = amdgpu_asic_get_config_memsize(adev);

        if (memsize != 0xffffffff)
            break;
        udelay(1);
    }
    if (memsize == 0xffffffff) {
        r = -ETIME;
        goto out;
    }

    reset_context.method = AMD_RESET_METHOD_NONE;
    reset_context.reset_req_dev = adev;
    set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
    set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

    adev->no_hw_access = true;
    r = amdgpu_device_pre_asic_reset(adev, &reset_context);
    adev->no_hw_access = false;
    if (r)
        goto out;

    r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
    if (!r) {
        if (amdgpu_device_cache_pci_state(adev->pdev))
            pci_restore_state(adev->pdev);

        DRM_INFO("PCIe error recovery succeeded\n");
    } else {
        DRM_ERROR("PCIe error recovery failed, err:%d\n", r);
        amdgpu_device_unset_mp1_state(adev);
        amdgpu_device_unlock_reset_domain(adev->reset_domain);
    }

    return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's OK to resume
 * normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
    struct drm_device *dev = pci_get_drvdata(pdev);
    struct amdgpu_device *adev = drm_to_adev(dev);
    int i;

    DRM_INFO("PCI error: resume callback!!\n");

    /* Only continue execution for the case of pci_channel_io_frozen */
    if (adev->pci_channel_state != pci_channel_io_frozen)
        return;

    for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
        struct amdgpu_ring *ring = adev->rings[i];

        if (!ring || !ring->sched.thread)
            continue;

        drm_sched_start(&ring->sched, true);
    }

    amdgpu_device_unset_mp1_state(adev);
    amdgpu_device_unlock_reset_domain(adev->reset_domain);
}

/**
 * amdgpu_device_cache_pci_state - cache the PCI configuration space
 * @pdev: PCI device struct
 *
 * Save the device's PCI configuration space and stash it in
 * adev->pci_state so it can be restored after a reset.
 * Returns true on success, false otherwise.
 */
bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
    struct drm_device *dev = pci_get_drvdata(pdev);
    struct amdgpu_device *adev = drm_to_adev(dev);
    int r;

    r = pci_save_state(pdev);
    if (!r) {
        kfree(adev->pci_state);

        adev->pci_state = pci_store_saved_state(pdev);

        if (!adev->pci_state) {
            DRM_ERROR("Failed to store PCI saved state");
            return false;
        }
    } else {
        DRM_WARN("Failed to save PCI state, err:%d\n", r);
        return false;
    }

    return true;
}

/**
 * amdgpu_device_load_pci_state - restore the cached PCI configuration space
 * @pdev: PCI device struct
 *
 * Load the configuration space previously cached by
 * amdgpu_device_cache_pci_state() and write it back to the device.
 * Returns true on success, false otherwise.
 */
bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
    struct drm_device *dev = pci_get_drvdata(pdev);
    struct amdgpu_device *adev = drm_to_adev(dev);
    int r;

    if (!adev->pci_state)
        return false;

    r = pci_load_saved_state(pdev, adev->pci_state);

    if (!r) {
        pci_restore_state(pdev);
    } else {
        DRM_WARN("Failed to load PCI state, err:%d\n", r);
        return false;
    }

    return true;
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
                             struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
    if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
        return;
#endif
    if (adev->gmc.xgmi.connected_to_cpu)
        return;

    if (ring && ring->funcs->emit_hdp_flush)
        amdgpu_ring_emit_hdp_flush(ring);
    else
        amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
                                  struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
    if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
        return;
#endif
    if (adev->gmc.xgmi.connected_to_cpu)
        return;

    amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
    return atomic_read(&adev->reset_domain->in_gpu_reset);
}
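
/*
 * Illustrative sketch, not part of the driver: the usual pairing of the
 * HDP helpers around CPU access to VRAM through the BAR. The HDP write
 * cache is flushed after the CPU writes so the GPU observes the data, and
 * invalidated before the CPU reads data the GPU has written. The helper
 * name below is hypothetical; the writel()/readl() calls are placeholders.
 */
static void __maybe_unused amdgpu_example_hdp_pairing(struct amdgpu_device *adev)
{
    /* CPU -> VRAM: write through the BAR, then flush the HDP cache */
    /* writel(value, vram_cpu_addr); */
    amdgpu_device_flush_hdp(adev, NULL);

    /* VRAM -> CPU: invalidate the HDP cache, then read through the BAR */
    amdgpu_device_invalidate_hdp(adev, NULL);
    /* value = readl(vram_cpu_addr); */
}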

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It helps to maintain the error context when an error occurs.
 * Compared to a simple hang, the system will stay stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc),
 *    clears all CPU mappings to the device, disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in-flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
    struct pci_dev *pdev = adev->pdev;
    struct drm_device *ddev = adev_to_drm(adev);

    drm_dev_unplug(ddev);

    amdgpu_irq_disable_all(adev);

    amdgpu_fence_driver_hw_fini(adev);

    adev->no_hw_access = true;

    amdgpu_device_unmap_mmio(adev);

    pci_disable_device(pdev);
    pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
                                 u32 reg)
{
    unsigned long flags, address, data;
    u32 r;

    address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
    data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

    spin_lock_irqsave(&adev->pcie_idx_lock, flags);
    WREG32(address, reg * 4);
    (void)RREG32(address);
    r = RREG32(data);
    spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
    return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
                                  u32 reg, u32 v)
{
    unsigned long flags, address, data;

    address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
    data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

    spin_lock_irqsave(&adev->pcie_idx_lock, flags);
    WREG32(address, reg * 4);
    (void)RREG32(address);
    WREG32(data, v);
    (void)RREG32(data);
    spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
                                            struct dma_fence *gang)
{
    struct dma_fence *old = NULL;

    do {
        dma_fence_put(old);
        rcu_read_lock();
        old = dma_fence_get_rcu_safe(&adev->gang_submit);
        rcu_read_unlock();

        if (old == gang)
            break;

        if (!dma_fence_is_signaled(old))
            return old;

    } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
                     old, gang) != old);

    dma_fence_put(old);
    return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
    switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
    case CHIP_HAINAN:
#endif
    case CHIP_TOPAZ:
        /* chips with no display hardware */
        return false;
#ifdef CONFIG_DRM_AMDGPU_SI
    case CHIP_TAHITI:
    case CHIP_PITCAIRN:
    case CHIP_VERDE:
    case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
    case CHIP_BONAIRE:
    case CHIP_HAWAII:
    case CHIP_KAVERI:
    case CHIP_KABINI:
    case CHIP_MULLINS:
#endif
    case CHIP_TONGA:
    case CHIP_FIJI:
    case CHIP_POLARIS10:
    case CHIP_POLARIS11:
    case CHIP_POLARIS12:
    case CHIP_VEGAM:
    case CHIP_CARRIZO:
    case CHIP_STONEY:
        /* chips with display hardware */
        return true;
    default:
        /* IP discovery */
        if (!adev->ip_versions[DCE_HWIP][0] ||
            (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
            return false;
        return true;
    }
}
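
/*
 * Illustrative sketch, not part of the driver: a submission that wants to
 * switch gangs has to wait for the previous gang leader returned by
 * amdgpu_device_switch_gang() before it can run; real callers add the
 * returned fence as a scheduler dependency rather than blocking. The
 * helper name below is hypothetical.
 */
static int __maybe_unused amdgpu_example_switch_gang_sync(struct amdgpu_device *adev,
                                                          struct dma_fence *gang)
{
    struct dma_fence *old = amdgpu_device_switch_gang(adev, gang);

    if (!old)
        return 0;   /* switched to the new gang */

    /* previous gang leader still active: wait, drop the reference, retry */
    dma_fence_wait(old, false);
    dma_fence_put(old);
    return -EAGAIN;
}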