/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/devcoredump.h>
#include <generated/utsrelease.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_aperture.h>
#include <drm/drm_atomic_helper.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_fb_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86)
#include <asm/intel-family.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
"KABINI", 111 "HAWAII", 112 "MULLINS", 113 "TOPAZ", 114 "TONGA", 115 "FIJI", 116 "CARRIZO", 117 "STONEY", 118 "POLARIS10", 119 "POLARIS11", 120 "POLARIS12", 121 "VEGAM", 122 "VEGA10", 123 "VEGA12", 124 "VEGA20", 125 "RAVEN", 126 "ARCTURUS", 127 "RENOIR", 128 "ALDEBARAN", 129 "NAVI10", 130 "CYAN_SKILLFISH", 131 "NAVI14", 132 "NAVI12", 133 "SIENNA_CICHLID", 134 "NAVY_FLOUNDER", 135 "VANGOGH", 136 "DIMGREY_CAVEFISH", 137 "BEIGE_GOBY", 138 "YELLOW_CARP", 139 "IP DISCOVERY", 140 "LAST", 141 }; 142 143 /** 144 * DOC: pcie_replay_count 145 * 146 * The amdgpu driver provides a sysfs API for reporting the total number 147 * of PCIe replays (NAKs) 148 * The file pcie_replay_count is used for this and returns the total 149 * number of replays as a sum of the NAKs generated and NAKs received 150 */ 151 152 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 153 struct device_attribute *attr, char *buf) 154 { 155 struct drm_device *ddev = dev_get_drvdata(dev); 156 struct amdgpu_device *adev = drm_to_adev(ddev); 157 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 158 159 return sysfs_emit(buf, "%llu\n", cnt); 160 } 161 162 static DEVICE_ATTR(pcie_replay_count, 0444, 163 amdgpu_device_get_pcie_replay_count, NULL); 164 165 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 166 167 168 /** 169 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 170 * 171 * @dev: drm_device pointer 172 * 173 * Returns true if the device is a dGPU with ATPX power control, 174 * otherwise return false. 175 */ 176 bool amdgpu_device_supports_px(struct drm_device *dev) 177 { 178 struct amdgpu_device *adev = drm_to_adev(dev); 179 180 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 181 return true; 182 return false; 183 } 184 185 /** 186 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 187 * 188 * @dev: drm_device pointer 189 * 190 * Returns true if the device is a dGPU with ACPI power control, 191 * otherwise return false. 192 */ 193 bool amdgpu_device_supports_boco(struct drm_device *dev) 194 { 195 struct amdgpu_device *adev = drm_to_adev(dev); 196 197 if (adev->has_pr3 || 198 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 199 return true; 200 return false; 201 } 202 203 /** 204 * amdgpu_device_supports_baco - Does the device support BACO 205 * 206 * @dev: drm_device pointer 207 * 208 * Returns true if the device supporte BACO, 209 * otherwise return false. 210 */ 211 bool amdgpu_device_supports_baco(struct drm_device *dev) 212 { 213 struct amdgpu_device *adev = drm_to_adev(dev); 214 215 return amdgpu_asic_supports_baco(adev); 216 } 217 218 /** 219 * amdgpu_device_supports_smart_shift - Is the device dGPU with 220 * smart shift support 221 * 222 * @dev: drm_device pointer 223 * 224 * Returns true if the device is a dGPU with Smart Shift support, 225 * otherwise returns false. 
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write to a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * This function is invoked only for the debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if (adev->nbio.funcs->get_pcie_index_hi_offset)
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if (adev->nbio.funcs->get_pcie_index_hi_offset)
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	int ret;

	amdgpu_asic_pre_asic_init(adev);

	if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) ||
	    adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND/OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or an -error on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned int i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return false;

	return true;
}

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or if post is needed because a hw reset was performed.
 * Returns true if post is needed, false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (!amdgpu_device_read_bios(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old smc fw still needs the driver to do vPost, otherwise the gpu hangs.
		 * smc fw versions above 22.15 don't have this flaw, so we force
		 * vpost to be executed for smc versions below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->gmc.xgmi.pending_reset)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic
 * speed switching. Until we have confirmation from Intel that a specific host
 * supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
bool amdgpu_device_pcie_dynamic_switching_supported(void)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	return pcie_aspm_enabled(adev->pdev);
}

bool amdgpu_device_aspm_support_quirk(void)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE);
#else
	return true;
#endif
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
		bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in the page table versus the
 * page directory; a page is 4KB so we have 12 bits of offset, a minimum of
 * 9 bits in the page table, and the remaining bits in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory
	 */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if ((adev->pdev->device == 0x13FE) ||
		    (adev->pdev->device == 0x143F))
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		break;
	}

	return 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater than or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater than or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
		dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
		amdgpu_reset_method = -1;
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	return 0;
}

/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(pdev, PCI_D0);
		amdgpu_device_load_pci_state(pdev);
		r = pci_enable_device(pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
	} else {
		pr_info("switched off\n");
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(pdev);
		/* Shut down the device */
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u64 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if equal or greater
 * return 1 if smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	switch (ip_block_version->type) {
	case AMD_IP_BLOCK_TYPE_VCN:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
			return 0;
		break;
	case AMD_IP_BLOCK_TYPE_JPEG:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
			return 0;
		break;
	default:
		break;
	}

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		const char *pci_address_name = pci_name(adev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}

void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
		adev->mode_info.num_crtc = 1;
		adev->enable_virtual_display = true;
		DRM_INFO("virtual_display:%d, num_crtc:%d\n",
			 adev->enable_virtual_display, adev->mode_info.num_crtc);
	}
}

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	char fw_name[40];
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	if (adev->mman.discovery_bin) {
		/*
		 * FIXME: The bounding box is still needed by Navi12, so
		 * temporarily read it from gpu_info firmware. Should be dropped
		 * when DAL no longer needs it.
1903 */ 1904 if (adev->asic_type != CHIP_NAVI12) 1905 return 0; 1906 } 1907 1908 switch (adev->asic_type) { 1909 default: 1910 return 0; 1911 case CHIP_VEGA10: 1912 chip_name = "vega10"; 1913 break; 1914 case CHIP_VEGA12: 1915 chip_name = "vega12"; 1916 break; 1917 case CHIP_RAVEN: 1918 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1919 chip_name = "raven2"; 1920 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1921 chip_name = "picasso"; 1922 else 1923 chip_name = "raven"; 1924 break; 1925 case CHIP_ARCTURUS: 1926 chip_name = "arcturus"; 1927 break; 1928 case CHIP_NAVI12: 1929 chip_name = "navi12"; 1930 break; 1931 } 1932 1933 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 1934 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name); 1935 if (err) { 1936 dev_err(adev->dev, 1937 "Failed to get gpu_info firmware \"%s\"\n", 1938 fw_name); 1939 goto out; 1940 } 1941 1942 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 1943 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 1944 1945 switch (hdr->version_major) { 1946 case 1: 1947 { 1948 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 1949 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 1950 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1951 1952 /* 1953 * Should be droped when DAL no longer needs it. 1954 */ 1955 if (adev->asic_type == CHIP_NAVI12) 1956 goto parse_soc_bounding_box; 1957 1958 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 1959 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 1960 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 1961 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 1962 adev->gfx.config.max_texture_channel_caches = 1963 le32_to_cpu(gpu_info_fw->gc_num_tccs); 1964 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 1965 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 1966 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 1967 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 1968 adev->gfx.config.double_offchip_lds_buf = 1969 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 1970 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 1971 adev->gfx.cu_info.max_waves_per_simd = 1972 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 1973 adev->gfx.cu_info.max_scratch_slots_per_cu = 1974 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 1975 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 1976 if (hdr->version_minor >= 1) { 1977 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 1978 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 1979 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1980 adev->gfx.config.num_sc_per_sh = 1981 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 1982 adev->gfx.config.num_packer_per_sc = 1983 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 1984 } 1985 1986 parse_soc_bounding_box: 1987 /* 1988 * soc bounding box info is not integrated in disocovery table, 1989 * we always need to parse it from gpu info firmware if needed. 
1990 */ 1991 if (hdr->version_minor == 2) { 1992 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 1993 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 1994 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1995 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 1996 } 1997 break; 1998 } 1999 default: 2000 dev_err(adev->dev, 2001 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2002 err = -EINVAL; 2003 goto out; 2004 } 2005 out: 2006 return err; 2007 } 2008 2009 /** 2010 * amdgpu_device_ip_early_init - run early init for hardware IPs 2011 * 2012 * @adev: amdgpu_device pointer 2013 * 2014 * Early initialization pass for hardware IPs. The hardware IPs that make 2015 * up each asic are discovered each IP's early_init callback is run. This 2016 * is the first stage in initializing the asic. 2017 * Returns 0 on success, negative error code on failure. 2018 */ 2019 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2020 { 2021 struct drm_device *dev = adev_to_drm(adev); 2022 struct pci_dev *parent; 2023 int i, r; 2024 bool total; 2025 2026 amdgpu_device_enable_virtual_display(adev); 2027 2028 if (amdgpu_sriov_vf(adev)) { 2029 r = amdgpu_virt_request_full_gpu(adev, true); 2030 if (r) 2031 return r; 2032 } 2033 2034 switch (adev->asic_type) { 2035 #ifdef CONFIG_DRM_AMDGPU_SI 2036 case CHIP_VERDE: 2037 case CHIP_TAHITI: 2038 case CHIP_PITCAIRN: 2039 case CHIP_OLAND: 2040 case CHIP_HAINAN: 2041 adev->family = AMDGPU_FAMILY_SI; 2042 r = si_set_ip_blocks(adev); 2043 if (r) 2044 return r; 2045 break; 2046 #endif 2047 #ifdef CONFIG_DRM_AMDGPU_CIK 2048 case CHIP_BONAIRE: 2049 case CHIP_HAWAII: 2050 case CHIP_KAVERI: 2051 case CHIP_KABINI: 2052 case CHIP_MULLINS: 2053 if (adev->flags & AMD_IS_APU) 2054 adev->family = AMDGPU_FAMILY_KV; 2055 else 2056 adev->family = AMDGPU_FAMILY_CI; 2057 2058 r = cik_set_ip_blocks(adev); 2059 if (r) 2060 return r; 2061 break; 2062 #endif 2063 case CHIP_TOPAZ: 2064 case CHIP_TONGA: 2065 case CHIP_FIJI: 2066 case CHIP_POLARIS10: 2067 case CHIP_POLARIS11: 2068 case CHIP_POLARIS12: 2069 case CHIP_VEGAM: 2070 case CHIP_CARRIZO: 2071 case CHIP_STONEY: 2072 if (adev->flags & AMD_IS_APU) 2073 adev->family = AMDGPU_FAMILY_CZ; 2074 else 2075 adev->family = AMDGPU_FAMILY_VI; 2076 2077 r = vi_set_ip_blocks(adev); 2078 if (r) 2079 return r; 2080 break; 2081 default: 2082 r = amdgpu_discovery_set_ip_blocks(adev); 2083 if (r) 2084 return r; 2085 break; 2086 } 2087 2088 if (amdgpu_has_atpx() && 2089 (amdgpu_is_atpx_hybrid() || 2090 amdgpu_has_atpx_dgpu_power_cntl()) && 2091 ((adev->flags & AMD_IS_APU) == 0) && 2092 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev))) 2093 adev->flags |= AMD_IS_PX; 2094 2095 if (!(adev->flags & AMD_IS_APU)) { 2096 parent = pcie_find_root_port(adev->pdev); 2097 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2098 } 2099 2100 2101 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2102 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2103 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2104 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2105 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2106 if (!amdgpu_device_pcie_dynamic_switching_supported()) 2107 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2108 2109 total = true; 2110 for (i = 0; i < adev->num_ip_blocks; i++) { 2111 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2112 DRM_WARN("disabled ip block: %d <%s>\n", 2113 i, adev->ip_blocks[i].version->funcs->name); 2114 adev->ip_blocks[i].status.valid = false; 2115 } else { 2116 if (adev->ip_blocks[i].version->funcs->early_init) { 2117 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2118 if (r == -ENOENT) { 2119 adev->ip_blocks[i].status.valid = false; 2120 } else if (r) { 2121 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2122 adev->ip_blocks[i].version->funcs->name, r); 2123 total = false; 2124 } else { 2125 adev->ip_blocks[i].status.valid = true; 2126 } 2127 } else { 2128 adev->ip_blocks[i].status.valid = true; 2129 } 2130 } 2131 /* get the vbios after the asic_funcs are set up */ 2132 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2133 r = amdgpu_device_parse_gpu_info_fw(adev); 2134 if (r) 2135 return r; 2136 2137 /* Read BIOS */ 2138 if (amdgpu_device_read_bios(adev)) { 2139 if (!amdgpu_get_bios(adev)) 2140 return -EINVAL; 2141 2142 r = amdgpu_atombios_init(adev); 2143 if (r) { 2144 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2145 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2146 return r; 2147 } 2148 } 2149 2150 /*get pf2vf msg info at it's earliest time*/ 2151 if (amdgpu_sriov_vf(adev)) 2152 amdgpu_virt_init_data_exchange(adev); 2153 2154 } 2155 } 2156 if (!total) 2157 return -ENODEV; 2158 2159 amdgpu_amdkfd_device_probe(adev); 2160 adev->cg_flags &= amdgpu_cg_mask; 2161 adev->pg_flags &= amdgpu_pg_mask; 2162 2163 return 0; 2164 } 2165 2166 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2167 { 2168 int i, r; 2169 2170 for (i = 0; i < adev->num_ip_blocks; i++) { 2171 if (!adev->ip_blocks[i].status.sw) 2172 continue; 2173 if (adev->ip_blocks[i].status.hw) 2174 continue; 2175 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2176 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2177 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2178 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2179 if (r) { 2180 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2181 adev->ip_blocks[i].version->funcs->name, r); 2182 return r; 2183 } 2184 adev->ip_blocks[i].status.hw = true; 2185 } 2186 } 2187 2188 return 0; 2189 } 2190 2191 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2192 { 2193 int i, r; 2194 2195 for (i = 0; i < adev->num_ip_blocks; i++) { 2196 if (!adev->ip_blocks[i].status.sw) 2197 continue; 2198 if (adev->ip_blocks[i].status.hw) 2199 continue; 2200 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2201 if (r) { 2202 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2203 adev->ip_blocks[i].version->funcs->name, r); 2204 return r; 2205 } 2206 adev->ip_blocks[i].status.hw = true; 2207 } 2208 2209 return 0; 2210 } 2211 2212 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2213 { 2214 int r = 0; 2215 int i; 2216 uint32_t 
smu_version; 2217 2218 if (adev->asic_type >= CHIP_VEGA10) { 2219 for (i = 0; i < adev->num_ip_blocks; i++) { 2220 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2221 continue; 2222 2223 if (!adev->ip_blocks[i].status.sw) 2224 continue; 2225 2226 /* no need to do the fw loading again if already done*/ 2227 if (adev->ip_blocks[i].status.hw == true) 2228 break; 2229 2230 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2231 r = adev->ip_blocks[i].version->funcs->resume(adev); 2232 if (r) { 2233 DRM_ERROR("resume of IP block <%s> failed %d\n", 2234 adev->ip_blocks[i].version->funcs->name, r); 2235 return r; 2236 } 2237 } else { 2238 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2239 if (r) { 2240 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2241 adev->ip_blocks[i].version->funcs->name, r); 2242 return r; 2243 } 2244 } 2245 2246 adev->ip_blocks[i].status.hw = true; 2247 break; 2248 } 2249 } 2250 2251 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2252 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2253 2254 return r; 2255 } 2256 2257 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2258 { 2259 long timeout; 2260 int r, i; 2261 2262 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2263 struct amdgpu_ring *ring = adev->rings[i]; 2264 2265 /* No need to setup the GPU scheduler for rings that don't need it */ 2266 if (!ring || ring->no_scheduler) 2267 continue; 2268 2269 switch (ring->funcs->type) { 2270 case AMDGPU_RING_TYPE_GFX: 2271 timeout = adev->gfx_timeout; 2272 break; 2273 case AMDGPU_RING_TYPE_COMPUTE: 2274 timeout = adev->compute_timeout; 2275 break; 2276 case AMDGPU_RING_TYPE_SDMA: 2277 timeout = adev->sdma_timeout; 2278 break; 2279 default: 2280 timeout = adev->video_timeout; 2281 break; 2282 } 2283 2284 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2285 ring->num_hw_submission, 0, 2286 timeout, adev->reset_domain->wq, 2287 ring->sched_score, ring->name, 2288 adev->dev); 2289 if (r) { 2290 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2291 ring->name); 2292 return r; 2293 } 2294 } 2295 2296 amdgpu_xcp_update_partition_sched_list(adev); 2297 2298 return 0; 2299 } 2300 2301 2302 /** 2303 * amdgpu_device_ip_init - run init for hardware IPs 2304 * 2305 * @adev: amdgpu_device pointer 2306 * 2307 * Main initialization pass for hardware IPs. The list of all the hardware 2308 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2309 * are run. sw_init initializes the software state associated with each IP 2310 * and hw_init initializes the hardware associated with each IP. 2311 * Returns 0 on success, negative error code on failure. 
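 *
 * As a rough sketch (a summary only; the function body below is
 * authoritative), the ordering is:
 *
 *   sw_init for every valid IP block, with early hw_init of COMMON and GMC
 *   (plus scratch/writeback/CSA setup)
 *   -> amdgpu_ib_pool_init() and amdgpu_ucode_create_bo()
 *   -> hw_init phase 1 (COMMON, IH, PSP on SR-IOV)
 *   -> firmware loading (PSP/SMU)
 *   -> hw_init phase 2 (remaining blocks)
 *   -> RAS recovery init, scheduler init, KFD init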
2312 */ 2313 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2314 { 2315 int i, r; 2316 2317 r = amdgpu_ras_init(adev); 2318 if (r) 2319 return r; 2320 2321 for (i = 0; i < adev->num_ip_blocks; i++) { 2322 if (!adev->ip_blocks[i].status.valid) 2323 continue; 2324 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2325 if (r) { 2326 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2327 adev->ip_blocks[i].version->funcs->name, r); 2328 goto init_failed; 2329 } 2330 adev->ip_blocks[i].status.sw = true; 2331 2332 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2333 /* need to do common hw init early so everything is set up for gmc */ 2334 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2335 if (r) { 2336 DRM_ERROR("hw_init %d failed %d\n", i, r); 2337 goto init_failed; 2338 } 2339 adev->ip_blocks[i].status.hw = true; 2340 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2341 /* need to do gmc hw init early so we can allocate gpu mem */ 2342 /* Try to reserve bad pages early */ 2343 if (amdgpu_sriov_vf(adev)) 2344 amdgpu_virt_exchange_data(adev); 2345 2346 r = amdgpu_device_mem_scratch_init(adev); 2347 if (r) { 2348 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2349 goto init_failed; 2350 } 2351 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2352 if (r) { 2353 DRM_ERROR("hw_init %d failed %d\n", i, r); 2354 goto init_failed; 2355 } 2356 r = amdgpu_device_wb_init(adev); 2357 if (r) { 2358 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2359 goto init_failed; 2360 } 2361 adev->ip_blocks[i].status.hw = true; 2362 2363 /* right after GMC hw init, we create CSA */ 2364 if (adev->gfx.mcbp) { 2365 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2366 AMDGPU_GEM_DOMAIN_VRAM | 2367 AMDGPU_GEM_DOMAIN_GTT, 2368 AMDGPU_CSA_SIZE); 2369 if (r) { 2370 DRM_ERROR("allocate CSA failed %d\n", r); 2371 goto init_failed; 2372 } 2373 } 2374 } 2375 } 2376 2377 if (amdgpu_sriov_vf(adev)) 2378 amdgpu_virt_init_data_exchange(adev); 2379 2380 r = amdgpu_ib_pool_init(adev); 2381 if (r) { 2382 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2383 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2384 goto init_failed; 2385 } 2386 2387 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2388 if (r) 2389 goto init_failed; 2390 2391 r = amdgpu_device_ip_hw_init_phase1(adev); 2392 if (r) 2393 goto init_failed; 2394 2395 r = amdgpu_device_fw_loading(adev); 2396 if (r) 2397 goto init_failed; 2398 2399 r = amdgpu_device_ip_hw_init_phase2(adev); 2400 if (r) 2401 goto init_failed; 2402 2403 /* 2404 * retired pages will be loaded from eeprom and reserved here, 2405 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2406 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2407 * for I2C communication which only true at this point. 2408 * 2409 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2410 * failure from bad gpu situation and stop amdgpu init process 2411 * accordingly. For other failed cases, it will still release all 2412 * the resource and print error message, rather than returning one 2413 * negative value to upper level. 
2414 * 2415 * Note: theoretically, this should be called before all vram allocations 2416 * to protect retired page from abusing 2417 */ 2418 r = amdgpu_ras_recovery_init(adev); 2419 if (r) 2420 goto init_failed; 2421 2422 /** 2423 * In case of XGMI grab extra reference for reset domain for this device 2424 */ 2425 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2426 if (amdgpu_xgmi_add_device(adev) == 0) { 2427 if (!amdgpu_sriov_vf(adev)) { 2428 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2429 2430 if (WARN_ON(!hive)) { 2431 r = -ENOENT; 2432 goto init_failed; 2433 } 2434 2435 if (!hive->reset_domain || 2436 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2437 r = -ENOENT; 2438 amdgpu_put_xgmi_hive(hive); 2439 goto init_failed; 2440 } 2441 2442 /* Drop the early temporary reset domain we created for device */ 2443 amdgpu_reset_put_reset_domain(adev->reset_domain); 2444 adev->reset_domain = hive->reset_domain; 2445 amdgpu_put_xgmi_hive(hive); 2446 } 2447 } 2448 } 2449 2450 r = amdgpu_device_init_schedulers(adev); 2451 if (r) 2452 goto init_failed; 2453 2454 /* Don't init kfd if whole hive need to be reset during init */ 2455 if (!adev->gmc.xgmi.pending_reset) { 2456 kgd2kfd_init_zone_device(adev); 2457 amdgpu_amdkfd_device_init(adev); 2458 } 2459 2460 amdgpu_fru_get_product_info(adev); 2461 2462 init_failed: 2463 2464 return r; 2465 } 2466 2467 /** 2468 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2469 * 2470 * @adev: amdgpu_device pointer 2471 * 2472 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2473 * this function before a GPU reset. If the value is retained after a 2474 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 2475 */ 2476 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2477 { 2478 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2479 } 2480 2481 /** 2482 * amdgpu_device_check_vram_lost - check if vram is valid 2483 * 2484 * @adev: amdgpu_device pointer 2485 * 2486 * Checks the reset magic value written to the gart pointer in VRAM. 2487 * The driver calls this after a GPU reset to see if the contents of 2488 * VRAM is lost or now. 2489 * returns true if vram is lost, false if not. 2490 */ 2491 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2492 { 2493 if (memcmp(adev->gart.ptr, adev->reset_magic, 2494 AMDGPU_RESET_MAGIC_NUM)) 2495 return true; 2496 2497 if (!amdgpu_in_reset(adev)) 2498 return false; 2499 2500 /* 2501 * For all ASICs with baco/mode1 reset, the VRAM is 2502 * always assumed to be lost. 2503 */ 2504 switch (amdgpu_asic_reset_method(adev)) { 2505 case AMD_RESET_METHOD_BACO: 2506 case AMD_RESET_METHOD_MODE1: 2507 return true; 2508 default: 2509 return false; 2510 } 2511 } 2512 2513 /** 2514 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2515 * 2516 * @adev: amdgpu_device pointer 2517 * @state: clockgating state (gate or ungate) 2518 * 2519 * The list of all the hardware IPs that make up the asic is walked and the 2520 * set_clockgating_state callbacks are run. 2521 * Late initialization pass enabling clockgating for hardware IPs. 2522 * Fini or suspend, pass disabling clockgating for hardware IPs. 2523 * Returns 0 on success, negative error code on failure. 
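 *
 * Note that gating walks the IP list front to back while ungating walks it
 * in reverse, and that UVD/VCE/VCN/JPEG (handled separately) as well as
 * GFX/SDMA during S0ix are skipped. Typical callers, as seen in the late
 * init and early fini paths, look like:
 *
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);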
2524 */ 2525 2526 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2527 enum amd_clockgating_state state) 2528 { 2529 int i, j, r; 2530 2531 if (amdgpu_emu_mode == 1) 2532 return 0; 2533 2534 for (j = 0; j < adev->num_ip_blocks; j++) { 2535 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2536 if (!adev->ip_blocks[i].status.late_initialized) 2537 continue; 2538 /* skip CG for GFX, SDMA on S0ix */ 2539 if (adev->in_s0ix && 2540 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2541 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2542 continue; 2543 /* skip CG for VCE/UVD, it's handled specially */ 2544 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2545 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2546 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2547 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2548 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2549 /* enable clockgating to save power */ 2550 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2551 state); 2552 if (r) { 2553 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2554 adev->ip_blocks[i].version->funcs->name, r); 2555 return r; 2556 } 2557 } 2558 } 2559 2560 return 0; 2561 } 2562 2563 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2564 enum amd_powergating_state state) 2565 { 2566 int i, j, r; 2567 2568 if (amdgpu_emu_mode == 1) 2569 return 0; 2570 2571 for (j = 0; j < adev->num_ip_blocks; j++) { 2572 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2573 if (!adev->ip_blocks[i].status.late_initialized) 2574 continue; 2575 /* skip PG for GFX, SDMA on S0ix */ 2576 if (adev->in_s0ix && 2577 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2578 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2579 continue; 2580 /* skip CG for VCE/UVD, it's handled specially */ 2581 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2582 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2583 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2584 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2585 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2586 /* enable powergating to save power */ 2587 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2588 state); 2589 if (r) { 2590 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2591 adev->ip_blocks[i].version->funcs->name, r); 2592 return r; 2593 } 2594 } 2595 } 2596 return 0; 2597 } 2598 2599 static int amdgpu_device_enable_mgpu_fan_boost(void) 2600 { 2601 struct amdgpu_gpu_instance *gpu_ins; 2602 struct amdgpu_device *adev; 2603 int i, ret = 0; 2604 2605 mutex_lock(&mgpu_info.mutex); 2606 2607 /* 2608 * MGPU fan boost feature should be enabled 2609 * only when there are two or more dGPUs in 2610 * the system 2611 */ 2612 if (mgpu_info.num_dgpu < 2) 2613 goto out; 2614 2615 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2616 gpu_ins = &(mgpu_info.gpu_ins[i]); 2617 adev = gpu_ins->adev; 2618 if (!(adev->flags & AMD_IS_APU) && 2619 !gpu_ins->mgpu_fan_enabled) { 2620 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2621 if (ret) 2622 break; 2623 2624 gpu_ins->mgpu_fan_enabled = 1; 2625 } 2626 } 2627 2628 out: 2629 mutex_unlock(&mgpu_info.mutex); 2630 2631 return ret; 2632 } 2633 2634 /** 2635 * amdgpu_device_ip_late_init - run late init for hardware IPs 2636 * 2637 * @adev: 
amdgpu_device pointer 2638 * 2639 * Late initialization pass for hardware IPs. The list of all the hardware 2640 * IPs that make up the asic is walked and the late_init callbacks are run. 2641 * late_init covers any special initialization that an IP requires 2642 * after all of the IPs have been initialized or something that needs to happen 2643 * late in the init process. 2644 * Returns 0 on success, negative error code on failure. 2645 */ 2646 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2647 { 2648 struct amdgpu_gpu_instance *gpu_instance; 2649 int i = 0, r; 2650 2651 for (i = 0; i < adev->num_ip_blocks; i++) { 2652 if (!adev->ip_blocks[i].status.hw) 2653 continue; 2654 if (adev->ip_blocks[i].version->funcs->late_init) { 2655 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2656 if (r) { 2657 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2658 adev->ip_blocks[i].version->funcs->name, r); 2659 return r; 2660 } 2661 } 2662 adev->ip_blocks[i].status.late_initialized = true; 2663 } 2664 2665 r = amdgpu_ras_late_init(adev); 2666 if (r) { 2667 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 2668 return r; 2669 } 2670 2671 amdgpu_ras_set_error_query_ready(adev, true); 2672 2673 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2674 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2675 2676 amdgpu_device_fill_reset_magic(adev); 2677 2678 r = amdgpu_device_enable_mgpu_fan_boost(); 2679 if (r) 2680 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2681 2682 /* For passthrough configurations on arcturus and aldebaran, enable special handling of SBR */ 2683 if (amdgpu_passthrough(adev) && 2684 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 2685 adev->asic_type == CHIP_ALDEBARAN)) 2686 amdgpu_dpm_handle_passthrough_sbr(adev, true); 2687 2688 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2689 mutex_lock(&mgpu_info.mutex); 2690 2691 /* 2692 * Reset the device p-state to low, as it was booted with high. 2693 * 2694 * This should be performed only after all devices from the same 2695 * hive have been initialized. 2696 * 2697 * However, the number of devices in a hive is not known in advance; 2698 * it is counted one by one as the devices initialize. 2699 * 2700 * So we wait until all XGMI interlinked devices are initialized. 2701 * This may bring some delay, as those devices may come from 2702 * different hives. But that should be OK.
2703 */ 2704 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2705 for (i = 0; i < mgpu_info.num_gpu; i++) { 2706 gpu_instance = &(mgpu_info.gpu_ins[i]); 2707 if (gpu_instance->adev->flags & AMD_IS_APU) 2708 continue; 2709 2710 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2711 AMDGPU_XGMI_PSTATE_MIN); 2712 if (r) { 2713 DRM_ERROR("pstate setting failed (%d).\n", r); 2714 break; 2715 } 2716 } 2717 } 2718 2719 mutex_unlock(&mgpu_info.mutex); 2720 } 2721 2722 return 0; 2723 } 2724 2725 /** 2726 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2727 * 2728 * @adev: amdgpu_device pointer 2729 * 2730 * For ASICs need to disable SMC first 2731 */ 2732 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2733 { 2734 int i, r; 2735 2736 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) 2737 return; 2738 2739 for (i = 0; i < adev->num_ip_blocks; i++) { 2740 if (!adev->ip_blocks[i].status.hw) 2741 continue; 2742 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2743 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2744 /* XXX handle errors */ 2745 if (r) { 2746 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2747 adev->ip_blocks[i].version->funcs->name, r); 2748 } 2749 adev->ip_blocks[i].status.hw = false; 2750 break; 2751 } 2752 } 2753 } 2754 2755 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2756 { 2757 int i, r; 2758 2759 for (i = 0; i < adev->num_ip_blocks; i++) { 2760 if (!adev->ip_blocks[i].version->funcs->early_fini) 2761 continue; 2762 2763 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2764 if (r) { 2765 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2766 adev->ip_blocks[i].version->funcs->name, r); 2767 } 2768 } 2769 2770 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2771 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2772 2773 amdgpu_amdkfd_suspend(adev, false); 2774 2775 /* Workaroud for ASICs need to disable SMC first */ 2776 amdgpu_device_smu_fini_early(adev); 2777 2778 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2779 if (!adev->ip_blocks[i].status.hw) 2780 continue; 2781 2782 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2783 /* XXX handle errors */ 2784 if (r) { 2785 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2786 adev->ip_blocks[i].version->funcs->name, r); 2787 } 2788 2789 adev->ip_blocks[i].status.hw = false; 2790 } 2791 2792 if (amdgpu_sriov_vf(adev)) { 2793 if (amdgpu_virt_release_full_gpu(adev, false)) 2794 DRM_ERROR("failed to release exclusive mode on fini\n"); 2795 } 2796 2797 return 0; 2798 } 2799 2800 /** 2801 * amdgpu_device_ip_fini - run fini for hardware IPs 2802 * 2803 * @adev: amdgpu_device pointer 2804 * 2805 * Main teardown pass for hardware IPs. The list of all the hardware 2806 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2807 * are run. hw_fini tears down the hardware associated with each IP 2808 * and sw_fini tears down any software state associated with each IP. 2809 * Returns 0 on success, negative error code on failure. 
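 *
 * Teardown runs in reverse order of initialization; the buffers tied to the
 * GMC block (ucode BO, static CSA, writeback, scratch memory and the IB
 * pool) are released just before the GMC sw_fini runs, mirroring their
 * allocation in amdgpu_device_ip_init().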
2810 */ 2811 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2812 { 2813 int i, r; 2814 2815 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2816 amdgpu_virt_release_ras_err_handler_data(adev); 2817 2818 if (adev->gmc.xgmi.num_physical_nodes > 1) 2819 amdgpu_xgmi_remove_device(adev); 2820 2821 amdgpu_amdkfd_device_fini_sw(adev); 2822 2823 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2824 if (!adev->ip_blocks[i].status.sw) 2825 continue; 2826 2827 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2828 amdgpu_ucode_free_bo(adev); 2829 amdgpu_free_static_csa(&adev->virt.csa_obj); 2830 amdgpu_device_wb_fini(adev); 2831 amdgpu_device_mem_scratch_fini(adev); 2832 amdgpu_ib_pool_fini(adev); 2833 } 2834 2835 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2836 /* XXX handle errors */ 2837 if (r) { 2838 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2839 adev->ip_blocks[i].version->funcs->name, r); 2840 } 2841 adev->ip_blocks[i].status.sw = false; 2842 adev->ip_blocks[i].status.valid = false; 2843 } 2844 2845 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2846 if (!adev->ip_blocks[i].status.late_initialized) 2847 continue; 2848 if (adev->ip_blocks[i].version->funcs->late_fini) 2849 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2850 adev->ip_blocks[i].status.late_initialized = false; 2851 } 2852 2853 amdgpu_ras_fini(adev); 2854 2855 return 0; 2856 } 2857 2858 /** 2859 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2860 * 2861 * @work: work_struct. 2862 */ 2863 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2864 { 2865 struct amdgpu_device *adev = 2866 container_of(work, struct amdgpu_device, delayed_init_work.work); 2867 int r; 2868 2869 r = amdgpu_ib_ring_tests(adev); 2870 if (r) 2871 DRM_ERROR("ib ring test failed (%d).\n", r); 2872 } 2873 2874 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2875 { 2876 struct amdgpu_device *adev = 2877 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2878 2879 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2880 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2881 2882 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2883 adev->gfx.gfx_off_state = true; 2884 } 2885 2886 /** 2887 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2888 * 2889 * @adev: amdgpu_device pointer 2890 * 2891 * Main suspend function for hardware IPs. The list of all the hardware 2892 * IPs that make up the asic is walked, clockgating is disabled and the 2893 * suspend callbacks are run. suspend puts the hardware and software state 2894 * in each IP into a state suitable for suspend. 2895 * Returns 0 on success, negative error code on failure. 2896 */ 2897 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2898 { 2899 int i, r; 2900 2901 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2902 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2903 2904 /* 2905 * Per PMFW team's suggestion, driver needs to handle gfxoff 2906 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 2907 * scenario. Add the missing df cstate disablement here. 
2908 */ 2909 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 2910 dev_warn(adev->dev, "Failed to disallow df cstate"); 2911 2912 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2913 if (!adev->ip_blocks[i].status.valid) 2914 continue; 2915 2916 /* displays are handled separately */ 2917 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2918 continue; 2919 2920 /* XXX handle errors */ 2921 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2922 /* XXX handle errors */ 2923 if (r) { 2924 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2925 adev->ip_blocks[i].version->funcs->name, r); 2926 return r; 2927 } 2928 2929 adev->ip_blocks[i].status.hw = false; 2930 } 2931 2932 return 0; 2933 } 2934 2935 /** 2936 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2937 * 2938 * @adev: amdgpu_device pointer 2939 * 2940 * Main suspend function for hardware IPs. The list of all the hardware 2941 * IPs that make up the asic is walked, clockgating is disabled and the 2942 * suspend callbacks are run. suspend puts the hardware and software state 2943 * in each IP into a state suitable for suspend. 2944 * Returns 0 on success, negative error code on failure. 2945 */ 2946 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2947 { 2948 int i, r; 2949 2950 if (adev->in_s0ix) 2951 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 2952 2953 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2954 if (!adev->ip_blocks[i].status.valid) 2955 continue; 2956 /* displays are handled in phase1 */ 2957 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2958 continue; 2959 /* PSP lost connection when err_event_athub occurs */ 2960 if (amdgpu_ras_intr_triggered() && 2961 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2962 adev->ip_blocks[i].status.hw = false; 2963 continue; 2964 } 2965 2966 /* skip unnecessary suspend if we do not initialize them yet */ 2967 if (adev->gmc.xgmi.pending_reset && 2968 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2969 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 2970 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2971 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 2972 adev->ip_blocks[i].status.hw = false; 2973 continue; 2974 } 2975 2976 /* skip suspend of gfx/mes and psp for S0ix 2977 * gfx is in gfxoff state, so on resume it will exit gfxoff just 2978 * like at runtime. PSP is also part of the always on hardware 2979 * so no need to suspend it. 2980 */ 2981 if (adev->in_s0ix && 2982 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 2983 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2984 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 2985 continue; 2986 2987 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 2988 if (adev->in_s0ix && 2989 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) && 2990 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2991 continue; 2992 2993 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 2994 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 2995 * from this location and RLC Autoload automatically also gets loaded 2996 * from here based on PMFW -> PSP message during re-init sequence. 2997 * Therefore, the psp suspend & resume should be skipped to avoid destroy 2998 * the TMR and reload FWs again for IMU enabled APU ASICs. 
2999 */ 3000 if (amdgpu_in_reset(adev) && 3001 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3002 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3003 continue; 3004 3005 /* XXX handle errors */ 3006 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3007 /* XXX handle errors */ 3008 if (r) { 3009 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3010 adev->ip_blocks[i].version->funcs->name, r); 3011 } 3012 adev->ip_blocks[i].status.hw = false; 3013 /* handle putting the SMC in the appropriate state */ 3014 if (!amdgpu_sriov_vf(adev)) { 3015 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3016 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3017 if (r) { 3018 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3019 adev->mp1_state, r); 3020 return r; 3021 } 3022 } 3023 } 3024 } 3025 3026 return 0; 3027 } 3028 3029 /** 3030 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3031 * 3032 * @adev: amdgpu_device pointer 3033 * 3034 * Main suspend function for hardware IPs. The list of all the hardware 3035 * IPs that make up the asic is walked, clockgating is disabled and the 3036 * suspend callbacks are run. suspend puts the hardware and software state 3037 * in each IP into a state suitable for suspend. 3038 * Returns 0 on success, negative error code on failure. 3039 */ 3040 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3041 { 3042 int r; 3043 3044 if (amdgpu_sriov_vf(adev)) { 3045 amdgpu_virt_fini_data_exchange(adev); 3046 amdgpu_virt_request_full_gpu(adev, false); 3047 } 3048 3049 r = amdgpu_device_ip_suspend_phase1(adev); 3050 if (r) 3051 return r; 3052 r = amdgpu_device_ip_suspend_phase2(adev); 3053 3054 if (amdgpu_sriov_vf(adev)) 3055 amdgpu_virt_release_full_gpu(adev, false); 3056 3057 return r; 3058 } 3059 3060 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3061 { 3062 int i, r; 3063 3064 static enum amd_ip_block_type ip_order[] = { 3065 AMD_IP_BLOCK_TYPE_COMMON, 3066 AMD_IP_BLOCK_TYPE_GMC, 3067 AMD_IP_BLOCK_TYPE_PSP, 3068 AMD_IP_BLOCK_TYPE_IH, 3069 }; 3070 3071 for (i = 0; i < adev->num_ip_blocks; i++) { 3072 int j; 3073 struct amdgpu_ip_block *block; 3074 3075 block = &adev->ip_blocks[i]; 3076 block->status.hw = false; 3077 3078 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3079 3080 if (block->version->type != ip_order[j] || 3081 !block->status.valid) 3082 continue; 3083 3084 r = block->version->funcs->hw_init(adev); 3085 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3086 if (r) 3087 return r; 3088 block->status.hw = true; 3089 } 3090 } 3091 3092 return 0; 3093 } 3094 3095 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3096 { 3097 int i, r; 3098 3099 static enum amd_ip_block_type ip_order[] = { 3100 AMD_IP_BLOCK_TYPE_SMC, 3101 AMD_IP_BLOCK_TYPE_DCE, 3102 AMD_IP_BLOCK_TYPE_GFX, 3103 AMD_IP_BLOCK_TYPE_SDMA, 3104 AMD_IP_BLOCK_TYPE_MES, 3105 AMD_IP_BLOCK_TYPE_UVD, 3106 AMD_IP_BLOCK_TYPE_VCE, 3107 AMD_IP_BLOCK_TYPE_VCN, 3108 AMD_IP_BLOCK_TYPE_JPEG 3109 }; 3110 3111 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3112 int j; 3113 struct amdgpu_ip_block *block; 3114 3115 for (j = 0; j < adev->num_ip_blocks; j++) { 3116 block = &adev->ip_blocks[j]; 3117 3118 if (block->version->type != ip_order[i] || 3119 !block->status.valid || 3120 block->status.hw) 3121 continue; 3122 3123 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3124 r = block->version->funcs->resume(adev); 3125 else 3126 r = block->version->funcs->hw_init(adev); 3127 3128 
DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3129 if (r) 3130 return r; 3131 block->status.hw = true; 3132 } 3133 } 3134 3135 return 0; 3136 } 3137 3138 /** 3139 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3140 * 3141 * @adev: amdgpu_device pointer 3142 * 3143 * First resume function for hardware IPs. The list of all the hardware 3144 * IPs that make up the asic is walked and the resume callbacks are run for 3145 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3146 * after a suspend and updates the software state as necessary. This 3147 * function is also used for restoring the GPU after a GPU reset. 3148 * Returns 0 on success, negative error code on failure. 3149 */ 3150 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3151 { 3152 int i, r; 3153 3154 for (i = 0; i < adev->num_ip_blocks; i++) { 3155 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3156 continue; 3157 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3158 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3159 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3160 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3161 3162 r = adev->ip_blocks[i].version->funcs->resume(adev); 3163 if (r) { 3164 DRM_ERROR("resume of IP block <%s> failed %d\n", 3165 adev->ip_blocks[i].version->funcs->name, r); 3166 return r; 3167 } 3168 adev->ip_blocks[i].status.hw = true; 3169 } 3170 } 3171 3172 return 0; 3173 } 3174 3175 /** 3176 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3177 * 3178 * @adev: amdgpu_device pointer 3179 * 3180 * First resume function for hardware IPs. The list of all the hardware 3181 * IPs that make up the asic is walked and the resume callbacks are run for 3182 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3183 * functional state after a suspend and updates the software state as 3184 * necessary. This function is also used for restoring the GPU after a GPU 3185 * reset. 3186 * Returns 0 on success, negative error code on failure. 3187 */ 3188 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3189 { 3190 int i, r; 3191 3192 for (i = 0; i < adev->num_ip_blocks; i++) { 3193 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3194 continue; 3195 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3196 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3197 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3198 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3199 continue; 3200 r = adev->ip_blocks[i].version->funcs->resume(adev); 3201 if (r) { 3202 DRM_ERROR("resume of IP block <%s> failed %d\n", 3203 adev->ip_blocks[i].version->funcs->name, r); 3204 return r; 3205 } 3206 adev->ip_blocks[i].status.hw = true; 3207 } 3208 3209 return 0; 3210 } 3211 3212 /** 3213 * amdgpu_device_ip_resume - run resume for hardware IPs 3214 * 3215 * @adev: amdgpu_device pointer 3216 * 3217 * Main resume function for hardware IPs. The hardware IPs 3218 * are split into two resume functions because they are 3219 * also used in recovering from a GPU reset and some additional 3220 * steps need to be take between them. In this case (S3/S4) they are 3221 * run sequentially. 3222 * Returns 0 on success, negative error code on failure. 
3223 */ 3224 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3225 { 3226 int r; 3227 3228 r = amdgpu_device_ip_resume_phase1(adev); 3229 if (r) 3230 return r; 3231 3232 r = amdgpu_device_fw_loading(adev); 3233 if (r) 3234 return r; 3235 3236 r = amdgpu_device_ip_resume_phase2(adev); 3237 3238 return r; 3239 } 3240 3241 /** 3242 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3243 * 3244 * @adev: amdgpu_device pointer 3245 * 3246 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3247 */ 3248 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3249 { 3250 if (amdgpu_sriov_vf(adev)) { 3251 if (adev->is_atom_fw) { 3252 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3253 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3254 } else { 3255 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3256 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3257 } 3258 3259 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3260 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3261 } 3262 } 3263 3264 /** 3265 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3266 * 3267 * @asic_type: AMD asic type 3268 * 3269 * Check if there is DC (new modesetting infrastructre) support for an asic. 3270 * returns true if DC has support, false if not. 3271 */ 3272 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3273 { 3274 switch (asic_type) { 3275 #ifdef CONFIG_DRM_AMDGPU_SI 3276 case CHIP_HAINAN: 3277 #endif 3278 case CHIP_TOPAZ: 3279 /* chips with no display hardware */ 3280 return false; 3281 #if defined(CONFIG_DRM_AMD_DC) 3282 case CHIP_TAHITI: 3283 case CHIP_PITCAIRN: 3284 case CHIP_VERDE: 3285 case CHIP_OLAND: 3286 /* 3287 * We have systems in the wild with these ASICs that require 3288 * LVDS and VGA support which is not supported with DC. 3289 * 3290 * Fallback to the non-DC driver here by default so as not to 3291 * cause regressions. 3292 */ 3293 #if defined(CONFIG_DRM_AMD_DC_SI) 3294 return amdgpu_dc > 0; 3295 #else 3296 return false; 3297 #endif 3298 case CHIP_BONAIRE: 3299 case CHIP_KAVERI: 3300 case CHIP_KABINI: 3301 case CHIP_MULLINS: 3302 /* 3303 * We have systems in the wild with these ASICs that require 3304 * VGA support which is not supported with DC. 3305 * 3306 * Fallback to the non-DC driver here by default so as not to 3307 * cause regressions. 
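 *
 * Passing amdgpu.dc=1 explicitly still opts these ASICs into DC, as the
 * check below shows.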
3308 */ 3309 return amdgpu_dc > 0; 3310 default: 3311 return amdgpu_dc != 0; 3312 #else 3313 default: 3314 if (amdgpu_dc > 0) 3315 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3316 return false; 3317 #endif 3318 } 3319 } 3320 3321 /** 3322 * amdgpu_device_has_dc_support - check if dc is supported 3323 * 3324 * @adev: amdgpu_device pointer 3325 * 3326 * Returns true for supported, false for not supported 3327 */ 3328 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3329 { 3330 if (adev->enable_virtual_display || 3331 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3332 return false; 3333 3334 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3335 } 3336 3337 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3338 { 3339 struct amdgpu_device *adev = 3340 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3341 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3342 3343 /* It's a bug to not have a hive within this function */ 3344 if (WARN_ON(!hive)) 3345 return; 3346 3347 /* 3348 * Use task barrier to synchronize all xgmi reset works across the 3349 * hive. task_barrier_enter and task_barrier_exit will block 3350 * until all the threads running the xgmi reset works reach 3351 * those points. task_barrier_full will do both blocks. 3352 */ 3353 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3354 3355 task_barrier_enter(&hive->tb); 3356 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3357 3358 if (adev->asic_reset_res) 3359 goto fail; 3360 3361 task_barrier_exit(&hive->tb); 3362 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3363 3364 if (adev->asic_reset_res) 3365 goto fail; 3366 3367 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3368 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3369 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3370 } else { 3371 3372 task_barrier_full(&hive->tb); 3373 adev->asic_reset_res = amdgpu_asic_reset(adev); 3374 } 3375 3376 fail: 3377 if (adev->asic_reset_res) 3378 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3379 adev->asic_reset_res, adev_to_drm(adev)->unique); 3380 amdgpu_put_xgmi_hive(hive); 3381 } 3382 3383 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3384 { 3385 char *input = amdgpu_lockup_timeout; 3386 char *timeout_setting = NULL; 3387 int index = 0; 3388 long timeout; 3389 int ret = 0; 3390 3391 /* 3392 * By default timeout for non compute jobs is 10000 3393 * and 60000 for compute jobs. 3394 * In SR-IOV or passthrough mode, timeout for compute 3395 * jobs are 60000 by default. 3396 */ 3397 adev->gfx_timeout = msecs_to_jiffies(10000); 3398 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3399 if (amdgpu_sriov_vf(adev)) 3400 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3401 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3402 else 3403 adev->compute_timeout = msecs_to_jiffies(60000); 3404 3405 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3406 while ((timeout_setting = strsep(&input, ",")) && 3407 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3408 ret = kstrtol(timeout_setting, 0, &timeout); 3409 if (ret) 3410 return ret; 3411 3412 if (timeout == 0) { 3413 index++; 3414 continue; 3415 } else if (timeout < 0) { 3416 timeout = MAX_SCHEDULE_TIMEOUT; 3417 dev_warn(adev->dev, "lockup timeout disabled"); 3418 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3419 } else { 3420 timeout = msecs_to_jiffies(timeout); 3421 } 3422 3423 switch (index++) { 3424 case 0: 3425 adev->gfx_timeout = timeout; 3426 break; 3427 case 1: 3428 adev->compute_timeout = timeout; 3429 break; 3430 case 2: 3431 adev->sdma_timeout = timeout; 3432 break; 3433 case 3: 3434 adev->video_timeout = timeout; 3435 break; 3436 default: 3437 break; 3438 } 3439 } 3440 /* 3441 * There is only one value specified and 3442 * it should apply to all non-compute jobs. 3443 */ 3444 if (index == 1) { 3445 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3446 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3447 adev->compute_timeout = adev->gfx_timeout; 3448 } 3449 } 3450 3451 return ret; 3452 } 3453 3454 /** 3455 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3456 * 3457 * @adev: amdgpu_device pointer 3458 * 3459 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3460 */ 3461 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3462 { 3463 struct iommu_domain *domain; 3464 3465 domain = iommu_get_domain_for_dev(adev->dev); 3466 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3467 adev->ram_is_direct_mapped = true; 3468 } 3469 3470 static const struct attribute *amdgpu_dev_attributes[] = { 3471 &dev_attr_pcie_replay_count.attr, 3472 NULL 3473 }; 3474 3475 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 3476 { 3477 if (amdgpu_mcbp == 1) 3478 adev->gfx.mcbp = true; 3479 else if (amdgpu_mcbp == 0) 3480 adev->gfx.mcbp = false; 3481 else if ((adev->ip_versions[GC_HWIP][0] >= IP_VERSION(9, 0, 0)) && 3482 (adev->ip_versions[GC_HWIP][0] < IP_VERSION(10, 0, 0)) && 3483 adev->gfx.num_gfx_rings) 3484 adev->gfx.mcbp = true; 3485 3486 if (amdgpu_sriov_vf(adev)) 3487 adev->gfx.mcbp = true; 3488 3489 if (adev->gfx.mcbp) 3490 DRM_INFO("MCBP is enabled\n"); 3491 } 3492 3493 /** 3494 * amdgpu_device_init - initialize the driver 3495 * 3496 * @adev: amdgpu_device pointer 3497 * @flags: driver flags 3498 * 3499 * Initializes the driver info and hw (all asics). 3500 * Returns 0 for success or an error on failure. 3501 * Called at driver startup. 
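 *
 * At a high level (a summary of the body below, not an exhaustive list),
 * the flow is:
 *
 *   early IP init -> optional ASIC reset on init -> vBIOS post and atombios
 *   init -> fence driver and mode config init -> amdgpu_device_ip_init()
 *   -> sysfs/pm registration -> amdgpu_device_ip_late_init()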
3502 */ 3503 int amdgpu_device_init(struct amdgpu_device *adev, 3504 uint32_t flags) 3505 { 3506 struct drm_device *ddev = adev_to_drm(adev); 3507 struct pci_dev *pdev = adev->pdev; 3508 int r, i; 3509 bool px = false; 3510 u32 max_MBps; 3511 int tmp; 3512 3513 adev->shutdown = false; 3514 adev->flags = flags; 3515 3516 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3517 adev->asic_type = amdgpu_force_asic_type; 3518 else 3519 adev->asic_type = flags & AMD_ASIC_MASK; 3520 3521 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3522 if (amdgpu_emu_mode == 1) 3523 adev->usec_timeout *= 10; 3524 adev->gmc.gart_size = 512 * 1024 * 1024; 3525 adev->accel_working = false; 3526 adev->num_rings = 0; 3527 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3528 adev->mman.buffer_funcs = NULL; 3529 adev->mman.buffer_funcs_ring = NULL; 3530 adev->vm_manager.vm_pte_funcs = NULL; 3531 adev->vm_manager.vm_pte_num_scheds = 0; 3532 adev->gmc.gmc_funcs = NULL; 3533 adev->harvest_ip_mask = 0x0; 3534 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3535 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3536 3537 adev->smc_rreg = &amdgpu_invalid_rreg; 3538 adev->smc_wreg = &amdgpu_invalid_wreg; 3539 adev->pcie_rreg = &amdgpu_invalid_rreg; 3540 adev->pcie_wreg = &amdgpu_invalid_wreg; 3541 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 3542 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 3543 adev->pciep_rreg = &amdgpu_invalid_rreg; 3544 adev->pciep_wreg = &amdgpu_invalid_wreg; 3545 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3546 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3547 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3548 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3549 adev->didt_rreg = &amdgpu_invalid_rreg; 3550 adev->didt_wreg = &amdgpu_invalid_wreg; 3551 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3552 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3553 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3554 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3555 3556 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3557 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3558 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3559 3560 /* mutex initialization are all done here so we 3561 * can recall function without having locking issues 3562 */ 3563 mutex_init(&adev->firmware.mutex); 3564 mutex_init(&adev->pm.mutex); 3565 mutex_init(&adev->gfx.gpu_clock_mutex); 3566 mutex_init(&adev->srbm_mutex); 3567 mutex_init(&adev->gfx.pipe_reserve_mutex); 3568 mutex_init(&adev->gfx.gfx_off_mutex); 3569 mutex_init(&adev->gfx.partition_mutex); 3570 mutex_init(&adev->grbm_idx_mutex); 3571 mutex_init(&adev->mn_lock); 3572 mutex_init(&adev->virt.vf_errors.lock); 3573 hash_init(adev->mn_hash); 3574 mutex_init(&adev->psp.mutex); 3575 mutex_init(&adev->notifier_lock); 3576 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3577 mutex_init(&adev->benchmark_mutex); 3578 3579 amdgpu_device_init_apu_flags(adev); 3580 3581 r = amdgpu_device_check_arguments(adev); 3582 if (r) 3583 return r; 3584 3585 spin_lock_init(&adev->mmio_idx_lock); 3586 spin_lock_init(&adev->smc_idx_lock); 3587 spin_lock_init(&adev->pcie_idx_lock); 3588 spin_lock_init(&adev->uvd_ctx_idx_lock); 3589 spin_lock_init(&adev->didt_idx_lock); 3590 spin_lock_init(&adev->gc_cac_idx_lock); 3591 spin_lock_init(&adev->se_cac_idx_lock); 3592 spin_lock_init(&adev->audio_endpt_idx_lock); 3593 spin_lock_init(&adev->mm_stats.lock); 3594 3595 
INIT_LIST_HEAD(&adev->shadow_list); 3596 mutex_init(&adev->shadow_list_lock); 3597 3598 INIT_LIST_HEAD(&adev->reset_list); 3599 3600 INIT_LIST_HEAD(&adev->ras_list); 3601 3602 INIT_DELAYED_WORK(&adev->delayed_init_work, 3603 amdgpu_device_delayed_init_work_handler); 3604 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3605 amdgpu_device_delay_enable_gfx_off); 3606 3607 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3608 3609 adev->gfx.gfx_off_req_count = 1; 3610 adev->gfx.gfx_off_residency = 0; 3611 adev->gfx.gfx_off_entrycount = 0; 3612 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3613 3614 atomic_set(&adev->throttling_logging_enabled, 1); 3615 /* 3616 * If throttling continues, logging will be performed every minute 3617 * to avoid log flooding. "-1" is subtracted since the thermal 3618 * throttling interrupt comes every second. Thus, the total logging 3619 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 3620 * for the throttling interrupt) = 60 seconds. 3621 */ 3622 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3623 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3624 3625 /* Register mapping */ 3626 /* TODO: block userspace mapping of io registers */ 3627 if (adev->asic_type >= CHIP_BONAIRE) { 3628 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3629 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3630 } else { 3631 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3632 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3633 } 3634 3635 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 3636 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 3637 3638 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3639 if (!adev->rmmio) 3640 return -ENOMEM; 3641 3642 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3643 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); 3644 3645 /* 3646 * The reset domain needs to be present early, before the XGMI hive is 3647 * discovered (if any) and initialized, so that the reset semaphore and 3648 * in_gpu_reset flag can be used early during init and before calling RREG32.
3649 */ 3650 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3651 if (!adev->reset_domain) 3652 return -ENOMEM; 3653 3654 /* detect hw virtualization here */ 3655 amdgpu_detect_virtualization(adev); 3656 3657 amdgpu_device_get_pcie_info(adev); 3658 3659 r = amdgpu_device_get_job_timeout_settings(adev); 3660 if (r) { 3661 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3662 return r; 3663 } 3664 3665 /* early init functions */ 3666 r = amdgpu_device_ip_early_init(adev); 3667 if (r) 3668 return r; 3669 3670 amdgpu_device_set_mcbp(adev); 3671 3672 /* Get rid of things like offb */ 3673 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 3674 if (r) 3675 return r; 3676 3677 /* Enable TMZ based on IP_VERSION */ 3678 amdgpu_gmc_tmz_set(adev); 3679 3680 amdgpu_gmc_noretry_set(adev); 3681 /* Need to get xgmi info early to decide the reset behavior*/ 3682 if (adev->gmc.xgmi.supported) { 3683 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3684 if (r) 3685 return r; 3686 } 3687 3688 /* enable PCIE atomic ops */ 3689 if (amdgpu_sriov_vf(adev)) { 3690 if (adev->virt.fw_reserve.p_pf2vf) 3691 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3692 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3693 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3694 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a 3695 * internal path natively support atomics, set have_atomics_support to true. 3696 */ 3697 } else if ((adev->flags & AMD_IS_APU) && 3698 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) { 3699 adev->have_atomics_support = true; 3700 } else { 3701 adev->have_atomics_support = 3702 !pci_enable_atomic_ops_to_root(adev->pdev, 3703 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3704 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3705 } 3706 3707 if (!adev->have_atomics_support) 3708 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3709 3710 /* doorbell bar mapping and doorbell index init*/ 3711 amdgpu_doorbell_init(adev); 3712 3713 if (amdgpu_emu_mode == 1) { 3714 /* post the asic on emulation mode */ 3715 emu_soc_asic_init(adev); 3716 goto fence_driver_init; 3717 } 3718 3719 amdgpu_reset_init(adev); 3720 3721 /* detect if we are with an SRIOV vbios */ 3722 if (adev->bios) 3723 amdgpu_device_detect_sriov_bios(adev); 3724 3725 /* check if we need to reset the asic 3726 * E.g., driver was not cleanly unloaded previously, etc. 3727 */ 3728 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3729 if (adev->gmc.xgmi.num_physical_nodes) { 3730 dev_info(adev->dev, "Pending hive reset.\n"); 3731 adev->gmc.xgmi.pending_reset = true; 3732 /* Only need to init necessary block for SMU to handle the reset */ 3733 for (i = 0; i < adev->num_ip_blocks; i++) { 3734 if (!adev->ip_blocks[i].status.valid) 3735 continue; 3736 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3737 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3738 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3739 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3740 DRM_DEBUG("IP %s disabled for hw_init.\n", 3741 adev->ip_blocks[i].version->funcs->name); 3742 adev->ip_blocks[i].status.hw = true; 3743 } 3744 } 3745 } else { 3746 tmp = amdgpu_reset_method; 3747 /* It should do a default reset when loading or reloading the driver, 3748 * regardless of the module parameter reset_method. 
3749 */ 3750 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 3751 r = amdgpu_asic_reset(adev); 3752 amdgpu_reset_method = tmp; 3753 if (r) { 3754 dev_err(adev->dev, "asic reset on init failed\n"); 3755 goto failed; 3756 } 3757 } 3758 } 3759 3760 /* Post card if necessary */ 3761 if (amdgpu_device_need_post(adev)) { 3762 if (!adev->bios) { 3763 dev_err(adev->dev, "no vBIOS found\n"); 3764 r = -EINVAL; 3765 goto failed; 3766 } 3767 DRM_INFO("GPU posting now...\n"); 3768 r = amdgpu_device_asic_init(adev); 3769 if (r) { 3770 dev_err(adev->dev, "gpu post error!\n"); 3771 goto failed; 3772 } 3773 } 3774 3775 if (adev->bios) { 3776 if (adev->is_atom_fw) { 3777 /* Initialize clocks */ 3778 r = amdgpu_atomfirmware_get_clock_info(adev); 3779 if (r) { 3780 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3781 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3782 goto failed; 3783 } 3784 } else { 3785 /* Initialize clocks */ 3786 r = amdgpu_atombios_get_clock_info(adev); 3787 if (r) { 3788 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3789 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3790 goto failed; 3791 } 3792 /* init i2c buses */ 3793 if (!amdgpu_device_has_dc_support(adev)) 3794 amdgpu_atombios_i2c_init(adev); 3795 } 3796 } 3797 3798 fence_driver_init: 3799 /* Fence driver */ 3800 r = amdgpu_fence_driver_sw_init(adev); 3801 if (r) { 3802 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3803 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3804 goto failed; 3805 } 3806 3807 /* init the mode config */ 3808 drm_mode_config_init(adev_to_drm(adev)); 3809 3810 r = amdgpu_device_ip_init(adev); 3811 if (r) { 3812 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3813 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3814 goto release_ras_con; 3815 } 3816 3817 amdgpu_fence_driver_hw_init(adev); 3818 3819 dev_info(adev->dev, 3820 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3821 adev->gfx.config.max_shader_engines, 3822 adev->gfx.config.max_sh_per_se, 3823 adev->gfx.config.max_cu_per_sh, 3824 adev->gfx.cu_info.number); 3825 3826 adev->accel_working = true; 3827 3828 amdgpu_vm_check_compute_bug(adev); 3829 3830 /* Initialize the buffer migration limit. */ 3831 if (amdgpu_moverate >= 0) 3832 max_MBps = amdgpu_moverate; 3833 else 3834 max_MBps = 8; /* Allow 8 MB/s. */ 3835 /* Get a log2 for easy divisions. */ 3836 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3837 3838 r = amdgpu_atombios_sysfs_init(adev); 3839 if (r) 3840 drm_err(&adev->ddev, 3841 "registering atombios sysfs failed (%d).\n", r); 3842 3843 r = amdgpu_pm_sysfs_init(adev); 3844 if (r) 3845 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 3846 3847 r = amdgpu_ucode_sysfs_init(adev); 3848 if (r) { 3849 adev->ucode_sysfs_en = false; 3850 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3851 } else 3852 adev->ucode_sysfs_en = true; 3853 3854 /* 3855 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3856 * Otherwise the mgpu fan boost feature will be skipped due to the 3857 * gpu instance is counted less. 3858 */ 3859 amdgpu_register_gpu_instance(adev); 3860 3861 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3862 * explicit gating rather than handling it automatically. 
3863 */ 3864 if (!adev->gmc.xgmi.pending_reset) { 3865 r = amdgpu_device_ip_late_init(adev); 3866 if (r) { 3867 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3868 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3869 goto release_ras_con; 3870 } 3871 /* must succeed. */ 3872 amdgpu_ras_resume(adev); 3873 queue_delayed_work(system_wq, &adev->delayed_init_work, 3874 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3875 } 3876 3877 if (amdgpu_sriov_vf(adev)) { 3878 amdgpu_virt_release_full_gpu(adev, true); 3879 flush_delayed_work(&adev->delayed_init_work); 3880 } 3881 3882 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3883 if (r) 3884 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3885 3886 amdgpu_fru_sysfs_init(adev); 3887 3888 if (IS_ENABLED(CONFIG_PERF_EVENTS)) { 3889 r = amdgpu_pmu_init(adev); 3890 if (r) 3891 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3892 } 3893 /* Keep a stored copy of the PCI config space at hand for restore after a sudden PCI error */ 3894 if (amdgpu_device_cache_pci_state(adev->pdev)) 3895 pci_restore_state(pdev); 3896 3897 /* if we have > 1 VGA card, then disable the amdgpu VGA resources */ 3898 /* this will fail for cards that aren't VGA class devices, just 3899 * ignore it 3900 */ 3901 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3902 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 3903 3904 px = amdgpu_device_supports_px(ddev); 3905 3906 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 3907 apple_gmux_detect(NULL, NULL))) 3908 vga_switcheroo_register_client(adev->pdev, 3909 &amdgpu_switcheroo_ops, px); 3910 3911 if (px) 3912 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3913 3914 if (adev->gmc.xgmi.pending_reset) 3915 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3916 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3917 3918 amdgpu_device_check_iommu_direct_map(adev); 3919 3920 return 0; 3921 3922 release_ras_con: 3923 if (amdgpu_sriov_vf(adev)) 3924 amdgpu_virt_release_full_gpu(adev, true); 3925 3926 /* failed in exclusive mode due to timeout */ 3927 if (amdgpu_sriov_vf(adev) && 3928 !amdgpu_sriov_runtime(adev) && 3929 amdgpu_virt_mmio_blocked(adev) && 3930 !amdgpu_virt_wait_reset(adev)) { 3931 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3932 /* Don't send request since VF is inactive. */ 3933 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3934 adev->virt.ops = NULL; 3935 r = -EAGAIN; 3936 } 3937 amdgpu_release_ras_context(adev); 3938 3939 failed: 3940 amdgpu_vf_error_trans_all(adev); 3941 3942 return r; 3943 } 3944 3945 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 3946 { 3947 3948 /* Clear all CPU mappings pointing to this device */ 3949 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 3950 3951 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 3952 amdgpu_doorbell_fini(adev); 3953 3954 iounmap(adev->rmmio); 3955 adev->rmmio = NULL; 3956 if (adev->mman.aper_base_kaddr) 3957 iounmap(adev->mman.aper_base_kaddr); 3958 adev->mman.aper_base_kaddr = NULL; 3959 3960 /* Memory manager related */ 3961 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 3962 arch_phys_wc_del(adev->gmc.vram_mtrr); 3963 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 3964 } 3965 } 3966 3967 /** 3968 * amdgpu_device_fini_hw - tear down the driver 3969 * 3970 * @adev: amdgpu_device pointer 3971 * 3972 * Tear down the driver info (all asics). 3973 * Called at driver shutdown.
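 *
 * Teardown is split into a hardware and a software stage; an unload path is
 * expected to run, roughly (illustrative ordering only):
 *   amdgpu_device_fini_hw(adev); - quiesce interrupts, display and fences
 *   amdgpu_device_fini_sw(adev); - free the remaining software state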
3974 */ 3975 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 3976 { 3977 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3978 flush_delayed_work(&adev->delayed_init_work); 3979 adev->shutdown = true; 3980 3981 /* make sure IB test finished before entering exclusive mode 3982 * to avoid preemption on IB test 3983 */ 3984 if (amdgpu_sriov_vf(adev)) { 3985 amdgpu_virt_request_full_gpu(adev, false); 3986 amdgpu_virt_fini_data_exchange(adev); 3987 } 3988 3989 /* disable all interrupts */ 3990 amdgpu_irq_disable_all(adev); 3991 if (adev->mode_info.mode_config_initialized) { 3992 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 3993 drm_helper_force_disable_all(adev_to_drm(adev)); 3994 else 3995 drm_atomic_helper_shutdown(adev_to_drm(adev)); 3996 } 3997 amdgpu_fence_driver_hw_fini(adev); 3998 3999 if (adev->mman.initialized) 4000 drain_workqueue(adev->mman.bdev.wq); 4001 4002 if (adev->pm.sysfs_initialized) 4003 amdgpu_pm_sysfs_fini(adev); 4004 if (adev->ucode_sysfs_en) 4005 amdgpu_ucode_sysfs_fini(adev); 4006 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4007 amdgpu_fru_sysfs_fini(adev); 4008 4009 /* disable ras feature must before hw fini */ 4010 amdgpu_ras_pre_fini(adev); 4011 4012 amdgpu_device_ip_fini_early(adev); 4013 4014 amdgpu_irq_fini_hw(adev); 4015 4016 if (adev->mman.initialized) 4017 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4018 4019 amdgpu_gart_dummy_page_fini(adev); 4020 4021 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4022 amdgpu_device_unmap_mmio(adev); 4023 4024 } 4025 4026 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4027 { 4028 int idx; 4029 bool px; 4030 4031 amdgpu_fence_driver_sw_fini(adev); 4032 amdgpu_device_ip_fini(adev); 4033 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4034 adev->accel_working = false; 4035 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4036 4037 amdgpu_reset_fini(adev); 4038 4039 /* free i2c buses */ 4040 if (!amdgpu_device_has_dc_support(adev)) 4041 amdgpu_i2c_fini(adev); 4042 4043 if (amdgpu_emu_mode != 1) 4044 amdgpu_atombios_fini(adev); 4045 4046 kfree(adev->bios); 4047 adev->bios = NULL; 4048 4049 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4050 4051 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 4052 apple_gmux_detect(NULL, NULL))) 4053 vga_switcheroo_unregister_client(adev->pdev); 4054 4055 if (px) 4056 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4057 4058 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4059 vga_client_unregister(adev->pdev); 4060 4061 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4062 4063 iounmap(adev->rmmio); 4064 adev->rmmio = NULL; 4065 amdgpu_doorbell_fini(adev); 4066 drm_dev_exit(idx); 4067 } 4068 4069 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4070 amdgpu_pmu_fini(adev); 4071 if (adev->mman.discovery_bin) 4072 amdgpu_discovery_fini(adev); 4073 4074 amdgpu_reset_put_reset_domain(adev->reset_domain); 4075 adev->reset_domain = NULL; 4076 4077 kfree(adev->pci_state); 4078 4079 } 4080 4081 /** 4082 * amdgpu_device_evict_resources - evict device resources 4083 * @adev: amdgpu device object 4084 * 4085 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4086 * of the vram memory type. Mainly used for evicting device resources 4087 * at suspend time. 
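 *
 * Note: amdgpu_device_suspend() below calls this twice - once before the IP
 * blocks are suspended and once afterwards - and the eviction is skipped
 * entirely for APUs going into S3 or s2idle.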
4088 * 4089 */ 4090 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4091 { 4092 int ret; 4093 4094 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4095 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4096 return 0; 4097 4098 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4099 if (ret) 4100 DRM_WARN("evicting device resources failed\n"); 4101 return ret; 4102 } 4103 4104 /* 4105 * Suspend & resume. 4106 */ 4107 /** 4108 * amdgpu_device_suspend - initiate device suspend 4109 * 4110 * @dev: drm dev pointer 4111 * @fbcon : notify the fbdev of suspend 4112 * 4113 * Puts the hw in the suspend state (all asics). 4114 * Returns 0 for success or an error on failure. 4115 * Called at driver suspend. 4116 */ 4117 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4118 { 4119 struct amdgpu_device *adev = drm_to_adev(dev); 4120 int r = 0; 4121 4122 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4123 return 0; 4124 4125 adev->in_suspend = true; 4126 4127 /* Evict the majority of BOs before grabbing the full access */ 4128 r = amdgpu_device_evict_resources(adev); 4129 if (r) 4130 return r; 4131 4132 if (amdgpu_sriov_vf(adev)) { 4133 amdgpu_virt_fini_data_exchange(adev); 4134 r = amdgpu_virt_request_full_gpu(adev, false); 4135 if (r) 4136 return r; 4137 } 4138 4139 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4140 DRM_WARN("smart shift update failed\n"); 4141 4142 if (fbcon) 4143 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4144 4145 cancel_delayed_work_sync(&adev->delayed_init_work); 4146 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4147 4148 amdgpu_ras_suspend(adev); 4149 4150 amdgpu_device_ip_suspend_phase1(adev); 4151 4152 if (!adev->in_s0ix) 4153 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4154 4155 r = amdgpu_device_evict_resources(adev); 4156 if (r) 4157 return r; 4158 4159 amdgpu_fence_driver_hw_fini(adev); 4160 4161 amdgpu_device_ip_suspend_phase2(adev); 4162 4163 if (amdgpu_sriov_vf(adev)) 4164 amdgpu_virt_release_full_gpu(adev, false); 4165 4166 return 0; 4167 } 4168 4169 /** 4170 * amdgpu_device_resume - initiate device resume 4171 * 4172 * @dev: drm dev pointer 4173 * @fbcon : notify the fbdev of resume 4174 * 4175 * Bring the hw back to operating state (all asics). 4176 * Returns 0 for success or an error on failure. 4177 * Called at driver resume. 
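 *
 * Rough (illustrative) ordering of the resume path below: re-post the ASIC
 * if needed, resume the IP blocks, re-arm the fence driver, run late init,
 * then queue delayed_init_work for the deferred IB tests.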
4178 */ 4179 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4180 { 4181 struct amdgpu_device *adev = drm_to_adev(dev); 4182 int r = 0; 4183 4184 if (amdgpu_sriov_vf(adev)) { 4185 r = amdgpu_virt_request_full_gpu(adev, true); 4186 if (r) 4187 return r; 4188 } 4189 4190 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4191 return 0; 4192 4193 if (adev->in_s0ix) 4194 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4195 4196 /* post card */ 4197 if (amdgpu_device_need_post(adev)) { 4198 r = amdgpu_device_asic_init(adev); 4199 if (r) 4200 dev_err(adev->dev, "amdgpu asic init failed\n"); 4201 } 4202 4203 r = amdgpu_device_ip_resume(adev); 4204 4205 if (r) { 4206 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4207 goto exit; 4208 } 4209 amdgpu_fence_driver_hw_init(adev); 4210 4211 r = amdgpu_device_ip_late_init(adev); 4212 if (r) 4213 goto exit; 4214 4215 queue_delayed_work(system_wq, &adev->delayed_init_work, 4216 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4217 4218 if (!adev->in_s0ix) { 4219 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4220 if (r) 4221 goto exit; 4222 } 4223 4224 exit: 4225 if (amdgpu_sriov_vf(adev)) { 4226 amdgpu_virt_init_data_exchange(adev); 4227 amdgpu_virt_release_full_gpu(adev, true); 4228 } 4229 4230 if (r) 4231 return r; 4232 4233 /* Make sure IB tests flushed */ 4234 flush_delayed_work(&adev->delayed_init_work); 4235 4236 if (fbcon) 4237 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4238 4239 amdgpu_ras_resume(adev); 4240 4241 if (adev->mode_info.num_crtc) { 4242 /* 4243 * Most of the connector probing functions try to acquire runtime pm 4244 * refs to ensure that the GPU is powered on when connector polling is 4245 * performed. Since we're calling this from a runtime PM callback, 4246 * trying to acquire rpm refs will cause us to deadlock. 4247 * 4248 * Since we're guaranteed to be holding the rpm lock, it's safe to 4249 * temporarily disable the rpm helpers so this doesn't deadlock us. 4250 */ 4251 #ifdef CONFIG_PM 4252 dev->dev->power.disable_depth++; 4253 #endif 4254 if (!adev->dc_enabled) 4255 drm_helper_hpd_irq_event(dev); 4256 else 4257 drm_kms_helper_hotplug_event(dev); 4258 #ifdef CONFIG_PM 4259 dev->dev->power.disable_depth--; 4260 #endif 4261 } 4262 adev->in_suspend = false; 4263 4264 if (adev->enable_mes) 4265 amdgpu_mes_self_test(adev); 4266 4267 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4268 DRM_WARN("smart shift update failed\n"); 4269 4270 return 0; 4271 } 4272 4273 /** 4274 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4275 * 4276 * @adev: amdgpu_device pointer 4277 * 4278 * The list of all the hardware IPs that make up the asic is walked and 4279 * the check_soft_reset callbacks are run. check_soft_reset determines 4280 * if the asic is still hung or not. 4281 * Returns true if any of the IPs are still in a hung state, false if not. 
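 *
 * Together with the pre/soft/post helpers below, the (illustrative) soft
 * reset sequence used by amdgpu_device_pre_asic_reset() is roughly:
 *   if (amdgpu_device_ip_check_soft_reset(adev)) {
 *           amdgpu_device_ip_pre_soft_reset(adev);
 *           amdgpu_device_ip_soft_reset(adev);
 *           amdgpu_device_ip_post_soft_reset(adev);
 *   }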
4282 */ 4283 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4284 { 4285 int i; 4286 bool asic_hang = false; 4287 4288 if (amdgpu_sriov_vf(adev)) 4289 return true; 4290 4291 if (amdgpu_asic_need_full_reset(adev)) 4292 return true; 4293 4294 for (i = 0; i < adev->num_ip_blocks; i++) { 4295 if (!adev->ip_blocks[i].status.valid) 4296 continue; 4297 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4298 adev->ip_blocks[i].status.hang = 4299 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4300 if (adev->ip_blocks[i].status.hang) { 4301 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4302 asic_hang = true; 4303 } 4304 } 4305 return asic_hang; 4306 } 4307 4308 /** 4309 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4310 * 4311 * @adev: amdgpu_device pointer 4312 * 4313 * The list of all the hardware IPs that make up the asic is walked and the 4314 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4315 * handles any IP specific hardware or software state changes that are 4316 * necessary for a soft reset to succeed. 4317 * Returns 0 on success, negative error code on failure. 4318 */ 4319 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4320 { 4321 int i, r = 0; 4322 4323 for (i = 0; i < adev->num_ip_blocks; i++) { 4324 if (!adev->ip_blocks[i].status.valid) 4325 continue; 4326 if (adev->ip_blocks[i].status.hang && 4327 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4328 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4329 if (r) 4330 return r; 4331 } 4332 } 4333 4334 return 0; 4335 } 4336 4337 /** 4338 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4339 * 4340 * @adev: amdgpu_device pointer 4341 * 4342 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4343 * reset is necessary to recover. 4344 * Returns true if a full asic reset is required, false if not. 4345 */ 4346 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4347 { 4348 int i; 4349 4350 if (amdgpu_asic_need_full_reset(adev)) 4351 return true; 4352 4353 for (i = 0; i < adev->num_ip_blocks; i++) { 4354 if (!adev->ip_blocks[i].status.valid) 4355 continue; 4356 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4357 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4358 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4359 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4360 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4361 if (adev->ip_blocks[i].status.hang) { 4362 dev_info(adev->dev, "Some block need full reset!\n"); 4363 return true; 4364 } 4365 } 4366 } 4367 return false; 4368 } 4369 4370 /** 4371 * amdgpu_device_ip_soft_reset - do a soft reset 4372 * 4373 * @adev: amdgpu_device pointer 4374 * 4375 * The list of all the hardware IPs that make up the asic is walked and the 4376 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4377 * IP specific hardware or software state changes that are necessary to soft 4378 * reset the IP. 4379 * Returns 0 on success, negative error code on failure. 
4380 */ 4381 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4382 { 4383 int i, r = 0; 4384 4385 for (i = 0; i < adev->num_ip_blocks; i++) { 4386 if (!adev->ip_blocks[i].status.valid) 4387 continue; 4388 if (adev->ip_blocks[i].status.hang && 4389 adev->ip_blocks[i].version->funcs->soft_reset) { 4390 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4391 if (r) 4392 return r; 4393 } 4394 } 4395 4396 return 0; 4397 } 4398 4399 /** 4400 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4401 * 4402 * @adev: amdgpu_device pointer 4403 * 4404 * The list of all the hardware IPs that make up the asic is walked and the 4405 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4406 * handles any IP specific hardware or software state changes that are 4407 * necessary after the IP has been soft reset. 4408 * Returns 0 on success, negative error code on failure. 4409 */ 4410 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4411 { 4412 int i, r = 0; 4413 4414 for (i = 0; i < adev->num_ip_blocks; i++) { 4415 if (!adev->ip_blocks[i].status.valid) 4416 continue; 4417 if (adev->ip_blocks[i].status.hang && 4418 adev->ip_blocks[i].version->funcs->post_soft_reset) 4419 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4420 if (r) 4421 return r; 4422 } 4423 4424 return 0; 4425 } 4426 4427 /** 4428 * amdgpu_device_recover_vram - Recover some VRAM contents 4429 * 4430 * @adev: amdgpu_device pointer 4431 * 4432 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4433 * restore things like GPUVM page tables after a GPU reset where 4434 * the contents of VRAM might be lost. 4435 * 4436 * Returns: 4437 * 0 on success, negative error code on failure. 4438 */ 4439 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4440 { 4441 struct dma_fence *fence = NULL, *next = NULL; 4442 struct amdgpu_bo *shadow; 4443 struct amdgpu_bo_vm *vmbo; 4444 long r = 1, tmo; 4445 4446 if (amdgpu_sriov_runtime(adev)) 4447 tmo = msecs_to_jiffies(8000); 4448 else 4449 tmo = msecs_to_jiffies(100); 4450 4451 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4452 mutex_lock(&adev->shadow_list_lock); 4453 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4454 /* If vm is compute context or adev is APU, shadow will be NULL */ 4455 if (!vmbo->shadow) 4456 continue; 4457 shadow = vmbo->shadow; 4458 4459 /* No need to recover an evicted BO */ 4460 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4461 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4462 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4463 continue; 4464 4465 r = amdgpu_bo_restore_shadow(shadow, &next); 4466 if (r) 4467 break; 4468 4469 if (fence) { 4470 tmo = dma_fence_wait_timeout(fence, false, tmo); 4471 dma_fence_put(fence); 4472 fence = next; 4473 if (tmo == 0) { 4474 r = -ETIMEDOUT; 4475 break; 4476 } else if (tmo < 0) { 4477 r = tmo; 4478 break; 4479 } 4480 } else { 4481 fence = next; 4482 } 4483 } 4484 mutex_unlock(&adev->shadow_list_lock); 4485 4486 if (fence) 4487 tmo = dma_fence_wait_timeout(fence, false, tmo); 4488 dma_fence_put(fence); 4489 4490 if (r < 0 || tmo <= 0) { 4491 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4492 return -EIO; 4493 } 4494 4495 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4496 return 0; 4497 } 4498 4499 4500 /** 4501 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4502 * 4503 * @adev: amdgpu_device pointer 4504 * 
@from_hypervisor: request from hypervisor 4505 * 4506 * do VF FLR and reinitialize Asic 4507 * return 0 means succeeded otherwise failed 4508 */ 4509 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4510 bool from_hypervisor) 4511 { 4512 int r; 4513 struct amdgpu_hive_info *hive = NULL; 4514 int retry_limit = 0; 4515 4516 retry: 4517 amdgpu_amdkfd_pre_reset(adev); 4518 4519 if (from_hypervisor) 4520 r = amdgpu_virt_request_full_gpu(adev, true); 4521 else 4522 r = amdgpu_virt_reset_gpu(adev); 4523 if (r) 4524 return r; 4525 amdgpu_irq_gpu_reset_resume_helper(adev); 4526 4527 /* some sw clean up VF needs to do before recover */ 4528 amdgpu_virt_post_reset(adev); 4529 4530 /* Resume IP prior to SMC */ 4531 r = amdgpu_device_ip_reinit_early_sriov(adev); 4532 if (r) 4533 goto error; 4534 4535 amdgpu_virt_init_data_exchange(adev); 4536 4537 r = amdgpu_device_fw_loading(adev); 4538 if (r) 4539 return r; 4540 4541 /* now we are okay to resume SMC/CP/SDMA */ 4542 r = amdgpu_device_ip_reinit_late_sriov(adev); 4543 if (r) 4544 goto error; 4545 4546 hive = amdgpu_get_xgmi_hive(adev); 4547 /* Update PSP FW topology after reset */ 4548 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4549 r = amdgpu_xgmi_update_topology(hive, adev); 4550 4551 if (hive) 4552 amdgpu_put_xgmi_hive(hive); 4553 4554 if (!r) { 4555 r = amdgpu_ib_ring_tests(adev); 4556 4557 amdgpu_amdkfd_post_reset(adev); 4558 } 4559 4560 error: 4561 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4562 amdgpu_inc_vram_lost(adev); 4563 r = amdgpu_device_recover_vram(adev); 4564 } 4565 amdgpu_virt_release_full_gpu(adev, true); 4566 4567 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4568 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4569 retry_limit++; 4570 goto retry; 4571 } else 4572 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4573 } 4574 4575 return r; 4576 } 4577 4578 /** 4579 * amdgpu_device_has_job_running - check if there is any job in mirror list 4580 * 4581 * @adev: amdgpu_device pointer 4582 * 4583 * check if there is any job in mirror list 4584 */ 4585 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4586 { 4587 int i; 4588 struct drm_sched_job *job; 4589 4590 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4591 struct amdgpu_ring *ring = adev->rings[i]; 4592 4593 if (!ring || !ring->sched.thread) 4594 continue; 4595 4596 spin_lock(&ring->sched.job_list_lock); 4597 job = list_first_entry_or_null(&ring->sched.pending_list, 4598 struct drm_sched_job, list); 4599 spin_unlock(&ring->sched.job_list_lock); 4600 if (job) 4601 return true; 4602 } 4603 return false; 4604 } 4605 4606 /** 4607 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4608 * 4609 * @adev: amdgpu_device pointer 4610 * 4611 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4612 * a hung GPU. 
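 *
 * Interpretation of the module parameter, as implemented below:
 *   amdgpu_gpu_recovery == 0  -> recovery explicitly disabled
 *   amdgpu_gpu_recovery == -1 -> auto: disabled only on the legacy ASICs
 *                                listed in the switch, enabled elsewhere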
4613 */ 4614 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4615 { 4616 4617 if (amdgpu_gpu_recovery == 0) 4618 goto disabled; 4619 4620 /* Skip soft reset check in fatal error mode */ 4621 if (!amdgpu_ras_is_poison_mode_supported(adev)) 4622 return true; 4623 4624 if (amdgpu_sriov_vf(adev)) 4625 return true; 4626 4627 if (amdgpu_gpu_recovery == -1) { 4628 switch (adev->asic_type) { 4629 #ifdef CONFIG_DRM_AMDGPU_SI 4630 case CHIP_VERDE: 4631 case CHIP_TAHITI: 4632 case CHIP_PITCAIRN: 4633 case CHIP_OLAND: 4634 case CHIP_HAINAN: 4635 #endif 4636 #ifdef CONFIG_DRM_AMDGPU_CIK 4637 case CHIP_KAVERI: 4638 case CHIP_KABINI: 4639 case CHIP_MULLINS: 4640 #endif 4641 case CHIP_CARRIZO: 4642 case CHIP_STONEY: 4643 case CHIP_CYAN_SKILLFISH: 4644 goto disabled; 4645 default: 4646 break; 4647 } 4648 } 4649 4650 return true; 4651 4652 disabled: 4653 dev_info(adev->dev, "GPU recovery disabled.\n"); 4654 return false; 4655 } 4656 4657 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4658 { 4659 u32 i; 4660 int ret = 0; 4661 4662 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4663 4664 dev_info(adev->dev, "GPU mode1 reset\n"); 4665 4666 /* disable BM */ 4667 pci_clear_master(adev->pdev); 4668 4669 amdgpu_device_cache_pci_state(adev->pdev); 4670 4671 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4672 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4673 ret = amdgpu_dpm_mode1_reset(adev); 4674 } else { 4675 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4676 ret = psp_gpu_reset(adev); 4677 } 4678 4679 if (ret) 4680 goto mode1_reset_failed; 4681 4682 amdgpu_device_load_pci_state(adev->pdev); 4683 ret = amdgpu_psp_wait_for_bootloader(adev); 4684 if (ret) 4685 goto mode1_reset_failed; 4686 4687 /* wait for asic to come out of reset */ 4688 for (i = 0; i < adev->usec_timeout; i++) { 4689 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4690 4691 if (memsize != 0xffffffff) 4692 break; 4693 udelay(1); 4694 } 4695 4696 if (i >= adev->usec_timeout) { 4697 ret = -ETIMEDOUT; 4698 goto mode1_reset_failed; 4699 } 4700 4701 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4702 4703 return 0; 4704 4705 mode1_reset_failed: 4706 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4707 return ret; 4708 } 4709 4710 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4711 struct amdgpu_reset_context *reset_context) 4712 { 4713 int i, r = 0; 4714 struct amdgpu_job *job = NULL; 4715 bool need_full_reset = 4716 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4717 4718 if (reset_context->reset_req_dev == adev) 4719 job = reset_context->job; 4720 4721 if (amdgpu_sriov_vf(adev)) { 4722 /* stop the data exchange thread */ 4723 amdgpu_virt_fini_data_exchange(adev); 4724 } 4725 4726 amdgpu_fence_driver_isr_toggle(adev, true); 4727 4728 /* block all schedulers and reset given job's ring */ 4729 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4730 struct amdgpu_ring *ring = adev->rings[i]; 4731 4732 if (!ring || !ring->sched.thread) 4733 continue; 4734 4735 /* Clear job fence from fence drv to avoid force_completion 4736 * leave NULL and vm flush fence in fence drv 4737 */ 4738 amdgpu_fence_driver_clear_job_fences(ring); 4739 4740 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4741 amdgpu_fence_driver_force_completion(ring); 4742 } 4743 4744 amdgpu_fence_driver_isr_toggle(adev, false); 4745 4746 if (job && job->vm) 4747 drm_sched_increase_karma(&job->base); 4748 4749 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4750 /* If reset handler not 
implemented, continue; otherwise return */ 4751 if (r == -EOPNOTSUPP) 4752 r = 0; 4753 else 4754 return r; 4755 4756 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4757 if (!amdgpu_sriov_vf(adev)) { 4758 4759 if (!need_full_reset) 4760 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4761 4762 if (!need_full_reset && amdgpu_gpu_recovery && 4763 amdgpu_device_ip_check_soft_reset(adev)) { 4764 amdgpu_device_ip_pre_soft_reset(adev); 4765 r = amdgpu_device_ip_soft_reset(adev); 4766 amdgpu_device_ip_post_soft_reset(adev); 4767 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4768 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4769 need_full_reset = true; 4770 } 4771 } 4772 4773 if (need_full_reset) 4774 r = amdgpu_device_ip_suspend(adev); 4775 if (need_full_reset) 4776 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4777 else 4778 clear_bit(AMDGPU_NEED_FULL_RESET, 4779 &reset_context->flags); 4780 } 4781 4782 return r; 4783 } 4784 4785 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 4786 { 4787 int i; 4788 4789 lockdep_assert_held(&adev->reset_domain->sem); 4790 4791 for (i = 0; i < adev->num_regs; i++) { 4792 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); 4793 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], 4794 adev->reset_dump_reg_value[i]); 4795 } 4796 4797 return 0; 4798 } 4799 4800 #ifdef CONFIG_DEV_COREDUMP 4801 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, 4802 size_t count, void *data, size_t datalen) 4803 { 4804 struct drm_printer p; 4805 struct amdgpu_device *adev = data; 4806 struct drm_print_iterator iter; 4807 int i; 4808 4809 iter.data = buffer; 4810 iter.offset = 0; 4811 iter.start = offset; 4812 iter.remain = count; 4813 4814 p = drm_coredump_printer(&iter); 4815 4816 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 4817 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 4818 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 4819 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec); 4820 if (adev->reset_task_info.pid) 4821 drm_printf(&p, "process_name: %s PID: %d\n", 4822 adev->reset_task_info.process_name, 4823 adev->reset_task_info.pid); 4824 4825 if (adev->reset_vram_lost) 4826 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 4827 if (adev->num_regs) { 4828 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); 4829 4830 for (i = 0; i < adev->num_regs; i++) 4831 drm_printf(&p, "0x%08x: 0x%08x\n", 4832 adev->reset_dump_reg_list[i], 4833 adev->reset_dump_reg_value[i]); 4834 } 4835 4836 return count - iter.remain; 4837 } 4838 4839 static void amdgpu_devcoredump_free(void *data) 4840 { 4841 } 4842 4843 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) 4844 { 4845 struct drm_device *dev = adev_to_drm(adev); 4846 4847 ktime_get_ts64(&adev->reset_time); 4848 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_NOWAIT, 4849 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 4850 } 4851 #endif 4852 4853 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4854 struct amdgpu_reset_context *reset_context) 4855 { 4856 struct amdgpu_device *tmp_adev = NULL; 4857 bool need_full_reset, skip_hw_reset, vram_lost = false; 4858 int r = 0; 4859 bool gpu_reset_for_dev_remove = 0; 4860 4861 /* Try reset handler method first */ 4862 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4863 reset_list); 4864 amdgpu_reset_reg_dumps(tmp_adev); 4865 4866 
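/*
 * Note: the register values are sampled here, before any reset is attempted,
 * so that amdgpu_devcoredump_read() can report a pre-reset dump of
 * reset_dump_reg_list alongside the device coredump.
 */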
reset_context->reset_device_list = device_list_handle; 4867 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4868 /* If reset handler not implemented, continue; otherwise return */ 4869 if (r == -EOPNOTSUPP) 4870 r = 0; 4871 else 4872 return r; 4873 4874 /* Reset handler not implemented, use the default method */ 4875 need_full_reset = 4876 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4877 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4878 4879 gpu_reset_for_dev_remove = 4880 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 4881 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4882 4883 /* 4884 * ASIC reset has to be done on all XGMI hive nodes ASAP 4885 * to allow proper links negotiation in FW (within 1 sec) 4886 */ 4887 if (!skip_hw_reset && need_full_reset) { 4888 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4889 /* For XGMI run all resets in parallel to speed up the process */ 4890 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4891 tmp_adev->gmc.xgmi.pending_reset = false; 4892 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4893 r = -EALREADY; 4894 } else 4895 r = amdgpu_asic_reset(tmp_adev); 4896 4897 if (r) { 4898 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4899 r, adev_to_drm(tmp_adev)->unique); 4900 break; 4901 } 4902 } 4903 4904 /* For XGMI wait for all resets to complete before proceed */ 4905 if (!r) { 4906 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4907 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4908 flush_work(&tmp_adev->xgmi_reset_work); 4909 r = tmp_adev->asic_reset_res; 4910 if (r) 4911 break; 4912 } 4913 } 4914 } 4915 } 4916 4917 if (!r && amdgpu_ras_intr_triggered()) { 4918 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4919 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && 4920 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 4921 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 4922 } 4923 4924 amdgpu_ras_intr_cleared(); 4925 } 4926 4927 /* Since the mode1 reset affects base ip blocks, the 4928 * phase1 ip blocks need to be resumed. Otherwise there 4929 * will be a BIOS signature error and the psp bootloader 4930 * can't load kdb on the next amdgpu install. 
4931 */ 4932 if (gpu_reset_for_dev_remove) { 4933 list_for_each_entry(tmp_adev, device_list_handle, reset_list) 4934 amdgpu_device_ip_resume_phase1(tmp_adev); 4935 4936 goto end; 4937 } 4938 4939 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4940 if (need_full_reset) { 4941 /* post card */ 4942 r = amdgpu_device_asic_init(tmp_adev); 4943 if (r) { 4944 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4945 } else { 4946 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4947 4948 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4949 if (r) 4950 goto out; 4951 4952 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4953 #ifdef CONFIG_DEV_COREDUMP 4954 tmp_adev->reset_vram_lost = vram_lost; 4955 memset(&tmp_adev->reset_task_info, 0, 4956 sizeof(tmp_adev->reset_task_info)); 4957 if (reset_context->job && reset_context->job->vm) 4958 tmp_adev->reset_task_info = 4959 reset_context->job->vm->task_info; 4960 amdgpu_reset_capture_coredumpm(tmp_adev); 4961 #endif 4962 if (vram_lost) { 4963 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4964 amdgpu_inc_vram_lost(tmp_adev); 4965 } 4966 4967 r = amdgpu_device_fw_loading(tmp_adev); 4968 if (r) 4969 return r; 4970 4971 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4972 if (r) 4973 goto out; 4974 4975 if (vram_lost) 4976 amdgpu_device_fill_reset_magic(tmp_adev); 4977 4978 /* 4979 * Add this ASIC as tracked as reset was already 4980 * complete successfully. 4981 */ 4982 amdgpu_register_gpu_instance(tmp_adev); 4983 4984 if (!reset_context->hive && 4985 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4986 amdgpu_xgmi_add_device(tmp_adev); 4987 4988 r = amdgpu_device_ip_late_init(tmp_adev); 4989 if (r) 4990 goto out; 4991 4992 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 4993 4994 /* 4995 * The GPU enters bad state once faulty pages 4996 * by ECC has reached the threshold, and ras 4997 * recovery is scheduled next. So add one check 4998 * here to break recovery if it indeed exceeds 4999 * bad page threshold, and remind user to 5000 * retire this GPU or setting one bigger 5001 * bad_page_threshold value to fix this once 5002 * probing driver again. 5003 */ 5004 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5005 /* must succeed. 
*/ 5006 amdgpu_ras_resume(tmp_adev); 5007 } else { 5008 r = -EINVAL; 5009 goto out; 5010 } 5011 5012 /* Update PSP FW topology after reset */ 5013 if (reset_context->hive && 5014 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5015 r = amdgpu_xgmi_update_topology( 5016 reset_context->hive, tmp_adev); 5017 } 5018 } 5019 5020 out: 5021 if (!r) { 5022 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5023 r = amdgpu_ib_ring_tests(tmp_adev); 5024 if (r) { 5025 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5026 need_full_reset = true; 5027 r = -EAGAIN; 5028 goto end; 5029 } 5030 } 5031 5032 if (!r) 5033 r = amdgpu_device_recover_vram(tmp_adev); 5034 else 5035 tmp_adev->asic_reset_res = r; 5036 } 5037 5038 end: 5039 if (need_full_reset) 5040 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5041 else 5042 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5043 return r; 5044 } 5045 5046 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5047 { 5048 5049 switch (amdgpu_asic_reset_method(adev)) { 5050 case AMD_RESET_METHOD_MODE1: 5051 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5052 break; 5053 case AMD_RESET_METHOD_MODE2: 5054 adev->mp1_state = PP_MP1_STATE_RESET; 5055 break; 5056 default: 5057 adev->mp1_state = PP_MP1_STATE_NONE; 5058 break; 5059 } 5060 } 5061 5062 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5063 { 5064 amdgpu_vf_error_trans_all(adev); 5065 adev->mp1_state = PP_MP1_STATE_NONE; 5066 } 5067 5068 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5069 { 5070 struct pci_dev *p = NULL; 5071 5072 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5073 adev->pdev->bus->number, 1); 5074 if (p) { 5075 pm_runtime_enable(&(p->dev)); 5076 pm_runtime_resume(&(p->dev)); 5077 } 5078 5079 pci_dev_put(p); 5080 } 5081 5082 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5083 { 5084 enum amd_reset_method reset_method; 5085 struct pci_dev *p = NULL; 5086 u64 expires; 5087 5088 /* 5089 * For now, only BACO and mode1 reset are confirmed 5090 * to suffer the audio issue without proper suspended. 5091 */ 5092 reset_method = amdgpu_asic_reset_method(adev); 5093 if ((reset_method != AMD_RESET_METHOD_BACO) && 5094 (reset_method != AMD_RESET_METHOD_MODE1)) 5095 return -EINVAL; 5096 5097 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5098 adev->pdev->bus->number, 1); 5099 if (!p) 5100 return -ENODEV; 5101 5102 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5103 if (!expires) 5104 /* 5105 * If we cannot get the audio device autosuspend delay, 5106 * a fixed 4S interval will be used. Considering 3S is 5107 * the audio controller default autosuspend delay setting. 5108 * 4S used here is guaranteed to cover that. 5109 */ 5110 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5111 5112 while (!pm_runtime_status_suspended(&(p->dev))) { 5113 if (!pm_runtime_suspend(&(p->dev))) 5114 break; 5115 5116 if (expires < ktime_get_mono_fast_ns()) { 5117 dev_warn(adev->dev, "failed to suspend display audio\n"); 5118 pci_dev_put(p); 5119 /* TODO: abort the succeeding gpu reset? 
*/ 5120 return -ETIMEDOUT; 5121 } 5122 } 5123 5124 pm_runtime_disable(&(p->dev)); 5125 5126 pci_dev_put(p); 5127 return 0; 5128 } 5129 5130 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5131 { 5132 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5133 5134 #if defined(CONFIG_DEBUG_FS) 5135 if (!amdgpu_sriov_vf(adev)) 5136 cancel_work(&adev->reset_work); 5137 #endif 5138 5139 if (adev->kfd.dev) 5140 cancel_work(&adev->kfd.reset_work); 5141 5142 if (amdgpu_sriov_vf(adev)) 5143 cancel_work(&adev->virt.flr_work); 5144 5145 if (con && adev->ras_enabled) 5146 cancel_work(&con->recovery_work); 5147 5148 } 5149 5150 /** 5151 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5152 * 5153 * @adev: amdgpu_device pointer 5154 * @job: which job trigger hang 5155 * @reset_context: amdgpu reset context pointer 5156 * 5157 * Attempt to reset the GPU if it has hung (all asics). 5158 * Attempt to do soft-reset or full-reset and reinitialize Asic 5159 * Returns 0 for success or an error on failure. 5160 */ 5161 5162 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5163 struct amdgpu_job *job, 5164 struct amdgpu_reset_context *reset_context) 5165 { 5166 struct list_head device_list, *device_list_handle = NULL; 5167 bool job_signaled = false; 5168 struct amdgpu_hive_info *hive = NULL; 5169 struct amdgpu_device *tmp_adev = NULL; 5170 int i, r = 0; 5171 bool need_emergency_restart = false; 5172 bool audio_suspended = false; 5173 bool gpu_reset_for_dev_remove = false; 5174 5175 gpu_reset_for_dev_remove = 5176 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5177 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5178 5179 /* 5180 * Special case: RAS triggered and full reset isn't supported 5181 */ 5182 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5183 5184 /* 5185 * Flush RAM to disk so that after reboot 5186 * the user can read log and see why the system rebooted. 5187 */ 5188 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5189 amdgpu_ras_get_context(adev)->reboot) { 5190 DRM_WARN("Emergency reboot."); 5191 5192 ksys_sync_helper(); 5193 emergency_restart(); 5194 } 5195 5196 dev_info(adev->dev, "GPU %s begin!\n", 5197 need_emergency_restart ? "jobs stop":"reset"); 5198 5199 if (!amdgpu_sriov_vf(adev)) 5200 hive = amdgpu_get_xgmi_hive(adev); 5201 if (hive) 5202 mutex_lock(&hive->hive_lock); 5203 5204 reset_context->job = job; 5205 reset_context->hive = hive; 5206 /* 5207 * Build list of devices to reset. 5208 * In case we are in XGMI hive mode, resort the device list 5209 * to put adev in the 1st position. 
5210 */ 5211 INIT_LIST_HEAD(&device_list); 5212 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5213 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5214 list_add_tail(&tmp_adev->reset_list, &device_list); 5215 if (gpu_reset_for_dev_remove && adev->shutdown) 5216 tmp_adev->shutdown = true; 5217 } 5218 if (!list_is_first(&adev->reset_list, &device_list)) 5219 list_rotate_to_front(&adev->reset_list, &device_list); 5220 device_list_handle = &device_list; 5221 } else { 5222 list_add_tail(&adev->reset_list, &device_list); 5223 device_list_handle = &device_list; 5224 } 5225 5226 /* We need to lock reset domain only once both for XGMI and single device */ 5227 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5228 reset_list); 5229 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5230 5231 /* block all schedulers and reset given job's ring */ 5232 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5233 5234 amdgpu_device_set_mp1_state(tmp_adev); 5235 5236 /* 5237 * Try to put the audio codec into suspend state 5238 * before gpu reset started. 5239 * 5240 * The power domain of the graphics device is shared 5241 * with the AZ (audio) power domain. Without this, 5242 * we may change the audio hardware from behind 5243 * the audio driver's back. That will trigger 5244 * some audio codec errors. 5245 */ 5246 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5247 audio_suspended = true; 5248 5249 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5250 5251 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5252 5253 if (!amdgpu_sriov_vf(tmp_adev)) 5254 amdgpu_amdkfd_pre_reset(tmp_adev); 5255 5256 /* 5257 * Mark the ASICs to be reset as untracked first, 5258 * and add them back after the reset completes. 5259 */ 5260 amdgpu_unregister_gpu_instance(tmp_adev); 5261 5262 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5263 5264 /* disable ras on ALL IPs */ 5265 if (!need_emergency_restart && 5266 amdgpu_device_ip_need_full_reset(tmp_adev)) 5267 amdgpu_ras_suspend(tmp_adev); 5268 5269 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5270 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5271 5272 if (!ring || !ring->sched.thread) 5273 continue; 5274 5275 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5276 5277 if (need_emergency_restart) 5278 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5279 } 5280 atomic_inc(&tmp_adev->gpu_reset_counter); 5281 } 5282 5283 if (need_emergency_restart) 5284 goto skip_sched_resume; 5285 5286 /* 5287 * Must check guilty signal here since after this point all old 5288 * HW fences are force signaled. 5289 * 5290 * job->base holds a reference to parent fence 5291 */ 5292 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5293 job_signaled = true; 5294 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5295 goto skip_hw_reset; 5296 } 5297 5298 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5299 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5300 if (gpu_reset_for_dev_remove) { 5301 /* Workaround for ASICs that need to disable SMC first */ 5302 amdgpu_device_smu_fini_early(tmp_adev); 5303 } 5304 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5305 /* TODO: Should we stop? */ 5306 if (r) { 5307 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5308 r, adev_to_drm(tmp_adev)->unique); 5309 tmp_adev->asic_reset_res = r; 5310 } 5311 5312 /* 5313 * Drop all pending non scheduler resets.
Scheduler resets 5314 * were already dropped during drm_sched_stop 5315 */ 5316 amdgpu_device_stop_pending_resets(tmp_adev); 5317 } 5318 5319 /* Actual ASIC resets if needed. */ 5320 /* Host driver will handle XGMI hive reset for SRIOV */ 5321 if (amdgpu_sriov_vf(adev)) { 5322 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5323 if (r) 5324 adev->asic_reset_res = r; 5325 5326 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so RAS needs to be resumed during reset */ 5327 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) || 5328 adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3)) 5329 amdgpu_ras_resume(adev); 5330 } else { 5331 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5332 if (r && r == -EAGAIN) 5333 goto retry; 5334 5335 if (!r && gpu_reset_for_dev_remove) 5336 goto recover_end; 5337 } 5338 5339 skip_hw_reset: 5340 5341 /* Post ASIC reset for all devs. */ 5342 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5343 5344 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5345 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5346 5347 if (!ring || !ring->sched.thread) 5348 continue; 5349 5350 drm_sched_start(&ring->sched, true); 5351 } 5352 5353 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3)) 5354 amdgpu_mes_self_test(tmp_adev); 5355 5356 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 5357 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5358 5359 if (tmp_adev->asic_reset_res) 5360 r = tmp_adev->asic_reset_res; 5361 5362 tmp_adev->asic_reset_res = 0; 5363 5364 if (r) { 5365 /* bad news, how to tell it to userspace? */ 5366 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5367 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5368 } else { 5369 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5370 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5371 DRM_WARN("smart shift update failed\n"); 5372 } 5373 } 5374 5375 skip_sched_resume: 5376 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5377 /* unlock kfd: SRIOV would do it separately */ 5378 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5379 amdgpu_amdkfd_post_reset(tmp_adev); 5380 5381 /* kfd_post_reset will do nothing if the kfd device is not initialized, 5382 * so bring up kfd here if it wasn't initialized before 5383 */ 5384 if (!adev->kfd.init_complete) 5385 amdgpu_amdkfd_device_init(adev); 5386 5387 if (audio_suspended) 5388 amdgpu_device_resume_display_audio(tmp_adev); 5389 5390 amdgpu_device_unset_mp1_state(tmp_adev); 5391 5392 amdgpu_ras_set_error_query_ready(tmp_adev, true); 5393 } 5394 5395 recover_end: 5396 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5397 reset_list); 5398 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5399 5400 if (hive) { 5401 mutex_unlock(&hive->hive_lock); 5402 amdgpu_put_xgmi_hive(hive); 5403 } 5404 5405 if (r) 5406 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5407 5408 atomic_set(&adev->reset_domain->reset_res, r); 5409 return r; 5410 } 5411 5412 /** 5413 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot 5414 * 5415 * @adev: amdgpu_device pointer 5416 * 5417 * Fetches and stores in the driver the PCIE capabilities (gen speed 5418 * and lanes) of the slot the device is in. Handles APUs and 5419 * virtualized environments where PCIE config space may not be available.
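 *
 * The detected caps can also be overridden from the command line; assuming
 * the usual amdgpu module parameter names, something like (illustrative):
 *   modprobe amdgpu pcie_gen_cap=<mask> pcie_lane_cap=<mask>
 * makes the masks supplied there be used verbatim (see the checks on
 * amdgpu_pcie_gen_cap and amdgpu_pcie_lane_cap below).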
5420 */ 5421 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5422 { 5423 struct pci_dev *pdev; 5424 enum pci_bus_speed speed_cap, platform_speed_cap; 5425 enum pcie_link_width platform_link_width; 5426 5427 if (amdgpu_pcie_gen_cap) 5428 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5429 5430 if (amdgpu_pcie_lane_cap) 5431 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5432 5433 /* covers APUs as well */ 5434 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 5435 if (adev->pm.pcie_gen_mask == 0) 5436 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5437 if (adev->pm.pcie_mlw_mask == 0) 5438 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5439 return; 5440 } 5441 5442 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5443 return; 5444 5445 pcie_bandwidth_available(adev->pdev, NULL, 5446 &platform_speed_cap, &platform_link_width); 5447 5448 if (adev->pm.pcie_gen_mask == 0) { 5449 /* asic caps */ 5450 pdev = adev->pdev; 5451 speed_cap = pcie_get_speed_cap(pdev); 5452 if (speed_cap == PCI_SPEED_UNKNOWN) { 5453 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5454 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5455 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5456 } else { 5457 if (speed_cap == PCIE_SPEED_32_0GT) 5458 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5459 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5460 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5461 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5462 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5463 else if (speed_cap == PCIE_SPEED_16_0GT) 5464 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5465 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5466 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5467 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5468 else if (speed_cap == PCIE_SPEED_8_0GT) 5469 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5470 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5471 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5472 else if (speed_cap == PCIE_SPEED_5_0GT) 5473 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5474 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5475 else 5476 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5477 } 5478 /* platform caps */ 5479 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5480 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5481 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5482 } else { 5483 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5484 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5485 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5486 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5487 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5488 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5489 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5490 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5491 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5492 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5493 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5494 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5495 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5496 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5497 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5498 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5499 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5500 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5501 else 5502 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5503 5504 } 5505 } 5506 if (adev->pm.pcie_mlw_mask == 0) { 5507 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) 
{ 5508 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5509 } else { 5510 switch (platform_link_width) { 5511 case PCIE_LNK_X32: 5512 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5513 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5514 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5515 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5516 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5517 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5518 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5519 break; 5520 case PCIE_LNK_X16: 5521 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5522 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5523 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5524 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5525 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5526 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5527 break; 5528 case PCIE_LNK_X12: 5529 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5530 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5531 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5532 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5533 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5534 break; 5535 case PCIE_LNK_X8: 5536 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5537 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5538 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5539 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5540 break; 5541 case PCIE_LNK_X4: 5542 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5543 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5544 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5545 break; 5546 case PCIE_LNK_X2: 5547 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5548 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5549 break; 5550 case PCIE_LNK_X1: 5551 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5552 break; 5553 default: 5554 break; 5555 } 5556 } 5557 } 5558 } 5559 5560 /** 5561 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 5562 * 5563 * @adev: amdgpu_device pointer 5564 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 5565 * 5566 * Return true if @peer_adev can access (DMA) @adev through the PCIe 5567 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 5568 * @peer_adev. 5569 */ 5570 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 5571 struct amdgpu_device *peer_adev) 5572 { 5573 #ifdef CONFIG_HSA_AMD_P2P 5574 uint64_t address_mask = peer_adev->dev->dma_mask ? 
5575 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 5576 resource_size_t aper_limit = 5577 adev->gmc.aper_base + adev->gmc.aper_size - 1; 5578 bool p2p_access = 5579 !adev->gmc.xgmi.connected_to_cpu && 5580 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 5581 5582 return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size && 5583 adev->gmc.real_vram_size == adev->gmc.visible_vram_size && 5584 !(adev->gmc.aper_base & address_mask || 5585 aper_limit & address_mask)); 5586 #else 5587 return false; 5588 #endif 5589 } 5590 5591 int amdgpu_device_baco_enter(struct drm_device *dev) 5592 { 5593 struct amdgpu_device *adev = drm_to_adev(dev); 5594 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5595 5596 if (!amdgpu_device_supports_baco(dev)) 5597 return -ENOTSUPP; 5598 5599 if (ras && adev->ras_enabled && 5600 adev->nbio.funcs->enable_doorbell_interrupt) 5601 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5602 5603 return amdgpu_dpm_baco_enter(adev); 5604 } 5605 5606 int amdgpu_device_baco_exit(struct drm_device *dev) 5607 { 5608 struct amdgpu_device *adev = drm_to_adev(dev); 5609 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5610 int ret = 0; 5611 5612 if (!amdgpu_device_supports_baco(dev)) 5613 return -ENOTSUPP; 5614 5615 ret = amdgpu_dpm_baco_exit(adev); 5616 if (ret) 5617 return ret; 5618 5619 if (ras && adev->ras_enabled && 5620 adev->nbio.funcs->enable_doorbell_interrupt) 5621 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5622 5623 if (amdgpu_passthrough(adev) && 5624 adev->nbio.funcs->clear_doorbell_interrupt) 5625 adev->nbio.funcs->clear_doorbell_interrupt(adev); 5626 5627 return 0; 5628 } 5629 5630 /** 5631 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5632 * @pdev: PCI device struct 5633 * @state: PCI channel state 5634 * 5635 * Description: Called when a PCI error is detected. 5636 * 5637 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
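 *
 * Mapping implemented below (the Return line above omits the recoverable
 * case):
 *   pci_channel_io_normal       -> PCI_ERS_RESULT_CAN_RECOVER
 *   pci_channel_io_frozen       -> PCI_ERS_RESULT_NEED_RESET (schedulers are
 *                                  stopped and the reset domain is locked)
 *   pci_channel_io_perm_failure -> PCI_ERS_RESULT_DISCONNECT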
int amdgpu_device_baco_enter(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

	return amdgpu_dpm_baco_enter(adev);
}

int amdgpu_device_baco_exit(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	ret = amdgpu_dpm_baco_exit(adev);
	if (ret)
		return ret;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

	if (amdgpu_passthrough(adev) &&
	    adev->nbio.funcs->clear_doorbell_interrupt)
		adev->nbio.funcs->clear_doorbell_interrupt(adev);

	return 0;
}

/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called when a PCI error is detected.
 *
 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		DRM_WARN("No support for XGMI hive yet...");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Locking adev->reset_domain->sem will prevent any external access
		 * to GPU during PCI error recovery
		 */
		amdgpu_device_lock_reset_domain(adev->reset_domain);
		amdgpu_device_set_mp1_state(adev);

		/*
		 * Block any work scheduling as we do for regular GPU reset
		 * for the duration of the recovery
		 */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, NULL);
		}
		atomic_inc(&adev->gpu_reset_counter);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{
	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/* This is called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, no need to reset the slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}
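
/*
 * amdgpu_pci_error_detected() above folds two things together: the
 * driver-specific quiescing work (locking the reset domain, parking the
 * schedulers) and a simple three-way decision based on the channel state.
 * The standalone sketch below isolates that decision with stand-in enums
 * (not the kernel's pci_channel_state_t/pci_ers_result_t types); it is an
 * illustration only and is not compiled into the driver.
 */
#if 0
#include <stdio.h>

enum channel_state { IO_NORMAL, IO_FROZEN, IO_PERM_FAILURE };
enum ers_result { ERS_CAN_RECOVER, ERS_NEED_RESET, ERS_DISCONNECT };

/*
 * A normal channel can be recovered in place, a frozen channel needs a
 * slot reset (after quiescing the GPU), and a permanent failure means the
 * device is about to be removed.
 */
static enum ers_result classify_pci_error(enum channel_state state)
{
	switch (state) {
	case IO_NORMAL:
		return ERS_CAN_RECOVER;
	case IO_FROZEN:
		return ERS_NEED_RESET;
	case IO_PERM_FAILURE:
		return ERS_DISCONNECT;
	}
	return ERS_NEED_RESET;
}

int main(void)
{
	printf("frozen -> %d\n", classify_pci_error(IO_FROZEN)); /* 1 */
	return 0;
}
#endif
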
/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the pci error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for asic to come out of reset */
	msleep(500);

	/* Restore PCI config space */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d\n", r);
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}
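
/*
 * The slot-reset handler above confirms the ASIC is back by polling a
 * config value until it stops reading back as all ones, giving up after
 * adev->usec_timeout iterations. The standalone sketch below shows that
 * bounded-poll pattern against a faked register read (fake_read_memsize()
 * is invented for the illustration); it is not compiled into the driver.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Fake "hardware": reads all ones for a while, then a sane value. */
static uint32_t fake_read_memsize(int poll_count)
{
	return poll_count < 100 ? 0xffffffff : 0x4000; /* 16 GiB, in MiB */
}

/*
 * Bounded poll: an all-ones read means the device is still in reset (or
 * gone), so keep polling until a sane value shows up or the iteration
 * budget runs out. The driver waits ~1us per iteration (udelay(1)).
 */
static bool wait_for_asic(int timeout_iterations, uint32_t *memsize)
{
	int i;

	for (i = 0; i < timeout_iterations; i++) {
		*memsize = fake_read_memsize(i);
		if (*memsize != 0xffffffff)
			return true;
	}
	return false;
}

int main(void)
{
	uint32_t memsize;

	if (wait_for_asic(100000, &memsize))
		printf("ASIC back, memsize 0x%x\n", memsize);
	else
		printf("timed out\n");
	return 0;
}
#endif
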
/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's OK to resume
 * normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		drm_sched_start(&ring->sched, true);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
}
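
/*
 * The four callbacks above (error_detected, mmio_enabled, slot_reset and
 * resume) are consumed by the PCI error-recovery core through a
 * struct pci_error_handlers table referenced from the driver's
 * struct pci_driver. A sketch of that wiring is kept below for
 * illustration; the "example_" names are placeholders, and the real table
 * lives in the driver's PCI registration code, not in this file.
 */
#if 0
static const struct pci_error_handlers example_pci_err_handler = {
	.error_detected	= amdgpu_pci_error_detected,
	.mmio_enabled	= amdgpu_pci_mmio_enabled,
	.slot_reset	= amdgpu_pci_slot_reset,
	.resume		= amdgpu_pci_resume,
};

static struct pci_driver example_pci_driver = {
	.name		= "example",
	/* .id_table, .probe, .remove, ... */
	.err_handler	= &example_pci_err_handler,
};
#endif
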
bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that nothing can touch it
 * any more. This helps preserve the error context when an error occurs.
 * Compared to a simple hang, the system stays stable at least for SSH
 * access, so it should be trivial to inspect the hardware state and see
 * what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs,
 *    etc.), clears all CPU mappings to the device and disallows remappings
 *    through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction() flush any
 *    in-flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
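
/*
 * amdgpu_device_pcie_port_rreg()/_wreg() above use the classic index/data
 * pair: write the target offset into an index register, then access a
 * single data register, all under a spinlock and with a dummy read-back of
 * the index register to flush the posted write. The standalone sketch
 * below models that indirection against an invented array-backed device
 * (mmio_read()/mmio_write() are stand-ins, and the locking and read-back
 * are omitted); it is an illustration only.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

static uint32_t internal_regs[256];	/* many internal registers ... */
static uint32_t index_reg;		/* ... reached through two visible ones */

static void mmio_write(int which, uint32_t val)
{
	if (which == 0)
		index_reg = val;			/* index register */
	else
		internal_regs[index_reg % 256] = val;	/* data register */
}

static uint32_t mmio_read(int which)
{
	return which == 0 ? index_reg : internal_regs[index_reg % 256];
}

static void indirect_wreg(uint32_t reg, uint32_t val)
{
	mmio_write(0, reg);	/* select the internal register */
	mmio_write(1, val);	/* then write it through the data port */
}

static uint32_t indirect_rreg(uint32_t reg)
{
	mmio_write(0, reg);
	return mmio_read(1);
}

int main(void)
{
	indirect_wreg(0x42, 0xdeadbeef);
	printf("0x%x\n", indirect_rreg(0x42)); /* 0xdeadbeef */
	return 0;
}
#endif
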
/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!adev->ip_versions[DCE_HWIP][0] ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
		uint32_t inst, uint32_t reg_addr, char reg_name[],
		uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				  inst, reg_name, (uint32_t)expected_value,
				  (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}
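
/*
 * amdgpu_device_switch_gang() above is a compare-and-swap retry loop:
 * re-read the currently published gang, bail out if the caller's gang is
 * already active or the old one has not signaled, and only publish the new
 * gang if nobody raced in between. The standalone C11 sketch below keeps
 * that shape with plain integers instead of dma_fence pointers and without
 * the RCU and fence-signaling details; it is an illustration only and is
 * not compiled into the driver.
 */
#if 0
#include <stdatomic.h>
#include <stdio.h>

static _Atomic long current_gang;

/* Returns 1 if new_gang was published, 0 if it was already active. */
static int try_switch_gang(long new_gang)
{
	long old;

	do {
		old = atomic_load(&current_gang);
		if (old == new_gang)
			return 0;
		/*
		 * The driver additionally gives up here when the old gang
		 * has not signaled yet and hands it back to the caller.
		 */
	} while (!atomic_compare_exchange_weak(&current_gang, &old, new_gang));

	return 1;
}

int main(void)
{
	try_switch_gang(42);
	printf("%ld\n", (long)atomic_load(&current_gang)); /* 42 */
	return 0;
}
#endif
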