/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/devcoredump.h>
#include <generated/utsrelease.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_aperture.h>
#include <drm/drm_atomic_helper.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_fb_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86)
#include <asm/intel-family.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
"KABINI", 111 "HAWAII", 112 "MULLINS", 113 "TOPAZ", 114 "TONGA", 115 "FIJI", 116 "CARRIZO", 117 "STONEY", 118 "POLARIS10", 119 "POLARIS11", 120 "POLARIS12", 121 "VEGAM", 122 "VEGA10", 123 "VEGA12", 124 "VEGA20", 125 "RAVEN", 126 "ARCTURUS", 127 "RENOIR", 128 "ALDEBARAN", 129 "NAVI10", 130 "CYAN_SKILLFISH", 131 "NAVI14", 132 "NAVI12", 133 "SIENNA_CICHLID", 134 "NAVY_FLOUNDER", 135 "VANGOGH", 136 "DIMGREY_CAVEFISH", 137 "BEIGE_GOBY", 138 "YELLOW_CARP", 139 "IP DISCOVERY", 140 "LAST", 141 }; 142 143 /** 144 * DOC: pcie_replay_count 145 * 146 * The amdgpu driver provides a sysfs API for reporting the total number 147 * of PCIe replays (NAKs) 148 * The file pcie_replay_count is used for this and returns the total 149 * number of replays as a sum of the NAKs generated and NAKs received 150 */ 151 152 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 153 struct device_attribute *attr, char *buf) 154 { 155 struct drm_device *ddev = dev_get_drvdata(dev); 156 struct amdgpu_device *adev = drm_to_adev(ddev); 157 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 158 159 return sysfs_emit(buf, "%llu\n", cnt); 160 } 161 162 static DEVICE_ATTR(pcie_replay_count, 0444, 163 amdgpu_device_get_pcie_replay_count, NULL); 164 165 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 166 167 168 /** 169 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 170 * 171 * @dev: drm_device pointer 172 * 173 * Returns true if the device is a dGPU with ATPX power control, 174 * otherwise return false. 175 */ 176 bool amdgpu_device_supports_px(struct drm_device *dev) 177 { 178 struct amdgpu_device *adev = drm_to_adev(dev); 179 180 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 181 return true; 182 return false; 183 } 184 185 /** 186 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 187 * 188 * @dev: drm_device pointer 189 * 190 * Returns true if the device is a dGPU with ACPI power control, 191 * otherwise return false. 192 */ 193 bool amdgpu_device_supports_boco(struct drm_device *dev) 194 { 195 struct amdgpu_device *adev = drm_to_adev(dev); 196 197 if (adev->has_pr3 || 198 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 199 return true; 200 return false; 201 } 202 203 /** 204 * amdgpu_device_supports_baco - Does the device support BACO 205 * 206 * @dev: drm_device pointer 207 * 208 * Returns true if the device supporte BACO, 209 * otherwise return false. 210 */ 211 bool amdgpu_device_supports_baco(struct drm_device *dev) 212 { 213 struct amdgpu_device *adev = drm_to_adev(dev); 214 215 return amdgpu_asic_supports_baco(adev); 216 } 217 218 /** 219 * amdgpu_device_supports_smart_shift - Is the device dGPU with 220 * smart shift support 221 * 222 * @dev: drm_device pointer 223 * 224 * Returns true if the device is a dGPU with Smart Shift support, 225 * otherwise returns false. 
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram through the vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes that have been transferred.
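 *
 * Note that this helper only covers the CPU-visible part of VRAM; callers
 * normally pair it with amdgpu_device_mm_access() for whatever is left, as
 * amdgpu_device_vram_access() below does.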
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure the HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe to the device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure the HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
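 *
 * Most code does not call this helper directly; the RREG32()/RREG32_NO_KIQ()
 * wrappers from amdgpu.h are the usual entry points, with the NO_KIQ variant
 * passing AMDGPU_REGS_NO_KIQ in @acc_flags so the KIQ/SR-IOV path in this
 * function is skipped.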
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with byte offset helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with byte offset helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
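 *
 * As with the read side, the WREG32()/WREG32_NO_KIQ() macros in amdgpu.h are
 * the usual entry points into this helper.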
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write a register either with direct/indirect mmio or with the RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * This function is invoked only for debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if (adev->nbio.funcs->get_pcie_index_hi_offset)
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if (adev->nbio.funcs->get_pcie_index_hi_offset)
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	int ret;

	amdgpu_asic_pre_asic_init(adev);

	if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) ||
	    adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
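 *
 * The array is consumed as {offset, and_mask, or_mask} triplets. As a purely
 * illustrative entry, { mmFOO, 0xffffffff, 0x00000123 } (mmFOO being a
 * hypothetical register) simply forces the register to 0x123, since a full
 * AND mask skips the read-modify-write path below.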
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
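 *
 * Sketch of the usual pattern (illustrative only, error handling omitted;
 * "index" is just a local name here, not a field used elsewhere):
 *
 *   u32 index;
 *
 *   amdgpu_device_wb_get(adev, &index);
 *   wb_gpu_addr = adev->wb.gpu_addr + index * 4;   (address the GPU writes to)
 *   value = adev->wb.wb[index];                    (CPU-side view of the slot)
 *   ...
 *   amdgpu_device_wb_free(adev, index);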
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned int i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return false;

	return true;
}

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or if a post is needed because a hw reset was performed.
 * Returns true if a post is needed, false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (!amdgpu_device_read_bios(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old SMC firmware still needs the driver to do a vPost, otherwise
		 * the GPU hangs. SMC firmware versions above 22.15 don't have this flaw,
		 * so force a vPost for SMC versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if an error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->gmc.xgmi.pending_reset)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * On APUs with >= 64GB white flickering has been observed w/ SG enabled.
 * Disable S/G on such systems until we have a proper fix.
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2354
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2735
 */
bool amdgpu_sg_display_supported(struct amdgpu_device *adev)
{
	switch (amdgpu_sg_display) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	if ((totalram_pages() << (PAGE_SHIFT - 10)) +
	    (adev->gmc.real_vram_size / 1024) >= 64000000) {
		DRM_WARN("Disabling S/G due to >=64GB RAM\n");
		return false;
	}
	return true;
}

/*
 * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic
 * speed switching. Until we have confirmation from Intel that a specific host
 * supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
bool amdgpu_device_pcie_dynamic_switching_supported(void)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be enabled for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	return pcie_aspm_enabled(adev->pdev);
}

bool amdgpu_device_aspm_support_quirk(void)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE);
#else
	return true;
#endif
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
		bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in the page table versus the
 * page directory; a page is 4KB so we have 12 bits of offset, a minimum of
 * 9 bits in the page table and the remaining bits in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory
	 */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
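 *
 * For example, booting with amdgpu.vm_size=256 requests a 256 GB GPU virtual
 * address space per VM; anything below 1 GB is rejected below and the value
 * falls back to the default (-1).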
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if ((adev->pdev->device == 0x13FE) ||
		    (adev->pdev->device == 0x143F))
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		break;
	}

	return 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
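 *
 * Out-of-range values are generally corrected rather than rejected; for
 * instance, amdgpu.sched_jobs=6 is rounded up to the next power of two (8)
 * by the code below, while values below 4 are raised to 4.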
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
		dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
		amdgpu_reset_method = -1;
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	return 0;
}

/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asic before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(pdev, PCI_D0);
		amdgpu_device_load_pci_state(pdev);
		r = pci_enable_device(pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
	} else {
		pr_info("switched off\n");
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(pdev);
		/* Shut down the device */
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
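 *
 * Illustrative call only (the real call sites live elsewhere in the driver):
 * a power-management path that wants to gate VCN when idle would do something
 * like
 *
 *   amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 *                                          AMD_PG_STATE_GATE);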
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u64 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
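 *
 * Illustrative use together with the version helper below:
 *
 *   if (!amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX, 9, 0))
 *       (the asic has GFX IP v9.0 or newer)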
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if equal or greater
 * return 1 if smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	switch (ip_block_version->type) {
	case AMD_IP_BLOCK_TYPE_VCN:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
			return 0;
		break;
	case AMD_IP_BLOCK_TYPE_JPEG:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
			return 0;
		break;
	default:
		break;
	}

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
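 *
 * The string is a semicolon-separated list of "pci_address,crtc_count"
 * entries, with "all" matching any device. Purely as an illustration
 * (the addresses here are made up):
 *
 *   amdgpu.virtual_display=0000:03:00.0,2
 *   amdgpu.virtual_display=all,1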
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		const char *pci_address_name = pci_name(adev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}

void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
		adev->mode_info.num_crtc = 1;
		adev->enable_virtual_display = true;
		DRM_INFO("virtual_display:%d, num_crtc:%d\n",
			 adev->enable_virtual_display, adev->mode_info.num_crtc);
	}
}

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	char fw_name[40];
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	if (adev->mman.discovery_bin) {
		/*
		 * FIXME: The bounding box is still needed by Navi12, so
		 * temporarily read it from gpu_info firmware. Should be dropped
		 * when DAL no longer needs it.
1929 */ 1930 if (adev->asic_type != CHIP_NAVI12) 1931 return 0; 1932 } 1933 1934 switch (adev->asic_type) { 1935 default: 1936 return 0; 1937 case CHIP_VEGA10: 1938 chip_name = "vega10"; 1939 break; 1940 case CHIP_VEGA12: 1941 chip_name = "vega12"; 1942 break; 1943 case CHIP_RAVEN: 1944 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1945 chip_name = "raven2"; 1946 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1947 chip_name = "picasso"; 1948 else 1949 chip_name = "raven"; 1950 break; 1951 case CHIP_ARCTURUS: 1952 chip_name = "arcturus"; 1953 break; 1954 case CHIP_NAVI12: 1955 chip_name = "navi12"; 1956 break; 1957 } 1958 1959 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 1960 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name); 1961 if (err) { 1962 dev_err(adev->dev, 1963 "Failed to get gpu_info firmware \"%s\"\n", 1964 fw_name); 1965 goto out; 1966 } 1967 1968 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 1969 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 1970 1971 switch (hdr->version_major) { 1972 case 1: 1973 { 1974 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 1975 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 1976 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1977 1978 /* 1979 * Should be droped when DAL no longer needs it. 1980 */ 1981 if (adev->asic_type == CHIP_NAVI12) 1982 goto parse_soc_bounding_box; 1983 1984 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 1985 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 1986 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 1987 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 1988 adev->gfx.config.max_texture_channel_caches = 1989 le32_to_cpu(gpu_info_fw->gc_num_tccs); 1990 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 1991 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 1992 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 1993 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 1994 adev->gfx.config.double_offchip_lds_buf = 1995 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 1996 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 1997 adev->gfx.cu_info.max_waves_per_simd = 1998 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 1999 adev->gfx.cu_info.max_scratch_slots_per_cu = 2000 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2001 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2002 if (hdr->version_minor >= 1) { 2003 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2004 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2005 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2006 adev->gfx.config.num_sc_per_sh = 2007 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2008 adev->gfx.config.num_packer_per_sc = 2009 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2010 } 2011 2012 parse_soc_bounding_box: 2013 /* 2014 * soc bounding box info is not integrated in disocovery table, 2015 * we always need to parse it from gpu info firmware if needed. 
2016 */ 2017 if (hdr->version_minor == 2) { 2018 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2019 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2020 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2021 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2022 } 2023 break; 2024 } 2025 default: 2026 dev_err(adev->dev, 2027 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2028 err = -EINVAL; 2029 goto out; 2030 } 2031 out: 2032 return err; 2033 } 2034 2035 /** 2036 * amdgpu_device_ip_early_init - run early init for hardware IPs 2037 * 2038 * @adev: amdgpu_device pointer 2039 * 2040 * Early initialization pass for hardware IPs. The hardware IPs that make 2041 * up each asic are discovered each IP's early_init callback is run. This 2042 * is the first stage in initializing the asic. 2043 * Returns 0 on success, negative error code on failure. 2044 */ 2045 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2046 { 2047 struct drm_device *dev = adev_to_drm(adev); 2048 struct pci_dev *parent; 2049 int i, r; 2050 bool total; 2051 2052 amdgpu_device_enable_virtual_display(adev); 2053 2054 if (amdgpu_sriov_vf(adev)) { 2055 r = amdgpu_virt_request_full_gpu(adev, true); 2056 if (r) 2057 return r; 2058 } 2059 2060 switch (adev->asic_type) { 2061 #ifdef CONFIG_DRM_AMDGPU_SI 2062 case CHIP_VERDE: 2063 case CHIP_TAHITI: 2064 case CHIP_PITCAIRN: 2065 case CHIP_OLAND: 2066 case CHIP_HAINAN: 2067 adev->family = AMDGPU_FAMILY_SI; 2068 r = si_set_ip_blocks(adev); 2069 if (r) 2070 return r; 2071 break; 2072 #endif 2073 #ifdef CONFIG_DRM_AMDGPU_CIK 2074 case CHIP_BONAIRE: 2075 case CHIP_HAWAII: 2076 case CHIP_KAVERI: 2077 case CHIP_KABINI: 2078 case CHIP_MULLINS: 2079 if (adev->flags & AMD_IS_APU) 2080 adev->family = AMDGPU_FAMILY_KV; 2081 else 2082 adev->family = AMDGPU_FAMILY_CI; 2083 2084 r = cik_set_ip_blocks(adev); 2085 if (r) 2086 return r; 2087 break; 2088 #endif 2089 case CHIP_TOPAZ: 2090 case CHIP_TONGA: 2091 case CHIP_FIJI: 2092 case CHIP_POLARIS10: 2093 case CHIP_POLARIS11: 2094 case CHIP_POLARIS12: 2095 case CHIP_VEGAM: 2096 case CHIP_CARRIZO: 2097 case CHIP_STONEY: 2098 if (adev->flags & AMD_IS_APU) 2099 adev->family = AMDGPU_FAMILY_CZ; 2100 else 2101 adev->family = AMDGPU_FAMILY_VI; 2102 2103 r = vi_set_ip_blocks(adev); 2104 if (r) 2105 return r; 2106 break; 2107 default: 2108 r = amdgpu_discovery_set_ip_blocks(adev); 2109 if (r) 2110 return r; 2111 break; 2112 } 2113 2114 if (amdgpu_has_atpx() && 2115 (amdgpu_is_atpx_hybrid() || 2116 amdgpu_has_atpx_dgpu_power_cntl()) && 2117 ((adev->flags & AMD_IS_APU) == 0) && 2118 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev))) 2119 adev->flags |= AMD_IS_PX; 2120 2121 if (!(adev->flags & AMD_IS_APU)) { 2122 parent = pci_upstream_bridge(adev->pdev); 2123 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2124 } 2125 2126 2127 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2128 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2129 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2130 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2131 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2132 2133 total = true; 2134 for (i = 0; i < adev->num_ip_blocks; i++) { 2135 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2136 DRM_WARN("disabled ip block: %d <%s>\n", 2137 i, adev->ip_blocks[i].version->funcs->name); 2138 adev->ip_blocks[i].status.valid = false; 2139 } else { 2140 if (adev->ip_blocks[i].version->funcs->early_init) { 2141 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2142 if (r == -ENOENT) { 2143 adev->ip_blocks[i].status.valid = false; 2144 } else if (r) { 2145 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2146 adev->ip_blocks[i].version->funcs->name, r); 2147 total = false; 2148 } else { 2149 adev->ip_blocks[i].status.valid = true; 2150 } 2151 } else { 2152 adev->ip_blocks[i].status.valid = true; 2153 } 2154 } 2155 /* get the vbios after the asic_funcs are set up */ 2156 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2157 r = amdgpu_device_parse_gpu_info_fw(adev); 2158 if (r) 2159 return r; 2160 2161 /* Read BIOS */ 2162 if (amdgpu_device_read_bios(adev)) { 2163 if (!amdgpu_get_bios(adev)) 2164 return -EINVAL; 2165 2166 r = amdgpu_atombios_init(adev); 2167 if (r) { 2168 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2169 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2170 return r; 2171 } 2172 } 2173 2174 /*get pf2vf msg info at it's earliest time*/ 2175 if (amdgpu_sriov_vf(adev)) 2176 amdgpu_virt_init_data_exchange(adev); 2177 2178 } 2179 } 2180 if (!total) 2181 return -ENODEV; 2182 2183 amdgpu_amdkfd_device_probe(adev); 2184 adev->cg_flags &= amdgpu_cg_mask; 2185 adev->pg_flags &= amdgpu_pg_mask; 2186 2187 return 0; 2188 } 2189 2190 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2191 { 2192 int i, r; 2193 2194 for (i = 0; i < adev->num_ip_blocks; i++) { 2195 if (!adev->ip_blocks[i].status.sw) 2196 continue; 2197 if (adev->ip_blocks[i].status.hw) 2198 continue; 2199 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2200 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2201 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2202 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2203 if (r) { 2204 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2205 adev->ip_blocks[i].version->funcs->name, r); 2206 return r; 2207 } 2208 adev->ip_blocks[i].status.hw = true; 2209 } 2210 } 2211 2212 return 0; 2213 } 2214 2215 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2216 { 2217 int i, r; 2218 2219 for (i = 0; i < adev->num_ip_blocks; i++) { 2220 if (!adev->ip_blocks[i].status.sw) 2221 continue; 2222 if (adev->ip_blocks[i].status.hw) 2223 continue; 2224 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2225 if (r) { 2226 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2227 adev->ip_blocks[i].version->funcs->name, r); 2228 return r; 2229 } 2230 adev->ip_blocks[i].status.hw = true; 2231 } 2232 2233 return 0; 2234 } 2235 2236 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2237 { 2238 int r = 0; 2239 int i; 2240 uint32_t smu_version; 2241 2242 if (adev->asic_type >= CHIP_VEGA10) { 2243 for (i = 0; i < adev->num_ip_blocks; i++) { 2244 if 
(adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2245 continue; 2246 2247 if (!adev->ip_blocks[i].status.sw) 2248 continue; 2249 2250 /* no need to do the fw loading again if already done*/ 2251 if (adev->ip_blocks[i].status.hw == true) 2252 break; 2253 2254 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2255 r = adev->ip_blocks[i].version->funcs->resume(adev); 2256 if (r) { 2257 DRM_ERROR("resume of IP block <%s> failed %d\n", 2258 adev->ip_blocks[i].version->funcs->name, r); 2259 return r; 2260 } 2261 } else { 2262 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2263 if (r) { 2264 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2265 adev->ip_blocks[i].version->funcs->name, r); 2266 return r; 2267 } 2268 } 2269 2270 adev->ip_blocks[i].status.hw = true; 2271 break; 2272 } 2273 } 2274 2275 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2276 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2277 2278 return r; 2279 } 2280 2281 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2282 { 2283 long timeout; 2284 int r, i; 2285 2286 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2287 struct amdgpu_ring *ring = adev->rings[i]; 2288 2289 /* No need to setup the GPU scheduler for rings that don't need it */ 2290 if (!ring || ring->no_scheduler) 2291 continue; 2292 2293 switch (ring->funcs->type) { 2294 case AMDGPU_RING_TYPE_GFX: 2295 timeout = adev->gfx_timeout; 2296 break; 2297 case AMDGPU_RING_TYPE_COMPUTE: 2298 timeout = adev->compute_timeout; 2299 break; 2300 case AMDGPU_RING_TYPE_SDMA: 2301 timeout = adev->sdma_timeout; 2302 break; 2303 default: 2304 timeout = adev->video_timeout; 2305 break; 2306 } 2307 2308 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2309 ring->num_hw_submission, 0, 2310 timeout, adev->reset_domain->wq, 2311 ring->sched_score, ring->name, 2312 adev->dev); 2313 if (r) { 2314 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2315 ring->name); 2316 return r; 2317 } 2318 } 2319 2320 amdgpu_xcp_update_partition_sched_list(adev); 2321 2322 return 0; 2323 } 2324 2325 2326 /** 2327 * amdgpu_device_ip_init - run init for hardware IPs 2328 * 2329 * @adev: amdgpu_device pointer 2330 * 2331 * Main initialization pass for hardware IPs. The list of all the hardware 2332 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2333 * are run. sw_init initializes the software state associated with each IP 2334 * and hw_init initializes the hardware associated with each IP. 2335 * Returns 0 on success, negative error code on failure. 
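 *
 * For orientation, each entry walked here is a struct amdgpu_ip_block_version
 * whose funcs table supplies the callbacks invoked by this pass. A minimal,
 * purely illustrative block (all "foo" names are invented, and a real block
 * also provides the fini/suspend/resume callbacks used by the teardown and
 * power paths) might be wired up roughly like:
 *
 *   static int foo_sw_init(void *handle) { return 0; }
 *   static int foo_hw_init(void *handle) { return 0; }
 *
 *   static const struct amd_ip_funcs foo_ip_funcs = {
 *           .name    = "foo",
 *           .sw_init = foo_sw_init,
 *           .hw_init = foo_hw_init,
 *   };
 *
 *   static const struct amdgpu_ip_block_version foo_ip_block = {
 *           .type  = AMD_IP_BLOCK_TYPE_COMMON,
 *           .major = 1,
 *           .minor = 0,
 *           .rev   = 0,
 *           .funcs = &foo_ip_funcs,
 *   };
 *
 * and registered with amdgpu_device_ip_block_add() during early init.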
2336 */ 2337 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2338 { 2339 int i, r; 2340 2341 r = amdgpu_ras_init(adev); 2342 if (r) 2343 return r; 2344 2345 for (i = 0; i < adev->num_ip_blocks; i++) { 2346 if (!adev->ip_blocks[i].status.valid) 2347 continue; 2348 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2349 if (r) { 2350 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2351 adev->ip_blocks[i].version->funcs->name, r); 2352 goto init_failed; 2353 } 2354 adev->ip_blocks[i].status.sw = true; 2355 2356 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2357 /* need to do common hw init early so everything is set up for gmc */ 2358 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2359 if (r) { 2360 DRM_ERROR("hw_init %d failed %d\n", i, r); 2361 goto init_failed; 2362 } 2363 adev->ip_blocks[i].status.hw = true; 2364 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2365 /* need to do gmc hw init early so we can allocate gpu mem */ 2366 /* Try to reserve bad pages early */ 2367 if (amdgpu_sriov_vf(adev)) 2368 amdgpu_virt_exchange_data(adev); 2369 2370 r = amdgpu_device_mem_scratch_init(adev); 2371 if (r) { 2372 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2373 goto init_failed; 2374 } 2375 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2376 if (r) { 2377 DRM_ERROR("hw_init %d failed %d\n", i, r); 2378 goto init_failed; 2379 } 2380 r = amdgpu_device_wb_init(adev); 2381 if (r) { 2382 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2383 goto init_failed; 2384 } 2385 adev->ip_blocks[i].status.hw = true; 2386 2387 /* right after GMC hw init, we create CSA */ 2388 if (adev->gfx.mcbp) { 2389 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2390 AMDGPU_GEM_DOMAIN_VRAM | 2391 AMDGPU_GEM_DOMAIN_GTT, 2392 AMDGPU_CSA_SIZE); 2393 if (r) { 2394 DRM_ERROR("allocate CSA failed %d\n", r); 2395 goto init_failed; 2396 } 2397 } 2398 } 2399 } 2400 2401 if (amdgpu_sriov_vf(adev)) 2402 amdgpu_virt_init_data_exchange(adev); 2403 2404 r = amdgpu_ib_pool_init(adev); 2405 if (r) { 2406 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2407 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2408 goto init_failed; 2409 } 2410 2411 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2412 if (r) 2413 goto init_failed; 2414 2415 r = amdgpu_device_ip_hw_init_phase1(adev); 2416 if (r) 2417 goto init_failed; 2418 2419 r = amdgpu_device_fw_loading(adev); 2420 if (r) 2421 goto init_failed; 2422 2423 r = amdgpu_device_ip_hw_init_phase2(adev); 2424 if (r) 2425 goto init_failed; 2426 2427 /* 2428 * retired pages will be loaded from eeprom and reserved here, 2429 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2430 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2431 * for I2C communication which only true at this point. 2432 * 2433 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2434 * failure from bad gpu situation and stop amdgpu init process 2435 * accordingly. For other failed cases, it will still release all 2436 * the resource and print error message, rather than returning one 2437 * negative value to upper level. 
2438 * 2439 * Note: theoretically, this should be called before all vram allocations 2440 * to protect retired page from abusing 2441 */ 2442 r = amdgpu_ras_recovery_init(adev); 2443 if (r) 2444 goto init_failed; 2445 2446 /** 2447 * In case of XGMI grab extra reference for reset domain for this device 2448 */ 2449 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2450 if (amdgpu_xgmi_add_device(adev) == 0) { 2451 if (!amdgpu_sriov_vf(adev)) { 2452 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2453 2454 if (WARN_ON(!hive)) { 2455 r = -ENOENT; 2456 goto init_failed; 2457 } 2458 2459 if (!hive->reset_domain || 2460 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2461 r = -ENOENT; 2462 amdgpu_put_xgmi_hive(hive); 2463 goto init_failed; 2464 } 2465 2466 /* Drop the early temporary reset domain we created for device */ 2467 amdgpu_reset_put_reset_domain(adev->reset_domain); 2468 adev->reset_domain = hive->reset_domain; 2469 amdgpu_put_xgmi_hive(hive); 2470 } 2471 } 2472 } 2473 2474 r = amdgpu_device_init_schedulers(adev); 2475 if (r) 2476 goto init_failed; 2477 2478 /* Don't init kfd if whole hive need to be reset during init */ 2479 if (!adev->gmc.xgmi.pending_reset) { 2480 kgd2kfd_init_zone_device(adev); 2481 amdgpu_amdkfd_device_init(adev); 2482 } 2483 2484 amdgpu_fru_get_product_info(adev); 2485 2486 init_failed: 2487 2488 return r; 2489 } 2490 2491 /** 2492 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2493 * 2494 * @adev: amdgpu_device pointer 2495 * 2496 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2497 * this function before a GPU reset. If the value is retained after a 2498 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 2499 */ 2500 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2501 { 2502 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2503 } 2504 2505 /** 2506 * amdgpu_device_check_vram_lost - check if vram is valid 2507 * 2508 * @adev: amdgpu_device pointer 2509 * 2510 * Checks the reset magic value written to the gart pointer in VRAM. 2511 * The driver calls this after a GPU reset to see if the contents of 2512 * VRAM is lost or now. 2513 * returns true if vram is lost, false if not. 2514 */ 2515 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2516 { 2517 if (memcmp(adev->gart.ptr, adev->reset_magic, 2518 AMDGPU_RESET_MAGIC_NUM)) 2519 return true; 2520 2521 if (!amdgpu_in_reset(adev)) 2522 return false; 2523 2524 /* 2525 * For all ASICs with baco/mode1 reset, the VRAM is 2526 * always assumed to be lost. 2527 */ 2528 switch (amdgpu_asic_reset_method(adev)) { 2529 case AMD_RESET_METHOD_BACO: 2530 case AMD_RESET_METHOD_MODE1: 2531 return true; 2532 default: 2533 return false; 2534 } 2535 } 2536 2537 /** 2538 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2539 * 2540 * @adev: amdgpu_device pointer 2541 * @state: clockgating state (gate or ungate) 2542 * 2543 * The list of all the hardware IPs that make up the asic is walked and the 2544 * set_clockgating_state callbacks are run. 2545 * Late initialization pass enabling clockgating for hardware IPs. 2546 * Fini or suspend, pass disabling clockgating for hardware IPs. 2547 * Returns 0 on success, negative error code on failure. 
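 *
 * Sketch of how this is used elsewhere in this file: late init gates,
 * while the fini/suspend paths ungate first so the hardware is in a known
 * state before being torn down:
 *
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);   (late init)
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); (fini/suspend)
 *
 * Note that the loop below also reverses the walk direction for the ungate
 * case, so clockgating is disabled in roughly the opposite order in which
 * it was enabled.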
2548 */ 2549 2550 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2551 enum amd_clockgating_state state) 2552 { 2553 int i, j, r; 2554 2555 if (amdgpu_emu_mode == 1) 2556 return 0; 2557 2558 for (j = 0; j < adev->num_ip_blocks; j++) { 2559 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2560 if (!adev->ip_blocks[i].status.late_initialized) 2561 continue; 2562 /* skip CG for GFX, SDMA on S0ix */ 2563 if (adev->in_s0ix && 2564 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2565 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2566 continue; 2567 /* skip CG for VCE/UVD, it's handled specially */ 2568 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2569 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2570 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2571 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2572 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2573 /* enable clockgating to save power */ 2574 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2575 state); 2576 if (r) { 2577 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2578 adev->ip_blocks[i].version->funcs->name, r); 2579 return r; 2580 } 2581 } 2582 } 2583 2584 return 0; 2585 } 2586 2587 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2588 enum amd_powergating_state state) 2589 { 2590 int i, j, r; 2591 2592 if (amdgpu_emu_mode == 1) 2593 return 0; 2594 2595 for (j = 0; j < adev->num_ip_blocks; j++) { 2596 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2597 if (!adev->ip_blocks[i].status.late_initialized) 2598 continue; 2599 /* skip PG for GFX, SDMA on S0ix */ 2600 if (adev->in_s0ix && 2601 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2602 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2603 continue; 2604 /* skip CG for VCE/UVD, it's handled specially */ 2605 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2606 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2607 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2608 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2609 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2610 /* enable powergating to save power */ 2611 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2612 state); 2613 if (r) { 2614 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2615 adev->ip_blocks[i].version->funcs->name, r); 2616 return r; 2617 } 2618 } 2619 } 2620 return 0; 2621 } 2622 2623 static int amdgpu_device_enable_mgpu_fan_boost(void) 2624 { 2625 struct amdgpu_gpu_instance *gpu_ins; 2626 struct amdgpu_device *adev; 2627 int i, ret = 0; 2628 2629 mutex_lock(&mgpu_info.mutex); 2630 2631 /* 2632 * MGPU fan boost feature should be enabled 2633 * only when there are two or more dGPUs in 2634 * the system 2635 */ 2636 if (mgpu_info.num_dgpu < 2) 2637 goto out; 2638 2639 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2640 gpu_ins = &(mgpu_info.gpu_ins[i]); 2641 adev = gpu_ins->adev; 2642 if (!(adev->flags & AMD_IS_APU) && 2643 !gpu_ins->mgpu_fan_enabled) { 2644 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2645 if (ret) 2646 break; 2647 2648 gpu_ins->mgpu_fan_enabled = 1; 2649 } 2650 } 2651 2652 out: 2653 mutex_unlock(&mgpu_info.mutex); 2654 2655 return ret; 2656 } 2657 2658 /** 2659 * amdgpu_device_ip_late_init - run late init for hardware IPs 2660 * 2661 * @adev: 
amdgpu_device pointer 2662 * 2663 * Late initialization pass for hardware IPs. The list of all the hardware 2664 * IPs that make up the asic is walked and the late_init callbacks are run. 2665 * late_init covers any special initialization that an IP requires 2666 * after all of the IPs have been initialized or something that needs to happen 2667 * late in the init process. 2668 * Returns 0 on success, negative error code on failure. 2669 */ 2670 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2671 { 2672 struct amdgpu_gpu_instance *gpu_instance; 2673 int i = 0, r; 2674 2675 for (i = 0; i < adev->num_ip_blocks; i++) { 2676 if (!adev->ip_blocks[i].status.hw) 2677 continue; 2678 if (adev->ip_blocks[i].version->funcs->late_init) { 2679 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2680 if (r) { 2681 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2682 adev->ip_blocks[i].version->funcs->name, r); 2683 return r; 2684 } 2685 } 2686 adev->ip_blocks[i].status.late_initialized = true; 2687 } 2688 2689 r = amdgpu_ras_late_init(adev); 2690 if (r) { 2691 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 2692 return r; 2693 } 2694 2695 amdgpu_ras_set_error_query_ready(adev, true); 2696 2697 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2698 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2699 2700 amdgpu_device_fill_reset_magic(adev); 2701 2702 r = amdgpu_device_enable_mgpu_fan_boost(); 2703 if (r) 2704 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2705 2706 /* For passthrough configuration on arcturus and aldebaran, enable special handling of SBR */ 2707 if (amdgpu_passthrough(adev) && 2708 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 2709 adev->asic_type == CHIP_ALDEBARAN)) 2710 amdgpu_dpm_handle_passthrough_sbr(adev, true); 2711 2712 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2713 mutex_lock(&mgpu_info.mutex); 2714 2715 /* 2716 * Reset device p-state to low as this was booted with high. 2717 * 2718 * This should be performed only after all devices from the same 2719 * hive get initialized. 2720 * 2721 * However, the number of devices in the hive is not known in advance; 2722 * it is counted one by one as each device is initialized. 2723 * 2724 * So, we wait until all XGMI interlinked devices are initialized. 2725 * This may bring some delays as those devices may come from 2726 * different hives. But that should be OK.
2727 */ 2728 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2729 for (i = 0; i < mgpu_info.num_gpu; i++) { 2730 gpu_instance = &(mgpu_info.gpu_ins[i]); 2731 if (gpu_instance->adev->flags & AMD_IS_APU) 2732 continue; 2733 2734 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2735 AMDGPU_XGMI_PSTATE_MIN); 2736 if (r) { 2737 DRM_ERROR("pstate setting failed (%d).\n", r); 2738 break; 2739 } 2740 } 2741 } 2742 2743 mutex_unlock(&mgpu_info.mutex); 2744 } 2745 2746 return 0; 2747 } 2748 2749 /** 2750 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2751 * 2752 * @adev: amdgpu_device pointer 2753 * 2754 * For ASICs need to disable SMC first 2755 */ 2756 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2757 { 2758 int i, r; 2759 2760 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) 2761 return; 2762 2763 for (i = 0; i < adev->num_ip_blocks; i++) { 2764 if (!adev->ip_blocks[i].status.hw) 2765 continue; 2766 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2767 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2768 /* XXX handle errors */ 2769 if (r) { 2770 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2771 adev->ip_blocks[i].version->funcs->name, r); 2772 } 2773 adev->ip_blocks[i].status.hw = false; 2774 break; 2775 } 2776 } 2777 } 2778 2779 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2780 { 2781 int i, r; 2782 2783 for (i = 0; i < adev->num_ip_blocks; i++) { 2784 if (!adev->ip_blocks[i].version->funcs->early_fini) 2785 continue; 2786 2787 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2788 if (r) { 2789 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2790 adev->ip_blocks[i].version->funcs->name, r); 2791 } 2792 } 2793 2794 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2795 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2796 2797 amdgpu_amdkfd_suspend(adev, false); 2798 2799 /* Workaroud for ASICs need to disable SMC first */ 2800 amdgpu_device_smu_fini_early(adev); 2801 2802 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2803 if (!adev->ip_blocks[i].status.hw) 2804 continue; 2805 2806 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2807 /* XXX handle errors */ 2808 if (r) { 2809 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2810 adev->ip_blocks[i].version->funcs->name, r); 2811 } 2812 2813 adev->ip_blocks[i].status.hw = false; 2814 } 2815 2816 if (amdgpu_sriov_vf(adev)) { 2817 if (amdgpu_virt_release_full_gpu(adev, false)) 2818 DRM_ERROR("failed to release exclusive mode on fini\n"); 2819 } 2820 2821 return 0; 2822 } 2823 2824 /** 2825 * amdgpu_device_ip_fini - run fini for hardware IPs 2826 * 2827 * @adev: amdgpu_device pointer 2828 * 2829 * Main teardown pass for hardware IPs. The list of all the hardware 2830 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2831 * are run. hw_fini tears down the hardware associated with each IP 2832 * and sw_fini tears down any software state associated with each IP. 2833 * Returns 0 on success, negative error code on failure. 
2834 */ 2835 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2836 { 2837 int i, r; 2838 2839 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2840 amdgpu_virt_release_ras_err_handler_data(adev); 2841 2842 if (adev->gmc.xgmi.num_physical_nodes > 1) 2843 amdgpu_xgmi_remove_device(adev); 2844 2845 amdgpu_amdkfd_device_fini_sw(adev); 2846 2847 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2848 if (!adev->ip_blocks[i].status.sw) 2849 continue; 2850 2851 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2852 amdgpu_ucode_free_bo(adev); 2853 amdgpu_free_static_csa(&adev->virt.csa_obj); 2854 amdgpu_device_wb_fini(adev); 2855 amdgpu_device_mem_scratch_fini(adev); 2856 amdgpu_ib_pool_fini(adev); 2857 } 2858 2859 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2860 /* XXX handle errors */ 2861 if (r) { 2862 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2863 adev->ip_blocks[i].version->funcs->name, r); 2864 } 2865 adev->ip_blocks[i].status.sw = false; 2866 adev->ip_blocks[i].status.valid = false; 2867 } 2868 2869 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2870 if (!adev->ip_blocks[i].status.late_initialized) 2871 continue; 2872 if (adev->ip_blocks[i].version->funcs->late_fini) 2873 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2874 adev->ip_blocks[i].status.late_initialized = false; 2875 } 2876 2877 amdgpu_ras_fini(adev); 2878 2879 return 0; 2880 } 2881 2882 /** 2883 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2884 * 2885 * @work: work_struct. 2886 */ 2887 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2888 { 2889 struct amdgpu_device *adev = 2890 container_of(work, struct amdgpu_device, delayed_init_work.work); 2891 int r; 2892 2893 r = amdgpu_ib_ring_tests(adev); 2894 if (r) 2895 DRM_ERROR("ib ring test failed (%d).\n", r); 2896 } 2897 2898 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2899 { 2900 struct amdgpu_device *adev = 2901 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2902 2903 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2904 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2905 2906 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2907 adev->gfx.gfx_off_state = true; 2908 } 2909 2910 /** 2911 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2912 * 2913 * @adev: amdgpu_device pointer 2914 * 2915 * Main suspend function for hardware IPs. The list of all the hardware 2916 * IPs that make up the asic is walked, clockgating is disabled and the 2917 * suspend callbacks are run. suspend puts the hardware and software state 2918 * in each IP into a state suitable for suspend. 2919 * Returns 0 on success, negative error code on failure. 2920 */ 2921 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2922 { 2923 int i, r; 2924 2925 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2926 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2927 2928 /* 2929 * Per PMFW team's suggestion, driver needs to handle gfxoff 2930 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 2931 * scenario. Add the missing df cstate disablement here. 
2932 */ 2933 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 2934 dev_warn(adev->dev, "Failed to disallow df cstate"); 2935 2936 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2937 if (!adev->ip_blocks[i].status.valid) 2938 continue; 2939 2940 /* displays are handled separately */ 2941 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2942 continue; 2943 2944 /* XXX handle errors */ 2945 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2946 /* XXX handle errors */ 2947 if (r) { 2948 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2949 adev->ip_blocks[i].version->funcs->name, r); 2950 return r; 2951 } 2952 2953 adev->ip_blocks[i].status.hw = false; 2954 } 2955 2956 return 0; 2957 } 2958 2959 /** 2960 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2961 * 2962 * @adev: amdgpu_device pointer 2963 * 2964 * Main suspend function for hardware IPs. The list of all the hardware 2965 * IPs that make up the asic is walked, clockgating is disabled and the 2966 * suspend callbacks are run. suspend puts the hardware and software state 2967 * in each IP into a state suitable for suspend. 2968 * Returns 0 on success, negative error code on failure. 2969 */ 2970 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2971 { 2972 int i, r; 2973 2974 if (adev->in_s0ix) 2975 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 2976 2977 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2978 if (!adev->ip_blocks[i].status.valid) 2979 continue; 2980 /* displays are handled in phase1 */ 2981 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2982 continue; 2983 /* PSP lost connection when err_event_athub occurs */ 2984 if (amdgpu_ras_intr_triggered() && 2985 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2986 adev->ip_blocks[i].status.hw = false; 2987 continue; 2988 } 2989 2990 /* skip unnecessary suspend if we do not initialize them yet */ 2991 if (adev->gmc.xgmi.pending_reset && 2992 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2993 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 2994 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2995 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 2996 adev->ip_blocks[i].status.hw = false; 2997 continue; 2998 } 2999 3000 /* skip suspend of gfx/mes and psp for S0ix 3001 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3002 * like at runtime. PSP is also part of the always on hardware 3003 * so no need to suspend it. 3004 */ 3005 if (adev->in_s0ix && 3006 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3007 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3008 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3009 continue; 3010 3011 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3012 if (adev->in_s0ix && 3013 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) && 3014 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3015 continue; 3016 3017 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3018 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3019 * from this location and RLC Autoload automatically also gets loaded 3020 * from here based on PMFW -> PSP message during re-init sequence. 3021 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3022 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3023 */ 3024 if (amdgpu_in_reset(adev) && 3025 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3026 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3027 continue; 3028 3029 /* XXX handle errors */ 3030 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3031 /* XXX handle errors */ 3032 if (r) { 3033 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3034 adev->ip_blocks[i].version->funcs->name, r); 3035 } 3036 adev->ip_blocks[i].status.hw = false; 3037 /* handle putting the SMC in the appropriate state */ 3038 if (!amdgpu_sriov_vf(adev)) { 3039 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3040 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3041 if (r) { 3042 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3043 adev->mp1_state, r); 3044 return r; 3045 } 3046 } 3047 } 3048 } 3049 3050 return 0; 3051 } 3052 3053 /** 3054 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3055 * 3056 * @adev: amdgpu_device pointer 3057 * 3058 * Main suspend function for hardware IPs. The list of all the hardware 3059 * IPs that make up the asic is walked, clockgating is disabled and the 3060 * suspend callbacks are run. suspend puts the hardware and software state 3061 * in each IP into a state suitable for suspend. 3062 * Returns 0 on success, negative error code on failure. 3063 */ 3064 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3065 { 3066 int r; 3067 3068 if (amdgpu_sriov_vf(adev)) { 3069 amdgpu_virt_fini_data_exchange(adev); 3070 amdgpu_virt_request_full_gpu(adev, false); 3071 } 3072 3073 r = amdgpu_device_ip_suspend_phase1(adev); 3074 if (r) 3075 return r; 3076 r = amdgpu_device_ip_suspend_phase2(adev); 3077 3078 if (amdgpu_sriov_vf(adev)) 3079 amdgpu_virt_release_full_gpu(adev, false); 3080 3081 return r; 3082 } 3083 3084 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3085 { 3086 int i, r; 3087 3088 static enum amd_ip_block_type ip_order[] = { 3089 AMD_IP_BLOCK_TYPE_COMMON, 3090 AMD_IP_BLOCK_TYPE_GMC, 3091 AMD_IP_BLOCK_TYPE_PSP, 3092 AMD_IP_BLOCK_TYPE_IH, 3093 }; 3094 3095 for (i = 0; i < adev->num_ip_blocks; i++) { 3096 int j; 3097 struct amdgpu_ip_block *block; 3098 3099 block = &adev->ip_blocks[i]; 3100 block->status.hw = false; 3101 3102 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3103 3104 if (block->version->type != ip_order[j] || 3105 !block->status.valid) 3106 continue; 3107 3108 r = block->version->funcs->hw_init(adev); 3109 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3110 if (r) 3111 return r; 3112 block->status.hw = true; 3113 } 3114 } 3115 3116 return 0; 3117 } 3118 3119 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3120 { 3121 int i, r; 3122 3123 static enum amd_ip_block_type ip_order[] = { 3124 AMD_IP_BLOCK_TYPE_SMC, 3125 AMD_IP_BLOCK_TYPE_DCE, 3126 AMD_IP_BLOCK_TYPE_GFX, 3127 AMD_IP_BLOCK_TYPE_SDMA, 3128 AMD_IP_BLOCK_TYPE_MES, 3129 AMD_IP_BLOCK_TYPE_UVD, 3130 AMD_IP_BLOCK_TYPE_VCE, 3131 AMD_IP_BLOCK_TYPE_VCN, 3132 AMD_IP_BLOCK_TYPE_JPEG 3133 }; 3134 3135 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3136 int j; 3137 struct amdgpu_ip_block *block; 3138 3139 for (j = 0; j < adev->num_ip_blocks; j++) { 3140 block = &adev->ip_blocks[j]; 3141 3142 if (block->version->type != ip_order[i] || 3143 !block->status.valid || 3144 block->status.hw) 3145 continue; 3146 3147 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3148 r = block->version->funcs->resume(adev); 3149 else 3150 r = block->version->funcs->hw_init(adev); 3151 3152 
DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3153 if (r) 3154 return r; 3155 block->status.hw = true; 3156 } 3157 } 3158 3159 return 0; 3160 } 3161 3162 /** 3163 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3164 * 3165 * @adev: amdgpu_device pointer 3166 * 3167 * First resume function for hardware IPs. The list of all the hardware 3168 * IPs that make up the asic is walked and the resume callbacks are run for 3169 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3170 * after a suspend and updates the software state as necessary. This 3171 * function is also used for restoring the GPU after a GPU reset. 3172 * Returns 0 on success, negative error code on failure. 3173 */ 3174 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3175 { 3176 int i, r; 3177 3178 for (i = 0; i < adev->num_ip_blocks; i++) { 3179 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3180 continue; 3181 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3182 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3183 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3184 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3185 3186 r = adev->ip_blocks[i].version->funcs->resume(adev); 3187 if (r) { 3188 DRM_ERROR("resume of IP block <%s> failed %d\n", 3189 adev->ip_blocks[i].version->funcs->name, r); 3190 return r; 3191 } 3192 adev->ip_blocks[i].status.hw = true; 3193 } 3194 } 3195 3196 return 0; 3197 } 3198 3199 /** 3200 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3201 * 3202 * @adev: amdgpu_device pointer 3203 * 3204 * First resume function for hardware IPs. The list of all the hardware 3205 * IPs that make up the asic is walked and the resume callbacks are run for 3206 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3207 * functional state after a suspend and updates the software state as 3208 * necessary. This function is also used for restoring the GPU after a GPU 3209 * reset. 3210 * Returns 0 on success, negative error code on failure. 3211 */ 3212 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3213 { 3214 int i, r; 3215 3216 for (i = 0; i < adev->num_ip_blocks; i++) { 3217 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3218 continue; 3219 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3220 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3221 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3222 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3223 continue; 3224 r = adev->ip_blocks[i].version->funcs->resume(adev); 3225 if (r) { 3226 DRM_ERROR("resume of IP block <%s> failed %d\n", 3227 adev->ip_blocks[i].version->funcs->name, r); 3228 return r; 3229 } 3230 adev->ip_blocks[i].status.hw = true; 3231 } 3232 3233 return 0; 3234 } 3235 3236 /** 3237 * amdgpu_device_ip_resume - run resume for hardware IPs 3238 * 3239 * @adev: amdgpu_device pointer 3240 * 3241 * Main resume function for hardware IPs. The hardware IPs 3242 * are split into two resume functions because they are 3243 * also used in recovering from a GPU reset and some additional 3244 * steps need to be take between them. In this case (S3/S4) they are 3245 * run sequentially. 3246 * Returns 0 on success, negative error code on failure. 
3247 */ 3248 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3249 { 3250 int r; 3251 3252 r = amdgpu_device_ip_resume_phase1(adev); 3253 if (r) 3254 return r; 3255 3256 r = amdgpu_device_fw_loading(adev); 3257 if (r) 3258 return r; 3259 3260 r = amdgpu_device_ip_resume_phase2(adev); 3261 3262 return r; 3263 } 3264 3265 /** 3266 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3267 * 3268 * @adev: amdgpu_device pointer 3269 * 3270 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3271 */ 3272 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3273 { 3274 if (amdgpu_sriov_vf(adev)) { 3275 if (adev->is_atom_fw) { 3276 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3277 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3278 } else { 3279 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3280 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3281 } 3282 3283 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3284 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3285 } 3286 } 3287 3288 /** 3289 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3290 * 3291 * @asic_type: AMD asic type 3292 * 3293 * Check if there is DC (new modesetting infrastructre) support for an asic. 3294 * returns true if DC has support, false if not. 3295 */ 3296 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3297 { 3298 switch (asic_type) { 3299 #ifdef CONFIG_DRM_AMDGPU_SI 3300 case CHIP_HAINAN: 3301 #endif 3302 case CHIP_TOPAZ: 3303 /* chips with no display hardware */ 3304 return false; 3305 #if defined(CONFIG_DRM_AMD_DC) 3306 case CHIP_TAHITI: 3307 case CHIP_PITCAIRN: 3308 case CHIP_VERDE: 3309 case CHIP_OLAND: 3310 /* 3311 * We have systems in the wild with these ASICs that require 3312 * LVDS and VGA support which is not supported with DC. 3313 * 3314 * Fallback to the non-DC driver here by default so as not to 3315 * cause regressions. 3316 */ 3317 #if defined(CONFIG_DRM_AMD_DC_SI) 3318 return amdgpu_dc > 0; 3319 #else 3320 return false; 3321 #endif 3322 case CHIP_BONAIRE: 3323 case CHIP_KAVERI: 3324 case CHIP_KABINI: 3325 case CHIP_MULLINS: 3326 /* 3327 * We have systems in the wild with these ASICs that require 3328 * VGA support which is not supported with DC. 3329 * 3330 * Fallback to the non-DC driver here by default so as not to 3331 * cause regressions. 
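 *
 * Illustration (assuming the module parameter keeps its usual "auto"
 * default): these ASICs stay on the non-DC path unless the user explicitly
 * opts in, e.g. by booting with
 *
 *   amdgpu.dc=1
 *
 * on the kernel command line, which makes the "amdgpu_dc > 0" check below
 * select DC.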
3332 */ 3333 return amdgpu_dc > 0; 3334 default: 3335 return amdgpu_dc != 0; 3336 #else 3337 default: 3338 if (amdgpu_dc > 0) 3339 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3340 return false; 3341 #endif 3342 } 3343 } 3344 3345 /** 3346 * amdgpu_device_has_dc_support - check if dc is supported 3347 * 3348 * @adev: amdgpu_device pointer 3349 * 3350 * Returns true for supported, false for not supported 3351 */ 3352 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3353 { 3354 if (adev->enable_virtual_display || 3355 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3356 return false; 3357 3358 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3359 } 3360 3361 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3362 { 3363 struct amdgpu_device *adev = 3364 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3365 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3366 3367 /* It's a bug to not have a hive within this function */ 3368 if (WARN_ON(!hive)) 3369 return; 3370 3371 /* 3372 * Use task barrier to synchronize all xgmi reset works across the 3373 * hive. task_barrier_enter and task_barrier_exit will block 3374 * until all the threads running the xgmi reset works reach 3375 * those points. task_barrier_full will do both blocks. 3376 */ 3377 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3378 3379 task_barrier_enter(&hive->tb); 3380 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3381 3382 if (adev->asic_reset_res) 3383 goto fail; 3384 3385 task_barrier_exit(&hive->tb); 3386 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3387 3388 if (adev->asic_reset_res) 3389 goto fail; 3390 3391 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3392 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3393 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3394 } else { 3395 3396 task_barrier_full(&hive->tb); 3397 adev->asic_reset_res = amdgpu_asic_reset(adev); 3398 } 3399 3400 fail: 3401 if (adev->asic_reset_res) 3402 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3403 adev->asic_reset_res, adev_to_drm(adev)->unique); 3404 amdgpu_put_xgmi_hive(hive); 3405 } 3406 3407 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3408 { 3409 char *input = amdgpu_lockup_timeout; 3410 char *timeout_setting = NULL; 3411 int index = 0; 3412 long timeout; 3413 int ret = 0; 3414 3415 /* 3416 * By default timeout for non compute jobs is 10000 3417 * and 60000 for compute jobs. 3418 * In SR-IOV or passthrough mode, timeout for compute 3419 * jobs are 60000 by default. 3420 */ 3421 adev->gfx_timeout = msecs_to_jiffies(10000); 3422 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3423 if (amdgpu_sriov_vf(adev)) 3424 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3425 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3426 else 3427 adev->compute_timeout = msecs_to_jiffies(60000); 3428 3429 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3430 while ((timeout_setting = strsep(&input, ",")) && 3431 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3432 ret = kstrtol(timeout_setting, 0, &timeout); 3433 if (ret) 3434 return ret; 3435 3436 if (timeout == 0) { 3437 index++; 3438 continue; 3439 } else if (timeout < 0) { 3440 timeout = MAX_SCHEDULE_TIMEOUT; 3441 dev_warn(adev->dev, "lockup timeout disabled"); 3442 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3443 } else { 3444 timeout = msecs_to_jiffies(timeout); 3445 } 3446 3447 switch (index++) { 3448 case 0: 3449 adev->gfx_timeout = timeout; 3450 break; 3451 case 1: 3452 adev->compute_timeout = timeout; 3453 break; 3454 case 2: 3455 adev->sdma_timeout = timeout; 3456 break; 3457 case 3: 3458 adev->video_timeout = timeout; 3459 break; 3460 default: 3461 break; 3462 } 3463 } 3464 /* 3465 * There is only one value specified and 3466 * it should apply to all non-compute jobs. 3467 */ 3468 if (index == 1) { 3469 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3470 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3471 adev->compute_timeout = adev->gfx_timeout; 3472 } 3473 } 3474 3475 return ret; 3476 } 3477 3478 /** 3479 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3480 * 3481 * @adev: amdgpu_device pointer 3482 * 3483 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3484 */ 3485 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3486 { 3487 struct iommu_domain *domain; 3488 3489 domain = iommu_get_domain_for_dev(adev->dev); 3490 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3491 adev->ram_is_direct_mapped = true; 3492 } 3493 3494 static const struct attribute *amdgpu_dev_attributes[] = { 3495 &dev_attr_pcie_replay_count.attr, 3496 NULL 3497 }; 3498 3499 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 3500 { 3501 if (amdgpu_mcbp == 1) 3502 adev->gfx.mcbp = true; 3503 else if (amdgpu_mcbp == 0) 3504 adev->gfx.mcbp = false; 3505 else if ((adev->ip_versions[GC_HWIP][0] >= IP_VERSION(9, 0, 0)) && 3506 (adev->ip_versions[GC_HWIP][0] < IP_VERSION(10, 0, 0)) && 3507 adev->gfx.num_gfx_rings) 3508 adev->gfx.mcbp = true; 3509 3510 if (amdgpu_sriov_vf(adev)) 3511 adev->gfx.mcbp = true; 3512 3513 if (adev->gfx.mcbp) 3514 DRM_INFO("MCBP is enabled\n"); 3515 } 3516 3517 /** 3518 * amdgpu_device_init - initialize the driver 3519 * 3520 * @adev: amdgpu_device pointer 3521 * @flags: driver flags 3522 * 3523 * Initializes the driver info and hw (all asics). 3524 * Returns 0 for success or an error on failure. 3525 * Called at driver startup. 
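 *
 * As the body below shows, @flags packs the ASIC type (extracted with
 * AMD_ASIC_MASK) together with feature bits such as AMD_IS_APU, so a
 * hypothetical call site would look roughly like:
 *
 *   r = amdgpu_device_init(adev, CHIP_RAVEN | AMD_IS_APU);
 *
 * where the particular chip/flag combination is only an illustration.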
3526 */ 3527 int amdgpu_device_init(struct amdgpu_device *adev, 3528 uint32_t flags) 3529 { 3530 struct drm_device *ddev = adev_to_drm(adev); 3531 struct pci_dev *pdev = adev->pdev; 3532 int r, i; 3533 bool px = false; 3534 u32 max_MBps; 3535 int tmp; 3536 3537 adev->shutdown = false; 3538 adev->flags = flags; 3539 3540 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3541 adev->asic_type = amdgpu_force_asic_type; 3542 else 3543 adev->asic_type = flags & AMD_ASIC_MASK; 3544 3545 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3546 if (amdgpu_emu_mode == 1) 3547 adev->usec_timeout *= 10; 3548 adev->gmc.gart_size = 512 * 1024 * 1024; 3549 adev->accel_working = false; 3550 adev->num_rings = 0; 3551 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3552 adev->mman.buffer_funcs = NULL; 3553 adev->mman.buffer_funcs_ring = NULL; 3554 adev->vm_manager.vm_pte_funcs = NULL; 3555 adev->vm_manager.vm_pte_num_scheds = 0; 3556 adev->gmc.gmc_funcs = NULL; 3557 adev->harvest_ip_mask = 0x0; 3558 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3559 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3560 3561 adev->smc_rreg = &amdgpu_invalid_rreg; 3562 adev->smc_wreg = &amdgpu_invalid_wreg; 3563 adev->pcie_rreg = &amdgpu_invalid_rreg; 3564 adev->pcie_wreg = &amdgpu_invalid_wreg; 3565 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 3566 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 3567 adev->pciep_rreg = &amdgpu_invalid_rreg; 3568 adev->pciep_wreg = &amdgpu_invalid_wreg; 3569 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3570 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3571 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3572 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3573 adev->didt_rreg = &amdgpu_invalid_rreg; 3574 adev->didt_wreg = &amdgpu_invalid_wreg; 3575 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3576 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3577 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3578 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3579 3580 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3581 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3582 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3583 3584 /* mutex initialization are all done here so we 3585 * can recall function without having locking issues 3586 */ 3587 mutex_init(&adev->firmware.mutex); 3588 mutex_init(&adev->pm.mutex); 3589 mutex_init(&adev->gfx.gpu_clock_mutex); 3590 mutex_init(&adev->srbm_mutex); 3591 mutex_init(&adev->gfx.pipe_reserve_mutex); 3592 mutex_init(&adev->gfx.gfx_off_mutex); 3593 mutex_init(&adev->gfx.partition_mutex); 3594 mutex_init(&adev->grbm_idx_mutex); 3595 mutex_init(&adev->mn_lock); 3596 mutex_init(&adev->virt.vf_errors.lock); 3597 hash_init(adev->mn_hash); 3598 mutex_init(&adev->psp.mutex); 3599 mutex_init(&adev->notifier_lock); 3600 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3601 mutex_init(&adev->benchmark_mutex); 3602 3603 amdgpu_device_init_apu_flags(adev); 3604 3605 r = amdgpu_device_check_arguments(adev); 3606 if (r) 3607 return r; 3608 3609 spin_lock_init(&adev->mmio_idx_lock); 3610 spin_lock_init(&adev->smc_idx_lock); 3611 spin_lock_init(&adev->pcie_idx_lock); 3612 spin_lock_init(&adev->uvd_ctx_idx_lock); 3613 spin_lock_init(&adev->didt_idx_lock); 3614 spin_lock_init(&adev->gc_cac_idx_lock); 3615 spin_lock_init(&adev->se_cac_idx_lock); 3616 spin_lock_init(&adev->audio_endpt_idx_lock); 3617 spin_lock_init(&adev->mm_stats.lock); 3618 3619 
INIT_LIST_HEAD(&adev->shadow_list); 3620 mutex_init(&adev->shadow_list_lock); 3621 3622 INIT_LIST_HEAD(&adev->reset_list); 3623 3624 INIT_LIST_HEAD(&adev->ras_list); 3625 3626 INIT_DELAYED_WORK(&adev->delayed_init_work, 3627 amdgpu_device_delayed_init_work_handler); 3628 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3629 amdgpu_device_delay_enable_gfx_off); 3630 3631 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3632 3633 adev->gfx.gfx_off_req_count = 1; 3634 adev->gfx.gfx_off_residency = 0; 3635 adev->gfx.gfx_off_entrycount = 0; 3636 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3637 3638 atomic_set(&adev->throttling_logging_enabled, 1); 3639 /* 3640 * If throttling continues, logging will be performed every minute 3641 * to avoid log flooding. "-1" is subtracted since the thermal 3642 * throttling interrupt comes every second. Thus, the total logging 3643 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 3644 * for throttling interrupt) = 60 seconds. 3645 */ 3646 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3647 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3648 3649 /* Registers mapping */ 3650 /* TODO: block userspace mapping of io register */ 3651 if (adev->asic_type >= CHIP_BONAIRE) { 3652 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3653 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3654 } else { 3655 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3656 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3657 } 3658 3659 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 3660 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 3661 3662 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3663 if (!adev->rmmio) 3664 return -ENOMEM; 3665 3666 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3667 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); 3668 3669 /* 3670 * The reset domain needs to be present early, before the XGMI hive (if any) 3671 * is discovered and initialized, so the reset sem and in_gpu reset flag can 3672 * be used early on during init and before calling RREG32.
3673 */ 3674 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3675 if (!adev->reset_domain) 3676 return -ENOMEM; 3677 3678 /* detect hw virtualization here */ 3679 amdgpu_detect_virtualization(adev); 3680 3681 amdgpu_device_get_pcie_info(adev); 3682 3683 r = amdgpu_device_get_job_timeout_settings(adev); 3684 if (r) { 3685 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3686 return r; 3687 } 3688 3689 /* early init functions */ 3690 r = amdgpu_device_ip_early_init(adev); 3691 if (r) 3692 return r; 3693 3694 amdgpu_device_set_mcbp(adev); 3695 3696 /* Get rid of things like offb */ 3697 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 3698 if (r) 3699 return r; 3700 3701 /* Enable TMZ based on IP_VERSION */ 3702 amdgpu_gmc_tmz_set(adev); 3703 3704 amdgpu_gmc_noretry_set(adev); 3705 /* Need to get xgmi info early to decide the reset behavior*/ 3706 if (adev->gmc.xgmi.supported) { 3707 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3708 if (r) 3709 return r; 3710 } 3711 3712 /* enable PCIE atomic ops */ 3713 if (amdgpu_sriov_vf(adev)) { 3714 if (adev->virt.fw_reserve.p_pf2vf) 3715 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3716 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3717 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3718 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a 3719 * internal path natively support atomics, set have_atomics_support to true. 3720 */ 3721 } else if ((adev->flags & AMD_IS_APU) && 3722 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) { 3723 adev->have_atomics_support = true; 3724 } else { 3725 adev->have_atomics_support = 3726 !pci_enable_atomic_ops_to_root(adev->pdev, 3727 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3728 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3729 } 3730 3731 if (!adev->have_atomics_support) 3732 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3733 3734 /* doorbell bar mapping and doorbell index init*/ 3735 amdgpu_doorbell_init(adev); 3736 3737 if (amdgpu_emu_mode == 1) { 3738 /* post the asic on emulation mode */ 3739 emu_soc_asic_init(adev); 3740 goto fence_driver_init; 3741 } 3742 3743 amdgpu_reset_init(adev); 3744 3745 /* detect if we are with an SRIOV vbios */ 3746 if (adev->bios) 3747 amdgpu_device_detect_sriov_bios(adev); 3748 3749 /* check if we need to reset the asic 3750 * E.g., driver was not cleanly unloaded previously, etc. 3751 */ 3752 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3753 if (adev->gmc.xgmi.num_physical_nodes) { 3754 dev_info(adev->dev, "Pending hive reset.\n"); 3755 adev->gmc.xgmi.pending_reset = true; 3756 /* Only need to init necessary block for SMU to handle the reset */ 3757 for (i = 0; i < adev->num_ip_blocks; i++) { 3758 if (!adev->ip_blocks[i].status.valid) 3759 continue; 3760 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3761 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3762 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3763 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3764 DRM_DEBUG("IP %s disabled for hw_init.\n", 3765 adev->ip_blocks[i].version->funcs->name); 3766 adev->ip_blocks[i].status.hw = true; 3767 } 3768 } 3769 } else { 3770 tmp = amdgpu_reset_method; 3771 /* It should do a default reset when loading or reloading the driver, 3772 * regardless of the module parameter reset_method. 
3773 */ 3774 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 3775 r = amdgpu_asic_reset(adev); 3776 amdgpu_reset_method = tmp; 3777 if (r) { 3778 dev_err(adev->dev, "asic reset on init failed\n"); 3779 goto failed; 3780 } 3781 } 3782 } 3783 3784 /* Post card if necessary */ 3785 if (amdgpu_device_need_post(adev)) { 3786 if (!adev->bios) { 3787 dev_err(adev->dev, "no vBIOS found\n"); 3788 r = -EINVAL; 3789 goto failed; 3790 } 3791 DRM_INFO("GPU posting now...\n"); 3792 r = amdgpu_device_asic_init(adev); 3793 if (r) { 3794 dev_err(adev->dev, "gpu post error!\n"); 3795 goto failed; 3796 } 3797 } 3798 3799 if (adev->bios) { 3800 if (adev->is_atom_fw) { 3801 /* Initialize clocks */ 3802 r = amdgpu_atomfirmware_get_clock_info(adev); 3803 if (r) { 3804 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3805 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3806 goto failed; 3807 } 3808 } else { 3809 /* Initialize clocks */ 3810 r = amdgpu_atombios_get_clock_info(adev); 3811 if (r) { 3812 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3813 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3814 goto failed; 3815 } 3816 /* init i2c buses */ 3817 if (!amdgpu_device_has_dc_support(adev)) 3818 amdgpu_atombios_i2c_init(adev); 3819 } 3820 } 3821 3822 fence_driver_init: 3823 /* Fence driver */ 3824 r = amdgpu_fence_driver_sw_init(adev); 3825 if (r) { 3826 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3827 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3828 goto failed; 3829 } 3830 3831 /* init the mode config */ 3832 drm_mode_config_init(adev_to_drm(adev)); 3833 3834 r = amdgpu_device_ip_init(adev); 3835 if (r) { 3836 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3837 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3838 goto release_ras_con; 3839 } 3840 3841 amdgpu_fence_driver_hw_init(adev); 3842 3843 dev_info(adev->dev, 3844 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3845 adev->gfx.config.max_shader_engines, 3846 adev->gfx.config.max_sh_per_se, 3847 adev->gfx.config.max_cu_per_sh, 3848 adev->gfx.cu_info.number); 3849 3850 adev->accel_working = true; 3851 3852 amdgpu_vm_check_compute_bug(adev); 3853 3854 /* Initialize the buffer migration limit. */ 3855 if (amdgpu_moverate >= 0) 3856 max_MBps = amdgpu_moverate; 3857 else 3858 max_MBps = 8; /* Allow 8 MB/s. */ 3859 /* Get a log2 for easy divisions. */ 3860 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3861 3862 r = amdgpu_atombios_sysfs_init(adev); 3863 if (r) 3864 drm_err(&adev->ddev, 3865 "registering atombios sysfs failed (%d).\n", r); 3866 3867 r = amdgpu_pm_sysfs_init(adev); 3868 if (r) 3869 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 3870 3871 r = amdgpu_ucode_sysfs_init(adev); 3872 if (r) { 3873 adev->ucode_sysfs_en = false; 3874 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3875 } else 3876 adev->ucode_sysfs_en = true; 3877 3878 /* 3879 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3880 * Otherwise the mgpu fan boost feature will be skipped due to the 3881 * gpu instance is counted less. 3882 */ 3883 amdgpu_register_gpu_instance(adev); 3884 3885 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3886 * explicit gating rather than handling it automatically. 
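	 * Skipped while an XGMI hive reset is still pending; that case is
	 * handled via the delayed reset work queued further below.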
3887 */ 3888 if (!adev->gmc.xgmi.pending_reset) { 3889 r = amdgpu_device_ip_late_init(adev); 3890 if (r) { 3891 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3892 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3893 goto release_ras_con; 3894 } 3895 /* must succeed. */ 3896 amdgpu_ras_resume(adev); 3897 queue_delayed_work(system_wq, &adev->delayed_init_work, 3898 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3899 } 3900 3901 if (amdgpu_sriov_vf(adev)) { 3902 amdgpu_virt_release_full_gpu(adev, true); 3903 flush_delayed_work(&adev->delayed_init_work); 3904 } 3905 3906 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3907 if (r) 3908 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3909 3910 amdgpu_fru_sysfs_init(adev); 3911 3912 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3913 r = amdgpu_pmu_init(adev); 3914 if (r) 3915 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3916 3917 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3918 if (amdgpu_device_cache_pci_state(adev->pdev)) 3919 pci_restore_state(pdev); 3920 3921 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3922 /* this will fail for cards that aren't VGA class devices, just 3923 * ignore it 3924 */ 3925 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3926 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 3927 3928 px = amdgpu_device_supports_px(ddev); 3929 3930 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 3931 apple_gmux_detect(NULL, NULL))) 3932 vga_switcheroo_register_client(adev->pdev, 3933 &amdgpu_switcheroo_ops, px); 3934 3935 if (px) 3936 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3937 3938 if (adev->gmc.xgmi.pending_reset) 3939 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3940 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3941 3942 amdgpu_device_check_iommu_direct_map(adev); 3943 3944 return 0; 3945 3946 release_ras_con: 3947 if (amdgpu_sriov_vf(adev)) 3948 amdgpu_virt_release_full_gpu(adev, true); 3949 3950 /* failed in exclusive mode due to timeout */ 3951 if (amdgpu_sriov_vf(adev) && 3952 !amdgpu_sriov_runtime(adev) && 3953 amdgpu_virt_mmio_blocked(adev) && 3954 !amdgpu_virt_wait_reset(adev)) { 3955 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3956 /* Don't send request since VF is inactive. */ 3957 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3958 adev->virt.ops = NULL; 3959 r = -EAGAIN; 3960 } 3961 amdgpu_release_ras_context(adev); 3962 3963 failed: 3964 amdgpu_vf_error_trans_all(adev); 3965 3966 return r; 3967 } 3968 3969 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 3970 { 3971 3972 /* Clear all CPU mappings pointing to this device */ 3973 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 3974 3975 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 3976 amdgpu_doorbell_fini(adev); 3977 3978 iounmap(adev->rmmio); 3979 adev->rmmio = NULL; 3980 if (adev->mman.aper_base_kaddr) 3981 iounmap(adev->mman.aper_base_kaddr); 3982 adev->mman.aper_base_kaddr = NULL; 3983 3984 /* Memory manager related */ 3985 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 3986 arch_phys_wc_del(adev->gmc.vram_mtrr); 3987 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 3988 } 3989 } 3990 3991 /** 3992 * amdgpu_device_fini_hw - tear down the driver 3993 * 3994 * @adev: amdgpu_device pointer 3995 * 3996 * Tear down the driver info (all asics). 3997 * Called at driver shutdown. 
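 *
 * This only quiesces and unmaps the hardware; the remaining software state
 * is released later by amdgpu_device_fini_sw().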
3998 */ 3999 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4000 { 4001 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4002 flush_delayed_work(&adev->delayed_init_work); 4003 adev->shutdown = true; 4004 4005 /* make sure IB test finished before entering exclusive mode 4006 * to avoid preemption on IB test 4007 */ 4008 if (amdgpu_sriov_vf(adev)) { 4009 amdgpu_virt_request_full_gpu(adev, false); 4010 amdgpu_virt_fini_data_exchange(adev); 4011 } 4012 4013 /* disable all interrupts */ 4014 amdgpu_irq_disable_all(adev); 4015 if (adev->mode_info.mode_config_initialized) { 4016 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4017 drm_helper_force_disable_all(adev_to_drm(adev)); 4018 else 4019 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4020 } 4021 amdgpu_fence_driver_hw_fini(adev); 4022 4023 if (adev->mman.initialized) 4024 drain_workqueue(adev->mman.bdev.wq); 4025 4026 if (adev->pm.sysfs_initialized) 4027 amdgpu_pm_sysfs_fini(adev); 4028 if (adev->ucode_sysfs_en) 4029 amdgpu_ucode_sysfs_fini(adev); 4030 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4031 amdgpu_fru_sysfs_fini(adev); 4032 4033 /* disable ras feature must before hw fini */ 4034 amdgpu_ras_pre_fini(adev); 4035 4036 amdgpu_device_ip_fini_early(adev); 4037 4038 amdgpu_irq_fini_hw(adev); 4039 4040 if (adev->mman.initialized) 4041 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4042 4043 amdgpu_gart_dummy_page_fini(adev); 4044 4045 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4046 amdgpu_device_unmap_mmio(adev); 4047 4048 } 4049 4050 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4051 { 4052 int idx; 4053 bool px; 4054 4055 amdgpu_fence_driver_sw_fini(adev); 4056 amdgpu_device_ip_fini(adev); 4057 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4058 adev->accel_working = false; 4059 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4060 4061 amdgpu_reset_fini(adev); 4062 4063 /* free i2c buses */ 4064 if (!amdgpu_device_has_dc_support(adev)) 4065 amdgpu_i2c_fini(adev); 4066 4067 if (amdgpu_emu_mode != 1) 4068 amdgpu_atombios_fini(adev); 4069 4070 kfree(adev->bios); 4071 adev->bios = NULL; 4072 4073 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4074 4075 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 4076 apple_gmux_detect(NULL, NULL))) 4077 vga_switcheroo_unregister_client(adev->pdev); 4078 4079 if (px) 4080 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4081 4082 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4083 vga_client_unregister(adev->pdev); 4084 4085 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4086 4087 iounmap(adev->rmmio); 4088 adev->rmmio = NULL; 4089 amdgpu_doorbell_fini(adev); 4090 drm_dev_exit(idx); 4091 } 4092 4093 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4094 amdgpu_pmu_fini(adev); 4095 if (adev->mman.discovery_bin) 4096 amdgpu_discovery_fini(adev); 4097 4098 amdgpu_reset_put_reset_domain(adev->reset_domain); 4099 adev->reset_domain = NULL; 4100 4101 kfree(adev->pci_state); 4102 4103 } 4104 4105 /** 4106 * amdgpu_device_evict_resources - evict device resources 4107 * @adev: amdgpu device object 4108 * 4109 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4110 * of the vram memory type. Mainly used for evicting device resources 4111 * at suspend time. 
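 * Returns 0 on success (eviction is skipped for APUs during S3/S0ix) or a
 * negative error code if eviction fails.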
4112 * 4113 */ 4114 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4115 { 4116 int ret; 4117 4118 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4119 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4120 return 0; 4121 4122 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4123 if (ret) 4124 DRM_WARN("evicting device resources failed\n"); 4125 return ret; 4126 } 4127 4128 /* 4129 * Suspend & resume. 4130 */ 4131 /** 4132 * amdgpu_device_suspend - initiate device suspend 4133 * 4134 * @dev: drm dev pointer 4135 * @fbcon : notify the fbdev of suspend 4136 * 4137 * Puts the hw in the suspend state (all asics). 4138 * Returns 0 for success or an error on failure. 4139 * Called at driver suspend. 4140 */ 4141 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4142 { 4143 struct amdgpu_device *adev = drm_to_adev(dev); 4144 int r = 0; 4145 4146 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4147 return 0; 4148 4149 adev->in_suspend = true; 4150 4151 /* Evict the majority of BOs before grabbing the full access */ 4152 r = amdgpu_device_evict_resources(adev); 4153 if (r) 4154 return r; 4155 4156 if (amdgpu_sriov_vf(adev)) { 4157 amdgpu_virt_fini_data_exchange(adev); 4158 r = amdgpu_virt_request_full_gpu(adev, false); 4159 if (r) 4160 return r; 4161 } 4162 4163 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4164 DRM_WARN("smart shift update failed\n"); 4165 4166 if (fbcon) 4167 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4168 4169 cancel_delayed_work_sync(&adev->delayed_init_work); 4170 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4171 4172 amdgpu_ras_suspend(adev); 4173 4174 amdgpu_device_ip_suspend_phase1(adev); 4175 4176 if (!adev->in_s0ix) 4177 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4178 4179 r = amdgpu_device_evict_resources(adev); 4180 if (r) 4181 return r; 4182 4183 amdgpu_fence_driver_hw_fini(adev); 4184 4185 amdgpu_device_ip_suspend_phase2(adev); 4186 4187 if (amdgpu_sriov_vf(adev)) 4188 amdgpu_virt_release_full_gpu(adev, false); 4189 4190 return 0; 4191 } 4192 4193 /** 4194 * amdgpu_device_resume - initiate device resume 4195 * 4196 * @dev: drm dev pointer 4197 * @fbcon : notify the fbdev of resume 4198 * 4199 * Bring the hw back to operating state (all asics). 4200 * Returns 0 for success or an error on failure. 4201 * Called at driver resume. 
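 * For SR-IOV VFs, full GPU access is requested from the host before any
 * hardware is touched and released again once resume completes.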
4202 */ 4203 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4204 { 4205 struct amdgpu_device *adev = drm_to_adev(dev); 4206 int r = 0; 4207 4208 if (amdgpu_sriov_vf(adev)) { 4209 r = amdgpu_virt_request_full_gpu(adev, true); 4210 if (r) 4211 return r; 4212 } 4213 4214 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4215 return 0; 4216 4217 if (adev->in_s0ix) 4218 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4219 4220 /* post card */ 4221 if (amdgpu_device_need_post(adev)) { 4222 r = amdgpu_device_asic_init(adev); 4223 if (r) 4224 dev_err(adev->dev, "amdgpu asic init failed\n"); 4225 } 4226 4227 r = amdgpu_device_ip_resume(adev); 4228 4229 if (r) { 4230 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4231 goto exit; 4232 } 4233 amdgpu_fence_driver_hw_init(adev); 4234 4235 r = amdgpu_device_ip_late_init(adev); 4236 if (r) 4237 goto exit; 4238 4239 queue_delayed_work(system_wq, &adev->delayed_init_work, 4240 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4241 4242 if (!adev->in_s0ix) { 4243 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4244 if (r) 4245 goto exit; 4246 } 4247 4248 exit: 4249 if (amdgpu_sriov_vf(adev)) { 4250 amdgpu_virt_init_data_exchange(adev); 4251 amdgpu_virt_release_full_gpu(adev, true); 4252 } 4253 4254 if (r) 4255 return r; 4256 4257 /* Make sure IB tests flushed */ 4258 flush_delayed_work(&adev->delayed_init_work); 4259 4260 if (fbcon) 4261 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4262 4263 amdgpu_ras_resume(adev); 4264 4265 if (adev->mode_info.num_crtc) { 4266 /* 4267 * Most of the connector probing functions try to acquire runtime pm 4268 * refs to ensure that the GPU is powered on when connector polling is 4269 * performed. Since we're calling this from a runtime PM callback, 4270 * trying to acquire rpm refs will cause us to deadlock. 4271 * 4272 * Since we're guaranteed to be holding the rpm lock, it's safe to 4273 * temporarily disable the rpm helpers so this doesn't deadlock us. 4274 */ 4275 #ifdef CONFIG_PM 4276 dev->dev->power.disable_depth++; 4277 #endif 4278 if (!adev->dc_enabled) 4279 drm_helper_hpd_irq_event(dev); 4280 else 4281 drm_kms_helper_hotplug_event(dev); 4282 #ifdef CONFIG_PM 4283 dev->dev->power.disable_depth--; 4284 #endif 4285 } 4286 adev->in_suspend = false; 4287 4288 if (adev->enable_mes) 4289 amdgpu_mes_self_test(adev); 4290 4291 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4292 DRM_WARN("smart shift update failed\n"); 4293 4294 return 0; 4295 } 4296 4297 /** 4298 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4299 * 4300 * @adev: amdgpu_device pointer 4301 * 4302 * The list of all the hardware IPs that make up the asic is walked and 4303 * the check_soft_reset callbacks are run. check_soft_reset determines 4304 * if the asic is still hung or not. 4305 * Returns true if any of the IPs are still in a hung state, false if not. 
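 * For SR-IOV VFs, or when the ASIC reports that a full reset is needed,
 * true is returned without consulting the individual IP blocks.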
4306 */ 4307 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4308 { 4309 int i; 4310 bool asic_hang = false; 4311 4312 if (amdgpu_sriov_vf(adev)) 4313 return true; 4314 4315 if (amdgpu_asic_need_full_reset(adev)) 4316 return true; 4317 4318 for (i = 0; i < adev->num_ip_blocks; i++) { 4319 if (!adev->ip_blocks[i].status.valid) 4320 continue; 4321 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4322 adev->ip_blocks[i].status.hang = 4323 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4324 if (adev->ip_blocks[i].status.hang) { 4325 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4326 asic_hang = true; 4327 } 4328 } 4329 return asic_hang; 4330 } 4331 4332 /** 4333 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4334 * 4335 * @adev: amdgpu_device pointer 4336 * 4337 * The list of all the hardware IPs that make up the asic is walked and the 4338 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4339 * handles any IP specific hardware or software state changes that are 4340 * necessary for a soft reset to succeed. 4341 * Returns 0 on success, negative error code on failure. 4342 */ 4343 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4344 { 4345 int i, r = 0; 4346 4347 for (i = 0; i < adev->num_ip_blocks; i++) { 4348 if (!adev->ip_blocks[i].status.valid) 4349 continue; 4350 if (adev->ip_blocks[i].status.hang && 4351 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4352 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4353 if (r) 4354 return r; 4355 } 4356 } 4357 4358 return 0; 4359 } 4360 4361 /** 4362 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4363 * 4364 * @adev: amdgpu_device pointer 4365 * 4366 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4367 * reset is necessary to recover. 4368 * Returns true if a full asic reset is required, false if not. 4369 */ 4370 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4371 { 4372 int i; 4373 4374 if (amdgpu_asic_need_full_reset(adev)) 4375 return true; 4376 4377 for (i = 0; i < adev->num_ip_blocks; i++) { 4378 if (!adev->ip_blocks[i].status.valid) 4379 continue; 4380 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4381 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4382 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4383 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4384 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4385 if (adev->ip_blocks[i].status.hang) { 4386 dev_info(adev->dev, "Some block need full reset!\n"); 4387 return true; 4388 } 4389 } 4390 } 4391 return false; 4392 } 4393 4394 /** 4395 * amdgpu_device_ip_soft_reset - do a soft reset 4396 * 4397 * @adev: amdgpu_device pointer 4398 * 4399 * The list of all the hardware IPs that make up the asic is walked and the 4400 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4401 * IP specific hardware or software state changes that are necessary to soft 4402 * reset the IP. 4403 * Returns 0 on success, negative error code on failure. 
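 * Only blocks previously flagged as hung by their check_soft_reset callback
 * are reset here.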
4404 */ 4405 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4406 { 4407 int i, r = 0; 4408 4409 for (i = 0; i < adev->num_ip_blocks; i++) { 4410 if (!adev->ip_blocks[i].status.valid) 4411 continue; 4412 if (adev->ip_blocks[i].status.hang && 4413 adev->ip_blocks[i].version->funcs->soft_reset) { 4414 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4415 if (r) 4416 return r; 4417 } 4418 } 4419 4420 return 0; 4421 } 4422 4423 /** 4424 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4425 * 4426 * @adev: amdgpu_device pointer 4427 * 4428 * The list of all the hardware IPs that make up the asic is walked and the 4429 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4430 * handles any IP specific hardware or software state changes that are 4431 * necessary after the IP has been soft reset. 4432 * Returns 0 on success, negative error code on failure. 4433 */ 4434 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4435 { 4436 int i, r = 0; 4437 4438 for (i = 0; i < adev->num_ip_blocks; i++) { 4439 if (!adev->ip_blocks[i].status.valid) 4440 continue; 4441 if (adev->ip_blocks[i].status.hang && 4442 adev->ip_blocks[i].version->funcs->post_soft_reset) 4443 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4444 if (r) 4445 return r; 4446 } 4447 4448 return 0; 4449 } 4450 4451 /** 4452 * amdgpu_device_recover_vram - Recover some VRAM contents 4453 * 4454 * @adev: amdgpu_device pointer 4455 * 4456 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4457 * restore things like GPUVM page tables after a GPU reset where 4458 * the contents of VRAM might be lost. 4459 * 4460 * Returns: 4461 * 0 on success, negative error code on failure. 4462 */ 4463 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4464 { 4465 struct dma_fence *fence = NULL, *next = NULL; 4466 struct amdgpu_bo *shadow; 4467 struct amdgpu_bo_vm *vmbo; 4468 long r = 1, tmo; 4469 4470 if (amdgpu_sriov_runtime(adev)) 4471 tmo = msecs_to_jiffies(8000); 4472 else 4473 tmo = msecs_to_jiffies(100); 4474 4475 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4476 mutex_lock(&adev->shadow_list_lock); 4477 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4478 /* If vm is compute context or adev is APU, shadow will be NULL */ 4479 if (!vmbo->shadow) 4480 continue; 4481 shadow = vmbo->shadow; 4482 4483 /* No need to recover an evicted BO */ 4484 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4485 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4486 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4487 continue; 4488 4489 r = amdgpu_bo_restore_shadow(shadow, &next); 4490 if (r) 4491 break; 4492 4493 if (fence) { 4494 tmo = dma_fence_wait_timeout(fence, false, tmo); 4495 dma_fence_put(fence); 4496 fence = next; 4497 if (tmo == 0) { 4498 r = -ETIMEDOUT; 4499 break; 4500 } else if (tmo < 0) { 4501 r = tmo; 4502 break; 4503 } 4504 } else { 4505 fence = next; 4506 } 4507 } 4508 mutex_unlock(&adev->shadow_list_lock); 4509 4510 if (fence) 4511 tmo = dma_fence_wait_timeout(fence, false, tmo); 4512 dma_fence_put(fence); 4513 4514 if (r < 0 || tmo <= 0) { 4515 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4516 return -EIO; 4517 } 4518 4519 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4520 return 0; 4521 } 4522 4523 4524 /** 4525 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4526 * 4527 * @adev: amdgpu_device pointer 4528 * 
@from_hypervisor: request from hypervisor 4529 * 4530 * do VF FLR and reinitialize Asic 4531 * return 0 means succeeded otherwise failed 4532 */ 4533 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4534 bool from_hypervisor) 4535 { 4536 int r; 4537 struct amdgpu_hive_info *hive = NULL; 4538 int retry_limit = 0; 4539 4540 retry: 4541 amdgpu_amdkfd_pre_reset(adev); 4542 4543 if (from_hypervisor) 4544 r = amdgpu_virt_request_full_gpu(adev, true); 4545 else 4546 r = amdgpu_virt_reset_gpu(adev); 4547 if (r) 4548 return r; 4549 amdgpu_irq_gpu_reset_resume_helper(adev); 4550 4551 /* some sw clean up VF needs to do before recover */ 4552 amdgpu_virt_post_reset(adev); 4553 4554 /* Resume IP prior to SMC */ 4555 r = amdgpu_device_ip_reinit_early_sriov(adev); 4556 if (r) 4557 goto error; 4558 4559 amdgpu_virt_init_data_exchange(adev); 4560 4561 r = amdgpu_device_fw_loading(adev); 4562 if (r) 4563 return r; 4564 4565 /* now we are okay to resume SMC/CP/SDMA */ 4566 r = amdgpu_device_ip_reinit_late_sriov(adev); 4567 if (r) 4568 goto error; 4569 4570 hive = amdgpu_get_xgmi_hive(adev); 4571 /* Update PSP FW topology after reset */ 4572 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4573 r = amdgpu_xgmi_update_topology(hive, adev); 4574 4575 if (hive) 4576 amdgpu_put_xgmi_hive(hive); 4577 4578 if (!r) { 4579 r = amdgpu_ib_ring_tests(adev); 4580 4581 amdgpu_amdkfd_post_reset(adev); 4582 } 4583 4584 error: 4585 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4586 amdgpu_inc_vram_lost(adev); 4587 r = amdgpu_device_recover_vram(adev); 4588 } 4589 amdgpu_virt_release_full_gpu(adev, true); 4590 4591 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4592 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4593 retry_limit++; 4594 goto retry; 4595 } else 4596 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4597 } 4598 4599 return r; 4600 } 4601 4602 /** 4603 * amdgpu_device_has_job_running - check if there is any job in mirror list 4604 * 4605 * @adev: amdgpu_device pointer 4606 * 4607 * check if there is any job in mirror list 4608 */ 4609 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4610 { 4611 int i; 4612 struct drm_sched_job *job; 4613 4614 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4615 struct amdgpu_ring *ring = adev->rings[i]; 4616 4617 if (!ring || !ring->sched.thread) 4618 continue; 4619 4620 spin_lock(&ring->sched.job_list_lock); 4621 job = list_first_entry_or_null(&ring->sched.pending_list, 4622 struct drm_sched_job, list); 4623 spin_unlock(&ring->sched.job_list_lock); 4624 if (job) 4625 return true; 4626 } 4627 return false; 4628 } 4629 4630 /** 4631 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4632 * 4633 * @adev: amdgpu_device pointer 4634 * 4635 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4636 * a hung GPU. 
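 * Returns true if recovery should be attempted, false if it is disabled via
 * the amdgpu_gpu_recovery parameter or not supported on this ASIC.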
4637 */ 4638 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4639 { 4640 4641 if (amdgpu_gpu_recovery == 0) 4642 goto disabled; 4643 4644 /* Skip soft reset check in fatal error mode */ 4645 if (!amdgpu_ras_is_poison_mode_supported(adev)) 4646 return true; 4647 4648 if (amdgpu_sriov_vf(adev)) 4649 return true; 4650 4651 if (amdgpu_gpu_recovery == -1) { 4652 switch (adev->asic_type) { 4653 #ifdef CONFIG_DRM_AMDGPU_SI 4654 case CHIP_VERDE: 4655 case CHIP_TAHITI: 4656 case CHIP_PITCAIRN: 4657 case CHIP_OLAND: 4658 case CHIP_HAINAN: 4659 #endif 4660 #ifdef CONFIG_DRM_AMDGPU_CIK 4661 case CHIP_KAVERI: 4662 case CHIP_KABINI: 4663 case CHIP_MULLINS: 4664 #endif 4665 case CHIP_CARRIZO: 4666 case CHIP_STONEY: 4667 case CHIP_CYAN_SKILLFISH: 4668 goto disabled; 4669 default: 4670 break; 4671 } 4672 } 4673 4674 return true; 4675 4676 disabled: 4677 dev_info(adev->dev, "GPU recovery disabled.\n"); 4678 return false; 4679 } 4680 4681 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4682 { 4683 u32 i; 4684 int ret = 0; 4685 4686 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4687 4688 dev_info(adev->dev, "GPU mode1 reset\n"); 4689 4690 /* disable BM */ 4691 pci_clear_master(adev->pdev); 4692 4693 amdgpu_device_cache_pci_state(adev->pdev); 4694 4695 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4696 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4697 ret = amdgpu_dpm_mode1_reset(adev); 4698 } else { 4699 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4700 ret = psp_gpu_reset(adev); 4701 } 4702 4703 if (ret) 4704 goto mode1_reset_failed; 4705 4706 amdgpu_device_load_pci_state(adev->pdev); 4707 ret = amdgpu_psp_wait_for_bootloader(adev); 4708 if (ret) 4709 goto mode1_reset_failed; 4710 4711 /* wait for asic to come out of reset */ 4712 for (i = 0; i < adev->usec_timeout; i++) { 4713 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4714 4715 if (memsize != 0xffffffff) 4716 break; 4717 udelay(1); 4718 } 4719 4720 if (i >= adev->usec_timeout) { 4721 ret = -ETIMEDOUT; 4722 goto mode1_reset_failed; 4723 } 4724 4725 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4726 4727 return 0; 4728 4729 mode1_reset_failed: 4730 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4731 return ret; 4732 } 4733 4734 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4735 struct amdgpu_reset_context *reset_context) 4736 { 4737 int i, r = 0; 4738 struct amdgpu_job *job = NULL; 4739 bool need_full_reset = 4740 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4741 4742 if (reset_context->reset_req_dev == adev) 4743 job = reset_context->job; 4744 4745 if (amdgpu_sriov_vf(adev)) { 4746 /* stop the data exchange thread */ 4747 amdgpu_virt_fini_data_exchange(adev); 4748 } 4749 4750 amdgpu_fence_driver_isr_toggle(adev, true); 4751 4752 /* block all schedulers and reset given job's ring */ 4753 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4754 struct amdgpu_ring *ring = adev->rings[i]; 4755 4756 if (!ring || !ring->sched.thread) 4757 continue; 4758 4759 /* Clear job fence from fence drv to avoid force_completion 4760 * leave NULL and vm flush fence in fence drv 4761 */ 4762 amdgpu_fence_driver_clear_job_fences(ring); 4763 4764 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4765 amdgpu_fence_driver_force_completion(ring); 4766 } 4767 4768 amdgpu_fence_driver_isr_toggle(adev, false); 4769 4770 if (job && job->vm) 4771 drm_sched_increase_karma(&job->base); 4772 4773 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4774 /* If reset handler not 
implemented, continue; otherwise return */ 4775 if (r == -EOPNOTSUPP) 4776 r = 0; 4777 else 4778 return r; 4779 4780 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4781 if (!amdgpu_sriov_vf(adev)) { 4782 4783 if (!need_full_reset) 4784 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4785 4786 if (!need_full_reset && amdgpu_gpu_recovery && 4787 amdgpu_device_ip_check_soft_reset(adev)) { 4788 amdgpu_device_ip_pre_soft_reset(adev); 4789 r = amdgpu_device_ip_soft_reset(adev); 4790 amdgpu_device_ip_post_soft_reset(adev); 4791 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4792 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4793 need_full_reset = true; 4794 } 4795 } 4796 4797 if (need_full_reset) 4798 r = amdgpu_device_ip_suspend(adev); 4799 if (need_full_reset) 4800 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4801 else 4802 clear_bit(AMDGPU_NEED_FULL_RESET, 4803 &reset_context->flags); 4804 } 4805 4806 return r; 4807 } 4808 4809 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 4810 { 4811 int i; 4812 4813 lockdep_assert_held(&adev->reset_domain->sem); 4814 4815 for (i = 0; i < adev->num_regs; i++) { 4816 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); 4817 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], 4818 adev->reset_dump_reg_value[i]); 4819 } 4820 4821 return 0; 4822 } 4823 4824 #ifdef CONFIG_DEV_COREDUMP 4825 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, 4826 size_t count, void *data, size_t datalen) 4827 { 4828 struct drm_printer p; 4829 struct amdgpu_device *adev = data; 4830 struct drm_print_iterator iter; 4831 int i; 4832 4833 iter.data = buffer; 4834 iter.offset = 0; 4835 iter.start = offset; 4836 iter.remain = count; 4837 4838 p = drm_coredump_printer(&iter); 4839 4840 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 4841 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 4842 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 4843 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec); 4844 if (adev->reset_task_info.pid) 4845 drm_printf(&p, "process_name: %s PID: %d\n", 4846 adev->reset_task_info.process_name, 4847 adev->reset_task_info.pid); 4848 4849 if (adev->reset_vram_lost) 4850 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 4851 if (adev->num_regs) { 4852 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); 4853 4854 for (i = 0; i < adev->num_regs; i++) 4855 drm_printf(&p, "0x%08x: 0x%08x\n", 4856 adev->reset_dump_reg_list[i], 4857 adev->reset_dump_reg_value[i]); 4858 } 4859 4860 return count - iter.remain; 4861 } 4862 4863 static void amdgpu_devcoredump_free(void *data) 4864 { 4865 } 4866 4867 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) 4868 { 4869 struct drm_device *dev = adev_to_drm(adev); 4870 4871 ktime_get_ts64(&adev->reset_time); 4872 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_NOWAIT, 4873 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 4874 } 4875 #endif 4876 4877 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4878 struct amdgpu_reset_context *reset_context) 4879 { 4880 struct amdgpu_device *tmp_adev = NULL; 4881 bool need_full_reset, skip_hw_reset, vram_lost = false; 4882 int r = 0; 4883 bool gpu_reset_for_dev_remove = 0; 4884 4885 /* Try reset handler method first */ 4886 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4887 reset_list); 4888 amdgpu_reset_reg_dumps(tmp_adev); 4889 4890 
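	/* Hand the whole device list to the reset handler so that hive-wide
	 * resets can operate on every device, not just the triggering one.
	 */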
reset_context->reset_device_list = device_list_handle; 4891 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4892 /* If reset handler not implemented, continue; otherwise return */ 4893 if (r == -EOPNOTSUPP) 4894 r = 0; 4895 else 4896 return r; 4897 4898 /* Reset handler not implemented, use the default method */ 4899 need_full_reset = 4900 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4901 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4902 4903 gpu_reset_for_dev_remove = 4904 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 4905 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4906 4907 /* 4908 * ASIC reset has to be done on all XGMI hive nodes ASAP 4909 * to allow proper links negotiation in FW (within 1 sec) 4910 */ 4911 if (!skip_hw_reset && need_full_reset) { 4912 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4913 /* For XGMI run all resets in parallel to speed up the process */ 4914 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4915 tmp_adev->gmc.xgmi.pending_reset = false; 4916 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4917 r = -EALREADY; 4918 } else 4919 r = amdgpu_asic_reset(tmp_adev); 4920 4921 if (r) { 4922 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4923 r, adev_to_drm(tmp_adev)->unique); 4924 break; 4925 } 4926 } 4927 4928 /* For XGMI wait for all resets to complete before proceed */ 4929 if (!r) { 4930 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4931 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4932 flush_work(&tmp_adev->xgmi_reset_work); 4933 r = tmp_adev->asic_reset_res; 4934 if (r) 4935 break; 4936 } 4937 } 4938 } 4939 } 4940 4941 if (!r && amdgpu_ras_intr_triggered()) { 4942 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4943 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && 4944 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 4945 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 4946 } 4947 4948 amdgpu_ras_intr_cleared(); 4949 } 4950 4951 /* Since the mode1 reset affects base ip blocks, the 4952 * phase1 ip blocks need to be resumed. Otherwise there 4953 * will be a BIOS signature error and the psp bootloader 4954 * can't load kdb on the next amdgpu install. 
	 */
	if (gpu_reset_for_dev_remove) {
		list_for_each_entry(tmp_adev, device_list_handle, reset_list)
			amdgpu_device_ip_resume_phase1(tmp_adev);

		goto end;
	}

	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		if (need_full_reset) {
			/* post card */
			r = amdgpu_device_asic_init(tmp_adev);
			if (r) {
				dev_warn(tmp_adev->dev, "asic atom init failed!");
			} else {
				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");

				r = amdgpu_device_ip_resume_phase1(tmp_adev);
				if (r)
					goto out;

				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
#ifdef CONFIG_DEV_COREDUMP
				tmp_adev->reset_vram_lost = vram_lost;
				memset(&tmp_adev->reset_task_info, 0,
				       sizeof(tmp_adev->reset_task_info));
				if (reset_context->job && reset_context->job->vm)
					tmp_adev->reset_task_info =
						reset_context->job->vm->task_info;
				amdgpu_reset_capture_coredumpm(tmp_adev);
#endif
				if (vram_lost) {
					DRM_INFO("VRAM is lost due to GPU reset!\n");
					amdgpu_inc_vram_lost(tmp_adev);
				}

				r = amdgpu_device_fw_loading(tmp_adev);
				if (r)
					return r;

				r = amdgpu_device_ip_resume_phase2(tmp_adev);
				if (r)
					goto out;

				if (vram_lost)
					amdgpu_device_fill_reset_magic(tmp_adev);

				/*
				 * Add this ASIC back as tracked, since its reset
				 * has already completed successfully.
				 */
				amdgpu_register_gpu_instance(tmp_adev);

				if (!reset_context->hive &&
				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
					amdgpu_xgmi_add_device(tmp_adev);

				r = amdgpu_device_ip_late_init(tmp_adev);
				if (r)
					goto out;

				drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);

				/*
				 * The GPU enters a bad state once the number of faulty
				 * pages flagged by ECC reaches the threshold, and RAS
				 * recovery is scheduled next. Break recovery here if the
				 * bad page threshold has indeed been exceeded, and remind
				 * the user to either retire this GPU or set a larger
				 * bad_page_threshold value before probing the driver again.
				 */
				if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
					/* must succeed.
*/ 5030 amdgpu_ras_resume(tmp_adev); 5031 } else { 5032 r = -EINVAL; 5033 goto out; 5034 } 5035 5036 /* Update PSP FW topology after reset */ 5037 if (reset_context->hive && 5038 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5039 r = amdgpu_xgmi_update_topology( 5040 reset_context->hive, tmp_adev); 5041 } 5042 } 5043 5044 out: 5045 if (!r) { 5046 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5047 r = amdgpu_ib_ring_tests(tmp_adev); 5048 if (r) { 5049 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5050 need_full_reset = true; 5051 r = -EAGAIN; 5052 goto end; 5053 } 5054 } 5055 5056 if (!r) 5057 r = amdgpu_device_recover_vram(tmp_adev); 5058 else 5059 tmp_adev->asic_reset_res = r; 5060 } 5061 5062 end: 5063 if (need_full_reset) 5064 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5065 else 5066 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5067 return r; 5068 } 5069 5070 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5071 { 5072 5073 switch (amdgpu_asic_reset_method(adev)) { 5074 case AMD_RESET_METHOD_MODE1: 5075 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5076 break; 5077 case AMD_RESET_METHOD_MODE2: 5078 adev->mp1_state = PP_MP1_STATE_RESET; 5079 break; 5080 default: 5081 adev->mp1_state = PP_MP1_STATE_NONE; 5082 break; 5083 } 5084 } 5085 5086 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5087 { 5088 amdgpu_vf_error_trans_all(adev); 5089 adev->mp1_state = PP_MP1_STATE_NONE; 5090 } 5091 5092 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5093 { 5094 struct pci_dev *p = NULL; 5095 5096 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5097 adev->pdev->bus->number, 1); 5098 if (p) { 5099 pm_runtime_enable(&(p->dev)); 5100 pm_runtime_resume(&(p->dev)); 5101 } 5102 5103 pci_dev_put(p); 5104 } 5105 5106 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5107 { 5108 enum amd_reset_method reset_method; 5109 struct pci_dev *p = NULL; 5110 u64 expires; 5111 5112 /* 5113 * For now, only BACO and mode1 reset are confirmed 5114 * to suffer the audio issue without proper suspended. 5115 */ 5116 reset_method = amdgpu_asic_reset_method(adev); 5117 if ((reset_method != AMD_RESET_METHOD_BACO) && 5118 (reset_method != AMD_RESET_METHOD_MODE1)) 5119 return -EINVAL; 5120 5121 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5122 adev->pdev->bus->number, 1); 5123 if (!p) 5124 return -ENODEV; 5125 5126 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5127 if (!expires) 5128 /* 5129 * If we cannot get the audio device autosuspend delay, 5130 * a fixed 4S interval will be used. Considering 3S is 5131 * the audio controller default autosuspend delay setting. 5132 * 4S used here is guaranteed to cover that. 5133 */ 5134 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5135 5136 while (!pm_runtime_status_suspended(&(p->dev))) { 5137 if (!pm_runtime_suspend(&(p->dev))) 5138 break; 5139 5140 if (expires < ktime_get_mono_fast_ns()) { 5141 dev_warn(adev->dev, "failed to suspend display audio\n"); 5142 pci_dev_put(p); 5143 /* TODO: abort the succeeding gpu reset? 
*/ 5144 return -ETIMEDOUT; 5145 } 5146 } 5147 5148 pm_runtime_disable(&(p->dev)); 5149 5150 pci_dev_put(p); 5151 return 0; 5152 } 5153 5154 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5155 { 5156 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5157 5158 #if defined(CONFIG_DEBUG_FS) 5159 if (!amdgpu_sriov_vf(adev)) 5160 cancel_work(&adev->reset_work); 5161 #endif 5162 5163 if (adev->kfd.dev) 5164 cancel_work(&adev->kfd.reset_work); 5165 5166 if (amdgpu_sriov_vf(adev)) 5167 cancel_work(&adev->virt.flr_work); 5168 5169 if (con && adev->ras_enabled) 5170 cancel_work(&con->recovery_work); 5171 5172 } 5173 5174 /** 5175 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5176 * 5177 * @adev: amdgpu_device pointer 5178 * @job: which job trigger hang 5179 * @reset_context: amdgpu reset context pointer 5180 * 5181 * Attempt to reset the GPU if it has hung (all asics). 5182 * Attempt to do soft-reset or full-reset and reinitialize Asic 5183 * Returns 0 for success or an error on failure. 5184 */ 5185 5186 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5187 struct amdgpu_job *job, 5188 struct amdgpu_reset_context *reset_context) 5189 { 5190 struct list_head device_list, *device_list_handle = NULL; 5191 bool job_signaled = false; 5192 struct amdgpu_hive_info *hive = NULL; 5193 struct amdgpu_device *tmp_adev = NULL; 5194 int i, r = 0; 5195 bool need_emergency_restart = false; 5196 bool audio_suspended = false; 5197 bool gpu_reset_for_dev_remove = false; 5198 5199 gpu_reset_for_dev_remove = 5200 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5201 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5202 5203 /* 5204 * Special case: RAS triggered and full reset isn't supported 5205 */ 5206 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5207 5208 /* 5209 * Flush RAM to disk so that after reboot 5210 * the user can read log and see why the system rebooted. 5211 */ 5212 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5213 DRM_WARN("Emergency reboot."); 5214 5215 ksys_sync_helper(); 5216 emergency_restart(); 5217 } 5218 5219 dev_info(adev->dev, "GPU %s begin!\n", 5220 need_emergency_restart ? "jobs stop":"reset"); 5221 5222 if (!amdgpu_sriov_vf(adev)) 5223 hive = amdgpu_get_xgmi_hive(adev); 5224 if (hive) 5225 mutex_lock(&hive->hive_lock); 5226 5227 reset_context->job = job; 5228 reset_context->hive = hive; 5229 /* 5230 * Build list of devices to reset. 5231 * In case we are in XGMI hive mode, resort the device list 5232 * to put adev in the 1st position. 
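	 * The first entry is also the device whose reset domain is locked below.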
5233 */ 5234 INIT_LIST_HEAD(&device_list); 5235 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5236 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5237 list_add_tail(&tmp_adev->reset_list, &device_list); 5238 if (gpu_reset_for_dev_remove && adev->shutdown) 5239 tmp_adev->shutdown = true; 5240 } 5241 if (!list_is_first(&adev->reset_list, &device_list)) 5242 list_rotate_to_front(&adev->reset_list, &device_list); 5243 device_list_handle = &device_list; 5244 } else { 5245 list_add_tail(&adev->reset_list, &device_list); 5246 device_list_handle = &device_list; 5247 } 5248 5249 /* We need to lock reset domain only once both for XGMI and single device */ 5250 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5251 reset_list); 5252 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5253 5254 /* block all schedulers and reset given job's ring */ 5255 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5256 5257 amdgpu_device_set_mp1_state(tmp_adev); 5258 5259 /* 5260 * Try to put the audio codec into suspend state 5261 * before gpu reset started. 5262 * 5263 * Due to the power domain of the graphics device 5264 * is shared with AZ power domain. Without this, 5265 * we may change the audio hardware from behind 5266 * the audio driver's back. That will trigger 5267 * some audio codec errors. 5268 */ 5269 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5270 audio_suspended = true; 5271 5272 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5273 5274 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5275 5276 if (!amdgpu_sriov_vf(tmp_adev)) 5277 amdgpu_amdkfd_pre_reset(tmp_adev); 5278 5279 /* 5280 * Mark these ASICs to be reseted as untracked first 5281 * And add them back after reset completed 5282 */ 5283 amdgpu_unregister_gpu_instance(tmp_adev); 5284 5285 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5286 5287 /* disable ras on ALL IPs */ 5288 if (!need_emergency_restart && 5289 amdgpu_device_ip_need_full_reset(tmp_adev)) 5290 amdgpu_ras_suspend(tmp_adev); 5291 5292 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5293 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5294 5295 if (!ring || !ring->sched.thread) 5296 continue; 5297 5298 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5299 5300 if (need_emergency_restart) 5301 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5302 } 5303 atomic_inc(&tmp_adev->gpu_reset_counter); 5304 } 5305 5306 if (need_emergency_restart) 5307 goto skip_sched_resume; 5308 5309 /* 5310 * Must check guilty signal here since after this point all old 5311 * HW fences are force signaled. 5312 * 5313 * job->base holds a reference to parent fence 5314 */ 5315 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5316 job_signaled = true; 5317 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5318 goto skip_hw_reset; 5319 } 5320 5321 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5322 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5323 if (gpu_reset_for_dev_remove) { 5324 /* Workaroud for ASICs need to disable SMC first */ 5325 amdgpu_device_smu_fini_early(tmp_adev); 5326 } 5327 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5328 /*TODO Should we stop ?*/ 5329 if (r) { 5330 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5331 r, adev_to_drm(tmp_adev)->unique); 5332 tmp_adev->asic_reset_res = r; 5333 } 5334 5335 /* 5336 * Drop all pending non scheduler resets. 
Scheduler resets 5337 * were already dropped during drm_sched_stop 5338 */ 5339 amdgpu_device_stop_pending_resets(tmp_adev); 5340 } 5341 5342 /* Actual ASIC resets if needed.*/ 5343 /* Host driver will handle XGMI hive reset for SRIOV */ 5344 if (amdgpu_sriov_vf(adev)) { 5345 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5346 if (r) 5347 adev->asic_reset_res = r; 5348 5349 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5350 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) || 5351 adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3)) 5352 amdgpu_ras_resume(adev); 5353 } else { 5354 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5355 if (r && r == -EAGAIN) 5356 goto retry; 5357 5358 if (!r && gpu_reset_for_dev_remove) 5359 goto recover_end; 5360 } 5361 5362 skip_hw_reset: 5363 5364 /* Post ASIC reset for all devs .*/ 5365 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5366 5367 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5368 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5369 5370 if (!ring || !ring->sched.thread) 5371 continue; 5372 5373 drm_sched_start(&ring->sched, true); 5374 } 5375 5376 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3)) 5377 amdgpu_mes_self_test(tmp_adev); 5378 5379 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 5380 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5381 5382 if (tmp_adev->asic_reset_res) 5383 r = tmp_adev->asic_reset_res; 5384 5385 tmp_adev->asic_reset_res = 0; 5386 5387 if (r) { 5388 /* bad news, how to tell it to userspace ? */ 5389 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5390 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5391 } else { 5392 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5393 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5394 DRM_WARN("smart shift update failed\n"); 5395 } 5396 } 5397 5398 skip_sched_resume: 5399 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5400 /* unlock kfd: SRIOV would do it separately */ 5401 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5402 amdgpu_amdkfd_post_reset(tmp_adev); 5403 5404 /* kfd_post_reset will do nothing if kfd device is not initialized, 5405 * need to bring up kfd here if it's not be initialized before 5406 */ 5407 if (!adev->kfd.init_complete) 5408 amdgpu_amdkfd_device_init(adev); 5409 5410 if (audio_suspended) 5411 amdgpu_device_resume_display_audio(tmp_adev); 5412 5413 amdgpu_device_unset_mp1_state(tmp_adev); 5414 5415 amdgpu_ras_set_error_query_ready(tmp_adev, true); 5416 } 5417 5418 recover_end: 5419 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5420 reset_list); 5421 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5422 5423 if (hive) { 5424 mutex_unlock(&hive->hive_lock); 5425 amdgpu_put_xgmi_hive(hive); 5426 } 5427 5428 if (r) 5429 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5430 5431 atomic_set(&adev->reset_domain->reset_res, r); 5432 return r; 5433 } 5434 5435 /** 5436 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 5437 * 5438 * @adev: amdgpu_device pointer 5439 * 5440 * Fetchs and stores in the driver the PCIE capabilities (gen speed 5441 * and lanes) of the slot the device is in. Handles APUs and 5442 * virtualized environments where PCIE config space may not be available. 
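 * The results are cached in adev->pm.pcie_gen_mask and adev->pm.pcie_mlw_mask
 * and can be overridden via the amdgpu_pcie_gen_cap and amdgpu_pcie_lane_cap
 * module parameters.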
5443 */ 5444 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5445 { 5446 struct pci_dev *pdev; 5447 enum pci_bus_speed speed_cap, platform_speed_cap; 5448 enum pcie_link_width platform_link_width; 5449 5450 if (amdgpu_pcie_gen_cap) 5451 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5452 5453 if (amdgpu_pcie_lane_cap) 5454 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5455 5456 /* covers APUs as well */ 5457 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 5458 if (adev->pm.pcie_gen_mask == 0) 5459 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5460 if (adev->pm.pcie_mlw_mask == 0) 5461 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5462 return; 5463 } 5464 5465 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5466 return; 5467 5468 pcie_bandwidth_available(adev->pdev, NULL, 5469 &platform_speed_cap, &platform_link_width); 5470 5471 if (adev->pm.pcie_gen_mask == 0) { 5472 /* asic caps */ 5473 pdev = adev->pdev; 5474 speed_cap = pcie_get_speed_cap(pdev); 5475 if (speed_cap == PCI_SPEED_UNKNOWN) { 5476 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5477 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5478 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5479 } else { 5480 if (speed_cap == PCIE_SPEED_32_0GT) 5481 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5482 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5483 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5484 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5485 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5486 else if (speed_cap == PCIE_SPEED_16_0GT) 5487 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5488 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5489 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5490 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5491 else if (speed_cap == PCIE_SPEED_8_0GT) 5492 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5493 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5494 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5495 else if (speed_cap == PCIE_SPEED_5_0GT) 5496 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5497 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5498 else 5499 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5500 } 5501 /* platform caps */ 5502 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5503 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5504 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5505 } else { 5506 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5507 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5508 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5509 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5510 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5511 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5512 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5513 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5514 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5515 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5516 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5517 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5518 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5519 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5520 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5521 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5522 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5523 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5524 else 5525 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5526 5527 } 5528 } 5529 if (adev->pm.pcie_mlw_mask == 0) { 5530 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) 
{ 5531 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5532 } else { 5533 switch (platform_link_width) { 5534 case PCIE_LNK_X32: 5535 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5536 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5537 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5538 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5539 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5540 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5541 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5542 break; 5543 case PCIE_LNK_X16: 5544 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5545 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5546 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5547 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5548 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5549 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5550 break; 5551 case PCIE_LNK_X12: 5552 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5553 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5554 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5555 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5556 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5557 break; 5558 case PCIE_LNK_X8: 5559 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5560 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5561 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5562 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5563 break; 5564 case PCIE_LNK_X4: 5565 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5566 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5567 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5568 break; 5569 case PCIE_LNK_X2: 5570 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5571 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5572 break; 5573 case PCIE_LNK_X1: 5574 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5575 break; 5576 default: 5577 break; 5578 } 5579 } 5580 } 5581 } 5582 5583 /** 5584 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 5585 * 5586 * @adev: amdgpu_device pointer 5587 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 5588 * 5589 * Return true if @peer_adev can access (DMA) @adev through the PCIe 5590 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 5591 * @peer_adev. 5592 */ 5593 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 5594 struct amdgpu_device *peer_adev) 5595 { 5596 #ifdef CONFIG_HSA_AMD_P2P 5597 uint64_t address_mask = peer_adev->dev->dma_mask ? 
5598 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 5599 resource_size_t aper_limit = 5600 adev->gmc.aper_base + adev->gmc.aper_size - 1; 5601 bool p2p_access = 5602 !adev->gmc.xgmi.connected_to_cpu && 5603 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 5604 5605 return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size && 5606 adev->gmc.real_vram_size == adev->gmc.visible_vram_size && 5607 !(adev->gmc.aper_base & address_mask || 5608 aper_limit & address_mask)); 5609 #else 5610 return false; 5611 #endif 5612 } 5613 5614 int amdgpu_device_baco_enter(struct drm_device *dev) 5615 { 5616 struct amdgpu_device *adev = drm_to_adev(dev); 5617 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5618 5619 if (!amdgpu_device_supports_baco(dev)) 5620 return -ENOTSUPP; 5621 5622 if (ras && adev->ras_enabled && 5623 adev->nbio.funcs->enable_doorbell_interrupt) 5624 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5625 5626 return amdgpu_dpm_baco_enter(adev); 5627 } 5628 5629 int amdgpu_device_baco_exit(struct drm_device *dev) 5630 { 5631 struct amdgpu_device *adev = drm_to_adev(dev); 5632 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5633 int ret = 0; 5634 5635 if (!amdgpu_device_supports_baco(dev)) 5636 return -ENOTSUPP; 5637 5638 ret = amdgpu_dpm_baco_exit(adev); 5639 if (ret) 5640 return ret; 5641 5642 if (ras && adev->ras_enabled && 5643 adev->nbio.funcs->enable_doorbell_interrupt) 5644 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5645 5646 if (amdgpu_passthrough(adev) && 5647 adev->nbio.funcs->clear_doorbell_interrupt) 5648 adev->nbio.funcs->clear_doorbell_interrupt(adev); 5649 5650 return 0; 5651 } 5652 5653 /** 5654 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5655 * @pdev: PCI device struct 5656 * @state: PCI channel state 5657 * 5658 * Description: Called when a PCI error is detected. 5659 * 5660 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
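 * (PCI_ERS_RESULT_CAN_RECOVER is returned when the channel state is
 * pci_channel_io_normal.)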

/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called when a PCI error is detected.
 *
 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		DRM_WARN("No support for XGMI hive yet...");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Locking adev->reset_domain->sem will prevent any external access
		 * to GPU during PCI error recovery
		 */
		amdgpu_device_lock_reset_domain(adev->reset_domain);
		amdgpu_device_set_mp1_state(adev);

		/*
		 * Block any work scheduling as we do for regular GPU reset
		 * for the duration of the recovery
		 */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, NULL);
		}
		atomic_inc(&adev->gpu_reset_counter);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{
	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/* This is called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, no need to reset slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}
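
/*
 * Illustrative sketch (not part of the driver): one way the "dump whatever"
 * TODO above could be filled in. Config-space reads are safe at this point
 * because this callback only runs after amdgpu_pci_error_detected() returned
 * PCI_ERS_RESULT_CAN_RECOVER, i.e. the device is still reachable. The helper
 * name is hypothetical.
 */
static void __maybe_unused amdgpu_example_dump_pci_state(struct pci_dev *pdev)
{
	u16 cmd, status;

	/* PCI command/status give a quick picture of bus-level health */
	pci_read_config_word(pdev, PCI_COMMAND, &cmd);
	pci_read_config_word(pdev, PCI_STATUS, &status);
	DRM_INFO("PCI error: cmd=0x%04x status=0x%04x\n", cmd, status);
}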

/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the pci error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for asic to come out of reset */
	msleep(500);

	/* Restore PCI config space */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d\n", r);
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's
 * OK to resume normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		drm_sched_start(&ring->sched, true);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
}
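
/*
 * Illustrative sketch (not part of this file): the four callbacks above are
 * plugged into the PCI error-recovery core through a struct pci_error_handlers
 * referenced from the driver's struct pci_driver; in amdgpu that wiring lives
 * in amdgpu_drv.c. The instance name below is hypothetical. The PCI core then
 * drives detected -> (mmio_enabled | slot_reset) -> resume depending on the
 * reported channel state.
 */
static const struct pci_error_handlers amdgpu_example_pci_err_handler __maybe_unused = {
	.error_detected	= amdgpu_pci_error_detected,
	.mmio_enabled	= amdgpu_pci_mmio_enabled,
	.slot_reset	= amdgpu_pci_slot_reset,
	.resume		= amdgpu_pci_resume,
};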

bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
			     struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
				  struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It helps preserve the error context when an error occurs.
 * Compared to a simple hang, the system stays stable at least for SSH
 * access, so it should then be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc.),
 *    clears all CPU mappings to the device and disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in-flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
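
/*
 * Illustrative sketch (not part of the driver): the two helpers above access
 * PCIe port registers through an index/data window, so a read-modify-write
 * simply chains a read and a write. The helper name is hypothetical.
 */
static void __maybe_unused amdgpu_example_pcie_port_rmw(struct amdgpu_device *adev,
							u32 reg, u32 set_mask)
{
	u32 v;

	/*
	 * Note: each helper takes adev->pcie_idx_lock on its own, so this
	 * sequence is not atomic with respect to other port register users.
	 */
	v = amdgpu_device_pcie_port_rreg(adev, reg);
	v |= set_mask;
	amdgpu_device_pcie_port_wreg(adev, reg, v);
}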

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!adev->ip_versions[DCE_HWIP][0] ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
		uint32_t inst, uint32_t reg_addr, char reg_name[],
		uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				 inst, reg_name, (uint32_t)expected_value,
				 (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}
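
/*
 * Illustrative sketch (not part of the driver): callers use
 * amdgpu_device_wait_on_rreg() to poll a register until the masked bits reach
 * an expected value, giving up with -ETIMEDOUT once the value stops changing
 * for adev->usec_timeout iterations. The register offset, name and bit mask
 * below are hypothetical placeholders, as is the helper name.
 */
static void __maybe_unused amdgpu_example_wait_block_idle(struct amdgpu_device *adev)
{
	const uint32_t example_status_reg = 0x1234;	/* hypothetical offset */
	const uint32_t example_idle_mask = 0x1;		/* hypothetical IDLE bit */

	/* Poll instance 0 until the (hypothetical) IDLE bit is set, warn on timeout */
	if (amdgpu_device_wait_on_rreg(adev, 0, example_status_reg,
				       "EXAMPLE_STATUS", example_idle_mask,
				       example_idle_mask))
		DRM_WARN("example block did not go idle\n");
}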