/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/devcoredump.h>
#include <generated/utsrelease.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_aperture.h>
#include <drm/drm_atomic_helper.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_fb_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/device.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86)
#include <asm/intel-family.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
"BONAIRE", 110 "KAVERI", 111 "KABINI", 112 "HAWAII", 113 "MULLINS", 114 "TOPAZ", 115 "TONGA", 116 "FIJI", 117 "CARRIZO", 118 "STONEY", 119 "POLARIS10", 120 "POLARIS11", 121 "POLARIS12", 122 "VEGAM", 123 "VEGA10", 124 "VEGA12", 125 "VEGA20", 126 "RAVEN", 127 "ARCTURUS", 128 "RENOIR", 129 "ALDEBARAN", 130 "NAVI10", 131 "CYAN_SKILLFISH", 132 "NAVI14", 133 "NAVI12", 134 "SIENNA_CICHLID", 135 "NAVY_FLOUNDER", 136 "VANGOGH", 137 "DIMGREY_CAVEFISH", 138 "BEIGE_GOBY", 139 "YELLOW_CARP", 140 "IP DISCOVERY", 141 "LAST", 142 }; 143 144 /** 145 * DOC: pcie_replay_count 146 * 147 * The amdgpu driver provides a sysfs API for reporting the total number 148 * of PCIe replays (NAKs) 149 * The file pcie_replay_count is used for this and returns the total 150 * number of replays as a sum of the NAKs generated and NAKs received 151 */ 152 153 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 154 struct device_attribute *attr, char *buf) 155 { 156 struct drm_device *ddev = dev_get_drvdata(dev); 157 struct amdgpu_device *adev = drm_to_adev(ddev); 158 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 159 160 return sysfs_emit(buf, "%llu\n", cnt); 161 } 162 163 static DEVICE_ATTR(pcie_replay_count, 0444, 164 amdgpu_device_get_pcie_replay_count, NULL); 165 166 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 167 168 169 /** 170 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 171 * 172 * @dev: drm_device pointer 173 * 174 * Returns true if the device is a dGPU with ATPX power control, 175 * otherwise return false. 176 */ 177 bool amdgpu_device_supports_px(struct drm_device *dev) 178 { 179 struct amdgpu_device *adev = drm_to_adev(dev); 180 181 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 182 return true; 183 return false; 184 } 185 186 /** 187 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 188 * 189 * @dev: drm_device pointer 190 * 191 * Returns true if the device is a dGPU with ACPI power control, 192 * otherwise return false. 193 */ 194 bool amdgpu_device_supports_boco(struct drm_device *dev) 195 { 196 struct amdgpu_device *adev = drm_to_adev(dev); 197 198 if (adev->has_pr3 || 199 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 200 return true; 201 return false; 202 } 203 204 /** 205 * amdgpu_device_supports_baco - Does the device support BACO 206 * 207 * @dev: drm_device pointer 208 * 209 * Returns true if the device supporte BACO, 210 * otherwise return false. 211 */ 212 bool amdgpu_device_supports_baco(struct drm_device *dev) 213 { 214 struct amdgpu_device *adev = drm_to_adev(dev); 215 216 return amdgpu_asic_supports_baco(adev); 217 } 218 219 /** 220 * amdgpu_device_supports_smart_shift - Is the device dGPU with 221 * smart shift support 222 * 223 * @dev: drm_device pointer 224 * 225 * Returns true if the device is a dGPU with Smart Shift support, 226 * otherwise returns false. 
/**
 * amdgpu_device_supports_smart_shift - Is the device dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

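/*
 * Minimal usage sketch (illustrative only): reading a single dword at
 * VRAM offset 0x1000 through the MM_INDEX/MM_DATA window.
 *
 *	uint32_t val;
 *
 *	amdgpu_device_mm_access(adev, 0x1000, &val, sizeof(val), false);
 */
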
/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes that have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: bytes offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: bytes offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

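/*
 * Illustrative read-modify-write sketch (normally issued through the
 * RREG32()/WREG32() macros rather than by calling these helpers directly);
 * mmFOO is a placeholder register offset, not a real define.
 *
 *	uint32_t tmp = amdgpu_device_rreg(adev, mmFOO, 0);
 *
 *	tmp |= 0x1;
 *	amdgpu_device_wreg(adev, mmFOO, tmp, 0);
 */
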
/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * This function is invoked only for the debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if (adev->nbio.funcs->get_pcie_index_hi_offset)
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64 bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if (adev->nbio.funcs->get_pcie_index_hi_offset)
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64 bit indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	int ret;

	amdgpu_asic_pre_asic_init(adev);

	if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) ||
	    adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with and/or masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

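/*
 * Illustrative sketch only (not real golden settings): the register array
 * is consumed as {offset, and_mask, or_mask} triplets. mmFOO and mmBAR are
 * placeholder offsets, not real defines.
 *
 *	static const u32 fake_golden_regs[] = {
 *		mmFOO, 0xffffffff, 0x00000001,
 *		mmBAR, 0x0000ff00, 0x00003400,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, fake_golden_regs,
 *						ARRAY_SIZE(fake_golden_regs));
 */
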
/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}

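/*
 * Typical usage sketch (illustrative only): a ring or IP block allocates a
 * writeback slot, derives its CPU and GPU addresses, and frees it again on
 * teardown.
 *
 *	u32 wb;
 *
 *	if (!amdgpu_device_wb_get(adev, &wb)) {
 *		volatile u32 *cpu_addr = &adev->wb.wb[wb];
 *		u64 gpu_addr = adev->wb.gpu_addr + wb * 4;
 *
 *		... hand cpu_addr/gpu_addr to the hardware ...
 *
 *		amdgpu_device_wb_free(adev, wb);
 *	}
 */
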
/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned int i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return false;

	return true;
}

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw needs post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if post is needed or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (!amdgpu_device_read_bios(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old smc fw still need driver do vPost otherwise gpu hang, while
		 * those smc fw version above 22.15 doesn't have this flaw, so we force
		 * vpost executed for smc version below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->gmc.xgmi.pending_reset)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic
 * speed switching. Until we have confirmation from Intel that a specific host
 * supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
bool amdgpu_device_pcie_dynamic_switching_supported(void)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	return pcie_aspm_enabled(adev->pdev);
}

bool amdgpu_device_aspm_support_quirk(void)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE);
#else
	return true;
#endif
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
		bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in the page table versus the
 * page directory; a page is 4KB, so we have 12 bits of offset, a minimum of
 * 9 bits in the page table, and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory
	 */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

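/*
 * Worked example (illustrative): with 4KB pages and amdgpu.vm_block_size=9,
 * one page table level covers 2^9 entries * 4KB = 2MB of address space;
 * larger values move more translation bits from the page directory into
 * the page table.
 */
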
/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

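/*
 * Worked example (illustrative): amdgpu.smu_memory_pool_size accepts
 * 1/2/4/8, and the reserved pool is that value << 28 bytes, i.e.
 * 256MB/512MB/1GB/2GB; per the checks above, 1 and 2 require roughly 3GB
 * of system RAM, while 4 and 8 require roughly 7GB.
 */
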
static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if ((adev->pdev->device == 0x13FE) ||
		    (adev->pdev->device == 0x143F))
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		break;
	}

	return 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
		dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
		amdgpu_reset_method = -1;
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	return 0;
}

/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(pdev, PCI_D0);
		amdgpu_device_load_pci_state(pdev);
		r = pci_enable_device(pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
	} else {
		pr_info("switched off\n");
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(pdev);
		/* Shut down the device */
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

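/*
 * Illustrative call (not from this file): gate clocks for all instances of
 * the GFX IP, e.g. from a power-management path.
 *
 *	amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *					       AMD_CG_STATE_GATE);
 */
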
/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u64 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * Returns 0 if the IP block version is equal or greater,
 * 1 if it is smaller or the ip_block doesn't exist.
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

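/*
 * Illustrative check (not from this file): require GFX IP version 8.1 or
 * newer before enabling a feature.
 *
 *	if (!amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX,
 *						8, 1)) {
 *		... GFX major.minor is >= 8.1 ...
 *	}
 */
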
/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	switch (ip_block_version->type) {
	case AMD_IP_BLOCK_TYPE_VCN:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
			return 0;
		break;
	case AMD_IP_BLOCK_TYPE_JPEG:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
			return 0;
		break;
	default:
		break;
	}

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		const char *pci_address_name = pci_name(adev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}

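/*
 * Illustrative module parameter values (not defaults), following the
 * "<pci address>,<crtc count>;..." format parsed above; the PCI address
 * shown is a placeholder, and "all" matches every device per the parser:
 *
 *	amdgpu.virtual_display=0000:26:00.0,2
 *	amdgpu.virtual_display=all,1
 */
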
1904 */ 1905 if (adev->asic_type != CHIP_NAVI12) 1906 return 0; 1907 } 1908 1909 switch (adev->asic_type) { 1910 default: 1911 return 0; 1912 case CHIP_VEGA10: 1913 chip_name = "vega10"; 1914 break; 1915 case CHIP_VEGA12: 1916 chip_name = "vega12"; 1917 break; 1918 case CHIP_RAVEN: 1919 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1920 chip_name = "raven2"; 1921 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1922 chip_name = "picasso"; 1923 else 1924 chip_name = "raven"; 1925 break; 1926 case CHIP_ARCTURUS: 1927 chip_name = "arcturus"; 1928 break; 1929 case CHIP_NAVI12: 1930 chip_name = "navi12"; 1931 break; 1932 } 1933 1934 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 1935 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name); 1936 if (err) { 1937 dev_err(adev->dev, 1938 "Failed to get gpu_info firmware \"%s\"\n", 1939 fw_name); 1940 goto out; 1941 } 1942 1943 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 1944 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 1945 1946 switch (hdr->version_major) { 1947 case 1: 1948 { 1949 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 1950 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 1951 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1952 1953 /* 1954 * Should be droped when DAL no longer needs it. 1955 */ 1956 if (adev->asic_type == CHIP_NAVI12) 1957 goto parse_soc_bounding_box; 1958 1959 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 1960 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 1961 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 1962 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 1963 adev->gfx.config.max_texture_channel_caches = 1964 le32_to_cpu(gpu_info_fw->gc_num_tccs); 1965 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 1966 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 1967 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 1968 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 1969 adev->gfx.config.double_offchip_lds_buf = 1970 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 1971 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 1972 adev->gfx.cu_info.max_waves_per_simd = 1973 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 1974 adev->gfx.cu_info.max_scratch_slots_per_cu = 1975 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 1976 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 1977 if (hdr->version_minor >= 1) { 1978 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 1979 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 1980 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1981 adev->gfx.config.num_sc_per_sh = 1982 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 1983 adev->gfx.config.num_packer_per_sc = 1984 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 1985 } 1986 1987 parse_soc_bounding_box: 1988 /* 1989 * soc bounding box info is not integrated in disocovery table, 1990 * we always need to parse it from gpu info firmware if needed. 
1991 */ 1992 if (hdr->version_minor == 2) { 1993 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 1994 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 1995 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1996 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 1997 } 1998 break; 1999 } 2000 default: 2001 dev_err(adev->dev, 2002 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2003 err = -EINVAL; 2004 goto out; 2005 } 2006 out: 2007 return err; 2008 } 2009 2010 /** 2011 * amdgpu_device_ip_early_init - run early init for hardware IPs 2012 * 2013 * @adev: amdgpu_device pointer 2014 * 2015 * Early initialization pass for hardware IPs. The hardware IPs that make 2016 * up each asic are discovered each IP's early_init callback is run. This 2017 * is the first stage in initializing the asic. 2018 * Returns 0 on success, negative error code on failure. 2019 */ 2020 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2021 { 2022 struct pci_dev *parent; 2023 int i, r; 2024 bool total; 2025 2026 amdgpu_device_enable_virtual_display(adev); 2027 2028 if (amdgpu_sriov_vf(adev)) { 2029 r = amdgpu_virt_request_full_gpu(adev, true); 2030 if (r) 2031 return r; 2032 } 2033 2034 switch (adev->asic_type) { 2035 #ifdef CONFIG_DRM_AMDGPU_SI 2036 case CHIP_VERDE: 2037 case CHIP_TAHITI: 2038 case CHIP_PITCAIRN: 2039 case CHIP_OLAND: 2040 case CHIP_HAINAN: 2041 adev->family = AMDGPU_FAMILY_SI; 2042 r = si_set_ip_blocks(adev); 2043 if (r) 2044 return r; 2045 break; 2046 #endif 2047 #ifdef CONFIG_DRM_AMDGPU_CIK 2048 case CHIP_BONAIRE: 2049 case CHIP_HAWAII: 2050 case CHIP_KAVERI: 2051 case CHIP_KABINI: 2052 case CHIP_MULLINS: 2053 if (adev->flags & AMD_IS_APU) 2054 adev->family = AMDGPU_FAMILY_KV; 2055 else 2056 adev->family = AMDGPU_FAMILY_CI; 2057 2058 r = cik_set_ip_blocks(adev); 2059 if (r) 2060 return r; 2061 break; 2062 #endif 2063 case CHIP_TOPAZ: 2064 case CHIP_TONGA: 2065 case CHIP_FIJI: 2066 case CHIP_POLARIS10: 2067 case CHIP_POLARIS11: 2068 case CHIP_POLARIS12: 2069 case CHIP_VEGAM: 2070 case CHIP_CARRIZO: 2071 case CHIP_STONEY: 2072 if (adev->flags & AMD_IS_APU) 2073 adev->family = AMDGPU_FAMILY_CZ; 2074 else 2075 adev->family = AMDGPU_FAMILY_VI; 2076 2077 r = vi_set_ip_blocks(adev); 2078 if (r) 2079 return r; 2080 break; 2081 default: 2082 r = amdgpu_discovery_set_ip_blocks(adev); 2083 if (r) 2084 return r; 2085 break; 2086 } 2087 2088 if (amdgpu_has_atpx() && 2089 (amdgpu_is_atpx_hybrid() || 2090 amdgpu_has_atpx_dgpu_power_cntl()) && 2091 ((adev->flags & AMD_IS_APU) == 0) && 2092 !dev_is_removable(&adev->pdev->dev)) 2093 adev->flags |= AMD_IS_PX; 2094 2095 if (!(adev->flags & AMD_IS_APU)) { 2096 parent = pcie_find_root_port(adev->pdev); 2097 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2098 } 2099 2100 2101 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2102 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2103 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2104 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2105 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2106 if (!amdgpu_device_pcie_dynamic_switching_supported()) 2107 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2108 2109 total = true; 2110 for (i = 0; i < adev->num_ip_blocks; i++) { 2111 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2112 DRM_WARN("disabled ip block: %d <%s>\n", 2113 i, adev->ip_blocks[i].version->funcs->name); 2114 adev->ip_blocks[i].status.valid = false; 2115 } else { 2116 if (adev->ip_blocks[i].version->funcs->early_init) { 2117 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2118 if (r == -ENOENT) { 2119 adev->ip_blocks[i].status.valid = false; 2120 } else if (r) { 2121 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2122 adev->ip_blocks[i].version->funcs->name, r); 2123 total = false; 2124 } else { 2125 adev->ip_blocks[i].status.valid = true; 2126 } 2127 } else { 2128 adev->ip_blocks[i].status.valid = true; 2129 } 2130 } 2131 /* get the vbios after the asic_funcs are set up */ 2132 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2133 r = amdgpu_device_parse_gpu_info_fw(adev); 2134 if (r) 2135 return r; 2136 2137 /* Read BIOS */ 2138 if (amdgpu_device_read_bios(adev)) { 2139 if (!amdgpu_get_bios(adev)) 2140 return -EINVAL; 2141 2142 r = amdgpu_atombios_init(adev); 2143 if (r) { 2144 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2145 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2146 return r; 2147 } 2148 } 2149 2150 /*get pf2vf msg info at it's earliest time*/ 2151 if (amdgpu_sriov_vf(adev)) 2152 amdgpu_virt_init_data_exchange(adev); 2153 2154 } 2155 } 2156 if (!total) 2157 return -ENODEV; 2158 2159 amdgpu_amdkfd_device_probe(adev); 2160 adev->cg_flags &= amdgpu_cg_mask; 2161 adev->pg_flags &= amdgpu_pg_mask; 2162 2163 return 0; 2164 } 2165 2166 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2167 { 2168 int i, r; 2169 2170 for (i = 0; i < adev->num_ip_blocks; i++) { 2171 if (!adev->ip_blocks[i].status.sw) 2172 continue; 2173 if (adev->ip_blocks[i].status.hw) 2174 continue; 2175 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2176 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2177 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2178 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2179 if (r) { 2180 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2181 adev->ip_blocks[i].version->funcs->name, r); 2182 return r; 2183 } 2184 adev->ip_blocks[i].status.hw = true; 2185 } 2186 } 2187 2188 return 0; 2189 } 2190 2191 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2192 { 2193 int i, r; 2194 2195 for (i = 0; i < adev->num_ip_blocks; i++) { 2196 if (!adev->ip_blocks[i].status.sw) 2197 continue; 2198 if (adev->ip_blocks[i].status.hw) 2199 continue; 2200 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2201 if (r) { 2202 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2203 adev->ip_blocks[i].version->funcs->name, r); 2204 return r; 2205 } 2206 adev->ip_blocks[i].status.hw = true; 2207 } 2208 2209 return 0; 2210 } 2211 2212 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2213 { 2214 int r = 0; 2215 int i; 2216 uint32_t 
smu_version; 2217 2218 if (adev->asic_type >= CHIP_VEGA10) { 2219 for (i = 0; i < adev->num_ip_blocks; i++) { 2220 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2221 continue; 2222 2223 if (!adev->ip_blocks[i].status.sw) 2224 continue; 2225 2226 /* no need to do the fw loading again if already done*/ 2227 if (adev->ip_blocks[i].status.hw == true) 2228 break; 2229 2230 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2231 r = adev->ip_blocks[i].version->funcs->resume(adev); 2232 if (r) { 2233 DRM_ERROR("resume of IP block <%s> failed %d\n", 2234 adev->ip_blocks[i].version->funcs->name, r); 2235 return r; 2236 } 2237 } else { 2238 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2239 if (r) { 2240 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2241 adev->ip_blocks[i].version->funcs->name, r); 2242 return r; 2243 } 2244 } 2245 2246 adev->ip_blocks[i].status.hw = true; 2247 break; 2248 } 2249 } 2250 2251 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2252 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2253 2254 return r; 2255 } 2256 2257 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2258 { 2259 long timeout; 2260 int r, i; 2261 2262 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2263 struct amdgpu_ring *ring = adev->rings[i]; 2264 2265 /* No need to setup the GPU scheduler for rings that don't need it */ 2266 if (!ring || ring->no_scheduler) 2267 continue; 2268 2269 switch (ring->funcs->type) { 2270 case AMDGPU_RING_TYPE_GFX: 2271 timeout = adev->gfx_timeout; 2272 break; 2273 case AMDGPU_RING_TYPE_COMPUTE: 2274 timeout = adev->compute_timeout; 2275 break; 2276 case AMDGPU_RING_TYPE_SDMA: 2277 timeout = adev->sdma_timeout; 2278 break; 2279 default: 2280 timeout = adev->video_timeout; 2281 break; 2282 } 2283 2284 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2285 ring->num_hw_submission, 0, 2286 timeout, adev->reset_domain->wq, 2287 ring->sched_score, ring->name, 2288 adev->dev); 2289 if (r) { 2290 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2291 ring->name); 2292 return r; 2293 } 2294 } 2295 2296 amdgpu_xcp_update_partition_sched_list(adev); 2297 2298 return 0; 2299 } 2300 2301 2302 /** 2303 * amdgpu_device_ip_init - run init for hardware IPs 2304 * 2305 * @adev: amdgpu_device pointer 2306 * 2307 * Main initialization pass for hardware IPs. The list of all the hardware 2308 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2309 * are run. sw_init initializes the software state associated with each IP 2310 * and hw_init initializes the hardware associated with each IP. 2311 * Returns 0 on success, negative error code on failure. 
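 * Note on ordering: COMMON and GMC have their hw_init run inside this pass itself, so that register access and GPU memory are available to later blocks; the remaining hw_init calls are issued by the phase1/phase2 helpers, with firmware loading in between.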
2312 */ 2313 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2314 { 2315 int i, r; 2316 2317 r = amdgpu_ras_init(adev); 2318 if (r) 2319 return r; 2320 2321 for (i = 0; i < adev->num_ip_blocks; i++) { 2322 if (!adev->ip_blocks[i].status.valid) 2323 continue; 2324 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2325 if (r) { 2326 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2327 adev->ip_blocks[i].version->funcs->name, r); 2328 goto init_failed; 2329 } 2330 adev->ip_blocks[i].status.sw = true; 2331 2332 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2333 /* need to do common hw init early so everything is set up for gmc */ 2334 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2335 if (r) { 2336 DRM_ERROR("hw_init %d failed %d\n", i, r); 2337 goto init_failed; 2338 } 2339 adev->ip_blocks[i].status.hw = true; 2340 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2341 /* need to do gmc hw init early so we can allocate gpu mem */ 2342 /* Try to reserve bad pages early */ 2343 if (amdgpu_sriov_vf(adev)) 2344 amdgpu_virt_exchange_data(adev); 2345 2346 r = amdgpu_device_mem_scratch_init(adev); 2347 if (r) { 2348 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2349 goto init_failed; 2350 } 2351 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2352 if (r) { 2353 DRM_ERROR("hw_init %d failed %d\n", i, r); 2354 goto init_failed; 2355 } 2356 r = amdgpu_device_wb_init(adev); 2357 if (r) { 2358 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2359 goto init_failed; 2360 } 2361 adev->ip_blocks[i].status.hw = true; 2362 2363 /* right after GMC hw init, we create CSA */ 2364 if (adev->gfx.mcbp) { 2365 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2366 AMDGPU_GEM_DOMAIN_VRAM | 2367 AMDGPU_GEM_DOMAIN_GTT, 2368 AMDGPU_CSA_SIZE); 2369 if (r) { 2370 DRM_ERROR("allocate CSA failed %d\n", r); 2371 goto init_failed; 2372 } 2373 } 2374 } 2375 } 2376 2377 if (amdgpu_sriov_vf(adev)) 2378 amdgpu_virt_init_data_exchange(adev); 2379 2380 r = amdgpu_ib_pool_init(adev); 2381 if (r) { 2382 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2383 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2384 goto init_failed; 2385 } 2386 2387 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2388 if (r) 2389 goto init_failed; 2390 2391 r = amdgpu_device_ip_hw_init_phase1(adev); 2392 if (r) 2393 goto init_failed; 2394 2395 r = amdgpu_device_fw_loading(adev); 2396 if (r) 2397 goto init_failed; 2398 2399 r = amdgpu_device_ip_hw_init_phase2(adev); 2400 if (r) 2401 goto init_failed; 2402 2403 /* 2404 * retired pages will be loaded from eeprom and reserved here, 2405 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2406 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2407 * for I2C communication which only true at this point. 2408 * 2409 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2410 * failure from bad gpu situation and stop amdgpu init process 2411 * accordingly. For other failed cases, it will still release all 2412 * the resource and print error message, rather than returning one 2413 * negative value to upper level. 
2414 * 2415 * Note: theoretically, this should be called before all vram allocations 2416 * to protect retired pages from being used again 2417 */ 2418 r = amdgpu_ras_recovery_init(adev); 2419 if (r) 2420 goto init_failed; 2421 2422 /* 2423 * In case of XGMI grab extra reference for reset domain for this device 2424 */ 2425 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2426 if (amdgpu_xgmi_add_device(adev) == 0) { 2427 if (!amdgpu_sriov_vf(adev)) { 2428 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2429 2430 if (WARN_ON(!hive)) { 2431 r = -ENOENT; 2432 goto init_failed; 2433 } 2434 2435 if (!hive->reset_domain || 2436 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2437 r = -ENOENT; 2438 amdgpu_put_xgmi_hive(hive); 2439 goto init_failed; 2440 } 2441 2442 /* Drop the early temporary reset domain we created for device */ 2443 amdgpu_reset_put_reset_domain(adev->reset_domain); 2444 adev->reset_domain = hive->reset_domain; 2445 amdgpu_put_xgmi_hive(hive); 2446 } 2447 } 2448 } 2449 2450 r = amdgpu_device_init_schedulers(adev); 2451 if (r) 2452 goto init_failed; 2453 2454 /* Don't init kfd if the whole hive needs to be reset during init */ 2455 if (!adev->gmc.xgmi.pending_reset) { 2456 kgd2kfd_init_zone_device(adev); 2457 amdgpu_amdkfd_device_init(adev); 2458 } 2459 2460 amdgpu_fru_get_product_info(adev); 2461 2462 init_failed: 2463 2464 return r; 2465 } 2466 2467 /** 2468 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2469 * 2470 * @adev: amdgpu_device pointer 2471 * 2472 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2473 * this function before a GPU reset. If the value is retained after a 2474 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 2475 */ 2476 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2477 { 2478 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2479 } 2480 2481 /** 2482 * amdgpu_device_check_vram_lost - check if vram is valid 2483 * 2484 * @adev: amdgpu_device pointer 2485 * 2486 * Checks the reset magic value written to the gart pointer in VRAM. 2487 * The driver calls this after a GPU reset to see if the contents of 2488 * VRAM were lost or not. 2489 * Returns true if vram is lost, false if not. 2490 */ 2491 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2492 { 2493 if (memcmp(adev->gart.ptr, adev->reset_magic, 2494 AMDGPU_RESET_MAGIC_NUM)) 2495 return true; 2496 2497 if (!amdgpu_in_reset(adev)) 2498 return false; 2499 2500 /* 2501 * For all ASICs with baco/mode1 reset, the VRAM is 2502 * always assumed to be lost. 2503 */ 2504 switch (amdgpu_asic_reset_method(adev)) { 2505 case AMD_RESET_METHOD_BACO: 2506 case AMD_RESET_METHOD_MODE1: 2507 return true; 2508 default: 2509 return false; 2510 } 2511 } 2512 2513 /** 2514 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2515 * 2516 * @adev: amdgpu_device pointer 2517 * @state: clockgating state (gate or ungate) 2518 * 2519 * The list of all the hardware IPs that make up the asic is walked and the 2520 * set_clockgating_state callbacks are run. 2521 * The late initialization pass enables clockgating for hardware IPs; 2522 * the fini or suspend pass disables it. 2523 * Returns 0 on success, negative error code on failure.
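 * The walk direction depends on @state: gating proceeds in IP order, ungating walks the list in reverse order.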
2524 */ 2525 2526 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2527 enum amd_clockgating_state state) 2528 { 2529 int i, j, r; 2530 2531 if (amdgpu_emu_mode == 1) 2532 return 0; 2533 2534 for (j = 0; j < adev->num_ip_blocks; j++) { 2535 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2536 if (!adev->ip_blocks[i].status.late_initialized) 2537 continue; 2538 /* skip CG for GFX, SDMA on S0ix */ 2539 if (adev->in_s0ix && 2540 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2541 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2542 continue; 2543 /* skip CG for VCE/UVD, it's handled specially */ 2544 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2545 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2546 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2547 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2548 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2549 /* enable clockgating to save power */ 2550 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2551 state); 2552 if (r) { 2553 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2554 adev->ip_blocks[i].version->funcs->name, r); 2555 return r; 2556 } 2557 } 2558 } 2559 2560 return 0; 2561 } 2562 2563 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2564 enum amd_powergating_state state) 2565 { 2566 int i, j, r; 2567 2568 if (amdgpu_emu_mode == 1) 2569 return 0; 2570 2571 for (j = 0; j < adev->num_ip_blocks; j++) { 2572 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2573 if (!adev->ip_blocks[i].status.late_initialized) 2574 continue; 2575 /* skip PG for GFX, SDMA on S0ix */ 2576 if (adev->in_s0ix && 2577 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2578 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2579 continue; 2580 /* skip CG for VCE/UVD, it's handled specially */ 2581 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2582 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2583 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2584 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2585 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2586 /* enable powergating to save power */ 2587 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2588 state); 2589 if (r) { 2590 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2591 adev->ip_blocks[i].version->funcs->name, r); 2592 return r; 2593 } 2594 } 2595 } 2596 return 0; 2597 } 2598 2599 static int amdgpu_device_enable_mgpu_fan_boost(void) 2600 { 2601 struct amdgpu_gpu_instance *gpu_ins; 2602 struct amdgpu_device *adev; 2603 int i, ret = 0; 2604 2605 mutex_lock(&mgpu_info.mutex); 2606 2607 /* 2608 * MGPU fan boost feature should be enabled 2609 * only when there are two or more dGPUs in 2610 * the system 2611 */ 2612 if (mgpu_info.num_dgpu < 2) 2613 goto out; 2614 2615 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2616 gpu_ins = &(mgpu_info.gpu_ins[i]); 2617 adev = gpu_ins->adev; 2618 if (!(adev->flags & AMD_IS_APU) && 2619 !gpu_ins->mgpu_fan_enabled) { 2620 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2621 if (ret) 2622 break; 2623 2624 gpu_ins->mgpu_fan_enabled = 1; 2625 } 2626 } 2627 2628 out: 2629 mutex_unlock(&mgpu_info.mutex); 2630 2631 return ret; 2632 } 2633 2634 /** 2635 * amdgpu_device_ip_late_init - run late init for hardware IPs 2636 * 2637 * @adev: 
amdgpu_device pointer 2638 * 2639 * Late initialization pass for hardware IPs. The list of all the hardware 2640 * IPs that make up the asic is walked and the late_init callbacks are run. 2641 * late_init covers any special initialization that an IP requires 2642 * after all of the IPs have been initialized, or anything that needs to happen 2643 * late in the init process. 2644 * Returns 0 on success, negative error code on failure. 2645 */ 2646 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2647 { 2648 struct amdgpu_gpu_instance *gpu_instance; 2649 int i = 0, r; 2650 2651 for (i = 0; i < adev->num_ip_blocks; i++) { 2652 if (!adev->ip_blocks[i].status.hw) 2653 continue; 2654 if (adev->ip_blocks[i].version->funcs->late_init) { 2655 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2656 if (r) { 2657 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2658 adev->ip_blocks[i].version->funcs->name, r); 2659 return r; 2660 } 2661 } 2662 adev->ip_blocks[i].status.late_initialized = true; 2663 } 2664 2665 r = amdgpu_ras_late_init(adev); 2666 if (r) { 2667 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 2668 return r; 2669 } 2670 2671 amdgpu_ras_set_error_query_ready(adev, true); 2672 2673 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2674 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2675 2676 amdgpu_device_fill_reset_magic(adev); 2677 2678 r = amdgpu_device_enable_mgpu_fan_boost(); 2679 if (r) 2680 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2681 2682 /* For passthrough configuration on arcturus and aldebaran, enable special SBR handling */ 2683 if (amdgpu_passthrough(adev) && 2684 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 2685 adev->asic_type == CHIP_ALDEBARAN)) 2686 amdgpu_dpm_handle_passthrough_sbr(adev, true); 2687 2688 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2689 mutex_lock(&mgpu_info.mutex); 2690 2691 /* 2692 * Reset device p-state to low as it was booted with high. 2693 * 2694 * This should be performed only after all devices from the same 2695 * hive get initialized. 2696 * 2697 * However, the number of devices in the hive is not known in advance; 2698 * it is counted one by one as each device initializes. 2699 * 2700 * So we wait until all XGMI interlinked devices are initialized. 2701 * This may bring some delays as those devices may come from 2702 * different hives. But that should be OK.
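 * (The check below compares mgpu_info.num_dgpu with the number of XGMI nodes to detect when the last device of the hive has registered.)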
2703 */ 2704 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2705 for (i = 0; i < mgpu_info.num_gpu; i++) { 2706 gpu_instance = &(mgpu_info.gpu_ins[i]); 2707 if (gpu_instance->adev->flags & AMD_IS_APU) 2708 continue; 2709 2710 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2711 AMDGPU_XGMI_PSTATE_MIN); 2712 if (r) { 2713 DRM_ERROR("pstate setting failed (%d).\n", r); 2714 break; 2715 } 2716 } 2717 } 2718 2719 mutex_unlock(&mgpu_info.mutex); 2720 } 2721 2722 return 0; 2723 } 2724 2725 /** 2726 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2727 * 2728 * @adev: amdgpu_device pointer 2729 * 2730 * For ASICs that need to disable SMC first 2731 */ 2732 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2733 { 2734 int i, r; 2735 2736 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) 2737 return; 2738 2739 for (i = 0; i < adev->num_ip_blocks; i++) { 2740 if (!adev->ip_blocks[i].status.hw) 2741 continue; 2742 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2743 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2744 /* XXX handle errors */ 2745 if (r) { 2746 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2747 adev->ip_blocks[i].version->funcs->name, r); 2748 } 2749 adev->ip_blocks[i].status.hw = false; 2750 break; 2751 } 2752 } 2753 } 2754 2755 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2756 { 2757 int i, r; 2758 2759 for (i = 0; i < adev->num_ip_blocks; i++) { 2760 if (!adev->ip_blocks[i].version->funcs->early_fini) 2761 continue; 2762 2763 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2764 if (r) { 2765 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2766 adev->ip_blocks[i].version->funcs->name, r); 2767 } 2768 } 2769 2770 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2771 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2772 2773 amdgpu_amdkfd_suspend(adev, false); 2774 2775 /* Workaround for ASICs that need to disable SMC first */ 2776 amdgpu_device_smu_fini_early(adev); 2777 2778 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2779 if (!adev->ip_blocks[i].status.hw) 2780 continue; 2781 2782 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2783 /* XXX handle errors */ 2784 if (r) { 2785 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2786 adev->ip_blocks[i].version->funcs->name, r); 2787 } 2788 2789 adev->ip_blocks[i].status.hw = false; 2790 } 2791 2792 if (amdgpu_sriov_vf(adev)) { 2793 if (amdgpu_virt_release_full_gpu(adev, false)) 2794 DRM_ERROR("failed to release exclusive mode on fini\n"); 2795 } 2796 2797 return 0; 2798 } 2799 2800 /** 2801 * amdgpu_device_ip_fini - run fini for hardware IPs 2802 * 2803 * @adev: amdgpu_device pointer 2804 * 2805 * Main teardown pass for hardware IPs. The list of all the hardware 2806 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2807 * are run. hw_fini tears down the hardware associated with each IP 2808 * and sw_fini tears down any software state associated with each IP. 2809 * Returns 0 on success, negative error code on failure.
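 * Blocks are torn down in reverse order relative to init, so dependent IPs are finalized first.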
2810 */ 2811 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2812 { 2813 int i, r; 2814 2815 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2816 amdgpu_virt_release_ras_err_handler_data(adev); 2817 2818 if (adev->gmc.xgmi.num_physical_nodes > 1) 2819 amdgpu_xgmi_remove_device(adev); 2820 2821 amdgpu_amdkfd_device_fini_sw(adev); 2822 2823 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2824 if (!adev->ip_blocks[i].status.sw) 2825 continue; 2826 2827 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2828 amdgpu_ucode_free_bo(adev); 2829 amdgpu_free_static_csa(&adev->virt.csa_obj); 2830 amdgpu_device_wb_fini(adev); 2831 amdgpu_device_mem_scratch_fini(adev); 2832 amdgpu_ib_pool_fini(adev); 2833 } 2834 2835 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2836 /* XXX handle errors */ 2837 if (r) { 2838 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2839 adev->ip_blocks[i].version->funcs->name, r); 2840 } 2841 adev->ip_blocks[i].status.sw = false; 2842 adev->ip_blocks[i].status.valid = false; 2843 } 2844 2845 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2846 if (!adev->ip_blocks[i].status.late_initialized) 2847 continue; 2848 if (adev->ip_blocks[i].version->funcs->late_fini) 2849 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2850 adev->ip_blocks[i].status.late_initialized = false; 2851 } 2852 2853 amdgpu_ras_fini(adev); 2854 2855 return 0; 2856 } 2857 2858 /** 2859 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2860 * 2861 * @work: work_struct. 2862 */ 2863 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2864 { 2865 struct amdgpu_device *adev = 2866 container_of(work, struct amdgpu_device, delayed_init_work.work); 2867 int r; 2868 2869 r = amdgpu_ib_ring_tests(adev); 2870 if (r) 2871 DRM_ERROR("ib ring test failed (%d).\n", r); 2872 } 2873 2874 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2875 { 2876 struct amdgpu_device *adev = 2877 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2878 2879 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2880 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2881 2882 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2883 adev->gfx.gfx_off_state = true; 2884 } 2885 2886 /** 2887 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2888 * 2889 * @adev: amdgpu_device pointer 2890 * 2891 * Main suspend function for hardware IPs. The list of all the hardware 2892 * IPs that make up the asic is walked, clockgating is disabled and the 2893 * suspend callbacks are run. suspend puts the hardware and software state 2894 * in each IP into a state suitable for suspend. 2895 * Returns 0 on success, negative error code on failure. 2896 */ 2897 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2898 { 2899 int i, r; 2900 2901 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2902 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2903 2904 /* 2905 * Per PMFW team's suggestion, driver needs to handle gfxoff 2906 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 2907 * scenario. Add the missing df cstate disablement here. 
2908 */ 2909 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 2910 dev_warn(adev->dev, "Failed to disallow df cstate"); 2911 2912 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2913 if (!adev->ip_blocks[i].status.valid) 2914 continue; 2915 2916 /* displays are handled separately */ 2917 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2918 continue; 2919 2920 /* XXX handle errors */ 2921 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2922 /* XXX handle errors */ 2923 if (r) { 2924 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2925 adev->ip_blocks[i].version->funcs->name, r); 2926 return r; 2927 } 2928 2929 adev->ip_blocks[i].status.hw = false; 2930 } 2931 2932 return 0; 2933 } 2934 2935 /** 2936 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2937 * 2938 * @adev: amdgpu_device pointer 2939 * 2940 * Main suspend function for hardware IPs. The list of all the hardware 2941 * IPs that make up the asic is walked, clockgating is disabled and the 2942 * suspend callbacks are run. suspend puts the hardware and software state 2943 * in each IP into a state suitable for suspend. 2944 * Returns 0 on success, negative error code on failure. 2945 */ 2946 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2947 { 2948 int i, r; 2949 2950 if (adev->in_s0ix) 2951 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 2952 2953 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2954 if (!adev->ip_blocks[i].status.valid) 2955 continue; 2956 /* displays are handled in phase1 */ 2957 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2958 continue; 2959 /* PSP lost connection when err_event_athub occurs */ 2960 if (amdgpu_ras_intr_triggered() && 2961 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2962 adev->ip_blocks[i].status.hw = false; 2963 continue; 2964 } 2965 2966 /* skip unnecessary suspend if we do not initialize them yet */ 2967 if (adev->gmc.xgmi.pending_reset && 2968 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2969 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 2970 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2971 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 2972 adev->ip_blocks[i].status.hw = false; 2973 continue; 2974 } 2975 2976 /* skip suspend of gfx/mes and psp for S0ix 2977 * gfx is in gfxoff state, so on resume it will exit gfxoff just 2978 * like at runtime. PSP is also part of the always on hardware 2979 * so no need to suspend it. 2980 */ 2981 if (adev->in_s0ix && 2982 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 2983 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2984 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 2985 continue; 2986 2987 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 2988 if (adev->in_s0ix && 2989 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) && 2990 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2991 continue; 2992 2993 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 2994 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 2995 * from this location and RLC Autoload automatically also gets loaded 2996 * from here based on PMFW -> PSP message during re-init sequence. 2997 * Therefore, the psp suspend & resume should be skipped to avoid destroy 2998 * the TMR and reload FWs again for IMU enabled APU ASICs. 
2999 */ 3000 if (amdgpu_in_reset(adev) && 3001 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3002 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3003 continue; 3004 3005 /* XXX handle errors */ 3006 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3007 /* XXX handle errors */ 3008 if (r) { 3009 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3010 adev->ip_blocks[i].version->funcs->name, r); 3011 } 3012 adev->ip_blocks[i].status.hw = false; 3013 /* handle putting the SMC in the appropriate state */ 3014 if (!amdgpu_sriov_vf(adev)) { 3015 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3016 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3017 if (r) { 3018 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3019 adev->mp1_state, r); 3020 return r; 3021 } 3022 } 3023 } 3024 } 3025 3026 return 0; 3027 } 3028 3029 /** 3030 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3031 * 3032 * @adev: amdgpu_device pointer 3033 * 3034 * Main suspend function for hardware IPs. The list of all the hardware 3035 * IPs that make up the asic is walked, clockgating is disabled and the 3036 * suspend callbacks are run. suspend puts the hardware and software state 3037 * in each IP into a state suitable for suspend. 3038 * Returns 0 on success, negative error code on failure. 3039 */ 3040 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3041 { 3042 int r; 3043 3044 if (amdgpu_sriov_vf(adev)) { 3045 amdgpu_virt_fini_data_exchange(adev); 3046 amdgpu_virt_request_full_gpu(adev, false); 3047 } 3048 3049 r = amdgpu_device_ip_suspend_phase1(adev); 3050 if (r) 3051 return r; 3052 r = amdgpu_device_ip_suspend_phase2(adev); 3053 3054 if (amdgpu_sriov_vf(adev)) 3055 amdgpu_virt_release_full_gpu(adev, false); 3056 3057 return r; 3058 } 3059 3060 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3061 { 3062 int i, r; 3063 3064 static enum amd_ip_block_type ip_order[] = { 3065 AMD_IP_BLOCK_TYPE_COMMON, 3066 AMD_IP_BLOCK_TYPE_GMC, 3067 AMD_IP_BLOCK_TYPE_PSP, 3068 AMD_IP_BLOCK_TYPE_IH, 3069 }; 3070 3071 for (i = 0; i < adev->num_ip_blocks; i++) { 3072 int j; 3073 struct amdgpu_ip_block *block; 3074 3075 block = &adev->ip_blocks[i]; 3076 block->status.hw = false; 3077 3078 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3079 3080 if (block->version->type != ip_order[j] || 3081 !block->status.valid) 3082 continue; 3083 3084 r = block->version->funcs->hw_init(adev); 3085 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3086 if (r) 3087 return r; 3088 block->status.hw = true; 3089 } 3090 } 3091 3092 return 0; 3093 } 3094 3095 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3096 { 3097 int i, r; 3098 3099 static enum amd_ip_block_type ip_order[] = { 3100 AMD_IP_BLOCK_TYPE_SMC, 3101 AMD_IP_BLOCK_TYPE_DCE, 3102 AMD_IP_BLOCK_TYPE_GFX, 3103 AMD_IP_BLOCK_TYPE_SDMA, 3104 AMD_IP_BLOCK_TYPE_MES, 3105 AMD_IP_BLOCK_TYPE_UVD, 3106 AMD_IP_BLOCK_TYPE_VCE, 3107 AMD_IP_BLOCK_TYPE_VCN, 3108 AMD_IP_BLOCK_TYPE_JPEG 3109 }; 3110 3111 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3112 int j; 3113 struct amdgpu_ip_block *block; 3114 3115 for (j = 0; j < adev->num_ip_blocks; j++) { 3116 block = &adev->ip_blocks[j]; 3117 3118 if (block->version->type != ip_order[i] || 3119 !block->status.valid || 3120 block->status.hw) 3121 continue; 3122 3123 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3124 r = block->version->funcs->resume(adev); 3125 else 3126 r = block->version->funcs->hw_init(adev); 3127 3128 
DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3129 if (r) 3130 return r; 3131 block->status.hw = true; 3132 } 3133 } 3134 3135 return 0; 3136 } 3137 3138 /** 3139 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3140 * 3141 * @adev: amdgpu_device pointer 3142 * 3143 * First resume function for hardware IPs. The list of all the hardware 3144 * IPs that make up the asic is walked and the resume callbacks are run for 3145 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3146 * after a suspend and updates the software state as necessary. This 3147 * function is also used for restoring the GPU after a GPU reset. 3148 * Returns 0 on success, negative error code on failure. 3149 */ 3150 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3151 { 3152 int i, r; 3153 3154 for (i = 0; i < adev->num_ip_blocks; i++) { 3155 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3156 continue; 3157 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3158 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3159 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3160 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3161 3162 r = adev->ip_blocks[i].version->funcs->resume(adev); 3163 if (r) { 3164 DRM_ERROR("resume of IP block <%s> failed %d\n", 3165 adev->ip_blocks[i].version->funcs->name, r); 3166 return r; 3167 } 3168 adev->ip_blocks[i].status.hw = true; 3169 } 3170 } 3171 3172 return 0; 3173 } 3174 3175 /** 3176 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3177 * 3178 * @adev: amdgpu_device pointer 3179 * 3180 * First resume function for hardware IPs. The list of all the hardware 3181 * IPs that make up the asic is walked and the resume callbacks are run for 3182 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3183 * functional state after a suspend and updates the software state as 3184 * necessary. This function is also used for restoring the GPU after a GPU 3185 * reset. 3186 * Returns 0 on success, negative error code on failure. 3187 */ 3188 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3189 { 3190 int i, r; 3191 3192 for (i = 0; i < adev->num_ip_blocks; i++) { 3193 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3194 continue; 3195 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3196 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3197 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3198 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3199 continue; 3200 r = adev->ip_blocks[i].version->funcs->resume(adev); 3201 if (r) { 3202 DRM_ERROR("resume of IP block <%s> failed %d\n", 3203 adev->ip_blocks[i].version->funcs->name, r); 3204 return r; 3205 } 3206 adev->ip_blocks[i].status.hw = true; 3207 } 3208 3209 return 0; 3210 } 3211 3212 /** 3213 * amdgpu_device_ip_resume - run resume for hardware IPs 3214 * 3215 * @adev: amdgpu_device pointer 3216 * 3217 * Main resume function for hardware IPs. The hardware IPs 3218 * are split into two resume functions because they are 3219 * also used in recovering from a GPU reset and some additional 3220 * steps need to be take between them. In this case (S3/S4) they are 3221 * run sequentially. 3222 * Returns 0 on success, negative error code on failure. 
3223 */ 3224 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3225 { 3226 int r; 3227 3228 r = amdgpu_device_ip_resume_phase1(adev); 3229 if (r) 3230 return r; 3231 3232 r = amdgpu_device_fw_loading(adev); 3233 if (r) 3234 return r; 3235 3236 r = amdgpu_device_ip_resume_phase2(adev); 3237 3238 return r; 3239 } 3240 3241 /** 3242 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3243 * 3244 * @adev: amdgpu_device pointer 3245 * 3246 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3247 */ 3248 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3249 { 3250 if (amdgpu_sriov_vf(adev)) { 3251 if (adev->is_atom_fw) { 3252 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3253 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3254 } else { 3255 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3256 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3257 } 3258 3259 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3260 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3261 } 3262 } 3263 3264 /** 3265 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3266 * 3267 * @asic_type: AMD asic type 3268 * 3269 * Check if there is DC (new modesetting infrastructre) support for an asic. 3270 * returns true if DC has support, false if not. 3271 */ 3272 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3273 { 3274 switch (asic_type) { 3275 #ifdef CONFIG_DRM_AMDGPU_SI 3276 case CHIP_HAINAN: 3277 #endif 3278 case CHIP_TOPAZ: 3279 /* chips with no display hardware */ 3280 return false; 3281 #if defined(CONFIG_DRM_AMD_DC) 3282 case CHIP_TAHITI: 3283 case CHIP_PITCAIRN: 3284 case CHIP_VERDE: 3285 case CHIP_OLAND: 3286 /* 3287 * We have systems in the wild with these ASICs that require 3288 * LVDS and VGA support which is not supported with DC. 3289 * 3290 * Fallback to the non-DC driver here by default so as not to 3291 * cause regressions. 3292 */ 3293 #if defined(CONFIG_DRM_AMD_DC_SI) 3294 return amdgpu_dc > 0; 3295 #else 3296 return false; 3297 #endif 3298 case CHIP_BONAIRE: 3299 case CHIP_KAVERI: 3300 case CHIP_KABINI: 3301 case CHIP_MULLINS: 3302 /* 3303 * We have systems in the wild with these ASICs that require 3304 * VGA support which is not supported with DC. 3305 * 3306 * Fallback to the non-DC driver here by default so as not to 3307 * cause regressions. 
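 * (DC can still be requested explicitly on these chips via the amdgpu.dc=1 module parameter.)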
3308 */ 3309 return amdgpu_dc > 0; 3310 default: 3311 return amdgpu_dc != 0; 3312 #else 3313 default: 3314 if (amdgpu_dc > 0) 3315 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3316 return false; 3317 #endif 3318 } 3319 } 3320 3321 /** 3322 * amdgpu_device_has_dc_support - check if dc is supported 3323 * 3324 * @adev: amdgpu_device pointer 3325 * 3326 * Returns true for supported, false for not supported 3327 */ 3328 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3329 { 3330 if (adev->enable_virtual_display || 3331 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3332 return false; 3333 3334 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3335 } 3336 3337 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3338 { 3339 struct amdgpu_device *adev = 3340 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3341 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3342 3343 /* It's a bug to not have a hive within this function */ 3344 if (WARN_ON(!hive)) 3345 return; 3346 3347 /* 3348 * Use task barrier to synchronize all xgmi reset works across the 3349 * hive. task_barrier_enter and task_barrier_exit will block 3350 * until all the threads running the xgmi reset works reach 3351 * those points. task_barrier_full will do both blocks. 3352 */ 3353 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3354 3355 task_barrier_enter(&hive->tb); 3356 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3357 3358 if (adev->asic_reset_res) 3359 goto fail; 3360 3361 task_barrier_exit(&hive->tb); 3362 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3363 3364 if (adev->asic_reset_res) 3365 goto fail; 3366 3367 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3368 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3369 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3370 } else { 3371 3372 task_barrier_full(&hive->tb); 3373 adev->asic_reset_res = amdgpu_asic_reset(adev); 3374 } 3375 3376 fail: 3377 if (adev->asic_reset_res) 3378 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3379 adev->asic_reset_res, adev_to_drm(adev)->unique); 3380 amdgpu_put_xgmi_hive(hive); 3381 } 3382 3383 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3384 { 3385 char *input = amdgpu_lockup_timeout; 3386 char *timeout_setting = NULL; 3387 int index = 0; 3388 long timeout; 3389 int ret = 0; 3390 3391 /* 3392 * By default timeout for non compute jobs is 10000 3393 * and 60000 for compute jobs. 3394 * In SR-IOV or passthrough mode, timeout for compute 3395 * jobs are 60000 by default. 3396 */ 3397 adev->gfx_timeout = msecs_to_jiffies(10000); 3398 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3399 if (amdgpu_sriov_vf(adev)) 3400 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3401 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3402 else 3403 adev->compute_timeout = msecs_to_jiffies(60000); 3404 3405 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3406 while ((timeout_setting = strsep(&input, ",")) && 3407 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3408 ret = kstrtol(timeout_setting, 0, &timeout); 3409 if (ret) 3410 return ret; 3411 3412 if (timeout == 0) { 3413 index++; 3414 continue; 3415 } else if (timeout < 0) { 3416 timeout = MAX_SCHEDULE_TIMEOUT; 3417 dev_warn(adev->dev, "lockup timeout disabled"); 3418 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3419 } else { 3420 timeout = msecs_to_jiffies(timeout); 3421 } 3422 3423 switch (index++) { 3424 case 0: 3425 adev->gfx_timeout = timeout; 3426 break; 3427 case 1: 3428 adev->compute_timeout = timeout; 3429 break; 3430 case 2: 3431 adev->sdma_timeout = timeout; 3432 break; 3433 case 3: 3434 adev->video_timeout = timeout; 3435 break; 3436 default: 3437 break; 3438 } 3439 } 3440 /* 3441 * There is only one value specified and 3442 * it should apply to all non-compute jobs. 3443 */ 3444 if (index == 1) { 3445 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3446 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3447 adev->compute_timeout = adev->gfx_timeout; 3448 } 3449 } 3450 3451 return ret; 3452 } 3453 3454 /** 3455 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3456 * 3457 * @adev: amdgpu_device pointer 3458 * 3459 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3460 */ 3461 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3462 { 3463 struct iommu_domain *domain; 3464 3465 domain = iommu_get_domain_for_dev(adev->dev); 3466 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3467 adev->ram_is_direct_mapped = true; 3468 } 3469 3470 static const struct attribute *amdgpu_dev_attributes[] = { 3471 &dev_attr_pcie_replay_count.attr, 3472 NULL 3473 }; 3474 3475 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 3476 { 3477 if (amdgpu_mcbp == 1) 3478 adev->gfx.mcbp = true; 3479 else if (amdgpu_mcbp == 0) 3480 adev->gfx.mcbp = false; 3481 3482 if (amdgpu_sriov_vf(adev)) 3483 adev->gfx.mcbp = true; 3484 3485 if (adev->gfx.mcbp) 3486 DRM_INFO("MCBP is enabled\n"); 3487 } 3488 3489 /** 3490 * amdgpu_device_init - initialize the driver 3491 * 3492 * @adev: amdgpu_device pointer 3493 * @flags: driver flags 3494 * 3495 * Initializes the driver info and hw (all asics). 3496 * Returns 0 for success or an error on failure. 3497 * Called at driver startup. 
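 * Most of the work is delegated to amdgpu_device_ip_early_init(), amdgpu_device_ip_init() and amdgpu_device_ip_late_init() below.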
3498 */ 3499 int amdgpu_device_init(struct amdgpu_device *adev, 3500 uint32_t flags) 3501 { 3502 struct drm_device *ddev = adev_to_drm(adev); 3503 struct pci_dev *pdev = adev->pdev; 3504 int r, i; 3505 bool px = false; 3506 u32 max_MBps; 3507 int tmp; 3508 3509 adev->shutdown = false; 3510 adev->flags = flags; 3511 3512 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3513 adev->asic_type = amdgpu_force_asic_type; 3514 else 3515 adev->asic_type = flags & AMD_ASIC_MASK; 3516 3517 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3518 if (amdgpu_emu_mode == 1) 3519 adev->usec_timeout *= 10; 3520 adev->gmc.gart_size = 512 * 1024 * 1024; 3521 adev->accel_working = false; 3522 adev->num_rings = 0; 3523 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3524 adev->mman.buffer_funcs = NULL; 3525 adev->mman.buffer_funcs_ring = NULL; 3526 adev->vm_manager.vm_pte_funcs = NULL; 3527 adev->vm_manager.vm_pte_num_scheds = 0; 3528 adev->gmc.gmc_funcs = NULL; 3529 adev->harvest_ip_mask = 0x0; 3530 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3531 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3532 3533 adev->smc_rreg = &amdgpu_invalid_rreg; 3534 adev->smc_wreg = &amdgpu_invalid_wreg; 3535 adev->pcie_rreg = &amdgpu_invalid_rreg; 3536 adev->pcie_wreg = &amdgpu_invalid_wreg; 3537 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 3538 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 3539 adev->pciep_rreg = &amdgpu_invalid_rreg; 3540 adev->pciep_wreg = &amdgpu_invalid_wreg; 3541 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3542 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3543 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3544 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3545 adev->didt_rreg = &amdgpu_invalid_rreg; 3546 adev->didt_wreg = &amdgpu_invalid_wreg; 3547 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3548 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3549 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3550 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3551 3552 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3553 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3554 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3555 3556 /* mutex initialization are all done here so we 3557 * can recall function without having locking issues 3558 */ 3559 mutex_init(&adev->firmware.mutex); 3560 mutex_init(&adev->pm.mutex); 3561 mutex_init(&adev->gfx.gpu_clock_mutex); 3562 mutex_init(&adev->srbm_mutex); 3563 mutex_init(&adev->gfx.pipe_reserve_mutex); 3564 mutex_init(&adev->gfx.gfx_off_mutex); 3565 mutex_init(&adev->gfx.partition_mutex); 3566 mutex_init(&adev->grbm_idx_mutex); 3567 mutex_init(&adev->mn_lock); 3568 mutex_init(&adev->virt.vf_errors.lock); 3569 hash_init(adev->mn_hash); 3570 mutex_init(&adev->psp.mutex); 3571 mutex_init(&adev->notifier_lock); 3572 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3573 mutex_init(&adev->benchmark_mutex); 3574 3575 amdgpu_device_init_apu_flags(adev); 3576 3577 r = amdgpu_device_check_arguments(adev); 3578 if (r) 3579 return r; 3580 3581 spin_lock_init(&adev->mmio_idx_lock); 3582 spin_lock_init(&adev->smc_idx_lock); 3583 spin_lock_init(&adev->pcie_idx_lock); 3584 spin_lock_init(&adev->uvd_ctx_idx_lock); 3585 spin_lock_init(&adev->didt_idx_lock); 3586 spin_lock_init(&adev->gc_cac_idx_lock); 3587 spin_lock_init(&adev->se_cac_idx_lock); 3588 spin_lock_init(&adev->audio_endpt_idx_lock); 3589 spin_lock_init(&adev->mm_stats.lock); 3590 3591 
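/* List heads and deferred work items used throughout the device's lifetime */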
INIT_LIST_HEAD(&adev->shadow_list); 3592 mutex_init(&adev->shadow_list_lock); 3593 3594 INIT_LIST_HEAD(&adev->reset_list); 3595 3596 INIT_LIST_HEAD(&adev->ras_list); 3597 3598 INIT_DELAYED_WORK(&adev->delayed_init_work, 3599 amdgpu_device_delayed_init_work_handler); 3600 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3601 amdgpu_device_delay_enable_gfx_off); 3602 3603 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3604 3605 adev->gfx.gfx_off_req_count = 1; 3606 adev->gfx.gfx_off_residency = 0; 3607 adev->gfx.gfx_off_entrycount = 0; 3608 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3609 3610 atomic_set(&adev->throttling_logging_enabled, 1); 3611 /* 3612 * If throttling continues, logging will be performed every minute 3613 * to avoid log flooding. "-1" is subtracted since the thermal 3614 * throttling interrupt comes every second. Thus, the total logging 3615 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 3616 * for throttling interrupt) = 60 seconds. 3617 */ 3618 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3619 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3620 3621 /* Register mapping */ 3622 /* TODO: block userspace mapping of io register */ 3623 if (adev->asic_type >= CHIP_BONAIRE) { 3624 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3625 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3626 } else { 3627 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3628 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3629 } 3630 3631 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 3632 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 3633 3634 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3635 if (!adev->rmmio) 3636 return -ENOMEM; 3637 3638 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3639 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); 3640 3641 /* 3642 * The reset domain needs to be present early, before the XGMI hive (if any) 3643 * is discovered and initialized, so that the reset semaphore and in_gpu_reset 3644 * flag can be used early during init and before any RREG32 call.
3645 */ 3646 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3647 if (!adev->reset_domain) 3648 return -ENOMEM; 3649 3650 /* detect hw virtualization here */ 3651 amdgpu_detect_virtualization(adev); 3652 3653 amdgpu_device_get_pcie_info(adev); 3654 3655 r = amdgpu_device_get_job_timeout_settings(adev); 3656 if (r) { 3657 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3658 return r; 3659 } 3660 3661 /* early init functions */ 3662 r = amdgpu_device_ip_early_init(adev); 3663 if (r) 3664 return r; 3665 3666 amdgpu_device_set_mcbp(adev); 3667 3668 /* Get rid of things like offb */ 3669 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 3670 if (r) 3671 return r; 3672 3673 /* Enable TMZ based on IP_VERSION */ 3674 amdgpu_gmc_tmz_set(adev); 3675 3676 amdgpu_gmc_noretry_set(adev); 3677 /* Need to get xgmi info early to decide the reset behavior*/ 3678 if (adev->gmc.xgmi.supported) { 3679 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3680 if (r) 3681 return r; 3682 } 3683 3684 /* enable PCIE atomic ops */ 3685 if (amdgpu_sriov_vf(adev)) { 3686 if (adev->virt.fw_reserve.p_pf2vf) 3687 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3688 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3689 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3690 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a 3691 * internal path natively support atomics, set have_atomics_support to true. 3692 */ 3693 } else if ((adev->flags & AMD_IS_APU) && 3694 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) { 3695 adev->have_atomics_support = true; 3696 } else { 3697 adev->have_atomics_support = 3698 !pci_enable_atomic_ops_to_root(adev->pdev, 3699 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3700 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3701 } 3702 3703 if (!adev->have_atomics_support) 3704 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3705 3706 /* doorbell bar mapping and doorbell index init*/ 3707 amdgpu_doorbell_init(adev); 3708 3709 if (amdgpu_emu_mode == 1) { 3710 /* post the asic on emulation mode */ 3711 emu_soc_asic_init(adev); 3712 goto fence_driver_init; 3713 } 3714 3715 amdgpu_reset_init(adev); 3716 3717 /* detect if we are with an SRIOV vbios */ 3718 if (adev->bios) 3719 amdgpu_device_detect_sriov_bios(adev); 3720 3721 /* check if we need to reset the asic 3722 * E.g., driver was not cleanly unloaded previously, etc. 3723 */ 3724 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3725 if (adev->gmc.xgmi.num_physical_nodes) { 3726 dev_info(adev->dev, "Pending hive reset.\n"); 3727 adev->gmc.xgmi.pending_reset = true; 3728 /* Only need to init necessary block for SMU to handle the reset */ 3729 for (i = 0; i < adev->num_ip_blocks; i++) { 3730 if (!adev->ip_blocks[i].status.valid) 3731 continue; 3732 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3733 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3734 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3735 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3736 DRM_DEBUG("IP %s disabled for hw_init.\n", 3737 adev->ip_blocks[i].version->funcs->name); 3738 adev->ip_blocks[i].status.hw = true; 3739 } 3740 } 3741 } else { 3742 tmp = amdgpu_reset_method; 3743 /* It should do a default reset when loading or reloading the driver, 3744 * regardless of the module parameter reset_method. 
3745 */ 3746 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 3747 r = amdgpu_asic_reset(adev); 3748 amdgpu_reset_method = tmp; 3749 if (r) { 3750 dev_err(adev->dev, "asic reset on init failed\n"); 3751 goto failed; 3752 } 3753 } 3754 } 3755 3756 /* Post card if necessary */ 3757 if (amdgpu_device_need_post(adev)) { 3758 if (!adev->bios) { 3759 dev_err(adev->dev, "no vBIOS found\n"); 3760 r = -EINVAL; 3761 goto failed; 3762 } 3763 DRM_INFO("GPU posting now...\n"); 3764 r = amdgpu_device_asic_init(adev); 3765 if (r) { 3766 dev_err(adev->dev, "gpu post error!\n"); 3767 goto failed; 3768 } 3769 } 3770 3771 if (adev->bios) { 3772 if (adev->is_atom_fw) { 3773 /* Initialize clocks */ 3774 r = amdgpu_atomfirmware_get_clock_info(adev); 3775 if (r) { 3776 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3777 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3778 goto failed; 3779 } 3780 } else { 3781 /* Initialize clocks */ 3782 r = amdgpu_atombios_get_clock_info(adev); 3783 if (r) { 3784 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3785 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3786 goto failed; 3787 } 3788 /* init i2c buses */ 3789 if (!amdgpu_device_has_dc_support(adev)) 3790 amdgpu_atombios_i2c_init(adev); 3791 } 3792 } 3793 3794 fence_driver_init: 3795 /* Fence driver */ 3796 r = amdgpu_fence_driver_sw_init(adev); 3797 if (r) { 3798 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3799 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3800 goto failed; 3801 } 3802 3803 /* init the mode config */ 3804 drm_mode_config_init(adev_to_drm(adev)); 3805 3806 r = amdgpu_device_ip_init(adev); 3807 if (r) { 3808 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3809 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3810 goto release_ras_con; 3811 } 3812 3813 amdgpu_fence_driver_hw_init(adev); 3814 3815 dev_info(adev->dev, 3816 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3817 adev->gfx.config.max_shader_engines, 3818 adev->gfx.config.max_sh_per_se, 3819 adev->gfx.config.max_cu_per_sh, 3820 adev->gfx.cu_info.number); 3821 3822 adev->accel_working = true; 3823 3824 amdgpu_vm_check_compute_bug(adev); 3825 3826 /* Initialize the buffer migration limit. */ 3827 if (amdgpu_moverate >= 0) 3828 max_MBps = amdgpu_moverate; 3829 else 3830 max_MBps = 8; /* Allow 8 MB/s. */ 3831 /* Get a log2 for easy divisions. */ 3832 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3833 3834 r = amdgpu_atombios_sysfs_init(adev); 3835 if (r) 3836 drm_err(&adev->ddev, 3837 "registering atombios sysfs failed (%d).\n", r); 3838 3839 r = amdgpu_pm_sysfs_init(adev); 3840 if (r) 3841 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 3842 3843 r = amdgpu_ucode_sysfs_init(adev); 3844 if (r) { 3845 adev->ucode_sysfs_en = false; 3846 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3847 } else 3848 adev->ucode_sysfs_en = true; 3849 3850 /* 3851 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3852 * Otherwise the mgpu fan boost feature will be skipped due to the 3853 * gpu instance is counted less. 3854 */ 3855 amdgpu_register_gpu_instance(adev); 3856 3857 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3858 * explicit gating rather than handling it automatically. 
3859 */ 3860 if (!adev->gmc.xgmi.pending_reset) { 3861 r = amdgpu_device_ip_late_init(adev); 3862 if (r) { 3863 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3864 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3865 goto release_ras_con; 3866 } 3867 /* must succeed. */ 3868 amdgpu_ras_resume(adev); 3869 queue_delayed_work(system_wq, &adev->delayed_init_work, 3870 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3871 } 3872 3873 if (amdgpu_sriov_vf(adev)) { 3874 amdgpu_virt_release_full_gpu(adev, true); 3875 flush_delayed_work(&adev->delayed_init_work); 3876 } 3877 3878 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3879 if (r) 3880 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3881 3882 amdgpu_fru_sysfs_init(adev); 3883 3884 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3885 r = amdgpu_pmu_init(adev); 3886 if (r) 3887 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3888 3889 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3890 if (amdgpu_device_cache_pci_state(adev->pdev)) 3891 pci_restore_state(pdev); 3892 3893 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3894 /* this will fail for cards that aren't VGA class devices, just 3895 * ignore it 3896 */ 3897 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3898 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 3899 3900 px = amdgpu_device_supports_px(ddev); 3901 3902 if (px || (!dev_is_removable(&adev->pdev->dev) && 3903 apple_gmux_detect(NULL, NULL))) 3904 vga_switcheroo_register_client(adev->pdev, 3905 &amdgpu_switcheroo_ops, px); 3906 3907 if (px) 3908 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3909 3910 if (adev->gmc.xgmi.pending_reset) 3911 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3912 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3913 3914 amdgpu_device_check_iommu_direct_map(adev); 3915 3916 return 0; 3917 3918 release_ras_con: 3919 if (amdgpu_sriov_vf(adev)) 3920 amdgpu_virt_release_full_gpu(adev, true); 3921 3922 /* failed in exclusive mode due to timeout */ 3923 if (amdgpu_sriov_vf(adev) && 3924 !amdgpu_sriov_runtime(adev) && 3925 amdgpu_virt_mmio_blocked(adev) && 3926 !amdgpu_virt_wait_reset(adev)) { 3927 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3928 /* Don't send request since VF is inactive. */ 3929 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3930 adev->virt.ops = NULL; 3931 r = -EAGAIN; 3932 } 3933 amdgpu_release_ras_context(adev); 3934 3935 failed: 3936 amdgpu_vf_error_trans_all(adev); 3937 3938 return r; 3939 } 3940 3941 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 3942 { 3943 3944 /* Clear all CPU mappings pointing to this device */ 3945 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 3946 3947 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 3948 amdgpu_doorbell_fini(adev); 3949 3950 iounmap(adev->rmmio); 3951 adev->rmmio = NULL; 3952 if (adev->mman.aper_base_kaddr) 3953 iounmap(adev->mman.aper_base_kaddr); 3954 adev->mman.aper_base_kaddr = NULL; 3955 3956 /* Memory manager related */ 3957 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 3958 arch_phys_wc_del(adev->gmc.vram_mtrr); 3959 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 3960 } 3961 } 3962 3963 /** 3964 * amdgpu_device_fini_hw - tear down the driver 3965 * 3966 * @adev: amdgpu_device pointer 3967 * 3968 * Tear down the driver info (all asics). 3969 * Called at driver shutdown. 
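 *
 * Teardown is split in two stages: this function quiesces the hardware
 * while the device may still be accessible, and amdgpu_device_fini_sw()
 * below releases the remaining software state. A sketch of the expected
 * ordering at unload (the exact call sites live in the driver's
 * unload/release paths and are assumed here):
 *
 *   amdgpu_device_fini_hw(adev);
 *   amdgpu_device_fini_sw(adev);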
3970 */ 3971 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 3972 { 3973 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3974 flush_delayed_work(&adev->delayed_init_work); 3975 adev->shutdown = true; 3976 3977 /* make sure IB test finished before entering exclusive mode 3978 * to avoid preemption on IB test 3979 */ 3980 if (amdgpu_sriov_vf(adev)) { 3981 amdgpu_virt_request_full_gpu(adev, false); 3982 amdgpu_virt_fini_data_exchange(adev); 3983 } 3984 3985 /* disable all interrupts */ 3986 amdgpu_irq_disable_all(adev); 3987 if (adev->mode_info.mode_config_initialized) { 3988 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 3989 drm_helper_force_disable_all(adev_to_drm(adev)); 3990 else 3991 drm_atomic_helper_shutdown(adev_to_drm(adev)); 3992 } 3993 amdgpu_fence_driver_hw_fini(adev); 3994 3995 if (adev->mman.initialized) 3996 drain_workqueue(adev->mman.bdev.wq); 3997 3998 if (adev->pm.sysfs_initialized) 3999 amdgpu_pm_sysfs_fini(adev); 4000 if (adev->ucode_sysfs_en) 4001 amdgpu_ucode_sysfs_fini(adev); 4002 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4003 amdgpu_fru_sysfs_fini(adev); 4004 4005 /* disable ras feature must before hw fini */ 4006 amdgpu_ras_pre_fini(adev); 4007 4008 amdgpu_device_ip_fini_early(adev); 4009 4010 amdgpu_irq_fini_hw(adev); 4011 4012 if (adev->mman.initialized) 4013 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4014 4015 amdgpu_gart_dummy_page_fini(adev); 4016 4017 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4018 amdgpu_device_unmap_mmio(adev); 4019 4020 } 4021 4022 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4023 { 4024 int idx; 4025 bool px; 4026 4027 amdgpu_fence_driver_sw_fini(adev); 4028 amdgpu_device_ip_fini(adev); 4029 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4030 adev->accel_working = false; 4031 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4032 4033 amdgpu_reset_fini(adev); 4034 4035 /* free i2c buses */ 4036 if (!amdgpu_device_has_dc_support(adev)) 4037 amdgpu_i2c_fini(adev); 4038 4039 if (amdgpu_emu_mode != 1) 4040 amdgpu_atombios_fini(adev); 4041 4042 kfree(adev->bios); 4043 adev->bios = NULL; 4044 4045 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4046 4047 if (px || (!dev_is_removable(&adev->pdev->dev) && 4048 apple_gmux_detect(NULL, NULL))) 4049 vga_switcheroo_unregister_client(adev->pdev); 4050 4051 if (px) 4052 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4053 4054 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4055 vga_client_unregister(adev->pdev); 4056 4057 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4058 4059 iounmap(adev->rmmio); 4060 adev->rmmio = NULL; 4061 amdgpu_doorbell_fini(adev); 4062 drm_dev_exit(idx); 4063 } 4064 4065 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4066 amdgpu_pmu_fini(adev); 4067 if (adev->mman.discovery_bin) 4068 amdgpu_discovery_fini(adev); 4069 4070 amdgpu_reset_put_reset_domain(adev->reset_domain); 4071 adev->reset_domain = NULL; 4072 4073 kfree(adev->pci_state); 4074 4075 } 4076 4077 /** 4078 * amdgpu_device_evict_resources - evict device resources 4079 * @adev: amdgpu device object 4080 * 4081 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4082 * of the vram memory type. Mainly used for evicting device resources 4083 * at suspend time. 
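 *
 * In amdgpu_device_suspend() below this helper is invoked twice: once
 * early to migrate the majority of buffers, and once more after phase 1
 * suspend to pick up anything that only became idle in between. Roughly:
 *
 *   amdgpu_device_evict_resources(adev);         (bulk eviction)
 *   amdgpu_device_ip_suspend_phase1(adev);
 *   amdgpu_device_evict_resources(adev);         (second pass)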
4084 * 4085 */ 4086 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4087 { 4088 int ret; 4089 4090 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4091 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4092 return 0; 4093 4094 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4095 if (ret) 4096 DRM_WARN("evicting device resources failed\n"); 4097 return ret; 4098 } 4099 4100 /* 4101 * Suspend & resume. 4102 */ 4103 /** 4104 * amdgpu_device_suspend - initiate device suspend 4105 * 4106 * @dev: drm dev pointer 4107 * @fbcon : notify the fbdev of suspend 4108 * 4109 * Puts the hw in the suspend state (all asics). 4110 * Returns 0 for success or an error on failure. 4111 * Called at driver suspend. 4112 */ 4113 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4114 { 4115 struct amdgpu_device *adev = drm_to_adev(dev); 4116 int r = 0; 4117 4118 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4119 return 0; 4120 4121 adev->in_suspend = true; 4122 4123 /* Evict the majority of BOs before grabbing the full access */ 4124 r = amdgpu_device_evict_resources(adev); 4125 if (r) 4126 return r; 4127 4128 if (amdgpu_sriov_vf(adev)) { 4129 amdgpu_virt_fini_data_exchange(adev); 4130 r = amdgpu_virt_request_full_gpu(adev, false); 4131 if (r) 4132 return r; 4133 } 4134 4135 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4136 DRM_WARN("smart shift update failed\n"); 4137 4138 if (fbcon) 4139 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4140 4141 cancel_delayed_work_sync(&adev->delayed_init_work); 4142 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4143 4144 amdgpu_ras_suspend(adev); 4145 4146 amdgpu_device_ip_suspend_phase1(adev); 4147 4148 if (!adev->in_s0ix) 4149 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4150 4151 r = amdgpu_device_evict_resources(adev); 4152 if (r) 4153 return r; 4154 4155 amdgpu_fence_driver_hw_fini(adev); 4156 4157 amdgpu_device_ip_suspend_phase2(adev); 4158 4159 if (amdgpu_sriov_vf(adev)) 4160 amdgpu_virt_release_full_gpu(adev, false); 4161 4162 return 0; 4163 } 4164 4165 /** 4166 * amdgpu_device_resume - initiate device resume 4167 * 4168 * @dev: drm dev pointer 4169 * @fbcon : notify the fbdev of resume 4170 * 4171 * Bring the hw back to operating state (all asics). 4172 * Returns 0 for success or an error on failure. 4173 * Called at driver resume. 
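 *
 * Together with amdgpu_device_suspend() above, this is the entry point
 * the PM glue is expected to call. A minimal sketch of a system-resume
 * callback (example_pm_resume is hypothetical; the real wiring lives in
 * the driver's dev_pm_ops and also handles runtime-PM and s0ix cases):
 *
 *   static int example_pm_resume(struct device *dev)
 *   {
 *       struct drm_device *drm_dev = dev_get_drvdata(dev);
 *
 *       return amdgpu_device_resume(drm_dev, true);
 *   }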
4174 */ 4175 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4176 { 4177 struct amdgpu_device *adev = drm_to_adev(dev); 4178 int r = 0; 4179 4180 if (amdgpu_sriov_vf(adev)) { 4181 r = amdgpu_virt_request_full_gpu(adev, true); 4182 if (r) 4183 return r; 4184 } 4185 4186 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4187 return 0; 4188 4189 if (adev->in_s0ix) 4190 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4191 4192 /* post card */ 4193 if (amdgpu_device_need_post(adev)) { 4194 r = amdgpu_device_asic_init(adev); 4195 if (r) 4196 dev_err(adev->dev, "amdgpu asic init failed\n"); 4197 } 4198 4199 r = amdgpu_device_ip_resume(adev); 4200 4201 if (r) { 4202 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4203 goto exit; 4204 } 4205 amdgpu_fence_driver_hw_init(adev); 4206 4207 r = amdgpu_device_ip_late_init(adev); 4208 if (r) 4209 goto exit; 4210 4211 queue_delayed_work(system_wq, &adev->delayed_init_work, 4212 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4213 4214 if (!adev->in_s0ix) { 4215 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4216 if (r) 4217 goto exit; 4218 } 4219 4220 exit: 4221 if (amdgpu_sriov_vf(adev)) { 4222 amdgpu_virt_init_data_exchange(adev); 4223 amdgpu_virt_release_full_gpu(adev, true); 4224 } 4225 4226 if (r) 4227 return r; 4228 4229 /* Make sure IB tests flushed */ 4230 flush_delayed_work(&adev->delayed_init_work); 4231 4232 if (fbcon) 4233 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4234 4235 amdgpu_ras_resume(adev); 4236 4237 if (adev->mode_info.num_crtc) { 4238 /* 4239 * Most of the connector probing functions try to acquire runtime pm 4240 * refs to ensure that the GPU is powered on when connector polling is 4241 * performed. Since we're calling this from a runtime PM callback, 4242 * trying to acquire rpm refs will cause us to deadlock. 4243 * 4244 * Since we're guaranteed to be holding the rpm lock, it's safe to 4245 * temporarily disable the rpm helpers so this doesn't deadlock us. 4246 */ 4247 #ifdef CONFIG_PM 4248 dev->dev->power.disable_depth++; 4249 #endif 4250 if (!adev->dc_enabled) 4251 drm_helper_hpd_irq_event(dev); 4252 else 4253 drm_kms_helper_hotplug_event(dev); 4254 #ifdef CONFIG_PM 4255 dev->dev->power.disable_depth--; 4256 #endif 4257 } 4258 adev->in_suspend = false; 4259 4260 if (adev->enable_mes) 4261 amdgpu_mes_self_test(adev); 4262 4263 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4264 DRM_WARN("smart shift update failed\n"); 4265 4266 return 0; 4267 } 4268 4269 /** 4270 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4271 * 4272 * @adev: amdgpu_device pointer 4273 * 4274 * The list of all the hardware IPs that make up the asic is walked and 4275 * the check_soft_reset callbacks are run. check_soft_reset determines 4276 * if the asic is still hung or not. 4277 * Returns true if any of the IPs are still in a hung state, false if not. 
4278 */ 4279 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4280 { 4281 int i; 4282 bool asic_hang = false; 4283 4284 if (amdgpu_sriov_vf(adev)) 4285 return true; 4286 4287 if (amdgpu_asic_need_full_reset(adev)) 4288 return true; 4289 4290 for (i = 0; i < adev->num_ip_blocks; i++) { 4291 if (!adev->ip_blocks[i].status.valid) 4292 continue; 4293 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4294 adev->ip_blocks[i].status.hang = 4295 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4296 if (adev->ip_blocks[i].status.hang) { 4297 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4298 asic_hang = true; 4299 } 4300 } 4301 return asic_hang; 4302 } 4303 4304 /** 4305 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4306 * 4307 * @adev: amdgpu_device pointer 4308 * 4309 * The list of all the hardware IPs that make up the asic is walked and the 4310 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4311 * handles any IP specific hardware or software state changes that are 4312 * necessary for a soft reset to succeed. 4313 * Returns 0 on success, negative error code on failure. 4314 */ 4315 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4316 { 4317 int i, r = 0; 4318 4319 for (i = 0; i < adev->num_ip_blocks; i++) { 4320 if (!adev->ip_blocks[i].status.valid) 4321 continue; 4322 if (adev->ip_blocks[i].status.hang && 4323 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4324 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4325 if (r) 4326 return r; 4327 } 4328 } 4329 4330 return 0; 4331 } 4332 4333 /** 4334 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4335 * 4336 * @adev: amdgpu_device pointer 4337 * 4338 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4339 * reset is necessary to recover. 4340 * Returns true if a full asic reset is required, false if not. 4341 */ 4342 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4343 { 4344 int i; 4345 4346 if (amdgpu_asic_need_full_reset(adev)) 4347 return true; 4348 4349 for (i = 0; i < adev->num_ip_blocks; i++) { 4350 if (!adev->ip_blocks[i].status.valid) 4351 continue; 4352 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4353 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4354 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4355 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4356 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4357 if (adev->ip_blocks[i].status.hang) { 4358 dev_info(adev->dev, "Some block need full reset!\n"); 4359 return true; 4360 } 4361 } 4362 } 4363 return false; 4364 } 4365 4366 /** 4367 * amdgpu_device_ip_soft_reset - do a soft reset 4368 * 4369 * @adev: amdgpu_device pointer 4370 * 4371 * The list of all the hardware IPs that make up the asic is walked and the 4372 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4373 * IP specific hardware or software state changes that are necessary to soft 4374 * reset the IP. 4375 * Returns 0 on success, negative error code on failure. 
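 *
 * The four soft-reset helpers are intended to be used as a sequence; the
 * recovery path later in this file (amdgpu_device_pre_asic_reset) does
 * roughly the following before falling back to a full ASIC reset if any
 * block is still hung afterwards:
 *
 *   if (amdgpu_device_ip_check_soft_reset(adev)) {
 *       amdgpu_device_ip_pre_soft_reset(adev);
 *       amdgpu_device_ip_soft_reset(adev);
 *       amdgpu_device_ip_post_soft_reset(adev);
 *   }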
4376 */ 4377 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4378 { 4379 int i, r = 0; 4380 4381 for (i = 0; i < adev->num_ip_blocks; i++) { 4382 if (!adev->ip_blocks[i].status.valid) 4383 continue; 4384 if (adev->ip_blocks[i].status.hang && 4385 adev->ip_blocks[i].version->funcs->soft_reset) { 4386 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4387 if (r) 4388 return r; 4389 } 4390 } 4391 4392 return 0; 4393 } 4394 4395 /** 4396 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4397 * 4398 * @adev: amdgpu_device pointer 4399 * 4400 * The list of all the hardware IPs that make up the asic is walked and the 4401 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4402 * handles any IP specific hardware or software state changes that are 4403 * necessary after the IP has been soft reset. 4404 * Returns 0 on success, negative error code on failure. 4405 */ 4406 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4407 { 4408 int i, r = 0; 4409 4410 for (i = 0; i < adev->num_ip_blocks; i++) { 4411 if (!adev->ip_blocks[i].status.valid) 4412 continue; 4413 if (adev->ip_blocks[i].status.hang && 4414 adev->ip_blocks[i].version->funcs->post_soft_reset) 4415 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4416 if (r) 4417 return r; 4418 } 4419 4420 return 0; 4421 } 4422 4423 /** 4424 * amdgpu_device_recover_vram - Recover some VRAM contents 4425 * 4426 * @adev: amdgpu_device pointer 4427 * 4428 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4429 * restore things like GPUVM page tables after a GPU reset where 4430 * the contents of VRAM might be lost. 4431 * 4432 * Returns: 4433 * 0 on success, negative error code on failure. 4434 */ 4435 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4436 { 4437 struct dma_fence *fence = NULL, *next = NULL; 4438 struct amdgpu_bo *shadow; 4439 struct amdgpu_bo_vm *vmbo; 4440 long r = 1, tmo; 4441 4442 if (amdgpu_sriov_runtime(adev)) 4443 tmo = msecs_to_jiffies(8000); 4444 else 4445 tmo = msecs_to_jiffies(100); 4446 4447 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4448 mutex_lock(&adev->shadow_list_lock); 4449 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4450 /* If vm is compute context or adev is APU, shadow will be NULL */ 4451 if (!vmbo->shadow) 4452 continue; 4453 shadow = vmbo->shadow; 4454 4455 /* No need to recover an evicted BO */ 4456 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4457 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4458 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4459 continue; 4460 4461 r = amdgpu_bo_restore_shadow(shadow, &next); 4462 if (r) 4463 break; 4464 4465 if (fence) { 4466 tmo = dma_fence_wait_timeout(fence, false, tmo); 4467 dma_fence_put(fence); 4468 fence = next; 4469 if (tmo == 0) { 4470 r = -ETIMEDOUT; 4471 break; 4472 } else if (tmo < 0) { 4473 r = tmo; 4474 break; 4475 } 4476 } else { 4477 fence = next; 4478 } 4479 } 4480 mutex_unlock(&adev->shadow_list_lock); 4481 4482 if (fence) 4483 tmo = dma_fence_wait_timeout(fence, false, tmo); 4484 dma_fence_put(fence); 4485 4486 if (r < 0 || tmo <= 0) { 4487 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4488 return -EIO; 4489 } 4490 4491 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4492 return 0; 4493 } 4494 4495 4496 /** 4497 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4498 * 4499 * @adev: amdgpu_device pointer 4500 * 
@from_hypervisor: request from hypervisor 4501 * 4502 * do VF FLR and reinitialize Asic 4503 * return 0 means succeeded otherwise failed 4504 */ 4505 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4506 bool from_hypervisor) 4507 { 4508 int r; 4509 struct amdgpu_hive_info *hive = NULL; 4510 int retry_limit = 0; 4511 4512 retry: 4513 amdgpu_amdkfd_pre_reset(adev); 4514 4515 if (from_hypervisor) 4516 r = amdgpu_virt_request_full_gpu(adev, true); 4517 else 4518 r = amdgpu_virt_reset_gpu(adev); 4519 if (r) 4520 return r; 4521 amdgpu_irq_gpu_reset_resume_helper(adev); 4522 4523 /* some sw clean up VF needs to do before recover */ 4524 amdgpu_virt_post_reset(adev); 4525 4526 /* Resume IP prior to SMC */ 4527 r = amdgpu_device_ip_reinit_early_sriov(adev); 4528 if (r) 4529 goto error; 4530 4531 amdgpu_virt_init_data_exchange(adev); 4532 4533 r = amdgpu_device_fw_loading(adev); 4534 if (r) 4535 return r; 4536 4537 /* now we are okay to resume SMC/CP/SDMA */ 4538 r = amdgpu_device_ip_reinit_late_sriov(adev); 4539 if (r) 4540 goto error; 4541 4542 hive = amdgpu_get_xgmi_hive(adev); 4543 /* Update PSP FW topology after reset */ 4544 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4545 r = amdgpu_xgmi_update_topology(hive, adev); 4546 4547 if (hive) 4548 amdgpu_put_xgmi_hive(hive); 4549 4550 if (!r) { 4551 r = amdgpu_ib_ring_tests(adev); 4552 4553 amdgpu_amdkfd_post_reset(adev); 4554 } 4555 4556 error: 4557 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4558 amdgpu_inc_vram_lost(adev); 4559 r = amdgpu_device_recover_vram(adev); 4560 } 4561 amdgpu_virt_release_full_gpu(adev, true); 4562 4563 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4564 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4565 retry_limit++; 4566 goto retry; 4567 } else 4568 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4569 } 4570 4571 return r; 4572 } 4573 4574 /** 4575 * amdgpu_device_has_job_running - check if there is any job in mirror list 4576 * 4577 * @adev: amdgpu_device pointer 4578 * 4579 * check if there is any job in mirror list 4580 */ 4581 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4582 { 4583 int i; 4584 struct drm_sched_job *job; 4585 4586 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4587 struct amdgpu_ring *ring = adev->rings[i]; 4588 4589 if (!ring || !ring->sched.thread) 4590 continue; 4591 4592 spin_lock(&ring->sched.job_list_lock); 4593 job = list_first_entry_or_null(&ring->sched.pending_list, 4594 struct drm_sched_job, list); 4595 spin_unlock(&ring->sched.job_list_lock); 4596 if (job) 4597 return true; 4598 } 4599 return false; 4600 } 4601 4602 /** 4603 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4604 * 4605 * @adev: amdgpu_device pointer 4606 * 4607 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4608 * a hung GPU. 
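 *
 * The decision is driven by the amdgpu_gpu_recovery module parameter:
 * 0 disables recovery outright, a positive value forces it on, and -1
 * (the auto/default setting) enables it except on the older ASICs listed
 * in the switch below. Unless recovery is disabled outright, SR-IOV VFs
 * and configurations without RAS poison-mode support always attempt it.
 * For debugging, recovery can be switched off with, for example:
 *
 *   modprobe amdgpu gpu_recovery=0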
4609 */ 4610 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4611 { 4612 4613 if (amdgpu_gpu_recovery == 0) 4614 goto disabled; 4615 4616 /* Skip soft reset check in fatal error mode */ 4617 if (!amdgpu_ras_is_poison_mode_supported(adev)) 4618 return true; 4619 4620 if (amdgpu_sriov_vf(adev)) 4621 return true; 4622 4623 if (amdgpu_gpu_recovery == -1) { 4624 switch (adev->asic_type) { 4625 #ifdef CONFIG_DRM_AMDGPU_SI 4626 case CHIP_VERDE: 4627 case CHIP_TAHITI: 4628 case CHIP_PITCAIRN: 4629 case CHIP_OLAND: 4630 case CHIP_HAINAN: 4631 #endif 4632 #ifdef CONFIG_DRM_AMDGPU_CIK 4633 case CHIP_KAVERI: 4634 case CHIP_KABINI: 4635 case CHIP_MULLINS: 4636 #endif 4637 case CHIP_CARRIZO: 4638 case CHIP_STONEY: 4639 case CHIP_CYAN_SKILLFISH: 4640 goto disabled; 4641 default: 4642 break; 4643 } 4644 } 4645 4646 return true; 4647 4648 disabled: 4649 dev_info(adev->dev, "GPU recovery disabled.\n"); 4650 return false; 4651 } 4652 4653 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4654 { 4655 u32 i; 4656 int ret = 0; 4657 4658 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4659 4660 dev_info(adev->dev, "GPU mode1 reset\n"); 4661 4662 /* disable BM */ 4663 pci_clear_master(adev->pdev); 4664 4665 amdgpu_device_cache_pci_state(adev->pdev); 4666 4667 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4668 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4669 ret = amdgpu_dpm_mode1_reset(adev); 4670 } else { 4671 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4672 ret = psp_gpu_reset(adev); 4673 } 4674 4675 if (ret) 4676 goto mode1_reset_failed; 4677 4678 amdgpu_device_load_pci_state(adev->pdev); 4679 ret = amdgpu_psp_wait_for_bootloader(adev); 4680 if (ret) 4681 goto mode1_reset_failed; 4682 4683 /* wait for asic to come out of reset */ 4684 for (i = 0; i < adev->usec_timeout; i++) { 4685 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4686 4687 if (memsize != 0xffffffff) 4688 break; 4689 udelay(1); 4690 } 4691 4692 if (i >= adev->usec_timeout) { 4693 ret = -ETIMEDOUT; 4694 goto mode1_reset_failed; 4695 } 4696 4697 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4698 4699 return 0; 4700 4701 mode1_reset_failed: 4702 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4703 return ret; 4704 } 4705 4706 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4707 struct amdgpu_reset_context *reset_context) 4708 { 4709 int i, r = 0; 4710 struct amdgpu_job *job = NULL; 4711 bool need_full_reset = 4712 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4713 4714 if (reset_context->reset_req_dev == adev) 4715 job = reset_context->job; 4716 4717 if (amdgpu_sriov_vf(adev)) { 4718 /* stop the data exchange thread */ 4719 amdgpu_virt_fini_data_exchange(adev); 4720 } 4721 4722 amdgpu_fence_driver_isr_toggle(adev, true); 4723 4724 /* block all schedulers and reset given job's ring */ 4725 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4726 struct amdgpu_ring *ring = adev->rings[i]; 4727 4728 if (!ring || !ring->sched.thread) 4729 continue; 4730 4731 /* Clear job fence from fence drv to avoid force_completion 4732 * leave NULL and vm flush fence in fence drv 4733 */ 4734 amdgpu_fence_driver_clear_job_fences(ring); 4735 4736 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4737 amdgpu_fence_driver_force_completion(ring); 4738 } 4739 4740 amdgpu_fence_driver_isr_toggle(adev, false); 4741 4742 if (job && job->vm) 4743 drm_sched_increase_karma(&job->base); 4744 4745 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4746 /* If reset handler not 
implemented, continue; otherwise return */ 4747 if (r == -EOPNOTSUPP) 4748 r = 0; 4749 else 4750 return r; 4751 4752 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4753 if (!amdgpu_sriov_vf(adev)) { 4754 4755 if (!need_full_reset) 4756 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4757 4758 if (!need_full_reset && amdgpu_gpu_recovery && 4759 amdgpu_device_ip_check_soft_reset(adev)) { 4760 amdgpu_device_ip_pre_soft_reset(adev); 4761 r = amdgpu_device_ip_soft_reset(adev); 4762 amdgpu_device_ip_post_soft_reset(adev); 4763 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4764 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4765 need_full_reset = true; 4766 } 4767 } 4768 4769 if (need_full_reset) 4770 r = amdgpu_device_ip_suspend(adev); 4771 if (need_full_reset) 4772 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4773 else 4774 clear_bit(AMDGPU_NEED_FULL_RESET, 4775 &reset_context->flags); 4776 } 4777 4778 return r; 4779 } 4780 4781 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 4782 { 4783 int i; 4784 4785 lockdep_assert_held(&adev->reset_domain->sem); 4786 4787 for (i = 0; i < adev->num_regs; i++) { 4788 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); 4789 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], 4790 adev->reset_dump_reg_value[i]); 4791 } 4792 4793 return 0; 4794 } 4795 4796 #ifdef CONFIG_DEV_COREDUMP 4797 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, 4798 size_t count, void *data, size_t datalen) 4799 { 4800 struct drm_printer p; 4801 struct amdgpu_device *adev = data; 4802 struct drm_print_iterator iter; 4803 int i; 4804 4805 iter.data = buffer; 4806 iter.offset = 0; 4807 iter.start = offset; 4808 iter.remain = count; 4809 4810 p = drm_coredump_printer(&iter); 4811 4812 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 4813 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 4814 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 4815 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec); 4816 if (adev->reset_task_info.pid) 4817 drm_printf(&p, "process_name: %s PID: %d\n", 4818 adev->reset_task_info.process_name, 4819 adev->reset_task_info.pid); 4820 4821 if (adev->reset_vram_lost) 4822 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 4823 if (adev->num_regs) { 4824 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); 4825 4826 for (i = 0; i < adev->num_regs; i++) 4827 drm_printf(&p, "0x%08x: 0x%08x\n", 4828 adev->reset_dump_reg_list[i], 4829 adev->reset_dump_reg_value[i]); 4830 } 4831 4832 return count - iter.remain; 4833 } 4834 4835 static void amdgpu_devcoredump_free(void *data) 4836 { 4837 } 4838 4839 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) 4840 { 4841 struct drm_device *dev = adev_to_drm(adev); 4842 4843 ktime_get_ts64(&adev->reset_time); 4844 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_NOWAIT, 4845 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 4846 } 4847 #endif 4848 4849 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4850 struct amdgpu_reset_context *reset_context) 4851 { 4852 struct amdgpu_device *tmp_adev = NULL; 4853 bool need_full_reset, skip_hw_reset, vram_lost = false; 4854 int r = 0; 4855 bool gpu_reset_for_dev_remove = 0; 4856 4857 /* Try reset handler method first */ 4858 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4859 reset_list); 4860 amdgpu_reset_reg_dumps(tmp_adev); 4861 4862 
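	/*
	 * The register values captured by amdgpu_reset_reg_dumps() above are
	 * the ones later emitted through the devcoredump read callback when
	 * CONFIG_DEV_COREDUMP is enabled, so they must be snapshotted before
	 * any reset method gets a chance to clobber them.
	 */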
reset_context->reset_device_list = device_list_handle; 4863 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4864 /* If reset handler not implemented, continue; otherwise return */ 4865 if (r == -EOPNOTSUPP) 4866 r = 0; 4867 else 4868 return r; 4869 4870 /* Reset handler not implemented, use the default method */ 4871 need_full_reset = 4872 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4873 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4874 4875 gpu_reset_for_dev_remove = 4876 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 4877 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4878 4879 /* 4880 * ASIC reset has to be done on all XGMI hive nodes ASAP 4881 * to allow proper links negotiation in FW (within 1 sec) 4882 */ 4883 if (!skip_hw_reset && need_full_reset) { 4884 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4885 /* For XGMI run all resets in parallel to speed up the process */ 4886 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4887 tmp_adev->gmc.xgmi.pending_reset = false; 4888 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4889 r = -EALREADY; 4890 } else 4891 r = amdgpu_asic_reset(tmp_adev); 4892 4893 if (r) { 4894 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4895 r, adev_to_drm(tmp_adev)->unique); 4896 break; 4897 } 4898 } 4899 4900 /* For XGMI wait for all resets to complete before proceed */ 4901 if (!r) { 4902 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4903 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4904 flush_work(&tmp_adev->xgmi_reset_work); 4905 r = tmp_adev->asic_reset_res; 4906 if (r) 4907 break; 4908 } 4909 } 4910 } 4911 } 4912 4913 if (!r && amdgpu_ras_intr_triggered()) { 4914 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4915 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && 4916 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 4917 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 4918 } 4919 4920 amdgpu_ras_intr_cleared(); 4921 } 4922 4923 /* Since the mode1 reset affects base ip blocks, the 4924 * phase1 ip blocks need to be resumed. Otherwise there 4925 * will be a BIOS signature error and the psp bootloader 4926 * can't load kdb on the next amdgpu install. 
4927 */ 4928 if (gpu_reset_for_dev_remove) { 4929 list_for_each_entry(tmp_adev, device_list_handle, reset_list) 4930 amdgpu_device_ip_resume_phase1(tmp_adev); 4931 4932 goto end; 4933 } 4934 4935 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4936 if (need_full_reset) { 4937 /* post card */ 4938 r = amdgpu_device_asic_init(tmp_adev); 4939 if (r) { 4940 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4941 } else { 4942 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4943 4944 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4945 if (r) 4946 goto out; 4947 4948 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4949 #ifdef CONFIG_DEV_COREDUMP 4950 tmp_adev->reset_vram_lost = vram_lost; 4951 memset(&tmp_adev->reset_task_info, 0, 4952 sizeof(tmp_adev->reset_task_info)); 4953 if (reset_context->job && reset_context->job->vm) 4954 tmp_adev->reset_task_info = 4955 reset_context->job->vm->task_info; 4956 amdgpu_reset_capture_coredumpm(tmp_adev); 4957 #endif 4958 if (vram_lost) { 4959 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4960 amdgpu_inc_vram_lost(tmp_adev); 4961 } 4962 4963 r = amdgpu_device_fw_loading(tmp_adev); 4964 if (r) 4965 return r; 4966 4967 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4968 if (r) 4969 goto out; 4970 4971 if (vram_lost) 4972 amdgpu_device_fill_reset_magic(tmp_adev); 4973 4974 /* 4975 * Add this ASIC as tracked as reset was already 4976 * complete successfully. 4977 */ 4978 amdgpu_register_gpu_instance(tmp_adev); 4979 4980 if (!reset_context->hive && 4981 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4982 amdgpu_xgmi_add_device(tmp_adev); 4983 4984 r = amdgpu_device_ip_late_init(tmp_adev); 4985 if (r) 4986 goto out; 4987 4988 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 4989 4990 /* 4991 * The GPU enters bad state once faulty pages 4992 * by ECC has reached the threshold, and ras 4993 * recovery is scheduled next. So add one check 4994 * here to break recovery if it indeed exceeds 4995 * bad page threshold, and remind user to 4996 * retire this GPU or setting one bigger 4997 * bad_page_threshold value to fix this once 4998 * probing driver again. 4999 */ 5000 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5001 /* must succeed. 
*/ 5002 amdgpu_ras_resume(tmp_adev); 5003 } else { 5004 r = -EINVAL; 5005 goto out; 5006 } 5007 5008 /* Update PSP FW topology after reset */ 5009 if (reset_context->hive && 5010 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5011 r = amdgpu_xgmi_update_topology( 5012 reset_context->hive, tmp_adev); 5013 } 5014 } 5015 5016 out: 5017 if (!r) { 5018 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5019 r = amdgpu_ib_ring_tests(tmp_adev); 5020 if (r) { 5021 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5022 need_full_reset = true; 5023 r = -EAGAIN; 5024 goto end; 5025 } 5026 } 5027 5028 if (!r) 5029 r = amdgpu_device_recover_vram(tmp_adev); 5030 else 5031 tmp_adev->asic_reset_res = r; 5032 } 5033 5034 end: 5035 if (need_full_reset) 5036 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5037 else 5038 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5039 return r; 5040 } 5041 5042 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5043 { 5044 5045 switch (amdgpu_asic_reset_method(adev)) { 5046 case AMD_RESET_METHOD_MODE1: 5047 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5048 break; 5049 case AMD_RESET_METHOD_MODE2: 5050 adev->mp1_state = PP_MP1_STATE_RESET; 5051 break; 5052 default: 5053 adev->mp1_state = PP_MP1_STATE_NONE; 5054 break; 5055 } 5056 } 5057 5058 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5059 { 5060 amdgpu_vf_error_trans_all(adev); 5061 adev->mp1_state = PP_MP1_STATE_NONE; 5062 } 5063 5064 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5065 { 5066 struct pci_dev *p = NULL; 5067 5068 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5069 adev->pdev->bus->number, 1); 5070 if (p) { 5071 pm_runtime_enable(&(p->dev)); 5072 pm_runtime_resume(&(p->dev)); 5073 } 5074 5075 pci_dev_put(p); 5076 } 5077 5078 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5079 { 5080 enum amd_reset_method reset_method; 5081 struct pci_dev *p = NULL; 5082 u64 expires; 5083 5084 /* 5085 * For now, only BACO and mode1 reset are confirmed 5086 * to suffer the audio issue without proper suspended. 5087 */ 5088 reset_method = amdgpu_asic_reset_method(adev); 5089 if ((reset_method != AMD_RESET_METHOD_BACO) && 5090 (reset_method != AMD_RESET_METHOD_MODE1)) 5091 return -EINVAL; 5092 5093 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5094 adev->pdev->bus->number, 1); 5095 if (!p) 5096 return -ENODEV; 5097 5098 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5099 if (!expires) 5100 /* 5101 * If we cannot get the audio device autosuspend delay, 5102 * a fixed 4S interval will be used. Considering 3S is 5103 * the audio controller default autosuspend delay setting. 5104 * 4S used here is guaranteed to cover that. 5105 */ 5106 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5107 5108 while (!pm_runtime_status_suspended(&(p->dev))) { 5109 if (!pm_runtime_suspend(&(p->dev))) 5110 break; 5111 5112 if (expires < ktime_get_mono_fast_ns()) { 5113 dev_warn(adev->dev, "failed to suspend display audio\n"); 5114 pci_dev_put(p); 5115 /* TODO: abort the succeeding gpu reset? 
*/ 5116 return -ETIMEDOUT; 5117 } 5118 } 5119 5120 pm_runtime_disable(&(p->dev)); 5121 5122 pci_dev_put(p); 5123 return 0; 5124 } 5125 5126 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5127 { 5128 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5129 5130 #if defined(CONFIG_DEBUG_FS) 5131 if (!amdgpu_sriov_vf(adev)) 5132 cancel_work(&adev->reset_work); 5133 #endif 5134 5135 if (adev->kfd.dev) 5136 cancel_work(&adev->kfd.reset_work); 5137 5138 if (amdgpu_sriov_vf(adev)) 5139 cancel_work(&adev->virt.flr_work); 5140 5141 if (con && adev->ras_enabled) 5142 cancel_work(&con->recovery_work); 5143 5144 } 5145 5146 /** 5147 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5148 * 5149 * @adev: amdgpu_device pointer 5150 * @job: which job trigger hang 5151 * @reset_context: amdgpu reset context pointer 5152 * 5153 * Attempt to reset the GPU if it has hung (all asics). 5154 * Attempt to do soft-reset or full-reset and reinitialize Asic 5155 * Returns 0 for success or an error on failure. 5156 */ 5157 5158 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5159 struct amdgpu_job *job, 5160 struct amdgpu_reset_context *reset_context) 5161 { 5162 struct list_head device_list, *device_list_handle = NULL; 5163 bool job_signaled = false; 5164 struct amdgpu_hive_info *hive = NULL; 5165 struct amdgpu_device *tmp_adev = NULL; 5166 int i, r = 0; 5167 bool need_emergency_restart = false; 5168 bool audio_suspended = false; 5169 bool gpu_reset_for_dev_remove = false; 5170 5171 gpu_reset_for_dev_remove = 5172 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5173 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5174 5175 /* 5176 * Special case: RAS triggered and full reset isn't supported 5177 */ 5178 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5179 5180 /* 5181 * Flush RAM to disk so that after reboot 5182 * the user can read log and see why the system rebooted. 5183 */ 5184 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5185 amdgpu_ras_get_context(adev)->reboot) { 5186 DRM_WARN("Emergency reboot."); 5187 5188 ksys_sync_helper(); 5189 emergency_restart(); 5190 } 5191 5192 dev_info(adev->dev, "GPU %s begin!\n", 5193 need_emergency_restart ? "jobs stop":"reset"); 5194 5195 if (!amdgpu_sriov_vf(adev)) 5196 hive = amdgpu_get_xgmi_hive(adev); 5197 if (hive) 5198 mutex_lock(&hive->hive_lock); 5199 5200 reset_context->job = job; 5201 reset_context->hive = hive; 5202 /* 5203 * Build list of devices to reset. 5204 * In case we are in XGMI hive mode, resort the device list 5205 * to put adev in the 1st position. 
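 *
 * The head of the resulting list doubles as the "lead" device: it is the
 * one whose reset domain is locked for the whole operation below and
 * unlocked again at recover_end.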
5206 */ 5207 INIT_LIST_HEAD(&device_list); 5208 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5209 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5210 list_add_tail(&tmp_adev->reset_list, &device_list); 5211 if (gpu_reset_for_dev_remove && adev->shutdown) 5212 tmp_adev->shutdown = true; 5213 } 5214 if (!list_is_first(&adev->reset_list, &device_list)) 5215 list_rotate_to_front(&adev->reset_list, &device_list); 5216 device_list_handle = &device_list; 5217 } else { 5218 list_add_tail(&adev->reset_list, &device_list); 5219 device_list_handle = &device_list; 5220 } 5221 5222 /* We need to lock reset domain only once both for XGMI and single device */ 5223 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5224 reset_list); 5225 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5226 5227 /* block all schedulers and reset given job's ring */ 5228 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5229 5230 amdgpu_device_set_mp1_state(tmp_adev); 5231 5232 /* 5233 * Try to put the audio codec into suspend state 5234 * before gpu reset started. 5235 * 5236 * Due to the power domain of the graphics device 5237 * is shared with AZ power domain. Without this, 5238 * we may change the audio hardware from behind 5239 * the audio driver's back. That will trigger 5240 * some audio codec errors. 5241 */ 5242 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5243 audio_suspended = true; 5244 5245 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5246 5247 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5248 5249 if (!amdgpu_sriov_vf(tmp_adev)) 5250 amdgpu_amdkfd_pre_reset(tmp_adev); 5251 5252 /* 5253 * Mark these ASICs to be reseted as untracked first 5254 * And add them back after reset completed 5255 */ 5256 amdgpu_unregister_gpu_instance(tmp_adev); 5257 5258 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5259 5260 /* disable ras on ALL IPs */ 5261 if (!need_emergency_restart && 5262 amdgpu_device_ip_need_full_reset(tmp_adev)) 5263 amdgpu_ras_suspend(tmp_adev); 5264 5265 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5266 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5267 5268 if (!ring || !ring->sched.thread) 5269 continue; 5270 5271 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5272 5273 if (need_emergency_restart) 5274 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5275 } 5276 atomic_inc(&tmp_adev->gpu_reset_counter); 5277 } 5278 5279 if (need_emergency_restart) 5280 goto skip_sched_resume; 5281 5282 /* 5283 * Must check guilty signal here since after this point all old 5284 * HW fences are force signaled. 5285 * 5286 * job->base holds a reference to parent fence 5287 */ 5288 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5289 job_signaled = true; 5290 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5291 goto skip_hw_reset; 5292 } 5293 5294 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5295 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5296 if (gpu_reset_for_dev_remove) { 5297 /* Workaroud for ASICs need to disable SMC first */ 5298 amdgpu_device_smu_fini_early(tmp_adev); 5299 } 5300 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5301 /*TODO Should we stop ?*/ 5302 if (r) { 5303 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5304 r, adev_to_drm(tmp_adev)->unique); 5305 tmp_adev->asic_reset_res = r; 5306 } 5307 5308 /* 5309 * Drop all pending non scheduler resets. 
Scheduler resets 5310 * were already dropped during drm_sched_stop 5311 */ 5312 amdgpu_device_stop_pending_resets(tmp_adev); 5313 } 5314 5315 /* Actual ASIC resets if needed.*/ 5316 /* Host driver will handle XGMI hive reset for SRIOV */ 5317 if (amdgpu_sriov_vf(adev)) { 5318 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5319 if (r) 5320 adev->asic_reset_res = r; 5321 5322 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5323 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) || 5324 adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3)) 5325 amdgpu_ras_resume(adev); 5326 } else { 5327 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5328 if (r && r == -EAGAIN) 5329 goto retry; 5330 5331 if (!r && gpu_reset_for_dev_remove) 5332 goto recover_end; 5333 } 5334 5335 skip_hw_reset: 5336 5337 /* Post ASIC reset for all devs .*/ 5338 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5339 5340 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5341 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5342 5343 if (!ring || !ring->sched.thread) 5344 continue; 5345 5346 drm_sched_start(&ring->sched, true); 5347 } 5348 5349 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3)) 5350 amdgpu_mes_self_test(tmp_adev); 5351 5352 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 5353 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5354 5355 if (tmp_adev->asic_reset_res) 5356 r = tmp_adev->asic_reset_res; 5357 5358 tmp_adev->asic_reset_res = 0; 5359 5360 if (r) { 5361 /* bad news, how to tell it to userspace ? */ 5362 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5363 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5364 } else { 5365 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5366 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5367 DRM_WARN("smart shift update failed\n"); 5368 } 5369 } 5370 5371 skip_sched_resume: 5372 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5373 /* unlock kfd: SRIOV would do it separately */ 5374 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5375 amdgpu_amdkfd_post_reset(tmp_adev); 5376 5377 /* kfd_post_reset will do nothing if kfd device is not initialized, 5378 * need to bring up kfd here if it's not be initialized before 5379 */ 5380 if (!adev->kfd.init_complete) 5381 amdgpu_amdkfd_device_init(adev); 5382 5383 if (audio_suspended) 5384 amdgpu_device_resume_display_audio(tmp_adev); 5385 5386 amdgpu_device_unset_mp1_state(tmp_adev); 5387 5388 amdgpu_ras_set_error_query_ready(tmp_adev, true); 5389 } 5390 5391 recover_end: 5392 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5393 reset_list); 5394 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5395 5396 if (hive) { 5397 mutex_unlock(&hive->hive_lock); 5398 amdgpu_put_xgmi_hive(hive); 5399 } 5400 5401 if (r) 5402 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5403 5404 atomic_set(&adev->reset_domain->reset_res, r); 5405 return r; 5406 } 5407 5408 /** 5409 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 5410 * 5411 * @adev: amdgpu_device pointer 5412 * 5413 * Fetchs and stores in the driver the PCIE capabilities (gen speed 5414 * and lanes) of the slot the device is in. Handles APUs and 5415 * virtualized environments where PCIE config space may not be available. 
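 *
 * Detection can also be bypassed: when the pcie_gen_cap or pcie_lane_cap
 * module parameters are non-zero they are copied into
 * adev->pm.pcie_gen_mask / adev->pm.pcie_mlw_mask verbatim (see the checks
 * at the top of the function), using the same CAIL_* bit layout the code
 * below builds up. This is mainly useful on platforms that misreport
 * their PCIe capabilities.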
5416 */ 5417 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5418 { 5419 struct pci_dev *pdev; 5420 enum pci_bus_speed speed_cap, platform_speed_cap; 5421 enum pcie_link_width platform_link_width; 5422 5423 if (amdgpu_pcie_gen_cap) 5424 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5425 5426 if (amdgpu_pcie_lane_cap) 5427 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5428 5429 /* covers APUs as well */ 5430 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 5431 if (adev->pm.pcie_gen_mask == 0) 5432 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5433 if (adev->pm.pcie_mlw_mask == 0) 5434 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5435 return; 5436 } 5437 5438 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5439 return; 5440 5441 pcie_bandwidth_available(adev->pdev, NULL, 5442 &platform_speed_cap, &platform_link_width); 5443 5444 if (adev->pm.pcie_gen_mask == 0) { 5445 /* asic caps */ 5446 pdev = adev->pdev; 5447 speed_cap = pcie_get_speed_cap(pdev); 5448 if (speed_cap == PCI_SPEED_UNKNOWN) { 5449 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5450 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5451 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5452 } else { 5453 if (speed_cap == PCIE_SPEED_32_0GT) 5454 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5455 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5456 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5457 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5458 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5459 else if (speed_cap == PCIE_SPEED_16_0GT) 5460 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5461 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5462 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5463 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5464 else if (speed_cap == PCIE_SPEED_8_0GT) 5465 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5466 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5467 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5468 else if (speed_cap == PCIE_SPEED_5_0GT) 5469 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5470 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5471 else 5472 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5473 } 5474 /* platform caps */ 5475 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5476 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5477 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5478 } else { 5479 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5480 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5481 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5482 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5483 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5484 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5485 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5486 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5487 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5488 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5489 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5490 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5491 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5492 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5493 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5494 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5495 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5496 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5497 else 5498 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5499 5500 } 5501 } 5502 if (adev->pm.pcie_mlw_mask == 0) { 5503 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) 
{ 5504 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5505 } else { 5506 switch (platform_link_width) { 5507 case PCIE_LNK_X32: 5508 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5509 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5510 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5511 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5512 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5513 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5514 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5515 break; 5516 case PCIE_LNK_X16: 5517 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5518 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5519 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5520 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5521 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5522 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5523 break; 5524 case PCIE_LNK_X12: 5525 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5526 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5527 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5528 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5529 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5530 break; 5531 case PCIE_LNK_X8: 5532 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5533 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5534 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5535 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5536 break; 5537 case PCIE_LNK_X4: 5538 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5539 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5540 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5541 break; 5542 case PCIE_LNK_X2: 5543 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5544 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5545 break; 5546 case PCIE_LNK_X1: 5547 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5548 break; 5549 default: 5550 break; 5551 } 5552 } 5553 } 5554 } 5555 5556 /** 5557 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 5558 * 5559 * @adev: amdgpu_device pointer 5560 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 5561 * 5562 * Return true if @peer_adev can access (DMA) @adev through the PCIe 5563 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 5564 * @peer_adev. 5565 */ 5566 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 5567 struct amdgpu_device *peer_adev) 5568 { 5569 #ifdef CONFIG_HSA_AMD_P2P 5570 uint64_t address_mask = peer_adev->dev->dma_mask ? 
5571 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 5572 resource_size_t aper_limit = 5573 adev->gmc.aper_base + adev->gmc.aper_size - 1; 5574 bool p2p_access = 5575 !adev->gmc.xgmi.connected_to_cpu && 5576 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 5577 5578 return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size && 5579 adev->gmc.real_vram_size == adev->gmc.visible_vram_size && 5580 !(adev->gmc.aper_base & address_mask || 5581 aper_limit & address_mask)); 5582 #else 5583 return false; 5584 #endif 5585 } 5586 5587 int amdgpu_device_baco_enter(struct drm_device *dev) 5588 { 5589 struct amdgpu_device *adev = drm_to_adev(dev); 5590 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5591 5592 if (!amdgpu_device_supports_baco(dev)) 5593 return -ENOTSUPP; 5594 5595 if (ras && adev->ras_enabled && 5596 adev->nbio.funcs->enable_doorbell_interrupt) 5597 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5598 5599 return amdgpu_dpm_baco_enter(adev); 5600 } 5601 5602 int amdgpu_device_baco_exit(struct drm_device *dev) 5603 { 5604 struct amdgpu_device *adev = drm_to_adev(dev); 5605 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5606 int ret = 0; 5607 5608 if (!amdgpu_device_supports_baco(dev)) 5609 return -ENOTSUPP; 5610 5611 ret = amdgpu_dpm_baco_exit(adev); 5612 if (ret) 5613 return ret; 5614 5615 if (ras && adev->ras_enabled && 5616 adev->nbio.funcs->enable_doorbell_interrupt) 5617 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5618 5619 if (amdgpu_passthrough(adev) && 5620 adev->nbio.funcs->clear_doorbell_interrupt) 5621 adev->nbio.funcs->clear_doorbell_interrupt(adev); 5622 5623 return 0; 5624 } 5625 5626 /** 5627 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5628 * @pdev: PCI device struct 5629 * @state: PCI channel state 5630 * 5631 * Description: Called when a PCI error is detected. 5632 * 5633 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
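 *
 * This callback and the ones below are meant to be plugged into a
 * struct pci_error_handlers table registered with the PCI core; a sketch
 * of that table (example_err_handler is illustrative only, the actual
 * instance lives with the driver's pci_driver definition):
 *
 *   static const struct pci_error_handlers example_err_handler = {
 *       .error_detected = amdgpu_pci_error_detected,
 *       .mmio_enabled   = amdgpu_pci_mmio_enabled,
 *       .slot_reset     = amdgpu_pci_slot_reset,
 *       .resume         = amdgpu_pci_resume,
 *   };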
5634 */ 5635 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 5636 { 5637 struct drm_device *dev = pci_get_drvdata(pdev); 5638 struct amdgpu_device *adev = drm_to_adev(dev); 5639 int i; 5640 5641 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 5642 5643 if (adev->gmc.xgmi.num_physical_nodes > 1) { 5644 DRM_WARN("No support for XGMI hive yet..."); 5645 return PCI_ERS_RESULT_DISCONNECT; 5646 } 5647 5648 adev->pci_channel_state = state; 5649 5650 switch (state) { 5651 case pci_channel_io_normal: 5652 return PCI_ERS_RESULT_CAN_RECOVER; 5653 /* Fatal error, prepare for slot reset */ 5654 case pci_channel_io_frozen: 5655 /* 5656 * Locking adev->reset_domain->sem will prevent any external access 5657 * to the GPU during PCI error recovery 5658 */ 5659 amdgpu_device_lock_reset_domain(adev->reset_domain); 5660 amdgpu_device_set_mp1_state(adev); 5661 5662 /* 5663 * Block any work scheduling as we do for regular GPU reset 5664 * for the duration of the recovery 5665 */ 5666 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5667 struct amdgpu_ring *ring = adev->rings[i]; 5668 5669 if (!ring || !ring->sched.thread) 5670 continue; 5671 5672 drm_sched_stop(&ring->sched, NULL); 5673 } 5674 atomic_inc(&adev->gpu_reset_counter); 5675 return PCI_ERS_RESULT_NEED_RESET; 5676 case pci_channel_io_perm_failure: 5677 /* Permanent error, prepare for device removal */ 5678 return PCI_ERS_RESULT_DISCONNECT; 5679 } 5680 5681 return PCI_ERS_RESULT_NEED_RESET; 5682 } 5683 5684 /** 5685 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 5686 * @pdev: pointer to PCI device 5687 */ 5688 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 5689 { 5690 5691 DRM_INFO("PCI error: mmio enabled callback!!\n"); 5692 5693 /* TODO - dump whatever for debugging purposes */ 5694 5695 /* This is called only if amdgpu_pci_error_detected returns 5696 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 5697 * works, no need to reset the slot. 5698 */ 5699 5700 return PCI_ERS_RESULT_RECOVERED; 5701 } 5702 5703 /** 5704 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 5705 * @pdev: PCI device struct 5706 * 5707 * Description: This routine is called by the PCI error recovery 5708 * code after the PCI slot has been reset, just before we 5709 * should resume normal operations.
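 *
 * For orientation, a simplified sketch of the recovery sequence as driven
 * by the PCI error recovery core (per the kernel's PCI error recovery
 * documentation; not literal core code):
 *
 *	error_detected()   decides CAN_RECOVER / NEED_RESET / DISCONNECT
 *	mmio_enabled()     only if CAN_RECOVER was returned
 *	slot_reset()       after the PCI core has reset the slot/link
 *	resume()           once recovery has succeeded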
5710 */ 5711 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 5712 { 5713 struct drm_device *dev = pci_get_drvdata(pdev); 5714 struct amdgpu_device *adev = drm_to_adev(dev); 5715 int r, i; 5716 struct amdgpu_reset_context reset_context; 5717 u32 memsize; 5718 struct list_head device_list; 5719 5720 DRM_INFO("PCI error: slot reset callback!!\n"); 5721 5722 memset(&reset_context, 0, sizeof(reset_context)); 5723 5724 INIT_LIST_HEAD(&device_list); 5725 list_add_tail(&adev->reset_list, &device_list); 5726 5727 /* wait for the ASIC to come out of reset */ 5728 msleep(500); 5729 5730 /* Restore PCI config space */ 5731 amdgpu_device_load_pci_state(pdev); 5732 5733 /* confirm the ASIC came out of reset */ 5734 for (i = 0; i < adev->usec_timeout; i++) { 5735 memsize = amdgpu_asic_get_config_memsize(adev); 5736 5737 if (memsize != 0xffffffff) 5738 break; 5739 udelay(1); 5740 } 5741 if (memsize == 0xffffffff) { 5742 r = -ETIME; 5743 goto out; 5744 } 5745 5746 reset_context.method = AMD_RESET_METHOD_NONE; 5747 reset_context.reset_req_dev = adev; 5748 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 5749 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 5750 5751 adev->no_hw_access = true; 5752 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 5753 adev->no_hw_access = false; 5754 if (r) 5755 goto out; 5756 5757 r = amdgpu_do_asic_reset(&device_list, &reset_context); 5758 5759 out: 5760 if (!r) { 5761 if (amdgpu_device_cache_pci_state(adev->pdev)) 5762 pci_restore_state(adev->pdev); 5763 5764 DRM_INFO("PCIe error recovery succeeded\n"); 5765 } else { 5766 DRM_ERROR("PCIe error recovery failed, err:%d\n", r); 5767 amdgpu_device_unset_mp1_state(adev); 5768 amdgpu_device_unlock_reset_domain(adev->reset_domain); 5769 } 5770 5771 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 5772 } 5773 5774 /** 5775 * amdgpu_pci_resume() - resume normal ops after PCI reset 5776 * @pdev: pointer to PCI device 5777 * 5778 * Called when the error recovery driver tells us that it's 5779 * OK to resume normal operation.
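 *
 * Descriptive note (derived from the code below): for a frozen-channel
 * recovery this is the counterpart of amdgpu_pci_error_detected(); each
 * ring scheduler stopped there with drm_sched_stop() is restarted here
 * with drm_sched_start(), and the MP1 state and reset domain are released.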
5780 */ 5781 void amdgpu_pci_resume(struct pci_dev *pdev) 5782 { 5783 struct drm_device *dev = pci_get_drvdata(pdev); 5784 struct amdgpu_device *adev = drm_to_adev(dev); 5785 int i; 5786 5787 5788 DRM_INFO("PCI error: resume callback!!\n"); 5789 5790 /* Only continue execution for the case of pci_channel_io_frozen */ 5791 if (adev->pci_channel_state != pci_channel_io_frozen) 5792 return; 5793 5794 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5795 struct amdgpu_ring *ring = adev->rings[i]; 5796 5797 if (!ring || !ring->sched.thread) 5798 continue; 5799 5800 drm_sched_start(&ring->sched, true); 5801 } 5802 5803 amdgpu_device_unset_mp1_state(adev); 5804 amdgpu_device_unlock_reset_domain(adev->reset_domain); 5805 } 5806 5807 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 5808 { 5809 struct drm_device *dev = pci_get_drvdata(pdev); 5810 struct amdgpu_device *adev = drm_to_adev(dev); 5811 int r; 5812 5813 r = pci_save_state(pdev); 5814 if (!r) { 5815 kfree(adev->pci_state); 5816 5817 adev->pci_state = pci_store_saved_state(pdev); 5818 5819 if (!adev->pci_state) { 5820 DRM_ERROR("Failed to store PCI saved state\n"); 5821 return false; 5822 } 5823 } else { 5824 DRM_WARN("Failed to save PCI state, err:%d\n", r); 5825 return false; 5826 } 5827 5828 return true; 5829 } 5830 5831 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 5832 { 5833 struct drm_device *dev = pci_get_drvdata(pdev); 5834 struct amdgpu_device *adev = drm_to_adev(dev); 5835 int r; 5836 5837 if (!adev->pci_state) 5838 return false; 5839 5840 r = pci_load_saved_state(pdev, adev->pci_state); 5841 5842 if (!r) { 5843 pci_restore_state(pdev); 5844 } else { 5845 DRM_WARN("Failed to load PCI state, err:%d\n", r); 5846 return false; 5847 } 5848 5849 return true; 5850 } 5851 5852 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 5853 struct amdgpu_ring *ring) 5854 { 5855 #ifdef CONFIG_X86_64 5856 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 5857 return; 5858 #endif 5859 if (adev->gmc.xgmi.connected_to_cpu) 5860 return; 5861 5862 if (ring && ring->funcs->emit_hdp_flush) 5863 amdgpu_ring_emit_hdp_flush(ring); 5864 else 5865 amdgpu_asic_flush_hdp(adev, ring); 5866 } 5867 5868 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 5869 struct amdgpu_ring *ring) 5870 { 5871 #ifdef CONFIG_X86_64 5872 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 5873 return; 5874 #endif 5875 if (adev->gmc.xgmi.connected_to_cpu) 5876 return; 5877 5878 amdgpu_asic_invalidate_hdp(adev, ring); 5879 } 5880 5881 int amdgpu_in_reset(struct amdgpu_device *adev) 5882 { 5883 return atomic_read(&adev->reset_domain->in_gpu_reset); 5884 } 5885 5886 /** 5887 * amdgpu_device_halt() - bring hardware to some kind of halt state 5888 * 5889 * @adev: amdgpu_device pointer 5890 * 5891 * Bring the hardware to some kind of halt state so that no one can touch it 5892 * any more. This helps preserve the error context when an error occurs. 5893 * Compared to a simple hang, the system stays stable at least for SSH 5894 * access. It should then be trivial to inspect the hardware state and 5895 * see what is going on. Implemented as follows: 5896 * 5897 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc.), 5898 * clears all CPU mappings to the device and disallows remappings through page faults 5899 * 2. amdgpu_irq_disable_all() disables all interrupts 5900 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 5901 * 4. set adev->no_hw_access to avoid potential crashes after step 5 5902 * 5.
amdgpu_device_unmap_mmio() clears all MMIO mappings 5903 * 6. pci_disable_device() and pci_wait_for_pending_transaction() 5904 * flush any in flight DMA operations 5905 */ 5906 void amdgpu_device_halt(struct amdgpu_device *adev) 5907 { 5908 struct pci_dev *pdev = adev->pdev; 5909 struct drm_device *ddev = adev_to_drm(adev); 5910 5911 amdgpu_xcp_dev_unplug(adev); 5912 drm_dev_unplug(ddev); 5913 5914 amdgpu_irq_disable_all(adev); 5915 5916 amdgpu_fence_driver_hw_fini(adev); 5917 5918 adev->no_hw_access = true; 5919 5920 amdgpu_device_unmap_mmio(adev); 5921 5922 pci_disable_device(pdev); 5923 pci_wait_for_pending_transaction(pdev); 5924 } 5925 5926 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, 5927 u32 reg) 5928 { 5929 unsigned long flags, address, data; 5930 u32 r; 5931 5932 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 5933 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 5934 5935 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 5936 WREG32(address, reg * 4); 5937 (void)RREG32(address); 5938 r = RREG32(data); 5939 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 5940 return r; 5941 } 5942 5943 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, 5944 u32 reg, u32 v) 5945 { 5946 unsigned long flags, address, data; 5947 5948 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 5949 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 5950 5951 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 5952 WREG32(address, reg * 4); 5953 (void)RREG32(address); 5954 WREG32(data, v); 5955 (void)RREG32(data); 5956 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 5957 } 5958 5959 /** 5960 * amdgpu_device_switch_gang - switch to a new gang 5961 * @adev: amdgpu_device pointer 5962 * @gang: the gang to switch to 5963 * 5964 * Try to switch to a new gang. 5965 * Returns: NULL if we switched to the new gang or a reference to the current 5966 * gang leader. 
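 *
 * Illustrative caller sketch (hypothetical, not part of this file): a
 * submission path would typically retry until the previous gang leader has
 * signaled, dropping the returned reference each time:
 *
 *	struct dma_fence *old;
 *
 *	while ((old = amdgpu_device_switch_gang(adev, new_gang))) {
 *		long r = dma_fence_wait(old, true);
 *
 *		dma_fence_put(old);
 *		if (r)
 *			return r;
 *	}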
5967 */ 5968 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev, 5969 struct dma_fence *gang) 5970 { 5971 struct dma_fence *old = NULL; 5972 5973 do { 5974 dma_fence_put(old); 5975 rcu_read_lock(); 5976 old = dma_fence_get_rcu_safe(&adev->gang_submit); 5977 rcu_read_unlock(); 5978 5979 if (old == gang) 5980 break; 5981 5982 if (!dma_fence_is_signaled(old)) 5983 return old; 5984 5985 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit, 5986 old, gang) != old); 5987 5988 dma_fence_put(old); 5989 return NULL; 5990 } 5991 5992 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev) 5993 { 5994 switch (adev->asic_type) { 5995 #ifdef CONFIG_DRM_AMDGPU_SI 5996 case CHIP_HAINAN: 5997 #endif 5998 case CHIP_TOPAZ: 5999 /* chips with no display hardware */ 6000 return false; 6001 #ifdef CONFIG_DRM_AMDGPU_SI 6002 case CHIP_TAHITI: 6003 case CHIP_PITCAIRN: 6004 case CHIP_VERDE: 6005 case CHIP_OLAND: 6006 #endif 6007 #ifdef CONFIG_DRM_AMDGPU_CIK 6008 case CHIP_BONAIRE: 6009 case CHIP_HAWAII: 6010 case CHIP_KAVERI: 6011 case CHIP_KABINI: 6012 case CHIP_MULLINS: 6013 #endif 6014 case CHIP_TONGA: 6015 case CHIP_FIJI: 6016 case CHIP_POLARIS10: 6017 case CHIP_POLARIS11: 6018 case CHIP_POLARIS12: 6019 case CHIP_VEGAM: 6020 case CHIP_CARRIZO: 6021 case CHIP_STONEY: 6022 /* chips with display hardware */ 6023 return true; 6024 default: 6025 /* IP discovery */ 6026 if (!adev->ip_versions[DCE_HWIP][0] || 6027 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 6028 return false; 6029 return true; 6030 } 6031 } 6032 6033 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev, 6034 uint32_t inst, uint32_t reg_addr, char reg_name[], 6035 uint32_t expected_value, uint32_t mask) 6036 { 6037 uint32_t ret = 0; 6038 uint32_t old_ = 0; 6039 uint32_t tmp_ = RREG32(reg_addr); 6040 uint32_t loop = adev->usec_timeout; 6041 6042 while ((tmp_ & (mask)) != (expected_value)) { 6043 if (old_ != tmp_) { 6044 loop = adev->usec_timeout; 6045 old_ = tmp_; 6046 } else 6047 udelay(1); 6048 tmp_ = RREG32(reg_addr); 6049 loop--; 6050 if (!loop) { 6051 DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n", 6052 inst, reg_name, (uint32_t)expected_value, 6053 (uint32_t)(tmp_ & (mask))); 6054 ret = -ETIMEDOUT; 6055 break; 6056 } 6057 } 6058 return ret; 6059 } 6060
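/*
 * Illustrative usage sketch for amdgpu_device_wait_on_rreg() (hypothetical
 * register and bit names, not from this driver): a caller polling for a
 * block's "ready" bit could do:
 *
 *	if (amdgpu_device_wait_on_rreg(adev, 0, EXAMPLE_STATUS_REG,
 *				       "EXAMPLE_STATUS_REG",
 *				       EXAMPLE_READY_BIT, EXAMPLE_READY_BIT))
 *		dev_err(adev->dev, "example block never became ready\n");
 *
 * The helper restarts its timeout whenever the register value changes, so
 * it only gives up after the value has stayed unchanged for
 * adev->usec_timeout polls without (value & mask) reaching expected_value.
 */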