1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 #include <linux/iommu.h> 34 #include <linux/pci.h> 35 #include <linux/devcoredump.h> 36 #include <generated/utsrelease.h> 37 #include <linux/pci-p2pdma.h> 38 #include <linux/apple-gmux.h> 39 40 #include <drm/drm_aperture.h> 41 #include <drm/drm_atomic_helper.h> 42 #include <drm/drm_crtc_helper.h> 43 #include <drm/drm_fb_helper.h> 44 #include <drm/drm_probe_helper.h> 45 #include <drm/amdgpu_drm.h> 46 #include <linux/vgaarb.h> 47 #include <linux/vga_switcheroo.h> 48 #include <linux/efi.h> 49 #include "amdgpu.h" 50 #include "amdgpu_trace.h" 51 #include "amdgpu_i2c.h" 52 #include "atom.h" 53 #include "amdgpu_atombios.h" 54 #include "amdgpu_atomfirmware.h" 55 #include "amd_pcie.h" 56 #ifdef CONFIG_DRM_AMDGPU_SI 57 #include "si.h" 58 #endif 59 #ifdef CONFIG_DRM_AMDGPU_CIK 60 #include "cik.h" 61 #endif 62 #include "vi.h" 63 #include "soc15.h" 64 #include "nv.h" 65 #include "bif/bif_4_1_d.h" 66 #include <linux/firmware.h> 67 #include "amdgpu_vf_error.h" 68 69 #include "amdgpu_amdkfd.h" 70 #include "amdgpu_pm.h" 71 72 #include "amdgpu_xgmi.h" 73 #include "amdgpu_ras.h" 74 #include "amdgpu_pmu.h" 75 #include "amdgpu_fru_eeprom.h" 76 #include "amdgpu_reset.h" 77 78 #include <linux/suspend.h> 79 #include <drm/task_barrier.h> 80 #include <linux/pm_runtime.h> 81 82 #include <drm/drm_drv.h> 83 84 #if IS_ENABLED(CONFIG_X86) 85 #include <asm/intel-family.h> 86 #endif 87 88 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 89 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 90 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 91 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 95 96 #define AMDGPU_RESUME_MS 2000 97 #define AMDGPU_MAX_RETRY_LIMIT 2 98 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 99 100 static const struct drm_driver amdgpu_kms_driver; 101 102 const char *amdgpu_asic_name[] = { 103 "TAHITI", 104 "PITCAIRN", 105 "VERDE", 106 "OLAND", 107 "HAINAN", 108 "BONAIRE", 109 "KAVERI", 110 
"KABINI", 111 "HAWAII", 112 "MULLINS", 113 "TOPAZ", 114 "TONGA", 115 "FIJI", 116 "CARRIZO", 117 "STONEY", 118 "POLARIS10", 119 "POLARIS11", 120 "POLARIS12", 121 "VEGAM", 122 "VEGA10", 123 "VEGA12", 124 "VEGA20", 125 "RAVEN", 126 "ARCTURUS", 127 "RENOIR", 128 "ALDEBARAN", 129 "NAVI10", 130 "CYAN_SKILLFISH", 131 "NAVI14", 132 "NAVI12", 133 "SIENNA_CICHLID", 134 "NAVY_FLOUNDER", 135 "VANGOGH", 136 "DIMGREY_CAVEFISH", 137 "BEIGE_GOBY", 138 "YELLOW_CARP", 139 "IP DISCOVERY", 140 "LAST", 141 }; 142 143 /** 144 * DOC: pcie_replay_count 145 * 146 * The amdgpu driver provides a sysfs API for reporting the total number 147 * of PCIe replays (NAKs) 148 * The file pcie_replay_count is used for this and returns the total 149 * number of replays as a sum of the NAKs generated and NAKs received 150 */ 151 152 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 153 struct device_attribute *attr, char *buf) 154 { 155 struct drm_device *ddev = dev_get_drvdata(dev); 156 struct amdgpu_device *adev = drm_to_adev(ddev); 157 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 158 159 return sysfs_emit(buf, "%llu\n", cnt); 160 } 161 162 static DEVICE_ATTR(pcie_replay_count, 0444, 163 amdgpu_device_get_pcie_replay_count, NULL); 164 165 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 166 167 168 /** 169 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 170 * 171 * @dev: drm_device pointer 172 * 173 * Returns true if the device is a dGPU with ATPX power control, 174 * otherwise return false. 175 */ 176 bool amdgpu_device_supports_px(struct drm_device *dev) 177 { 178 struct amdgpu_device *adev = drm_to_adev(dev); 179 180 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 181 return true; 182 return false; 183 } 184 185 /** 186 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 187 * 188 * @dev: drm_device pointer 189 * 190 * Returns true if the device is a dGPU with ACPI power control, 191 * otherwise return false. 192 */ 193 bool amdgpu_device_supports_boco(struct drm_device *dev) 194 { 195 struct amdgpu_device *adev = drm_to_adev(dev); 196 197 if (adev->has_pr3 || 198 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 199 return true; 200 return false; 201 } 202 203 /** 204 * amdgpu_device_supports_baco - Does the device support BACO 205 * 206 * @dev: drm_device pointer 207 * 208 * Returns true if the device supporte BACO, 209 * otherwise return false. 210 */ 211 bool amdgpu_device_supports_baco(struct drm_device *dev) 212 { 213 struct amdgpu_device *adev = drm_to_adev(dev); 214 215 return amdgpu_asic_supports_baco(adev); 216 } 217 218 /** 219 * amdgpu_device_supports_smart_shift - Is the device dGPU with 220 * smart shift support 221 * 222 * @dev: drm_device pointer 223 * 224 * Returns true if the device is a dGPU with Smart Shift support, 225 * otherwise returns false. 
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by the vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value is the number of bytes that have been transferred.
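 *
 * A minimal usage sketch (illustrative only; "my_buf", "my_pos" and "len"
 * are hypothetical names, and this simply mirrors what
 * amdgpu_device_vram_access() below does):
 *
 *   size_t done = amdgpu_device_aper_access(adev, my_pos, my_buf, len, false);
 *
 *   if (done < len)
 *           amdgpu_device_mm_access(adev, my_pos + done, my_buf + done,
 *                                   len - done, false);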
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* fall back to MM access for the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
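 *
 * Callers normally use the RREG32()/RREG32_NO_KIQ() macros from amdgpu.h,
 * which are assumed here to wrap this helper with the matching @acc_flags;
 * a minimal sketch:
 *
 *   u32 idx = RREG32(mmMM_INDEX);
 *   u32 idx_direct = RREG32_NO_KIQ(mmMM_INDEX);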
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
480 */ 481 void amdgpu_device_wreg(struct amdgpu_device *adev, 482 uint32_t reg, uint32_t v, 483 uint32_t acc_flags) 484 { 485 if (amdgpu_device_skip_hw_access(adev)) 486 return; 487 488 if ((reg * 4) < adev->rmmio_size) { 489 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 490 amdgpu_sriov_runtime(adev) && 491 down_read_trylock(&adev->reset_domain->sem)) { 492 amdgpu_kiq_wreg(adev, reg, v); 493 up_read(&adev->reset_domain->sem); 494 } else { 495 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 496 } 497 } else { 498 adev->pcie_wreg(adev, reg * 4, v); 499 } 500 501 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 502 } 503 504 /** 505 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 506 * 507 * @adev: amdgpu_device pointer 508 * @reg: mmio/rlc register 509 * @v: value to write 510 * 511 * this function is invoked only for the debugfs register access 512 */ 513 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 514 uint32_t reg, uint32_t v, 515 uint32_t xcc_id) 516 { 517 if (amdgpu_device_skip_hw_access(adev)) 518 return; 519 520 if (amdgpu_sriov_fullaccess(adev) && 521 adev->gfx.rlc.funcs && 522 adev->gfx.rlc.funcs->is_rlcg_access_range) { 523 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 524 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 525 } else if ((reg * 4) >= adev->rmmio_size) { 526 adev->pcie_wreg(adev, reg * 4, v); 527 } else { 528 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 529 } 530 } 531 532 /** 533 * amdgpu_device_indirect_rreg - read an indirect register 534 * 535 * @adev: amdgpu_device pointer 536 * @reg_addr: indirect register address to read from 537 * 538 * Returns the value of indirect register @reg_addr 539 */ 540 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 541 u32 reg_addr) 542 { 543 unsigned long flags, pcie_index, pcie_data; 544 void __iomem *pcie_index_offset; 545 void __iomem *pcie_data_offset; 546 u32 r; 547 548 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 549 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 550 551 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 552 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 553 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 554 555 writel(reg_addr, pcie_index_offset); 556 readl(pcie_index_offset); 557 r = readl(pcie_data_offset); 558 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 559 560 return r; 561 } 562 563 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 564 u64 reg_addr) 565 { 566 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 567 u32 r; 568 void __iomem *pcie_index_offset; 569 void __iomem *pcie_index_hi_offset; 570 void __iomem *pcie_data_offset; 571 572 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 573 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 574 if (adev->nbio.funcs->get_pcie_index_hi_offset) 575 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 576 else 577 pcie_index_hi = 0; 578 579 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 580 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 581 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 582 if (pcie_index_hi != 0) 583 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 584 pcie_index_hi * 4; 585 586 writel(reg_addr, pcie_index_offset); 587 readl(pcie_index_offset); 588 if (pcie_index_hi != 0) { 589 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 590 readl(pcie_index_hi_offset); 591 
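		/* read back the high index register so the posted write is
		 * flushed to the device before the data register is read
		 * below
		 */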
} 592 r = readl(pcie_data_offset); 593 594 /* clear the high bits */ 595 if (pcie_index_hi != 0) { 596 writel(0, pcie_index_hi_offset); 597 readl(pcie_index_hi_offset); 598 } 599 600 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 601 602 return r; 603 } 604 605 /** 606 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 607 * 608 * @adev: amdgpu_device pointer 609 * @reg_addr: indirect register address to read from 610 * 611 * Returns the value of indirect register @reg_addr 612 */ 613 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 614 u32 reg_addr) 615 { 616 unsigned long flags, pcie_index, pcie_data; 617 void __iomem *pcie_index_offset; 618 void __iomem *pcie_data_offset; 619 u64 r; 620 621 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 622 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 623 624 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 625 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 626 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 627 628 /* read low 32 bits */ 629 writel(reg_addr, pcie_index_offset); 630 readl(pcie_index_offset); 631 r = readl(pcie_data_offset); 632 /* read high 32 bits */ 633 writel(reg_addr + 4, pcie_index_offset); 634 readl(pcie_index_offset); 635 r |= ((u64)readl(pcie_data_offset) << 32); 636 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 637 638 return r; 639 } 640 641 /** 642 * amdgpu_device_indirect_wreg - write an indirect register address 643 * 644 * @adev: amdgpu_device pointer 645 * @reg_addr: indirect register offset 646 * @reg_data: indirect register data 647 * 648 */ 649 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 650 u32 reg_addr, u32 reg_data) 651 { 652 unsigned long flags, pcie_index, pcie_data; 653 void __iomem *pcie_index_offset; 654 void __iomem *pcie_data_offset; 655 656 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 657 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 658 659 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 660 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 661 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 662 663 writel(reg_addr, pcie_index_offset); 664 readl(pcie_index_offset); 665 writel(reg_data, pcie_data_offset); 666 readl(pcie_data_offset); 667 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 668 } 669 670 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 671 u64 reg_addr, u32 reg_data) 672 { 673 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 674 void __iomem *pcie_index_offset; 675 void __iomem *pcie_index_hi_offset; 676 void __iomem *pcie_data_offset; 677 678 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 679 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 680 if (adev->nbio.funcs->get_pcie_index_hi_offset) 681 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 682 else 683 pcie_index_hi = 0; 684 685 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 686 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 687 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 688 if (pcie_index_hi != 0) 689 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 690 pcie_index_hi * 4; 691 692 writel(reg_addr, pcie_index_offset); 693 readl(pcie_index_offset); 694 if (pcie_index_hi != 0) { 695 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 696 readl(pcie_index_hi_offset); 697 } 698 writel(reg_data, pcie_data_offset); 699 readl(pcie_data_offset); 700 701 /* clear 
the high bits */ 702 if (pcie_index_hi != 0) { 703 writel(0, pcie_index_hi_offset); 704 readl(pcie_index_hi_offset); 705 } 706 707 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 708 } 709 710 /** 711 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 712 * 713 * @adev: amdgpu_device pointer 714 * @reg_addr: indirect register offset 715 * @reg_data: indirect register data 716 * 717 */ 718 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 719 u32 reg_addr, u64 reg_data) 720 { 721 unsigned long flags, pcie_index, pcie_data; 722 void __iomem *pcie_index_offset; 723 void __iomem *pcie_data_offset; 724 725 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 726 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 727 728 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 729 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 730 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 731 732 /* write low 32 bits */ 733 writel(reg_addr, pcie_index_offset); 734 readl(pcie_index_offset); 735 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 736 readl(pcie_data_offset); 737 /* write high 32 bits */ 738 writel(reg_addr + 4, pcie_index_offset); 739 readl(pcie_index_offset); 740 writel((u32)(reg_data >> 32), pcie_data_offset); 741 readl(pcie_data_offset); 742 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 743 } 744 745 /** 746 * amdgpu_device_get_rev_id - query device rev_id 747 * 748 * @adev: amdgpu_device pointer 749 * 750 * Return device rev_id 751 */ 752 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 753 { 754 return adev->nbio.funcs->get_rev_id(adev); 755 } 756 757 /** 758 * amdgpu_invalid_rreg - dummy reg read function 759 * 760 * @adev: amdgpu_device pointer 761 * @reg: offset of register 762 * 763 * Dummy register read function. Used for register blocks 764 * that certain asics don't have (all asics). 765 * Returns the value in the register. 766 */ 767 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 768 { 769 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 770 BUG(); 771 return 0; 772 } 773 774 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) 775 { 776 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 777 BUG(); 778 return 0; 779 } 780 781 /** 782 * amdgpu_invalid_wreg - dummy reg write function 783 * 784 * @adev: amdgpu_device pointer 785 * @reg: offset of register 786 * @v: value to write to the register 787 * 788 * Dummy register read function. Used for register blocks 789 * that certain asics don't have (all asics). 790 */ 791 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 792 { 793 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 794 reg, v); 795 BUG(); 796 } 797 798 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) 799 { 800 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n", 801 reg, v); 802 BUG(); 803 } 804 805 /** 806 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 807 * 808 * @adev: amdgpu_device pointer 809 * @reg: offset of register 810 * 811 * Dummy register read function. Used for register blocks 812 * that certain asics don't have (all asics). 813 * Returns the value in the register. 
814 */ 815 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 816 { 817 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 818 BUG(); 819 return 0; 820 } 821 822 /** 823 * amdgpu_invalid_wreg64 - dummy reg write function 824 * 825 * @adev: amdgpu_device pointer 826 * @reg: offset of register 827 * @v: value to write to the register 828 * 829 * Dummy register read function. Used for register blocks 830 * that certain asics don't have (all asics). 831 */ 832 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 833 { 834 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 835 reg, v); 836 BUG(); 837 } 838 839 /** 840 * amdgpu_block_invalid_rreg - dummy reg read function 841 * 842 * @adev: amdgpu_device pointer 843 * @block: offset of instance 844 * @reg: offset of register 845 * 846 * Dummy register read function. Used for register blocks 847 * that certain asics don't have (all asics). 848 * Returns the value in the register. 849 */ 850 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 851 uint32_t block, uint32_t reg) 852 { 853 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 854 reg, block); 855 BUG(); 856 return 0; 857 } 858 859 /** 860 * amdgpu_block_invalid_wreg - dummy reg write function 861 * 862 * @adev: amdgpu_device pointer 863 * @block: offset of instance 864 * @reg: offset of register 865 * @v: value to write to the register 866 * 867 * Dummy register read function. Used for register blocks 868 * that certain asics don't have (all asics). 869 */ 870 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 871 uint32_t block, 872 uint32_t reg, uint32_t v) 873 { 874 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 875 reg, block, v); 876 BUG(); 877 } 878 879 /** 880 * amdgpu_device_asic_init - Wrapper for atom asic_init 881 * 882 * @adev: amdgpu_device pointer 883 * 884 * Does any asic specific work and then calls atom asic init. 885 */ 886 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 887 { 888 int ret; 889 890 amdgpu_asic_pre_asic_init(adev); 891 892 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) || 893 adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) { 894 amdgpu_psp_wait_for_bootloader(adev); 895 ret = amdgpu_atomfirmware_asic_init(adev, true); 896 return ret; 897 } else { 898 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 899 } 900 901 return 0; 902 } 903 904 /** 905 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page 906 * 907 * @adev: amdgpu_device pointer 908 * 909 * Allocates a scratch page of VRAM for use by various things in the 910 * driver. 911 */ 912 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev) 913 { 914 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE, 915 AMDGPU_GEM_DOMAIN_VRAM | 916 AMDGPU_GEM_DOMAIN_GTT, 917 &adev->mem_scratch.robj, 918 &adev->mem_scratch.gpu_addr, 919 (void **)&adev->mem_scratch.ptr); 920 } 921 922 /** 923 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page 924 * 925 * @adev: amdgpu_device pointer 926 * 927 * Frees the VRAM scratch page. 928 */ 929 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev) 930 { 931 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL); 932 } 933 934 /** 935 * amdgpu_device_program_register_sequence - program an array of registers. 
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or an -error on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
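 *
 * A minimal usage sketch (illustrative; "my_wb" is a hypothetical variable
 * and real callers keep the slot for the lifetime of the ring or fence that
 * uses it):
 *
 *   u32 my_wb;
 *
 *   if (!amdgpu_device_wb_get(adev, &my_wb)) {
 *           u64 wb_gpu_addr = adev->wb.gpu_addr + my_wb * 4;
 *           u32 wb_value = adev->wb.wb[my_wb];
 *
 *           amdgpu_device_wb_free(adev, my_wb);
 *   }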
1064 */ 1065 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1066 { 1067 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1068 1069 if (offset < adev->wb.num_wb) { 1070 __set_bit(offset, adev->wb.used); 1071 *wb = offset << 3; /* convert to dw offset */ 1072 return 0; 1073 } else { 1074 return -EINVAL; 1075 } 1076 } 1077 1078 /** 1079 * amdgpu_device_wb_free - Free a wb entry 1080 * 1081 * @adev: amdgpu_device pointer 1082 * @wb: wb index 1083 * 1084 * Free a wb slot allocated for use by the driver (all asics) 1085 */ 1086 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1087 { 1088 wb >>= 3; 1089 if (wb < adev->wb.num_wb) 1090 __clear_bit(wb, adev->wb.used); 1091 } 1092 1093 /** 1094 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1095 * 1096 * @adev: amdgpu_device pointer 1097 * 1098 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1099 * to fail, but if any of the BARs is not accessible after the size we abort 1100 * driver loading by returning -ENODEV. 1101 */ 1102 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1103 { 1104 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1105 struct pci_bus *root; 1106 struct resource *res; 1107 unsigned int i; 1108 u16 cmd; 1109 int r; 1110 1111 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1112 return 0; 1113 1114 /* Bypass for VF */ 1115 if (amdgpu_sriov_vf(adev)) 1116 return 0; 1117 1118 /* skip if the bios has already enabled large BAR */ 1119 if (adev->gmc.real_vram_size && 1120 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1121 return 0; 1122 1123 /* Check if the root BUS has 64bit memory resources */ 1124 root = adev->pdev->bus; 1125 while (root->parent) 1126 root = root->parent; 1127 1128 pci_bus_for_each_resource(root, res, i) { 1129 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1130 res->start > 0x100000000ull) 1131 break; 1132 } 1133 1134 /* Trying to resize is pointless without a root hub window above 4GB */ 1135 if (!res) 1136 return 0; 1137 1138 /* Limit the BAR size to what is available */ 1139 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1140 rbar_size); 1141 1142 /* Disable memory decoding while we change the BAR addresses and size */ 1143 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1144 pci_write_config_word(adev->pdev, PCI_COMMAND, 1145 cmd & ~PCI_COMMAND_MEMORY); 1146 1147 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1148 amdgpu_doorbell_fini(adev); 1149 if (adev->asic_type >= CHIP_BONAIRE) 1150 pci_release_resource(adev->pdev, 2); 1151 1152 pci_release_resource(adev->pdev, 0); 1153 1154 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1155 if (r == -ENOSPC) 1156 DRM_INFO("Not enough PCI address space for a large BAR."); 1157 else if (r && r != -ENOTSUPP) 1158 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1159 1160 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1161 1162 /* When the doorbell or fb BAR isn't available we have no chance of 1163 * using the device. 
1164 */ 1165 r = amdgpu_doorbell_init(adev); 1166 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1167 return -ENODEV; 1168 1169 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1170 1171 return 0; 1172 } 1173 1174 static bool amdgpu_device_read_bios(struct amdgpu_device *adev) 1175 { 1176 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) 1177 return false; 1178 1179 return true; 1180 } 1181 1182 /* 1183 * GPU helpers function. 1184 */ 1185 /** 1186 * amdgpu_device_need_post - check if the hw need post or not 1187 * 1188 * @adev: amdgpu_device pointer 1189 * 1190 * Check if the asic has been initialized (all asics) at driver startup 1191 * or post is needed if hw reset is performed. 1192 * Returns true if need or false if not. 1193 */ 1194 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1195 { 1196 uint32_t reg; 1197 1198 if (amdgpu_sriov_vf(adev)) 1199 return false; 1200 1201 if (!amdgpu_device_read_bios(adev)) 1202 return false; 1203 1204 if (amdgpu_passthrough(adev)) { 1205 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1206 * some old smc fw still need driver do vPost otherwise gpu hang, while 1207 * those smc fw version above 22.15 doesn't have this flaw, so we force 1208 * vpost executed for smc version below 22.15 1209 */ 1210 if (adev->asic_type == CHIP_FIJI) { 1211 int err; 1212 uint32_t fw_ver; 1213 1214 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1215 /* force vPost if error occured */ 1216 if (err) 1217 return true; 1218 1219 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1220 if (fw_ver < 0x00160e00) 1221 return true; 1222 } 1223 } 1224 1225 /* Don't post if we need to reset whole hive on init */ 1226 if (adev->gmc.xgmi.pending_reset) 1227 return false; 1228 1229 if (adev->has_hw_reset) { 1230 adev->has_hw_reset = false; 1231 return true; 1232 } 1233 1234 /* bios scratch used on CIK+ */ 1235 if (adev->asic_type >= CHIP_BONAIRE) 1236 return amdgpu_atombios_scratch_need_asic_init(adev); 1237 1238 /* check MEM_SIZE for older asics */ 1239 reg = amdgpu_asic_get_config_memsize(adev); 1240 1241 if ((reg != 0) && (reg != 0xffffffff)) 1242 return false; 1243 1244 return true; 1245 } 1246 1247 /* 1248 * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic 1249 * speed switching. Until we have confirmation from Intel that a specific host 1250 * supports it, it's safer that we keep it disabled for all. 1251 * 1252 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 1253 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 1254 */ 1255 bool amdgpu_device_pcie_dynamic_switching_supported(void) 1256 { 1257 #if IS_ENABLED(CONFIG_X86) 1258 struct cpuinfo_x86 *c = &cpu_data(0); 1259 1260 if (c->x86_vendor == X86_VENDOR_INTEL) 1261 return false; 1262 #endif 1263 return true; 1264 } 1265 1266 /** 1267 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1268 * 1269 * @adev: amdgpu_device pointer 1270 * 1271 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1272 * be set for this device. 1273 * 1274 * Returns true if it should be used or false if not. 
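 *
 * The decision tracks the amdgpu_aspm module parameter as handled below:
 * 0 forces ASPM off, 1 forces it on, and -1 (auto) defers to whatever
 * pcie_aspm_enabled() reports for the device.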
1275 */ 1276 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1277 { 1278 switch (amdgpu_aspm) { 1279 case -1: 1280 break; 1281 case 0: 1282 return false; 1283 case 1: 1284 return true; 1285 default: 1286 return false; 1287 } 1288 return pcie_aspm_enabled(adev->pdev); 1289 } 1290 1291 bool amdgpu_device_aspm_support_quirk(void) 1292 { 1293 #if IS_ENABLED(CONFIG_X86) 1294 struct cpuinfo_x86 *c = &cpu_data(0); 1295 1296 return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE); 1297 #else 1298 return true; 1299 #endif 1300 } 1301 1302 /* if we get transitioned to only one device, take VGA back */ 1303 /** 1304 * amdgpu_device_vga_set_decode - enable/disable vga decode 1305 * 1306 * @pdev: PCI device pointer 1307 * @state: enable/disable vga decode 1308 * 1309 * Enable/disable vga decode (all asics). 1310 * Returns VGA resource flags. 1311 */ 1312 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1313 bool state) 1314 { 1315 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1316 1317 amdgpu_asic_set_vga_state(adev, state); 1318 if (state) 1319 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1320 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1321 else 1322 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1323 } 1324 1325 /** 1326 * amdgpu_device_check_block_size - validate the vm block size 1327 * 1328 * @adev: amdgpu_device pointer 1329 * 1330 * Validates the vm block size specified via module parameter. 1331 * The vm block size defines number of bits in page table versus page directory, 1332 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1333 * page table and the remaining bits are in the page directory. 1334 */ 1335 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1336 { 1337 /* defines number of bits in page table versus page directory, 1338 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1339 * page table and the remaining bits are in the page directory 1340 */ 1341 if (amdgpu_vm_block_size == -1) 1342 return; 1343 1344 if (amdgpu_vm_block_size < 9) { 1345 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1346 amdgpu_vm_block_size); 1347 amdgpu_vm_block_size = -1; 1348 } 1349 } 1350 1351 /** 1352 * amdgpu_device_check_vm_size - validate the vm size 1353 * 1354 * @adev: amdgpu_device pointer 1355 * 1356 * Validates the vm size in GB specified via module parameter. 1357 * The VM size is the size of the GPU virtual memory space in GB. 
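 *
 * For example, booting with amdgpu.vm_size=8 requests an 8 GB GPU VM space;
 * values below 1 GB are rejected below and the parameter falls back to the
 * default of -1 (sized automatically by the driver).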
1358 */ 1359 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1360 { 1361 /* no need to check the default value */ 1362 if (amdgpu_vm_size == -1) 1363 return; 1364 1365 if (amdgpu_vm_size < 1) { 1366 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1367 amdgpu_vm_size); 1368 amdgpu_vm_size = -1; 1369 } 1370 } 1371 1372 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1373 { 1374 struct sysinfo si; 1375 bool is_os_64 = (sizeof(void *) == 8); 1376 uint64_t total_memory; 1377 uint64_t dram_size_seven_GB = 0x1B8000000; 1378 uint64_t dram_size_three_GB = 0xB8000000; 1379 1380 if (amdgpu_smu_memory_pool_size == 0) 1381 return; 1382 1383 if (!is_os_64) { 1384 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1385 goto def_value; 1386 } 1387 si_meminfo(&si); 1388 total_memory = (uint64_t)si.totalram * si.mem_unit; 1389 1390 if ((amdgpu_smu_memory_pool_size == 1) || 1391 (amdgpu_smu_memory_pool_size == 2)) { 1392 if (total_memory < dram_size_three_GB) 1393 goto def_value1; 1394 } else if ((amdgpu_smu_memory_pool_size == 4) || 1395 (amdgpu_smu_memory_pool_size == 8)) { 1396 if (total_memory < dram_size_seven_GB) 1397 goto def_value1; 1398 } else { 1399 DRM_WARN("Smu memory pool size not supported\n"); 1400 goto def_value; 1401 } 1402 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1403 1404 return; 1405 1406 def_value1: 1407 DRM_WARN("No enough system memory\n"); 1408 def_value: 1409 adev->pm.smu_prv_buffer_size = 0; 1410 } 1411 1412 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1413 { 1414 if (!(adev->flags & AMD_IS_APU) || 1415 adev->asic_type < CHIP_RAVEN) 1416 return 0; 1417 1418 switch (adev->asic_type) { 1419 case CHIP_RAVEN: 1420 if (adev->pdev->device == 0x15dd) 1421 adev->apu_flags |= AMD_APU_IS_RAVEN; 1422 if (adev->pdev->device == 0x15d8) 1423 adev->apu_flags |= AMD_APU_IS_PICASSO; 1424 break; 1425 case CHIP_RENOIR: 1426 if ((adev->pdev->device == 0x1636) || 1427 (adev->pdev->device == 0x164c)) 1428 adev->apu_flags |= AMD_APU_IS_RENOIR; 1429 else 1430 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1431 break; 1432 case CHIP_VANGOGH: 1433 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1434 break; 1435 case CHIP_YELLOW_CARP: 1436 break; 1437 case CHIP_CYAN_SKILLFISH: 1438 if ((adev->pdev->device == 0x13FE) || 1439 (adev->pdev->device == 0x143F)) 1440 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1441 break; 1442 default: 1443 break; 1444 } 1445 1446 return 0; 1447 } 1448 1449 /** 1450 * amdgpu_device_check_arguments - validate module params 1451 * 1452 * @adev: amdgpu_device pointer 1453 * 1454 * Validates certain module parameters and updates 1455 * the associated values used by the driver (all asics). 
1456 */ 1457 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1458 { 1459 if (amdgpu_sched_jobs < 4) { 1460 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1461 amdgpu_sched_jobs); 1462 amdgpu_sched_jobs = 4; 1463 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 1464 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1465 amdgpu_sched_jobs); 1466 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1467 } 1468 1469 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1470 /* gart size must be greater or equal to 32M */ 1471 dev_warn(adev->dev, "gart size (%d) too small\n", 1472 amdgpu_gart_size); 1473 amdgpu_gart_size = -1; 1474 } 1475 1476 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1477 /* gtt size must be greater or equal to 32M */ 1478 dev_warn(adev->dev, "gtt size (%d) too small\n", 1479 amdgpu_gtt_size); 1480 amdgpu_gtt_size = -1; 1481 } 1482 1483 /* valid range is between 4 and 9 inclusive */ 1484 if (amdgpu_vm_fragment_size != -1 && 1485 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1486 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1487 amdgpu_vm_fragment_size = -1; 1488 } 1489 1490 if (amdgpu_sched_hw_submission < 2) { 1491 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1492 amdgpu_sched_hw_submission); 1493 amdgpu_sched_hw_submission = 2; 1494 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1495 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1496 amdgpu_sched_hw_submission); 1497 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1498 } 1499 1500 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 1501 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 1502 amdgpu_reset_method = -1; 1503 } 1504 1505 amdgpu_device_check_smu_prv_buffer_size(adev); 1506 1507 amdgpu_device_check_vm_size(adev); 1508 1509 amdgpu_device_check_block_size(adev); 1510 1511 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1512 1513 return 0; 1514 } 1515 1516 /** 1517 * amdgpu_switcheroo_set_state - set switcheroo state 1518 * 1519 * @pdev: pci dev pointer 1520 * @state: vga_switcheroo state 1521 * 1522 * Callback for the switcheroo driver. Suspends or resumes 1523 * the asics before or after it is powered up using ACPI methods. 
1524 */ 1525 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1526 enum vga_switcheroo_state state) 1527 { 1528 struct drm_device *dev = pci_get_drvdata(pdev); 1529 int r; 1530 1531 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1532 return; 1533 1534 if (state == VGA_SWITCHEROO_ON) { 1535 pr_info("switched on\n"); 1536 /* don't suspend or resume card normally */ 1537 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1538 1539 pci_set_power_state(pdev, PCI_D0); 1540 amdgpu_device_load_pci_state(pdev); 1541 r = pci_enable_device(pdev); 1542 if (r) 1543 DRM_WARN("pci_enable_device failed (%d)\n", r); 1544 amdgpu_device_resume(dev, true); 1545 1546 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1547 } else { 1548 pr_info("switched off\n"); 1549 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1550 amdgpu_device_suspend(dev, true); 1551 amdgpu_device_cache_pci_state(pdev); 1552 /* Shut down the device */ 1553 pci_disable_device(pdev); 1554 pci_set_power_state(pdev, PCI_D3cold); 1555 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1556 } 1557 } 1558 1559 /** 1560 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1561 * 1562 * @pdev: pci dev pointer 1563 * 1564 * Callback for the switcheroo driver. Check of the switcheroo 1565 * state can be changed. 1566 * Returns true if the state can be changed, false if not. 1567 */ 1568 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1569 { 1570 struct drm_device *dev = pci_get_drvdata(pdev); 1571 1572 /* 1573 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1574 * locking inversion with the driver load path. And the access here is 1575 * completely racy anyway. So don't bother with locking for now. 1576 */ 1577 return atomic_read(&dev->open_count) == 0; 1578 } 1579 1580 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1581 .set_gpu_state = amdgpu_switcheroo_set_state, 1582 .reprobe = NULL, 1583 .can_switch = amdgpu_switcheroo_can_switch, 1584 }; 1585 1586 /** 1587 * amdgpu_device_ip_set_clockgating_state - set the CG state 1588 * 1589 * @dev: amdgpu_device pointer 1590 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1591 * @state: clockgating state (gate or ungate) 1592 * 1593 * Sets the requested clockgating state for all instances of 1594 * the hardware IP specified. 1595 * Returns the error code from the last instance. 1596 */ 1597 int amdgpu_device_ip_set_clockgating_state(void *dev, 1598 enum amd_ip_block_type block_type, 1599 enum amd_clockgating_state state) 1600 { 1601 struct amdgpu_device *adev = dev; 1602 int i, r = 0; 1603 1604 for (i = 0; i < adev->num_ip_blocks; i++) { 1605 if (!adev->ip_blocks[i].status.valid) 1606 continue; 1607 if (adev->ip_blocks[i].version->type != block_type) 1608 continue; 1609 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1610 continue; 1611 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1612 (void *)adev, state); 1613 if (r) 1614 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1615 adev->ip_blocks[i].version->funcs->name, r); 1616 } 1617 return r; 1618 } 1619 1620 /** 1621 * amdgpu_device_ip_set_powergating_state - set the PG state 1622 * 1623 * @dev: amdgpu_device pointer 1624 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1625 * @state: powergating state (gate or ungate) 1626 * 1627 * Sets the requested powergating state for all instances of 1628 * the hardware IP specified. 1629 * Returns the error code from the last instance. 
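 *
 * A minimal sketch of a call site (illustrative; the real callers live in
 * the power management and IP-specific code):
 *
 *   amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 *                                          AMD_PG_STATE_GATE);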
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u64 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
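 *
 * Illustrative lookup (the NULL check matters because not every asic
 * carries every IP block):
 *
 *   struct amdgpu_ip_block *ip =
 *           amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *
 *   if (ip)
 *           DRM_INFO("GFX IP v%d.%d\n", ip->version->major,
 *                    ip->version->minor);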
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if equal or greater
 * return 1 if smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	switch (ip_block_version->type) {
	case AMD_IP_BLOCK_TYPE_VCN:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
			return 0;
		break;
	case AMD_IP_BLOCK_TYPE_JPEG:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
			return 0;
		break;
	default:
		break;
	}

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
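 *
 * The string is a semicolon-separated list of PCI addresses (or the keyword
 * "all"), each optionally followed by a crtc count that is clamped to the
 * range 1-6; for example (the address is purely illustrative):
 *
 *   amdgpu.virtual_display=0000:01:00.0,2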
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		const char *pci_address_name = pci_name(adev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}

void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
		adev->mode_info.num_crtc = 1;
		adev->enable_virtual_display = true;
		DRM_INFO("virtual_display:%d, num_crtc:%d\n",
			 adev->enable_virtual_display, adev->mode_info.num_crtc);
	}
}

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	char fw_name[40];
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	if (adev->mman.discovery_bin) {
		/*
		 * FIXME: The bounding box is still needed by Navi12, so
		 * temporarily read it from gpu_info firmware. Should be dropped
		 * when DAL no longer needs it.
1903 */ 1904 if (adev->asic_type != CHIP_NAVI12) 1905 return 0; 1906 } 1907 1908 switch (adev->asic_type) { 1909 default: 1910 return 0; 1911 case CHIP_VEGA10: 1912 chip_name = "vega10"; 1913 break; 1914 case CHIP_VEGA12: 1915 chip_name = "vega12"; 1916 break; 1917 case CHIP_RAVEN: 1918 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1919 chip_name = "raven2"; 1920 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1921 chip_name = "picasso"; 1922 else 1923 chip_name = "raven"; 1924 break; 1925 case CHIP_ARCTURUS: 1926 chip_name = "arcturus"; 1927 break; 1928 case CHIP_NAVI12: 1929 chip_name = "navi12"; 1930 break; 1931 } 1932 1933 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 1934 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name); 1935 if (err) { 1936 dev_err(adev->dev, 1937 "Failed to get gpu_info firmware \"%s\"\n", 1938 fw_name); 1939 goto out; 1940 } 1941 1942 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 1943 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 1944 1945 switch (hdr->version_major) { 1946 case 1: 1947 { 1948 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 1949 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 1950 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1951 1952 /* 1953 * Should be droped when DAL no longer needs it. 1954 */ 1955 if (adev->asic_type == CHIP_NAVI12) 1956 goto parse_soc_bounding_box; 1957 1958 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 1959 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 1960 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 1961 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 1962 adev->gfx.config.max_texture_channel_caches = 1963 le32_to_cpu(gpu_info_fw->gc_num_tccs); 1964 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 1965 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 1966 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 1967 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 1968 adev->gfx.config.double_offchip_lds_buf = 1969 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 1970 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 1971 adev->gfx.cu_info.max_waves_per_simd = 1972 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 1973 adev->gfx.cu_info.max_scratch_slots_per_cu = 1974 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 1975 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 1976 if (hdr->version_minor >= 1) { 1977 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 1978 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 1979 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1980 adev->gfx.config.num_sc_per_sh = 1981 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 1982 adev->gfx.config.num_packer_per_sc = 1983 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 1984 } 1985 1986 parse_soc_bounding_box: 1987 /* 1988 * soc bounding box info is not integrated in disocovery table, 1989 * we always need to parse it from gpu info firmware if needed. 
1990 */ 1991 if (hdr->version_minor == 2) { 1992 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 1993 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 1994 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1995 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 1996 } 1997 break; 1998 } 1999 default: 2000 dev_err(adev->dev, 2001 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2002 err = -EINVAL; 2003 goto out; 2004 } 2005 out: 2006 return err; 2007 } 2008 2009 /** 2010 * amdgpu_device_ip_early_init - run early init for hardware IPs 2011 * 2012 * @adev: amdgpu_device pointer 2013 * 2014 * Early initialization pass for hardware IPs. The hardware IPs that make 2015 * up each asic are discovered each IP's early_init callback is run. This 2016 * is the first stage in initializing the asic. 2017 * Returns 0 on success, negative error code on failure. 2018 */ 2019 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2020 { 2021 struct drm_device *dev = adev_to_drm(adev); 2022 struct pci_dev *parent; 2023 int i, r; 2024 bool total; 2025 2026 amdgpu_device_enable_virtual_display(adev); 2027 2028 if (amdgpu_sriov_vf(adev)) { 2029 r = amdgpu_virt_request_full_gpu(adev, true); 2030 if (r) 2031 return r; 2032 } 2033 2034 switch (adev->asic_type) { 2035 #ifdef CONFIG_DRM_AMDGPU_SI 2036 case CHIP_VERDE: 2037 case CHIP_TAHITI: 2038 case CHIP_PITCAIRN: 2039 case CHIP_OLAND: 2040 case CHIP_HAINAN: 2041 adev->family = AMDGPU_FAMILY_SI; 2042 r = si_set_ip_blocks(adev); 2043 if (r) 2044 return r; 2045 break; 2046 #endif 2047 #ifdef CONFIG_DRM_AMDGPU_CIK 2048 case CHIP_BONAIRE: 2049 case CHIP_HAWAII: 2050 case CHIP_KAVERI: 2051 case CHIP_KABINI: 2052 case CHIP_MULLINS: 2053 if (adev->flags & AMD_IS_APU) 2054 adev->family = AMDGPU_FAMILY_KV; 2055 else 2056 adev->family = AMDGPU_FAMILY_CI; 2057 2058 r = cik_set_ip_blocks(adev); 2059 if (r) 2060 return r; 2061 break; 2062 #endif 2063 case CHIP_TOPAZ: 2064 case CHIP_TONGA: 2065 case CHIP_FIJI: 2066 case CHIP_POLARIS10: 2067 case CHIP_POLARIS11: 2068 case CHIP_POLARIS12: 2069 case CHIP_VEGAM: 2070 case CHIP_CARRIZO: 2071 case CHIP_STONEY: 2072 if (adev->flags & AMD_IS_APU) 2073 adev->family = AMDGPU_FAMILY_CZ; 2074 else 2075 adev->family = AMDGPU_FAMILY_VI; 2076 2077 r = vi_set_ip_blocks(adev); 2078 if (r) 2079 return r; 2080 break; 2081 default: 2082 r = amdgpu_discovery_set_ip_blocks(adev); 2083 if (r) 2084 return r; 2085 break; 2086 } 2087 2088 if (amdgpu_has_atpx() && 2089 (amdgpu_is_atpx_hybrid() || 2090 amdgpu_has_atpx_dgpu_power_cntl()) && 2091 ((adev->flags & AMD_IS_APU) == 0) && 2092 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev))) 2093 adev->flags |= AMD_IS_PX; 2094 2095 if (!(adev->flags & AMD_IS_APU)) { 2096 parent = pcie_find_root_port(adev->pdev); 2097 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2098 } 2099 2100 2101 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2102 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2103 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2104 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2105 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2106 2107 total = true; 2108 for (i = 0; i < adev->num_ip_blocks; i++) { 2109 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2110 DRM_WARN("disabled ip block: %d <%s>\n", 2111 i, adev->ip_blocks[i].version->funcs->name); 2112 adev->ip_blocks[i].status.valid = false; 2113 } else { 2114 if (adev->ip_blocks[i].version->funcs->early_init) { 2115 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2116 if (r == -ENOENT) { 2117 adev->ip_blocks[i].status.valid = false; 2118 } else if (r) { 2119 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2120 adev->ip_blocks[i].version->funcs->name, r); 2121 total = false; 2122 } else { 2123 adev->ip_blocks[i].status.valid = true; 2124 } 2125 } else { 2126 adev->ip_blocks[i].status.valid = true; 2127 } 2128 } 2129 /* get the vbios after the asic_funcs are set up */ 2130 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2131 r = amdgpu_device_parse_gpu_info_fw(adev); 2132 if (r) 2133 return r; 2134 2135 /* Read BIOS */ 2136 if (amdgpu_device_read_bios(adev)) { 2137 if (!amdgpu_get_bios(adev)) 2138 return -EINVAL; 2139 2140 r = amdgpu_atombios_init(adev); 2141 if (r) { 2142 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2143 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2144 return r; 2145 } 2146 } 2147 2148 /*get pf2vf msg info at it's earliest time*/ 2149 if (amdgpu_sriov_vf(adev)) 2150 amdgpu_virt_init_data_exchange(adev); 2151 2152 } 2153 } 2154 if (!total) 2155 return -ENODEV; 2156 2157 amdgpu_amdkfd_device_probe(adev); 2158 adev->cg_flags &= amdgpu_cg_mask; 2159 adev->pg_flags &= amdgpu_pg_mask; 2160 2161 return 0; 2162 } 2163 2164 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2165 { 2166 int i, r; 2167 2168 for (i = 0; i < adev->num_ip_blocks; i++) { 2169 if (!adev->ip_blocks[i].status.sw) 2170 continue; 2171 if (adev->ip_blocks[i].status.hw) 2172 continue; 2173 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2174 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2175 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2176 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2177 if (r) { 2178 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2179 adev->ip_blocks[i].version->funcs->name, r); 2180 return r; 2181 } 2182 adev->ip_blocks[i].status.hw = true; 2183 } 2184 } 2185 2186 return 0; 2187 } 2188 2189 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2190 { 2191 int i, r; 2192 2193 for (i = 0; i < adev->num_ip_blocks; i++) { 2194 if (!adev->ip_blocks[i].status.sw) 2195 continue; 2196 if (adev->ip_blocks[i].status.hw) 2197 continue; 2198 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2199 if (r) { 2200 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2201 adev->ip_blocks[i].version->funcs->name, r); 2202 return r; 2203 } 2204 adev->ip_blocks[i].status.hw = true; 2205 } 2206 2207 return 0; 2208 } 2209 2210 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2211 { 2212 int r = 0; 2213 int i; 2214 uint32_t smu_version; 2215 2216 if (adev->asic_type >= CHIP_VEGA10) { 2217 for (i = 0; i < adev->num_ip_blocks; i++) { 2218 if 
(adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2219 continue; 2220 2221 if (!adev->ip_blocks[i].status.sw) 2222 continue; 2223 2224 /* no need to do the fw loading again if already done*/ 2225 if (adev->ip_blocks[i].status.hw == true) 2226 break; 2227 2228 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2229 r = adev->ip_blocks[i].version->funcs->resume(adev); 2230 if (r) { 2231 DRM_ERROR("resume of IP block <%s> failed %d\n", 2232 adev->ip_blocks[i].version->funcs->name, r); 2233 return r; 2234 } 2235 } else { 2236 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2237 if (r) { 2238 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2239 adev->ip_blocks[i].version->funcs->name, r); 2240 return r; 2241 } 2242 } 2243 2244 adev->ip_blocks[i].status.hw = true; 2245 break; 2246 } 2247 } 2248 2249 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2250 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2251 2252 return r; 2253 } 2254 2255 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2256 { 2257 long timeout; 2258 int r, i; 2259 2260 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2261 struct amdgpu_ring *ring = adev->rings[i]; 2262 2263 /* No need to setup the GPU scheduler for rings that don't need it */ 2264 if (!ring || ring->no_scheduler) 2265 continue; 2266 2267 switch (ring->funcs->type) { 2268 case AMDGPU_RING_TYPE_GFX: 2269 timeout = adev->gfx_timeout; 2270 break; 2271 case AMDGPU_RING_TYPE_COMPUTE: 2272 timeout = adev->compute_timeout; 2273 break; 2274 case AMDGPU_RING_TYPE_SDMA: 2275 timeout = adev->sdma_timeout; 2276 break; 2277 default: 2278 timeout = adev->video_timeout; 2279 break; 2280 } 2281 2282 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2283 ring->num_hw_submission, 0, 2284 timeout, adev->reset_domain->wq, 2285 ring->sched_score, ring->name, 2286 adev->dev); 2287 if (r) { 2288 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2289 ring->name); 2290 return r; 2291 } 2292 } 2293 2294 amdgpu_xcp_update_partition_sched_list(adev); 2295 2296 return 0; 2297 } 2298 2299 2300 /** 2301 * amdgpu_device_ip_init - run init for hardware IPs 2302 * 2303 * @adev: amdgpu_device pointer 2304 * 2305 * Main initialization pass for hardware IPs. The list of all the hardware 2306 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2307 * are run. sw_init initializes the software state associated with each IP 2308 * and hw_init initializes the hardware associated with each IP. 2309 * Returns 0 on success, negative error code on failure. 
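 *
 * Roughly, as implemented below, the sequence is:
 *
 *   sw_init for every valid IP block
 *   early hw_init for COMMON and GMC (with scratch memory, writeback and,
 *     when MCBP is enabled, CSA buffers set up around the GMC bring-up)
 *   amdgpu_device_ip_hw_init_phase1()  - IH (plus PSP on SR-IOV)
 *   amdgpu_device_fw_loading()         - PSP bring-up and SMU firmware load
 *   amdgpu_device_ip_hw_init_phase2()  - all remaining IP blocks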
2310 */ 2311 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2312 { 2313 int i, r; 2314 2315 r = amdgpu_ras_init(adev); 2316 if (r) 2317 return r; 2318 2319 for (i = 0; i < adev->num_ip_blocks; i++) { 2320 if (!adev->ip_blocks[i].status.valid) 2321 continue; 2322 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2323 if (r) { 2324 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2325 adev->ip_blocks[i].version->funcs->name, r); 2326 goto init_failed; 2327 } 2328 adev->ip_blocks[i].status.sw = true; 2329 2330 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2331 /* need to do common hw init early so everything is set up for gmc */ 2332 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2333 if (r) { 2334 DRM_ERROR("hw_init %d failed %d\n", i, r); 2335 goto init_failed; 2336 } 2337 adev->ip_blocks[i].status.hw = true; 2338 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2339 /* need to do gmc hw init early so we can allocate gpu mem */ 2340 /* Try to reserve bad pages early */ 2341 if (amdgpu_sriov_vf(adev)) 2342 amdgpu_virt_exchange_data(adev); 2343 2344 r = amdgpu_device_mem_scratch_init(adev); 2345 if (r) { 2346 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2347 goto init_failed; 2348 } 2349 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2350 if (r) { 2351 DRM_ERROR("hw_init %d failed %d\n", i, r); 2352 goto init_failed; 2353 } 2354 r = amdgpu_device_wb_init(adev); 2355 if (r) { 2356 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2357 goto init_failed; 2358 } 2359 adev->ip_blocks[i].status.hw = true; 2360 2361 /* right after GMC hw init, we create CSA */ 2362 if (adev->gfx.mcbp) { 2363 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2364 AMDGPU_GEM_DOMAIN_VRAM | 2365 AMDGPU_GEM_DOMAIN_GTT, 2366 AMDGPU_CSA_SIZE); 2367 if (r) { 2368 DRM_ERROR("allocate CSA failed %d\n", r); 2369 goto init_failed; 2370 } 2371 } 2372 } 2373 } 2374 2375 if (amdgpu_sriov_vf(adev)) 2376 amdgpu_virt_init_data_exchange(adev); 2377 2378 r = amdgpu_ib_pool_init(adev); 2379 if (r) { 2380 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2381 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2382 goto init_failed; 2383 } 2384 2385 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2386 if (r) 2387 goto init_failed; 2388 2389 r = amdgpu_device_ip_hw_init_phase1(adev); 2390 if (r) 2391 goto init_failed; 2392 2393 r = amdgpu_device_fw_loading(adev); 2394 if (r) 2395 goto init_failed; 2396 2397 r = amdgpu_device_ip_hw_init_phase2(adev); 2398 if (r) 2399 goto init_failed; 2400 2401 /* 2402 * retired pages will be loaded from eeprom and reserved here, 2403 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2404 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2405 * for I2C communication which only true at this point. 2406 * 2407 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2408 * failure from bad gpu situation and stop amdgpu init process 2409 * accordingly. For other failed cases, it will still release all 2410 * the resource and print error message, rather than returning one 2411 * negative value to upper level. 
2412 * 2413 * Note: theoretically, this should be called before all vram allocations 2414 * to protect retired page from abusing 2415 */ 2416 r = amdgpu_ras_recovery_init(adev); 2417 if (r) 2418 goto init_failed; 2419 2420 /** 2421 * In case of XGMI grab extra reference for reset domain for this device 2422 */ 2423 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2424 if (amdgpu_xgmi_add_device(adev) == 0) { 2425 if (!amdgpu_sriov_vf(adev)) { 2426 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2427 2428 if (WARN_ON(!hive)) { 2429 r = -ENOENT; 2430 goto init_failed; 2431 } 2432 2433 if (!hive->reset_domain || 2434 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2435 r = -ENOENT; 2436 amdgpu_put_xgmi_hive(hive); 2437 goto init_failed; 2438 } 2439 2440 /* Drop the early temporary reset domain we created for device */ 2441 amdgpu_reset_put_reset_domain(adev->reset_domain); 2442 adev->reset_domain = hive->reset_domain; 2443 amdgpu_put_xgmi_hive(hive); 2444 } 2445 } 2446 } 2447 2448 r = amdgpu_device_init_schedulers(adev); 2449 if (r) 2450 goto init_failed; 2451 2452 /* Don't init kfd if whole hive need to be reset during init */ 2453 if (!adev->gmc.xgmi.pending_reset) { 2454 kgd2kfd_init_zone_device(adev); 2455 amdgpu_amdkfd_device_init(adev); 2456 } 2457 2458 amdgpu_fru_get_product_info(adev); 2459 2460 init_failed: 2461 2462 return r; 2463 } 2464 2465 /** 2466 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2467 * 2468 * @adev: amdgpu_device pointer 2469 * 2470 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2471 * this function before a GPU reset. If the value is retained after a 2472 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 2473 */ 2474 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2475 { 2476 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2477 } 2478 2479 /** 2480 * amdgpu_device_check_vram_lost - check if vram is valid 2481 * 2482 * @adev: amdgpu_device pointer 2483 * 2484 * Checks the reset magic value written to the gart pointer in VRAM. 2485 * The driver calls this after a GPU reset to see if the contents of 2486 * VRAM is lost or now. 2487 * returns true if vram is lost, false if not. 2488 */ 2489 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2490 { 2491 if (memcmp(adev->gart.ptr, adev->reset_magic, 2492 AMDGPU_RESET_MAGIC_NUM)) 2493 return true; 2494 2495 if (!amdgpu_in_reset(adev)) 2496 return false; 2497 2498 /* 2499 * For all ASICs with baco/mode1 reset, the VRAM is 2500 * always assumed to be lost. 2501 */ 2502 switch (amdgpu_asic_reset_method(adev)) { 2503 case AMD_RESET_METHOD_BACO: 2504 case AMD_RESET_METHOD_MODE1: 2505 return true; 2506 default: 2507 return false; 2508 } 2509 } 2510 2511 /** 2512 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2513 * 2514 * @adev: amdgpu_device pointer 2515 * @state: clockgating state (gate or ungate) 2516 * 2517 * The list of all the hardware IPs that make up the asic is walked and the 2518 * set_clockgating_state callbacks are run. 2519 * Late initialization pass enabling clockgating for hardware IPs. 2520 * Fini or suspend, pass disabling clockgating for hardware IPs. 2521 * Returns 0 on success, negative error code on failure. 
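 *
 * Note that the walk direction depends on @state: gating runs through the IP
 * list front to back, ungating runs through it in reverse. Typical callers in
 * this file look like:
 *
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);     (late init)
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);   (suspend/fini)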
2522 */ 2523 2524 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2525 enum amd_clockgating_state state) 2526 { 2527 int i, j, r; 2528 2529 if (amdgpu_emu_mode == 1) 2530 return 0; 2531 2532 for (j = 0; j < adev->num_ip_blocks; j++) { 2533 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2534 if (!adev->ip_blocks[i].status.late_initialized) 2535 continue; 2536 /* skip CG for GFX, SDMA on S0ix */ 2537 if (adev->in_s0ix && 2538 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2539 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2540 continue; 2541 /* skip CG for VCE/UVD, it's handled specially */ 2542 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2543 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2544 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2545 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2546 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2547 /* enable clockgating to save power */ 2548 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2549 state); 2550 if (r) { 2551 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2552 adev->ip_blocks[i].version->funcs->name, r); 2553 return r; 2554 } 2555 } 2556 } 2557 2558 return 0; 2559 } 2560 2561 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2562 enum amd_powergating_state state) 2563 { 2564 int i, j, r; 2565 2566 if (amdgpu_emu_mode == 1) 2567 return 0; 2568 2569 for (j = 0; j < adev->num_ip_blocks; j++) { 2570 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2571 if (!adev->ip_blocks[i].status.late_initialized) 2572 continue; 2573 /* skip PG for GFX, SDMA on S0ix */ 2574 if (adev->in_s0ix && 2575 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2576 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2577 continue; 2578 /* skip CG for VCE/UVD, it's handled specially */ 2579 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2580 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2581 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2582 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2583 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2584 /* enable powergating to save power */ 2585 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2586 state); 2587 if (r) { 2588 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2589 adev->ip_blocks[i].version->funcs->name, r); 2590 return r; 2591 } 2592 } 2593 } 2594 return 0; 2595 } 2596 2597 static int amdgpu_device_enable_mgpu_fan_boost(void) 2598 { 2599 struct amdgpu_gpu_instance *gpu_ins; 2600 struct amdgpu_device *adev; 2601 int i, ret = 0; 2602 2603 mutex_lock(&mgpu_info.mutex); 2604 2605 /* 2606 * MGPU fan boost feature should be enabled 2607 * only when there are two or more dGPUs in 2608 * the system 2609 */ 2610 if (mgpu_info.num_dgpu < 2) 2611 goto out; 2612 2613 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2614 gpu_ins = &(mgpu_info.gpu_ins[i]); 2615 adev = gpu_ins->adev; 2616 if (!(adev->flags & AMD_IS_APU) && 2617 !gpu_ins->mgpu_fan_enabled) { 2618 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2619 if (ret) 2620 break; 2621 2622 gpu_ins->mgpu_fan_enabled = 1; 2623 } 2624 } 2625 2626 out: 2627 mutex_unlock(&mgpu_info.mutex); 2628 2629 return ret; 2630 } 2631 2632 /** 2633 * amdgpu_device_ip_late_init - run late init for hardware IPs 2634 * 2635 * @adev: 
amdgpu_device pointer
 *
 * Late initialization pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the late_init callbacks are run.
 * late_init covers any special initialization that an IP requires
 * after all of the IPs have been initialized, or anything that needs to
 * happen late in the init process.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
{
	struct amdgpu_gpu_instance *gpu_instance;
	int i = 0, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_init) {
			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
			if (r) {
				DRM_ERROR("late_init of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
		adev->ip_blocks[i].status.late_initialized = true;
	}

	r = amdgpu_ras_late_init(adev);
	if (r) {
		DRM_ERROR("amdgpu_ras_late_init failed %d", r);
		return r;
	}

	amdgpu_ras_set_error_query_ready(adev, true);

	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

	amdgpu_device_fill_reset_magic(adev);

	r = amdgpu_device_enable_mgpu_fan_boost();
	if (r)
		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

	/* For passthrough configurations on arcturus and aldebaran, enable special SBR handling */
	if (amdgpu_passthrough(adev) &&
	    ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
	     adev->asic_type == CHIP_ALDEBARAN))
		amdgpu_dpm_handle_passthrough_sbr(adev, true);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		mutex_lock(&mgpu_info.mutex);

		/*
		 * Reset device p-state to low as this was booted with high.
		 *
		 * This should be performed only after all devices from the same
		 * hive get initialized.
		 *
		 * However, the number of devices in the hive is not known in
		 * advance; it is counted one by one as each device initializes.
		 *
		 * So, we wait for all XGMI interlinked devices to be initialized.
		 * This may bring some delays as those devices may come from
		 * different hives. But that should be OK.
2701 */ 2702 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2703 for (i = 0; i < mgpu_info.num_gpu; i++) { 2704 gpu_instance = &(mgpu_info.gpu_ins[i]); 2705 if (gpu_instance->adev->flags & AMD_IS_APU) 2706 continue; 2707 2708 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2709 AMDGPU_XGMI_PSTATE_MIN); 2710 if (r) { 2711 DRM_ERROR("pstate setting failed (%d).\n", r); 2712 break; 2713 } 2714 } 2715 } 2716 2717 mutex_unlock(&mgpu_info.mutex); 2718 } 2719 2720 return 0; 2721 } 2722 2723 /** 2724 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2725 * 2726 * @adev: amdgpu_device pointer 2727 * 2728 * For ASICs need to disable SMC first 2729 */ 2730 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2731 { 2732 int i, r; 2733 2734 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) 2735 return; 2736 2737 for (i = 0; i < adev->num_ip_blocks; i++) { 2738 if (!adev->ip_blocks[i].status.hw) 2739 continue; 2740 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2741 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2742 /* XXX handle errors */ 2743 if (r) { 2744 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2745 adev->ip_blocks[i].version->funcs->name, r); 2746 } 2747 adev->ip_blocks[i].status.hw = false; 2748 break; 2749 } 2750 } 2751 } 2752 2753 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2754 { 2755 int i, r; 2756 2757 for (i = 0; i < adev->num_ip_blocks; i++) { 2758 if (!adev->ip_blocks[i].version->funcs->early_fini) 2759 continue; 2760 2761 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2762 if (r) { 2763 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2764 adev->ip_blocks[i].version->funcs->name, r); 2765 } 2766 } 2767 2768 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2769 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2770 2771 amdgpu_amdkfd_suspend(adev, false); 2772 2773 /* Workaroud for ASICs need to disable SMC first */ 2774 amdgpu_device_smu_fini_early(adev); 2775 2776 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2777 if (!adev->ip_blocks[i].status.hw) 2778 continue; 2779 2780 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2781 /* XXX handle errors */ 2782 if (r) { 2783 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2784 adev->ip_blocks[i].version->funcs->name, r); 2785 } 2786 2787 adev->ip_blocks[i].status.hw = false; 2788 } 2789 2790 if (amdgpu_sriov_vf(adev)) { 2791 if (amdgpu_virt_release_full_gpu(adev, false)) 2792 DRM_ERROR("failed to release exclusive mode on fini\n"); 2793 } 2794 2795 return 0; 2796 } 2797 2798 /** 2799 * amdgpu_device_ip_fini - run fini for hardware IPs 2800 * 2801 * @adev: amdgpu_device pointer 2802 * 2803 * Main teardown pass for hardware IPs. The list of all the hardware 2804 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2805 * are run. hw_fini tears down the hardware associated with each IP 2806 * and sw_fini tears down any software state associated with each IP. 2807 * Returns 0 on success, negative error code on failure. 
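 *
 * Note: by the time this runs, amdgpu_device_ip_fini_early() has normally
 * already executed the hw_fini callbacks; this function walks the IP list in
 * reverse and runs sw_fini followed by late_fini.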
2808 */ 2809 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2810 { 2811 int i, r; 2812 2813 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2814 amdgpu_virt_release_ras_err_handler_data(adev); 2815 2816 if (adev->gmc.xgmi.num_physical_nodes > 1) 2817 amdgpu_xgmi_remove_device(adev); 2818 2819 amdgpu_amdkfd_device_fini_sw(adev); 2820 2821 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2822 if (!adev->ip_blocks[i].status.sw) 2823 continue; 2824 2825 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2826 amdgpu_ucode_free_bo(adev); 2827 amdgpu_free_static_csa(&adev->virt.csa_obj); 2828 amdgpu_device_wb_fini(adev); 2829 amdgpu_device_mem_scratch_fini(adev); 2830 amdgpu_ib_pool_fini(adev); 2831 } 2832 2833 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2834 /* XXX handle errors */ 2835 if (r) { 2836 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2837 adev->ip_blocks[i].version->funcs->name, r); 2838 } 2839 adev->ip_blocks[i].status.sw = false; 2840 adev->ip_blocks[i].status.valid = false; 2841 } 2842 2843 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2844 if (!adev->ip_blocks[i].status.late_initialized) 2845 continue; 2846 if (adev->ip_blocks[i].version->funcs->late_fini) 2847 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2848 adev->ip_blocks[i].status.late_initialized = false; 2849 } 2850 2851 amdgpu_ras_fini(adev); 2852 2853 return 0; 2854 } 2855 2856 /** 2857 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2858 * 2859 * @work: work_struct. 2860 */ 2861 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2862 { 2863 struct amdgpu_device *adev = 2864 container_of(work, struct amdgpu_device, delayed_init_work.work); 2865 int r; 2866 2867 r = amdgpu_ib_ring_tests(adev); 2868 if (r) 2869 DRM_ERROR("ib ring test failed (%d).\n", r); 2870 } 2871 2872 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2873 { 2874 struct amdgpu_device *adev = 2875 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2876 2877 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2878 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2879 2880 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2881 adev->gfx.gfx_off_state = true; 2882 } 2883 2884 /** 2885 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2886 * 2887 * @adev: amdgpu_device pointer 2888 * 2889 * Main suspend function for hardware IPs. The list of all the hardware 2890 * IPs that make up the asic is walked, clockgating is disabled and the 2891 * suspend callbacks are run. suspend puts the hardware and software state 2892 * in each IP into a state suitable for suspend. 2893 * Returns 0 on success, negative error code on failure. 2894 */ 2895 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2896 { 2897 int i, r; 2898 2899 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2900 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2901 2902 /* 2903 * Per PMFW team's suggestion, driver needs to handle gfxoff 2904 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 2905 * scenario. Add the missing df cstate disablement here. 
2906 */ 2907 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 2908 dev_warn(adev->dev, "Failed to disallow df cstate"); 2909 2910 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2911 if (!adev->ip_blocks[i].status.valid) 2912 continue; 2913 2914 /* displays are handled separately */ 2915 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2916 continue; 2917 2918 /* XXX handle errors */ 2919 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2920 /* XXX handle errors */ 2921 if (r) { 2922 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2923 adev->ip_blocks[i].version->funcs->name, r); 2924 return r; 2925 } 2926 2927 adev->ip_blocks[i].status.hw = false; 2928 } 2929 2930 return 0; 2931 } 2932 2933 /** 2934 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2935 * 2936 * @adev: amdgpu_device pointer 2937 * 2938 * Main suspend function for hardware IPs. The list of all the hardware 2939 * IPs that make up the asic is walked, clockgating is disabled and the 2940 * suspend callbacks are run. suspend puts the hardware and software state 2941 * in each IP into a state suitable for suspend. 2942 * Returns 0 on success, negative error code on failure. 2943 */ 2944 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2945 { 2946 int i, r; 2947 2948 if (adev->in_s0ix) 2949 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 2950 2951 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2952 if (!adev->ip_blocks[i].status.valid) 2953 continue; 2954 /* displays are handled in phase1 */ 2955 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2956 continue; 2957 /* PSP lost connection when err_event_athub occurs */ 2958 if (amdgpu_ras_intr_triggered() && 2959 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2960 adev->ip_blocks[i].status.hw = false; 2961 continue; 2962 } 2963 2964 /* skip unnecessary suspend if we do not initialize them yet */ 2965 if (adev->gmc.xgmi.pending_reset && 2966 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2967 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 2968 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2969 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 2970 adev->ip_blocks[i].status.hw = false; 2971 continue; 2972 } 2973 2974 /* skip suspend of gfx/mes and psp for S0ix 2975 * gfx is in gfxoff state, so on resume it will exit gfxoff just 2976 * like at runtime. PSP is also part of the always on hardware 2977 * so no need to suspend it. 2978 */ 2979 if (adev->in_s0ix && 2980 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 2981 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2982 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 2983 continue; 2984 2985 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 2986 if (adev->in_s0ix && 2987 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) && 2988 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2989 continue; 2990 2991 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 2992 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 2993 * from this location and RLC Autoload automatically also gets loaded 2994 * from here based on PMFW -> PSP message during re-init sequence. 2995 * Therefore, the psp suspend & resume should be skipped to avoid destroy 2996 * the TMR and reload FWs again for IMU enabled APU ASICs. 
2997 */ 2998 if (amdgpu_in_reset(adev) && 2999 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3000 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3001 continue; 3002 3003 /* XXX handle errors */ 3004 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3005 /* XXX handle errors */ 3006 if (r) { 3007 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3008 adev->ip_blocks[i].version->funcs->name, r); 3009 } 3010 adev->ip_blocks[i].status.hw = false; 3011 /* handle putting the SMC in the appropriate state */ 3012 if (!amdgpu_sriov_vf(adev)) { 3013 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3014 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3015 if (r) { 3016 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3017 adev->mp1_state, r); 3018 return r; 3019 } 3020 } 3021 } 3022 } 3023 3024 return 0; 3025 } 3026 3027 /** 3028 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3029 * 3030 * @adev: amdgpu_device pointer 3031 * 3032 * Main suspend function for hardware IPs. The list of all the hardware 3033 * IPs that make up the asic is walked, clockgating is disabled and the 3034 * suspend callbacks are run. suspend puts the hardware and software state 3035 * in each IP into a state suitable for suspend. 3036 * Returns 0 on success, negative error code on failure. 3037 */ 3038 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3039 { 3040 int r; 3041 3042 if (amdgpu_sriov_vf(adev)) { 3043 amdgpu_virt_fini_data_exchange(adev); 3044 amdgpu_virt_request_full_gpu(adev, false); 3045 } 3046 3047 r = amdgpu_device_ip_suspend_phase1(adev); 3048 if (r) 3049 return r; 3050 r = amdgpu_device_ip_suspend_phase2(adev); 3051 3052 if (amdgpu_sriov_vf(adev)) 3053 amdgpu_virt_release_full_gpu(adev, false); 3054 3055 return r; 3056 } 3057 3058 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3059 { 3060 int i, r; 3061 3062 static enum amd_ip_block_type ip_order[] = { 3063 AMD_IP_BLOCK_TYPE_COMMON, 3064 AMD_IP_BLOCK_TYPE_GMC, 3065 AMD_IP_BLOCK_TYPE_PSP, 3066 AMD_IP_BLOCK_TYPE_IH, 3067 }; 3068 3069 for (i = 0; i < adev->num_ip_blocks; i++) { 3070 int j; 3071 struct amdgpu_ip_block *block; 3072 3073 block = &adev->ip_blocks[i]; 3074 block->status.hw = false; 3075 3076 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3077 3078 if (block->version->type != ip_order[j] || 3079 !block->status.valid) 3080 continue; 3081 3082 r = block->version->funcs->hw_init(adev); 3083 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3084 if (r) 3085 return r; 3086 block->status.hw = true; 3087 } 3088 } 3089 3090 return 0; 3091 } 3092 3093 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3094 { 3095 int i, r; 3096 3097 static enum amd_ip_block_type ip_order[] = { 3098 AMD_IP_BLOCK_TYPE_SMC, 3099 AMD_IP_BLOCK_TYPE_DCE, 3100 AMD_IP_BLOCK_TYPE_GFX, 3101 AMD_IP_BLOCK_TYPE_SDMA, 3102 AMD_IP_BLOCK_TYPE_MES, 3103 AMD_IP_BLOCK_TYPE_UVD, 3104 AMD_IP_BLOCK_TYPE_VCE, 3105 AMD_IP_BLOCK_TYPE_VCN, 3106 AMD_IP_BLOCK_TYPE_JPEG 3107 }; 3108 3109 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3110 int j; 3111 struct amdgpu_ip_block *block; 3112 3113 for (j = 0; j < adev->num_ip_blocks; j++) { 3114 block = &adev->ip_blocks[j]; 3115 3116 if (block->version->type != ip_order[i] || 3117 !block->status.valid || 3118 block->status.hw) 3119 continue; 3120 3121 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3122 r = block->version->funcs->resume(adev); 3123 else 3124 r = block->version->funcs->hw_init(adev); 3125 3126 
DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3127 if (r) 3128 return r; 3129 block->status.hw = true; 3130 } 3131 } 3132 3133 return 0; 3134 } 3135 3136 /** 3137 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3138 * 3139 * @adev: amdgpu_device pointer 3140 * 3141 * First resume function for hardware IPs. The list of all the hardware 3142 * IPs that make up the asic is walked and the resume callbacks are run for 3143 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3144 * after a suspend and updates the software state as necessary. This 3145 * function is also used for restoring the GPU after a GPU reset. 3146 * Returns 0 on success, negative error code on failure. 3147 */ 3148 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3149 { 3150 int i, r; 3151 3152 for (i = 0; i < adev->num_ip_blocks; i++) { 3153 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3154 continue; 3155 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3156 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3157 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3158 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3159 3160 r = adev->ip_blocks[i].version->funcs->resume(adev); 3161 if (r) { 3162 DRM_ERROR("resume of IP block <%s> failed %d\n", 3163 adev->ip_blocks[i].version->funcs->name, r); 3164 return r; 3165 } 3166 adev->ip_blocks[i].status.hw = true; 3167 } 3168 } 3169 3170 return 0; 3171 } 3172 3173 /** 3174 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3175 * 3176 * @adev: amdgpu_device pointer 3177 * 3178 * First resume function for hardware IPs. The list of all the hardware 3179 * IPs that make up the asic is walked and the resume callbacks are run for 3180 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3181 * functional state after a suspend and updates the software state as 3182 * necessary. This function is also used for restoring the GPU after a GPU 3183 * reset. 3184 * Returns 0 on success, negative error code on failure. 3185 */ 3186 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3187 { 3188 int i, r; 3189 3190 for (i = 0; i < adev->num_ip_blocks; i++) { 3191 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3192 continue; 3193 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3194 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3195 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3196 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3197 continue; 3198 r = adev->ip_blocks[i].version->funcs->resume(adev); 3199 if (r) { 3200 DRM_ERROR("resume of IP block <%s> failed %d\n", 3201 adev->ip_blocks[i].version->funcs->name, r); 3202 return r; 3203 } 3204 adev->ip_blocks[i].status.hw = true; 3205 } 3206 3207 return 0; 3208 } 3209 3210 /** 3211 * amdgpu_device_ip_resume - run resume for hardware IPs 3212 * 3213 * @adev: amdgpu_device pointer 3214 * 3215 * Main resume function for hardware IPs. The hardware IPs 3216 * are split into two resume functions because they are 3217 * also used in recovering from a GPU reset and some additional 3218 * steps need to be take between them. In this case (S3/S4) they are 3219 * run sequentially. 3220 * Returns 0 on success, negative error code on failure. 
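 *
 * The sequence, as implemented below, is:
 *
 *   amdgpu_device_ip_resume_phase1()  - COMMON, GMC, IH (and PSP on SR-IOV)
 *   amdgpu_device_fw_loading()        - PSP bring-up and SMU firmware load
 *   amdgpu_device_ip_resume_phase2()  - all remaining IP blocks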
3221 */ 3222 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3223 { 3224 int r; 3225 3226 r = amdgpu_device_ip_resume_phase1(adev); 3227 if (r) 3228 return r; 3229 3230 r = amdgpu_device_fw_loading(adev); 3231 if (r) 3232 return r; 3233 3234 r = amdgpu_device_ip_resume_phase2(adev); 3235 3236 return r; 3237 } 3238 3239 /** 3240 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3241 * 3242 * @adev: amdgpu_device pointer 3243 * 3244 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3245 */ 3246 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3247 { 3248 if (amdgpu_sriov_vf(adev)) { 3249 if (adev->is_atom_fw) { 3250 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3251 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3252 } else { 3253 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3254 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3255 } 3256 3257 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3258 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3259 } 3260 } 3261 3262 /** 3263 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3264 * 3265 * @asic_type: AMD asic type 3266 * 3267 * Check if there is DC (new modesetting infrastructre) support for an asic. 3268 * returns true if DC has support, false if not. 3269 */ 3270 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3271 { 3272 switch (asic_type) { 3273 #ifdef CONFIG_DRM_AMDGPU_SI 3274 case CHIP_HAINAN: 3275 #endif 3276 case CHIP_TOPAZ: 3277 /* chips with no display hardware */ 3278 return false; 3279 #if defined(CONFIG_DRM_AMD_DC) 3280 case CHIP_TAHITI: 3281 case CHIP_PITCAIRN: 3282 case CHIP_VERDE: 3283 case CHIP_OLAND: 3284 /* 3285 * We have systems in the wild with these ASICs that require 3286 * LVDS and VGA support which is not supported with DC. 3287 * 3288 * Fallback to the non-DC driver here by default so as not to 3289 * cause regressions. 3290 */ 3291 #if defined(CONFIG_DRM_AMD_DC_SI) 3292 return amdgpu_dc > 0; 3293 #else 3294 return false; 3295 #endif 3296 case CHIP_BONAIRE: 3297 case CHIP_KAVERI: 3298 case CHIP_KABINI: 3299 case CHIP_MULLINS: 3300 /* 3301 * We have systems in the wild with these ASICs that require 3302 * VGA support which is not supported with DC. 3303 * 3304 * Fallback to the non-DC driver here by default so as not to 3305 * cause regressions. 
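 * (Passing amdgpu.dc=1 on the kernel command line still opts these chips in
 * to DC, since the check below only returns true for an explicit request.)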
3306 */ 3307 return amdgpu_dc > 0; 3308 default: 3309 return amdgpu_dc != 0; 3310 #else 3311 default: 3312 if (amdgpu_dc > 0) 3313 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3314 return false; 3315 #endif 3316 } 3317 } 3318 3319 /** 3320 * amdgpu_device_has_dc_support - check if dc is supported 3321 * 3322 * @adev: amdgpu_device pointer 3323 * 3324 * Returns true for supported, false for not supported 3325 */ 3326 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3327 { 3328 if (adev->enable_virtual_display || 3329 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3330 return false; 3331 3332 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3333 } 3334 3335 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3336 { 3337 struct amdgpu_device *adev = 3338 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3339 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3340 3341 /* It's a bug to not have a hive within this function */ 3342 if (WARN_ON(!hive)) 3343 return; 3344 3345 /* 3346 * Use task barrier to synchronize all xgmi reset works across the 3347 * hive. task_barrier_enter and task_barrier_exit will block 3348 * until all the threads running the xgmi reset works reach 3349 * those points. task_barrier_full will do both blocks. 3350 */ 3351 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3352 3353 task_barrier_enter(&hive->tb); 3354 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3355 3356 if (adev->asic_reset_res) 3357 goto fail; 3358 3359 task_barrier_exit(&hive->tb); 3360 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3361 3362 if (adev->asic_reset_res) 3363 goto fail; 3364 3365 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3366 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3367 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3368 } else { 3369 3370 task_barrier_full(&hive->tb); 3371 adev->asic_reset_res = amdgpu_asic_reset(adev); 3372 } 3373 3374 fail: 3375 if (adev->asic_reset_res) 3376 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3377 adev->asic_reset_res, adev_to_drm(adev)->unique); 3378 amdgpu_put_xgmi_hive(hive); 3379 } 3380 3381 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3382 { 3383 char *input = amdgpu_lockup_timeout; 3384 char *timeout_setting = NULL; 3385 int index = 0; 3386 long timeout; 3387 int ret = 0; 3388 3389 /* 3390 * By default timeout for non compute jobs is 10000 3391 * and 60000 for compute jobs. 3392 * In SR-IOV or passthrough mode, timeout for compute 3393 * jobs are 60000 by default. 3394 */ 3395 adev->gfx_timeout = msecs_to_jiffies(10000); 3396 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3397 if (amdgpu_sriov_vf(adev)) 3398 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3399 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3400 else 3401 adev->compute_timeout = msecs_to_jiffies(60000); 3402 3403 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3404 while ((timeout_setting = strsep(&input, ",")) && 3405 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3406 ret = kstrtol(timeout_setting, 0, &timeout); 3407 if (ret) 3408 return ret; 3409 3410 if (timeout == 0) { 3411 index++; 3412 continue; 3413 } else if (timeout < 0) { 3414 timeout = MAX_SCHEDULE_TIMEOUT; 3415 dev_warn(adev->dev, "lockup timeout disabled"); 3416 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3417 } else { 3418 timeout = msecs_to_jiffies(timeout); 3419 } 3420 3421 switch (index++) { 3422 case 0: 3423 adev->gfx_timeout = timeout; 3424 break; 3425 case 1: 3426 adev->compute_timeout = timeout; 3427 break; 3428 case 2: 3429 adev->sdma_timeout = timeout; 3430 break; 3431 case 3: 3432 adev->video_timeout = timeout; 3433 break; 3434 default: 3435 break; 3436 } 3437 } 3438 /* 3439 * There is only one value specified and 3440 * it should apply to all non-compute jobs. 3441 */ 3442 if (index == 1) { 3443 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3444 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3445 adev->compute_timeout = adev->gfx_timeout; 3446 } 3447 } 3448 3449 return ret; 3450 } 3451 3452 /** 3453 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3454 * 3455 * @adev: amdgpu_device pointer 3456 * 3457 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3458 */ 3459 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3460 { 3461 struct iommu_domain *domain; 3462 3463 domain = iommu_get_domain_for_dev(adev->dev); 3464 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3465 adev->ram_is_direct_mapped = true; 3466 } 3467 3468 static const struct attribute *amdgpu_dev_attributes[] = { 3469 &dev_attr_pcie_replay_count.attr, 3470 NULL 3471 }; 3472 3473 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 3474 { 3475 if (amdgpu_mcbp == 1) 3476 adev->gfx.mcbp = true; 3477 else if (amdgpu_mcbp == 0) 3478 adev->gfx.mcbp = false; 3479 else if ((adev->ip_versions[GC_HWIP][0] >= IP_VERSION(9, 0, 0)) && 3480 (adev->ip_versions[GC_HWIP][0] < IP_VERSION(10, 0, 0)) && 3481 adev->gfx.num_gfx_rings) 3482 adev->gfx.mcbp = true; 3483 3484 if (amdgpu_sriov_vf(adev)) 3485 adev->gfx.mcbp = true; 3486 3487 if (adev->gfx.mcbp) 3488 DRM_INFO("MCBP is enabled\n"); 3489 } 3490 3491 /** 3492 * amdgpu_device_init - initialize the driver 3493 * 3494 * @adev: amdgpu_device pointer 3495 * @flags: driver flags 3496 * 3497 * Initializes the driver info and hw (all asics). 3498 * Returns 0 for success or an error on failure. 3499 * Called at driver startup. 
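 *
 * This is normally reached from the PCI probe path; @flags carries the asic
 * type together with flags such as AMD_IS_APU taken from the matching PCI ID
 * table entry.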
3500 */ 3501 int amdgpu_device_init(struct amdgpu_device *adev, 3502 uint32_t flags) 3503 { 3504 struct drm_device *ddev = adev_to_drm(adev); 3505 struct pci_dev *pdev = adev->pdev; 3506 int r, i; 3507 bool px = false; 3508 u32 max_MBps; 3509 int tmp; 3510 3511 adev->shutdown = false; 3512 adev->flags = flags; 3513 3514 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3515 adev->asic_type = amdgpu_force_asic_type; 3516 else 3517 adev->asic_type = flags & AMD_ASIC_MASK; 3518 3519 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3520 if (amdgpu_emu_mode == 1) 3521 adev->usec_timeout *= 10; 3522 adev->gmc.gart_size = 512 * 1024 * 1024; 3523 adev->accel_working = false; 3524 adev->num_rings = 0; 3525 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3526 adev->mman.buffer_funcs = NULL; 3527 adev->mman.buffer_funcs_ring = NULL; 3528 adev->vm_manager.vm_pte_funcs = NULL; 3529 adev->vm_manager.vm_pte_num_scheds = 0; 3530 adev->gmc.gmc_funcs = NULL; 3531 adev->harvest_ip_mask = 0x0; 3532 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3533 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3534 3535 adev->smc_rreg = &amdgpu_invalid_rreg; 3536 adev->smc_wreg = &amdgpu_invalid_wreg; 3537 adev->pcie_rreg = &amdgpu_invalid_rreg; 3538 adev->pcie_wreg = &amdgpu_invalid_wreg; 3539 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 3540 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 3541 adev->pciep_rreg = &amdgpu_invalid_rreg; 3542 adev->pciep_wreg = &amdgpu_invalid_wreg; 3543 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3544 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3545 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3546 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3547 adev->didt_rreg = &amdgpu_invalid_rreg; 3548 adev->didt_wreg = &amdgpu_invalid_wreg; 3549 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3550 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3551 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3552 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3553 3554 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3555 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3556 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3557 3558 /* mutex initialization are all done here so we 3559 * can recall function without having locking issues 3560 */ 3561 mutex_init(&adev->firmware.mutex); 3562 mutex_init(&adev->pm.mutex); 3563 mutex_init(&adev->gfx.gpu_clock_mutex); 3564 mutex_init(&adev->srbm_mutex); 3565 mutex_init(&adev->gfx.pipe_reserve_mutex); 3566 mutex_init(&adev->gfx.gfx_off_mutex); 3567 mutex_init(&adev->gfx.partition_mutex); 3568 mutex_init(&adev->grbm_idx_mutex); 3569 mutex_init(&adev->mn_lock); 3570 mutex_init(&adev->virt.vf_errors.lock); 3571 hash_init(adev->mn_hash); 3572 mutex_init(&adev->psp.mutex); 3573 mutex_init(&adev->notifier_lock); 3574 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3575 mutex_init(&adev->benchmark_mutex); 3576 3577 amdgpu_device_init_apu_flags(adev); 3578 3579 r = amdgpu_device_check_arguments(adev); 3580 if (r) 3581 return r; 3582 3583 spin_lock_init(&adev->mmio_idx_lock); 3584 spin_lock_init(&adev->smc_idx_lock); 3585 spin_lock_init(&adev->pcie_idx_lock); 3586 spin_lock_init(&adev->uvd_ctx_idx_lock); 3587 spin_lock_init(&adev->didt_idx_lock); 3588 spin_lock_init(&adev->gc_cac_idx_lock); 3589 spin_lock_init(&adev->se_cac_idx_lock); 3590 spin_lock_init(&adev->audio_endpt_idx_lock); 3591 spin_lock_init(&adev->mm_stats.lock); 3592 3593 
	INIT_LIST_HEAD(&adev->shadow_list);
	mutex_init(&adev->shadow_list_lock);

	INIT_LIST_HEAD(&adev->reset_list);

	INIT_LIST_HEAD(&adev->ras_list);

	INIT_DELAYED_WORK(&adev->delayed_init_work,
			  amdgpu_device_delayed_init_work_handler);
	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
			  amdgpu_device_delay_enable_gfx_off);

	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);

	adev->gfx.gfx_off_req_count = 1;
	adev->gfx.gfx_off_residency = 0;
	adev->gfx.gfx_off_entrycount = 0;
	adev->pm.ac_power = power_supply_is_system_supplied() > 0;

	atomic_set(&adev->throttling_logging_enabled, 1);
	/*
	 * If throttling continues, logging will be performed every minute
	 * to avoid log flooding. "-1" is subtracted since the thermal
	 * throttling interrupt comes every second. Thus, the total logging
	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
	 * for the throttling interrupt) = 60 seconds.
	 */
	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);

	/* Registers mapping */
	/* TODO: block userspace mapping of io register */
	if (adev->asic_type >= CHIP_BONAIRE) {
		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
	} else {
		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
	}

	for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
		atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);

	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
	if (!adev->rmmio)
		return -ENOMEM;

	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
	DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);

	/*
	 * Reset domain needs to be present early, before the XGMI hive is
	 * discovered (if any) and initialized, so that the reset semaphore and
	 * in_gpu_reset flag can be used early during init and before calling
	 * RREG32.
3647 */ 3648 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3649 if (!adev->reset_domain) 3650 return -ENOMEM; 3651 3652 /* detect hw virtualization here */ 3653 amdgpu_detect_virtualization(adev); 3654 3655 amdgpu_device_get_pcie_info(adev); 3656 3657 r = amdgpu_device_get_job_timeout_settings(adev); 3658 if (r) { 3659 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3660 return r; 3661 } 3662 3663 /* early init functions */ 3664 r = amdgpu_device_ip_early_init(adev); 3665 if (r) 3666 return r; 3667 3668 amdgpu_device_set_mcbp(adev); 3669 3670 /* Get rid of things like offb */ 3671 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 3672 if (r) 3673 return r; 3674 3675 /* Enable TMZ based on IP_VERSION */ 3676 amdgpu_gmc_tmz_set(adev); 3677 3678 amdgpu_gmc_noretry_set(adev); 3679 /* Need to get xgmi info early to decide the reset behavior*/ 3680 if (adev->gmc.xgmi.supported) { 3681 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3682 if (r) 3683 return r; 3684 } 3685 3686 /* enable PCIE atomic ops */ 3687 if (amdgpu_sriov_vf(adev)) { 3688 if (adev->virt.fw_reserve.p_pf2vf) 3689 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3690 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3691 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3692 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a 3693 * internal path natively support atomics, set have_atomics_support to true. 3694 */ 3695 } else if ((adev->flags & AMD_IS_APU) && 3696 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) { 3697 adev->have_atomics_support = true; 3698 } else { 3699 adev->have_atomics_support = 3700 !pci_enable_atomic_ops_to_root(adev->pdev, 3701 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3702 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3703 } 3704 3705 if (!adev->have_atomics_support) 3706 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3707 3708 /* doorbell bar mapping and doorbell index init*/ 3709 amdgpu_doorbell_init(adev); 3710 3711 if (amdgpu_emu_mode == 1) { 3712 /* post the asic on emulation mode */ 3713 emu_soc_asic_init(adev); 3714 goto fence_driver_init; 3715 } 3716 3717 amdgpu_reset_init(adev); 3718 3719 /* detect if we are with an SRIOV vbios */ 3720 if (adev->bios) 3721 amdgpu_device_detect_sriov_bios(adev); 3722 3723 /* check if we need to reset the asic 3724 * E.g., driver was not cleanly unloaded previously, etc. 3725 */ 3726 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3727 if (adev->gmc.xgmi.num_physical_nodes) { 3728 dev_info(adev->dev, "Pending hive reset.\n"); 3729 adev->gmc.xgmi.pending_reset = true; 3730 /* Only need to init necessary block for SMU to handle the reset */ 3731 for (i = 0; i < adev->num_ip_blocks; i++) { 3732 if (!adev->ip_blocks[i].status.valid) 3733 continue; 3734 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3735 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3736 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3737 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3738 DRM_DEBUG("IP %s disabled for hw_init.\n", 3739 adev->ip_blocks[i].version->funcs->name); 3740 adev->ip_blocks[i].status.hw = true; 3741 } 3742 } 3743 } else { 3744 tmp = amdgpu_reset_method; 3745 /* It should do a default reset when loading or reloading the driver, 3746 * regardless of the module parameter reset_method. 
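	 * (The user-selected value is stashed in tmp above and restored right
	 * after the amdgpu_asic_reset() call below.)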
3747 */ 3748 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 3749 r = amdgpu_asic_reset(adev); 3750 amdgpu_reset_method = tmp; 3751 if (r) { 3752 dev_err(adev->dev, "asic reset on init failed\n"); 3753 goto failed; 3754 } 3755 } 3756 } 3757 3758 /* Post card if necessary */ 3759 if (amdgpu_device_need_post(adev)) { 3760 if (!adev->bios) { 3761 dev_err(adev->dev, "no vBIOS found\n"); 3762 r = -EINVAL; 3763 goto failed; 3764 } 3765 DRM_INFO("GPU posting now...\n"); 3766 r = amdgpu_device_asic_init(adev); 3767 if (r) { 3768 dev_err(adev->dev, "gpu post error!\n"); 3769 goto failed; 3770 } 3771 } 3772 3773 if (adev->bios) { 3774 if (adev->is_atom_fw) { 3775 /* Initialize clocks */ 3776 r = amdgpu_atomfirmware_get_clock_info(adev); 3777 if (r) { 3778 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3779 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3780 goto failed; 3781 } 3782 } else { 3783 /* Initialize clocks */ 3784 r = amdgpu_atombios_get_clock_info(adev); 3785 if (r) { 3786 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3787 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3788 goto failed; 3789 } 3790 /* init i2c buses */ 3791 if (!amdgpu_device_has_dc_support(adev)) 3792 amdgpu_atombios_i2c_init(adev); 3793 } 3794 } 3795 3796 fence_driver_init: 3797 /* Fence driver */ 3798 r = amdgpu_fence_driver_sw_init(adev); 3799 if (r) { 3800 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3801 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3802 goto failed; 3803 } 3804 3805 /* init the mode config */ 3806 drm_mode_config_init(adev_to_drm(adev)); 3807 3808 r = amdgpu_device_ip_init(adev); 3809 if (r) { 3810 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3811 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3812 goto release_ras_con; 3813 } 3814 3815 amdgpu_fence_driver_hw_init(adev); 3816 3817 dev_info(adev->dev, 3818 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3819 adev->gfx.config.max_shader_engines, 3820 adev->gfx.config.max_sh_per_se, 3821 adev->gfx.config.max_cu_per_sh, 3822 adev->gfx.cu_info.number); 3823 3824 adev->accel_working = true; 3825 3826 amdgpu_vm_check_compute_bug(adev); 3827 3828 /* Initialize the buffer migration limit. */ 3829 if (amdgpu_moverate >= 0) 3830 max_MBps = amdgpu_moverate; 3831 else 3832 max_MBps = 8; /* Allow 8 MB/s. */ 3833 /* Get a log2 for easy divisions. */ 3834 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3835 3836 r = amdgpu_atombios_sysfs_init(adev); 3837 if (r) 3838 drm_err(&adev->ddev, 3839 "registering atombios sysfs failed (%d).\n", r); 3840 3841 r = amdgpu_pm_sysfs_init(adev); 3842 if (r) 3843 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 3844 3845 r = amdgpu_ucode_sysfs_init(adev); 3846 if (r) { 3847 adev->ucode_sysfs_en = false; 3848 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3849 } else 3850 adev->ucode_sysfs_en = true; 3851 3852 /* 3853 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3854 * Otherwise the mgpu fan boost feature will be skipped due to the 3855 * gpu instance is counted less. 3856 */ 3857 amdgpu_register_gpu_instance(adev); 3858 3859 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3860 * explicit gating rather than handling it automatically. 
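 * That is why amdgpu_device_ip_late_init() only runs here, after
 * amdgpu_device_ip_init() above has brought the IP blocks up, rather than
 * as part of the earlier init steps.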
3861 */ 3862 if (!adev->gmc.xgmi.pending_reset) { 3863 r = amdgpu_device_ip_late_init(adev); 3864 if (r) { 3865 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3866 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3867 goto release_ras_con; 3868 } 3869 /* must succeed. */ 3870 amdgpu_ras_resume(adev); 3871 queue_delayed_work(system_wq, &adev->delayed_init_work, 3872 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3873 } 3874 3875 if (amdgpu_sriov_vf(adev)) { 3876 amdgpu_virt_release_full_gpu(adev, true); 3877 flush_delayed_work(&adev->delayed_init_work); 3878 } 3879 3880 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3881 if (r) 3882 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3883 3884 amdgpu_fru_sysfs_init(adev); 3885 3886 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3887 r = amdgpu_pmu_init(adev); 3888 if (r) 3889 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3890 3891 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3892 if (amdgpu_device_cache_pci_state(adev->pdev)) 3893 pci_restore_state(pdev); 3894 3895 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3896 /* this will fail for cards that aren't VGA class devices, just 3897 * ignore it 3898 */ 3899 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3900 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 3901 3902 px = amdgpu_device_supports_px(ddev); 3903 3904 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 3905 apple_gmux_detect(NULL, NULL))) 3906 vga_switcheroo_register_client(adev->pdev, 3907 &amdgpu_switcheroo_ops, px); 3908 3909 if (px) 3910 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3911 3912 if (adev->gmc.xgmi.pending_reset) 3913 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3914 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3915 3916 amdgpu_device_check_iommu_direct_map(adev); 3917 3918 return 0; 3919 3920 release_ras_con: 3921 if (amdgpu_sriov_vf(adev)) 3922 amdgpu_virt_release_full_gpu(adev, true); 3923 3924 /* failed in exclusive mode due to timeout */ 3925 if (amdgpu_sriov_vf(adev) && 3926 !amdgpu_sriov_runtime(adev) && 3927 amdgpu_virt_mmio_blocked(adev) && 3928 !amdgpu_virt_wait_reset(adev)) { 3929 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3930 /* Don't send request since VF is inactive. */ 3931 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3932 adev->virt.ops = NULL; 3933 r = -EAGAIN; 3934 } 3935 amdgpu_release_ras_context(adev); 3936 3937 failed: 3938 amdgpu_vf_error_trans_all(adev); 3939 3940 return r; 3941 } 3942 3943 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 3944 { 3945 3946 /* Clear all CPU mappings pointing to this device */ 3947 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 3948 3949 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 3950 amdgpu_doorbell_fini(adev); 3951 3952 iounmap(adev->rmmio); 3953 adev->rmmio = NULL; 3954 if (adev->mman.aper_base_kaddr) 3955 iounmap(adev->mman.aper_base_kaddr); 3956 adev->mman.aper_base_kaddr = NULL; 3957 3958 /* Memory manager related */ 3959 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 3960 arch_phys_wc_del(adev->gmc.vram_mtrr); 3961 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 3962 } 3963 } 3964 3965 /** 3966 * amdgpu_device_fini_hw - tear down the driver 3967 * 3968 * @adev: amdgpu_device pointer 3969 * 3970 * Tear down the driver info (all asics). 3971 * Called at driver shutdown. 
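 * Note that teardown is split in two stages: this function quiesces the
 * hardware (interrupts, display, fences, pending work), while
 * amdgpu_device_fini_sw() below releases the remaining software state
 * (IP blocks, BIOS copy, MMIO and doorbell mappings).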
3972 */ 3973 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 3974 { 3975 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3976 flush_delayed_work(&adev->delayed_init_work); 3977 adev->shutdown = true; 3978 3979 /* make sure IB test finished before entering exclusive mode 3980 * to avoid preemption on IB test 3981 */ 3982 if (amdgpu_sriov_vf(adev)) { 3983 amdgpu_virt_request_full_gpu(adev, false); 3984 amdgpu_virt_fini_data_exchange(adev); 3985 } 3986 3987 /* disable all interrupts */ 3988 amdgpu_irq_disable_all(adev); 3989 if (adev->mode_info.mode_config_initialized) { 3990 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 3991 drm_helper_force_disable_all(adev_to_drm(adev)); 3992 else 3993 drm_atomic_helper_shutdown(adev_to_drm(adev)); 3994 } 3995 amdgpu_fence_driver_hw_fini(adev); 3996 3997 if (adev->mman.initialized) 3998 drain_workqueue(adev->mman.bdev.wq); 3999 4000 if (adev->pm.sysfs_initialized) 4001 amdgpu_pm_sysfs_fini(adev); 4002 if (adev->ucode_sysfs_en) 4003 amdgpu_ucode_sysfs_fini(adev); 4004 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4005 amdgpu_fru_sysfs_fini(adev); 4006 4007 /* disable ras feature must before hw fini */ 4008 amdgpu_ras_pre_fini(adev); 4009 4010 amdgpu_device_ip_fini_early(adev); 4011 4012 amdgpu_irq_fini_hw(adev); 4013 4014 if (adev->mman.initialized) 4015 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4016 4017 amdgpu_gart_dummy_page_fini(adev); 4018 4019 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4020 amdgpu_device_unmap_mmio(adev); 4021 4022 } 4023 4024 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4025 { 4026 int idx; 4027 bool px; 4028 4029 amdgpu_fence_driver_sw_fini(adev); 4030 amdgpu_device_ip_fini(adev); 4031 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4032 adev->accel_working = false; 4033 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4034 4035 amdgpu_reset_fini(adev); 4036 4037 /* free i2c buses */ 4038 if (!amdgpu_device_has_dc_support(adev)) 4039 amdgpu_i2c_fini(adev); 4040 4041 if (amdgpu_emu_mode != 1) 4042 amdgpu_atombios_fini(adev); 4043 4044 kfree(adev->bios); 4045 adev->bios = NULL; 4046 4047 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4048 4049 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 4050 apple_gmux_detect(NULL, NULL))) 4051 vga_switcheroo_unregister_client(adev->pdev); 4052 4053 if (px) 4054 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4055 4056 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4057 vga_client_unregister(adev->pdev); 4058 4059 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4060 4061 iounmap(adev->rmmio); 4062 adev->rmmio = NULL; 4063 amdgpu_doorbell_fini(adev); 4064 drm_dev_exit(idx); 4065 } 4066 4067 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4068 amdgpu_pmu_fini(adev); 4069 if (adev->mman.discovery_bin) 4070 amdgpu_discovery_fini(adev); 4071 4072 amdgpu_reset_put_reset_domain(adev->reset_domain); 4073 adev->reset_domain = NULL; 4074 4075 kfree(adev->pci_state); 4076 4077 } 4078 4079 /** 4080 * amdgpu_device_evict_resources - evict device resources 4081 * @adev: amdgpu device object 4082 * 4083 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4084 * of the vram memory type. Mainly used for evicting device resources 4085 * at suspend time. 
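 * amdgpu_device_suspend() below calls this twice: once before requesting
 * full GPU access, to move the bulk of the BOs out of VRAM early, and again
 * after phase 1 IP suspend to catch anything that was still in flight.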
4086 * 4087 */ 4088 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4089 { 4090 int ret; 4091 4092 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4093 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4094 return 0; 4095 4096 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4097 if (ret) 4098 DRM_WARN("evicting device resources failed\n"); 4099 return ret; 4100 } 4101 4102 /* 4103 * Suspend & resume. 4104 */ 4105 /** 4106 * amdgpu_device_suspend - initiate device suspend 4107 * 4108 * @dev: drm dev pointer 4109 * @fbcon : notify the fbdev of suspend 4110 * 4111 * Puts the hw in the suspend state (all asics). 4112 * Returns 0 for success or an error on failure. 4113 * Called at driver suspend. 4114 */ 4115 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4116 { 4117 struct amdgpu_device *adev = drm_to_adev(dev); 4118 int r = 0; 4119 4120 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4121 return 0; 4122 4123 adev->in_suspend = true; 4124 4125 /* Evict the majority of BOs before grabbing the full access */ 4126 r = amdgpu_device_evict_resources(adev); 4127 if (r) 4128 return r; 4129 4130 if (amdgpu_sriov_vf(adev)) { 4131 amdgpu_virt_fini_data_exchange(adev); 4132 r = amdgpu_virt_request_full_gpu(adev, false); 4133 if (r) 4134 return r; 4135 } 4136 4137 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4138 DRM_WARN("smart shift update failed\n"); 4139 4140 if (fbcon) 4141 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4142 4143 cancel_delayed_work_sync(&adev->delayed_init_work); 4144 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4145 4146 amdgpu_ras_suspend(adev); 4147 4148 amdgpu_device_ip_suspend_phase1(adev); 4149 4150 if (!adev->in_s0ix) 4151 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4152 4153 r = amdgpu_device_evict_resources(adev); 4154 if (r) 4155 return r; 4156 4157 amdgpu_fence_driver_hw_fini(adev); 4158 4159 amdgpu_device_ip_suspend_phase2(adev); 4160 4161 if (amdgpu_sriov_vf(adev)) 4162 amdgpu_virt_release_full_gpu(adev, false); 4163 4164 return 0; 4165 } 4166 4167 /** 4168 * amdgpu_device_resume - initiate device resume 4169 * 4170 * @dev: drm dev pointer 4171 * @fbcon : notify the fbdev of resume 4172 * 4173 * Bring the hw back to operating state (all asics). 4174 * Returns 0 for success or an error on failure. 4175 * Called at driver resume. 
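 * As a rough illustration (not taken verbatim from this file), a system
 * sleep callback in the driver's dev_pm_ops would end up doing something
 * like:
 *
 *   struct drm_device *drm_dev = dev_get_drvdata(dev);
 *   return amdgpu_device_resume(drm_dev, true);
 *
 * with the matching suspend path calling amdgpu_device_suspend(drm_dev, true).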
4176 */ 4177 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4178 { 4179 struct amdgpu_device *adev = drm_to_adev(dev); 4180 int r = 0; 4181 4182 if (amdgpu_sriov_vf(adev)) { 4183 r = amdgpu_virt_request_full_gpu(adev, true); 4184 if (r) 4185 return r; 4186 } 4187 4188 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4189 return 0; 4190 4191 if (adev->in_s0ix) 4192 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4193 4194 /* post card */ 4195 if (amdgpu_device_need_post(adev)) { 4196 r = amdgpu_device_asic_init(adev); 4197 if (r) 4198 dev_err(adev->dev, "amdgpu asic init failed\n"); 4199 } 4200 4201 r = amdgpu_device_ip_resume(adev); 4202 4203 if (r) { 4204 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4205 goto exit; 4206 } 4207 amdgpu_fence_driver_hw_init(adev); 4208 4209 r = amdgpu_device_ip_late_init(adev); 4210 if (r) 4211 goto exit; 4212 4213 queue_delayed_work(system_wq, &adev->delayed_init_work, 4214 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4215 4216 if (!adev->in_s0ix) { 4217 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4218 if (r) 4219 goto exit; 4220 } 4221 4222 exit: 4223 if (amdgpu_sriov_vf(adev)) { 4224 amdgpu_virt_init_data_exchange(adev); 4225 amdgpu_virt_release_full_gpu(adev, true); 4226 } 4227 4228 if (r) 4229 return r; 4230 4231 /* Make sure IB tests flushed */ 4232 flush_delayed_work(&adev->delayed_init_work); 4233 4234 if (fbcon) 4235 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4236 4237 amdgpu_ras_resume(adev); 4238 4239 if (adev->mode_info.num_crtc) { 4240 /* 4241 * Most of the connector probing functions try to acquire runtime pm 4242 * refs to ensure that the GPU is powered on when connector polling is 4243 * performed. Since we're calling this from a runtime PM callback, 4244 * trying to acquire rpm refs will cause us to deadlock. 4245 * 4246 * Since we're guaranteed to be holding the rpm lock, it's safe to 4247 * temporarily disable the rpm helpers so this doesn't deadlock us. 4248 */ 4249 #ifdef CONFIG_PM 4250 dev->dev->power.disable_depth++; 4251 #endif 4252 if (!adev->dc_enabled) 4253 drm_helper_hpd_irq_event(dev); 4254 else 4255 drm_kms_helper_hotplug_event(dev); 4256 #ifdef CONFIG_PM 4257 dev->dev->power.disable_depth--; 4258 #endif 4259 } 4260 adev->in_suspend = false; 4261 4262 if (adev->enable_mes) 4263 amdgpu_mes_self_test(adev); 4264 4265 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4266 DRM_WARN("smart shift update failed\n"); 4267 4268 return 0; 4269 } 4270 4271 /** 4272 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4273 * 4274 * @adev: amdgpu_device pointer 4275 * 4276 * The list of all the hardware IPs that make up the asic is walked and 4277 * the check_soft_reset callbacks are run. check_soft_reset determines 4278 * if the asic is still hung or not. 4279 * Returns true if any of the IPs are still in a hung state, false if not. 
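 * Under SR-IOV, or when the ASIC reports that it needs a full reset, this
 * returns true immediately without polling the individual IP blocks.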
4280 */ 4281 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4282 { 4283 int i; 4284 bool asic_hang = false; 4285 4286 if (amdgpu_sriov_vf(adev)) 4287 return true; 4288 4289 if (amdgpu_asic_need_full_reset(adev)) 4290 return true; 4291 4292 for (i = 0; i < adev->num_ip_blocks; i++) { 4293 if (!adev->ip_blocks[i].status.valid) 4294 continue; 4295 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4296 adev->ip_blocks[i].status.hang = 4297 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4298 if (adev->ip_blocks[i].status.hang) { 4299 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4300 asic_hang = true; 4301 } 4302 } 4303 return asic_hang; 4304 } 4305 4306 /** 4307 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4308 * 4309 * @adev: amdgpu_device pointer 4310 * 4311 * The list of all the hardware IPs that make up the asic is walked and the 4312 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4313 * handles any IP specific hardware or software state changes that are 4314 * necessary for a soft reset to succeed. 4315 * Returns 0 on success, negative error code on failure. 4316 */ 4317 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4318 { 4319 int i, r = 0; 4320 4321 for (i = 0; i < adev->num_ip_blocks; i++) { 4322 if (!adev->ip_blocks[i].status.valid) 4323 continue; 4324 if (adev->ip_blocks[i].status.hang && 4325 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4326 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4327 if (r) 4328 return r; 4329 } 4330 } 4331 4332 return 0; 4333 } 4334 4335 /** 4336 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4337 * 4338 * @adev: amdgpu_device pointer 4339 * 4340 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4341 * reset is necessary to recover. 4342 * Returns true if a full asic reset is required, false if not. 4343 */ 4344 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4345 { 4346 int i; 4347 4348 if (amdgpu_asic_need_full_reset(adev)) 4349 return true; 4350 4351 for (i = 0; i < adev->num_ip_blocks; i++) { 4352 if (!adev->ip_blocks[i].status.valid) 4353 continue; 4354 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4355 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4356 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4357 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4358 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4359 if (adev->ip_blocks[i].status.hang) { 4360 dev_info(adev->dev, "Some block need full reset!\n"); 4361 return true; 4362 } 4363 } 4364 } 4365 return false; 4366 } 4367 4368 /** 4369 * amdgpu_device_ip_soft_reset - do a soft reset 4370 * 4371 * @adev: amdgpu_device pointer 4372 * 4373 * The list of all the hardware IPs that make up the asic is walked and the 4374 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4375 * IP specific hardware or software state changes that are necessary to soft 4376 * reset the IP. 4377 * Returns 0 on success, negative error code on failure. 
4378 */ 4379 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4380 { 4381 int i, r = 0; 4382 4383 for (i = 0; i < adev->num_ip_blocks; i++) { 4384 if (!adev->ip_blocks[i].status.valid) 4385 continue; 4386 if (adev->ip_blocks[i].status.hang && 4387 adev->ip_blocks[i].version->funcs->soft_reset) { 4388 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4389 if (r) 4390 return r; 4391 } 4392 } 4393 4394 return 0; 4395 } 4396 4397 /** 4398 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4399 * 4400 * @adev: amdgpu_device pointer 4401 * 4402 * The list of all the hardware IPs that make up the asic is walked and the 4403 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4404 * handles any IP specific hardware or software state changes that are 4405 * necessary after the IP has been soft reset. 4406 * Returns 0 on success, negative error code on failure. 4407 */ 4408 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4409 { 4410 int i, r = 0; 4411 4412 for (i = 0; i < adev->num_ip_blocks; i++) { 4413 if (!adev->ip_blocks[i].status.valid) 4414 continue; 4415 if (adev->ip_blocks[i].status.hang && 4416 adev->ip_blocks[i].version->funcs->post_soft_reset) 4417 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4418 if (r) 4419 return r; 4420 } 4421 4422 return 0; 4423 } 4424 4425 /** 4426 * amdgpu_device_recover_vram - Recover some VRAM contents 4427 * 4428 * @adev: amdgpu_device pointer 4429 * 4430 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4431 * restore things like GPUVM page tables after a GPU reset where 4432 * the contents of VRAM might be lost. 4433 * 4434 * Returns: 4435 * 0 on success, negative error code on failure. 4436 */ 4437 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4438 { 4439 struct dma_fence *fence = NULL, *next = NULL; 4440 struct amdgpu_bo *shadow; 4441 struct amdgpu_bo_vm *vmbo; 4442 long r = 1, tmo; 4443 4444 if (amdgpu_sriov_runtime(adev)) 4445 tmo = msecs_to_jiffies(8000); 4446 else 4447 tmo = msecs_to_jiffies(100); 4448 4449 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4450 mutex_lock(&adev->shadow_list_lock); 4451 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4452 /* If vm is compute context or adev is APU, shadow will be NULL */ 4453 if (!vmbo->shadow) 4454 continue; 4455 shadow = vmbo->shadow; 4456 4457 /* No need to recover an evicted BO */ 4458 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4459 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4460 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4461 continue; 4462 4463 r = amdgpu_bo_restore_shadow(shadow, &next); 4464 if (r) 4465 break; 4466 4467 if (fence) { 4468 tmo = dma_fence_wait_timeout(fence, false, tmo); 4469 dma_fence_put(fence); 4470 fence = next; 4471 if (tmo == 0) { 4472 r = -ETIMEDOUT; 4473 break; 4474 } else if (tmo < 0) { 4475 r = tmo; 4476 break; 4477 } 4478 } else { 4479 fence = next; 4480 } 4481 } 4482 mutex_unlock(&adev->shadow_list_lock); 4483 4484 if (fence) 4485 tmo = dma_fence_wait_timeout(fence, false, tmo); 4486 dma_fence_put(fence); 4487 4488 if (r < 0 || tmo <= 0) { 4489 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4490 return -EIO; 4491 } 4492 4493 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4494 return 0; 4495 } 4496 4497 4498 /** 4499 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4500 * 4501 * @adev: amdgpu_device pointer 4502 * 
@from_hypervisor: request from hypervisor 4503 * 4504 * do VF FLR and reinitialize Asic 4505 * return 0 means succeeded otherwise failed 4506 */ 4507 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4508 bool from_hypervisor) 4509 { 4510 int r; 4511 struct amdgpu_hive_info *hive = NULL; 4512 int retry_limit = 0; 4513 4514 retry: 4515 amdgpu_amdkfd_pre_reset(adev); 4516 4517 if (from_hypervisor) 4518 r = amdgpu_virt_request_full_gpu(adev, true); 4519 else 4520 r = amdgpu_virt_reset_gpu(adev); 4521 if (r) 4522 return r; 4523 amdgpu_irq_gpu_reset_resume_helper(adev); 4524 4525 /* some sw clean up VF needs to do before recover */ 4526 amdgpu_virt_post_reset(adev); 4527 4528 /* Resume IP prior to SMC */ 4529 r = amdgpu_device_ip_reinit_early_sriov(adev); 4530 if (r) 4531 goto error; 4532 4533 amdgpu_virt_init_data_exchange(adev); 4534 4535 r = amdgpu_device_fw_loading(adev); 4536 if (r) 4537 return r; 4538 4539 /* now we are okay to resume SMC/CP/SDMA */ 4540 r = amdgpu_device_ip_reinit_late_sriov(adev); 4541 if (r) 4542 goto error; 4543 4544 hive = amdgpu_get_xgmi_hive(adev); 4545 /* Update PSP FW topology after reset */ 4546 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4547 r = amdgpu_xgmi_update_topology(hive, adev); 4548 4549 if (hive) 4550 amdgpu_put_xgmi_hive(hive); 4551 4552 if (!r) { 4553 r = amdgpu_ib_ring_tests(adev); 4554 4555 amdgpu_amdkfd_post_reset(adev); 4556 } 4557 4558 error: 4559 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4560 amdgpu_inc_vram_lost(adev); 4561 r = amdgpu_device_recover_vram(adev); 4562 } 4563 amdgpu_virt_release_full_gpu(adev, true); 4564 4565 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4566 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4567 retry_limit++; 4568 goto retry; 4569 } else 4570 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4571 } 4572 4573 return r; 4574 } 4575 4576 /** 4577 * amdgpu_device_has_job_running - check if there is any job in mirror list 4578 * 4579 * @adev: amdgpu_device pointer 4580 * 4581 * check if there is any job in mirror list 4582 */ 4583 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4584 { 4585 int i; 4586 struct drm_sched_job *job; 4587 4588 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4589 struct amdgpu_ring *ring = adev->rings[i]; 4590 4591 if (!ring || !ring->sched.thread) 4592 continue; 4593 4594 spin_lock(&ring->sched.job_list_lock); 4595 job = list_first_entry_or_null(&ring->sched.pending_list, 4596 struct drm_sched_job, list); 4597 spin_unlock(&ring->sched.job_list_lock); 4598 if (job) 4599 return true; 4600 } 4601 return false; 4602 } 4603 4604 /** 4605 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4606 * 4607 * @adev: amdgpu_device pointer 4608 * 4609 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4610 * a hung GPU. 
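 * In terms of the amdgpu_gpu_recovery module parameter: 0 disables recovery
 * outright, -1 (auto) applies a per-ASIC default in which the older SI/CIK
 * parts and the APUs listed below opt out, and any other value enables it.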
4611 */ 4612 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4613 { 4614 4615 if (amdgpu_gpu_recovery == 0) 4616 goto disabled; 4617 4618 /* Skip soft reset check in fatal error mode */ 4619 if (!amdgpu_ras_is_poison_mode_supported(adev)) 4620 return true; 4621 4622 if (amdgpu_sriov_vf(adev)) 4623 return true; 4624 4625 if (amdgpu_gpu_recovery == -1) { 4626 switch (adev->asic_type) { 4627 #ifdef CONFIG_DRM_AMDGPU_SI 4628 case CHIP_VERDE: 4629 case CHIP_TAHITI: 4630 case CHIP_PITCAIRN: 4631 case CHIP_OLAND: 4632 case CHIP_HAINAN: 4633 #endif 4634 #ifdef CONFIG_DRM_AMDGPU_CIK 4635 case CHIP_KAVERI: 4636 case CHIP_KABINI: 4637 case CHIP_MULLINS: 4638 #endif 4639 case CHIP_CARRIZO: 4640 case CHIP_STONEY: 4641 case CHIP_CYAN_SKILLFISH: 4642 goto disabled; 4643 default: 4644 break; 4645 } 4646 } 4647 4648 return true; 4649 4650 disabled: 4651 dev_info(adev->dev, "GPU recovery disabled.\n"); 4652 return false; 4653 } 4654 4655 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4656 { 4657 u32 i; 4658 int ret = 0; 4659 4660 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4661 4662 dev_info(adev->dev, "GPU mode1 reset\n"); 4663 4664 /* disable BM */ 4665 pci_clear_master(adev->pdev); 4666 4667 amdgpu_device_cache_pci_state(adev->pdev); 4668 4669 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4670 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4671 ret = amdgpu_dpm_mode1_reset(adev); 4672 } else { 4673 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4674 ret = psp_gpu_reset(adev); 4675 } 4676 4677 if (ret) 4678 goto mode1_reset_failed; 4679 4680 amdgpu_device_load_pci_state(adev->pdev); 4681 ret = amdgpu_psp_wait_for_bootloader(adev); 4682 if (ret) 4683 goto mode1_reset_failed; 4684 4685 /* wait for asic to come out of reset */ 4686 for (i = 0; i < adev->usec_timeout; i++) { 4687 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4688 4689 if (memsize != 0xffffffff) 4690 break; 4691 udelay(1); 4692 } 4693 4694 if (i >= adev->usec_timeout) { 4695 ret = -ETIMEDOUT; 4696 goto mode1_reset_failed; 4697 } 4698 4699 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4700 4701 return 0; 4702 4703 mode1_reset_failed: 4704 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4705 return ret; 4706 } 4707 4708 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4709 struct amdgpu_reset_context *reset_context) 4710 { 4711 int i, r = 0; 4712 struct amdgpu_job *job = NULL; 4713 bool need_full_reset = 4714 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4715 4716 if (reset_context->reset_req_dev == adev) 4717 job = reset_context->job; 4718 4719 if (amdgpu_sriov_vf(adev)) { 4720 /* stop the data exchange thread */ 4721 amdgpu_virt_fini_data_exchange(adev); 4722 } 4723 4724 amdgpu_fence_driver_isr_toggle(adev, true); 4725 4726 /* block all schedulers and reset given job's ring */ 4727 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4728 struct amdgpu_ring *ring = adev->rings[i]; 4729 4730 if (!ring || !ring->sched.thread) 4731 continue; 4732 4733 /* Clear job fence from fence drv to avoid force_completion 4734 * leave NULL and vm flush fence in fence drv 4735 */ 4736 amdgpu_fence_driver_clear_job_fences(ring); 4737 4738 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4739 amdgpu_fence_driver_force_completion(ring); 4740 } 4741 4742 amdgpu_fence_driver_isr_toggle(adev, false); 4743 4744 if (job && job->vm) 4745 drm_sched_increase_karma(&job->base); 4746 4747 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4748 /* If reset handler not 
implemented, continue; otherwise return */ 4749 if (r == -EOPNOTSUPP) 4750 r = 0; 4751 else 4752 return r; 4753 4754 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4755 if (!amdgpu_sriov_vf(adev)) { 4756 4757 if (!need_full_reset) 4758 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4759 4760 if (!need_full_reset && amdgpu_gpu_recovery && 4761 amdgpu_device_ip_check_soft_reset(adev)) { 4762 amdgpu_device_ip_pre_soft_reset(adev); 4763 r = amdgpu_device_ip_soft_reset(adev); 4764 amdgpu_device_ip_post_soft_reset(adev); 4765 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4766 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4767 need_full_reset = true; 4768 } 4769 } 4770 4771 if (need_full_reset) 4772 r = amdgpu_device_ip_suspend(adev); 4773 if (need_full_reset) 4774 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4775 else 4776 clear_bit(AMDGPU_NEED_FULL_RESET, 4777 &reset_context->flags); 4778 } 4779 4780 return r; 4781 } 4782 4783 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 4784 { 4785 int i; 4786 4787 lockdep_assert_held(&adev->reset_domain->sem); 4788 4789 for (i = 0; i < adev->num_regs; i++) { 4790 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); 4791 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], 4792 adev->reset_dump_reg_value[i]); 4793 } 4794 4795 return 0; 4796 } 4797 4798 #ifdef CONFIG_DEV_COREDUMP 4799 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, 4800 size_t count, void *data, size_t datalen) 4801 { 4802 struct drm_printer p; 4803 struct amdgpu_device *adev = data; 4804 struct drm_print_iterator iter; 4805 int i; 4806 4807 iter.data = buffer; 4808 iter.offset = 0; 4809 iter.start = offset; 4810 iter.remain = count; 4811 4812 p = drm_coredump_printer(&iter); 4813 4814 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 4815 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 4816 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 4817 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec); 4818 if (adev->reset_task_info.pid) 4819 drm_printf(&p, "process_name: %s PID: %d\n", 4820 adev->reset_task_info.process_name, 4821 adev->reset_task_info.pid); 4822 4823 if (adev->reset_vram_lost) 4824 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 4825 if (adev->num_regs) { 4826 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); 4827 4828 for (i = 0; i < adev->num_regs; i++) 4829 drm_printf(&p, "0x%08x: 0x%08x\n", 4830 adev->reset_dump_reg_list[i], 4831 adev->reset_dump_reg_value[i]); 4832 } 4833 4834 return count - iter.remain; 4835 } 4836 4837 static void amdgpu_devcoredump_free(void *data) 4838 { 4839 } 4840 4841 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) 4842 { 4843 struct drm_device *dev = adev_to_drm(adev); 4844 4845 ktime_get_ts64(&adev->reset_time); 4846 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_NOWAIT, 4847 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 4848 } 4849 #endif 4850 4851 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4852 struct amdgpu_reset_context *reset_context) 4853 { 4854 struct amdgpu_device *tmp_adev = NULL; 4855 bool need_full_reset, skip_hw_reset, vram_lost = false; 4856 int r = 0; 4857 bool gpu_reset_for_dev_remove = 0; 4858 4859 /* Try reset handler method first */ 4860 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4861 reset_list); 4862 amdgpu_reset_reg_dumps(tmp_adev); 4863 4864 
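	/* Hand the whole device list to a dedicated reset handler first; if no
	 * handler claims this reset, the call below returns -EOPNOTSUPP and the
	 * default reset sequence further down is used instead.
	 */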
reset_context->reset_device_list = device_list_handle; 4865 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4866 /* If reset handler not implemented, continue; otherwise return */ 4867 if (r == -EOPNOTSUPP) 4868 r = 0; 4869 else 4870 return r; 4871 4872 /* Reset handler not implemented, use the default method */ 4873 need_full_reset = 4874 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4875 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4876 4877 gpu_reset_for_dev_remove = 4878 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 4879 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4880 4881 /* 4882 * ASIC reset has to be done on all XGMI hive nodes ASAP 4883 * to allow proper links negotiation in FW (within 1 sec) 4884 */ 4885 if (!skip_hw_reset && need_full_reset) { 4886 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4887 /* For XGMI run all resets in parallel to speed up the process */ 4888 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4889 tmp_adev->gmc.xgmi.pending_reset = false; 4890 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4891 r = -EALREADY; 4892 } else 4893 r = amdgpu_asic_reset(tmp_adev); 4894 4895 if (r) { 4896 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4897 r, adev_to_drm(tmp_adev)->unique); 4898 break; 4899 } 4900 } 4901 4902 /* For XGMI wait for all resets to complete before proceed */ 4903 if (!r) { 4904 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4905 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4906 flush_work(&tmp_adev->xgmi_reset_work); 4907 r = tmp_adev->asic_reset_res; 4908 if (r) 4909 break; 4910 } 4911 } 4912 } 4913 } 4914 4915 if (!r && amdgpu_ras_intr_triggered()) { 4916 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4917 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && 4918 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 4919 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 4920 } 4921 4922 amdgpu_ras_intr_cleared(); 4923 } 4924 4925 /* Since the mode1 reset affects base ip blocks, the 4926 * phase1 ip blocks need to be resumed. Otherwise there 4927 * will be a BIOS signature error and the psp bootloader 4928 * can't load kdb on the next amdgpu install. 
4929 */ 4930 if (gpu_reset_for_dev_remove) { 4931 list_for_each_entry(tmp_adev, device_list_handle, reset_list) 4932 amdgpu_device_ip_resume_phase1(tmp_adev); 4933 4934 goto end; 4935 } 4936 4937 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4938 if (need_full_reset) { 4939 /* post card */ 4940 r = amdgpu_device_asic_init(tmp_adev); 4941 if (r) { 4942 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4943 } else { 4944 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4945 4946 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4947 if (r) 4948 goto out; 4949 4950 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4951 #ifdef CONFIG_DEV_COREDUMP 4952 tmp_adev->reset_vram_lost = vram_lost; 4953 memset(&tmp_adev->reset_task_info, 0, 4954 sizeof(tmp_adev->reset_task_info)); 4955 if (reset_context->job && reset_context->job->vm) 4956 tmp_adev->reset_task_info = 4957 reset_context->job->vm->task_info; 4958 amdgpu_reset_capture_coredumpm(tmp_adev); 4959 #endif 4960 if (vram_lost) { 4961 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4962 amdgpu_inc_vram_lost(tmp_adev); 4963 } 4964 4965 r = amdgpu_device_fw_loading(tmp_adev); 4966 if (r) 4967 return r; 4968 4969 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4970 if (r) 4971 goto out; 4972 4973 if (vram_lost) 4974 amdgpu_device_fill_reset_magic(tmp_adev); 4975 4976 /* 4977 * Add this ASIC as tracked as reset was already 4978 * complete successfully. 4979 */ 4980 amdgpu_register_gpu_instance(tmp_adev); 4981 4982 if (!reset_context->hive && 4983 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4984 amdgpu_xgmi_add_device(tmp_adev); 4985 4986 r = amdgpu_device_ip_late_init(tmp_adev); 4987 if (r) 4988 goto out; 4989 4990 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 4991 4992 /* 4993 * The GPU enters bad state once faulty pages 4994 * by ECC has reached the threshold, and ras 4995 * recovery is scheduled next. So add one check 4996 * here to break recovery if it indeed exceeds 4997 * bad page threshold, and remind user to 4998 * retire this GPU or setting one bigger 4999 * bad_page_threshold value to fix this once 5000 * probing driver again. 5001 */ 5002 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5003 /* must succeed. 
*/ 5004 amdgpu_ras_resume(tmp_adev); 5005 } else { 5006 r = -EINVAL; 5007 goto out; 5008 } 5009 5010 /* Update PSP FW topology after reset */ 5011 if (reset_context->hive && 5012 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5013 r = amdgpu_xgmi_update_topology( 5014 reset_context->hive, tmp_adev); 5015 } 5016 } 5017 5018 out: 5019 if (!r) { 5020 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5021 r = amdgpu_ib_ring_tests(tmp_adev); 5022 if (r) { 5023 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5024 need_full_reset = true; 5025 r = -EAGAIN; 5026 goto end; 5027 } 5028 } 5029 5030 if (!r) 5031 r = amdgpu_device_recover_vram(tmp_adev); 5032 else 5033 tmp_adev->asic_reset_res = r; 5034 } 5035 5036 end: 5037 if (need_full_reset) 5038 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5039 else 5040 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5041 return r; 5042 } 5043 5044 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5045 { 5046 5047 switch (amdgpu_asic_reset_method(adev)) { 5048 case AMD_RESET_METHOD_MODE1: 5049 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5050 break; 5051 case AMD_RESET_METHOD_MODE2: 5052 adev->mp1_state = PP_MP1_STATE_RESET; 5053 break; 5054 default: 5055 adev->mp1_state = PP_MP1_STATE_NONE; 5056 break; 5057 } 5058 } 5059 5060 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5061 { 5062 amdgpu_vf_error_trans_all(adev); 5063 adev->mp1_state = PP_MP1_STATE_NONE; 5064 } 5065 5066 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5067 { 5068 struct pci_dev *p = NULL; 5069 5070 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5071 adev->pdev->bus->number, 1); 5072 if (p) { 5073 pm_runtime_enable(&(p->dev)); 5074 pm_runtime_resume(&(p->dev)); 5075 } 5076 5077 pci_dev_put(p); 5078 } 5079 5080 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5081 { 5082 enum amd_reset_method reset_method; 5083 struct pci_dev *p = NULL; 5084 u64 expires; 5085 5086 /* 5087 * For now, only BACO and mode1 reset are confirmed 5088 * to suffer the audio issue without proper suspended. 5089 */ 5090 reset_method = amdgpu_asic_reset_method(adev); 5091 if ((reset_method != AMD_RESET_METHOD_BACO) && 5092 (reset_method != AMD_RESET_METHOD_MODE1)) 5093 return -EINVAL; 5094 5095 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5096 adev->pdev->bus->number, 1); 5097 if (!p) 5098 return -ENODEV; 5099 5100 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5101 if (!expires) 5102 /* 5103 * If we cannot get the audio device autosuspend delay, 5104 * a fixed 4S interval will be used. Considering 3S is 5105 * the audio controller default autosuspend delay setting. 5106 * 4S used here is guaranteed to cover that. 5107 */ 5108 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5109 5110 while (!pm_runtime_status_suspended(&(p->dev))) { 5111 if (!pm_runtime_suspend(&(p->dev))) 5112 break; 5113 5114 if (expires < ktime_get_mono_fast_ns()) { 5115 dev_warn(adev->dev, "failed to suspend display audio\n"); 5116 pci_dev_put(p); 5117 /* TODO: abort the succeeding gpu reset? 
*/ 5118 return -ETIMEDOUT; 5119 } 5120 } 5121 5122 pm_runtime_disable(&(p->dev)); 5123 5124 pci_dev_put(p); 5125 return 0; 5126 } 5127 5128 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5129 { 5130 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5131 5132 #if defined(CONFIG_DEBUG_FS) 5133 if (!amdgpu_sriov_vf(adev)) 5134 cancel_work(&adev->reset_work); 5135 #endif 5136 5137 if (adev->kfd.dev) 5138 cancel_work(&adev->kfd.reset_work); 5139 5140 if (amdgpu_sriov_vf(adev)) 5141 cancel_work(&adev->virt.flr_work); 5142 5143 if (con && adev->ras_enabled) 5144 cancel_work(&con->recovery_work); 5145 5146 } 5147 5148 /** 5149 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5150 * 5151 * @adev: amdgpu_device pointer 5152 * @job: which job trigger hang 5153 * @reset_context: amdgpu reset context pointer 5154 * 5155 * Attempt to reset the GPU if it has hung (all asics). 5156 * Attempt to do soft-reset or full-reset and reinitialize Asic 5157 * Returns 0 for success or an error on failure. 5158 */ 5159 5160 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5161 struct amdgpu_job *job, 5162 struct amdgpu_reset_context *reset_context) 5163 { 5164 struct list_head device_list, *device_list_handle = NULL; 5165 bool job_signaled = false; 5166 struct amdgpu_hive_info *hive = NULL; 5167 struct amdgpu_device *tmp_adev = NULL; 5168 int i, r = 0; 5169 bool need_emergency_restart = false; 5170 bool audio_suspended = false; 5171 bool gpu_reset_for_dev_remove = false; 5172 5173 gpu_reset_for_dev_remove = 5174 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5175 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5176 5177 /* 5178 * Special case: RAS triggered and full reset isn't supported 5179 */ 5180 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5181 5182 /* 5183 * Flush RAM to disk so that after reboot 5184 * the user can read log and see why the system rebooted. 5185 */ 5186 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5187 DRM_WARN("Emergency reboot."); 5188 5189 ksys_sync_helper(); 5190 emergency_restart(); 5191 } 5192 5193 dev_info(adev->dev, "GPU %s begin!\n", 5194 need_emergency_restart ? "jobs stop":"reset"); 5195 5196 if (!amdgpu_sriov_vf(adev)) 5197 hive = amdgpu_get_xgmi_hive(adev); 5198 if (hive) 5199 mutex_lock(&hive->hive_lock); 5200 5201 reset_context->job = job; 5202 reset_context->hive = hive; 5203 /* 5204 * Build list of devices to reset. 5205 * In case we are in XGMI hive mode, resort the device list 5206 * to put adev in the 1st position. 
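 * Keeping the triggering device first means the reset domain is locked
 * through it and it is handled first in the per-device loops below.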
5207 */ 5208 INIT_LIST_HEAD(&device_list); 5209 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5210 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5211 list_add_tail(&tmp_adev->reset_list, &device_list); 5212 if (gpu_reset_for_dev_remove && adev->shutdown) 5213 tmp_adev->shutdown = true; 5214 } 5215 if (!list_is_first(&adev->reset_list, &device_list)) 5216 list_rotate_to_front(&adev->reset_list, &device_list); 5217 device_list_handle = &device_list; 5218 } else { 5219 list_add_tail(&adev->reset_list, &device_list); 5220 device_list_handle = &device_list; 5221 } 5222 5223 /* We need to lock reset domain only once both for XGMI and single device */ 5224 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5225 reset_list); 5226 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5227 5228 /* block all schedulers and reset given job's ring */ 5229 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5230 5231 amdgpu_device_set_mp1_state(tmp_adev); 5232 5233 /* 5234 * Try to put the audio codec into suspend state 5235 * before gpu reset started. 5236 * 5237 * Due to the power domain of the graphics device 5238 * is shared with AZ power domain. Without this, 5239 * we may change the audio hardware from behind 5240 * the audio driver's back. That will trigger 5241 * some audio codec errors. 5242 */ 5243 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5244 audio_suspended = true; 5245 5246 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5247 5248 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5249 5250 if (!amdgpu_sriov_vf(tmp_adev)) 5251 amdgpu_amdkfd_pre_reset(tmp_adev); 5252 5253 /* 5254 * Mark these ASICs to be reseted as untracked first 5255 * And add them back after reset completed 5256 */ 5257 amdgpu_unregister_gpu_instance(tmp_adev); 5258 5259 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5260 5261 /* disable ras on ALL IPs */ 5262 if (!need_emergency_restart && 5263 amdgpu_device_ip_need_full_reset(tmp_adev)) 5264 amdgpu_ras_suspend(tmp_adev); 5265 5266 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5267 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5268 5269 if (!ring || !ring->sched.thread) 5270 continue; 5271 5272 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5273 5274 if (need_emergency_restart) 5275 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5276 } 5277 atomic_inc(&tmp_adev->gpu_reset_counter); 5278 } 5279 5280 if (need_emergency_restart) 5281 goto skip_sched_resume; 5282 5283 /* 5284 * Must check guilty signal here since after this point all old 5285 * HW fences are force signaled. 5286 * 5287 * job->base holds a reference to parent fence 5288 */ 5289 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5290 job_signaled = true; 5291 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5292 goto skip_hw_reset; 5293 } 5294 5295 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5296 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5297 if (gpu_reset_for_dev_remove) { 5298 /* Workaroud for ASICs need to disable SMC first */ 5299 amdgpu_device_smu_fini_early(tmp_adev); 5300 } 5301 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5302 /*TODO Should we stop ?*/ 5303 if (r) { 5304 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5305 r, adev_to_drm(tmp_adev)->unique); 5306 tmp_adev->asic_reset_res = r; 5307 } 5308 5309 /* 5310 * Drop all pending non scheduler resets. 
Scheduler resets 5311 * were already dropped during drm_sched_stop 5312 */ 5313 amdgpu_device_stop_pending_resets(tmp_adev); 5314 } 5315 5316 /* Actual ASIC resets if needed. */ 5317 /* Host driver will handle XGMI hive reset for SRIOV */ 5318 if (amdgpu_sriov_vf(adev)) { 5319 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5320 if (r) 5321 adev->asic_reset_res = r; 5322 5323 /* Aldebaran and gfx_11_0_3 support RAS in SRIOV, so RAS needs to be resumed during reset */ 5324 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) || 5325 adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3)) 5326 amdgpu_ras_resume(adev); 5327 } else { 5328 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5329 if (r && r == -EAGAIN) 5330 goto retry; 5331 5332 if (!r && gpu_reset_for_dev_remove) 5333 goto recover_end; 5334 } 5335 5336 skip_hw_reset: 5337 5338 /* Post ASIC reset for all devs. */ 5339 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5340 5341 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5342 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5343 5344 if (!ring || !ring->sched.thread) 5345 continue; 5346 5347 drm_sched_start(&ring->sched, true); 5348 } 5349 5350 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3)) 5351 amdgpu_mes_self_test(tmp_adev); 5352 5353 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 5354 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5355 5356 if (tmp_adev->asic_reset_res) 5357 r = tmp_adev->asic_reset_res; 5358 5359 tmp_adev->asic_reset_res = 0; 5360 5361 if (r) { 5362 /* bad news, how do we tell userspace? */ 5363 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5364 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5365 } else { 5366 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5367 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5368 DRM_WARN("smart shift update failed\n"); 5369 } 5370 } 5371 5372 skip_sched_resume: 5373 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5374 /* unlock kfd: SRIOV would do it separately */ 5375 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5376 amdgpu_amdkfd_post_reset(tmp_adev); 5377 5378 /* kfd_post_reset will do nothing if the kfd device is not initialized, 5379 * so bring up kfd here if it was not initialized before 5380 */ 5381 if (!adev->kfd.init_complete) 5382 amdgpu_amdkfd_device_init(adev); 5383 5384 if (audio_suspended) 5385 amdgpu_device_resume_display_audio(tmp_adev); 5386 5387 amdgpu_device_unset_mp1_state(tmp_adev); 5388 5389 amdgpu_ras_set_error_query_ready(tmp_adev, true); 5390 } 5391 5392 recover_end: 5393 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5394 reset_list); 5395 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5396 5397 if (hive) { 5398 mutex_unlock(&hive->hive_lock); 5399 amdgpu_put_xgmi_hive(hive); 5400 } 5401 5402 if (r) 5403 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5404 5405 atomic_set(&adev->reset_domain->reset_res, r); 5406 return r; 5407 } 5408 5409 /** 5410 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot 5411 * 5412 * @adev: amdgpu_device pointer 5413 * 5414 * Fetches and stores in the driver the PCIE capabilities (gen speed 5415 * and lanes) of the slot the device is in. Handles APUs and 5416 * virtualized environments where PCIE config space may not be available.
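 * The amdgpu_pcie_gen_cap and amdgpu_pcie_lane_cap module parameters, when
 * non-zero, override the detected gen-speed and lane-width masks.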
5417 */ 5418 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5419 { 5420 struct pci_dev *pdev; 5421 enum pci_bus_speed speed_cap, platform_speed_cap; 5422 enum pcie_link_width platform_link_width; 5423 5424 if (amdgpu_pcie_gen_cap) 5425 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5426 5427 if (amdgpu_pcie_lane_cap) 5428 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5429 5430 /* covers APUs as well */ 5431 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 5432 if (adev->pm.pcie_gen_mask == 0) 5433 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5434 if (adev->pm.pcie_mlw_mask == 0) 5435 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5436 return; 5437 } 5438 5439 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5440 return; 5441 5442 pcie_bandwidth_available(adev->pdev, NULL, 5443 &platform_speed_cap, &platform_link_width); 5444 5445 if (adev->pm.pcie_gen_mask == 0) { 5446 /* asic caps */ 5447 pdev = adev->pdev; 5448 speed_cap = pcie_get_speed_cap(pdev); 5449 if (speed_cap == PCI_SPEED_UNKNOWN) { 5450 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5451 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5452 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5453 } else { 5454 if (speed_cap == PCIE_SPEED_32_0GT) 5455 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5456 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5457 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5458 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5459 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5460 else if (speed_cap == PCIE_SPEED_16_0GT) 5461 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5462 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5463 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5464 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5465 else if (speed_cap == PCIE_SPEED_8_0GT) 5466 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5467 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5468 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5469 else if (speed_cap == PCIE_SPEED_5_0GT) 5470 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5471 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5472 else 5473 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5474 } 5475 /* platform caps */ 5476 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5477 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5478 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5479 } else { 5480 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5481 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5482 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5483 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5484 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5485 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5486 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5487 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5488 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5489 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5490 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5491 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5492 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5493 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5494 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5495 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5496 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5497 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5498 else 5499 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5500 5501 } 5502 } 5503 if (adev->pm.pcie_mlw_mask == 0) { 5504 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) 
{ 5505 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5506 } else { 5507 switch (platform_link_width) { 5508 case PCIE_LNK_X32: 5509 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5510 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5511 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5512 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5513 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5514 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5515 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5516 break; 5517 case PCIE_LNK_X16: 5518 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5519 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5520 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5521 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5522 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5523 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5524 break; 5525 case PCIE_LNK_X12: 5526 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5527 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5528 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5529 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5530 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5531 break; 5532 case PCIE_LNK_X8: 5533 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5534 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5535 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5536 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5537 break; 5538 case PCIE_LNK_X4: 5539 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5540 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5541 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5542 break; 5543 case PCIE_LNK_X2: 5544 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5545 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5546 break; 5547 case PCIE_LNK_X1: 5548 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5549 break; 5550 default: 5551 break; 5552 } 5553 } 5554 } 5555 } 5556 5557 /** 5558 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 5559 * 5560 * @adev: amdgpu_device pointer 5561 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 5562 * 5563 * Return true if @peer_adev can access (DMA) @adev through the PCIe 5564 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 5565 * @peer_adev. 5566 */ 5567 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 5568 struct amdgpu_device *peer_adev) 5569 { 5570 #ifdef CONFIG_HSA_AMD_P2P 5571 uint64_t address_mask = peer_adev->dev->dma_mask ? 
5572 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 5573 resource_size_t aper_limit = 5574 adev->gmc.aper_base + adev->gmc.aper_size - 1; 5575 bool p2p_access = 5576 !adev->gmc.xgmi.connected_to_cpu && 5577 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 5578 5579 return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size && 5580 adev->gmc.real_vram_size == adev->gmc.visible_vram_size && 5581 !(adev->gmc.aper_base & address_mask || 5582 aper_limit & address_mask)); 5583 #else 5584 return false; 5585 #endif 5586 } 5587 5588 int amdgpu_device_baco_enter(struct drm_device *dev) 5589 { 5590 struct amdgpu_device *adev = drm_to_adev(dev); 5591 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5592 5593 if (!amdgpu_device_supports_baco(dev)) 5594 return -ENOTSUPP; 5595 5596 if (ras && adev->ras_enabled && 5597 adev->nbio.funcs->enable_doorbell_interrupt) 5598 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5599 5600 return amdgpu_dpm_baco_enter(adev); 5601 } 5602 5603 int amdgpu_device_baco_exit(struct drm_device *dev) 5604 { 5605 struct amdgpu_device *adev = drm_to_adev(dev); 5606 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5607 int ret = 0; 5608 5609 if (!amdgpu_device_supports_baco(dev)) 5610 return -ENOTSUPP; 5611 5612 ret = amdgpu_dpm_baco_exit(adev); 5613 if (ret) 5614 return ret; 5615 5616 if (ras && adev->ras_enabled && 5617 adev->nbio.funcs->enable_doorbell_interrupt) 5618 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5619 5620 if (amdgpu_passthrough(adev) && 5621 adev->nbio.funcs->clear_doorbell_interrupt) 5622 adev->nbio.funcs->clear_doorbell_interrupt(adev); 5623 5624 return 0; 5625 } 5626 5627 /** 5628 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5629 * @pdev: PCI device struct 5630 * @state: PCI channel state 5631 * 5632 * Description: Called when a PCI error is detected. 5633 * 5634 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
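 * This is the entry point of the PCI error recovery flow; together with
 * amdgpu_pci_mmio_enabled(), amdgpu_pci_slot_reset() and amdgpu_pci_resume()
 * below it would typically be wired up through a struct pci_error_handlers,
 * roughly like the following sketch (illustrative only, names are not taken
 * from this file):
 *
 *   static const struct pci_error_handlers amdgpu_pci_err_handler = {
 *           .error_detected = amdgpu_pci_error_detected,
 *           .mmio_enabled   = amdgpu_pci_mmio_enabled,
 *           .slot_reset     = amdgpu_pci_slot_reset,
 *           .resume         = amdgpu_pci_resume,
 *   };
 *
 * attached to the PCI driver via its .err_handler field.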
5635  */
5636 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5637 {
5638 	struct drm_device *dev = pci_get_drvdata(pdev);
5639 	struct amdgpu_device *adev = drm_to_adev(dev);
5640 	int i;
5641 
5642 	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5643 
5644 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
5645 		DRM_WARN("No support for XGMI hive yet...");
5646 		return PCI_ERS_RESULT_DISCONNECT;
5647 	}
5648 
5649 	adev->pci_channel_state = state;
5650 
5651 	switch (state) {
5652 	case pci_channel_io_normal:
5653 		return PCI_ERS_RESULT_CAN_RECOVER;
5654 	/* Fatal error, prepare for slot reset */
5655 	case pci_channel_io_frozen:
5656 		/*
5657 		 * Locking adev->reset_domain->sem will prevent any external access
5658 		 * to GPU during PCI error recovery
5659 		 */
5660 		amdgpu_device_lock_reset_domain(adev->reset_domain);
5661 		amdgpu_device_set_mp1_state(adev);
5662 
5663 		/*
5664 		 * Block any work scheduling as we do for regular GPU reset
5665 		 * for the duration of the recovery
5666 		 */
5667 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5668 			struct amdgpu_ring *ring = adev->rings[i];
5669 
5670 			if (!ring || !ring->sched.thread)
5671 				continue;
5672 
5673 			drm_sched_stop(&ring->sched, NULL);
5674 		}
5675 		atomic_inc(&adev->gpu_reset_counter);
5676 		return PCI_ERS_RESULT_NEED_RESET;
5677 	case pci_channel_io_perm_failure:
5678 		/* Permanent error, prepare for device removal */
5679 		return PCI_ERS_RESULT_DISCONNECT;
5680 	}
5681 
5682 	return PCI_ERS_RESULT_NEED_RESET;
5683 }
5684 
5685 /**
5686  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5687  * @pdev: pointer to PCI device
5688  */
5689 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5690 {
5691 
5692 	DRM_INFO("PCI error: mmio enabled callback!!\n");
5693 
5694 	/* TODO - dump whatever for debugging purposes */
5695 
5696 	/* This is called only if amdgpu_pci_error_detected returns
5697 	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
5698 	 * works, no need to reset slot.
5699 	 */
5700 
5701 	return PCI_ERS_RESULT_RECOVERED;
5702 }
5703 
5704 /**
5705  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5706  * @pdev: PCI device struct
5707  *
5708  * Description: This routine is called by the pci error recovery
5709  * code after the PCI slot has been reset, just before we
5710  * should resume normal operations.
5711  */
5712 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5713 {
5714 	struct drm_device *dev = pci_get_drvdata(pdev);
5715 	struct amdgpu_device *adev = drm_to_adev(dev);
5716 	int r, i;
5717 	struct amdgpu_reset_context reset_context;
5718 	u32 memsize;
5719 	struct list_head device_list;
5720 
5721 	DRM_INFO("PCI error: slot reset callback!!\n");
5722 
5723 	memset(&reset_context, 0, sizeof(reset_context));
5724 
5725 	INIT_LIST_HEAD(&device_list);
5726 	list_add_tail(&adev->reset_list, &device_list);
5727 
5728 	/* wait for asic to come out of reset */
5729 	msleep(500);
5730 
5731 	/* Restore PCI config space */
5732 	amdgpu_device_load_pci_state(pdev);
5733 
5734 	/* confirm ASIC came out of reset */
5735 	for (i = 0; i < adev->usec_timeout; i++) {
5736 		memsize = amdgpu_asic_get_config_memsize(adev);
5737 
5738 		if (memsize != 0xffffffff)
5739 			break;
5740 		udelay(1);
5741 	}
5742 	if (memsize == 0xffffffff) {
5743 		r = -ETIME;
5744 		goto out;
5745 	}
5746 
5747 	reset_context.method = AMD_RESET_METHOD_NONE;
5748 	reset_context.reset_req_dev = adev;
5749 	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5750 	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
5751 
5752 	adev->no_hw_access = true;
5753 	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
5754 	adev->no_hw_access = false;
5755 	if (r)
5756 		goto out;
5757 
5758 	r = amdgpu_do_asic_reset(&device_list, &reset_context);
5759 
5760 out:
5761 	if (!r) {
5762 		if (amdgpu_device_cache_pci_state(adev->pdev))
5763 			pci_restore_state(adev->pdev);
5764 
5765 		DRM_INFO("PCIe error recovery succeeded\n");
5766 	} else {
5767 		DRM_ERROR("PCIe error recovery failed, err:%d\n", r);
5768 		amdgpu_device_unset_mp1_state(adev);
5769 		amdgpu_device_unlock_reset_domain(adev->reset_domain);
5770 	}
5771 
5772 	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5773 }
5774 
5775 /**
5776  * amdgpu_pci_resume() - resume normal ops after PCI reset
5777  * @pdev: pointer to PCI device
5778  *
5779  * Called when the error recovery driver tells us that it's
5780  * OK to resume normal operation.
5781  */
5782 void amdgpu_pci_resume(struct pci_dev *pdev)
5783 {
5784 	struct drm_device *dev = pci_get_drvdata(pdev);
5785 	struct amdgpu_device *adev = drm_to_adev(dev);
5786 	int i;
5787 
5788 
5789 	DRM_INFO("PCI error: resume callback!!\n");
5790 
5791 	/* Only continue execution for the case of pci_channel_io_frozen */
5792 	if (adev->pci_channel_state != pci_channel_io_frozen)
5793 		return;
5794 
5795 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5796 		struct amdgpu_ring *ring = adev->rings[i];
5797 
5798 		if (!ring || !ring->sched.thread)
5799 			continue;
5800 
5801 		drm_sched_start(&ring->sched, true);
5802 	}
5803 
5804 	amdgpu_device_unset_mp1_state(adev);
5805 	amdgpu_device_unlock_reset_domain(adev->reset_domain);
5806 }
5807 
5808 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5809 {
5810 	struct drm_device *dev = pci_get_drvdata(pdev);
5811 	struct amdgpu_device *adev = drm_to_adev(dev);
5812 	int r;
5813 
5814 	r = pci_save_state(pdev);
5815 	if (!r) {
5816 		kfree(adev->pci_state);
5817 
5818 		adev->pci_state = pci_store_saved_state(pdev);
5819 
5820 		if (!adev->pci_state) {
5821 			DRM_ERROR("Failed to store PCI saved state\n");
5822 			return false;
5823 		}
5824 	} else {
5825 		DRM_WARN("Failed to save PCI state, err:%d\n", r);
5826 		return false;
5827 	}
5828 
5829 	return true;
5830 }
5831 
5832 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5833 {
5834 	struct drm_device *dev = pci_get_drvdata(pdev);
5835 	struct amdgpu_device *adev = drm_to_adev(dev);
5836 	int r;
5837 
5838 	if (!adev->pci_state)
5839 		return false;
5840 
5841 	r = pci_load_saved_state(pdev, adev->pci_state);
5842 
5843 	if (!r) {
5844 		pci_restore_state(pdev);
5845 	} else {
5846 		DRM_WARN("Failed to load PCI state, err:%d\n", r);
5847 		return false;
5848 	}
5849 
5850 	return true;
5851 }
5852 
5853 void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
5854 		struct amdgpu_ring *ring)
5855 {
5856 #ifdef CONFIG_X86_64
5857 	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
5858 		return;
5859 #endif
5860 	if (adev->gmc.xgmi.connected_to_cpu)
5861 		return;
5862 
5863 	if (ring && ring->funcs->emit_hdp_flush)
5864 		amdgpu_ring_emit_hdp_flush(ring);
5865 	else
5866 		amdgpu_asic_flush_hdp(adev, ring);
5867 }
5868 
5869 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
5870 		struct amdgpu_ring *ring)
5871 {
5872 #ifdef CONFIG_X86_64
5873 	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
5874 		return;
5875 #endif
5876 	if (adev->gmc.xgmi.connected_to_cpu)
5877 		return;
5878 
5879 	amdgpu_asic_invalidate_hdp(adev, ring);
5880 }
5881 
5882 int amdgpu_in_reset(struct amdgpu_device *adev)
5883 {
5884 	return atomic_read(&adev->reset_domain->in_gpu_reset);
5885 }
5886 
5887 /**
5888  * amdgpu_device_halt() - bring hardware to some kind of halt state
5889  *
5890  * @adev: amdgpu_device pointer
5891  *
5892  * Bring hardware to some kind of halt state so that no one can touch it
5893  * any more. It helps to preserve the error context when an error occurs.
5894  * Compared to a simple hang, the system will stay stable at least for SSH
5895  * access. Then it should be trivial to inspect the hardware state and
5896  * see what's going on. Implemented as follows:
5897  *
5898  * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc.),
5899  * clears all CPU mappings to the device and disallows remappings through page faults
5900  * 2. amdgpu_irq_disable_all() disables all interrupts
5901  * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
5902  * 4. sets adev->no_hw_access to avoid potential crashes after step 5
5903  * 5.
amdgpu_device_unmap_mmio() clears all MMIO mappings 5904 * 6. pci_disable_device() and pci_wait_for_pending_transaction() 5905 * flush any in flight DMA operations 5906 */ 5907 void amdgpu_device_halt(struct amdgpu_device *adev) 5908 { 5909 struct pci_dev *pdev = adev->pdev; 5910 struct drm_device *ddev = adev_to_drm(adev); 5911 5912 amdgpu_xcp_dev_unplug(adev); 5913 drm_dev_unplug(ddev); 5914 5915 amdgpu_irq_disable_all(adev); 5916 5917 amdgpu_fence_driver_hw_fini(adev); 5918 5919 adev->no_hw_access = true; 5920 5921 amdgpu_device_unmap_mmio(adev); 5922 5923 pci_disable_device(pdev); 5924 pci_wait_for_pending_transaction(pdev); 5925 } 5926 5927 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, 5928 u32 reg) 5929 { 5930 unsigned long flags, address, data; 5931 u32 r; 5932 5933 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 5934 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 5935 5936 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 5937 WREG32(address, reg * 4); 5938 (void)RREG32(address); 5939 r = RREG32(data); 5940 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 5941 return r; 5942 } 5943 5944 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, 5945 u32 reg, u32 v) 5946 { 5947 unsigned long flags, address, data; 5948 5949 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 5950 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 5951 5952 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 5953 WREG32(address, reg * 4); 5954 (void)RREG32(address); 5955 WREG32(data, v); 5956 (void)RREG32(data); 5957 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 5958 } 5959 5960 /** 5961 * amdgpu_device_switch_gang - switch to a new gang 5962 * @adev: amdgpu_device pointer 5963 * @gang: the gang to switch to 5964 * 5965 * Try to switch to a new gang. 5966 * Returns: NULL if we switched to the new gang or a reference to the current 5967 * gang leader. 
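 *
 * Illustrative call pattern (hypothetical caller, not code from this file):
 * a non-NULL return means the previous gang has not finished yet, so the
 * caller has to wait for (or schedule after) that fence before its own gang
 * can run, for example:
 *
 *	old = amdgpu_device_switch_gang(adev, new_gang_fence);
 *	if (old) {
 *		dma_fence_wait(old, false);
 *		dma_fence_put(old);
 *	}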
5968  */
5969 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
5970 					    struct dma_fence *gang)
5971 {
5972 	struct dma_fence *old = NULL;
5973 
5974 	do {
5975 		dma_fence_put(old);
5976 		rcu_read_lock();
5977 		old = dma_fence_get_rcu_safe(&adev->gang_submit);
5978 		rcu_read_unlock();
5979 
5980 		if (old == gang)
5981 			break;
5982 
5983 		if (!dma_fence_is_signaled(old))
5984 			return old;
5985 
5986 	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
5987 			 old, gang) != old);
5988 
5989 	dma_fence_put(old);
5990 	return NULL;
5991 }
5992 
5993 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
5994 {
5995 	switch (adev->asic_type) {
5996 #ifdef CONFIG_DRM_AMDGPU_SI
5997 	case CHIP_HAINAN:
5998 #endif
5999 	case CHIP_TOPAZ:
6000 		/* chips with no display hardware */
6001 		return false;
6002 #ifdef CONFIG_DRM_AMDGPU_SI
6003 	case CHIP_TAHITI:
6004 	case CHIP_PITCAIRN:
6005 	case CHIP_VERDE:
6006 	case CHIP_OLAND:
6007 #endif
6008 #ifdef CONFIG_DRM_AMDGPU_CIK
6009 	case CHIP_BONAIRE:
6010 	case CHIP_HAWAII:
6011 	case CHIP_KAVERI:
6012 	case CHIP_KABINI:
6013 	case CHIP_MULLINS:
6014 #endif
6015 	case CHIP_TONGA:
6016 	case CHIP_FIJI:
6017 	case CHIP_POLARIS10:
6018 	case CHIP_POLARIS11:
6019 	case CHIP_POLARIS12:
6020 	case CHIP_VEGAM:
6021 	case CHIP_CARRIZO:
6022 	case CHIP_STONEY:
6023 		/* chips with display hardware */
6024 		return true;
6025 	default:
6026 		/* IP discovery */
6027 		if (!adev->ip_versions[DCE_HWIP][0] ||
6028 		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
6029 			return false;
6030 		return true;
6031 	}
6032 }
6033 
6034 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
6035 		uint32_t inst, uint32_t reg_addr, char reg_name[],
6036 		uint32_t expected_value, uint32_t mask)
6037 {
6038 	uint32_t ret = 0;
6039 	uint32_t old_ = 0;
6040 	uint32_t tmp_ = RREG32(reg_addr);
6041 	uint32_t loop = adev->usec_timeout;
6042 
6043 	while ((tmp_ & (mask)) != (expected_value)) {
6044 		if (old_ != tmp_) {
6045 			loop = adev->usec_timeout;
6046 			old_ = tmp_;
6047 		} else
6048 			udelay(1);
6049 		tmp_ = RREG32(reg_addr);
6050 		loop--;
6051 		if (!loop) {
6052 			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
6053 				  inst, reg_name, (uint32_t)expected_value,
6054 				  (uint32_t)(tmp_ & (mask)));
6055 			ret = -ETIMEDOUT;
6056 			break;
6057 		}
6058 	}
6059 	return ret;
6060 }
6061
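/*
 * Illustrative use of amdgpu_device_wait_on_rreg() (example only; the
 * register offset and name below are hypothetical and not defined in this
 * driver): poll a status register until bit 0 reads back as set, relying on
 * the helper's adev->usec_timeout based timeout and non-zero return on
 * failure:
 *
 *	if (amdgpu_device_wait_on_rreg(adev, 0, HYPOTHETICAL_STATUS_OFFSET,
 *				       "HYPOTHETICAL_STATUS", 0x1, 0x1))
 *		dev_warn(adev->dev, "status bit never asserted\n");
 */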