1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 #include <linux/iommu.h> 34 #include <linux/pci.h> 35 #include <linux/devcoredump.h> 36 #include <generated/utsrelease.h> 37 #include <linux/pci-p2pdma.h> 38 #include <linux/apple-gmux.h> 39 40 #include <drm/drm_aperture.h> 41 #include <drm/drm_atomic_helper.h> 42 #include <drm/drm_crtc_helper.h> 43 #include <drm/drm_fb_helper.h> 44 #include <drm/drm_probe_helper.h> 45 #include <drm/amdgpu_drm.h> 46 #include <linux/vgaarb.h> 47 #include <linux/vga_switcheroo.h> 48 #include <linux/efi.h> 49 #include "amdgpu.h" 50 #include "amdgpu_trace.h" 51 #include "amdgpu_i2c.h" 52 #include "atom.h" 53 #include "amdgpu_atombios.h" 54 #include "amdgpu_atomfirmware.h" 55 #include "amd_pcie.h" 56 #ifdef CONFIG_DRM_AMDGPU_SI 57 #include "si.h" 58 #endif 59 #ifdef CONFIG_DRM_AMDGPU_CIK 60 #include "cik.h" 61 #endif 62 #include "vi.h" 63 #include "soc15.h" 64 #include "nv.h" 65 #include "bif/bif_4_1_d.h" 66 #include <linux/firmware.h> 67 #include "amdgpu_vf_error.h" 68 69 #include "amdgpu_amdkfd.h" 70 #include "amdgpu_pm.h" 71 72 #include "amdgpu_xgmi.h" 73 #include "amdgpu_ras.h" 74 #include "amdgpu_pmu.h" 75 #include "amdgpu_fru_eeprom.h" 76 #include "amdgpu_reset.h" 77 78 #include <linux/suspend.h> 79 #include <drm/task_barrier.h> 80 #include <linux/pm_runtime.h> 81 82 #include <drm/drm_drv.h> 83 84 #if IS_ENABLED(CONFIG_X86) 85 #include <asm/intel-family.h> 86 #endif 87 88 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 89 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 90 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 91 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 95 96 #define AMDGPU_RESUME_MS 2000 97 #define AMDGPU_MAX_RETRY_LIMIT 2 98 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 99 100 static const struct drm_driver amdgpu_kms_driver; 101 102 const char *amdgpu_asic_name[] = { 103 "TAHITI", 104 "PITCAIRN", 105 "VERDE", 106 "OLAND", 107 "HAINAN", 108 "BONAIRE", 109 "KAVERI", 110 
"KABINI", 111 "HAWAII", 112 "MULLINS", 113 "TOPAZ", 114 "TONGA", 115 "FIJI", 116 "CARRIZO", 117 "STONEY", 118 "POLARIS10", 119 "POLARIS11", 120 "POLARIS12", 121 "VEGAM", 122 "VEGA10", 123 "VEGA12", 124 "VEGA20", 125 "RAVEN", 126 "ARCTURUS", 127 "RENOIR", 128 "ALDEBARAN", 129 "NAVI10", 130 "CYAN_SKILLFISH", 131 "NAVI14", 132 "NAVI12", 133 "SIENNA_CICHLID", 134 "NAVY_FLOUNDER", 135 "VANGOGH", 136 "DIMGREY_CAVEFISH", 137 "BEIGE_GOBY", 138 "YELLOW_CARP", 139 "IP DISCOVERY", 140 "LAST", 141 }; 142 143 /** 144 * DOC: pcie_replay_count 145 * 146 * The amdgpu driver provides a sysfs API for reporting the total number 147 * of PCIe replays (NAKs) 148 * The file pcie_replay_count is used for this and returns the total 149 * number of replays as a sum of the NAKs generated and NAKs received 150 */ 151 152 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 153 struct device_attribute *attr, char *buf) 154 { 155 struct drm_device *ddev = dev_get_drvdata(dev); 156 struct amdgpu_device *adev = drm_to_adev(ddev); 157 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 158 159 return sysfs_emit(buf, "%llu\n", cnt); 160 } 161 162 static DEVICE_ATTR(pcie_replay_count, 0444, 163 amdgpu_device_get_pcie_replay_count, NULL); 164 165 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 166 167 168 /** 169 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 170 * 171 * @dev: drm_device pointer 172 * 173 * Returns true if the device is a dGPU with ATPX power control, 174 * otherwise return false. 175 */ 176 bool amdgpu_device_supports_px(struct drm_device *dev) 177 { 178 struct amdgpu_device *adev = drm_to_adev(dev); 179 180 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 181 return true; 182 return false; 183 } 184 185 /** 186 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 187 * 188 * @dev: drm_device pointer 189 * 190 * Returns true if the device is a dGPU with ACPI power control, 191 * otherwise return false. 192 */ 193 bool amdgpu_device_supports_boco(struct drm_device *dev) 194 { 195 struct amdgpu_device *adev = drm_to_adev(dev); 196 197 if (adev->has_pr3 || 198 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 199 return true; 200 return false; 201 } 202 203 /** 204 * amdgpu_device_supports_baco - Does the device support BACO 205 * 206 * @dev: drm_device pointer 207 * 208 * Returns true if the device supporte BACO, 209 * otherwise return false. 210 */ 211 bool amdgpu_device_supports_baco(struct drm_device *dev) 212 { 213 struct amdgpu_device *adev = drm_to_adev(dev); 214 215 return amdgpu_asic_supports_baco(adev); 216 } 217 218 /** 219 * amdgpu_device_supports_smart_shift - Is the device dGPU with 220 * smart shift support 221 * 222 * @dev: drm_device pointer 223 * 224 * Returns true if the device is a dGPU with Smart Shift support, 225 * otherwise returns false. 
226 */ 227 bool amdgpu_device_supports_smart_shift(struct drm_device *dev) 228 { 229 return (amdgpu_device_supports_boco(dev) && 230 amdgpu_acpi_is_power_shift_control_supported()); 231 } 232 233 /* 234 * VRAM access helper functions 235 */ 236 237 /** 238 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA 239 * 240 * @adev: amdgpu_device pointer 241 * @pos: offset of the buffer in vram 242 * @buf: virtual address of the buffer in system memory 243 * @size: read/write size, sizeof(@buf) must > @size 244 * @write: true - write to vram, otherwise - read from vram 245 */ 246 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos, 247 void *buf, size_t size, bool write) 248 { 249 unsigned long flags; 250 uint32_t hi = ~0, tmp = 0; 251 uint32_t *data = buf; 252 uint64_t last; 253 int idx; 254 255 if (!drm_dev_enter(adev_to_drm(adev), &idx)) 256 return; 257 258 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4)); 259 260 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 261 for (last = pos + size; pos < last; pos += 4) { 262 tmp = pos >> 31; 263 264 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); 265 if (tmp != hi) { 266 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp); 267 hi = tmp; 268 } 269 if (write) 270 WREG32_NO_KIQ(mmMM_DATA, *data++); 271 else 272 *data++ = RREG32_NO_KIQ(mmMM_DATA); 273 } 274 275 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 276 drm_dev_exit(idx); 277 } 278 279 /** 280 * amdgpu_device_aper_access - access vram by vram aperature 281 * 282 * @adev: amdgpu_device pointer 283 * @pos: offset of the buffer in vram 284 * @buf: virtual address of the buffer in system memory 285 * @size: read/write size, sizeof(@buf) must > @size 286 * @write: true - write to vram, otherwise - read from vram 287 * 288 * The return value means how many bytes have been transferred. 
289 */ 290 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos, 291 void *buf, size_t size, bool write) 292 { 293 #ifdef CONFIG_64BIT 294 void __iomem *addr; 295 size_t count = 0; 296 uint64_t last; 297 298 if (!adev->mman.aper_base_kaddr) 299 return 0; 300 301 last = min(pos + size, adev->gmc.visible_vram_size); 302 if (last > pos) { 303 addr = adev->mman.aper_base_kaddr + pos; 304 count = last - pos; 305 306 if (write) { 307 memcpy_toio(addr, buf, count); 308 /* Make sure HDP write cache flush happens without any reordering 309 * after the system memory contents are sent over PCIe device 310 */ 311 mb(); 312 amdgpu_device_flush_hdp(adev, NULL); 313 } else { 314 amdgpu_device_invalidate_hdp(adev, NULL); 315 /* Make sure HDP read cache is invalidated before issuing a read 316 * to the PCIe device 317 */ 318 mb(); 319 memcpy_fromio(buf, addr, count); 320 } 321 322 } 323 324 return count; 325 #else 326 return 0; 327 #endif 328 } 329 330 /** 331 * amdgpu_device_vram_access - read/write a buffer in vram 332 * 333 * @adev: amdgpu_device pointer 334 * @pos: offset of the buffer in vram 335 * @buf: virtual address of the buffer in system memory 336 * @size: read/write size, sizeof(@buf) must > @size 337 * @write: true - write to vram, otherwise - read from vram 338 */ 339 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, 340 void *buf, size_t size, bool write) 341 { 342 size_t count; 343 344 /* try to using vram apreature to access vram first */ 345 count = amdgpu_device_aper_access(adev, pos, buf, size, write); 346 size -= count; 347 if (size) { 348 /* using MM to access rest vram */ 349 pos += count; 350 buf += count; 351 amdgpu_device_mm_access(adev, pos, buf, size, write); 352 } 353 } 354 355 /* 356 * register access helper functions. 357 */ 358 359 /* Check if hw access should be skipped because of hotplug or device error */ 360 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev) 361 { 362 if (adev->no_hw_access) 363 return true; 364 365 #ifdef CONFIG_LOCKDEP 366 /* 367 * This is a bit complicated to understand, so worth a comment. What we assert 368 * here is that the GPU reset is not running on another thread in parallel. 369 * 370 * For this we trylock the read side of the reset semaphore, if that succeeds 371 * we know that the reset is not running in paralell. 372 * 373 * If the trylock fails we assert that we are either already holding the read 374 * side of the lock or are the reset thread itself and hold the write side of 375 * the lock. 376 */ 377 if (in_task()) { 378 if (down_read_trylock(&adev->reset_domain->sem)) 379 up_read(&adev->reset_domain->sem); 380 else 381 lockdep_assert_held(&adev->reset_domain->sem); 382 } 383 #endif 384 return false; 385 } 386 387 /** 388 * amdgpu_device_rreg - read a memory mapped IO or indirect register 389 * 390 * @adev: amdgpu_device pointer 391 * @reg: dword aligned register offset 392 * @acc_flags: access flags which require special behavior 393 * 394 * Returns the 32 bit value from the offset specified. 
395 */ 396 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, 397 uint32_t reg, uint32_t acc_flags) 398 { 399 uint32_t ret; 400 401 if (amdgpu_device_skip_hw_access(adev)) 402 return 0; 403 404 if ((reg * 4) < adev->rmmio_size) { 405 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 406 amdgpu_sriov_runtime(adev) && 407 down_read_trylock(&adev->reset_domain->sem)) { 408 ret = amdgpu_kiq_rreg(adev, reg); 409 up_read(&adev->reset_domain->sem); 410 } else { 411 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 412 } 413 } else { 414 ret = adev->pcie_rreg(adev, reg * 4); 415 } 416 417 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret); 418 419 return ret; 420 } 421 422 /* 423 * MMIO register read with bytes helper functions 424 * @offset:bytes offset from MMIO start 425 */ 426 427 /** 428 * amdgpu_mm_rreg8 - read a memory mapped IO register 429 * 430 * @adev: amdgpu_device pointer 431 * @offset: byte aligned register offset 432 * 433 * Returns the 8 bit value from the offset specified. 434 */ 435 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) 436 { 437 if (amdgpu_device_skip_hw_access(adev)) 438 return 0; 439 440 if (offset < adev->rmmio_size) 441 return (readb(adev->rmmio + offset)); 442 BUG(); 443 } 444 445 /* 446 * MMIO register write with bytes helper functions 447 * @offset:bytes offset from MMIO start 448 * @value: the value want to be written to the register 449 */ 450 451 /** 452 * amdgpu_mm_wreg8 - read a memory mapped IO register 453 * 454 * @adev: amdgpu_device pointer 455 * @offset: byte aligned register offset 456 * @value: 8 bit value to write 457 * 458 * Writes the value specified to the offset specified. 459 */ 460 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 461 { 462 if (amdgpu_device_skip_hw_access(adev)) 463 return; 464 465 if (offset < adev->rmmio_size) 466 writeb(value, adev->rmmio + offset); 467 else 468 BUG(); 469 } 470 471 /** 472 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 473 * 474 * @adev: amdgpu_device pointer 475 * @reg: dword aligned register offset 476 * @v: 32 bit value to write to the register 477 * @acc_flags: access flags which require special behavior 478 * 479 * Writes the value specified to the offset specified. 
480 */ 481 void amdgpu_device_wreg(struct amdgpu_device *adev, 482 uint32_t reg, uint32_t v, 483 uint32_t acc_flags) 484 { 485 if (amdgpu_device_skip_hw_access(adev)) 486 return; 487 488 if ((reg * 4) < adev->rmmio_size) { 489 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 490 amdgpu_sriov_runtime(adev) && 491 down_read_trylock(&adev->reset_domain->sem)) { 492 amdgpu_kiq_wreg(adev, reg, v); 493 up_read(&adev->reset_domain->sem); 494 } else { 495 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 496 } 497 } else { 498 adev->pcie_wreg(adev, reg * 4, v); 499 } 500 501 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 502 } 503 504 /** 505 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 506 * 507 * @adev: amdgpu_device pointer 508 * @reg: mmio/rlc register 509 * @v: value to write 510 * 511 * this function is invoked only for the debugfs register access 512 */ 513 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 514 uint32_t reg, uint32_t v, 515 uint32_t xcc_id) 516 { 517 if (amdgpu_device_skip_hw_access(adev)) 518 return; 519 520 if (amdgpu_sriov_fullaccess(adev) && 521 adev->gfx.rlc.funcs && 522 adev->gfx.rlc.funcs->is_rlcg_access_range) { 523 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 524 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 525 } else if ((reg * 4) >= adev->rmmio_size) { 526 adev->pcie_wreg(adev, reg * 4, v); 527 } else { 528 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 529 } 530 } 531 532 /** 533 * amdgpu_device_indirect_rreg - read an indirect register 534 * 535 * @adev: amdgpu_device pointer 536 * @reg_addr: indirect register address to read from 537 * 538 * Returns the value of indirect register @reg_addr 539 */ 540 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 541 u32 reg_addr) 542 { 543 unsigned long flags, pcie_index, pcie_data; 544 void __iomem *pcie_index_offset; 545 void __iomem *pcie_data_offset; 546 u32 r; 547 548 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 549 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 550 551 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 552 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 553 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 554 555 writel(reg_addr, pcie_index_offset); 556 readl(pcie_index_offset); 557 r = readl(pcie_data_offset); 558 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 559 560 return r; 561 } 562 563 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 564 u64 reg_addr) 565 { 566 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 567 u32 r; 568 void __iomem *pcie_index_offset; 569 void __iomem *pcie_index_hi_offset; 570 void __iomem *pcie_data_offset; 571 572 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 573 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 574 if (adev->nbio.funcs->get_pcie_index_hi_offset) 575 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 576 else 577 pcie_index_hi = 0; 578 579 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 580 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 581 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 582 if (pcie_index_hi != 0) 583 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 584 pcie_index_hi * 4; 585 586 writel(reg_addr, pcie_index_offset); 587 readl(pcie_index_offset); 588 if (pcie_index_hi != 0) { 589 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 590 readl(pcie_index_hi_offset); 591 
} 592 r = readl(pcie_data_offset); 593 594 /* clear the high bits */ 595 if (pcie_index_hi != 0) { 596 writel(0, pcie_index_hi_offset); 597 readl(pcie_index_hi_offset); 598 } 599 600 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 601 602 return r; 603 } 604 605 /** 606 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 607 * 608 * @adev: amdgpu_device pointer 609 * @reg_addr: indirect register address to read from 610 * 611 * Returns the value of indirect register @reg_addr 612 */ 613 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 614 u32 reg_addr) 615 { 616 unsigned long flags, pcie_index, pcie_data; 617 void __iomem *pcie_index_offset; 618 void __iomem *pcie_data_offset; 619 u64 r; 620 621 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 622 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 623 624 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 625 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 626 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 627 628 /* read low 32 bits */ 629 writel(reg_addr, pcie_index_offset); 630 readl(pcie_index_offset); 631 r = readl(pcie_data_offset); 632 /* read high 32 bits */ 633 writel(reg_addr + 4, pcie_index_offset); 634 readl(pcie_index_offset); 635 r |= ((u64)readl(pcie_data_offset) << 32); 636 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 637 638 return r; 639 } 640 641 /** 642 * amdgpu_device_indirect_wreg - write an indirect register address 643 * 644 * @adev: amdgpu_device pointer 645 * @reg_addr: indirect register offset 646 * @reg_data: indirect register data 647 * 648 */ 649 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 650 u32 reg_addr, u32 reg_data) 651 { 652 unsigned long flags, pcie_index, pcie_data; 653 void __iomem *pcie_index_offset; 654 void __iomem *pcie_data_offset; 655 656 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 657 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 658 659 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 660 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 661 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 662 663 writel(reg_addr, pcie_index_offset); 664 readl(pcie_index_offset); 665 writel(reg_data, pcie_data_offset); 666 readl(pcie_data_offset); 667 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 668 } 669 670 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 671 u64 reg_addr, u32 reg_data) 672 { 673 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 674 void __iomem *pcie_index_offset; 675 void __iomem *pcie_index_hi_offset; 676 void __iomem *pcie_data_offset; 677 678 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 679 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 680 if (adev->nbio.funcs->get_pcie_index_hi_offset) 681 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 682 else 683 pcie_index_hi = 0; 684 685 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 686 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 687 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 688 if (pcie_index_hi != 0) 689 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 690 pcie_index_hi * 4; 691 692 writel(reg_addr, pcie_index_offset); 693 readl(pcie_index_offset); 694 if (pcie_index_hi != 0) { 695 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 696 readl(pcie_index_hi_offset); 697 } 698 writel(reg_data, pcie_data_offset); 699 readl(pcie_data_offset); 700 701 /* clear 
the high bits */ 702 if (pcie_index_hi != 0) { 703 writel(0, pcie_index_hi_offset); 704 readl(pcie_index_hi_offset); 705 } 706 707 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 708 } 709 710 /** 711 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 712 * 713 * @adev: amdgpu_device pointer 714 * @reg_addr: indirect register offset 715 * @reg_data: indirect register data 716 * 717 */ 718 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 719 u32 reg_addr, u64 reg_data) 720 { 721 unsigned long flags, pcie_index, pcie_data; 722 void __iomem *pcie_index_offset; 723 void __iomem *pcie_data_offset; 724 725 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 726 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 727 728 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 729 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 730 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 731 732 /* write low 32 bits */ 733 writel(reg_addr, pcie_index_offset); 734 readl(pcie_index_offset); 735 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 736 readl(pcie_data_offset); 737 /* write high 32 bits */ 738 writel(reg_addr + 4, pcie_index_offset); 739 readl(pcie_index_offset); 740 writel((u32)(reg_data >> 32), pcie_data_offset); 741 readl(pcie_data_offset); 742 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 743 } 744 745 /** 746 * amdgpu_device_get_rev_id - query device rev_id 747 * 748 * @adev: amdgpu_device pointer 749 * 750 * Return device rev_id 751 */ 752 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 753 { 754 return adev->nbio.funcs->get_rev_id(adev); 755 } 756 757 /** 758 * amdgpu_invalid_rreg - dummy reg read function 759 * 760 * @adev: amdgpu_device pointer 761 * @reg: offset of register 762 * 763 * Dummy register read function. Used for register blocks 764 * that certain asics don't have (all asics). 765 * Returns the value in the register. 766 */ 767 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 768 { 769 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 770 BUG(); 771 return 0; 772 } 773 774 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) 775 { 776 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 777 BUG(); 778 return 0; 779 } 780 781 /** 782 * amdgpu_invalid_wreg - dummy reg write function 783 * 784 * @adev: amdgpu_device pointer 785 * @reg: offset of register 786 * @v: value to write to the register 787 * 788 * Dummy register read function. Used for register blocks 789 * that certain asics don't have (all asics). 790 */ 791 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 792 { 793 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 794 reg, v); 795 BUG(); 796 } 797 798 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) 799 { 800 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n", 801 reg, v); 802 BUG(); 803 } 804 805 /** 806 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 807 * 808 * @adev: amdgpu_device pointer 809 * @reg: offset of register 810 * 811 * Dummy register read function. Used for register blocks 812 * that certain asics don't have (all asics). 813 * Returns the value in the register. 
814 */ 815 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 816 { 817 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 818 BUG(); 819 return 0; 820 } 821 822 /** 823 * amdgpu_invalid_wreg64 - dummy reg write function 824 * 825 * @adev: amdgpu_device pointer 826 * @reg: offset of register 827 * @v: value to write to the register 828 * 829 * Dummy register read function. Used for register blocks 830 * that certain asics don't have (all asics). 831 */ 832 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 833 { 834 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 835 reg, v); 836 BUG(); 837 } 838 839 /** 840 * amdgpu_block_invalid_rreg - dummy reg read function 841 * 842 * @adev: amdgpu_device pointer 843 * @block: offset of instance 844 * @reg: offset of register 845 * 846 * Dummy register read function. Used for register blocks 847 * that certain asics don't have (all asics). 848 * Returns the value in the register. 849 */ 850 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 851 uint32_t block, uint32_t reg) 852 { 853 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 854 reg, block); 855 BUG(); 856 return 0; 857 } 858 859 /** 860 * amdgpu_block_invalid_wreg - dummy reg write function 861 * 862 * @adev: amdgpu_device pointer 863 * @block: offset of instance 864 * @reg: offset of register 865 * @v: value to write to the register 866 * 867 * Dummy register read function. Used for register blocks 868 * that certain asics don't have (all asics). 869 */ 870 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 871 uint32_t block, 872 uint32_t reg, uint32_t v) 873 { 874 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 875 reg, block, v); 876 BUG(); 877 } 878 879 /** 880 * amdgpu_device_asic_init - Wrapper for atom asic_init 881 * 882 * @adev: amdgpu_device pointer 883 * 884 * Does any asic specific work and then calls atom asic init. 885 */ 886 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 887 { 888 amdgpu_asic_pre_asic_init(adev); 889 890 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) || 891 adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) 892 return amdgpu_atomfirmware_asic_init(adev, true); 893 else 894 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 895 } 896 897 /** 898 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page 899 * 900 * @adev: amdgpu_device pointer 901 * 902 * Allocates a scratch page of VRAM for use by various things in the 903 * driver. 904 */ 905 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev) 906 { 907 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE, 908 AMDGPU_GEM_DOMAIN_VRAM | 909 AMDGPU_GEM_DOMAIN_GTT, 910 &adev->mem_scratch.robj, 911 &adev->mem_scratch.gpu_addr, 912 (void **)&adev->mem_scratch.ptr); 913 } 914 915 /** 916 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page 917 * 918 * @adev: amdgpu_device pointer 919 * 920 * Frees the VRAM scratch page. 921 */ 922 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev) 923 { 924 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL); 925 } 926 927 /** 928 * amdgpu_device_program_register_sequence - program an array of registers. 
929 * 930 * @adev: amdgpu_device pointer 931 * @registers: pointer to the register array 932 * @array_size: size of the register array 933 * 934 * Programs an array or registers with and or masks. 935 * This is a helper for setting golden registers. 936 */ 937 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 938 const u32 *registers, 939 const u32 array_size) 940 { 941 u32 tmp, reg, and_mask, or_mask; 942 int i; 943 944 if (array_size % 3) 945 return; 946 947 for (i = 0; i < array_size; i += 3) { 948 reg = registers[i + 0]; 949 and_mask = registers[i + 1]; 950 or_mask = registers[i + 2]; 951 952 if (and_mask == 0xffffffff) { 953 tmp = or_mask; 954 } else { 955 tmp = RREG32(reg); 956 tmp &= ~and_mask; 957 if (adev->family >= AMDGPU_FAMILY_AI) 958 tmp |= (or_mask & and_mask); 959 else 960 tmp |= or_mask; 961 } 962 WREG32(reg, tmp); 963 } 964 } 965 966 /** 967 * amdgpu_device_pci_config_reset - reset the GPU 968 * 969 * @adev: amdgpu_device pointer 970 * 971 * Resets the GPU using the pci config reset sequence. 972 * Only applicable to asics prior to vega10. 973 */ 974 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 975 { 976 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 977 } 978 979 /** 980 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 981 * 982 * @adev: amdgpu_device pointer 983 * 984 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 985 */ 986 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 987 { 988 return pci_reset_function(adev->pdev); 989 } 990 991 /* 992 * amdgpu_device_wb_*() 993 * Writeback is the method by which the GPU updates special pages in memory 994 * with the status of certain GPU events (fences, ring pointers,etc.). 995 */ 996 997 /** 998 * amdgpu_device_wb_fini - Disable Writeback and free memory 999 * 1000 * @adev: amdgpu_device pointer 1001 * 1002 * Disables Writeback and frees the Writeback memory (all asics). 1003 * Used at driver shutdown. 1004 */ 1005 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1006 { 1007 if (adev->wb.wb_obj) { 1008 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1009 &adev->wb.gpu_addr, 1010 (void **)&adev->wb.wb); 1011 adev->wb.wb_obj = NULL; 1012 } 1013 } 1014 1015 /** 1016 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1017 * 1018 * @adev: amdgpu_device pointer 1019 * 1020 * Initializes writeback and allocates writeback memory (all asics). 1021 * Used at driver startup. 1022 * Returns 0 on success or an -error on failure. 1023 */ 1024 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1025 { 1026 int r; 1027 1028 if (adev->wb.wb_obj == NULL) { 1029 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1030 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1031 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1032 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1033 (void **)&adev->wb.wb); 1034 if (r) { 1035 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1036 return r; 1037 } 1038 1039 adev->wb.num_wb = AMDGPU_MAX_WB; 1040 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1041 1042 /* clear wb memory */ 1043 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1044 } 1045 1046 return 0; 1047 } 1048 1049 /** 1050 * amdgpu_device_wb_get - Allocate a wb entry 1051 * 1052 * @adev: amdgpu_device pointer 1053 * @wb: wb index 1054 * 1055 * Allocate a wb slot for use by the driver (all asics). 1056 * Returns 0 on success or -EINVAL on failure. 
1057 */ 1058 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1059 { 1060 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1061 1062 if (offset < adev->wb.num_wb) { 1063 __set_bit(offset, adev->wb.used); 1064 *wb = offset << 3; /* convert to dw offset */ 1065 return 0; 1066 } else { 1067 return -EINVAL; 1068 } 1069 } 1070 1071 /** 1072 * amdgpu_device_wb_free - Free a wb entry 1073 * 1074 * @adev: amdgpu_device pointer 1075 * @wb: wb index 1076 * 1077 * Free a wb slot allocated for use by the driver (all asics) 1078 */ 1079 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1080 { 1081 wb >>= 3; 1082 if (wb < adev->wb.num_wb) 1083 __clear_bit(wb, adev->wb.used); 1084 } 1085 1086 /** 1087 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1088 * 1089 * @adev: amdgpu_device pointer 1090 * 1091 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1092 * to fail, but if any of the BARs is not accessible after the size we abort 1093 * driver loading by returning -ENODEV. 1094 */ 1095 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1096 { 1097 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1098 struct pci_bus *root; 1099 struct resource *res; 1100 unsigned int i; 1101 u16 cmd; 1102 int r; 1103 1104 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1105 return 0; 1106 1107 /* Bypass for VF */ 1108 if (amdgpu_sriov_vf(adev)) 1109 return 0; 1110 1111 /* skip if the bios has already enabled large BAR */ 1112 if (adev->gmc.real_vram_size && 1113 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1114 return 0; 1115 1116 /* Check if the root BUS has 64bit memory resources */ 1117 root = adev->pdev->bus; 1118 while (root->parent) 1119 root = root->parent; 1120 1121 pci_bus_for_each_resource(root, res, i) { 1122 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1123 res->start > 0x100000000ull) 1124 break; 1125 } 1126 1127 /* Trying to resize is pointless without a root hub window above 4GB */ 1128 if (!res) 1129 return 0; 1130 1131 /* Limit the BAR size to what is available */ 1132 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1133 rbar_size); 1134 1135 /* Disable memory decoding while we change the BAR addresses and size */ 1136 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1137 pci_write_config_word(adev->pdev, PCI_COMMAND, 1138 cmd & ~PCI_COMMAND_MEMORY); 1139 1140 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1141 amdgpu_doorbell_fini(adev); 1142 if (adev->asic_type >= CHIP_BONAIRE) 1143 pci_release_resource(adev->pdev, 2); 1144 1145 pci_release_resource(adev->pdev, 0); 1146 1147 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1148 if (r == -ENOSPC) 1149 DRM_INFO("Not enough PCI address space for a large BAR."); 1150 else if (r && r != -ENOTSUPP) 1151 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1152 1153 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1154 1155 /* When the doorbell or fb BAR isn't available we have no chance of 1156 * using the device. 
1157 */ 1158 r = amdgpu_doorbell_init(adev); 1159 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1160 return -ENODEV; 1161 1162 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1163 1164 return 0; 1165 } 1166 1167 static bool amdgpu_device_read_bios(struct amdgpu_device *adev) 1168 { 1169 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) 1170 return false; 1171 1172 return true; 1173 } 1174 1175 /* 1176 * GPU helpers function. 1177 */ 1178 /** 1179 * amdgpu_device_need_post - check if the hw need post or not 1180 * 1181 * @adev: amdgpu_device pointer 1182 * 1183 * Check if the asic has been initialized (all asics) at driver startup 1184 * or post is needed if hw reset is performed. 1185 * Returns true if need or false if not. 1186 */ 1187 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1188 { 1189 uint32_t reg; 1190 1191 if (amdgpu_sriov_vf(adev)) 1192 return false; 1193 1194 if (!amdgpu_device_read_bios(adev)) 1195 return false; 1196 1197 if (amdgpu_passthrough(adev)) { 1198 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1199 * some old smc fw still need driver do vPost otherwise gpu hang, while 1200 * those smc fw version above 22.15 doesn't have this flaw, so we force 1201 * vpost executed for smc version below 22.15 1202 */ 1203 if (adev->asic_type == CHIP_FIJI) { 1204 int err; 1205 uint32_t fw_ver; 1206 1207 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1208 /* force vPost if error occured */ 1209 if (err) 1210 return true; 1211 1212 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1213 if (fw_ver < 0x00160e00) 1214 return true; 1215 } 1216 } 1217 1218 /* Don't post if we need to reset whole hive on init */ 1219 if (adev->gmc.xgmi.pending_reset) 1220 return false; 1221 1222 if (adev->has_hw_reset) { 1223 adev->has_hw_reset = false; 1224 return true; 1225 } 1226 1227 /* bios scratch used on CIK+ */ 1228 if (adev->asic_type >= CHIP_BONAIRE) 1229 return amdgpu_atombios_scratch_need_asic_init(adev); 1230 1231 /* check MEM_SIZE for older asics */ 1232 reg = amdgpu_asic_get_config_memsize(adev); 1233 1234 if ((reg != 0) && (reg != 0xffffffff)) 1235 return false; 1236 1237 return true; 1238 } 1239 1240 /* 1241 * On APUs with >= 64GB white flickering has been observed w/ SG enabled. 1242 * Disable S/G on such systems until we have a proper fix. 1243 * https://gitlab.freedesktop.org/drm/amd/-/issues/2354 1244 * https://gitlab.freedesktop.org/drm/amd/-/issues/2735 1245 */ 1246 bool amdgpu_sg_display_supported(struct amdgpu_device *adev) 1247 { 1248 switch (amdgpu_sg_display) { 1249 case -1: 1250 break; 1251 case 0: 1252 return false; 1253 case 1: 1254 return true; 1255 default: 1256 return false; 1257 } 1258 if ((totalram_pages() << (PAGE_SHIFT - 10)) + 1259 (adev->gmc.real_vram_size / 1024) >= 64000000) { 1260 DRM_WARN("Disabling S/G due to >=64GB RAM\n"); 1261 return false; 1262 } 1263 return true; 1264 } 1265 1266 /* 1267 * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic 1268 * speed switching. Until we have confirmation from Intel that a specific host 1269 * supports it, it's safer that we keep it disabled for all. 
1270 * 1271 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 1272 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 1273 */ 1274 bool amdgpu_device_pcie_dynamic_switching_supported(void) 1275 { 1276 #if IS_ENABLED(CONFIG_X86) 1277 struct cpuinfo_x86 *c = &cpu_data(0); 1278 1279 if (c->x86_vendor == X86_VENDOR_INTEL) 1280 return false; 1281 #endif 1282 return true; 1283 } 1284 1285 /** 1286 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1287 * 1288 * @adev: amdgpu_device pointer 1289 * 1290 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1291 * be set for this device. 1292 * 1293 * Returns true if it should be used or false if not. 1294 */ 1295 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1296 { 1297 switch (amdgpu_aspm) { 1298 case -1: 1299 break; 1300 case 0: 1301 return false; 1302 case 1: 1303 return true; 1304 default: 1305 return false; 1306 } 1307 return pcie_aspm_enabled(adev->pdev); 1308 } 1309 1310 bool amdgpu_device_aspm_support_quirk(void) 1311 { 1312 #if IS_ENABLED(CONFIG_X86) 1313 struct cpuinfo_x86 *c = &cpu_data(0); 1314 1315 return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE); 1316 #else 1317 return true; 1318 #endif 1319 } 1320 1321 /* if we get transitioned to only one device, take VGA back */ 1322 /** 1323 * amdgpu_device_vga_set_decode - enable/disable vga decode 1324 * 1325 * @pdev: PCI device pointer 1326 * @state: enable/disable vga decode 1327 * 1328 * Enable/disable vga decode (all asics). 1329 * Returns VGA resource flags. 1330 */ 1331 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1332 bool state) 1333 { 1334 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1335 1336 amdgpu_asic_set_vga_state(adev, state); 1337 if (state) 1338 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1339 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1340 else 1341 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1342 } 1343 1344 /** 1345 * amdgpu_device_check_block_size - validate the vm block size 1346 * 1347 * @adev: amdgpu_device pointer 1348 * 1349 * Validates the vm block size specified via module parameter. 1350 * The vm block size defines number of bits in page table versus page directory, 1351 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1352 * page table and the remaining bits are in the page directory. 1353 */ 1354 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1355 { 1356 /* defines number of bits in page table versus page directory, 1357 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1358 * page table and the remaining bits are in the page directory 1359 */ 1360 if (amdgpu_vm_block_size == -1) 1361 return; 1362 1363 if (amdgpu_vm_block_size < 9) { 1364 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1365 amdgpu_vm_block_size); 1366 amdgpu_vm_block_size = -1; 1367 } 1368 } 1369 1370 /** 1371 * amdgpu_device_check_vm_size - validate the vm size 1372 * 1373 * @adev: amdgpu_device pointer 1374 * 1375 * Validates the vm size in GB specified via module parameter. 1376 * The VM size is the size of the GPU virtual memory space in GB. 
1377 */ 1378 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1379 { 1380 /* no need to check the default value */ 1381 if (amdgpu_vm_size == -1) 1382 return; 1383 1384 if (amdgpu_vm_size < 1) { 1385 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1386 amdgpu_vm_size); 1387 amdgpu_vm_size = -1; 1388 } 1389 } 1390 1391 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1392 { 1393 struct sysinfo si; 1394 bool is_os_64 = (sizeof(void *) == 8); 1395 uint64_t total_memory; 1396 uint64_t dram_size_seven_GB = 0x1B8000000; 1397 uint64_t dram_size_three_GB = 0xB8000000; 1398 1399 if (amdgpu_smu_memory_pool_size == 0) 1400 return; 1401 1402 if (!is_os_64) { 1403 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1404 goto def_value; 1405 } 1406 si_meminfo(&si); 1407 total_memory = (uint64_t)si.totalram * si.mem_unit; 1408 1409 if ((amdgpu_smu_memory_pool_size == 1) || 1410 (amdgpu_smu_memory_pool_size == 2)) { 1411 if (total_memory < dram_size_three_GB) 1412 goto def_value1; 1413 } else if ((amdgpu_smu_memory_pool_size == 4) || 1414 (amdgpu_smu_memory_pool_size == 8)) { 1415 if (total_memory < dram_size_seven_GB) 1416 goto def_value1; 1417 } else { 1418 DRM_WARN("Smu memory pool size not supported\n"); 1419 goto def_value; 1420 } 1421 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1422 1423 return; 1424 1425 def_value1: 1426 DRM_WARN("No enough system memory\n"); 1427 def_value: 1428 adev->pm.smu_prv_buffer_size = 0; 1429 } 1430 1431 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1432 { 1433 if (!(adev->flags & AMD_IS_APU) || 1434 adev->asic_type < CHIP_RAVEN) 1435 return 0; 1436 1437 switch (adev->asic_type) { 1438 case CHIP_RAVEN: 1439 if (adev->pdev->device == 0x15dd) 1440 adev->apu_flags |= AMD_APU_IS_RAVEN; 1441 if (adev->pdev->device == 0x15d8) 1442 adev->apu_flags |= AMD_APU_IS_PICASSO; 1443 break; 1444 case CHIP_RENOIR: 1445 if ((adev->pdev->device == 0x1636) || 1446 (adev->pdev->device == 0x164c)) 1447 adev->apu_flags |= AMD_APU_IS_RENOIR; 1448 else 1449 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1450 break; 1451 case CHIP_VANGOGH: 1452 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1453 break; 1454 case CHIP_YELLOW_CARP: 1455 break; 1456 case CHIP_CYAN_SKILLFISH: 1457 if ((adev->pdev->device == 0x13FE) || 1458 (adev->pdev->device == 0x143F)) 1459 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1460 break; 1461 default: 1462 break; 1463 } 1464 1465 return 0; 1466 } 1467 1468 /** 1469 * amdgpu_device_check_arguments - validate module params 1470 * 1471 * @adev: amdgpu_device pointer 1472 * 1473 * Validates certain module parameters and updates 1474 * the associated values used by the driver (all asics). 
1475 */ 1476 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1477 { 1478 if (amdgpu_sched_jobs < 4) { 1479 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1480 amdgpu_sched_jobs); 1481 amdgpu_sched_jobs = 4; 1482 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 1483 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1484 amdgpu_sched_jobs); 1485 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1486 } 1487 1488 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1489 /* gart size must be greater or equal to 32M */ 1490 dev_warn(adev->dev, "gart size (%d) too small\n", 1491 amdgpu_gart_size); 1492 amdgpu_gart_size = -1; 1493 } 1494 1495 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1496 /* gtt size must be greater or equal to 32M */ 1497 dev_warn(adev->dev, "gtt size (%d) too small\n", 1498 amdgpu_gtt_size); 1499 amdgpu_gtt_size = -1; 1500 } 1501 1502 /* valid range is between 4 and 9 inclusive */ 1503 if (amdgpu_vm_fragment_size != -1 && 1504 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1505 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1506 amdgpu_vm_fragment_size = -1; 1507 } 1508 1509 if (amdgpu_sched_hw_submission < 2) { 1510 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1511 amdgpu_sched_hw_submission); 1512 amdgpu_sched_hw_submission = 2; 1513 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1514 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1515 amdgpu_sched_hw_submission); 1516 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1517 } 1518 1519 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 1520 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 1521 amdgpu_reset_method = -1; 1522 } 1523 1524 amdgpu_device_check_smu_prv_buffer_size(adev); 1525 1526 amdgpu_device_check_vm_size(adev); 1527 1528 amdgpu_device_check_block_size(adev); 1529 1530 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1531 1532 return 0; 1533 } 1534 1535 /** 1536 * amdgpu_switcheroo_set_state - set switcheroo state 1537 * 1538 * @pdev: pci dev pointer 1539 * @state: vga_switcheroo state 1540 * 1541 * Callback for the switcheroo driver. Suspends or resumes 1542 * the asics before or after it is powered up using ACPI methods. 
1543 */ 1544 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1545 enum vga_switcheroo_state state) 1546 { 1547 struct drm_device *dev = pci_get_drvdata(pdev); 1548 int r; 1549 1550 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1551 return; 1552 1553 if (state == VGA_SWITCHEROO_ON) { 1554 pr_info("switched on\n"); 1555 /* don't suspend or resume card normally */ 1556 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1557 1558 pci_set_power_state(pdev, PCI_D0); 1559 amdgpu_device_load_pci_state(pdev); 1560 r = pci_enable_device(pdev); 1561 if (r) 1562 DRM_WARN("pci_enable_device failed (%d)\n", r); 1563 amdgpu_device_resume(dev, true); 1564 1565 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1566 } else { 1567 pr_info("switched off\n"); 1568 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1569 amdgpu_device_suspend(dev, true); 1570 amdgpu_device_cache_pci_state(pdev); 1571 /* Shut down the device */ 1572 pci_disable_device(pdev); 1573 pci_set_power_state(pdev, PCI_D3cold); 1574 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1575 } 1576 } 1577 1578 /** 1579 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1580 * 1581 * @pdev: pci dev pointer 1582 * 1583 * Callback for the switcheroo driver. Check of the switcheroo 1584 * state can be changed. 1585 * Returns true if the state can be changed, false if not. 1586 */ 1587 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1588 { 1589 struct drm_device *dev = pci_get_drvdata(pdev); 1590 1591 /* 1592 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1593 * locking inversion with the driver load path. And the access here is 1594 * completely racy anyway. So don't bother with locking for now. 1595 */ 1596 return atomic_read(&dev->open_count) == 0; 1597 } 1598 1599 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1600 .set_gpu_state = amdgpu_switcheroo_set_state, 1601 .reprobe = NULL, 1602 .can_switch = amdgpu_switcheroo_can_switch, 1603 }; 1604 1605 /** 1606 * amdgpu_device_ip_set_clockgating_state - set the CG state 1607 * 1608 * @dev: amdgpu_device pointer 1609 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1610 * @state: clockgating state (gate or ungate) 1611 * 1612 * Sets the requested clockgating state for all instances of 1613 * the hardware IP specified. 1614 * Returns the error code from the last instance. 1615 */ 1616 int amdgpu_device_ip_set_clockgating_state(void *dev, 1617 enum amd_ip_block_type block_type, 1618 enum amd_clockgating_state state) 1619 { 1620 struct amdgpu_device *adev = dev; 1621 int i, r = 0; 1622 1623 for (i = 0; i < adev->num_ip_blocks; i++) { 1624 if (!adev->ip_blocks[i].status.valid) 1625 continue; 1626 if (adev->ip_blocks[i].version->type != block_type) 1627 continue; 1628 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1629 continue; 1630 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1631 (void *)adev, state); 1632 if (r) 1633 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1634 adev->ip_blocks[i].version->funcs->name, r); 1635 } 1636 return r; 1637 } 1638 1639 /** 1640 * amdgpu_device_ip_set_powergating_state - set the PG state 1641 * 1642 * @dev: amdgpu_device pointer 1643 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1644 * @state: powergating state (gate or ungate) 1645 * 1646 * Sets the requested powergating state for all instances of 1647 * the hardware IP specified. 1648 * Returns the error code from the last instance. 
1649 */ 1650 int amdgpu_device_ip_set_powergating_state(void *dev, 1651 enum amd_ip_block_type block_type, 1652 enum amd_powergating_state state) 1653 { 1654 struct amdgpu_device *adev = dev; 1655 int i, r = 0; 1656 1657 for (i = 0; i < adev->num_ip_blocks; i++) { 1658 if (!adev->ip_blocks[i].status.valid) 1659 continue; 1660 if (adev->ip_blocks[i].version->type != block_type) 1661 continue; 1662 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 1663 continue; 1664 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 1665 (void *)adev, state); 1666 if (r) 1667 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 1668 adev->ip_blocks[i].version->funcs->name, r); 1669 } 1670 return r; 1671 } 1672 1673 /** 1674 * amdgpu_device_ip_get_clockgating_state - get the CG state 1675 * 1676 * @adev: amdgpu_device pointer 1677 * @flags: clockgating feature flags 1678 * 1679 * Walks the list of IPs on the device and updates the clockgating 1680 * flags for each IP. 1681 * Updates @flags with the feature flags for each hardware IP where 1682 * clockgating is enabled. 1683 */ 1684 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 1685 u64 *flags) 1686 { 1687 int i; 1688 1689 for (i = 0; i < adev->num_ip_blocks; i++) { 1690 if (!adev->ip_blocks[i].status.valid) 1691 continue; 1692 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 1693 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 1694 } 1695 } 1696 1697 /** 1698 * amdgpu_device_ip_wait_for_idle - wait for idle 1699 * 1700 * @adev: amdgpu_device pointer 1701 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1702 * 1703 * Waits for the request hardware IP to be idle. 1704 * Returns 0 for success or a negative error code on failure. 1705 */ 1706 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 1707 enum amd_ip_block_type block_type) 1708 { 1709 int i, r; 1710 1711 for (i = 0; i < adev->num_ip_blocks; i++) { 1712 if (!adev->ip_blocks[i].status.valid) 1713 continue; 1714 if (adev->ip_blocks[i].version->type == block_type) { 1715 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 1716 if (r) 1717 return r; 1718 break; 1719 } 1720 } 1721 return 0; 1722 1723 } 1724 1725 /** 1726 * amdgpu_device_ip_is_idle - is the hardware IP idle 1727 * 1728 * @adev: amdgpu_device pointer 1729 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1730 * 1731 * Check if the hardware IP is idle or not. 1732 * Returns true if it the IP is idle, false if not. 1733 */ 1734 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 1735 enum amd_ip_block_type block_type) 1736 { 1737 int i; 1738 1739 for (i = 0; i < adev->num_ip_blocks; i++) { 1740 if (!adev->ip_blocks[i].status.valid) 1741 continue; 1742 if (adev->ip_blocks[i].version->type == block_type) 1743 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 1744 } 1745 return true; 1746 1747 } 1748 1749 /** 1750 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 1751 * 1752 * @adev: amdgpu_device pointer 1753 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 1754 * 1755 * Returns a pointer to the hardware IP block structure 1756 * if it exists for the asic, otherwise NULL. 
1757 */ 1758 struct amdgpu_ip_block * 1759 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 1760 enum amd_ip_block_type type) 1761 { 1762 int i; 1763 1764 for (i = 0; i < adev->num_ip_blocks; i++) 1765 if (adev->ip_blocks[i].version->type == type) 1766 return &adev->ip_blocks[i]; 1767 1768 return NULL; 1769 } 1770 1771 /** 1772 * amdgpu_device_ip_block_version_cmp 1773 * 1774 * @adev: amdgpu_device pointer 1775 * @type: enum amd_ip_block_type 1776 * @major: major version 1777 * @minor: minor version 1778 * 1779 * return 0 if equal or greater 1780 * return 1 if smaller or the ip_block doesn't exist 1781 */ 1782 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 1783 enum amd_ip_block_type type, 1784 u32 major, u32 minor) 1785 { 1786 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 1787 1788 if (ip_block && ((ip_block->version->major > major) || 1789 ((ip_block->version->major == major) && 1790 (ip_block->version->minor >= minor)))) 1791 return 0; 1792 1793 return 1; 1794 } 1795 1796 /** 1797 * amdgpu_device_ip_block_add 1798 * 1799 * @adev: amdgpu_device pointer 1800 * @ip_block_version: pointer to the IP to add 1801 * 1802 * Adds the IP block driver information to the collection of IPs 1803 * on the asic. 1804 */ 1805 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 1806 const struct amdgpu_ip_block_version *ip_block_version) 1807 { 1808 if (!ip_block_version) 1809 return -EINVAL; 1810 1811 switch (ip_block_version->type) { 1812 case AMD_IP_BLOCK_TYPE_VCN: 1813 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 1814 return 0; 1815 break; 1816 case AMD_IP_BLOCK_TYPE_JPEG: 1817 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 1818 return 0; 1819 break; 1820 default: 1821 break; 1822 } 1823 1824 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 1825 ip_block_version->funcs->name); 1826 1827 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 1828 1829 return 0; 1830 } 1831 1832 /** 1833 * amdgpu_device_enable_virtual_display - enable virtual display feature 1834 * 1835 * @adev: amdgpu_device pointer 1836 * 1837 * Enabled the virtual display feature if the user has enabled it via 1838 * the module parameter virtual_display. This feature provides a virtual 1839 * display hardware on headless boards or in virtualized environments. 1840 * This function parses and validates the configuration string specified by 1841 * the user and configues the virtual display configuration (number of 1842 * virtual connectors, crtcs, etc.) specified. 
1843 */ 1844 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 1845 { 1846 adev->enable_virtual_display = false; 1847 1848 if (amdgpu_virtual_display) { 1849 const char *pci_address_name = pci_name(adev->pdev); 1850 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 1851 1852 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 1853 pciaddstr_tmp = pciaddstr; 1854 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 1855 pciaddname = strsep(&pciaddname_tmp, ","); 1856 if (!strcmp("all", pciaddname) 1857 || !strcmp(pci_address_name, pciaddname)) { 1858 long num_crtc; 1859 int res = -1; 1860 1861 adev->enable_virtual_display = true; 1862 1863 if (pciaddname_tmp) 1864 res = kstrtol(pciaddname_tmp, 10, 1865 &num_crtc); 1866 1867 if (!res) { 1868 if (num_crtc < 1) 1869 num_crtc = 1; 1870 if (num_crtc > 6) 1871 num_crtc = 6; 1872 adev->mode_info.num_crtc = num_crtc; 1873 } else { 1874 adev->mode_info.num_crtc = 1; 1875 } 1876 break; 1877 } 1878 } 1879 1880 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 1881 amdgpu_virtual_display, pci_address_name, 1882 adev->enable_virtual_display, adev->mode_info.num_crtc); 1883 1884 kfree(pciaddstr); 1885 } 1886 } 1887 1888 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 1889 { 1890 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 1891 adev->mode_info.num_crtc = 1; 1892 adev->enable_virtual_display = true; 1893 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 1894 adev->enable_virtual_display, adev->mode_info.num_crtc); 1895 } 1896 } 1897 1898 /** 1899 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 1900 * 1901 * @adev: amdgpu_device pointer 1902 * 1903 * Parses the asic configuration parameters specified in the gpu info 1904 * firmware and makes them availale to the driver for use in configuring 1905 * the asic. 1906 * Returns 0 on success, -EINVAL on failure. 1907 */ 1908 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 1909 { 1910 const char *chip_name; 1911 char fw_name[40]; 1912 int err; 1913 const struct gpu_info_firmware_header_v1_0 *hdr; 1914 1915 adev->firmware.gpu_info_fw = NULL; 1916 1917 if (adev->mman.discovery_bin) { 1918 /* 1919 * FIXME: The bounding box is still needed by Navi12, so 1920 * temporarily read it from gpu_info firmware. Should be dropped 1921 * when DAL no longer needs it. 
1922 */ 1923 if (adev->asic_type != CHIP_NAVI12) 1924 return 0; 1925 } 1926 1927 switch (adev->asic_type) { 1928 default: 1929 return 0; 1930 case CHIP_VEGA10: 1931 chip_name = "vega10"; 1932 break; 1933 case CHIP_VEGA12: 1934 chip_name = "vega12"; 1935 break; 1936 case CHIP_RAVEN: 1937 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1938 chip_name = "raven2"; 1939 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1940 chip_name = "picasso"; 1941 else 1942 chip_name = "raven"; 1943 break; 1944 case CHIP_ARCTURUS: 1945 chip_name = "arcturus"; 1946 break; 1947 case CHIP_NAVI12: 1948 chip_name = "navi12"; 1949 break; 1950 } 1951 1952 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 1953 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name); 1954 if (err) { 1955 dev_err(adev->dev, 1956 "Failed to get gpu_info firmware \"%s\"\n", 1957 fw_name); 1958 goto out; 1959 } 1960 1961 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 1962 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 1963 1964 switch (hdr->version_major) { 1965 case 1: 1966 { 1967 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 1968 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 1969 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1970 1971 /* 1972 * Should be droped when DAL no longer needs it. 1973 */ 1974 if (adev->asic_type == CHIP_NAVI12) 1975 goto parse_soc_bounding_box; 1976 1977 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 1978 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 1979 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 1980 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 1981 adev->gfx.config.max_texture_channel_caches = 1982 le32_to_cpu(gpu_info_fw->gc_num_tccs); 1983 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 1984 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 1985 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 1986 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 1987 adev->gfx.config.double_offchip_lds_buf = 1988 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 1989 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 1990 adev->gfx.cu_info.max_waves_per_simd = 1991 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 1992 adev->gfx.cu_info.max_scratch_slots_per_cu = 1993 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 1994 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 1995 if (hdr->version_minor >= 1) { 1996 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 1997 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 1998 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1999 adev->gfx.config.num_sc_per_sh = 2000 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2001 adev->gfx.config.num_packer_per_sc = 2002 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2003 } 2004 2005 parse_soc_bounding_box: 2006 /* 2007 * soc bounding box info is not integrated in disocovery table, 2008 * we always need to parse it from gpu info firmware if needed. 
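 *
 * Only the v1_2 layout of the gpu_info firmware carries that bounding box;
 * see the hdr->version_minor == 2 branch just below.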
2009 */ 2010 if (hdr->version_minor == 2) { 2011 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2012 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2013 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2014 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2015 } 2016 break; 2017 } 2018 default: 2019 dev_err(adev->dev, 2020 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2021 err = -EINVAL; 2022 goto out; 2023 } 2024 out: 2025 return err; 2026 } 2027 2028 /** 2029 * amdgpu_device_ip_early_init - run early init for hardware IPs 2030 * 2031 * @adev: amdgpu_device pointer 2032 * 2033 * Early initialization pass for hardware IPs. The hardware IPs that make 2034 * up each asic are discovered each IP's early_init callback is run. This 2035 * is the first stage in initializing the asic. 2036 * Returns 0 on success, negative error code on failure. 2037 */ 2038 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2039 { 2040 struct drm_device *dev = adev_to_drm(adev); 2041 struct pci_dev *parent; 2042 int i, r; 2043 bool total; 2044 2045 amdgpu_device_enable_virtual_display(adev); 2046 2047 if (amdgpu_sriov_vf(adev)) { 2048 r = amdgpu_virt_request_full_gpu(adev, true); 2049 if (r) 2050 return r; 2051 } 2052 2053 switch (adev->asic_type) { 2054 #ifdef CONFIG_DRM_AMDGPU_SI 2055 case CHIP_VERDE: 2056 case CHIP_TAHITI: 2057 case CHIP_PITCAIRN: 2058 case CHIP_OLAND: 2059 case CHIP_HAINAN: 2060 adev->family = AMDGPU_FAMILY_SI; 2061 r = si_set_ip_blocks(adev); 2062 if (r) 2063 return r; 2064 break; 2065 #endif 2066 #ifdef CONFIG_DRM_AMDGPU_CIK 2067 case CHIP_BONAIRE: 2068 case CHIP_HAWAII: 2069 case CHIP_KAVERI: 2070 case CHIP_KABINI: 2071 case CHIP_MULLINS: 2072 if (adev->flags & AMD_IS_APU) 2073 adev->family = AMDGPU_FAMILY_KV; 2074 else 2075 adev->family = AMDGPU_FAMILY_CI; 2076 2077 r = cik_set_ip_blocks(adev); 2078 if (r) 2079 return r; 2080 break; 2081 #endif 2082 case CHIP_TOPAZ: 2083 case CHIP_TONGA: 2084 case CHIP_FIJI: 2085 case CHIP_POLARIS10: 2086 case CHIP_POLARIS11: 2087 case CHIP_POLARIS12: 2088 case CHIP_VEGAM: 2089 case CHIP_CARRIZO: 2090 case CHIP_STONEY: 2091 if (adev->flags & AMD_IS_APU) 2092 adev->family = AMDGPU_FAMILY_CZ; 2093 else 2094 adev->family = AMDGPU_FAMILY_VI; 2095 2096 r = vi_set_ip_blocks(adev); 2097 if (r) 2098 return r; 2099 break; 2100 default: 2101 r = amdgpu_discovery_set_ip_blocks(adev); 2102 if (r) 2103 return r; 2104 break; 2105 } 2106 2107 if (amdgpu_has_atpx() && 2108 (amdgpu_is_atpx_hybrid() || 2109 amdgpu_has_atpx_dgpu_power_cntl()) && 2110 ((adev->flags & AMD_IS_APU) == 0) && 2111 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev))) 2112 adev->flags |= AMD_IS_PX; 2113 2114 if (!(adev->flags & AMD_IS_APU)) { 2115 parent = pci_upstream_bridge(adev->pdev); 2116 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2117 } 2118 2119 2120 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2121 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2122 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2123 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2124 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2125 2126 total = true; 2127 for (i = 0; i < adev->num_ip_blocks; i++) { 2128 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2129 DRM_WARN("disabled ip block: %d <%s>\n", 2130 i, adev->ip_blocks[i].version->funcs->name); 2131 adev->ip_blocks[i].status.valid = false; 2132 } else { 2133 if (adev->ip_blocks[i].version->funcs->early_init) { 2134 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2135 if (r == -ENOENT) { 2136 adev->ip_blocks[i].status.valid = false; 2137 } else if (r) { 2138 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2139 adev->ip_blocks[i].version->funcs->name, r); 2140 total = false; 2141 } else { 2142 adev->ip_blocks[i].status.valid = true; 2143 } 2144 } else { 2145 adev->ip_blocks[i].status.valid = true; 2146 } 2147 } 2148 /* get the vbios after the asic_funcs are set up */ 2149 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2150 r = amdgpu_device_parse_gpu_info_fw(adev); 2151 if (r) 2152 return r; 2153 2154 /* Read BIOS */ 2155 if (amdgpu_device_read_bios(adev)) { 2156 if (!amdgpu_get_bios(adev)) 2157 return -EINVAL; 2158 2159 r = amdgpu_atombios_init(adev); 2160 if (r) { 2161 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2162 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2163 return r; 2164 } 2165 } 2166 2167 /*get pf2vf msg info at it's earliest time*/ 2168 if (amdgpu_sriov_vf(adev)) 2169 amdgpu_virt_init_data_exchange(adev); 2170 2171 } 2172 } 2173 if (!total) 2174 return -ENODEV; 2175 2176 amdgpu_amdkfd_device_probe(adev); 2177 adev->cg_flags &= amdgpu_cg_mask; 2178 adev->pg_flags &= amdgpu_pg_mask; 2179 2180 return 0; 2181 } 2182 2183 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2184 { 2185 int i, r; 2186 2187 for (i = 0; i < adev->num_ip_blocks; i++) { 2188 if (!adev->ip_blocks[i].status.sw) 2189 continue; 2190 if (adev->ip_blocks[i].status.hw) 2191 continue; 2192 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2193 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2194 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2195 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2196 if (r) { 2197 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2198 adev->ip_blocks[i].version->funcs->name, r); 2199 return r; 2200 } 2201 adev->ip_blocks[i].status.hw = true; 2202 } 2203 } 2204 2205 return 0; 2206 } 2207 2208 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2209 { 2210 int i, r; 2211 2212 for (i = 0; i < adev->num_ip_blocks; i++) { 2213 if (!adev->ip_blocks[i].status.sw) 2214 continue; 2215 if (adev->ip_blocks[i].status.hw) 2216 continue; 2217 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2218 if (r) { 2219 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2220 adev->ip_blocks[i].version->funcs->name, r); 2221 return r; 2222 } 2223 adev->ip_blocks[i].status.hw = true; 2224 } 2225 2226 return 0; 2227 } 2228 2229 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2230 { 2231 int r = 0; 2232 int i; 2233 uint32_t smu_version; 2234 2235 if (adev->asic_type >= CHIP_VEGA10) { 2236 for (i = 0; i < adev->num_ip_blocks; i++) { 2237 if 
(adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2238 continue; 2239 2240 if (!adev->ip_blocks[i].status.sw) 2241 continue; 2242 2243 /* no need to do the fw loading again if already done*/ 2244 if (adev->ip_blocks[i].status.hw == true) 2245 break; 2246 2247 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2248 r = adev->ip_blocks[i].version->funcs->resume(adev); 2249 if (r) { 2250 DRM_ERROR("resume of IP block <%s> failed %d\n", 2251 adev->ip_blocks[i].version->funcs->name, r); 2252 return r; 2253 } 2254 } else { 2255 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2256 if (r) { 2257 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2258 adev->ip_blocks[i].version->funcs->name, r); 2259 return r; 2260 } 2261 } 2262 2263 adev->ip_blocks[i].status.hw = true; 2264 break; 2265 } 2266 } 2267 2268 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2269 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2270 2271 return r; 2272 } 2273 2274 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2275 { 2276 long timeout; 2277 int r, i; 2278 2279 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2280 struct amdgpu_ring *ring = adev->rings[i]; 2281 2282 /* No need to setup the GPU scheduler for rings that don't need it */ 2283 if (!ring || ring->no_scheduler) 2284 continue; 2285 2286 switch (ring->funcs->type) { 2287 case AMDGPU_RING_TYPE_GFX: 2288 timeout = adev->gfx_timeout; 2289 break; 2290 case AMDGPU_RING_TYPE_COMPUTE: 2291 timeout = adev->compute_timeout; 2292 break; 2293 case AMDGPU_RING_TYPE_SDMA: 2294 timeout = adev->sdma_timeout; 2295 break; 2296 default: 2297 timeout = adev->video_timeout; 2298 break; 2299 } 2300 2301 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2302 ring->num_hw_submission, 0, 2303 timeout, adev->reset_domain->wq, 2304 ring->sched_score, ring->name, 2305 adev->dev); 2306 if (r) { 2307 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2308 ring->name); 2309 return r; 2310 } 2311 } 2312 2313 amdgpu_xcp_update_partition_sched_list(adev); 2314 2315 return 0; 2316 } 2317 2318 2319 /** 2320 * amdgpu_device_ip_init - run init for hardware IPs 2321 * 2322 * @adev: amdgpu_device pointer 2323 * 2324 * Main initialization pass for hardware IPs. The list of all the hardware 2325 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2326 * are run. sw_init initializes the software state associated with each IP 2327 * and hw_init initializes the hardware associated with each IP. 2328 * Returns 0 on success, negative error code on failure. 
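 *
 * A rough sketch of the ordering implemented below (SR-IOV data exchange,
 * CSA allocation and error handling omitted):
 *
 *   sw_init for every valid IP block
 *     (COMMON and GMC additionally get hw_init early, together with the
 *      scratch memory and writeback setup, so GPU allocations work)
 *   amdgpu_ib_pool_init() and amdgpu_ucode_create_bo()
 *   amdgpu_device_ip_hw_init_phase1()  - COMMON, IH (and PSP on SR-IOV)
 *   amdgpu_device_fw_loading()         - PSP hw init/resume and SMU firmware
 *   amdgpu_device_ip_hw_init_phase2()  - all remaining blocks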
2329 */ 2330 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2331 { 2332 int i, r; 2333 2334 r = amdgpu_ras_init(adev); 2335 if (r) 2336 return r; 2337 2338 for (i = 0; i < adev->num_ip_blocks; i++) { 2339 if (!adev->ip_blocks[i].status.valid) 2340 continue; 2341 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2342 if (r) { 2343 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2344 adev->ip_blocks[i].version->funcs->name, r); 2345 goto init_failed; 2346 } 2347 adev->ip_blocks[i].status.sw = true; 2348 2349 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2350 /* need to do common hw init early so everything is set up for gmc */ 2351 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2352 if (r) { 2353 DRM_ERROR("hw_init %d failed %d\n", i, r); 2354 goto init_failed; 2355 } 2356 adev->ip_blocks[i].status.hw = true; 2357 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2358 /* need to do gmc hw init early so we can allocate gpu mem */ 2359 /* Try to reserve bad pages early */ 2360 if (amdgpu_sriov_vf(adev)) 2361 amdgpu_virt_exchange_data(adev); 2362 2363 r = amdgpu_device_mem_scratch_init(adev); 2364 if (r) { 2365 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2366 goto init_failed; 2367 } 2368 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2369 if (r) { 2370 DRM_ERROR("hw_init %d failed %d\n", i, r); 2371 goto init_failed; 2372 } 2373 r = amdgpu_device_wb_init(adev); 2374 if (r) { 2375 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2376 goto init_failed; 2377 } 2378 adev->ip_blocks[i].status.hw = true; 2379 2380 /* right after GMC hw init, we create CSA */ 2381 if (adev->gfx.mcbp) { 2382 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2383 AMDGPU_GEM_DOMAIN_VRAM | 2384 AMDGPU_GEM_DOMAIN_GTT, 2385 AMDGPU_CSA_SIZE); 2386 if (r) { 2387 DRM_ERROR("allocate CSA failed %d\n", r); 2388 goto init_failed; 2389 } 2390 } 2391 } 2392 } 2393 2394 if (amdgpu_sriov_vf(adev)) 2395 amdgpu_virt_init_data_exchange(adev); 2396 2397 r = amdgpu_ib_pool_init(adev); 2398 if (r) { 2399 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2400 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2401 goto init_failed; 2402 } 2403 2404 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2405 if (r) 2406 goto init_failed; 2407 2408 r = amdgpu_device_ip_hw_init_phase1(adev); 2409 if (r) 2410 goto init_failed; 2411 2412 r = amdgpu_device_fw_loading(adev); 2413 if (r) 2414 goto init_failed; 2415 2416 r = amdgpu_device_ip_hw_init_phase2(adev); 2417 if (r) 2418 goto init_failed; 2419 2420 /* 2421 * retired pages will be loaded from eeprom and reserved here, 2422 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2423 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2424 * for I2C communication which only true at this point. 2425 * 2426 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2427 * failure from bad gpu situation and stop amdgpu init process 2428 * accordingly. For other failed cases, it will still release all 2429 * the resource and print error message, rather than returning one 2430 * negative value to upper level. 
2431 * 2432 * Note: theoretically, this should be called before all vram allocations 2433 * to protect retired page from abusing 2434 */ 2435 r = amdgpu_ras_recovery_init(adev); 2436 if (r) 2437 goto init_failed; 2438 2439 /** 2440 * In case of XGMI grab extra reference for reset domain for this device 2441 */ 2442 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2443 if (amdgpu_xgmi_add_device(adev) == 0) { 2444 if (!amdgpu_sriov_vf(adev)) { 2445 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2446 2447 if (WARN_ON(!hive)) { 2448 r = -ENOENT; 2449 goto init_failed; 2450 } 2451 2452 if (!hive->reset_domain || 2453 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2454 r = -ENOENT; 2455 amdgpu_put_xgmi_hive(hive); 2456 goto init_failed; 2457 } 2458 2459 /* Drop the early temporary reset domain we created for device */ 2460 amdgpu_reset_put_reset_domain(adev->reset_domain); 2461 adev->reset_domain = hive->reset_domain; 2462 amdgpu_put_xgmi_hive(hive); 2463 } 2464 } 2465 } 2466 2467 r = amdgpu_device_init_schedulers(adev); 2468 if (r) 2469 goto init_failed; 2470 2471 /* Don't init kfd if whole hive need to be reset during init */ 2472 if (!adev->gmc.xgmi.pending_reset) { 2473 kgd2kfd_init_zone_device(adev); 2474 amdgpu_amdkfd_device_init(adev); 2475 } 2476 2477 amdgpu_fru_get_product_info(adev); 2478 2479 init_failed: 2480 2481 return r; 2482 } 2483 2484 /** 2485 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2486 * 2487 * @adev: amdgpu_device pointer 2488 * 2489 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2490 * this function before a GPU reset. If the value is retained after a 2491 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 2492 */ 2493 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2494 { 2495 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2496 } 2497 2498 /** 2499 * amdgpu_device_check_vram_lost - check if vram is valid 2500 * 2501 * @adev: amdgpu_device pointer 2502 * 2503 * Checks the reset magic value written to the gart pointer in VRAM. 2504 * The driver calls this after a GPU reset to see if the contents of 2505 * VRAM is lost or now. 2506 * returns true if vram is lost, false if not. 2507 */ 2508 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2509 { 2510 if (memcmp(adev->gart.ptr, adev->reset_magic, 2511 AMDGPU_RESET_MAGIC_NUM)) 2512 return true; 2513 2514 if (!amdgpu_in_reset(adev)) 2515 return false; 2516 2517 /* 2518 * For all ASICs with baco/mode1 reset, the VRAM is 2519 * always assumed to be lost. 2520 */ 2521 switch (amdgpu_asic_reset_method(adev)) { 2522 case AMD_RESET_METHOD_BACO: 2523 case AMD_RESET_METHOD_MODE1: 2524 return true; 2525 default: 2526 return false; 2527 } 2528 } 2529 2530 /** 2531 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2532 * 2533 * @adev: amdgpu_device pointer 2534 * @state: clockgating state (gate or ungate) 2535 * 2536 * The list of all the hardware IPs that make up the asic is walked and the 2537 * set_clockgating_state callbacks are run. 2538 * Late initialization pass enabling clockgating for hardware IPs. 2539 * Fini or suspend, pass disabling clockgating for hardware IPs. 2540 * Returns 0 on success, negative error code on failure. 
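 *
 * Typical usage elsewhere in this file (illustrative):
 *
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);   - from late init
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); - before fini/suspend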
2541 */ 2542 2543 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2544 enum amd_clockgating_state state) 2545 { 2546 int i, j, r; 2547 2548 if (amdgpu_emu_mode == 1) 2549 return 0; 2550 2551 for (j = 0; j < adev->num_ip_blocks; j++) { 2552 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2553 if (!adev->ip_blocks[i].status.late_initialized) 2554 continue; 2555 /* skip CG for GFX, SDMA on S0ix */ 2556 if (adev->in_s0ix && 2557 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2558 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2559 continue; 2560 /* skip CG for VCE/UVD, it's handled specially */ 2561 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2562 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2563 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2564 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2565 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2566 /* enable clockgating to save power */ 2567 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2568 state); 2569 if (r) { 2570 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2571 adev->ip_blocks[i].version->funcs->name, r); 2572 return r; 2573 } 2574 } 2575 } 2576 2577 return 0; 2578 } 2579 2580 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2581 enum amd_powergating_state state) 2582 { 2583 int i, j, r; 2584 2585 if (amdgpu_emu_mode == 1) 2586 return 0; 2587 2588 for (j = 0; j < adev->num_ip_blocks; j++) { 2589 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2590 if (!adev->ip_blocks[i].status.late_initialized) 2591 continue; 2592 /* skip PG for GFX, SDMA on S0ix */ 2593 if (adev->in_s0ix && 2594 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2595 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2596 continue; 2597 /* skip CG for VCE/UVD, it's handled specially */ 2598 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2599 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2600 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2601 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2602 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2603 /* enable powergating to save power */ 2604 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2605 state); 2606 if (r) { 2607 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2608 adev->ip_blocks[i].version->funcs->name, r); 2609 return r; 2610 } 2611 } 2612 } 2613 return 0; 2614 } 2615 2616 static int amdgpu_device_enable_mgpu_fan_boost(void) 2617 { 2618 struct amdgpu_gpu_instance *gpu_ins; 2619 struct amdgpu_device *adev; 2620 int i, ret = 0; 2621 2622 mutex_lock(&mgpu_info.mutex); 2623 2624 /* 2625 * MGPU fan boost feature should be enabled 2626 * only when there are two or more dGPUs in 2627 * the system 2628 */ 2629 if (mgpu_info.num_dgpu < 2) 2630 goto out; 2631 2632 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2633 gpu_ins = &(mgpu_info.gpu_ins[i]); 2634 adev = gpu_ins->adev; 2635 if (!(adev->flags & AMD_IS_APU) && 2636 !gpu_ins->mgpu_fan_enabled) { 2637 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2638 if (ret) 2639 break; 2640 2641 gpu_ins->mgpu_fan_enabled = 1; 2642 } 2643 } 2644 2645 out: 2646 mutex_unlock(&mgpu_info.mutex); 2647 2648 return ret; 2649 } 2650 2651 /** 2652 * amdgpu_device_ip_late_init - run late init for hardware IPs 2653 * 2654 * @adev: 
amdgpu_device pointer 2655 * 2656 * Late initialization pass for hardware IPs. The list of all the hardware 2657 * IPs that make up the asic is walked and the late_init callbacks are run. 2658 * late_init covers any special initialization that an IP requires 2659 * after all of the IPs have been initialized or something that needs to happen 2660 * late in the init process. 2661 * Returns 0 on success, negative error code on failure. 2662 */ 2663 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2664 { 2665 struct amdgpu_gpu_instance *gpu_instance; 2666 int i = 0, r; 2667 2668 for (i = 0; i < adev->num_ip_blocks; i++) { 2669 if (!adev->ip_blocks[i].status.hw) 2670 continue; 2671 if (adev->ip_blocks[i].version->funcs->late_init) { 2672 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2673 if (r) { 2674 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2675 adev->ip_blocks[i].version->funcs->name, r); 2676 return r; 2677 } 2678 } 2679 adev->ip_blocks[i].status.late_initialized = true; 2680 } 2681 2682 r = amdgpu_ras_late_init(adev); 2683 if (r) { 2684 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 2685 return r; 2686 } 2687 2688 amdgpu_ras_set_error_query_ready(adev, true); 2689 2690 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2691 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2692 2693 amdgpu_device_fill_reset_magic(adev); 2694 2695 r = amdgpu_device_enable_mgpu_fan_boost(); 2696 if (r) 2697 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2698 2699 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */ 2700 if (amdgpu_passthrough(adev) && 2701 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 2702 adev->asic_type == CHIP_ALDEBARAN)) 2703 amdgpu_dpm_handle_passthrough_sbr(adev, true); 2704 2705 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2706 mutex_lock(&mgpu_info.mutex); 2707 2708 /* 2709 * Reset device p-state to low as this was booted with high. 2710 * 2711 * This should be performed only after all devices from the same 2712 * hive get initialized. 2713 * 2714 * However, the number of devices in the hive is not known in advance, 2715 * as it is counted one by one during device initialization. 2716 * 2717 * So, we wait for all XGMI interlinked devices to be initialized. 2718 * This may bring some delays as those devices may come from 2719 * different hives. But that should be OK.
2720 */ 2721 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2722 for (i = 0; i < mgpu_info.num_gpu; i++) { 2723 gpu_instance = &(mgpu_info.gpu_ins[i]); 2724 if (gpu_instance->adev->flags & AMD_IS_APU) 2725 continue; 2726 2727 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2728 AMDGPU_XGMI_PSTATE_MIN); 2729 if (r) { 2730 DRM_ERROR("pstate setting failed (%d).\n", r); 2731 break; 2732 } 2733 } 2734 } 2735 2736 mutex_unlock(&mgpu_info.mutex); 2737 } 2738 2739 return 0; 2740 } 2741 2742 /** 2743 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2744 * 2745 * @adev: amdgpu_device pointer 2746 * 2747 * For ASICs that need to disable the SMC first 2748 */ 2749 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2750 { 2751 int i, r; 2752 2753 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) 2754 return; 2755 2756 for (i = 0; i < adev->num_ip_blocks; i++) { 2757 if (!adev->ip_blocks[i].status.hw) 2758 continue; 2759 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2760 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2761 /* XXX handle errors */ 2762 if (r) { 2763 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2764 adev->ip_blocks[i].version->funcs->name, r); 2765 } 2766 adev->ip_blocks[i].status.hw = false; 2767 break; 2768 } 2769 } 2770 } 2771 2772 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2773 { 2774 int i, r; 2775 2776 for (i = 0; i < adev->num_ip_blocks; i++) { 2777 if (!adev->ip_blocks[i].version->funcs->early_fini) 2778 continue; 2779 2780 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2781 if (r) { 2782 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2783 adev->ip_blocks[i].version->funcs->name, r); 2784 } 2785 } 2786 2787 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2788 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2789 2790 amdgpu_amdkfd_suspend(adev, false); 2791 2792 /* Workaround for ASICs that need to disable the SMC first */ 2793 amdgpu_device_smu_fini_early(adev); 2794 2795 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2796 if (!adev->ip_blocks[i].status.hw) 2797 continue; 2798 2799 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2800 /* XXX handle errors */ 2801 if (r) { 2802 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2803 adev->ip_blocks[i].version->funcs->name, r); 2804 } 2805 2806 adev->ip_blocks[i].status.hw = false; 2807 } 2808 2809 if (amdgpu_sriov_vf(adev)) { 2810 if (amdgpu_virt_release_full_gpu(adev, false)) 2811 DRM_ERROR("failed to release exclusive mode on fini\n"); 2812 } 2813 2814 return 0; 2815 } 2816 2817 /** 2818 * amdgpu_device_ip_fini - run fini for hardware IPs 2819 * 2820 * @adev: amdgpu_device pointer 2821 * 2822 * Main teardown pass for hardware IPs. The list of all the hardware 2823 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2824 * are run. hw_fini tears down the hardware associated with each IP 2825 * and sw_fini tears down any software state associated with each IP. 2826 * Returns 0 on success, negative error code on failure.
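 *
 * Note that hw_fini for the blocks is expected to have been run already via
 * amdgpu_device_ip_fini_early(); the loops below walk the blocks in reverse
 * order running sw_fini (the GMC block also releases the ucode BO, CSA,
 * writeback, scratch memory and IB pool) and then late_fini.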
2827 */ 2828 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2829 { 2830 int i, r; 2831 2832 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2833 amdgpu_virt_release_ras_err_handler_data(adev); 2834 2835 if (adev->gmc.xgmi.num_physical_nodes > 1) 2836 amdgpu_xgmi_remove_device(adev); 2837 2838 amdgpu_amdkfd_device_fini_sw(adev); 2839 2840 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2841 if (!adev->ip_blocks[i].status.sw) 2842 continue; 2843 2844 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2845 amdgpu_ucode_free_bo(adev); 2846 amdgpu_free_static_csa(&adev->virt.csa_obj); 2847 amdgpu_device_wb_fini(adev); 2848 amdgpu_device_mem_scratch_fini(adev); 2849 amdgpu_ib_pool_fini(adev); 2850 } 2851 2852 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2853 /* XXX handle errors */ 2854 if (r) { 2855 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2856 adev->ip_blocks[i].version->funcs->name, r); 2857 } 2858 adev->ip_blocks[i].status.sw = false; 2859 adev->ip_blocks[i].status.valid = false; 2860 } 2861 2862 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2863 if (!adev->ip_blocks[i].status.late_initialized) 2864 continue; 2865 if (adev->ip_blocks[i].version->funcs->late_fini) 2866 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2867 adev->ip_blocks[i].status.late_initialized = false; 2868 } 2869 2870 amdgpu_ras_fini(adev); 2871 2872 return 0; 2873 } 2874 2875 /** 2876 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2877 * 2878 * @work: work_struct. 2879 */ 2880 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2881 { 2882 struct amdgpu_device *adev = 2883 container_of(work, struct amdgpu_device, delayed_init_work.work); 2884 int r; 2885 2886 r = amdgpu_ib_ring_tests(adev); 2887 if (r) 2888 DRM_ERROR("ib ring test failed (%d).\n", r); 2889 } 2890 2891 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2892 { 2893 struct amdgpu_device *adev = 2894 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2895 2896 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2897 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2898 2899 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2900 adev->gfx.gfx_off_state = true; 2901 } 2902 2903 /** 2904 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2905 * 2906 * @adev: amdgpu_device pointer 2907 * 2908 * Main suspend function for hardware IPs. The list of all the hardware 2909 * IPs that make up the asic is walked, clockgating is disabled and the 2910 * suspend callbacks are run. suspend puts the hardware and software state 2911 * in each IP into a state suitable for suspend. 2912 * Returns 0 on success, negative error code on failure. 2913 */ 2914 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2915 { 2916 int i, r; 2917 2918 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2919 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2920 2921 /* 2922 * Per PMFW team's suggestion, driver needs to handle gfxoff 2923 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 2924 * scenario. Add the missing df cstate disablement here. 
2925 */ 2926 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 2927 dev_warn(adev->dev, "Failed to disallow df cstate"); 2928 2929 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2930 if (!adev->ip_blocks[i].status.valid) 2931 continue; 2932 2933 /* displays are handled separately */ 2934 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2935 continue; 2936 2937 /* XXX handle errors */ 2938 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2939 /* XXX handle errors */ 2940 if (r) { 2941 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2942 adev->ip_blocks[i].version->funcs->name, r); 2943 return r; 2944 } 2945 2946 adev->ip_blocks[i].status.hw = false; 2947 } 2948 2949 return 0; 2950 } 2951 2952 /** 2953 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2954 * 2955 * @adev: amdgpu_device pointer 2956 * 2957 * Main suspend function for hardware IPs. The list of all the hardware 2958 * IPs that make up the asic is walked, clockgating is disabled and the 2959 * suspend callbacks are run. suspend puts the hardware and software state 2960 * in each IP into a state suitable for suspend. 2961 * Returns 0 on success, negative error code on failure. 2962 */ 2963 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2964 { 2965 int i, r; 2966 2967 if (adev->in_s0ix) 2968 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 2969 2970 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2971 if (!adev->ip_blocks[i].status.valid) 2972 continue; 2973 /* displays are handled in phase1 */ 2974 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2975 continue; 2976 /* PSP lost connection when err_event_athub occurs */ 2977 if (amdgpu_ras_intr_triggered() && 2978 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2979 adev->ip_blocks[i].status.hw = false; 2980 continue; 2981 } 2982 2983 /* skip unnecessary suspend if we do not initialize them yet */ 2984 if (adev->gmc.xgmi.pending_reset && 2985 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2986 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 2987 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2988 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 2989 adev->ip_blocks[i].status.hw = false; 2990 continue; 2991 } 2992 2993 /* skip suspend of gfx/mes and psp for S0ix 2994 * gfx is in gfxoff state, so on resume it will exit gfxoff just 2995 * like at runtime. PSP is also part of the always on hardware 2996 * so no need to suspend it. 2997 */ 2998 if (adev->in_s0ix && 2999 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3000 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3001 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3002 continue; 3003 3004 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3005 if (adev->in_s0ix && 3006 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) && 3007 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3008 continue; 3009 3010 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3011 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3012 * from this location and RLC Autoload automatically also gets loaded 3013 * from here based on PMFW -> PSP message during re-init sequence. 3014 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3015 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3016 */ 3017 if (amdgpu_in_reset(adev) && 3018 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3019 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3020 continue; 3021 3022 /* XXX handle errors */ 3023 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3024 /* XXX handle errors */ 3025 if (r) { 3026 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3027 adev->ip_blocks[i].version->funcs->name, r); 3028 } 3029 adev->ip_blocks[i].status.hw = false; 3030 /* handle putting the SMC in the appropriate state */ 3031 if (!amdgpu_sriov_vf(adev)) { 3032 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3033 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3034 if (r) { 3035 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3036 adev->mp1_state, r); 3037 return r; 3038 } 3039 } 3040 } 3041 } 3042 3043 return 0; 3044 } 3045 3046 /** 3047 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3048 * 3049 * @adev: amdgpu_device pointer 3050 * 3051 * Main suspend function for hardware IPs. The list of all the hardware 3052 * IPs that make up the asic is walked, clockgating is disabled and the 3053 * suspend callbacks are run. suspend puts the hardware and software state 3054 * in each IP into a state suitable for suspend. 3055 * Returns 0 on success, negative error code on failure. 3056 */ 3057 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3058 { 3059 int r; 3060 3061 if (amdgpu_sriov_vf(adev)) { 3062 amdgpu_virt_fini_data_exchange(adev); 3063 amdgpu_virt_request_full_gpu(adev, false); 3064 } 3065 3066 r = amdgpu_device_ip_suspend_phase1(adev); 3067 if (r) 3068 return r; 3069 r = amdgpu_device_ip_suspend_phase2(adev); 3070 3071 if (amdgpu_sriov_vf(adev)) 3072 amdgpu_virt_release_full_gpu(adev, false); 3073 3074 return r; 3075 } 3076 3077 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3078 { 3079 int i, r; 3080 3081 static enum amd_ip_block_type ip_order[] = { 3082 AMD_IP_BLOCK_TYPE_COMMON, 3083 AMD_IP_BLOCK_TYPE_GMC, 3084 AMD_IP_BLOCK_TYPE_PSP, 3085 AMD_IP_BLOCK_TYPE_IH, 3086 }; 3087 3088 for (i = 0; i < adev->num_ip_blocks; i++) { 3089 int j; 3090 struct amdgpu_ip_block *block; 3091 3092 block = &adev->ip_blocks[i]; 3093 block->status.hw = false; 3094 3095 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3096 3097 if (block->version->type != ip_order[j] || 3098 !block->status.valid) 3099 continue; 3100 3101 r = block->version->funcs->hw_init(adev); 3102 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3103 if (r) 3104 return r; 3105 block->status.hw = true; 3106 } 3107 } 3108 3109 return 0; 3110 } 3111 3112 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3113 { 3114 int i, r; 3115 3116 static enum amd_ip_block_type ip_order[] = { 3117 AMD_IP_BLOCK_TYPE_SMC, 3118 AMD_IP_BLOCK_TYPE_DCE, 3119 AMD_IP_BLOCK_TYPE_GFX, 3120 AMD_IP_BLOCK_TYPE_SDMA, 3121 AMD_IP_BLOCK_TYPE_MES, 3122 AMD_IP_BLOCK_TYPE_UVD, 3123 AMD_IP_BLOCK_TYPE_VCE, 3124 AMD_IP_BLOCK_TYPE_VCN, 3125 AMD_IP_BLOCK_TYPE_JPEG 3126 }; 3127 3128 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3129 int j; 3130 struct amdgpu_ip_block *block; 3131 3132 for (j = 0; j < adev->num_ip_blocks; j++) { 3133 block = &adev->ip_blocks[j]; 3134 3135 if (block->version->type != ip_order[i] || 3136 !block->status.valid || 3137 block->status.hw) 3138 continue; 3139 3140 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3141 r = block->version->funcs->resume(adev); 3142 else 3143 r = block->version->funcs->hw_init(adev); 3144 3145 
DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3146 if (r) 3147 return r; 3148 block->status.hw = true; 3149 } 3150 } 3151 3152 return 0; 3153 } 3154 3155 /** 3156 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3157 * 3158 * @adev: amdgpu_device pointer 3159 * 3160 * First resume function for hardware IPs. The list of all the hardware 3161 * IPs that make up the asic is walked and the resume callbacks are run for 3162 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3163 * after a suspend and updates the software state as necessary. This 3164 * function is also used for restoring the GPU after a GPU reset. 3165 * Returns 0 on success, negative error code on failure. 3166 */ 3167 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3168 { 3169 int i, r; 3170 3171 for (i = 0; i < adev->num_ip_blocks; i++) { 3172 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3173 continue; 3174 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3175 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3176 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3177 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3178 3179 r = adev->ip_blocks[i].version->funcs->resume(adev); 3180 if (r) { 3181 DRM_ERROR("resume of IP block <%s> failed %d\n", 3182 adev->ip_blocks[i].version->funcs->name, r); 3183 return r; 3184 } 3185 adev->ip_blocks[i].status.hw = true; 3186 } 3187 } 3188 3189 return 0; 3190 } 3191 3192 /** 3193 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3194 * 3195 * @adev: amdgpu_device pointer 3196 * 3197 * First resume function for hardware IPs. The list of all the hardware 3198 * IPs that make up the asic is walked and the resume callbacks are run for 3199 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3200 * functional state after a suspend and updates the software state as 3201 * necessary. This function is also used for restoring the GPU after a GPU 3202 * reset. 3203 * Returns 0 on success, negative error code on failure. 3204 */ 3205 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3206 { 3207 int i, r; 3208 3209 for (i = 0; i < adev->num_ip_blocks; i++) { 3210 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3211 continue; 3212 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3213 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3214 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3215 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3216 continue; 3217 r = adev->ip_blocks[i].version->funcs->resume(adev); 3218 if (r) { 3219 DRM_ERROR("resume of IP block <%s> failed %d\n", 3220 adev->ip_blocks[i].version->funcs->name, r); 3221 return r; 3222 } 3223 adev->ip_blocks[i].status.hw = true; 3224 } 3225 3226 return 0; 3227 } 3228 3229 /** 3230 * amdgpu_device_ip_resume - run resume for hardware IPs 3231 * 3232 * @adev: amdgpu_device pointer 3233 * 3234 * Main resume function for hardware IPs. The hardware IPs 3235 * are split into two resume functions because they are 3236 * also used in recovering from a GPU reset and some additional 3237 * steps need to be take between them. In this case (S3/S4) they are 3238 * run sequentially. 3239 * Returns 0 on success, negative error code on failure. 
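 *
 * A sketch of the sequence run below:
 *
 *   amdgpu_device_ip_resume_phase1(adev);  - COMMON, GMC, IH (PSP on SR-IOV)
 *   amdgpu_device_fw_loading(adev);        - bring PSP/SMU firmware back up
 *   amdgpu_device_ip_resume_phase2(adev);  - all remaining blocks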
3240 */ 3241 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3242 { 3243 int r; 3244 3245 r = amdgpu_device_ip_resume_phase1(adev); 3246 if (r) 3247 return r; 3248 3249 r = amdgpu_device_fw_loading(adev); 3250 if (r) 3251 return r; 3252 3253 r = amdgpu_device_ip_resume_phase2(adev); 3254 3255 return r; 3256 } 3257 3258 /** 3259 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3260 * 3261 * @adev: amdgpu_device pointer 3262 * 3263 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3264 */ 3265 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3266 { 3267 if (amdgpu_sriov_vf(adev)) { 3268 if (adev->is_atom_fw) { 3269 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3270 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3271 } else { 3272 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3273 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3274 } 3275 3276 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3277 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3278 } 3279 } 3280 3281 /** 3282 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3283 * 3284 * @asic_type: AMD asic type 3285 * 3286 * Check if there is DC (new modesetting infrastructre) support for an asic. 3287 * returns true if DC has support, false if not. 3288 */ 3289 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3290 { 3291 switch (asic_type) { 3292 #ifdef CONFIG_DRM_AMDGPU_SI 3293 case CHIP_HAINAN: 3294 #endif 3295 case CHIP_TOPAZ: 3296 /* chips with no display hardware */ 3297 return false; 3298 #if defined(CONFIG_DRM_AMD_DC) 3299 case CHIP_TAHITI: 3300 case CHIP_PITCAIRN: 3301 case CHIP_VERDE: 3302 case CHIP_OLAND: 3303 /* 3304 * We have systems in the wild with these ASICs that require 3305 * LVDS and VGA support which is not supported with DC. 3306 * 3307 * Fallback to the non-DC driver here by default so as not to 3308 * cause regressions. 3309 */ 3310 #if defined(CONFIG_DRM_AMD_DC_SI) 3311 return amdgpu_dc > 0; 3312 #else 3313 return false; 3314 #endif 3315 case CHIP_BONAIRE: 3316 case CHIP_KAVERI: 3317 case CHIP_KABINI: 3318 case CHIP_MULLINS: 3319 /* 3320 * We have systems in the wild with these ASICs that require 3321 * VGA support which is not supported with DC. 3322 * 3323 * Fallback to the non-DC driver here by default so as not to 3324 * cause regressions. 
3325 */ 3326 return amdgpu_dc > 0; 3327 default: 3328 return amdgpu_dc != 0; 3329 #else 3330 default: 3331 if (amdgpu_dc > 0) 3332 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3333 return false; 3334 #endif 3335 } 3336 } 3337 3338 /** 3339 * amdgpu_device_has_dc_support - check if dc is supported 3340 * 3341 * @adev: amdgpu_device pointer 3342 * 3343 * Returns true for supported, false for not supported 3344 */ 3345 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3346 { 3347 if (adev->enable_virtual_display || 3348 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3349 return false; 3350 3351 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3352 } 3353 3354 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3355 { 3356 struct amdgpu_device *adev = 3357 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3358 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3359 3360 /* It's a bug to not have a hive within this function */ 3361 if (WARN_ON(!hive)) 3362 return; 3363 3364 /* 3365 * Use task barrier to synchronize all xgmi reset works across the 3366 * hive. task_barrier_enter and task_barrier_exit will block 3367 * until all the threads running the xgmi reset works reach 3368 * those points. task_barrier_full will do both blocks. 3369 */ 3370 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3371 3372 task_barrier_enter(&hive->tb); 3373 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3374 3375 if (adev->asic_reset_res) 3376 goto fail; 3377 3378 task_barrier_exit(&hive->tb); 3379 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3380 3381 if (adev->asic_reset_res) 3382 goto fail; 3383 3384 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3385 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3386 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3387 } else { 3388 3389 task_barrier_full(&hive->tb); 3390 adev->asic_reset_res = amdgpu_asic_reset(adev); 3391 } 3392 3393 fail: 3394 if (adev->asic_reset_res) 3395 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3396 adev->asic_reset_res, adev_to_drm(adev)->unique); 3397 amdgpu_put_xgmi_hive(hive); 3398 } 3399 3400 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3401 { 3402 char *input = amdgpu_lockup_timeout; 3403 char *timeout_setting = NULL; 3404 int index = 0; 3405 long timeout; 3406 int ret = 0; 3407 3408 /* 3409 * By default timeout for non compute jobs is 10000 3410 * and 60000 for compute jobs. 3411 * In SR-IOV or passthrough mode, timeout for compute 3412 * jobs are 60000 by default. 3413 */ 3414 adev->gfx_timeout = msecs_to_jiffies(10000); 3415 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3416 if (amdgpu_sriov_vf(adev)) 3417 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3418 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3419 else 3420 adev->compute_timeout = msecs_to_jiffies(60000); 3421 3422 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3423 while ((timeout_setting = strsep(&input, ",")) && 3424 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3425 ret = kstrtol(timeout_setting, 0, &timeout); 3426 if (ret) 3427 return ret; 3428 3429 if (timeout == 0) { 3430 index++; 3431 continue; 3432 } else if (timeout < 0) { 3433 timeout = MAX_SCHEDULE_TIMEOUT; 3434 dev_warn(adev->dev, "lockup timeout disabled"); 3435 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3436 } else { 3437 timeout = msecs_to_jiffies(timeout); 3438 } 3439 3440 switch (index++) { 3441 case 0: 3442 adev->gfx_timeout = timeout; 3443 break; 3444 case 1: 3445 adev->compute_timeout = timeout; 3446 break; 3447 case 2: 3448 adev->sdma_timeout = timeout; 3449 break; 3450 case 3: 3451 adev->video_timeout = timeout; 3452 break; 3453 default: 3454 break; 3455 } 3456 } 3457 /* 3458 * There is only one value specified and 3459 * it should apply to all non-compute jobs. 3460 */ 3461 if (index == 1) { 3462 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3463 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3464 adev->compute_timeout = adev->gfx_timeout; 3465 } 3466 } 3467 3468 return ret; 3469 } 3470 3471 /** 3472 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3473 * 3474 * @adev: amdgpu_device pointer 3475 * 3476 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3477 */ 3478 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3479 { 3480 struct iommu_domain *domain; 3481 3482 domain = iommu_get_domain_for_dev(adev->dev); 3483 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3484 adev->ram_is_direct_mapped = true; 3485 } 3486 3487 static const struct attribute *amdgpu_dev_attributes[] = { 3488 &dev_attr_pcie_replay_count.attr, 3489 NULL 3490 }; 3491 3492 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 3493 { 3494 if (amdgpu_mcbp == 1) 3495 adev->gfx.mcbp = true; 3496 else if (amdgpu_mcbp == 0) 3497 adev->gfx.mcbp = false; 3498 else if ((adev->ip_versions[GC_HWIP][0] >= IP_VERSION(9, 0, 0)) && 3499 (adev->ip_versions[GC_HWIP][0] < IP_VERSION(10, 0, 0)) && 3500 adev->gfx.num_gfx_rings) 3501 adev->gfx.mcbp = true; 3502 3503 if (amdgpu_sriov_vf(adev)) 3504 adev->gfx.mcbp = true; 3505 3506 if (adev->gfx.mcbp) 3507 DRM_INFO("MCBP is enabled\n"); 3508 } 3509 3510 /** 3511 * amdgpu_device_init - initialize the driver 3512 * 3513 * @adev: amdgpu_device pointer 3514 * @flags: driver flags 3515 * 3516 * Initializes the driver info and hw (all asics). 3517 * Returns 0 for success or an error on failure. 3518 * Called at driver startup. 
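 *
 * @flags carries the asic type in its low bits (flags & AMD_ASIC_MASK) plus
 * feature bits such as AMD_IS_APU. A hypothetical call from a probe path
 * (illustrative only, not the actual caller) would look roughly like:
 *
 *   adev->pdev = pdev;
 *   r = amdgpu_device_init(adev, flags);
 *   if (r)
 *       return r;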
3519 */ 3520 int amdgpu_device_init(struct amdgpu_device *adev, 3521 uint32_t flags) 3522 { 3523 struct drm_device *ddev = adev_to_drm(adev); 3524 struct pci_dev *pdev = adev->pdev; 3525 int r, i; 3526 bool px = false; 3527 u32 max_MBps; 3528 int tmp; 3529 3530 adev->shutdown = false; 3531 adev->flags = flags; 3532 3533 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3534 adev->asic_type = amdgpu_force_asic_type; 3535 else 3536 adev->asic_type = flags & AMD_ASIC_MASK; 3537 3538 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3539 if (amdgpu_emu_mode == 1) 3540 adev->usec_timeout *= 10; 3541 adev->gmc.gart_size = 512 * 1024 * 1024; 3542 adev->accel_working = false; 3543 adev->num_rings = 0; 3544 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3545 adev->mman.buffer_funcs = NULL; 3546 adev->mman.buffer_funcs_ring = NULL; 3547 adev->vm_manager.vm_pte_funcs = NULL; 3548 adev->vm_manager.vm_pte_num_scheds = 0; 3549 adev->gmc.gmc_funcs = NULL; 3550 adev->harvest_ip_mask = 0x0; 3551 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3552 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3553 3554 adev->smc_rreg = &amdgpu_invalid_rreg; 3555 adev->smc_wreg = &amdgpu_invalid_wreg; 3556 adev->pcie_rreg = &amdgpu_invalid_rreg; 3557 adev->pcie_wreg = &amdgpu_invalid_wreg; 3558 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 3559 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 3560 adev->pciep_rreg = &amdgpu_invalid_rreg; 3561 adev->pciep_wreg = &amdgpu_invalid_wreg; 3562 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3563 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3564 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3565 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3566 adev->didt_rreg = &amdgpu_invalid_rreg; 3567 adev->didt_wreg = &amdgpu_invalid_wreg; 3568 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3569 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3570 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3571 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3572 3573 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3574 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3575 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3576 3577 /* mutex initialization are all done here so we 3578 * can recall function without having locking issues 3579 */ 3580 mutex_init(&adev->firmware.mutex); 3581 mutex_init(&adev->pm.mutex); 3582 mutex_init(&adev->gfx.gpu_clock_mutex); 3583 mutex_init(&adev->srbm_mutex); 3584 mutex_init(&adev->gfx.pipe_reserve_mutex); 3585 mutex_init(&adev->gfx.gfx_off_mutex); 3586 mutex_init(&adev->gfx.partition_mutex); 3587 mutex_init(&adev->grbm_idx_mutex); 3588 mutex_init(&adev->mn_lock); 3589 mutex_init(&adev->virt.vf_errors.lock); 3590 hash_init(adev->mn_hash); 3591 mutex_init(&adev->psp.mutex); 3592 mutex_init(&adev->notifier_lock); 3593 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3594 mutex_init(&adev->benchmark_mutex); 3595 3596 amdgpu_device_init_apu_flags(adev); 3597 3598 r = amdgpu_device_check_arguments(adev); 3599 if (r) 3600 return r; 3601 3602 spin_lock_init(&adev->mmio_idx_lock); 3603 spin_lock_init(&adev->smc_idx_lock); 3604 spin_lock_init(&adev->pcie_idx_lock); 3605 spin_lock_init(&adev->uvd_ctx_idx_lock); 3606 spin_lock_init(&adev->didt_idx_lock); 3607 spin_lock_init(&adev->gc_cac_idx_lock); 3608 spin_lock_init(&adev->se_cac_idx_lock); 3609 spin_lock_init(&adev->audio_endpt_idx_lock); 3610 spin_lock_init(&adev->mm_stats.lock); 3611 3612 
INIT_LIST_HEAD(&adev->shadow_list); 3613 mutex_init(&adev->shadow_list_lock); 3614 3615 INIT_LIST_HEAD(&adev->reset_list); 3616 3617 INIT_LIST_HEAD(&adev->ras_list); 3618 3619 INIT_DELAYED_WORK(&adev->delayed_init_work, 3620 amdgpu_device_delayed_init_work_handler); 3621 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3622 amdgpu_device_delay_enable_gfx_off); 3623 3624 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3625 3626 adev->gfx.gfx_off_req_count = 1; 3627 adev->gfx.gfx_off_residency = 0; 3628 adev->gfx.gfx_off_entrycount = 0; 3629 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3630 3631 atomic_set(&adev->throttling_logging_enabled, 1); 3632 /* 3633 * If throttling continues, logging will be performed every minute 3634 * to avoid log flooding. "-1" is subtracted since the thermal 3635 * throttling interrupt comes every second. Thus, the total logging 3636 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 3637 * for throttling interrupt) = 60 seconds. 3638 */ 3639 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3640 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3641 3642 /* Registers mapping */ 3643 /* TODO: block userspace mapping of io register */ 3644 if (adev->asic_type >= CHIP_BONAIRE) { 3645 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3646 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3647 } else { 3648 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3649 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3650 } 3651 3652 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 3653 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 3654 3655 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3656 if (!adev->rmmio) 3657 return -ENOMEM; 3658 3659 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3660 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); 3661 3662 /* 3663 * The reset domain needs to be present early, before the XGMI hive is 3664 * discovered (if any) and initialized, to use the reset sem and in_gpu reset flag 3665 * early on during init and before calling RREG32.
3666 */ 3667 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3668 if (!adev->reset_domain) 3669 return -ENOMEM; 3670 3671 /* detect hw virtualization here */ 3672 amdgpu_detect_virtualization(adev); 3673 3674 amdgpu_device_get_pcie_info(adev); 3675 3676 r = amdgpu_device_get_job_timeout_settings(adev); 3677 if (r) { 3678 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3679 return r; 3680 } 3681 3682 /* early init functions */ 3683 r = amdgpu_device_ip_early_init(adev); 3684 if (r) 3685 return r; 3686 3687 amdgpu_device_set_mcbp(adev); 3688 3689 /* Get rid of things like offb */ 3690 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 3691 if (r) 3692 return r; 3693 3694 /* Enable TMZ based on IP_VERSION */ 3695 amdgpu_gmc_tmz_set(adev); 3696 3697 amdgpu_gmc_noretry_set(adev); 3698 /* Need to get xgmi info early to decide the reset behavior*/ 3699 if (adev->gmc.xgmi.supported) { 3700 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3701 if (r) 3702 return r; 3703 } 3704 3705 /* enable PCIE atomic ops */ 3706 if (amdgpu_sriov_vf(adev)) { 3707 if (adev->virt.fw_reserve.p_pf2vf) 3708 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3709 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3710 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3711 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a 3712 * internal path natively support atomics, set have_atomics_support to true. 3713 */ 3714 } else if ((adev->flags & AMD_IS_APU) && 3715 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) { 3716 adev->have_atomics_support = true; 3717 } else { 3718 adev->have_atomics_support = 3719 !pci_enable_atomic_ops_to_root(adev->pdev, 3720 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3721 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3722 } 3723 3724 if (!adev->have_atomics_support) 3725 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3726 3727 /* doorbell bar mapping and doorbell index init*/ 3728 amdgpu_doorbell_init(adev); 3729 3730 if (amdgpu_emu_mode == 1) { 3731 /* post the asic on emulation mode */ 3732 emu_soc_asic_init(adev); 3733 goto fence_driver_init; 3734 } 3735 3736 amdgpu_reset_init(adev); 3737 3738 /* detect if we are with an SRIOV vbios */ 3739 if (adev->bios) 3740 amdgpu_device_detect_sriov_bios(adev); 3741 3742 /* check if we need to reset the asic 3743 * E.g., driver was not cleanly unloaded previously, etc. 3744 */ 3745 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3746 if (adev->gmc.xgmi.num_physical_nodes) { 3747 dev_info(adev->dev, "Pending hive reset.\n"); 3748 adev->gmc.xgmi.pending_reset = true; 3749 /* Only need to init necessary block for SMU to handle the reset */ 3750 for (i = 0; i < adev->num_ip_blocks; i++) { 3751 if (!adev->ip_blocks[i].status.valid) 3752 continue; 3753 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3754 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3755 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3756 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3757 DRM_DEBUG("IP %s disabled for hw_init.\n", 3758 adev->ip_blocks[i].version->funcs->name); 3759 adev->ip_blocks[i].status.hw = true; 3760 } 3761 } 3762 } else { 3763 tmp = amdgpu_reset_method; 3764 /* It should do a default reset when loading or reloading the driver, 3765 * regardless of the module parameter reset_method. 
3766 */ 3767 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 3768 r = amdgpu_asic_reset(adev); 3769 amdgpu_reset_method = tmp; 3770 if (r) { 3771 dev_err(adev->dev, "asic reset on init failed\n"); 3772 goto failed; 3773 } 3774 } 3775 } 3776 3777 /* Post card if necessary */ 3778 if (amdgpu_device_need_post(adev)) { 3779 if (!adev->bios) { 3780 dev_err(adev->dev, "no vBIOS found\n"); 3781 r = -EINVAL; 3782 goto failed; 3783 } 3784 DRM_INFO("GPU posting now...\n"); 3785 r = amdgpu_device_asic_init(adev); 3786 if (r) { 3787 dev_err(adev->dev, "gpu post error!\n"); 3788 goto failed; 3789 } 3790 } 3791 3792 if (adev->bios) { 3793 if (adev->is_atom_fw) { 3794 /* Initialize clocks */ 3795 r = amdgpu_atomfirmware_get_clock_info(adev); 3796 if (r) { 3797 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3798 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3799 goto failed; 3800 } 3801 } else { 3802 /* Initialize clocks */ 3803 r = amdgpu_atombios_get_clock_info(adev); 3804 if (r) { 3805 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3806 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3807 goto failed; 3808 } 3809 /* init i2c buses */ 3810 if (!amdgpu_device_has_dc_support(adev)) 3811 amdgpu_atombios_i2c_init(adev); 3812 } 3813 } 3814 3815 fence_driver_init: 3816 /* Fence driver */ 3817 r = amdgpu_fence_driver_sw_init(adev); 3818 if (r) { 3819 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3820 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3821 goto failed; 3822 } 3823 3824 /* init the mode config */ 3825 drm_mode_config_init(adev_to_drm(adev)); 3826 3827 r = amdgpu_device_ip_init(adev); 3828 if (r) { 3829 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3830 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3831 goto release_ras_con; 3832 } 3833 3834 amdgpu_fence_driver_hw_init(adev); 3835 3836 dev_info(adev->dev, 3837 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3838 adev->gfx.config.max_shader_engines, 3839 adev->gfx.config.max_sh_per_se, 3840 adev->gfx.config.max_cu_per_sh, 3841 adev->gfx.cu_info.number); 3842 3843 adev->accel_working = true; 3844 3845 amdgpu_vm_check_compute_bug(adev); 3846 3847 /* Initialize the buffer migration limit. */ 3848 if (amdgpu_moverate >= 0) 3849 max_MBps = amdgpu_moverate; 3850 else 3851 max_MBps = 8; /* Allow 8 MB/s. */ 3852 /* Get a log2 for easy divisions. */ 3853 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3854 3855 r = amdgpu_atombios_sysfs_init(adev); 3856 if (r) 3857 drm_err(&adev->ddev, 3858 "registering atombios sysfs failed (%d).\n", r); 3859 3860 r = amdgpu_pm_sysfs_init(adev); 3861 if (r) 3862 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 3863 3864 r = amdgpu_ucode_sysfs_init(adev); 3865 if (r) { 3866 adev->ucode_sysfs_en = false; 3867 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3868 } else 3869 adev->ucode_sysfs_en = true; 3870 3871 /* 3872 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3873 * Otherwise the mgpu fan boost feature will be skipped due to the 3874 * gpu instance is counted less. 3875 */ 3876 amdgpu_register_gpu_instance(adev); 3877 3878 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3879 * explicit gating rather than handling it automatically. 
3880 */ 3881 if (!adev->gmc.xgmi.pending_reset) { 3882 r = amdgpu_device_ip_late_init(adev); 3883 if (r) { 3884 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3885 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3886 goto release_ras_con; 3887 } 3888 /* must succeed. */ 3889 amdgpu_ras_resume(adev); 3890 queue_delayed_work(system_wq, &adev->delayed_init_work, 3891 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3892 } 3893 3894 if (amdgpu_sriov_vf(adev)) { 3895 amdgpu_virt_release_full_gpu(adev, true); 3896 flush_delayed_work(&adev->delayed_init_work); 3897 } 3898 3899 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3900 if (r) 3901 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3902 3903 amdgpu_fru_sysfs_init(adev); 3904 3905 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3906 r = amdgpu_pmu_init(adev); 3907 if (r) 3908 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3909 3910 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3911 if (amdgpu_device_cache_pci_state(adev->pdev)) 3912 pci_restore_state(pdev); 3913 3914 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3915 /* this will fail for cards that aren't VGA class devices, just 3916 * ignore it 3917 */ 3918 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3919 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 3920 3921 px = amdgpu_device_supports_px(ddev); 3922 3923 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 3924 apple_gmux_detect(NULL, NULL))) 3925 vga_switcheroo_register_client(adev->pdev, 3926 &amdgpu_switcheroo_ops, px); 3927 3928 if (px) 3929 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3930 3931 if (adev->gmc.xgmi.pending_reset) 3932 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3933 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3934 3935 amdgpu_device_check_iommu_direct_map(adev); 3936 3937 return 0; 3938 3939 release_ras_con: 3940 if (amdgpu_sriov_vf(adev)) 3941 amdgpu_virt_release_full_gpu(adev, true); 3942 3943 /* failed in exclusive mode due to timeout */ 3944 if (amdgpu_sriov_vf(adev) && 3945 !amdgpu_sriov_runtime(adev) && 3946 amdgpu_virt_mmio_blocked(adev) && 3947 !amdgpu_virt_wait_reset(adev)) { 3948 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3949 /* Don't send request since VF is inactive. */ 3950 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3951 adev->virt.ops = NULL; 3952 r = -EAGAIN; 3953 } 3954 amdgpu_release_ras_context(adev); 3955 3956 failed: 3957 amdgpu_vf_error_trans_all(adev); 3958 3959 return r; 3960 } 3961 3962 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 3963 { 3964 3965 /* Clear all CPU mappings pointing to this device */ 3966 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 3967 3968 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 3969 amdgpu_doorbell_fini(adev); 3970 3971 iounmap(adev->rmmio); 3972 adev->rmmio = NULL; 3973 if (adev->mman.aper_base_kaddr) 3974 iounmap(adev->mman.aper_base_kaddr); 3975 adev->mman.aper_base_kaddr = NULL; 3976 3977 /* Memory manager related */ 3978 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 3979 arch_phys_wc_del(adev->gmc.vram_mtrr); 3980 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 3981 } 3982 } 3983 3984 /** 3985 * amdgpu_device_fini_hw - tear down the driver 3986 * 3987 * @adev: amdgpu_device pointer 3988 * 3989 * Tear down the driver info (all asics). 3990 * Called at driver shutdown. 
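 * Pending work is flushed, interrupts are disabled, the display is shut down
 * and the hardware side of the IP blocks is torn down; the remaining software
 * state is released later by amdgpu_device_fini_sw().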
3991 */ 3992 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 3993 { 3994 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3995 flush_delayed_work(&adev->delayed_init_work); 3996 adev->shutdown = true; 3997 3998 /* make sure IB test finished before entering exclusive mode 3999 * to avoid preemption on IB test 4000 */ 4001 if (amdgpu_sriov_vf(adev)) { 4002 amdgpu_virt_request_full_gpu(adev, false); 4003 amdgpu_virt_fini_data_exchange(adev); 4004 } 4005 4006 /* disable all interrupts */ 4007 amdgpu_irq_disable_all(adev); 4008 if (adev->mode_info.mode_config_initialized) { 4009 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4010 drm_helper_force_disable_all(adev_to_drm(adev)); 4011 else 4012 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4013 } 4014 amdgpu_fence_driver_hw_fini(adev); 4015 4016 if (adev->mman.initialized) 4017 drain_workqueue(adev->mman.bdev.wq); 4018 4019 if (adev->pm.sysfs_initialized) 4020 amdgpu_pm_sysfs_fini(adev); 4021 if (adev->ucode_sysfs_en) 4022 amdgpu_ucode_sysfs_fini(adev); 4023 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4024 amdgpu_fru_sysfs_fini(adev); 4025 4026 /* disable ras feature must before hw fini */ 4027 amdgpu_ras_pre_fini(adev); 4028 4029 amdgpu_device_ip_fini_early(adev); 4030 4031 amdgpu_irq_fini_hw(adev); 4032 4033 if (adev->mman.initialized) 4034 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4035 4036 amdgpu_gart_dummy_page_fini(adev); 4037 4038 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4039 amdgpu_device_unmap_mmio(adev); 4040 4041 } 4042 4043 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4044 { 4045 int idx; 4046 bool px; 4047 4048 amdgpu_fence_driver_sw_fini(adev); 4049 amdgpu_device_ip_fini(adev); 4050 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4051 adev->accel_working = false; 4052 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4053 4054 amdgpu_reset_fini(adev); 4055 4056 /* free i2c buses */ 4057 if (!amdgpu_device_has_dc_support(adev)) 4058 amdgpu_i2c_fini(adev); 4059 4060 if (amdgpu_emu_mode != 1) 4061 amdgpu_atombios_fini(adev); 4062 4063 kfree(adev->bios); 4064 adev->bios = NULL; 4065 4066 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4067 4068 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 4069 apple_gmux_detect(NULL, NULL))) 4070 vga_switcheroo_unregister_client(adev->pdev); 4071 4072 if (px) 4073 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4074 4075 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4076 vga_client_unregister(adev->pdev); 4077 4078 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4079 4080 iounmap(adev->rmmio); 4081 adev->rmmio = NULL; 4082 amdgpu_doorbell_fini(adev); 4083 drm_dev_exit(idx); 4084 } 4085 4086 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4087 amdgpu_pmu_fini(adev); 4088 if (adev->mman.discovery_bin) 4089 amdgpu_discovery_fini(adev); 4090 4091 amdgpu_reset_put_reset_domain(adev->reset_domain); 4092 adev->reset_domain = NULL; 4093 4094 kfree(adev->pci_state); 4095 4096 } 4097 4098 /** 4099 * amdgpu_device_evict_resources - evict device resources 4100 * @adev: amdgpu device object 4101 * 4102 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4103 * of the vram memory type. Mainly used for evicting device resources 4104 * at suspend time. 
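 * Eviction is skipped entirely on APUs when entering S3 or S0ix.
 * Returns 0 on success, negative error code on failure.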
4105 * 4106 */ 4107 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4108 { 4109 int ret; 4110 4111 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4112 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4113 return 0; 4114 4115 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4116 if (ret) 4117 DRM_WARN("evicting device resources failed\n"); 4118 return ret; 4119 } 4120 4121 /* 4122 * Suspend & resume. 4123 */ 4124 /** 4125 * amdgpu_device_suspend - initiate device suspend 4126 * 4127 * @dev: drm dev pointer 4128 * @fbcon : notify the fbdev of suspend 4129 * 4130 * Puts the hw in the suspend state (all asics). 4131 * Returns 0 for success or an error on failure. 4132 * Called at driver suspend. 4133 */ 4134 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4135 { 4136 struct amdgpu_device *adev = drm_to_adev(dev); 4137 int r = 0; 4138 4139 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4140 return 0; 4141 4142 adev->in_suspend = true; 4143 4144 /* Evict the majority of BOs before grabbing the full access */ 4145 r = amdgpu_device_evict_resources(adev); 4146 if (r) 4147 return r; 4148 4149 if (amdgpu_sriov_vf(adev)) { 4150 amdgpu_virt_fini_data_exchange(adev); 4151 r = amdgpu_virt_request_full_gpu(adev, false); 4152 if (r) 4153 return r; 4154 } 4155 4156 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4157 DRM_WARN("smart shift update failed\n"); 4158 4159 if (fbcon) 4160 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4161 4162 cancel_delayed_work_sync(&adev->delayed_init_work); 4163 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4164 4165 amdgpu_ras_suspend(adev); 4166 4167 amdgpu_device_ip_suspend_phase1(adev); 4168 4169 if (!adev->in_s0ix) 4170 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4171 4172 r = amdgpu_device_evict_resources(adev); 4173 if (r) 4174 return r; 4175 4176 amdgpu_fence_driver_hw_fini(adev); 4177 4178 amdgpu_device_ip_suspend_phase2(adev); 4179 4180 if (amdgpu_sriov_vf(adev)) 4181 amdgpu_virt_release_full_gpu(adev, false); 4182 4183 return 0; 4184 } 4185 4186 /** 4187 * amdgpu_device_resume - initiate device resume 4188 * 4189 * @dev: drm dev pointer 4190 * @fbcon : notify the fbdev of resume 4191 * 4192 * Bring the hw back to operating state (all asics). 4193 * Returns 0 for success or an error on failure. 4194 * Called at driver resume. 
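 * Under SR-IOV, full GPU access is requested from the host before the
 * hardware is touched and released again once resume has finished.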
4195 */ 4196 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4197 { 4198 struct amdgpu_device *adev = drm_to_adev(dev); 4199 int r = 0; 4200 4201 if (amdgpu_sriov_vf(adev)) { 4202 r = amdgpu_virt_request_full_gpu(adev, true); 4203 if (r) 4204 return r; 4205 } 4206 4207 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4208 return 0; 4209 4210 if (adev->in_s0ix) 4211 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4212 4213 /* post card */ 4214 if (amdgpu_device_need_post(adev)) { 4215 r = amdgpu_device_asic_init(adev); 4216 if (r) 4217 dev_err(adev->dev, "amdgpu asic init failed\n"); 4218 } 4219 4220 r = amdgpu_device_ip_resume(adev); 4221 4222 if (r) { 4223 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4224 goto exit; 4225 } 4226 amdgpu_fence_driver_hw_init(adev); 4227 4228 r = amdgpu_device_ip_late_init(adev); 4229 if (r) 4230 goto exit; 4231 4232 queue_delayed_work(system_wq, &adev->delayed_init_work, 4233 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4234 4235 if (!adev->in_s0ix) { 4236 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4237 if (r) 4238 goto exit; 4239 } 4240 4241 exit: 4242 if (amdgpu_sriov_vf(adev)) { 4243 amdgpu_virt_init_data_exchange(adev); 4244 amdgpu_virt_release_full_gpu(adev, true); 4245 } 4246 4247 if (r) 4248 return r; 4249 4250 /* Make sure IB tests flushed */ 4251 flush_delayed_work(&adev->delayed_init_work); 4252 4253 if (fbcon) 4254 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4255 4256 amdgpu_ras_resume(adev); 4257 4258 if (adev->mode_info.num_crtc) { 4259 /* 4260 * Most of the connector probing functions try to acquire runtime pm 4261 * refs to ensure that the GPU is powered on when connector polling is 4262 * performed. Since we're calling this from a runtime PM callback, 4263 * trying to acquire rpm refs will cause us to deadlock. 4264 * 4265 * Since we're guaranteed to be holding the rpm lock, it's safe to 4266 * temporarily disable the rpm helpers so this doesn't deadlock us. 4267 */ 4268 #ifdef CONFIG_PM 4269 dev->dev->power.disable_depth++; 4270 #endif 4271 if (!adev->dc_enabled) 4272 drm_helper_hpd_irq_event(dev); 4273 else 4274 drm_kms_helper_hotplug_event(dev); 4275 #ifdef CONFIG_PM 4276 dev->dev->power.disable_depth--; 4277 #endif 4278 } 4279 adev->in_suspend = false; 4280 4281 if (adev->enable_mes) 4282 amdgpu_mes_self_test(adev); 4283 4284 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4285 DRM_WARN("smart shift update failed\n"); 4286 4287 return 0; 4288 } 4289 4290 /** 4291 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4292 * 4293 * @adev: amdgpu_device pointer 4294 * 4295 * The list of all the hardware IPs that make up the asic is walked and 4296 * the check_soft_reset callbacks are run. check_soft_reset determines 4297 * if the asic is still hung or not. 4298 * Returns true if any of the IPs are still in a hung state, false if not. 
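 * SR-IOV functions and ASICs that require a full reset always report a hang
 * here, regardless of the state of the individual IP blocks.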
4299 */ 4300 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4301 { 4302 int i; 4303 bool asic_hang = false; 4304 4305 if (amdgpu_sriov_vf(adev)) 4306 return true; 4307 4308 if (amdgpu_asic_need_full_reset(adev)) 4309 return true; 4310 4311 for (i = 0; i < adev->num_ip_blocks; i++) { 4312 if (!adev->ip_blocks[i].status.valid) 4313 continue; 4314 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4315 adev->ip_blocks[i].status.hang = 4316 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4317 if (adev->ip_blocks[i].status.hang) { 4318 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4319 asic_hang = true; 4320 } 4321 } 4322 return asic_hang; 4323 } 4324 4325 /** 4326 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4327 * 4328 * @adev: amdgpu_device pointer 4329 * 4330 * The list of all the hardware IPs that make up the asic is walked and the 4331 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4332 * handles any IP specific hardware or software state changes that are 4333 * necessary for a soft reset to succeed. 4334 * Returns 0 on success, negative error code on failure. 4335 */ 4336 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4337 { 4338 int i, r = 0; 4339 4340 for (i = 0; i < adev->num_ip_blocks; i++) { 4341 if (!adev->ip_blocks[i].status.valid) 4342 continue; 4343 if (adev->ip_blocks[i].status.hang && 4344 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4345 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4346 if (r) 4347 return r; 4348 } 4349 } 4350 4351 return 0; 4352 } 4353 4354 /** 4355 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4356 * 4357 * @adev: amdgpu_device pointer 4358 * 4359 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4360 * reset is necessary to recover. 4361 * Returns true if a full asic reset is required, false if not. 4362 */ 4363 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4364 { 4365 int i; 4366 4367 if (amdgpu_asic_need_full_reset(adev)) 4368 return true; 4369 4370 for (i = 0; i < adev->num_ip_blocks; i++) { 4371 if (!adev->ip_blocks[i].status.valid) 4372 continue; 4373 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4374 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4375 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4376 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4377 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4378 if (adev->ip_blocks[i].status.hang) { 4379 dev_info(adev->dev, "Some block need full reset!\n"); 4380 return true; 4381 } 4382 } 4383 } 4384 return false; 4385 } 4386 4387 /** 4388 * amdgpu_device_ip_soft_reset - do a soft reset 4389 * 4390 * @adev: amdgpu_device pointer 4391 * 4392 * The list of all the hardware IPs that make up the asic is walked and the 4393 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4394 * IP specific hardware or software state changes that are necessary to soft 4395 * reset the IP. 4396 * Returns 0 on success, negative error code on failure. 
4397 */ 4398 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4399 { 4400 int i, r = 0; 4401 4402 for (i = 0; i < adev->num_ip_blocks; i++) { 4403 if (!adev->ip_blocks[i].status.valid) 4404 continue; 4405 if (adev->ip_blocks[i].status.hang && 4406 adev->ip_blocks[i].version->funcs->soft_reset) { 4407 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4408 if (r) 4409 return r; 4410 } 4411 } 4412 4413 return 0; 4414 } 4415 4416 /** 4417 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4418 * 4419 * @adev: amdgpu_device pointer 4420 * 4421 * The list of all the hardware IPs that make up the asic is walked and the 4422 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4423 * handles any IP specific hardware or software state changes that are 4424 * necessary after the IP has been soft reset. 4425 * Returns 0 on success, negative error code on failure. 4426 */ 4427 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4428 { 4429 int i, r = 0; 4430 4431 for (i = 0; i < adev->num_ip_blocks; i++) { 4432 if (!adev->ip_blocks[i].status.valid) 4433 continue; 4434 if (adev->ip_blocks[i].status.hang && 4435 adev->ip_blocks[i].version->funcs->post_soft_reset) 4436 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4437 if (r) 4438 return r; 4439 } 4440 4441 return 0; 4442 } 4443 4444 /** 4445 * amdgpu_device_recover_vram - Recover some VRAM contents 4446 * 4447 * @adev: amdgpu_device pointer 4448 * 4449 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4450 * restore things like GPUVM page tables after a GPU reset where 4451 * the contents of VRAM might be lost. 4452 * 4453 * Returns: 4454 * 0 on success, negative error code on failure. 4455 */ 4456 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4457 { 4458 struct dma_fence *fence = NULL, *next = NULL; 4459 struct amdgpu_bo *shadow; 4460 struct amdgpu_bo_vm *vmbo; 4461 long r = 1, tmo; 4462 4463 if (amdgpu_sriov_runtime(adev)) 4464 tmo = msecs_to_jiffies(8000); 4465 else 4466 tmo = msecs_to_jiffies(100); 4467 4468 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4469 mutex_lock(&adev->shadow_list_lock); 4470 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4471 /* If vm is compute context or adev is APU, shadow will be NULL */ 4472 if (!vmbo->shadow) 4473 continue; 4474 shadow = vmbo->shadow; 4475 4476 /* No need to recover an evicted BO */ 4477 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4478 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4479 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4480 continue; 4481 4482 r = amdgpu_bo_restore_shadow(shadow, &next); 4483 if (r) 4484 break; 4485 4486 if (fence) { 4487 tmo = dma_fence_wait_timeout(fence, false, tmo); 4488 dma_fence_put(fence); 4489 fence = next; 4490 if (tmo == 0) { 4491 r = -ETIMEDOUT; 4492 break; 4493 } else if (tmo < 0) { 4494 r = tmo; 4495 break; 4496 } 4497 } else { 4498 fence = next; 4499 } 4500 } 4501 mutex_unlock(&adev->shadow_list_lock); 4502 4503 if (fence) 4504 tmo = dma_fence_wait_timeout(fence, false, tmo); 4505 dma_fence_put(fence); 4506 4507 if (r < 0 || tmo <= 0) { 4508 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4509 return -EIO; 4510 } 4511 4512 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4513 return 0; 4514 } 4515 4516 4517 /** 4518 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4519 * 4520 * @adev: amdgpu_device pointer 4521 * 
@from_hypervisor: request from hypervisor 4522 * 4523 * do VF FLR and reinitialize Asic 4524 * return 0 means succeeded otherwise failed 4525 */ 4526 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4527 bool from_hypervisor) 4528 { 4529 int r; 4530 struct amdgpu_hive_info *hive = NULL; 4531 int retry_limit = 0; 4532 4533 retry: 4534 amdgpu_amdkfd_pre_reset(adev); 4535 4536 if (from_hypervisor) 4537 r = amdgpu_virt_request_full_gpu(adev, true); 4538 else 4539 r = amdgpu_virt_reset_gpu(adev); 4540 if (r) 4541 return r; 4542 amdgpu_irq_gpu_reset_resume_helper(adev); 4543 4544 /* some sw clean up VF needs to do before recover */ 4545 amdgpu_virt_post_reset(adev); 4546 4547 /* Resume IP prior to SMC */ 4548 r = amdgpu_device_ip_reinit_early_sriov(adev); 4549 if (r) 4550 goto error; 4551 4552 amdgpu_virt_init_data_exchange(adev); 4553 4554 r = amdgpu_device_fw_loading(adev); 4555 if (r) 4556 return r; 4557 4558 /* now we are okay to resume SMC/CP/SDMA */ 4559 r = amdgpu_device_ip_reinit_late_sriov(adev); 4560 if (r) 4561 goto error; 4562 4563 hive = amdgpu_get_xgmi_hive(adev); 4564 /* Update PSP FW topology after reset */ 4565 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4566 r = amdgpu_xgmi_update_topology(hive, adev); 4567 4568 if (hive) 4569 amdgpu_put_xgmi_hive(hive); 4570 4571 if (!r) { 4572 r = amdgpu_ib_ring_tests(adev); 4573 4574 amdgpu_amdkfd_post_reset(adev); 4575 } 4576 4577 error: 4578 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4579 amdgpu_inc_vram_lost(adev); 4580 r = amdgpu_device_recover_vram(adev); 4581 } 4582 amdgpu_virt_release_full_gpu(adev, true); 4583 4584 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4585 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4586 retry_limit++; 4587 goto retry; 4588 } else 4589 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4590 } 4591 4592 return r; 4593 } 4594 4595 /** 4596 * amdgpu_device_has_job_running - check if there is any job in mirror list 4597 * 4598 * @adev: amdgpu_device pointer 4599 * 4600 * check if there is any job in mirror list 4601 */ 4602 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4603 { 4604 int i; 4605 struct drm_sched_job *job; 4606 4607 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4608 struct amdgpu_ring *ring = adev->rings[i]; 4609 4610 if (!ring || !ring->sched.thread) 4611 continue; 4612 4613 spin_lock(&ring->sched.job_list_lock); 4614 job = list_first_entry_or_null(&ring->sched.pending_list, 4615 struct drm_sched_job, list); 4616 spin_unlock(&ring->sched.job_list_lock); 4617 if (job) 4618 return true; 4619 } 4620 return false; 4621 } 4622 4623 /** 4624 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4625 * 4626 * @adev: amdgpu_device pointer 4627 * 4628 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4629 * a hung GPU. 
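 * Returns false when recovery is disabled through the amdgpu_gpu_recovery
 * module parameter or is not enabled by default for the given ASIC.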
4630 */ 4631 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4632 { 4633 4634 if (amdgpu_gpu_recovery == 0) 4635 goto disabled; 4636 4637 /* Skip soft reset check in fatal error mode */ 4638 if (!amdgpu_ras_is_poison_mode_supported(adev)) 4639 return true; 4640 4641 if (amdgpu_sriov_vf(adev)) 4642 return true; 4643 4644 if (amdgpu_gpu_recovery == -1) { 4645 switch (adev->asic_type) { 4646 #ifdef CONFIG_DRM_AMDGPU_SI 4647 case CHIP_VERDE: 4648 case CHIP_TAHITI: 4649 case CHIP_PITCAIRN: 4650 case CHIP_OLAND: 4651 case CHIP_HAINAN: 4652 #endif 4653 #ifdef CONFIG_DRM_AMDGPU_CIK 4654 case CHIP_KAVERI: 4655 case CHIP_KABINI: 4656 case CHIP_MULLINS: 4657 #endif 4658 case CHIP_CARRIZO: 4659 case CHIP_STONEY: 4660 case CHIP_CYAN_SKILLFISH: 4661 goto disabled; 4662 default: 4663 break; 4664 } 4665 } 4666 4667 return true; 4668 4669 disabled: 4670 dev_info(adev->dev, "GPU recovery disabled.\n"); 4671 return false; 4672 } 4673 4674 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4675 { 4676 u32 i; 4677 int ret = 0; 4678 4679 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4680 4681 dev_info(adev->dev, "GPU mode1 reset\n"); 4682 4683 /* disable BM */ 4684 pci_clear_master(adev->pdev); 4685 4686 amdgpu_device_cache_pci_state(adev->pdev); 4687 4688 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4689 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4690 ret = amdgpu_dpm_mode1_reset(adev); 4691 } else { 4692 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4693 ret = psp_gpu_reset(adev); 4694 } 4695 4696 if (ret) 4697 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4698 4699 amdgpu_device_load_pci_state(adev->pdev); 4700 4701 /* wait for asic to come out of reset */ 4702 for (i = 0; i < adev->usec_timeout; i++) { 4703 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4704 4705 if (memsize != 0xffffffff) 4706 break; 4707 udelay(1); 4708 } 4709 4710 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4711 return ret; 4712 } 4713 4714 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4715 struct amdgpu_reset_context *reset_context) 4716 { 4717 int i, r = 0; 4718 struct amdgpu_job *job = NULL; 4719 bool need_full_reset = 4720 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4721 4722 if (reset_context->reset_req_dev == adev) 4723 job = reset_context->job; 4724 4725 if (amdgpu_sriov_vf(adev)) { 4726 /* stop the data exchange thread */ 4727 amdgpu_virt_fini_data_exchange(adev); 4728 } 4729 4730 amdgpu_fence_driver_isr_toggle(adev, true); 4731 4732 /* block all schedulers and reset given job's ring */ 4733 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4734 struct amdgpu_ring *ring = adev->rings[i]; 4735 4736 if (!ring || !ring->sched.thread) 4737 continue; 4738 4739 /* Clear job fence from fence drv to avoid force_completion 4740 * leave NULL and vm flush fence in fence drv 4741 */ 4742 amdgpu_fence_driver_clear_job_fences(ring); 4743 4744 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4745 amdgpu_fence_driver_force_completion(ring); 4746 } 4747 4748 amdgpu_fence_driver_isr_toggle(adev, false); 4749 4750 if (job && job->vm) 4751 drm_sched_increase_karma(&job->base); 4752 4753 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4754 /* If reset handler not implemented, continue; otherwise return */ 4755 if (r == -EOPNOTSUPP) 4756 r = 0; 4757 else 4758 return r; 4759 4760 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4761 if (!amdgpu_sriov_vf(adev)) { 4762 4763 if (!need_full_reset) 4764 
need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4765 4766 if (!need_full_reset && amdgpu_gpu_recovery && 4767 amdgpu_device_ip_check_soft_reset(adev)) { 4768 amdgpu_device_ip_pre_soft_reset(adev); 4769 r = amdgpu_device_ip_soft_reset(adev); 4770 amdgpu_device_ip_post_soft_reset(adev); 4771 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4772 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4773 need_full_reset = true; 4774 } 4775 } 4776 4777 if (need_full_reset) 4778 r = amdgpu_device_ip_suspend(adev); 4779 if (need_full_reset) 4780 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4781 else 4782 clear_bit(AMDGPU_NEED_FULL_RESET, 4783 &reset_context->flags); 4784 } 4785 4786 return r; 4787 } 4788 4789 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 4790 { 4791 int i; 4792 4793 lockdep_assert_held(&adev->reset_domain->sem); 4794 4795 for (i = 0; i < adev->num_regs; i++) { 4796 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); 4797 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], 4798 adev->reset_dump_reg_value[i]); 4799 } 4800 4801 return 0; 4802 } 4803 4804 #ifdef CONFIG_DEV_COREDUMP 4805 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, 4806 size_t count, void *data, size_t datalen) 4807 { 4808 struct drm_printer p; 4809 struct amdgpu_device *adev = data; 4810 struct drm_print_iterator iter; 4811 int i; 4812 4813 iter.data = buffer; 4814 iter.offset = 0; 4815 iter.start = offset; 4816 iter.remain = count; 4817 4818 p = drm_coredump_printer(&iter); 4819 4820 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 4821 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 4822 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 4823 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec); 4824 if (adev->reset_task_info.pid) 4825 drm_printf(&p, "process_name: %s PID: %d\n", 4826 adev->reset_task_info.process_name, 4827 adev->reset_task_info.pid); 4828 4829 if (adev->reset_vram_lost) 4830 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 4831 if (adev->num_regs) { 4832 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); 4833 4834 for (i = 0; i < adev->num_regs; i++) 4835 drm_printf(&p, "0x%08x: 0x%08x\n", 4836 adev->reset_dump_reg_list[i], 4837 adev->reset_dump_reg_value[i]); 4838 } 4839 4840 return count - iter.remain; 4841 } 4842 4843 static void amdgpu_devcoredump_free(void *data) 4844 { 4845 } 4846 4847 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) 4848 { 4849 struct drm_device *dev = adev_to_drm(adev); 4850 4851 ktime_get_ts64(&adev->reset_time); 4852 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL, 4853 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 4854 } 4855 #endif 4856 4857 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4858 struct amdgpu_reset_context *reset_context) 4859 { 4860 struct amdgpu_device *tmp_adev = NULL; 4861 bool need_full_reset, skip_hw_reset, vram_lost = false; 4862 int r = 0; 4863 bool gpu_reset_for_dev_remove = 0; 4864 4865 /* Try reset handler method first */ 4866 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4867 reset_list); 4868 amdgpu_reset_reg_dumps(tmp_adev); 4869 4870 reset_context->reset_device_list = device_list_handle; 4871 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4872 /* If reset handler not implemented, continue; otherwise return */ 4873 if (r == -EOPNOTSUPP) 4874 r = 0; 4875 else 4876 return r; 4877 4878 /* Reset handler not 
implemented, use the default method */ 4879 need_full_reset = 4880 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4881 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4882 4883 gpu_reset_for_dev_remove = 4884 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 4885 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4886 4887 /* 4888 * ASIC reset has to be done on all XGMI hive nodes ASAP 4889 * to allow proper links negotiation in FW (within 1 sec) 4890 */ 4891 if (!skip_hw_reset && need_full_reset) { 4892 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4893 /* For XGMI run all resets in parallel to speed up the process */ 4894 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4895 tmp_adev->gmc.xgmi.pending_reset = false; 4896 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4897 r = -EALREADY; 4898 } else 4899 r = amdgpu_asic_reset(tmp_adev); 4900 4901 if (r) { 4902 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4903 r, adev_to_drm(tmp_adev)->unique); 4904 break; 4905 } 4906 } 4907 4908 /* For XGMI wait for all resets to complete before proceed */ 4909 if (!r) { 4910 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4911 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4912 flush_work(&tmp_adev->xgmi_reset_work); 4913 r = tmp_adev->asic_reset_res; 4914 if (r) 4915 break; 4916 } 4917 } 4918 } 4919 } 4920 4921 if (!r && amdgpu_ras_intr_triggered()) { 4922 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4923 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && 4924 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 4925 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 4926 } 4927 4928 amdgpu_ras_intr_cleared(); 4929 } 4930 4931 /* Since the mode1 reset affects base ip blocks, the 4932 * phase1 ip blocks need to be resumed. Otherwise there 4933 * will be a BIOS signature error and the psp bootloader 4934 * can't load kdb on the next amdgpu install. 4935 */ 4936 if (gpu_reset_for_dev_remove) { 4937 list_for_each_entry(tmp_adev, device_list_handle, reset_list) 4938 amdgpu_device_ip_resume_phase1(tmp_adev); 4939 4940 goto end; 4941 } 4942 4943 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4944 if (need_full_reset) { 4945 /* post card */ 4946 r = amdgpu_device_asic_init(tmp_adev); 4947 if (r) { 4948 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4949 } else { 4950 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4951 4952 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4953 if (r) 4954 goto out; 4955 4956 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4957 #ifdef CONFIG_DEV_COREDUMP 4958 tmp_adev->reset_vram_lost = vram_lost; 4959 memset(&tmp_adev->reset_task_info, 0, 4960 sizeof(tmp_adev->reset_task_info)); 4961 if (reset_context->job && reset_context->job->vm) 4962 tmp_adev->reset_task_info = 4963 reset_context->job->vm->task_info; 4964 amdgpu_reset_capture_coredumpm(tmp_adev); 4965 #endif 4966 if (vram_lost) { 4967 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4968 amdgpu_inc_vram_lost(tmp_adev); 4969 } 4970 4971 r = amdgpu_device_fw_loading(tmp_adev); 4972 if (r) 4973 return r; 4974 4975 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4976 if (r) 4977 goto out; 4978 4979 if (vram_lost) 4980 amdgpu_device_fill_reset_magic(tmp_adev); 4981 4982 /* 4983 * Add this ASIC as tracked as reset was already 4984 * complete successfully. 
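 * The instance was unregistered before the reset and is added back here.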
4985 */ 4986 amdgpu_register_gpu_instance(tmp_adev); 4987 4988 if (!reset_context->hive && 4989 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4990 amdgpu_xgmi_add_device(tmp_adev); 4991 4992 r = amdgpu_device_ip_late_init(tmp_adev); 4993 if (r) 4994 goto out; 4995 4996 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 4997 4998 /* 4999 * The GPU enters bad state once faulty pages 5000 * by ECC has reached the threshold, and ras 5001 * recovery is scheduled next. So add one check 5002 * here to break recovery if it indeed exceeds 5003 * bad page threshold, and remind user to 5004 * retire this GPU or setting one bigger 5005 * bad_page_threshold value to fix this once 5006 * probing driver again. 5007 */ 5008 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5009 /* must succeed. */ 5010 amdgpu_ras_resume(tmp_adev); 5011 } else { 5012 r = -EINVAL; 5013 goto out; 5014 } 5015 5016 /* Update PSP FW topology after reset */ 5017 if (reset_context->hive && 5018 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5019 r = amdgpu_xgmi_update_topology( 5020 reset_context->hive, tmp_adev); 5021 } 5022 } 5023 5024 out: 5025 if (!r) { 5026 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5027 r = amdgpu_ib_ring_tests(tmp_adev); 5028 if (r) { 5029 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5030 need_full_reset = true; 5031 r = -EAGAIN; 5032 goto end; 5033 } 5034 } 5035 5036 if (!r) 5037 r = amdgpu_device_recover_vram(tmp_adev); 5038 else 5039 tmp_adev->asic_reset_res = r; 5040 } 5041 5042 end: 5043 if (need_full_reset) 5044 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5045 else 5046 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5047 return r; 5048 } 5049 5050 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5051 { 5052 5053 switch (amdgpu_asic_reset_method(adev)) { 5054 case AMD_RESET_METHOD_MODE1: 5055 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5056 break; 5057 case AMD_RESET_METHOD_MODE2: 5058 adev->mp1_state = PP_MP1_STATE_RESET; 5059 break; 5060 default: 5061 adev->mp1_state = PP_MP1_STATE_NONE; 5062 break; 5063 } 5064 } 5065 5066 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5067 { 5068 amdgpu_vf_error_trans_all(adev); 5069 adev->mp1_state = PP_MP1_STATE_NONE; 5070 } 5071 5072 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5073 { 5074 struct pci_dev *p = NULL; 5075 5076 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5077 adev->pdev->bus->number, 1); 5078 if (p) { 5079 pm_runtime_enable(&(p->dev)); 5080 pm_runtime_resume(&(p->dev)); 5081 } 5082 5083 pci_dev_put(p); 5084 } 5085 5086 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5087 { 5088 enum amd_reset_method reset_method; 5089 struct pci_dev *p = NULL; 5090 u64 expires; 5091 5092 /* 5093 * For now, only BACO and mode1 reset are confirmed 5094 * to suffer the audio issue without proper suspended. 5095 */ 5096 reset_method = amdgpu_asic_reset_method(adev); 5097 if ((reset_method != AMD_RESET_METHOD_BACO) && 5098 (reset_method != AMD_RESET_METHOD_MODE1)) 5099 return -EINVAL; 5100 5101 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5102 adev->pdev->bus->number, 1); 5103 if (!p) 5104 return -ENODEV; 5105 5106 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5107 if (!expires) 5108 /* 5109 * If we cannot get the audio device autosuspend delay, 5110 * a fixed 4S interval will be used. Considering 3S is 5111 * the audio controller default autosuspend delay setting. 
5112 * 4S used here is guaranteed to cover that. 5113 */ 5114 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5115 5116 while (!pm_runtime_status_suspended(&(p->dev))) { 5117 if (!pm_runtime_suspend(&(p->dev))) 5118 break; 5119 5120 if (expires < ktime_get_mono_fast_ns()) { 5121 dev_warn(adev->dev, "failed to suspend display audio\n"); 5122 pci_dev_put(p); 5123 /* TODO: abort the succeeding gpu reset? */ 5124 return -ETIMEDOUT; 5125 } 5126 } 5127 5128 pm_runtime_disable(&(p->dev)); 5129 5130 pci_dev_put(p); 5131 return 0; 5132 } 5133 5134 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5135 { 5136 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5137 5138 #if defined(CONFIG_DEBUG_FS) 5139 if (!amdgpu_sriov_vf(adev)) 5140 cancel_work(&adev->reset_work); 5141 #endif 5142 5143 if (adev->kfd.dev) 5144 cancel_work(&adev->kfd.reset_work); 5145 5146 if (amdgpu_sriov_vf(adev)) 5147 cancel_work(&adev->virt.flr_work); 5148 5149 if (con && adev->ras_enabled) 5150 cancel_work(&con->recovery_work); 5151 5152 } 5153 5154 /** 5155 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5156 * 5157 * @adev: amdgpu_device pointer 5158 * @job: which job trigger hang 5159 * @reset_context: amdgpu reset context pointer 5160 * 5161 * Attempt to reset the GPU if it has hung (all asics). 5162 * Attempt to do soft-reset or full-reset and reinitialize Asic 5163 * Returns 0 for success or an error on failure. 5164 */ 5165 5166 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5167 struct amdgpu_job *job, 5168 struct amdgpu_reset_context *reset_context) 5169 { 5170 struct list_head device_list, *device_list_handle = NULL; 5171 bool job_signaled = false; 5172 struct amdgpu_hive_info *hive = NULL; 5173 struct amdgpu_device *tmp_adev = NULL; 5174 int i, r = 0; 5175 bool need_emergency_restart = false; 5176 bool audio_suspended = false; 5177 bool gpu_reset_for_dev_remove = false; 5178 5179 gpu_reset_for_dev_remove = 5180 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5181 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5182 5183 /* 5184 * Special case: RAS triggered and full reset isn't supported 5185 */ 5186 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5187 5188 /* 5189 * Flush RAM to disk so that after reboot 5190 * the user can read log and see why the system rebooted. 5191 */ 5192 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5193 DRM_WARN("Emergency reboot."); 5194 5195 ksys_sync_helper(); 5196 emergency_restart(); 5197 } 5198 5199 dev_info(adev->dev, "GPU %s begin!\n", 5200 need_emergency_restart ? "jobs stop":"reset"); 5201 5202 if (!amdgpu_sriov_vf(adev)) 5203 hive = amdgpu_get_xgmi_hive(adev); 5204 if (hive) 5205 mutex_lock(&hive->hive_lock); 5206 5207 reset_context->job = job; 5208 reset_context->hive = hive; 5209 /* 5210 * Build list of devices to reset. 5211 * In case we are in XGMI hive mode, resort the device list 5212 * to put adev in the 1st position. 
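 * The shared reset domain is then locked exactly once, through the first
 * device in the list.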
	 */
	INIT_LIST_HEAD(&device_list);
	if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			list_add_tail(&tmp_adev->reset_list, &device_list);
			if (gpu_reset_for_dev_remove && adev->shutdown)
				tmp_adev->shutdown = true;
		}
		if (!list_is_first(&adev->reset_list, &device_list))
			list_rotate_to_front(&adev->reset_list, &device_list);
		device_list_handle = &device_list;
	} else {
		list_add_tail(&adev->reset_list, &device_list);
		device_list_handle = &device_list;
	}

	/* We need to lock the reset domain only once, both for XGMI and single device */
	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
				    reset_list);
	amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);

	/* block all schedulers and reset the given job's ring */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {

		amdgpu_device_set_mp1_state(tmp_adev);

		/*
		 * Try to put the audio codec into the suspend state
		 * before the gpu reset starts.
		 *
		 * The power domain of the graphics device is shared with
		 * the AZ power domain, so without this we may end up
		 * changing the audio hardware from behind the audio
		 * driver's back and trigger audio codec errors.
		 */
		if (!amdgpu_device_suspend_display_audio(tmp_adev))
			audio_suspended = true;

		amdgpu_ras_set_error_query_ready(tmp_adev, false);

		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);

		if (!amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_pre_reset(tmp_adev);

		/*
		 * Mark the ASICs to be reset as untracked first,
		 * and add them back after the reset has completed.
		 */
		amdgpu_unregister_gpu_instance(tmp_adev);

		drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);

		/* disable ras on ALL IPs */
		if (!need_emergency_restart &&
		      amdgpu_device_ip_need_full_reset(tmp_adev))
			amdgpu_ras_suspend(tmp_adev);

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, job ? &job->base : NULL);

			if (need_emergency_restart)
				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
		}
		atomic_inc(&tmp_adev->gpu_reset_counter);
	}

	if (need_emergency_restart)
		goto skip_sched_resume;

	/*
	 * Must check the guilty signal here since after this point all old
	 * HW fences are force signaled.
	 *
	 * job->base holds a reference to the parent fence.
	 */
	if (job && dma_fence_is_signaled(&job->hw_fence)) {
		job_signaled = true;
		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
		goto skip_hw_reset;
	}

retry:	/* Rest of adevs pre asic reset from XGMI hive. */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		if (gpu_reset_for_dev_remove) {
			/* Workaround for ASICs that need to disable SMC first */
			amdgpu_device_smu_fini_early(tmp_adev);
		}
		r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
		/* TODO: should we stop here? */
		if (r) {
			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
				  r, adev_to_drm(tmp_adev)->unique);
			tmp_adev->asic_reset_res = r;
		}

		/*
		 * Drop all pending non-scheduler resets. Scheduler resets
		 * were already dropped during drm_sched_stop().
		 */
		amdgpu_device_stop_pending_resets(tmp_adev);
	}

	/* Actual ASIC resets if needed. */
	/* The host driver will handle the XGMI hive reset for SR-IOV */
	if (amdgpu_sriov_vf(adev)) {
		r = amdgpu_device_reset_sriov(adev, job ? false : true);
		if (r)
			adev->asic_reset_res = r;

		/* Aldebaran and gfx_11_0_3 support RAS in SR-IOV, so RAS needs to be resumed during reset */
		if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) ||
		    adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3))
			amdgpu_ras_resume(adev);
	} else {
		r = amdgpu_do_asic_reset(device_list_handle, reset_context);
		if (r && r == -EAGAIN)
			goto retry;

		if (!r && gpu_reset_for_dev_remove)
			goto recover_end;
	}

skip_hw_reset:

	/* Post ASIC reset for all devs. */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_start(&ring->sched, true);
		}

		if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3))
			amdgpu_mes_self_test(tmp_adev);

		if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));

		if (tmp_adev->asic_reset_res)
			r = tmp_adev->asic_reset_res;

		tmp_adev->asic_reset_res = 0;

		if (r) {
			/* bad news, how do we tell it to userspace? */
			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
			if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
				DRM_WARN("smart shift update failed\n");
		}
	}

skip_sched_resume:
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		/* unlock kfd: SRIOV would do it separately */
		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);

		/* kfd_post_reset will do nothing if the kfd device is not
		 * initialized, so bring up kfd here if it was not initialized
		 * before.
		 */
		if (!adev->kfd.init_complete)
			amdgpu_amdkfd_device_init(adev);

		if (audio_suspended)
			amdgpu_device_resume_display_audio(tmp_adev);

		amdgpu_device_unset_mp1_state(tmp_adev);

		amdgpu_ras_set_error_query_ready(tmp_adev, true);
	}

recover_end:
	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
					    reset_list);
	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);

	if (hive) {
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}

	if (r)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);

	atomic_set(&adev->reset_domain->reset_res, r);
	return r;
}

/**
 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
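 * The amdgpu_pcie_gen_cap and amdgpu_pcie_lane_cap module parameters,
 * when set, override the probed values.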
5423 */ 5424 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5425 { 5426 struct pci_dev *pdev; 5427 enum pci_bus_speed speed_cap, platform_speed_cap; 5428 enum pcie_link_width platform_link_width; 5429 5430 if (amdgpu_pcie_gen_cap) 5431 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5432 5433 if (amdgpu_pcie_lane_cap) 5434 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5435 5436 /* covers APUs as well */ 5437 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 5438 if (adev->pm.pcie_gen_mask == 0) 5439 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5440 if (adev->pm.pcie_mlw_mask == 0) 5441 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5442 return; 5443 } 5444 5445 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5446 return; 5447 5448 pcie_bandwidth_available(adev->pdev, NULL, 5449 &platform_speed_cap, &platform_link_width); 5450 5451 if (adev->pm.pcie_gen_mask == 0) { 5452 /* asic caps */ 5453 pdev = adev->pdev; 5454 speed_cap = pcie_get_speed_cap(pdev); 5455 if (speed_cap == PCI_SPEED_UNKNOWN) { 5456 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5457 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5458 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5459 } else { 5460 if (speed_cap == PCIE_SPEED_32_0GT) 5461 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5462 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5463 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5464 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5465 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5466 else if (speed_cap == PCIE_SPEED_16_0GT) 5467 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5468 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5469 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5470 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5471 else if (speed_cap == PCIE_SPEED_8_0GT) 5472 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5473 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5474 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5475 else if (speed_cap == PCIE_SPEED_5_0GT) 5476 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5477 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5478 else 5479 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5480 } 5481 /* platform caps */ 5482 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5483 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5484 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5485 } else { 5486 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5487 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5488 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5489 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5490 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5491 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5492 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5493 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5494 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5495 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5496 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5497 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5498 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5499 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5500 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5501 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5502 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5503 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5504 else 5505 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5506 5507 } 5508 } 5509 if (adev->pm.pcie_mlw_mask == 0) { 5510 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) 
{ 5511 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5512 } else { 5513 switch (platform_link_width) { 5514 case PCIE_LNK_X32: 5515 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5516 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5517 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5518 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5519 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5520 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5521 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5522 break; 5523 case PCIE_LNK_X16: 5524 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5525 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5526 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5527 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5528 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5529 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5530 break; 5531 case PCIE_LNK_X12: 5532 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5533 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5534 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5535 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5536 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5537 break; 5538 case PCIE_LNK_X8: 5539 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5540 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5541 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5542 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5543 break; 5544 case PCIE_LNK_X4: 5545 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5546 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5547 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5548 break; 5549 case PCIE_LNK_X2: 5550 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5551 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5552 break; 5553 case PCIE_LNK_X1: 5554 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5555 break; 5556 default: 5557 break; 5558 } 5559 } 5560 } 5561 } 5562 5563 /** 5564 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 5565 * 5566 * @adev: amdgpu_device pointer 5567 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 5568 * 5569 * Return true if @peer_adev can access (DMA) @adev through the PCIe 5570 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 5571 * @peer_adev. 5572 */ 5573 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 5574 struct amdgpu_device *peer_adev) 5575 { 5576 #ifdef CONFIG_HSA_AMD_P2P 5577 uint64_t address_mask = peer_adev->dev->dma_mask ? 
5578 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 5579 resource_size_t aper_limit = 5580 adev->gmc.aper_base + adev->gmc.aper_size - 1; 5581 bool p2p_access = 5582 !adev->gmc.xgmi.connected_to_cpu && 5583 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 5584 5585 return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size && 5586 adev->gmc.real_vram_size == adev->gmc.visible_vram_size && 5587 !(adev->gmc.aper_base & address_mask || 5588 aper_limit & address_mask)); 5589 #else 5590 return false; 5591 #endif 5592 } 5593 5594 int amdgpu_device_baco_enter(struct drm_device *dev) 5595 { 5596 struct amdgpu_device *adev = drm_to_adev(dev); 5597 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5598 5599 if (!amdgpu_device_supports_baco(dev)) 5600 return -ENOTSUPP; 5601 5602 if (ras && adev->ras_enabled && 5603 adev->nbio.funcs->enable_doorbell_interrupt) 5604 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5605 5606 return amdgpu_dpm_baco_enter(adev); 5607 } 5608 5609 int amdgpu_device_baco_exit(struct drm_device *dev) 5610 { 5611 struct amdgpu_device *adev = drm_to_adev(dev); 5612 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5613 int ret = 0; 5614 5615 if (!amdgpu_device_supports_baco(dev)) 5616 return -ENOTSUPP; 5617 5618 ret = amdgpu_dpm_baco_exit(adev); 5619 if (ret) 5620 return ret; 5621 5622 if (ras && adev->ras_enabled && 5623 adev->nbio.funcs->enable_doorbell_interrupt) 5624 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5625 5626 if (amdgpu_passthrough(adev) && 5627 adev->nbio.funcs->clear_doorbell_interrupt) 5628 adev->nbio.funcs->clear_doorbell_interrupt(adev); 5629 5630 return 0; 5631 } 5632 5633 /** 5634 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5635 * @pdev: PCI device struct 5636 * @state: PCI channel state 5637 * 5638 * Description: Called when a PCI error is detected. 5639 * 5640 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
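 * PCI_ERS_RESULT_CAN_RECOVER may also be returned for pci_channel_io_normal,
 * where the device is still accessible.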
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		DRM_WARN("No support for XGMI hive yet...");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Locking adev->reset_domain->sem will prevent any external access
		 * to GPU during PCI error recovery
		 */
		amdgpu_device_lock_reset_domain(adev->reset_domain);
		amdgpu_device_set_mp1_state(adev);

		/*
		 * Block any work scheduling as we do for regular GPU reset
		 * for the duration of the recovery
		 */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, NULL);
		}
		atomic_inc(&adev->gpu_reset_counter);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{
	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/* This is called only if amdgpu_pci_error_detected() returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, no need to reset slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the PCI error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
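 *
 * Return: PCI_ERS_RESULT_RECOVERED if the ASIC came back and the reset
 * succeeded, PCI_ERS_RESULT_DISCONNECT otherwise.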
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for asic to come out of reset */
	msleep(500);

	/* Restore PCI config space */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d\n", r);
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's OK to
 * resume normal operation.
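 *
 * Restarts the ring schedulers that were stopped in
 * amdgpu_pci_error_detected() and releases the reset domain again; this is
 * a no-op unless the channel had been reported as frozen.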
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		drm_sched_start(&ring->sched, true);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
}

bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
			     struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
				  struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It helps to preserve the error context when an error occurs.
 * Compared to a simple hang, the system stays stable, at least for SSH
 * access, so it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc.),
 *    clears all CPU mappings to the device and disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
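 *
 * The update is done with cmpxchg() so that concurrent callers race safely:
 * if the currently installed gang leader has not signaled yet, a reference
 * to it is returned and the caller is expected to wait on it before trying
 * again. An illustrative (not prescriptive) caller pattern, with 'fence' as
 * a hypothetical local variable:
 *
 *	while ((fence = amdgpu_device_switch_gang(adev, gang))) {
 *		dma_fence_wait(fence, false);
 *		dma_fence_put(fence);
 *	}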
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!adev->ip_versions[DCE_HWIP][0] ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
				    uint32_t inst, uint32_t reg_addr, char reg_name[],
				    uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				 inst, reg_name, (uint32_t)expected_value,
				 (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}