1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 #include <linux/iommu.h> 34 #include <linux/pci.h> 35 #include <linux/devcoredump.h> 36 #include <generated/utsrelease.h> 37 #include <linux/pci-p2pdma.h> 38 #include <linux/apple-gmux.h> 39 40 #include <drm/drm_aperture.h> 41 #include <drm/drm_atomic_helper.h> 42 #include <drm/drm_crtc_helper.h> 43 #include <drm/drm_fb_helper.h> 44 #include <drm/drm_probe_helper.h> 45 #include <drm/amdgpu_drm.h> 46 #include <linux/device.h> 47 #include <linux/vgaarb.h> 48 #include <linux/vga_switcheroo.h> 49 #include <linux/efi.h> 50 #include "amdgpu.h" 51 #include "amdgpu_trace.h" 52 #include "amdgpu_i2c.h" 53 #include "atom.h" 54 #include "amdgpu_atombios.h" 55 #include "amdgpu_atomfirmware.h" 56 #include "amd_pcie.h" 57 #ifdef CONFIG_DRM_AMDGPU_SI 58 #include "si.h" 59 #endif 60 #ifdef CONFIG_DRM_AMDGPU_CIK 61 #include "cik.h" 62 #endif 63 #include "vi.h" 64 #include "soc15.h" 65 #include "nv.h" 66 #include "bif/bif_4_1_d.h" 67 #include <linux/firmware.h> 68 #include "amdgpu_vf_error.h" 69 70 #include "amdgpu_amdkfd.h" 71 #include "amdgpu_pm.h" 72 73 #include "amdgpu_xgmi.h" 74 #include "amdgpu_ras.h" 75 #include "amdgpu_pmu.h" 76 #include "amdgpu_fru_eeprom.h" 77 #include "amdgpu_reset.h" 78 79 #include <linux/suspend.h> 80 #include <drm/task_barrier.h> 81 #include <linux/pm_runtime.h> 82 83 #include <drm/drm_drv.h> 84 85 #if IS_ENABLED(CONFIG_X86) 86 #include <asm/intel-family.h> 87 #endif 88 89 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 90 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 91 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 95 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 96 97 #define AMDGPU_RESUME_MS 2000 98 #define AMDGPU_MAX_RETRY_LIMIT 2 99 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 100 101 static const struct drm_driver amdgpu_kms_driver; 102 103 const char *amdgpu_asic_name[] = { 104 "TAHITI", 105 "PITCAIRN", 106 "VERDE", 107 "OLAND", 108 "HAINAN", 109 
"BONAIRE", 110 "KAVERI", 111 "KABINI", 112 "HAWAII", 113 "MULLINS", 114 "TOPAZ", 115 "TONGA", 116 "FIJI", 117 "CARRIZO", 118 "STONEY", 119 "POLARIS10", 120 "POLARIS11", 121 "POLARIS12", 122 "VEGAM", 123 "VEGA10", 124 "VEGA12", 125 "VEGA20", 126 "RAVEN", 127 "ARCTURUS", 128 "RENOIR", 129 "ALDEBARAN", 130 "NAVI10", 131 "CYAN_SKILLFISH", 132 "NAVI14", 133 "NAVI12", 134 "SIENNA_CICHLID", 135 "NAVY_FLOUNDER", 136 "VANGOGH", 137 "DIMGREY_CAVEFISH", 138 "BEIGE_GOBY", 139 "YELLOW_CARP", 140 "IP DISCOVERY", 141 "LAST", 142 }; 143 144 /** 145 * DOC: pcie_replay_count 146 * 147 * The amdgpu driver provides a sysfs API for reporting the total number 148 * of PCIe replays (NAKs) 149 * The file pcie_replay_count is used for this and returns the total 150 * number of replays as a sum of the NAKs generated and NAKs received 151 */ 152 153 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 154 struct device_attribute *attr, char *buf) 155 { 156 struct drm_device *ddev = dev_get_drvdata(dev); 157 struct amdgpu_device *adev = drm_to_adev(ddev); 158 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 159 160 return sysfs_emit(buf, "%llu\n", cnt); 161 } 162 163 static DEVICE_ATTR(pcie_replay_count, 0444, 164 amdgpu_device_get_pcie_replay_count, NULL); 165 166 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 167 168 169 /** 170 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 171 * 172 * @dev: drm_device pointer 173 * 174 * Returns true if the device is a dGPU with ATPX power control, 175 * otherwise return false. 176 */ 177 bool amdgpu_device_supports_px(struct drm_device *dev) 178 { 179 struct amdgpu_device *adev = drm_to_adev(dev); 180 181 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 182 return true; 183 return false; 184 } 185 186 /** 187 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 188 * 189 * @dev: drm_device pointer 190 * 191 * Returns true if the device is a dGPU with ACPI power control, 192 * otherwise return false. 193 */ 194 bool amdgpu_device_supports_boco(struct drm_device *dev) 195 { 196 struct amdgpu_device *adev = drm_to_adev(dev); 197 198 if (adev->has_pr3 || 199 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 200 return true; 201 return false; 202 } 203 204 /** 205 * amdgpu_device_supports_baco - Does the device support BACO 206 * 207 * @dev: drm_device pointer 208 * 209 * Returns true if the device supporte BACO, 210 * otherwise return false. 211 */ 212 bool amdgpu_device_supports_baco(struct drm_device *dev) 213 { 214 struct amdgpu_device *adev = drm_to_adev(dev); 215 216 return amdgpu_asic_supports_baco(adev); 217 } 218 219 /** 220 * amdgpu_device_supports_smart_shift - Is the device dGPU with 221 * smart shift support 222 * 223 * @dev: drm_device pointer 224 * 225 * Returns true if the device is a dGPU with Smart Shift support, 226 * otherwise returns false. 
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must hold at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram via the vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must hold at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes that have been transferred.
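 *
 * A minimal caller sketch (illustrative only, mirroring what
 * amdgpu_device_vram_access() below actually does): copy what the aperture
 * can reach, then fall back to MM_INDEX/MM_DATA for the remainder:
 *
 *	copied = amdgpu_device_aper_access(adev, pos, buf, size, write);
 *	if (copied < size)
 *		amdgpu_device_mm_access(adev, pos + copied, buf + copied,
 *					size - copied, write);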
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure the HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe to the device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure the HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must hold at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore; if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
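 *
 * Most code reaches this helper through the register access macros in
 * amdgpu.h rather than calling it directly; a rough sketch of the usual
 * pattern (mmSOME_REG is a placeholder, not a real register name):
 *
 *	tmp = RREG32(mmSOME_REG);
 *	tmp = RREG32_NO_KIQ(mmSOME_REG);	/- NO_KIQ skips the KIQ path under SR-IOV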
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with byte helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with byte helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
481 */ 482 void amdgpu_device_wreg(struct amdgpu_device *adev, 483 uint32_t reg, uint32_t v, 484 uint32_t acc_flags) 485 { 486 if (amdgpu_device_skip_hw_access(adev)) 487 return; 488 489 if ((reg * 4) < adev->rmmio_size) { 490 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 491 amdgpu_sriov_runtime(adev) && 492 down_read_trylock(&adev->reset_domain->sem)) { 493 amdgpu_kiq_wreg(adev, reg, v); 494 up_read(&adev->reset_domain->sem); 495 } else { 496 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 497 } 498 } else { 499 adev->pcie_wreg(adev, reg * 4, v); 500 } 501 502 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 503 } 504 505 /** 506 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 507 * 508 * @adev: amdgpu_device pointer 509 * @reg: mmio/rlc register 510 * @v: value to write 511 * 512 * this function is invoked only for the debugfs register access 513 */ 514 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 515 uint32_t reg, uint32_t v, 516 uint32_t xcc_id) 517 { 518 if (amdgpu_device_skip_hw_access(adev)) 519 return; 520 521 if (amdgpu_sriov_fullaccess(adev) && 522 adev->gfx.rlc.funcs && 523 adev->gfx.rlc.funcs->is_rlcg_access_range) { 524 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 525 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 526 } else if ((reg * 4) >= adev->rmmio_size) { 527 adev->pcie_wreg(adev, reg * 4, v); 528 } else { 529 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 530 } 531 } 532 533 /** 534 * amdgpu_device_indirect_rreg - read an indirect register 535 * 536 * @adev: amdgpu_device pointer 537 * @reg_addr: indirect register address to read from 538 * 539 * Returns the value of indirect register @reg_addr 540 */ 541 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 542 u32 reg_addr) 543 { 544 unsigned long flags, pcie_index, pcie_data; 545 void __iomem *pcie_index_offset; 546 void __iomem *pcie_data_offset; 547 u32 r; 548 549 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 550 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 551 552 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 553 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 554 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 555 556 writel(reg_addr, pcie_index_offset); 557 readl(pcie_index_offset); 558 r = readl(pcie_data_offset); 559 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 560 561 return r; 562 } 563 564 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 565 u64 reg_addr) 566 { 567 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 568 u32 r; 569 void __iomem *pcie_index_offset; 570 void __iomem *pcie_index_hi_offset; 571 void __iomem *pcie_data_offset; 572 573 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 574 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 575 if (adev->nbio.funcs->get_pcie_index_hi_offset) 576 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 577 else 578 pcie_index_hi = 0; 579 580 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 581 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 582 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 583 if (pcie_index_hi != 0) 584 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 585 pcie_index_hi * 4; 586 587 writel(reg_addr, pcie_index_offset); 588 readl(pcie_index_offset); 589 if (pcie_index_hi != 0) { 590 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 591 readl(pcie_index_hi_offset); 592 
} 593 r = readl(pcie_data_offset); 594 595 /* clear the high bits */ 596 if (pcie_index_hi != 0) { 597 writel(0, pcie_index_hi_offset); 598 readl(pcie_index_hi_offset); 599 } 600 601 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 602 603 return r; 604 } 605 606 /** 607 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 608 * 609 * @adev: amdgpu_device pointer 610 * @reg_addr: indirect register address to read from 611 * 612 * Returns the value of indirect register @reg_addr 613 */ 614 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 615 u32 reg_addr) 616 { 617 unsigned long flags, pcie_index, pcie_data; 618 void __iomem *pcie_index_offset; 619 void __iomem *pcie_data_offset; 620 u64 r; 621 622 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 623 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 624 625 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 626 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 627 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 628 629 /* read low 32 bits */ 630 writel(reg_addr, pcie_index_offset); 631 readl(pcie_index_offset); 632 r = readl(pcie_data_offset); 633 /* read high 32 bits */ 634 writel(reg_addr + 4, pcie_index_offset); 635 readl(pcie_index_offset); 636 r |= ((u64)readl(pcie_data_offset) << 32); 637 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 638 639 return r; 640 } 641 642 /** 643 * amdgpu_device_indirect_wreg - write an indirect register address 644 * 645 * @adev: amdgpu_device pointer 646 * @reg_addr: indirect register offset 647 * @reg_data: indirect register data 648 * 649 */ 650 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 651 u32 reg_addr, u32 reg_data) 652 { 653 unsigned long flags, pcie_index, pcie_data; 654 void __iomem *pcie_index_offset; 655 void __iomem *pcie_data_offset; 656 657 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 658 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 659 660 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 661 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 662 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 663 664 writel(reg_addr, pcie_index_offset); 665 readl(pcie_index_offset); 666 writel(reg_data, pcie_data_offset); 667 readl(pcie_data_offset); 668 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 669 } 670 671 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 672 u64 reg_addr, u32 reg_data) 673 { 674 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 675 void __iomem *pcie_index_offset; 676 void __iomem *pcie_index_hi_offset; 677 void __iomem *pcie_data_offset; 678 679 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 680 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 681 if (adev->nbio.funcs->get_pcie_index_hi_offset) 682 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 683 else 684 pcie_index_hi = 0; 685 686 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 687 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 688 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 689 if (pcie_index_hi != 0) 690 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 691 pcie_index_hi * 4; 692 693 writel(reg_addr, pcie_index_offset); 694 readl(pcie_index_offset); 695 if (pcie_index_hi != 0) { 696 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 697 readl(pcie_index_hi_offset); 698 } 699 writel(reg_data, pcie_data_offset); 700 readl(pcie_data_offset); 701 702 /* clear 
the high bits */ 703 if (pcie_index_hi != 0) { 704 writel(0, pcie_index_hi_offset); 705 readl(pcie_index_hi_offset); 706 } 707 708 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 709 } 710 711 /** 712 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 713 * 714 * @adev: amdgpu_device pointer 715 * @reg_addr: indirect register offset 716 * @reg_data: indirect register data 717 * 718 */ 719 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 720 u32 reg_addr, u64 reg_data) 721 { 722 unsigned long flags, pcie_index, pcie_data; 723 void __iomem *pcie_index_offset; 724 void __iomem *pcie_data_offset; 725 726 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 727 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 728 729 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 730 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 731 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 732 733 /* write low 32 bits */ 734 writel(reg_addr, pcie_index_offset); 735 readl(pcie_index_offset); 736 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 737 readl(pcie_data_offset); 738 /* write high 32 bits */ 739 writel(reg_addr + 4, pcie_index_offset); 740 readl(pcie_index_offset); 741 writel((u32)(reg_data >> 32), pcie_data_offset); 742 readl(pcie_data_offset); 743 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 744 } 745 746 /** 747 * amdgpu_device_get_rev_id - query device rev_id 748 * 749 * @adev: amdgpu_device pointer 750 * 751 * Return device rev_id 752 */ 753 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 754 { 755 return adev->nbio.funcs->get_rev_id(adev); 756 } 757 758 /** 759 * amdgpu_invalid_rreg - dummy reg read function 760 * 761 * @adev: amdgpu_device pointer 762 * @reg: offset of register 763 * 764 * Dummy register read function. Used for register blocks 765 * that certain asics don't have (all asics). 766 * Returns the value in the register. 767 */ 768 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 769 { 770 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 771 BUG(); 772 return 0; 773 } 774 775 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) 776 { 777 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 778 BUG(); 779 return 0; 780 } 781 782 /** 783 * amdgpu_invalid_wreg - dummy reg write function 784 * 785 * @adev: amdgpu_device pointer 786 * @reg: offset of register 787 * @v: value to write to the register 788 * 789 * Dummy register read function. Used for register blocks 790 * that certain asics don't have (all asics). 791 */ 792 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 793 { 794 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 795 reg, v); 796 BUG(); 797 } 798 799 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) 800 { 801 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n", 802 reg, v); 803 BUG(); 804 } 805 806 /** 807 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 808 * 809 * @adev: amdgpu_device pointer 810 * @reg: offset of register 811 * 812 * Dummy register read function. Used for register blocks 813 * that certain asics don't have (all asics). 814 * Returns the value in the register. 
815 */ 816 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 817 { 818 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 819 BUG(); 820 return 0; 821 } 822 823 /** 824 * amdgpu_invalid_wreg64 - dummy reg write function 825 * 826 * @adev: amdgpu_device pointer 827 * @reg: offset of register 828 * @v: value to write to the register 829 * 830 * Dummy register read function. Used for register blocks 831 * that certain asics don't have (all asics). 832 */ 833 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 834 { 835 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 836 reg, v); 837 BUG(); 838 } 839 840 /** 841 * amdgpu_block_invalid_rreg - dummy reg read function 842 * 843 * @adev: amdgpu_device pointer 844 * @block: offset of instance 845 * @reg: offset of register 846 * 847 * Dummy register read function. Used for register blocks 848 * that certain asics don't have (all asics). 849 * Returns the value in the register. 850 */ 851 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 852 uint32_t block, uint32_t reg) 853 { 854 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 855 reg, block); 856 BUG(); 857 return 0; 858 } 859 860 /** 861 * amdgpu_block_invalid_wreg - dummy reg write function 862 * 863 * @adev: amdgpu_device pointer 864 * @block: offset of instance 865 * @reg: offset of register 866 * @v: value to write to the register 867 * 868 * Dummy register read function. Used for register blocks 869 * that certain asics don't have (all asics). 870 */ 871 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 872 uint32_t block, 873 uint32_t reg, uint32_t v) 874 { 875 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 876 reg, block, v); 877 BUG(); 878 } 879 880 /** 881 * amdgpu_device_asic_init - Wrapper for atom asic_init 882 * 883 * @adev: amdgpu_device pointer 884 * 885 * Does any asic specific work and then calls atom asic init. 886 */ 887 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 888 { 889 int ret; 890 891 amdgpu_asic_pre_asic_init(adev); 892 893 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) || 894 adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) { 895 amdgpu_psp_wait_for_bootloader(adev); 896 ret = amdgpu_atomfirmware_asic_init(adev, true); 897 return ret; 898 } else { 899 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 900 } 901 902 return 0; 903 } 904 905 /** 906 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page 907 * 908 * @adev: amdgpu_device pointer 909 * 910 * Allocates a scratch page of VRAM for use by various things in the 911 * driver. 912 */ 913 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev) 914 { 915 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE, 916 AMDGPU_GEM_DOMAIN_VRAM | 917 AMDGPU_GEM_DOMAIN_GTT, 918 &adev->mem_scratch.robj, 919 &adev->mem_scratch.gpu_addr, 920 (void **)&adev->mem_scratch.ptr); 921 } 922 923 /** 924 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page 925 * 926 * @adev: amdgpu_device pointer 927 * 928 * Frees the VRAM scratch page. 929 */ 930 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev) 931 { 932 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL); 933 } 934 935 /** 936 * amdgpu_device_program_register_sequence - program an array of registers. 
937 * 938 * @adev: amdgpu_device pointer 939 * @registers: pointer to the register array 940 * @array_size: size of the register array 941 * 942 * Programs an array or registers with and or masks. 943 * This is a helper for setting golden registers. 944 */ 945 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 946 const u32 *registers, 947 const u32 array_size) 948 { 949 u32 tmp, reg, and_mask, or_mask; 950 int i; 951 952 if (array_size % 3) 953 return; 954 955 for (i = 0; i < array_size; i += 3) { 956 reg = registers[i + 0]; 957 and_mask = registers[i + 1]; 958 or_mask = registers[i + 2]; 959 960 if (and_mask == 0xffffffff) { 961 tmp = or_mask; 962 } else { 963 tmp = RREG32(reg); 964 tmp &= ~and_mask; 965 if (adev->family >= AMDGPU_FAMILY_AI) 966 tmp |= (or_mask & and_mask); 967 else 968 tmp |= or_mask; 969 } 970 WREG32(reg, tmp); 971 } 972 } 973 974 /** 975 * amdgpu_device_pci_config_reset - reset the GPU 976 * 977 * @adev: amdgpu_device pointer 978 * 979 * Resets the GPU using the pci config reset sequence. 980 * Only applicable to asics prior to vega10. 981 */ 982 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 983 { 984 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 985 } 986 987 /** 988 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 989 * 990 * @adev: amdgpu_device pointer 991 * 992 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 993 */ 994 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 995 { 996 return pci_reset_function(adev->pdev); 997 } 998 999 /* 1000 * amdgpu_device_wb_*() 1001 * Writeback is the method by which the GPU updates special pages in memory 1002 * with the status of certain GPU events (fences, ring pointers,etc.). 1003 */ 1004 1005 /** 1006 * amdgpu_device_wb_fini - Disable Writeback and free memory 1007 * 1008 * @adev: amdgpu_device pointer 1009 * 1010 * Disables Writeback and frees the Writeback memory (all asics). 1011 * Used at driver shutdown. 1012 */ 1013 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1014 { 1015 if (adev->wb.wb_obj) { 1016 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1017 &adev->wb.gpu_addr, 1018 (void **)&adev->wb.wb); 1019 adev->wb.wb_obj = NULL; 1020 } 1021 } 1022 1023 /** 1024 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1025 * 1026 * @adev: amdgpu_device pointer 1027 * 1028 * Initializes writeback and allocates writeback memory (all asics). 1029 * Used at driver startup. 1030 * Returns 0 on success or an -error on failure. 1031 */ 1032 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1033 { 1034 int r; 1035 1036 if (adev->wb.wb_obj == NULL) { 1037 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1038 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1039 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1040 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1041 (void **)&adev->wb.wb); 1042 if (r) { 1043 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1044 return r; 1045 } 1046 1047 adev->wb.num_wb = AMDGPU_MAX_WB; 1048 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1049 1050 /* clear wb memory */ 1051 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1052 } 1053 1054 return 0; 1055 } 1056 1057 /** 1058 * amdgpu_device_wb_get - Allocate a wb entry 1059 * 1060 * @adev: amdgpu_device pointer 1061 * @wb: wb index 1062 * 1063 * Allocate a wb slot for use by the driver (all asics). 1064 * Returns 0 on success or -EINVAL on failure. 
1065 */ 1066 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1067 { 1068 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1069 1070 if (offset < adev->wb.num_wb) { 1071 __set_bit(offset, adev->wb.used); 1072 *wb = offset << 3; /* convert to dw offset */ 1073 return 0; 1074 } else { 1075 return -EINVAL; 1076 } 1077 } 1078 1079 /** 1080 * amdgpu_device_wb_free - Free a wb entry 1081 * 1082 * @adev: amdgpu_device pointer 1083 * @wb: wb index 1084 * 1085 * Free a wb slot allocated for use by the driver (all asics) 1086 */ 1087 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1088 { 1089 wb >>= 3; 1090 if (wb < adev->wb.num_wb) 1091 __clear_bit(wb, adev->wb.used); 1092 } 1093 1094 /** 1095 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1096 * 1097 * @adev: amdgpu_device pointer 1098 * 1099 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1100 * to fail, but if any of the BARs is not accessible after the size we abort 1101 * driver loading by returning -ENODEV. 1102 */ 1103 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1104 { 1105 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1106 struct pci_bus *root; 1107 struct resource *res; 1108 unsigned int i; 1109 u16 cmd; 1110 int r; 1111 1112 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1113 return 0; 1114 1115 /* Bypass for VF */ 1116 if (amdgpu_sriov_vf(adev)) 1117 return 0; 1118 1119 /* skip if the bios has already enabled large BAR */ 1120 if (adev->gmc.real_vram_size && 1121 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1122 return 0; 1123 1124 /* Check if the root BUS has 64bit memory resources */ 1125 root = adev->pdev->bus; 1126 while (root->parent) 1127 root = root->parent; 1128 1129 pci_bus_for_each_resource(root, res, i) { 1130 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1131 res->start > 0x100000000ull) 1132 break; 1133 } 1134 1135 /* Trying to resize is pointless without a root hub window above 4GB */ 1136 if (!res) 1137 return 0; 1138 1139 /* Limit the BAR size to what is available */ 1140 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1141 rbar_size); 1142 1143 /* Disable memory decoding while we change the BAR addresses and size */ 1144 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1145 pci_write_config_word(adev->pdev, PCI_COMMAND, 1146 cmd & ~PCI_COMMAND_MEMORY); 1147 1148 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1149 amdgpu_doorbell_fini(adev); 1150 if (adev->asic_type >= CHIP_BONAIRE) 1151 pci_release_resource(adev->pdev, 2); 1152 1153 pci_release_resource(adev->pdev, 0); 1154 1155 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1156 if (r == -ENOSPC) 1157 DRM_INFO("Not enough PCI address space for a large BAR."); 1158 else if (r && r != -ENOTSUPP) 1159 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1160 1161 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1162 1163 /* When the doorbell or fb BAR isn't available we have no chance of 1164 * using the device. 
 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return false;

	return true;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw needs to be posted or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup,
 * or whether a post is needed because a hw reset was performed.
 * Returns true if a post is needed, false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (!amdgpu_device_read_bios(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: in the whole-GPU pass-through virtualization case, after a VM
		 * reboot some old SMC firmware still needs the driver to do a vPost,
		 * otherwise the GPU hangs. SMC firmware versions at or above 22.15 don't
		 * have this flaw, so we force a vPost for SMC versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if an error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset the whole hive on init */
	if (adev->gmc.xgmi.pending_reset)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic
 * speed switching. Until we have confirmation from Intel that a specific host
 * supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
bool amdgpu_device_pcie_dynamic_switching_supported(void)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
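 *
 * The decision follows the amdgpu_aspm module parameter as handled in the
 * switch below: -1 defers to whatever the platform/bridge already enabled
 * (pcie_aspm_enabled()), 0 forces ASPM off, and 1 forces it on.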
1276 */ 1277 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1278 { 1279 switch (amdgpu_aspm) { 1280 case -1: 1281 break; 1282 case 0: 1283 return false; 1284 case 1: 1285 return true; 1286 default: 1287 return false; 1288 } 1289 return pcie_aspm_enabled(adev->pdev); 1290 } 1291 1292 bool amdgpu_device_aspm_support_quirk(void) 1293 { 1294 #if IS_ENABLED(CONFIG_X86) 1295 struct cpuinfo_x86 *c = &cpu_data(0); 1296 1297 return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE); 1298 #else 1299 return true; 1300 #endif 1301 } 1302 1303 /* if we get transitioned to only one device, take VGA back */ 1304 /** 1305 * amdgpu_device_vga_set_decode - enable/disable vga decode 1306 * 1307 * @pdev: PCI device pointer 1308 * @state: enable/disable vga decode 1309 * 1310 * Enable/disable vga decode (all asics). 1311 * Returns VGA resource flags. 1312 */ 1313 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1314 bool state) 1315 { 1316 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1317 1318 amdgpu_asic_set_vga_state(adev, state); 1319 if (state) 1320 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1321 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1322 else 1323 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1324 } 1325 1326 /** 1327 * amdgpu_device_check_block_size - validate the vm block size 1328 * 1329 * @adev: amdgpu_device pointer 1330 * 1331 * Validates the vm block size specified via module parameter. 1332 * The vm block size defines number of bits in page table versus page directory, 1333 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1334 * page table and the remaining bits are in the page directory. 1335 */ 1336 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1337 { 1338 /* defines number of bits in page table versus page directory, 1339 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1340 * page table and the remaining bits are in the page directory 1341 */ 1342 if (amdgpu_vm_block_size == -1) 1343 return; 1344 1345 if (amdgpu_vm_block_size < 9) { 1346 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1347 amdgpu_vm_block_size); 1348 amdgpu_vm_block_size = -1; 1349 } 1350 } 1351 1352 /** 1353 * amdgpu_device_check_vm_size - validate the vm size 1354 * 1355 * @adev: amdgpu_device pointer 1356 * 1357 * Validates the vm size in GB specified via module parameter. 1358 * The VM size is the size of the GPU virtual memory space in GB. 
1359 */ 1360 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1361 { 1362 /* no need to check the default value */ 1363 if (amdgpu_vm_size == -1) 1364 return; 1365 1366 if (amdgpu_vm_size < 1) { 1367 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1368 amdgpu_vm_size); 1369 amdgpu_vm_size = -1; 1370 } 1371 } 1372 1373 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1374 { 1375 struct sysinfo si; 1376 bool is_os_64 = (sizeof(void *) == 8); 1377 uint64_t total_memory; 1378 uint64_t dram_size_seven_GB = 0x1B8000000; 1379 uint64_t dram_size_three_GB = 0xB8000000; 1380 1381 if (amdgpu_smu_memory_pool_size == 0) 1382 return; 1383 1384 if (!is_os_64) { 1385 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1386 goto def_value; 1387 } 1388 si_meminfo(&si); 1389 total_memory = (uint64_t)si.totalram * si.mem_unit; 1390 1391 if ((amdgpu_smu_memory_pool_size == 1) || 1392 (amdgpu_smu_memory_pool_size == 2)) { 1393 if (total_memory < dram_size_three_GB) 1394 goto def_value1; 1395 } else if ((amdgpu_smu_memory_pool_size == 4) || 1396 (amdgpu_smu_memory_pool_size == 8)) { 1397 if (total_memory < dram_size_seven_GB) 1398 goto def_value1; 1399 } else { 1400 DRM_WARN("Smu memory pool size not supported\n"); 1401 goto def_value; 1402 } 1403 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1404 1405 return; 1406 1407 def_value1: 1408 DRM_WARN("No enough system memory\n"); 1409 def_value: 1410 adev->pm.smu_prv_buffer_size = 0; 1411 } 1412 1413 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1414 { 1415 if (!(adev->flags & AMD_IS_APU) || 1416 adev->asic_type < CHIP_RAVEN) 1417 return 0; 1418 1419 switch (adev->asic_type) { 1420 case CHIP_RAVEN: 1421 if (adev->pdev->device == 0x15dd) 1422 adev->apu_flags |= AMD_APU_IS_RAVEN; 1423 if (adev->pdev->device == 0x15d8) 1424 adev->apu_flags |= AMD_APU_IS_PICASSO; 1425 break; 1426 case CHIP_RENOIR: 1427 if ((adev->pdev->device == 0x1636) || 1428 (adev->pdev->device == 0x164c)) 1429 adev->apu_flags |= AMD_APU_IS_RENOIR; 1430 else 1431 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1432 break; 1433 case CHIP_VANGOGH: 1434 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1435 break; 1436 case CHIP_YELLOW_CARP: 1437 break; 1438 case CHIP_CYAN_SKILLFISH: 1439 if ((adev->pdev->device == 0x13FE) || 1440 (adev->pdev->device == 0x143F)) 1441 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1442 break; 1443 default: 1444 break; 1445 } 1446 1447 return 0; 1448 } 1449 1450 /** 1451 * amdgpu_device_check_arguments - validate module params 1452 * 1453 * @adev: amdgpu_device pointer 1454 * 1455 * Validates certain module parameters and updates 1456 * the associated values used by the driver (all asics). 
1457 */ 1458 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1459 { 1460 if (amdgpu_sched_jobs < 4) { 1461 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1462 amdgpu_sched_jobs); 1463 amdgpu_sched_jobs = 4; 1464 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 1465 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1466 amdgpu_sched_jobs); 1467 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1468 } 1469 1470 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1471 /* gart size must be greater or equal to 32M */ 1472 dev_warn(adev->dev, "gart size (%d) too small\n", 1473 amdgpu_gart_size); 1474 amdgpu_gart_size = -1; 1475 } 1476 1477 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1478 /* gtt size must be greater or equal to 32M */ 1479 dev_warn(adev->dev, "gtt size (%d) too small\n", 1480 amdgpu_gtt_size); 1481 amdgpu_gtt_size = -1; 1482 } 1483 1484 /* valid range is between 4 and 9 inclusive */ 1485 if (amdgpu_vm_fragment_size != -1 && 1486 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1487 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1488 amdgpu_vm_fragment_size = -1; 1489 } 1490 1491 if (amdgpu_sched_hw_submission < 2) { 1492 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1493 amdgpu_sched_hw_submission); 1494 amdgpu_sched_hw_submission = 2; 1495 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1496 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1497 amdgpu_sched_hw_submission); 1498 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1499 } 1500 1501 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 1502 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 1503 amdgpu_reset_method = -1; 1504 } 1505 1506 amdgpu_device_check_smu_prv_buffer_size(adev); 1507 1508 amdgpu_device_check_vm_size(adev); 1509 1510 amdgpu_device_check_block_size(adev); 1511 1512 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1513 1514 return 0; 1515 } 1516 1517 /** 1518 * amdgpu_switcheroo_set_state - set switcheroo state 1519 * 1520 * @pdev: pci dev pointer 1521 * @state: vga_switcheroo state 1522 * 1523 * Callback for the switcheroo driver. Suspends or resumes 1524 * the asics before or after it is powered up using ACPI methods. 
1525 */ 1526 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1527 enum vga_switcheroo_state state) 1528 { 1529 struct drm_device *dev = pci_get_drvdata(pdev); 1530 int r; 1531 1532 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1533 return; 1534 1535 if (state == VGA_SWITCHEROO_ON) { 1536 pr_info("switched on\n"); 1537 /* don't suspend or resume card normally */ 1538 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1539 1540 pci_set_power_state(pdev, PCI_D0); 1541 amdgpu_device_load_pci_state(pdev); 1542 r = pci_enable_device(pdev); 1543 if (r) 1544 DRM_WARN("pci_enable_device failed (%d)\n", r); 1545 amdgpu_device_resume(dev, true); 1546 1547 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1548 } else { 1549 pr_info("switched off\n"); 1550 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1551 amdgpu_device_suspend(dev, true); 1552 amdgpu_device_cache_pci_state(pdev); 1553 /* Shut down the device */ 1554 pci_disable_device(pdev); 1555 pci_set_power_state(pdev, PCI_D3cold); 1556 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1557 } 1558 } 1559 1560 /** 1561 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1562 * 1563 * @pdev: pci dev pointer 1564 * 1565 * Callback for the switcheroo driver. Check of the switcheroo 1566 * state can be changed. 1567 * Returns true if the state can be changed, false if not. 1568 */ 1569 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1570 { 1571 struct drm_device *dev = pci_get_drvdata(pdev); 1572 1573 /* 1574 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1575 * locking inversion with the driver load path. And the access here is 1576 * completely racy anyway. So don't bother with locking for now. 1577 */ 1578 return atomic_read(&dev->open_count) == 0; 1579 } 1580 1581 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1582 .set_gpu_state = amdgpu_switcheroo_set_state, 1583 .reprobe = NULL, 1584 .can_switch = amdgpu_switcheroo_can_switch, 1585 }; 1586 1587 /** 1588 * amdgpu_device_ip_set_clockgating_state - set the CG state 1589 * 1590 * @dev: amdgpu_device pointer 1591 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1592 * @state: clockgating state (gate or ungate) 1593 * 1594 * Sets the requested clockgating state for all instances of 1595 * the hardware IP specified. 1596 * Returns the error code from the last instance. 1597 */ 1598 int amdgpu_device_ip_set_clockgating_state(void *dev, 1599 enum amd_ip_block_type block_type, 1600 enum amd_clockgating_state state) 1601 { 1602 struct amdgpu_device *adev = dev; 1603 int i, r = 0; 1604 1605 for (i = 0; i < adev->num_ip_blocks; i++) { 1606 if (!adev->ip_blocks[i].status.valid) 1607 continue; 1608 if (adev->ip_blocks[i].version->type != block_type) 1609 continue; 1610 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1611 continue; 1612 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1613 (void *)adev, state); 1614 if (r) 1615 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1616 adev->ip_blocks[i].version->funcs->name, r); 1617 } 1618 return r; 1619 } 1620 1621 /** 1622 * amdgpu_device_ip_set_powergating_state - set the PG state 1623 * 1624 * @dev: amdgpu_device pointer 1625 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1626 * @state: powergating state (gate or ungate) 1627 * 1628 * Sets the requested powergating state for all instances of 1629 * the hardware IP specified. 1630 * Returns the error code from the last instance. 
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u64 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;
}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;
}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
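 *
 * A hypothetical lookup sketch (everything other than the function itself
 * and AMD_IP_BLOCK_TYPE_GFX is illustrative):
 *
 *	struct amdgpu_ip_block *ip =
 *		amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *	if (ip)
 *		DRM_INFO("GFX IP v%u.%u\n", ip->version->major,
 *			 ip->version->minor);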
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * Returns 0 if the asic's IP block version is equal to or greater than the
 * requested version, or 1 if it is smaller or the ip_block doesn't exist.
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	switch (ip_block_version->type) {
	case AMD_IP_BLOCK_TYPE_VCN:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
			return 0;
		break;
	case AMD_IP_BLOCK_TYPE_JPEG:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
			return 0;
		break;
	default:
		break;
	}

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
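 *
 * The string is parsed as a ';'-separated list of "<pci address>[,<crtcs>]"
 * entries (or "all"); for example an assumed invocation such as
 * amdgpu.virtual_display=0000:03:00.0,2 would enable two virtual crtcs on
 * that device, see the strsep()/kstrtol() handling below.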
1825 */ 1826 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 1827 { 1828 adev->enable_virtual_display = false; 1829 1830 if (amdgpu_virtual_display) { 1831 const char *pci_address_name = pci_name(adev->pdev); 1832 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 1833 1834 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 1835 pciaddstr_tmp = pciaddstr; 1836 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 1837 pciaddname = strsep(&pciaddname_tmp, ","); 1838 if (!strcmp("all", pciaddname) 1839 || !strcmp(pci_address_name, pciaddname)) { 1840 long num_crtc; 1841 int res = -1; 1842 1843 adev->enable_virtual_display = true; 1844 1845 if (pciaddname_tmp) 1846 res = kstrtol(pciaddname_tmp, 10, 1847 &num_crtc); 1848 1849 if (!res) { 1850 if (num_crtc < 1) 1851 num_crtc = 1; 1852 if (num_crtc > 6) 1853 num_crtc = 6; 1854 adev->mode_info.num_crtc = num_crtc; 1855 } else { 1856 adev->mode_info.num_crtc = 1; 1857 } 1858 break; 1859 } 1860 } 1861 1862 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 1863 amdgpu_virtual_display, pci_address_name, 1864 adev->enable_virtual_display, adev->mode_info.num_crtc); 1865 1866 kfree(pciaddstr); 1867 } 1868 } 1869 1870 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 1871 { 1872 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 1873 adev->mode_info.num_crtc = 1; 1874 adev->enable_virtual_display = true; 1875 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 1876 adev->enable_virtual_display, adev->mode_info.num_crtc); 1877 } 1878 } 1879 1880 /** 1881 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 1882 * 1883 * @adev: amdgpu_device pointer 1884 * 1885 * Parses the asic configuration parameters specified in the gpu info 1886 * firmware and makes them availale to the driver for use in configuring 1887 * the asic. 1888 * Returns 0 on success, -EINVAL on failure. 1889 */ 1890 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 1891 { 1892 const char *chip_name; 1893 char fw_name[40]; 1894 int err; 1895 const struct gpu_info_firmware_header_v1_0 *hdr; 1896 1897 adev->firmware.gpu_info_fw = NULL; 1898 1899 if (adev->mman.discovery_bin) { 1900 /* 1901 * FIXME: The bounding box is still needed by Navi12, so 1902 * temporarily read it from gpu_info firmware. Should be dropped 1903 * when DAL no longer needs it. 
1904 */ 1905 if (adev->asic_type != CHIP_NAVI12) 1906 return 0; 1907 } 1908 1909 switch (adev->asic_type) { 1910 default: 1911 return 0; 1912 case CHIP_VEGA10: 1913 chip_name = "vega10"; 1914 break; 1915 case CHIP_VEGA12: 1916 chip_name = "vega12"; 1917 break; 1918 case CHIP_RAVEN: 1919 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1920 chip_name = "raven2"; 1921 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1922 chip_name = "picasso"; 1923 else 1924 chip_name = "raven"; 1925 break; 1926 case CHIP_ARCTURUS: 1927 chip_name = "arcturus"; 1928 break; 1929 case CHIP_NAVI12: 1930 chip_name = "navi12"; 1931 break; 1932 } 1933 1934 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 1935 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name); 1936 if (err) { 1937 dev_err(adev->dev, 1938 "Failed to get gpu_info firmware \"%s\"\n", 1939 fw_name); 1940 goto out; 1941 } 1942 1943 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 1944 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 1945 1946 switch (hdr->version_major) { 1947 case 1: 1948 { 1949 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 1950 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 1951 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1952 1953 /* 1954 * Should be droped when DAL no longer needs it. 1955 */ 1956 if (adev->asic_type == CHIP_NAVI12) 1957 goto parse_soc_bounding_box; 1958 1959 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 1960 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 1961 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 1962 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 1963 adev->gfx.config.max_texture_channel_caches = 1964 le32_to_cpu(gpu_info_fw->gc_num_tccs); 1965 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 1966 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 1967 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 1968 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 1969 adev->gfx.config.double_offchip_lds_buf = 1970 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 1971 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 1972 adev->gfx.cu_info.max_waves_per_simd = 1973 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 1974 adev->gfx.cu_info.max_scratch_slots_per_cu = 1975 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 1976 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 1977 if (hdr->version_minor >= 1) { 1978 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 1979 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 1980 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1981 adev->gfx.config.num_sc_per_sh = 1982 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 1983 adev->gfx.config.num_packer_per_sc = 1984 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 1985 } 1986 1987 parse_soc_bounding_box: 1988 /* 1989 * soc bounding box info is not integrated in disocovery table, 1990 * we always need to parse it from gpu info firmware if needed. 
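 * (For Navi12, for example, this is why amdgpu/navi12_gpu_info.bin is still
 * requested even though the rest of the configuration comes from the IP
 * discovery table: DAL reads its bounding box from that file.)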
1991 */ 1992 if (hdr->version_minor == 2) { 1993 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 1994 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 1995 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1996 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 1997 } 1998 break; 1999 } 2000 default: 2001 dev_err(adev->dev, 2002 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2003 err = -EINVAL; 2004 goto out; 2005 } 2006 out: 2007 return err; 2008 } 2009 2010 /** 2011 * amdgpu_device_ip_early_init - run early init for hardware IPs 2012 * 2013 * @adev: amdgpu_device pointer 2014 * 2015 * Early initialization pass for hardware IPs. The hardware IPs that make 2016 * up each asic are discovered each IP's early_init callback is run. This 2017 * is the first stage in initializing the asic. 2018 * Returns 0 on success, negative error code on failure. 2019 */ 2020 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2021 { 2022 struct pci_dev *parent; 2023 int i, r; 2024 bool total; 2025 2026 amdgpu_device_enable_virtual_display(adev); 2027 2028 if (amdgpu_sriov_vf(adev)) { 2029 r = amdgpu_virt_request_full_gpu(adev, true); 2030 if (r) 2031 return r; 2032 } 2033 2034 switch (adev->asic_type) { 2035 #ifdef CONFIG_DRM_AMDGPU_SI 2036 case CHIP_VERDE: 2037 case CHIP_TAHITI: 2038 case CHIP_PITCAIRN: 2039 case CHIP_OLAND: 2040 case CHIP_HAINAN: 2041 adev->family = AMDGPU_FAMILY_SI; 2042 r = si_set_ip_blocks(adev); 2043 if (r) 2044 return r; 2045 break; 2046 #endif 2047 #ifdef CONFIG_DRM_AMDGPU_CIK 2048 case CHIP_BONAIRE: 2049 case CHIP_HAWAII: 2050 case CHIP_KAVERI: 2051 case CHIP_KABINI: 2052 case CHIP_MULLINS: 2053 if (adev->flags & AMD_IS_APU) 2054 adev->family = AMDGPU_FAMILY_KV; 2055 else 2056 adev->family = AMDGPU_FAMILY_CI; 2057 2058 r = cik_set_ip_blocks(adev); 2059 if (r) 2060 return r; 2061 break; 2062 #endif 2063 case CHIP_TOPAZ: 2064 case CHIP_TONGA: 2065 case CHIP_FIJI: 2066 case CHIP_POLARIS10: 2067 case CHIP_POLARIS11: 2068 case CHIP_POLARIS12: 2069 case CHIP_VEGAM: 2070 case CHIP_CARRIZO: 2071 case CHIP_STONEY: 2072 if (adev->flags & AMD_IS_APU) 2073 adev->family = AMDGPU_FAMILY_CZ; 2074 else 2075 adev->family = AMDGPU_FAMILY_VI; 2076 2077 r = vi_set_ip_blocks(adev); 2078 if (r) 2079 return r; 2080 break; 2081 default: 2082 r = amdgpu_discovery_set_ip_blocks(adev); 2083 if (r) 2084 return r; 2085 break; 2086 } 2087 2088 if (amdgpu_has_atpx() && 2089 (amdgpu_is_atpx_hybrid() || 2090 amdgpu_has_atpx_dgpu_power_cntl()) && 2091 ((adev->flags & AMD_IS_APU) == 0) && 2092 !dev_is_removable(&adev->pdev->dev)) 2093 adev->flags |= AMD_IS_PX; 2094 2095 if (!(adev->flags & AMD_IS_APU)) { 2096 parent = pcie_find_root_port(adev->pdev); 2097 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2098 } 2099 2100 2101 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2102 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2103 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2104 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2105 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2106 if (!amdgpu_device_pcie_dynamic_switching_supported()) 2107 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2108 2109 total = true; 2110 for (i = 0; i < adev->num_ip_blocks; i++) { 2111 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2112 DRM_WARN("disabled ip block: %d <%s>\n", 2113 i, adev->ip_blocks[i].version->funcs->name); 2114 adev->ip_blocks[i].status.valid = false; 2115 } else { 2116 if (adev->ip_blocks[i].version->funcs->early_init) { 2117 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2118 if (r == -ENOENT) { 2119 adev->ip_blocks[i].status.valid = false; 2120 } else if (r) { 2121 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2122 adev->ip_blocks[i].version->funcs->name, r); 2123 total = false; 2124 } else { 2125 adev->ip_blocks[i].status.valid = true; 2126 } 2127 } else { 2128 adev->ip_blocks[i].status.valid = true; 2129 } 2130 } 2131 /* get the vbios after the asic_funcs are set up */ 2132 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2133 r = amdgpu_device_parse_gpu_info_fw(adev); 2134 if (r) 2135 return r; 2136 2137 /* Read BIOS */ 2138 if (amdgpu_device_read_bios(adev)) { 2139 if (!amdgpu_get_bios(adev)) 2140 return -EINVAL; 2141 2142 r = amdgpu_atombios_init(adev); 2143 if (r) { 2144 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2145 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2146 return r; 2147 } 2148 } 2149 2150 /*get pf2vf msg info at it's earliest time*/ 2151 if (amdgpu_sriov_vf(adev)) 2152 amdgpu_virt_init_data_exchange(adev); 2153 2154 } 2155 } 2156 if (!total) 2157 return -ENODEV; 2158 2159 amdgpu_amdkfd_device_probe(adev); 2160 adev->cg_flags &= amdgpu_cg_mask; 2161 adev->pg_flags &= amdgpu_pg_mask; 2162 2163 return 0; 2164 } 2165 2166 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2167 { 2168 int i, r; 2169 2170 for (i = 0; i < adev->num_ip_blocks; i++) { 2171 if (!adev->ip_blocks[i].status.sw) 2172 continue; 2173 if (adev->ip_blocks[i].status.hw) 2174 continue; 2175 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2176 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2177 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2178 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2179 if (r) { 2180 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2181 adev->ip_blocks[i].version->funcs->name, r); 2182 return r; 2183 } 2184 adev->ip_blocks[i].status.hw = true; 2185 } 2186 } 2187 2188 return 0; 2189 } 2190 2191 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2192 { 2193 int i, r; 2194 2195 for (i = 0; i < adev->num_ip_blocks; i++) { 2196 if (!adev->ip_blocks[i].status.sw) 2197 continue; 2198 if (adev->ip_blocks[i].status.hw) 2199 continue; 2200 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2201 if (r) { 2202 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2203 adev->ip_blocks[i].version->funcs->name, r); 2204 return r; 2205 } 2206 adev->ip_blocks[i].status.hw = true; 2207 } 2208 2209 return 0; 2210 } 2211 2212 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2213 { 2214 int r = 0; 2215 int i; 2216 uint32_t 
smu_version; 2217 2218 if (adev->asic_type >= CHIP_VEGA10) { 2219 for (i = 0; i < adev->num_ip_blocks; i++) { 2220 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2221 continue; 2222 2223 if (!adev->ip_blocks[i].status.sw) 2224 continue; 2225 2226 /* no need to do the fw loading again if already done*/ 2227 if (adev->ip_blocks[i].status.hw == true) 2228 break; 2229 2230 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2231 r = adev->ip_blocks[i].version->funcs->resume(adev); 2232 if (r) { 2233 DRM_ERROR("resume of IP block <%s> failed %d\n", 2234 adev->ip_blocks[i].version->funcs->name, r); 2235 return r; 2236 } 2237 } else { 2238 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2239 if (r) { 2240 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2241 adev->ip_blocks[i].version->funcs->name, r); 2242 return r; 2243 } 2244 } 2245 2246 adev->ip_blocks[i].status.hw = true; 2247 break; 2248 } 2249 } 2250 2251 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2252 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2253 2254 return r; 2255 } 2256 2257 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2258 { 2259 long timeout; 2260 int r, i; 2261 2262 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2263 struct amdgpu_ring *ring = adev->rings[i]; 2264 2265 /* No need to setup the GPU scheduler for rings that don't need it */ 2266 if (!ring || ring->no_scheduler) 2267 continue; 2268 2269 switch (ring->funcs->type) { 2270 case AMDGPU_RING_TYPE_GFX: 2271 timeout = adev->gfx_timeout; 2272 break; 2273 case AMDGPU_RING_TYPE_COMPUTE: 2274 timeout = adev->compute_timeout; 2275 break; 2276 case AMDGPU_RING_TYPE_SDMA: 2277 timeout = adev->sdma_timeout; 2278 break; 2279 default: 2280 timeout = adev->video_timeout; 2281 break; 2282 } 2283 2284 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2285 ring->num_hw_submission, 0, 2286 timeout, adev->reset_domain->wq, 2287 ring->sched_score, ring->name, 2288 adev->dev); 2289 if (r) { 2290 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2291 ring->name); 2292 return r; 2293 } 2294 } 2295 2296 amdgpu_xcp_update_partition_sched_list(adev); 2297 2298 return 0; 2299 } 2300 2301 2302 /** 2303 * amdgpu_device_ip_init - run init for hardware IPs 2304 * 2305 * @adev: amdgpu_device pointer 2306 * 2307 * Main initialization pass for hardware IPs. The list of all the hardware 2308 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2309 * are run. sw_init initializes the software state associated with each IP 2310 * and hw_init initializes the hardware associated with each IP. 2311 * Returns 0 on success, negative error code on failure. 
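 *
 * A minimal sketch of the callbacks an IP block plugs in here (the names
 * below are illustrative only, not taken from a real block):
 *
 *   static int foo_sw_init(void *handle)
 *   {
 *           struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 *
 *           // allocate rings, irq sources and other software state for adev
 *           return 0;
 *   }
 *
 *   static int foo_hw_init(void *handle)
 *   {
 *           struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 *
 *           // program registers and bring the block's hardware up on adev
 *           return 0;
 *   }
 *
 *   static const struct amd_ip_funcs foo_ip_funcs = {
 *           .name = "foo",
 *           .sw_init = foo_sw_init,
 *           .hw_init = foo_hw_init,
 *   };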
2312 */ 2313 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2314 { 2315 int i, r; 2316 2317 r = amdgpu_ras_init(adev); 2318 if (r) 2319 return r; 2320 2321 for (i = 0; i < adev->num_ip_blocks; i++) { 2322 if (!adev->ip_blocks[i].status.valid) 2323 continue; 2324 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2325 if (r) { 2326 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2327 adev->ip_blocks[i].version->funcs->name, r); 2328 goto init_failed; 2329 } 2330 adev->ip_blocks[i].status.sw = true; 2331 2332 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2333 /* need to do common hw init early so everything is set up for gmc */ 2334 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2335 if (r) { 2336 DRM_ERROR("hw_init %d failed %d\n", i, r); 2337 goto init_failed; 2338 } 2339 adev->ip_blocks[i].status.hw = true; 2340 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2341 /* need to do gmc hw init early so we can allocate gpu mem */ 2342 /* Try to reserve bad pages early */ 2343 if (amdgpu_sriov_vf(adev)) 2344 amdgpu_virt_exchange_data(adev); 2345 2346 r = amdgpu_device_mem_scratch_init(adev); 2347 if (r) { 2348 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2349 goto init_failed; 2350 } 2351 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2352 if (r) { 2353 DRM_ERROR("hw_init %d failed %d\n", i, r); 2354 goto init_failed; 2355 } 2356 r = amdgpu_device_wb_init(adev); 2357 if (r) { 2358 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2359 goto init_failed; 2360 } 2361 adev->ip_blocks[i].status.hw = true; 2362 2363 /* right after GMC hw init, we create CSA */ 2364 if (adev->gfx.mcbp) { 2365 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2366 AMDGPU_GEM_DOMAIN_VRAM | 2367 AMDGPU_GEM_DOMAIN_GTT, 2368 AMDGPU_CSA_SIZE); 2369 if (r) { 2370 DRM_ERROR("allocate CSA failed %d\n", r); 2371 goto init_failed; 2372 } 2373 } 2374 } 2375 } 2376 2377 if (amdgpu_sriov_vf(adev)) 2378 amdgpu_virt_init_data_exchange(adev); 2379 2380 r = amdgpu_ib_pool_init(adev); 2381 if (r) { 2382 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2383 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2384 goto init_failed; 2385 } 2386 2387 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2388 if (r) 2389 goto init_failed; 2390 2391 r = amdgpu_device_ip_hw_init_phase1(adev); 2392 if (r) 2393 goto init_failed; 2394 2395 r = amdgpu_device_fw_loading(adev); 2396 if (r) 2397 goto init_failed; 2398 2399 r = amdgpu_device_ip_hw_init_phase2(adev); 2400 if (r) 2401 goto init_failed; 2402 2403 /* 2404 * retired pages will be loaded from eeprom and reserved here, 2405 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2406 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2407 * for I2C communication which only true at this point. 2408 * 2409 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2410 * failure from bad gpu situation and stop amdgpu init process 2411 * accordingly. For other failed cases, it will still release all 2412 * the resource and print error message, rather than returning one 2413 * negative value to upper level. 
2414 * 2415 * Note: theoretically, this should be called before all vram allocations 2416 * to protect retired pages from being abused 2417 */ 2418 r = amdgpu_ras_recovery_init(adev); 2419 if (r) 2420 goto init_failed; 2421 2422 /* 2423 * In case of XGMI, grab an extra reference to the reset domain for this device 2424 */ 2425 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2426 if (amdgpu_xgmi_add_device(adev) == 0) { 2427 if (!amdgpu_sriov_vf(adev)) { 2428 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2429 2430 if (WARN_ON(!hive)) { 2431 r = -ENOENT; 2432 goto init_failed; 2433 } 2434 2435 if (!hive->reset_domain || 2436 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2437 r = -ENOENT; 2438 amdgpu_put_xgmi_hive(hive); 2439 goto init_failed; 2440 } 2441 2442 /* Drop the early temporary reset domain we created for device */ 2443 amdgpu_reset_put_reset_domain(adev->reset_domain); 2444 adev->reset_domain = hive->reset_domain; 2445 amdgpu_put_xgmi_hive(hive); 2446 } 2447 } 2448 } 2449 2450 r = amdgpu_device_init_schedulers(adev); 2451 if (r) 2452 goto init_failed; 2453 2454 /* Don't init kfd if the whole hive needs to be reset during init */ 2455 if (!adev->gmc.xgmi.pending_reset) { 2456 kgd2kfd_init_zone_device(adev); 2457 amdgpu_amdkfd_device_init(adev); 2458 } 2459 2460 amdgpu_fru_get_product_info(adev); 2461 2462 init_failed: 2463 2464 return r; 2465 } 2466 2467 /** 2468 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2469 * 2470 * @adev: amdgpu_device pointer 2471 * 2472 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2473 * this function before a GPU reset. If the value is retained after a 2474 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 2475 */ 2476 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2477 { 2478 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2479 } 2480 2481 /** 2482 * amdgpu_device_check_vram_lost - check if vram is valid 2483 * 2484 * @adev: amdgpu_device pointer 2485 * 2486 * Checks the reset magic value written to the gart pointer in VRAM. 2487 * The driver calls this after a GPU reset to see if the contents of 2488 * VRAM is lost or not. 2489 * Returns true if vram is lost, false if not. 2490 */ 2491 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2492 { 2493 if (memcmp(adev->gart.ptr, adev->reset_magic, 2494 AMDGPU_RESET_MAGIC_NUM)) 2495 return true; 2496 2497 if (!amdgpu_in_reset(adev)) 2498 return false; 2499 2500 /* 2501 * For all ASICs with baco/mode1 reset, the VRAM is 2502 * always assumed to be lost. 2503 */ 2504 switch (amdgpu_asic_reset_method(adev)) { 2505 case AMD_RESET_METHOD_BACO: 2506 case AMD_RESET_METHOD_MODE1: 2507 return true; 2508 default: 2509 return false; 2510 } 2511 } 2512 2513 /** 2514 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2515 * 2516 * @adev: amdgpu_device pointer 2517 * @state: clockgating state (gate or ungate) 2518 * 2519 * The list of all the hardware IPs that make up the asic is walked and the 2520 * set_clockgating_state callbacks are run. 2521 * The late initialization pass enables clockgating for hardware IPs; 2522 * the fini or suspend pass disables it again. 2523 * Returns 0 on success, negative error code on failure.
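 *
 * Typical usage in this file: the late init path gates clocks with
 * amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE), while the early
 * fini and suspend paths call it again with AMD_CG_STATE_UNGATE before
 * touching the hardware.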
2524 */ 2525 2526 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2527 enum amd_clockgating_state state) 2528 { 2529 int i, j, r; 2530 2531 if (amdgpu_emu_mode == 1) 2532 return 0; 2533 2534 for (j = 0; j < adev->num_ip_blocks; j++) { 2535 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2536 if (!adev->ip_blocks[i].status.late_initialized) 2537 continue; 2538 /* skip CG for GFX, SDMA on S0ix */ 2539 if (adev->in_s0ix && 2540 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2541 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2542 continue; 2543 /* skip CG for VCE/UVD, it's handled specially */ 2544 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2545 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2546 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2547 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2548 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2549 /* enable clockgating to save power */ 2550 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2551 state); 2552 if (r) { 2553 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2554 adev->ip_blocks[i].version->funcs->name, r); 2555 return r; 2556 } 2557 } 2558 } 2559 2560 return 0; 2561 } 2562 2563 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2564 enum amd_powergating_state state) 2565 { 2566 int i, j, r; 2567 2568 if (amdgpu_emu_mode == 1) 2569 return 0; 2570 2571 for (j = 0; j < adev->num_ip_blocks; j++) { 2572 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2573 if (!adev->ip_blocks[i].status.late_initialized) 2574 continue; 2575 /* skip PG for GFX, SDMA on S0ix */ 2576 if (adev->in_s0ix && 2577 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2578 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2579 continue; 2580 /* skip CG for VCE/UVD, it's handled specially */ 2581 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2582 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2583 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2584 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2585 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2586 /* enable powergating to save power */ 2587 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2588 state); 2589 if (r) { 2590 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2591 adev->ip_blocks[i].version->funcs->name, r); 2592 return r; 2593 } 2594 } 2595 } 2596 return 0; 2597 } 2598 2599 static int amdgpu_device_enable_mgpu_fan_boost(void) 2600 { 2601 struct amdgpu_gpu_instance *gpu_ins; 2602 struct amdgpu_device *adev; 2603 int i, ret = 0; 2604 2605 mutex_lock(&mgpu_info.mutex); 2606 2607 /* 2608 * MGPU fan boost feature should be enabled 2609 * only when there are two or more dGPUs in 2610 * the system 2611 */ 2612 if (mgpu_info.num_dgpu < 2) 2613 goto out; 2614 2615 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2616 gpu_ins = &(mgpu_info.gpu_ins[i]); 2617 adev = gpu_ins->adev; 2618 if (!(adev->flags & AMD_IS_APU) && 2619 !gpu_ins->mgpu_fan_enabled) { 2620 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2621 if (ret) 2622 break; 2623 2624 gpu_ins->mgpu_fan_enabled = 1; 2625 } 2626 } 2627 2628 out: 2629 mutex_unlock(&mgpu_info.mutex); 2630 2631 return ret; 2632 } 2633 2634 /** 2635 * amdgpu_device_ip_late_init - run late init for hardware IPs 2636 * 2637 * @adev: 
amdgpu_device pointer 2638 * 2639 * Late initialization pass for hardware IPs. The list of all the hardware 2640 * IPs that make up the asic is walked and the late_init callbacks are run. 2641 * late_init covers any special initialization that an IP requires 2642 * after all of them have been initialized or something that needs to happen 2643 * late in the init process. 2644 * Returns 0 on success, negative error code on failure. 2645 */ 2646 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2647 { 2648 struct amdgpu_gpu_instance *gpu_instance; 2649 int i = 0, r; 2650 2651 for (i = 0; i < adev->num_ip_blocks; i++) { 2652 if (!adev->ip_blocks[i].status.hw) 2653 continue; 2654 if (adev->ip_blocks[i].version->funcs->late_init) { 2655 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2656 if (r) { 2657 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2658 adev->ip_blocks[i].version->funcs->name, r); 2659 return r; 2660 } 2661 } 2662 adev->ip_blocks[i].status.late_initialized = true; 2663 } 2664 2665 r = amdgpu_ras_late_init(adev); 2666 if (r) { 2667 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 2668 return r; 2669 } 2670 2671 amdgpu_ras_set_error_query_ready(adev, true); 2672 2673 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2674 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2675 2676 amdgpu_device_fill_reset_magic(adev); 2677 2678 r = amdgpu_device_enable_mgpu_fan_boost(); 2679 if (r) 2680 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2681 2682 /* For passthrough configuration on arcturus and aldebaran, enable special handling for SBR */ 2683 if (amdgpu_passthrough(adev) && 2684 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 2685 adev->asic_type == CHIP_ALDEBARAN)) 2686 amdgpu_dpm_handle_passthrough_sbr(adev, true); 2687 2688 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2689 mutex_lock(&mgpu_info.mutex); 2690 2691 /* 2692 * Reset device p-state to low as this was booted with high. 2693 * 2694 * This should be performed only after all devices from the same 2695 * hive get initialized. 2696 * 2697 * However, it's not known in advance how many devices are in the hive, 2698 * as they are counted one by one during device initialization. 2699 * 2700 * So, we wait for all XGMI interlinked devices to be initialized. 2701 * This may bring some delays as those devices may come from 2702 * different hives. But that should be OK.
2703 */ 2704 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2705 for (i = 0; i < mgpu_info.num_gpu; i++) { 2706 gpu_instance = &(mgpu_info.gpu_ins[i]); 2707 if (gpu_instance->adev->flags & AMD_IS_APU) 2708 continue; 2709 2710 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2711 AMDGPU_XGMI_PSTATE_MIN); 2712 if (r) { 2713 DRM_ERROR("pstate setting failed (%d).\n", r); 2714 break; 2715 } 2716 } 2717 } 2718 2719 mutex_unlock(&mgpu_info.mutex); 2720 } 2721 2722 return 0; 2723 } 2724 2725 /** 2726 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2727 * 2728 * @adev: amdgpu_device pointer 2729 * 2730 * For ASICs that need to disable the SMC first 2731 */ 2732 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2733 { 2734 int i, r; 2735 2736 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) 2737 return; 2738 2739 for (i = 0; i < adev->num_ip_blocks; i++) { 2740 if (!adev->ip_blocks[i].status.hw) 2741 continue; 2742 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2743 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2744 /* XXX handle errors */ 2745 if (r) { 2746 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2747 adev->ip_blocks[i].version->funcs->name, r); 2748 } 2749 adev->ip_blocks[i].status.hw = false; 2750 break; 2751 } 2752 } 2753 } 2754 2755 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2756 { 2757 int i, r; 2758 2759 for (i = 0; i < adev->num_ip_blocks; i++) { 2760 if (!adev->ip_blocks[i].version->funcs->early_fini) 2761 continue; 2762 2763 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2764 if (r) { 2765 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2766 adev->ip_blocks[i].version->funcs->name, r); 2767 } 2768 } 2769 2770 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2771 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2772 2773 amdgpu_amdkfd_suspend(adev, false); 2774 2775 /* Workaround for ASICs that need to disable the SMC first */ 2776 amdgpu_device_smu_fini_early(adev); 2777 2778 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2779 if (!adev->ip_blocks[i].status.hw) 2780 continue; 2781 2782 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2783 /* XXX handle errors */ 2784 if (r) { 2785 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2786 adev->ip_blocks[i].version->funcs->name, r); 2787 } 2788 2789 adev->ip_blocks[i].status.hw = false; 2790 } 2791 2792 if (amdgpu_sriov_vf(adev)) { 2793 if (amdgpu_virt_release_full_gpu(adev, false)) 2794 DRM_ERROR("failed to release exclusive mode on fini\n"); 2795 } 2796 2797 return 0; 2798 } 2799 2800 /** 2801 * amdgpu_device_ip_fini - run fini for hardware IPs 2802 * 2803 * @adev: amdgpu_device pointer 2804 * 2805 * Main teardown pass for hardware IPs. The list of all the hardware 2806 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2807 * are run. hw_fini tears down the hardware associated with each IP 2808 * and sw_fini tears down any software state associated with each IP. 2809 * Returns 0 on success, negative error code on failure.
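 *
 * Note that the sw_fini and late_fini walks below iterate from the last
 * IP block to the first, so blocks are torn down in the reverse of their
 * initialization order.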
2810 */ 2811 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2812 { 2813 int i, r; 2814 2815 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2816 amdgpu_virt_release_ras_err_handler_data(adev); 2817 2818 if (adev->gmc.xgmi.num_physical_nodes > 1) 2819 amdgpu_xgmi_remove_device(adev); 2820 2821 amdgpu_amdkfd_device_fini_sw(adev); 2822 2823 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2824 if (!adev->ip_blocks[i].status.sw) 2825 continue; 2826 2827 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2828 amdgpu_ucode_free_bo(adev); 2829 amdgpu_free_static_csa(&adev->virt.csa_obj); 2830 amdgpu_device_wb_fini(adev); 2831 amdgpu_device_mem_scratch_fini(adev); 2832 amdgpu_ib_pool_fini(adev); 2833 } 2834 2835 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2836 /* XXX handle errors */ 2837 if (r) { 2838 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2839 adev->ip_blocks[i].version->funcs->name, r); 2840 } 2841 adev->ip_blocks[i].status.sw = false; 2842 adev->ip_blocks[i].status.valid = false; 2843 } 2844 2845 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2846 if (!adev->ip_blocks[i].status.late_initialized) 2847 continue; 2848 if (adev->ip_blocks[i].version->funcs->late_fini) 2849 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2850 adev->ip_blocks[i].status.late_initialized = false; 2851 } 2852 2853 amdgpu_ras_fini(adev); 2854 2855 return 0; 2856 } 2857 2858 /** 2859 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2860 * 2861 * @work: work_struct. 2862 */ 2863 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2864 { 2865 struct amdgpu_device *adev = 2866 container_of(work, struct amdgpu_device, delayed_init_work.work); 2867 int r; 2868 2869 r = amdgpu_ib_ring_tests(adev); 2870 if (r) 2871 DRM_ERROR("ib ring test failed (%d).\n", r); 2872 } 2873 2874 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2875 { 2876 struct amdgpu_device *adev = 2877 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2878 2879 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2880 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2881 2882 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2883 adev->gfx.gfx_off_state = true; 2884 } 2885 2886 /** 2887 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2888 * 2889 * @adev: amdgpu_device pointer 2890 * 2891 * Main suspend function for hardware IPs. The list of all the hardware 2892 * IPs that make up the asic is walked, clockgating is disabled and the 2893 * suspend callbacks are run. suspend puts the hardware and software state 2894 * in each IP into a state suitable for suspend. 2895 * Returns 0 on success, negative error code on failure. 2896 */ 2897 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2898 { 2899 int i, r; 2900 2901 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2902 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2903 2904 /* 2905 * Per PMFW team's suggestion, driver needs to handle gfxoff 2906 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 2907 * scenario. Add the missing df cstate disablement here. 
2908 */ 2909 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 2910 dev_warn(adev->dev, "Failed to disallow df cstate"); 2911 2912 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2913 if (!adev->ip_blocks[i].status.valid) 2914 continue; 2915 2916 /* displays are handled separately */ 2917 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2918 continue; 2919 2920 /* XXX handle errors */ 2921 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2922 /* XXX handle errors */ 2923 if (r) { 2924 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2925 adev->ip_blocks[i].version->funcs->name, r); 2926 return r; 2927 } 2928 2929 adev->ip_blocks[i].status.hw = false; 2930 } 2931 2932 return 0; 2933 } 2934 2935 /** 2936 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2937 * 2938 * @adev: amdgpu_device pointer 2939 * 2940 * Main suspend function for hardware IPs. The list of all the hardware 2941 * IPs that make up the asic is walked, clockgating is disabled and the 2942 * suspend callbacks are run. suspend puts the hardware and software state 2943 * in each IP into a state suitable for suspend. 2944 * Returns 0 on success, negative error code on failure. 2945 */ 2946 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2947 { 2948 int i, r; 2949 2950 if (adev->in_s0ix) 2951 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 2952 2953 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2954 if (!adev->ip_blocks[i].status.valid) 2955 continue; 2956 /* displays are handled in phase1 */ 2957 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2958 continue; 2959 /* PSP lost connection when err_event_athub occurs */ 2960 if (amdgpu_ras_intr_triggered() && 2961 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2962 adev->ip_blocks[i].status.hw = false; 2963 continue; 2964 } 2965 2966 /* skip unnecessary suspend if we do not initialize them yet */ 2967 if (adev->gmc.xgmi.pending_reset && 2968 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2969 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 2970 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2971 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 2972 adev->ip_blocks[i].status.hw = false; 2973 continue; 2974 } 2975 2976 /* skip suspend of gfx/mes and psp for S0ix 2977 * gfx is in gfxoff state, so on resume it will exit gfxoff just 2978 * like at runtime. PSP is also part of the always on hardware 2979 * so no need to suspend it. 2980 */ 2981 if (adev->in_s0ix && 2982 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 2983 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2984 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 2985 continue; 2986 2987 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 2988 if (adev->in_s0ix && 2989 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) && 2990 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2991 continue; 2992 2993 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 2994 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 2995 * from this location and RLC Autoload automatically also gets loaded 2996 * from here based on PMFW -> PSP message during re-init sequence. 2997 * Therefore, the psp suspend & resume should be skipped to avoid destroy 2998 * the TMR and reload FWs again for IMU enabled APU ASICs. 
2999 */ 3000 if (amdgpu_in_reset(adev) && 3001 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3002 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3003 continue; 3004 3005 /* XXX handle errors */ 3006 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3007 /* XXX handle errors */ 3008 if (r) { 3009 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3010 adev->ip_blocks[i].version->funcs->name, r); 3011 } 3012 adev->ip_blocks[i].status.hw = false; 3013 /* handle putting the SMC in the appropriate state */ 3014 if (!amdgpu_sriov_vf(adev)) { 3015 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3016 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3017 if (r) { 3018 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3019 adev->mp1_state, r); 3020 return r; 3021 } 3022 } 3023 } 3024 } 3025 3026 return 0; 3027 } 3028 3029 /** 3030 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3031 * 3032 * @adev: amdgpu_device pointer 3033 * 3034 * Main suspend function for hardware IPs. The list of all the hardware 3035 * IPs that make up the asic is walked, clockgating is disabled and the 3036 * suspend callbacks are run. suspend puts the hardware and software state 3037 * in each IP into a state suitable for suspend. 3038 * Returns 0 on success, negative error code on failure. 3039 */ 3040 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3041 { 3042 int r; 3043 3044 if (amdgpu_sriov_vf(adev)) { 3045 amdgpu_virt_fini_data_exchange(adev); 3046 amdgpu_virt_request_full_gpu(adev, false); 3047 } 3048 3049 r = amdgpu_device_ip_suspend_phase1(adev); 3050 if (r) 3051 return r; 3052 r = amdgpu_device_ip_suspend_phase2(adev); 3053 3054 if (amdgpu_sriov_vf(adev)) 3055 amdgpu_virt_release_full_gpu(adev, false); 3056 3057 return r; 3058 } 3059 3060 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3061 { 3062 int i, r; 3063 3064 static enum amd_ip_block_type ip_order[] = { 3065 AMD_IP_BLOCK_TYPE_COMMON, 3066 AMD_IP_BLOCK_TYPE_GMC, 3067 AMD_IP_BLOCK_TYPE_PSP, 3068 AMD_IP_BLOCK_TYPE_IH, 3069 }; 3070 3071 for (i = 0; i < adev->num_ip_blocks; i++) { 3072 int j; 3073 struct amdgpu_ip_block *block; 3074 3075 block = &adev->ip_blocks[i]; 3076 block->status.hw = false; 3077 3078 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3079 3080 if (block->version->type != ip_order[j] || 3081 !block->status.valid) 3082 continue; 3083 3084 r = block->version->funcs->hw_init(adev); 3085 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3086 if (r) 3087 return r; 3088 block->status.hw = true; 3089 } 3090 } 3091 3092 return 0; 3093 } 3094 3095 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3096 { 3097 int i, r; 3098 3099 static enum amd_ip_block_type ip_order[] = { 3100 AMD_IP_BLOCK_TYPE_SMC, 3101 AMD_IP_BLOCK_TYPE_DCE, 3102 AMD_IP_BLOCK_TYPE_GFX, 3103 AMD_IP_BLOCK_TYPE_SDMA, 3104 AMD_IP_BLOCK_TYPE_MES, 3105 AMD_IP_BLOCK_TYPE_UVD, 3106 AMD_IP_BLOCK_TYPE_VCE, 3107 AMD_IP_BLOCK_TYPE_VCN, 3108 AMD_IP_BLOCK_TYPE_JPEG 3109 }; 3110 3111 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3112 int j; 3113 struct amdgpu_ip_block *block; 3114 3115 for (j = 0; j < adev->num_ip_blocks; j++) { 3116 block = &adev->ip_blocks[j]; 3117 3118 if (block->version->type != ip_order[i] || 3119 !block->status.valid || 3120 block->status.hw) 3121 continue; 3122 3123 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3124 r = block->version->funcs->resume(adev); 3125 else 3126 r = block->version->funcs->hw_init(adev); 3127 3128 
DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3129 if (r) 3130 return r; 3131 block->status.hw = true; 3132 } 3133 } 3134 3135 return 0; 3136 } 3137 3138 /** 3139 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3140 * 3141 * @adev: amdgpu_device pointer 3142 * 3143 * First resume function for hardware IPs. The list of all the hardware 3144 * IPs that make up the asic is walked and the resume callbacks are run for 3145 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3146 * after a suspend and updates the software state as necessary. This 3147 * function is also used for restoring the GPU after a GPU reset. 3148 * Returns 0 on success, negative error code on failure. 3149 */ 3150 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3151 { 3152 int i, r; 3153 3154 for (i = 0; i < adev->num_ip_blocks; i++) { 3155 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3156 continue; 3157 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3158 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3159 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3160 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3161 3162 r = adev->ip_blocks[i].version->funcs->resume(adev); 3163 if (r) { 3164 DRM_ERROR("resume of IP block <%s> failed %d\n", 3165 adev->ip_blocks[i].version->funcs->name, r); 3166 return r; 3167 } 3168 adev->ip_blocks[i].status.hw = true; 3169 } 3170 } 3171 3172 return 0; 3173 } 3174 3175 /** 3176 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3177 * 3178 * @adev: amdgpu_device pointer 3179 * 3180 * First resume function for hardware IPs. The list of all the hardware 3181 * IPs that make up the asic is walked and the resume callbacks are run for 3182 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3183 * functional state after a suspend and updates the software state as 3184 * necessary. This function is also used for restoring the GPU after a GPU 3185 * reset. 3186 * Returns 0 on success, negative error code on failure. 3187 */ 3188 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3189 { 3190 int i, r; 3191 3192 for (i = 0; i < adev->num_ip_blocks; i++) { 3193 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3194 continue; 3195 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3196 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3197 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3198 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3199 continue; 3200 r = adev->ip_blocks[i].version->funcs->resume(adev); 3201 if (r) { 3202 DRM_ERROR("resume of IP block <%s> failed %d\n", 3203 adev->ip_blocks[i].version->funcs->name, r); 3204 return r; 3205 } 3206 adev->ip_blocks[i].status.hw = true; 3207 } 3208 3209 return 0; 3210 } 3211 3212 /** 3213 * amdgpu_device_ip_resume - run resume for hardware IPs 3214 * 3215 * @adev: amdgpu_device pointer 3216 * 3217 * Main resume function for hardware IPs. The hardware IPs 3218 * are split into two resume functions because they are 3219 * also used in recovering from a GPU reset and some additional 3220 * steps need to be take between them. In this case (S3/S4) they are 3221 * run sequentially. 3222 * Returns 0 on success, negative error code on failure. 
3223 */ 3224 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3225 { 3226 int r; 3227 3228 r = amdgpu_device_ip_resume_phase1(adev); 3229 if (r) 3230 return r; 3231 3232 r = amdgpu_device_fw_loading(adev); 3233 if (r) 3234 return r; 3235 3236 r = amdgpu_device_ip_resume_phase2(adev); 3237 3238 return r; 3239 } 3240 3241 /** 3242 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3243 * 3244 * @adev: amdgpu_device pointer 3245 * 3246 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3247 */ 3248 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3249 { 3250 if (amdgpu_sriov_vf(adev)) { 3251 if (adev->is_atom_fw) { 3252 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3253 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3254 } else { 3255 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3256 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3257 } 3258 3259 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3260 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3261 } 3262 } 3263 3264 /** 3265 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3266 * 3267 * @asic_type: AMD asic type 3268 * 3269 * Check if there is DC (new modesetting infrastructre) support for an asic. 3270 * returns true if DC has support, false if not. 3271 */ 3272 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3273 { 3274 switch (asic_type) { 3275 #ifdef CONFIG_DRM_AMDGPU_SI 3276 case CHIP_HAINAN: 3277 #endif 3278 case CHIP_TOPAZ: 3279 /* chips with no display hardware */ 3280 return false; 3281 #if defined(CONFIG_DRM_AMD_DC) 3282 case CHIP_TAHITI: 3283 case CHIP_PITCAIRN: 3284 case CHIP_VERDE: 3285 case CHIP_OLAND: 3286 /* 3287 * We have systems in the wild with these ASICs that require 3288 * LVDS and VGA support which is not supported with DC. 3289 * 3290 * Fallback to the non-DC driver here by default so as not to 3291 * cause regressions. 3292 */ 3293 #if defined(CONFIG_DRM_AMD_DC_SI) 3294 return amdgpu_dc > 0; 3295 #else 3296 return false; 3297 #endif 3298 case CHIP_BONAIRE: 3299 case CHIP_KAVERI: 3300 case CHIP_KABINI: 3301 case CHIP_MULLINS: 3302 /* 3303 * We have systems in the wild with these ASICs that require 3304 * VGA support which is not supported with DC. 3305 * 3306 * Fallback to the non-DC driver here by default so as not to 3307 * cause regressions. 
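 *
 * In practice these ASICs therefore only use DC when the user opts in
 * explicitly (for example by booting with amdgpu.dc=1); with the default
 * (auto) module parameter they stay on the legacy display path.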
3308 */ 3309 return amdgpu_dc > 0; 3310 default: 3311 return amdgpu_dc != 0; 3312 #else 3313 default: 3314 if (amdgpu_dc > 0) 3315 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3316 return false; 3317 #endif 3318 } 3319 } 3320 3321 /** 3322 * amdgpu_device_has_dc_support - check if dc is supported 3323 * 3324 * @adev: amdgpu_device pointer 3325 * 3326 * Returns true for supported, false for not supported 3327 */ 3328 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3329 { 3330 if (adev->enable_virtual_display || 3331 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3332 return false; 3333 3334 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3335 } 3336 3337 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3338 { 3339 struct amdgpu_device *adev = 3340 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3341 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3342 3343 /* It's a bug to not have a hive within this function */ 3344 if (WARN_ON(!hive)) 3345 return; 3346 3347 /* 3348 * Use task barrier to synchronize all xgmi reset works across the 3349 * hive. task_barrier_enter and task_barrier_exit will block 3350 * until all the threads running the xgmi reset works reach 3351 * those points. task_barrier_full will do both blocks. 3352 */ 3353 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3354 3355 task_barrier_enter(&hive->tb); 3356 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3357 3358 if (adev->asic_reset_res) 3359 goto fail; 3360 3361 task_barrier_exit(&hive->tb); 3362 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3363 3364 if (adev->asic_reset_res) 3365 goto fail; 3366 3367 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3368 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3369 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3370 } else { 3371 3372 task_barrier_full(&hive->tb); 3373 adev->asic_reset_res = amdgpu_asic_reset(adev); 3374 } 3375 3376 fail: 3377 if (adev->asic_reset_res) 3378 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3379 adev->asic_reset_res, adev_to_drm(adev)->unique); 3380 amdgpu_put_xgmi_hive(hive); 3381 } 3382 3383 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3384 { 3385 char *input = amdgpu_lockup_timeout; 3386 char *timeout_setting = NULL; 3387 int index = 0; 3388 long timeout; 3389 int ret = 0; 3390 3391 /* 3392 * By default timeout for non compute jobs is 10000 3393 * and 60000 for compute jobs. 3394 * In SR-IOV or passthrough mode, timeout for compute 3395 * jobs are 60000 by default. 3396 */ 3397 adev->gfx_timeout = msecs_to_jiffies(10000); 3398 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3399 if (amdgpu_sriov_vf(adev)) 3400 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3401 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3402 else 3403 adev->compute_timeout = msecs_to_jiffies(60000); 3404 3405 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3406 while ((timeout_setting = strsep(&input, ",")) && 3407 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3408 ret = kstrtol(timeout_setting, 0, &timeout); 3409 if (ret) 3410 return ret; 3411 3412 if (timeout == 0) { 3413 index++; 3414 continue; 3415 } else if (timeout < 0) { 3416 timeout = MAX_SCHEDULE_TIMEOUT; 3417 dev_warn(adev->dev, "lockup timeout disabled"); 3418 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3419 } else { 3420 timeout = msecs_to_jiffies(timeout); 3421 } 3422 3423 switch (index++) { 3424 case 0: 3425 adev->gfx_timeout = timeout; 3426 break; 3427 case 1: 3428 adev->compute_timeout = timeout; 3429 break; 3430 case 2: 3431 adev->sdma_timeout = timeout; 3432 break; 3433 case 3: 3434 adev->video_timeout = timeout; 3435 break; 3436 default: 3437 break; 3438 } 3439 } 3440 /* 3441 * There is only one value specified and 3442 * it should apply to all non-compute jobs. 3443 */ 3444 if (index == 1) { 3445 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3446 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3447 adev->compute_timeout = adev->gfx_timeout; 3448 } 3449 } 3450 3451 return ret; 3452 } 3453 3454 /** 3455 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3456 * 3457 * @adev: amdgpu_device pointer 3458 * 3459 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3460 */ 3461 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3462 { 3463 struct iommu_domain *domain; 3464 3465 domain = iommu_get_domain_for_dev(adev->dev); 3466 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3467 adev->ram_is_direct_mapped = true; 3468 } 3469 3470 static const struct attribute *amdgpu_dev_attributes[] = { 3471 &dev_attr_pcie_replay_count.attr, 3472 NULL 3473 }; 3474 3475 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 3476 { 3477 if (amdgpu_mcbp == 1) 3478 adev->gfx.mcbp = true; 3479 else if (amdgpu_mcbp == 0) 3480 adev->gfx.mcbp = false; 3481 else if ((adev->ip_versions[GC_HWIP][0] >= IP_VERSION(9, 0, 0)) && 3482 (adev->ip_versions[GC_HWIP][0] < IP_VERSION(10, 0, 0)) && 3483 adev->gfx.num_gfx_rings) 3484 adev->gfx.mcbp = true; 3485 3486 if (amdgpu_sriov_vf(adev)) 3487 adev->gfx.mcbp = true; 3488 3489 if (adev->gfx.mcbp) 3490 DRM_INFO("MCBP is enabled\n"); 3491 } 3492 3493 /** 3494 * amdgpu_device_init - initialize the driver 3495 * 3496 * @adev: amdgpu_device pointer 3497 * @flags: driver flags 3498 * 3499 * Initializes the driver info and hw (all asics). 3500 * Returns 0 for success or an error on failure. 3501 * Called at driver startup. 
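 *
 * This is normally reached from the KMS load path during PCI probe (for
 * example via amdgpu_driver_load_kms(), which passes the device flags
 * straight through to this function).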
3502 */ 3503 int amdgpu_device_init(struct amdgpu_device *adev, 3504 uint32_t flags) 3505 { 3506 struct drm_device *ddev = adev_to_drm(adev); 3507 struct pci_dev *pdev = adev->pdev; 3508 int r, i; 3509 bool px = false; 3510 u32 max_MBps; 3511 int tmp; 3512 3513 adev->shutdown = false; 3514 adev->flags = flags; 3515 3516 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3517 adev->asic_type = amdgpu_force_asic_type; 3518 else 3519 adev->asic_type = flags & AMD_ASIC_MASK; 3520 3521 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3522 if (amdgpu_emu_mode == 1) 3523 adev->usec_timeout *= 10; 3524 adev->gmc.gart_size = 512 * 1024 * 1024; 3525 adev->accel_working = false; 3526 adev->num_rings = 0; 3527 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3528 adev->mman.buffer_funcs = NULL; 3529 adev->mman.buffer_funcs_ring = NULL; 3530 adev->vm_manager.vm_pte_funcs = NULL; 3531 adev->vm_manager.vm_pte_num_scheds = 0; 3532 adev->gmc.gmc_funcs = NULL; 3533 adev->harvest_ip_mask = 0x0; 3534 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3535 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3536 3537 adev->smc_rreg = &amdgpu_invalid_rreg; 3538 adev->smc_wreg = &amdgpu_invalid_wreg; 3539 adev->pcie_rreg = &amdgpu_invalid_rreg; 3540 adev->pcie_wreg = &amdgpu_invalid_wreg; 3541 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 3542 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 3543 adev->pciep_rreg = &amdgpu_invalid_rreg; 3544 adev->pciep_wreg = &amdgpu_invalid_wreg; 3545 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3546 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3547 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3548 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3549 adev->didt_rreg = &amdgpu_invalid_rreg; 3550 adev->didt_wreg = &amdgpu_invalid_wreg; 3551 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3552 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3553 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3554 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3555 3556 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3557 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3558 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3559 3560 /* mutex initialization are all done here so we 3561 * can recall function without having locking issues 3562 */ 3563 mutex_init(&adev->firmware.mutex); 3564 mutex_init(&adev->pm.mutex); 3565 mutex_init(&adev->gfx.gpu_clock_mutex); 3566 mutex_init(&adev->srbm_mutex); 3567 mutex_init(&adev->gfx.pipe_reserve_mutex); 3568 mutex_init(&adev->gfx.gfx_off_mutex); 3569 mutex_init(&adev->gfx.partition_mutex); 3570 mutex_init(&adev->grbm_idx_mutex); 3571 mutex_init(&adev->mn_lock); 3572 mutex_init(&adev->virt.vf_errors.lock); 3573 hash_init(adev->mn_hash); 3574 mutex_init(&adev->psp.mutex); 3575 mutex_init(&adev->notifier_lock); 3576 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3577 mutex_init(&adev->benchmark_mutex); 3578 3579 amdgpu_device_init_apu_flags(adev); 3580 3581 r = amdgpu_device_check_arguments(adev); 3582 if (r) 3583 return r; 3584 3585 spin_lock_init(&adev->mmio_idx_lock); 3586 spin_lock_init(&adev->smc_idx_lock); 3587 spin_lock_init(&adev->pcie_idx_lock); 3588 spin_lock_init(&adev->uvd_ctx_idx_lock); 3589 spin_lock_init(&adev->didt_idx_lock); 3590 spin_lock_init(&adev->gc_cac_idx_lock); 3591 spin_lock_init(&adev->se_cac_idx_lock); 3592 spin_lock_init(&adev->audio_endpt_idx_lock); 3593 spin_lock_init(&adev->mm_stats.lock); 3594 3595 
INIT_LIST_HEAD(&adev->shadow_list); 3596 mutex_init(&adev->shadow_list_lock); 3597 3598 INIT_LIST_HEAD(&adev->reset_list); 3599 3600 INIT_LIST_HEAD(&adev->ras_list); 3601 3602 INIT_DELAYED_WORK(&adev->delayed_init_work, 3603 amdgpu_device_delayed_init_work_handler); 3604 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3605 amdgpu_device_delay_enable_gfx_off); 3606 3607 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3608 3609 adev->gfx.gfx_off_req_count = 1; 3610 adev->gfx.gfx_off_residency = 0; 3611 adev->gfx.gfx_off_entrycount = 0; 3612 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3613 3614 atomic_set(&adev->throttling_logging_enabled, 1); 3615 /* 3616 * If throttling continues, logging will be performed every minute 3617 * to avoid log flooding. "-1" is subtracted since the thermal 3618 * throttling interrupt comes every second. Thus, the total logging 3619 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 3620 * for throttling interrupt) = 60 seconds. 3621 */ 3622 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3623 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3624 3625 /* Registers mapping */ 3626 /* TODO: block userspace mapping of io register */ 3627 if (adev->asic_type >= CHIP_BONAIRE) { 3628 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3629 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3630 } else { 3631 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3632 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3633 } 3634 3635 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 3636 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 3637 3638 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3639 if (!adev->rmmio) 3640 return -ENOMEM; 3641 3642 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3643 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); 3644 3645 /* 3646 * The reset domain needs to be present early, before the XGMI hive (if any) 3647 * is discovered and initialized, so the reset sem and in_gpu reset flag can 3648 * be used early on during init and before calling RREG32.
3649 */ 3650 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3651 if (!adev->reset_domain) 3652 return -ENOMEM; 3653 3654 /* detect hw virtualization here */ 3655 amdgpu_detect_virtualization(adev); 3656 3657 amdgpu_device_get_pcie_info(adev); 3658 3659 r = amdgpu_device_get_job_timeout_settings(adev); 3660 if (r) { 3661 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3662 return r; 3663 } 3664 3665 /* early init functions */ 3666 r = amdgpu_device_ip_early_init(adev); 3667 if (r) 3668 return r; 3669 3670 amdgpu_device_set_mcbp(adev); 3671 3672 /* Get rid of things like offb */ 3673 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 3674 if (r) 3675 return r; 3676 3677 /* Enable TMZ based on IP_VERSION */ 3678 amdgpu_gmc_tmz_set(adev); 3679 3680 amdgpu_gmc_noretry_set(adev); 3681 /* Need to get xgmi info early to decide the reset behavior*/ 3682 if (adev->gmc.xgmi.supported) { 3683 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3684 if (r) 3685 return r; 3686 } 3687 3688 /* enable PCIE atomic ops */ 3689 if (amdgpu_sriov_vf(adev)) { 3690 if (adev->virt.fw_reserve.p_pf2vf) 3691 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3692 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3693 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3694 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a 3695 * internal path natively support atomics, set have_atomics_support to true. 3696 */ 3697 } else if ((adev->flags & AMD_IS_APU) && 3698 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) { 3699 adev->have_atomics_support = true; 3700 } else { 3701 adev->have_atomics_support = 3702 !pci_enable_atomic_ops_to_root(adev->pdev, 3703 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3704 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3705 } 3706 3707 if (!adev->have_atomics_support) 3708 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3709 3710 /* doorbell bar mapping and doorbell index init*/ 3711 amdgpu_doorbell_init(adev); 3712 3713 if (amdgpu_emu_mode == 1) { 3714 /* post the asic on emulation mode */ 3715 emu_soc_asic_init(adev); 3716 goto fence_driver_init; 3717 } 3718 3719 amdgpu_reset_init(adev); 3720 3721 /* detect if we are with an SRIOV vbios */ 3722 if (adev->bios) 3723 amdgpu_device_detect_sriov_bios(adev); 3724 3725 /* check if we need to reset the asic 3726 * E.g., driver was not cleanly unloaded previously, etc. 3727 */ 3728 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3729 if (adev->gmc.xgmi.num_physical_nodes) { 3730 dev_info(adev->dev, "Pending hive reset.\n"); 3731 adev->gmc.xgmi.pending_reset = true; 3732 /* Only need to init necessary block for SMU to handle the reset */ 3733 for (i = 0; i < adev->num_ip_blocks; i++) { 3734 if (!adev->ip_blocks[i].status.valid) 3735 continue; 3736 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3737 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3738 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3739 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3740 DRM_DEBUG("IP %s disabled for hw_init.\n", 3741 adev->ip_blocks[i].version->funcs->name); 3742 adev->ip_blocks[i].status.hw = true; 3743 } 3744 } 3745 } else { 3746 tmp = amdgpu_reset_method; 3747 /* It should do a default reset when loading or reloading the driver, 3748 * regardless of the module parameter reset_method. 
3749 */ 3750 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 3751 r = amdgpu_asic_reset(adev); 3752 amdgpu_reset_method = tmp; 3753 if (r) { 3754 dev_err(adev->dev, "asic reset on init failed\n"); 3755 goto failed; 3756 } 3757 } 3758 } 3759 3760 /* Post card if necessary */ 3761 if (amdgpu_device_need_post(adev)) { 3762 if (!adev->bios) { 3763 dev_err(adev->dev, "no vBIOS found\n"); 3764 r = -EINVAL; 3765 goto failed; 3766 } 3767 DRM_INFO("GPU posting now...\n"); 3768 r = amdgpu_device_asic_init(adev); 3769 if (r) { 3770 dev_err(adev->dev, "gpu post error!\n"); 3771 goto failed; 3772 } 3773 } 3774 3775 if (adev->bios) { 3776 if (adev->is_atom_fw) { 3777 /* Initialize clocks */ 3778 r = amdgpu_atomfirmware_get_clock_info(adev); 3779 if (r) { 3780 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3781 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3782 goto failed; 3783 } 3784 } else { 3785 /* Initialize clocks */ 3786 r = amdgpu_atombios_get_clock_info(adev); 3787 if (r) { 3788 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3789 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3790 goto failed; 3791 } 3792 /* init i2c buses */ 3793 if (!amdgpu_device_has_dc_support(adev)) 3794 amdgpu_atombios_i2c_init(adev); 3795 } 3796 } 3797 3798 fence_driver_init: 3799 /* Fence driver */ 3800 r = amdgpu_fence_driver_sw_init(adev); 3801 if (r) { 3802 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3803 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3804 goto failed; 3805 } 3806 3807 /* init the mode config */ 3808 drm_mode_config_init(adev_to_drm(adev)); 3809 3810 r = amdgpu_device_ip_init(adev); 3811 if (r) { 3812 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3813 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3814 goto release_ras_con; 3815 } 3816 3817 amdgpu_fence_driver_hw_init(adev); 3818 3819 dev_info(adev->dev, 3820 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3821 adev->gfx.config.max_shader_engines, 3822 adev->gfx.config.max_sh_per_se, 3823 adev->gfx.config.max_cu_per_sh, 3824 adev->gfx.cu_info.number); 3825 3826 adev->accel_working = true; 3827 3828 amdgpu_vm_check_compute_bug(adev); 3829 3830 /* Initialize the buffer migration limit. */ 3831 if (amdgpu_moverate >= 0) 3832 max_MBps = amdgpu_moverate; 3833 else 3834 max_MBps = 8; /* Allow 8 MB/s. */ 3835 /* Get a log2 for easy divisions. */ 3836 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3837 3838 r = amdgpu_atombios_sysfs_init(adev); 3839 if (r) 3840 drm_err(&adev->ddev, 3841 "registering atombios sysfs failed (%d).\n", r); 3842 3843 r = amdgpu_pm_sysfs_init(adev); 3844 if (r) 3845 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 3846 3847 r = amdgpu_ucode_sysfs_init(adev); 3848 if (r) { 3849 adev->ucode_sysfs_en = false; 3850 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3851 } else 3852 adev->ucode_sysfs_en = true; 3853 3854 /* 3855 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3856 * Otherwise the mgpu fan boost feature will be skipped due to the 3857 * gpu instance is counted less. 3858 */ 3859 amdgpu_register_gpu_instance(adev); 3860 3861 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3862 * explicit gating rather than handling it automatically. 
3863 */ 3864 if (!adev->gmc.xgmi.pending_reset) { 3865 r = amdgpu_device_ip_late_init(adev); 3866 if (r) { 3867 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3868 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3869 goto release_ras_con; 3870 } 3871 /* must succeed. */ 3872 amdgpu_ras_resume(adev); 3873 queue_delayed_work(system_wq, &adev->delayed_init_work, 3874 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3875 } 3876 3877 if (amdgpu_sriov_vf(adev)) { 3878 amdgpu_virt_release_full_gpu(adev, true); 3879 flush_delayed_work(&adev->delayed_init_work); 3880 } 3881 3882 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3883 if (r) 3884 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3885 3886 amdgpu_fru_sysfs_init(adev); 3887 3888 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3889 r = amdgpu_pmu_init(adev); 3890 if (r) 3891 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3892 3893 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3894 if (amdgpu_device_cache_pci_state(adev->pdev)) 3895 pci_restore_state(pdev); 3896 3897 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3898 /* this will fail for cards that aren't VGA class devices, just 3899 * ignore it 3900 */ 3901 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3902 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 3903 3904 px = amdgpu_device_supports_px(ddev); 3905 3906 if (px || (!dev_is_removable(&adev->pdev->dev) && 3907 apple_gmux_detect(NULL, NULL))) 3908 vga_switcheroo_register_client(adev->pdev, 3909 &amdgpu_switcheroo_ops, px); 3910 3911 if (px) 3912 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3913 3914 if (adev->gmc.xgmi.pending_reset) 3915 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3916 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3917 3918 amdgpu_device_check_iommu_direct_map(adev); 3919 3920 return 0; 3921 3922 release_ras_con: 3923 if (amdgpu_sriov_vf(adev)) 3924 amdgpu_virt_release_full_gpu(adev, true); 3925 3926 /* failed in exclusive mode due to timeout */ 3927 if (amdgpu_sriov_vf(adev) && 3928 !amdgpu_sriov_runtime(adev) && 3929 amdgpu_virt_mmio_blocked(adev) && 3930 !amdgpu_virt_wait_reset(adev)) { 3931 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3932 /* Don't send request since VF is inactive. */ 3933 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3934 adev->virt.ops = NULL; 3935 r = -EAGAIN; 3936 } 3937 amdgpu_release_ras_context(adev); 3938 3939 failed: 3940 amdgpu_vf_error_trans_all(adev); 3941 3942 return r; 3943 } 3944 3945 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 3946 { 3947 3948 /* Clear all CPU mappings pointing to this device */ 3949 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 3950 3951 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 3952 amdgpu_doorbell_fini(adev); 3953 3954 iounmap(adev->rmmio); 3955 adev->rmmio = NULL; 3956 if (adev->mman.aper_base_kaddr) 3957 iounmap(adev->mman.aper_base_kaddr); 3958 adev->mman.aper_base_kaddr = NULL; 3959 3960 /* Memory manager related */ 3961 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 3962 arch_phys_wc_del(adev->gmc.vram_mtrr); 3963 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 3964 } 3965 } 3966 3967 /** 3968 * amdgpu_device_fini_hw - tear down the driver 3969 * 3970 * @adev: amdgpu_device pointer 3971 * 3972 * Tear down the driver info (all asics). 3973 * Called at driver shutdown. 
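 *
 * Only the hardware side is torn down here; the software side is handled
 * separately by amdgpu_device_fini_sw() below, which is what lets a
 * hot-unplugged device drop its MMIO mappings and doorbells early.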
3974 */ 3975 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 3976 { 3977 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3978 flush_delayed_work(&adev->delayed_init_work); 3979 adev->shutdown = true; 3980 3981 /* make sure IB test finished before entering exclusive mode 3982 * to avoid preemption on IB test 3983 */ 3984 if (amdgpu_sriov_vf(adev)) { 3985 amdgpu_virt_request_full_gpu(adev, false); 3986 amdgpu_virt_fini_data_exchange(adev); 3987 } 3988 3989 /* disable all interrupts */ 3990 amdgpu_irq_disable_all(adev); 3991 if (adev->mode_info.mode_config_initialized) { 3992 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 3993 drm_helper_force_disable_all(adev_to_drm(adev)); 3994 else 3995 drm_atomic_helper_shutdown(adev_to_drm(adev)); 3996 } 3997 amdgpu_fence_driver_hw_fini(adev); 3998 3999 if (adev->mman.initialized) 4000 drain_workqueue(adev->mman.bdev.wq); 4001 4002 if (adev->pm.sysfs_initialized) 4003 amdgpu_pm_sysfs_fini(adev); 4004 if (adev->ucode_sysfs_en) 4005 amdgpu_ucode_sysfs_fini(adev); 4006 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4007 amdgpu_fru_sysfs_fini(adev); 4008 4009 /* disable ras feature must before hw fini */ 4010 amdgpu_ras_pre_fini(adev); 4011 4012 amdgpu_device_ip_fini_early(adev); 4013 4014 amdgpu_irq_fini_hw(adev); 4015 4016 if (adev->mman.initialized) 4017 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4018 4019 amdgpu_gart_dummy_page_fini(adev); 4020 4021 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4022 amdgpu_device_unmap_mmio(adev); 4023 4024 } 4025 4026 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4027 { 4028 int idx; 4029 bool px; 4030 4031 amdgpu_fence_driver_sw_fini(adev); 4032 amdgpu_device_ip_fini(adev); 4033 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4034 adev->accel_working = false; 4035 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4036 4037 amdgpu_reset_fini(adev); 4038 4039 /* free i2c buses */ 4040 if (!amdgpu_device_has_dc_support(adev)) 4041 amdgpu_i2c_fini(adev); 4042 4043 if (amdgpu_emu_mode != 1) 4044 amdgpu_atombios_fini(adev); 4045 4046 kfree(adev->bios); 4047 adev->bios = NULL; 4048 4049 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4050 4051 if (px || (!dev_is_removable(&adev->pdev->dev) && 4052 apple_gmux_detect(NULL, NULL))) 4053 vga_switcheroo_unregister_client(adev->pdev); 4054 4055 if (px) 4056 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4057 4058 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4059 vga_client_unregister(adev->pdev); 4060 4061 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4062 4063 iounmap(adev->rmmio); 4064 adev->rmmio = NULL; 4065 amdgpu_doorbell_fini(adev); 4066 drm_dev_exit(idx); 4067 } 4068 4069 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4070 amdgpu_pmu_fini(adev); 4071 if (adev->mman.discovery_bin) 4072 amdgpu_discovery_fini(adev); 4073 4074 amdgpu_reset_put_reset_domain(adev->reset_domain); 4075 adev->reset_domain = NULL; 4076 4077 kfree(adev->pci_state); 4078 4079 } 4080 4081 /** 4082 * amdgpu_device_evict_resources - evict device resources 4083 * @adev: amdgpu device object 4084 * 4085 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4086 * of the vram memory type. Mainly used for evicting device resources 4087 * at suspend time. 
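 * Note that amdgpu_device_suspend() below calls this twice: once up front to
 * move the bulk of the BOs, and once more after phase1 suspend to pick up
 * whatever could not be evicted on the first pass.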
4088 * 4089 */ 4090 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4091 { 4092 int ret; 4093 4094 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4095 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4096 return 0; 4097 4098 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4099 if (ret) 4100 DRM_WARN("evicting device resources failed\n"); 4101 return ret; 4102 } 4103 4104 /* 4105 * Suspend & resume. 4106 */ 4107 /** 4108 * amdgpu_device_suspend - initiate device suspend 4109 * 4110 * @dev: drm dev pointer 4111 * @fbcon : notify the fbdev of suspend 4112 * 4113 * Puts the hw in the suspend state (all asics). 4114 * Returns 0 for success or an error on failure. 4115 * Called at driver suspend. 4116 */ 4117 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4118 { 4119 struct amdgpu_device *adev = drm_to_adev(dev); 4120 int r = 0; 4121 4122 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4123 return 0; 4124 4125 adev->in_suspend = true; 4126 4127 /* Evict the majority of BOs before grabbing the full access */ 4128 r = amdgpu_device_evict_resources(adev); 4129 if (r) 4130 return r; 4131 4132 if (amdgpu_sriov_vf(adev)) { 4133 amdgpu_virt_fini_data_exchange(adev); 4134 r = amdgpu_virt_request_full_gpu(adev, false); 4135 if (r) 4136 return r; 4137 } 4138 4139 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4140 DRM_WARN("smart shift update failed\n"); 4141 4142 if (fbcon) 4143 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4144 4145 cancel_delayed_work_sync(&adev->delayed_init_work); 4146 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4147 4148 amdgpu_ras_suspend(adev); 4149 4150 amdgpu_device_ip_suspend_phase1(adev); 4151 4152 if (!adev->in_s0ix) 4153 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4154 4155 r = amdgpu_device_evict_resources(adev); 4156 if (r) 4157 return r; 4158 4159 amdgpu_fence_driver_hw_fini(adev); 4160 4161 amdgpu_device_ip_suspend_phase2(adev); 4162 4163 if (amdgpu_sriov_vf(adev)) 4164 amdgpu_virt_release_full_gpu(adev, false); 4165 4166 return 0; 4167 } 4168 4169 /** 4170 * amdgpu_device_resume - initiate device resume 4171 * 4172 * @dev: drm dev pointer 4173 * @fbcon : notify the fbdev of resume 4174 * 4175 * Bring the hw back to operating state (all asics). 4176 * Returns 0 for success or an error on failure. 4177 * Called at driver resume. 
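 *
 * Illustrative caller (a simplified sketch of the PM ops in amdgpu_drv.c;
 * the example_pmops_resume name here is made up for illustration):
 *
 *   static int example_pmops_resume(struct device *dev)
 *   {
 *           struct drm_device *drm_dev = dev_get_drvdata(dev);
 *
 *           return amdgpu_device_resume(drm_dev, true);
 *   }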
4178 */ 4179 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4180 { 4181 struct amdgpu_device *adev = drm_to_adev(dev); 4182 int r = 0; 4183 4184 if (amdgpu_sriov_vf(adev)) { 4185 r = amdgpu_virt_request_full_gpu(adev, true); 4186 if (r) 4187 return r; 4188 } 4189 4190 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4191 return 0; 4192 4193 if (adev->in_s0ix) 4194 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4195 4196 /* post card */ 4197 if (amdgpu_device_need_post(adev)) { 4198 r = amdgpu_device_asic_init(adev); 4199 if (r) 4200 dev_err(adev->dev, "amdgpu asic init failed\n"); 4201 } 4202 4203 r = amdgpu_device_ip_resume(adev); 4204 4205 if (r) { 4206 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4207 goto exit; 4208 } 4209 amdgpu_fence_driver_hw_init(adev); 4210 4211 r = amdgpu_device_ip_late_init(adev); 4212 if (r) 4213 goto exit; 4214 4215 queue_delayed_work(system_wq, &adev->delayed_init_work, 4216 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4217 4218 if (!adev->in_s0ix) { 4219 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4220 if (r) 4221 goto exit; 4222 } 4223 4224 exit: 4225 if (amdgpu_sriov_vf(adev)) { 4226 amdgpu_virt_init_data_exchange(adev); 4227 amdgpu_virt_release_full_gpu(adev, true); 4228 } 4229 4230 if (r) 4231 return r; 4232 4233 /* Make sure IB tests flushed */ 4234 flush_delayed_work(&adev->delayed_init_work); 4235 4236 if (fbcon) 4237 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4238 4239 amdgpu_ras_resume(adev); 4240 4241 if (adev->mode_info.num_crtc) { 4242 /* 4243 * Most of the connector probing functions try to acquire runtime pm 4244 * refs to ensure that the GPU is powered on when connector polling is 4245 * performed. Since we're calling this from a runtime PM callback, 4246 * trying to acquire rpm refs will cause us to deadlock. 4247 * 4248 * Since we're guaranteed to be holding the rpm lock, it's safe to 4249 * temporarily disable the rpm helpers so this doesn't deadlock us. 4250 */ 4251 #ifdef CONFIG_PM 4252 dev->dev->power.disable_depth++; 4253 #endif 4254 if (!adev->dc_enabled) 4255 drm_helper_hpd_irq_event(dev); 4256 else 4257 drm_kms_helper_hotplug_event(dev); 4258 #ifdef CONFIG_PM 4259 dev->dev->power.disable_depth--; 4260 #endif 4261 } 4262 adev->in_suspend = false; 4263 4264 if (adev->enable_mes) 4265 amdgpu_mes_self_test(adev); 4266 4267 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4268 DRM_WARN("smart shift update failed\n"); 4269 4270 return 0; 4271 } 4272 4273 /** 4274 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4275 * 4276 * @adev: amdgpu_device pointer 4277 * 4278 * The list of all the hardware IPs that make up the asic is walked and 4279 * the check_soft_reset callbacks are run. check_soft_reset determines 4280 * if the asic is still hung or not. 4281 * Returns true if any of the IPs are still in a hung state, false if not. 
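 *
 * Note that SR-IOV VFs, and ASICs for which amdgpu_asic_need_full_reset()
 * is true, are reported as hung unconditionally, without consulting the
 * per-IP check_soft_reset callbacks.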
4282 */ 4283 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4284 { 4285 int i; 4286 bool asic_hang = false; 4287 4288 if (amdgpu_sriov_vf(adev)) 4289 return true; 4290 4291 if (amdgpu_asic_need_full_reset(adev)) 4292 return true; 4293 4294 for (i = 0; i < adev->num_ip_blocks; i++) { 4295 if (!adev->ip_blocks[i].status.valid) 4296 continue; 4297 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4298 adev->ip_blocks[i].status.hang = 4299 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4300 if (adev->ip_blocks[i].status.hang) { 4301 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4302 asic_hang = true; 4303 } 4304 } 4305 return asic_hang; 4306 } 4307 4308 /** 4309 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4310 * 4311 * @adev: amdgpu_device pointer 4312 * 4313 * The list of all the hardware IPs that make up the asic is walked and the 4314 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4315 * handles any IP specific hardware or software state changes that are 4316 * necessary for a soft reset to succeed. 4317 * Returns 0 on success, negative error code on failure. 4318 */ 4319 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4320 { 4321 int i, r = 0; 4322 4323 for (i = 0; i < adev->num_ip_blocks; i++) { 4324 if (!adev->ip_blocks[i].status.valid) 4325 continue; 4326 if (adev->ip_blocks[i].status.hang && 4327 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4328 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4329 if (r) 4330 return r; 4331 } 4332 } 4333 4334 return 0; 4335 } 4336 4337 /** 4338 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4339 * 4340 * @adev: amdgpu_device pointer 4341 * 4342 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4343 * reset is necessary to recover. 4344 * Returns true if a full asic reset is required, false if not. 4345 */ 4346 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4347 { 4348 int i; 4349 4350 if (amdgpu_asic_need_full_reset(adev)) 4351 return true; 4352 4353 for (i = 0; i < adev->num_ip_blocks; i++) { 4354 if (!adev->ip_blocks[i].status.valid) 4355 continue; 4356 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4357 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4358 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4359 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4360 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4361 if (adev->ip_blocks[i].status.hang) { 4362 dev_info(adev->dev, "Some block need full reset!\n"); 4363 return true; 4364 } 4365 } 4366 } 4367 return false; 4368 } 4369 4370 /** 4371 * amdgpu_device_ip_soft_reset - do a soft reset 4372 * 4373 * @adev: amdgpu_device pointer 4374 * 4375 * The list of all the hardware IPs that make up the asic is walked and the 4376 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4377 * IP specific hardware or software state changes that are necessary to soft 4378 * reset the IP. 4379 * Returns 0 on success, negative error code on failure. 
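 *
 * Illustrative ordering, mirroring how amdgpu_device_pre_asic_reset()
 * further down in this file drives the soft-reset callbacks:
 *
 *   if (amdgpu_device_ip_check_soft_reset(adev)) {
 *           amdgpu_device_ip_pre_soft_reset(adev);
 *           r = amdgpu_device_ip_soft_reset(adev);
 *           amdgpu_device_ip_post_soft_reset(adev);
 *           if (r || amdgpu_device_ip_check_soft_reset(adev))
 *                   need_full_reset = true;
 *   }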
4380 */ 4381 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4382 { 4383 int i, r = 0; 4384 4385 for (i = 0; i < adev->num_ip_blocks; i++) { 4386 if (!adev->ip_blocks[i].status.valid) 4387 continue; 4388 if (adev->ip_blocks[i].status.hang && 4389 adev->ip_blocks[i].version->funcs->soft_reset) { 4390 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4391 if (r) 4392 return r; 4393 } 4394 } 4395 4396 return 0; 4397 } 4398 4399 /** 4400 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4401 * 4402 * @adev: amdgpu_device pointer 4403 * 4404 * The list of all the hardware IPs that make up the asic is walked and the 4405 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4406 * handles any IP specific hardware or software state changes that are 4407 * necessary after the IP has been soft reset. 4408 * Returns 0 on success, negative error code on failure. 4409 */ 4410 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4411 { 4412 int i, r = 0; 4413 4414 for (i = 0; i < adev->num_ip_blocks; i++) { 4415 if (!adev->ip_blocks[i].status.valid) 4416 continue; 4417 if (adev->ip_blocks[i].status.hang && 4418 adev->ip_blocks[i].version->funcs->post_soft_reset) 4419 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4420 if (r) 4421 return r; 4422 } 4423 4424 return 0; 4425 } 4426 4427 /** 4428 * amdgpu_device_recover_vram - Recover some VRAM contents 4429 * 4430 * @adev: amdgpu_device pointer 4431 * 4432 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4433 * restore things like GPUVM page tables after a GPU reset where 4434 * the contents of VRAM might be lost. 4435 * 4436 * Returns: 4437 * 0 on success, negative error code on failure. 4438 */ 4439 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4440 { 4441 struct dma_fence *fence = NULL, *next = NULL; 4442 struct amdgpu_bo *shadow; 4443 struct amdgpu_bo_vm *vmbo; 4444 long r = 1, tmo; 4445 4446 if (amdgpu_sriov_runtime(adev)) 4447 tmo = msecs_to_jiffies(8000); 4448 else 4449 tmo = msecs_to_jiffies(100); 4450 4451 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4452 mutex_lock(&adev->shadow_list_lock); 4453 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4454 /* If vm is compute context or adev is APU, shadow will be NULL */ 4455 if (!vmbo->shadow) 4456 continue; 4457 shadow = vmbo->shadow; 4458 4459 /* No need to recover an evicted BO */ 4460 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4461 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4462 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4463 continue; 4464 4465 r = amdgpu_bo_restore_shadow(shadow, &next); 4466 if (r) 4467 break; 4468 4469 if (fence) { 4470 tmo = dma_fence_wait_timeout(fence, false, tmo); 4471 dma_fence_put(fence); 4472 fence = next; 4473 if (tmo == 0) { 4474 r = -ETIMEDOUT; 4475 break; 4476 } else if (tmo < 0) { 4477 r = tmo; 4478 break; 4479 } 4480 } else { 4481 fence = next; 4482 } 4483 } 4484 mutex_unlock(&adev->shadow_list_lock); 4485 4486 if (fence) 4487 tmo = dma_fence_wait_timeout(fence, false, tmo); 4488 dma_fence_put(fence); 4489 4490 if (r < 0 || tmo <= 0) { 4491 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4492 return -EIO; 4493 } 4494 4495 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4496 return 0; 4497 } 4498 4499 4500 /** 4501 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4502 * 4503 * @adev: amdgpu_device pointer 4504 * 
@from_hypervisor: request from hypervisor 4505 * 4506 * do VF FLR and reinitialize Asic 4507 * return 0 means succeeded otherwise failed 4508 */ 4509 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4510 bool from_hypervisor) 4511 { 4512 int r; 4513 struct amdgpu_hive_info *hive = NULL; 4514 int retry_limit = 0; 4515 4516 retry: 4517 amdgpu_amdkfd_pre_reset(adev); 4518 4519 if (from_hypervisor) 4520 r = amdgpu_virt_request_full_gpu(adev, true); 4521 else 4522 r = amdgpu_virt_reset_gpu(adev); 4523 if (r) 4524 return r; 4525 amdgpu_irq_gpu_reset_resume_helper(adev); 4526 4527 /* some sw clean up VF needs to do before recover */ 4528 amdgpu_virt_post_reset(adev); 4529 4530 /* Resume IP prior to SMC */ 4531 r = amdgpu_device_ip_reinit_early_sriov(adev); 4532 if (r) 4533 goto error; 4534 4535 amdgpu_virt_init_data_exchange(adev); 4536 4537 r = amdgpu_device_fw_loading(adev); 4538 if (r) 4539 return r; 4540 4541 /* now we are okay to resume SMC/CP/SDMA */ 4542 r = amdgpu_device_ip_reinit_late_sriov(adev); 4543 if (r) 4544 goto error; 4545 4546 hive = amdgpu_get_xgmi_hive(adev); 4547 /* Update PSP FW topology after reset */ 4548 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4549 r = amdgpu_xgmi_update_topology(hive, adev); 4550 4551 if (hive) 4552 amdgpu_put_xgmi_hive(hive); 4553 4554 if (!r) { 4555 r = amdgpu_ib_ring_tests(adev); 4556 4557 amdgpu_amdkfd_post_reset(adev); 4558 } 4559 4560 error: 4561 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4562 amdgpu_inc_vram_lost(adev); 4563 r = amdgpu_device_recover_vram(adev); 4564 } 4565 amdgpu_virt_release_full_gpu(adev, true); 4566 4567 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4568 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4569 retry_limit++; 4570 goto retry; 4571 } else 4572 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4573 } 4574 4575 return r; 4576 } 4577 4578 /** 4579 * amdgpu_device_has_job_running - check if there is any job in mirror list 4580 * 4581 * @adev: amdgpu_device pointer 4582 * 4583 * check if there is any job in mirror list 4584 */ 4585 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4586 { 4587 int i; 4588 struct drm_sched_job *job; 4589 4590 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4591 struct amdgpu_ring *ring = adev->rings[i]; 4592 4593 if (!ring || !ring->sched.thread) 4594 continue; 4595 4596 spin_lock(&ring->sched.job_list_lock); 4597 job = list_first_entry_or_null(&ring->sched.pending_list, 4598 struct drm_sched_job, list); 4599 spin_unlock(&ring->sched.job_list_lock); 4600 if (job) 4601 return true; 4602 } 4603 return false; 4604 } 4605 4606 /** 4607 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4608 * 4609 * @adev: amdgpu_device pointer 4610 * 4611 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4612 * a hung GPU. 
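 *
 * The amdgpu_gpu_recovery module parameter is interpreted as: 0 disables
 * recovery, -1 ("auto", the default) enables it except on the legacy ASICs
 * listed in the switch below, and any other value enables it unconditionally.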
4613 */ 4614 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4615 { 4616 4617 if (amdgpu_gpu_recovery == 0) 4618 goto disabled; 4619 4620 /* Skip soft reset check in fatal error mode */ 4621 if (!amdgpu_ras_is_poison_mode_supported(adev)) 4622 return true; 4623 4624 if (amdgpu_sriov_vf(adev)) 4625 return true; 4626 4627 if (amdgpu_gpu_recovery == -1) { 4628 switch (adev->asic_type) { 4629 #ifdef CONFIG_DRM_AMDGPU_SI 4630 case CHIP_VERDE: 4631 case CHIP_TAHITI: 4632 case CHIP_PITCAIRN: 4633 case CHIP_OLAND: 4634 case CHIP_HAINAN: 4635 #endif 4636 #ifdef CONFIG_DRM_AMDGPU_CIK 4637 case CHIP_KAVERI: 4638 case CHIP_KABINI: 4639 case CHIP_MULLINS: 4640 #endif 4641 case CHIP_CARRIZO: 4642 case CHIP_STONEY: 4643 case CHIP_CYAN_SKILLFISH: 4644 goto disabled; 4645 default: 4646 break; 4647 } 4648 } 4649 4650 return true; 4651 4652 disabled: 4653 dev_info(adev->dev, "GPU recovery disabled.\n"); 4654 return false; 4655 } 4656 4657 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4658 { 4659 u32 i; 4660 int ret = 0; 4661 4662 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4663 4664 dev_info(adev->dev, "GPU mode1 reset\n"); 4665 4666 /* disable BM */ 4667 pci_clear_master(adev->pdev); 4668 4669 amdgpu_device_cache_pci_state(adev->pdev); 4670 4671 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4672 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4673 ret = amdgpu_dpm_mode1_reset(adev); 4674 } else { 4675 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4676 ret = psp_gpu_reset(adev); 4677 } 4678 4679 if (ret) 4680 goto mode1_reset_failed; 4681 4682 amdgpu_device_load_pci_state(adev->pdev); 4683 ret = amdgpu_psp_wait_for_bootloader(adev); 4684 if (ret) 4685 goto mode1_reset_failed; 4686 4687 /* wait for asic to come out of reset */ 4688 for (i = 0; i < adev->usec_timeout; i++) { 4689 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4690 4691 if (memsize != 0xffffffff) 4692 break; 4693 udelay(1); 4694 } 4695 4696 if (i >= adev->usec_timeout) { 4697 ret = -ETIMEDOUT; 4698 goto mode1_reset_failed; 4699 } 4700 4701 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4702 4703 return 0; 4704 4705 mode1_reset_failed: 4706 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4707 return ret; 4708 } 4709 4710 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4711 struct amdgpu_reset_context *reset_context) 4712 { 4713 int i, r = 0; 4714 struct amdgpu_job *job = NULL; 4715 bool need_full_reset = 4716 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4717 4718 if (reset_context->reset_req_dev == adev) 4719 job = reset_context->job; 4720 4721 if (amdgpu_sriov_vf(adev)) { 4722 /* stop the data exchange thread */ 4723 amdgpu_virt_fini_data_exchange(adev); 4724 } 4725 4726 amdgpu_fence_driver_isr_toggle(adev, true); 4727 4728 /* block all schedulers and reset given job's ring */ 4729 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4730 struct amdgpu_ring *ring = adev->rings[i]; 4731 4732 if (!ring || !ring->sched.thread) 4733 continue; 4734 4735 /* Clear job fence from fence drv to avoid force_completion 4736 * leave NULL and vm flush fence in fence drv 4737 */ 4738 amdgpu_fence_driver_clear_job_fences(ring); 4739 4740 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4741 amdgpu_fence_driver_force_completion(ring); 4742 } 4743 4744 amdgpu_fence_driver_isr_toggle(adev, false); 4745 4746 if (job && job->vm) 4747 drm_sched_increase_karma(&job->base); 4748 4749 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4750 /* If reset handler not 
implemented, continue; otherwise return */ 4751 if (r == -EOPNOTSUPP) 4752 r = 0; 4753 else 4754 return r; 4755 4756 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4757 if (!amdgpu_sriov_vf(adev)) { 4758 4759 if (!need_full_reset) 4760 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4761 4762 if (!need_full_reset && amdgpu_gpu_recovery && 4763 amdgpu_device_ip_check_soft_reset(adev)) { 4764 amdgpu_device_ip_pre_soft_reset(adev); 4765 r = amdgpu_device_ip_soft_reset(adev); 4766 amdgpu_device_ip_post_soft_reset(adev); 4767 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4768 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4769 need_full_reset = true; 4770 } 4771 } 4772 4773 if (need_full_reset) 4774 r = amdgpu_device_ip_suspend(adev); 4775 if (need_full_reset) 4776 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4777 else 4778 clear_bit(AMDGPU_NEED_FULL_RESET, 4779 &reset_context->flags); 4780 } 4781 4782 return r; 4783 } 4784 4785 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 4786 { 4787 int i; 4788 4789 lockdep_assert_held(&adev->reset_domain->sem); 4790 4791 for (i = 0; i < adev->num_regs; i++) { 4792 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); 4793 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], 4794 adev->reset_dump_reg_value[i]); 4795 } 4796 4797 return 0; 4798 } 4799 4800 #ifdef CONFIG_DEV_COREDUMP 4801 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, 4802 size_t count, void *data, size_t datalen) 4803 { 4804 struct drm_printer p; 4805 struct amdgpu_device *adev = data; 4806 struct drm_print_iterator iter; 4807 int i; 4808 4809 iter.data = buffer; 4810 iter.offset = 0; 4811 iter.start = offset; 4812 iter.remain = count; 4813 4814 p = drm_coredump_printer(&iter); 4815 4816 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 4817 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 4818 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 4819 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec); 4820 if (adev->reset_task_info.pid) 4821 drm_printf(&p, "process_name: %s PID: %d\n", 4822 adev->reset_task_info.process_name, 4823 adev->reset_task_info.pid); 4824 4825 if (adev->reset_vram_lost) 4826 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 4827 if (adev->num_regs) { 4828 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); 4829 4830 for (i = 0; i < adev->num_regs; i++) 4831 drm_printf(&p, "0x%08x: 0x%08x\n", 4832 adev->reset_dump_reg_list[i], 4833 adev->reset_dump_reg_value[i]); 4834 } 4835 4836 return count - iter.remain; 4837 } 4838 4839 static void amdgpu_devcoredump_free(void *data) 4840 { 4841 } 4842 4843 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) 4844 { 4845 struct drm_device *dev = adev_to_drm(adev); 4846 4847 ktime_get_ts64(&adev->reset_time); 4848 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_NOWAIT, 4849 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 4850 } 4851 #endif 4852 4853 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4854 struct amdgpu_reset_context *reset_context) 4855 { 4856 struct amdgpu_device *tmp_adev = NULL; 4857 bool need_full_reset, skip_hw_reset, vram_lost = false; 4858 int r = 0; 4859 bool gpu_reset_for_dev_remove = 0; 4860 4861 /* Try reset handler method first */ 4862 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4863 reset_list); 4864 amdgpu_reset_reg_dumps(tmp_adev); 4865 4866 
reset_context->reset_device_list = device_list_handle; 4867 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4868 /* If reset handler not implemented, continue; otherwise return */ 4869 if (r == -EOPNOTSUPP) 4870 r = 0; 4871 else 4872 return r; 4873 4874 /* Reset handler not implemented, use the default method */ 4875 need_full_reset = 4876 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4877 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4878 4879 gpu_reset_for_dev_remove = 4880 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 4881 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4882 4883 /* 4884 * ASIC reset has to be done on all XGMI hive nodes ASAP 4885 * to allow proper links negotiation in FW (within 1 sec) 4886 */ 4887 if (!skip_hw_reset && need_full_reset) { 4888 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4889 /* For XGMI run all resets in parallel to speed up the process */ 4890 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4891 tmp_adev->gmc.xgmi.pending_reset = false; 4892 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4893 r = -EALREADY; 4894 } else 4895 r = amdgpu_asic_reset(tmp_adev); 4896 4897 if (r) { 4898 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4899 r, adev_to_drm(tmp_adev)->unique); 4900 break; 4901 } 4902 } 4903 4904 /* For XGMI wait for all resets to complete before proceed */ 4905 if (!r) { 4906 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4907 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4908 flush_work(&tmp_adev->xgmi_reset_work); 4909 r = tmp_adev->asic_reset_res; 4910 if (r) 4911 break; 4912 } 4913 } 4914 } 4915 } 4916 4917 if (!r && amdgpu_ras_intr_triggered()) { 4918 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4919 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && 4920 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 4921 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 4922 } 4923 4924 amdgpu_ras_intr_cleared(); 4925 } 4926 4927 /* Since the mode1 reset affects base ip blocks, the 4928 * phase1 ip blocks need to be resumed. Otherwise there 4929 * will be a BIOS signature error and the psp bootloader 4930 * can't load kdb on the next amdgpu install. 
*/
4932 if (gpu_reset_for_dev_remove) {
4933 list_for_each_entry(tmp_adev, device_list_handle, reset_list)
4934 amdgpu_device_ip_resume_phase1(tmp_adev);
4935
4936 goto end;
4937 }
4938
4939 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4940 if (need_full_reset) {
4941 /* post card */
4942 r = amdgpu_device_asic_init(tmp_adev);
4943 if (r) {
4944 dev_warn(tmp_adev->dev, "asic atom init failed!");
4945 } else {
4946 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4947
4948 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4949 if (r)
4950 goto out;
4951
4952 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4953 #ifdef CONFIG_DEV_COREDUMP
4954 tmp_adev->reset_vram_lost = vram_lost;
4955 memset(&tmp_adev->reset_task_info, 0,
4956 sizeof(tmp_adev->reset_task_info));
4957 if (reset_context->job && reset_context->job->vm)
4958 tmp_adev->reset_task_info =
4959 reset_context->job->vm->task_info;
4960 amdgpu_reset_capture_coredumpm(tmp_adev);
4961 #endif
4962 if (vram_lost) {
4963 DRM_INFO("VRAM is lost due to GPU reset!\n");
4964 amdgpu_inc_vram_lost(tmp_adev);
4965 }
4966
4967 r = amdgpu_device_fw_loading(tmp_adev);
4968 if (r)
4969 return r;
4970
4971 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4972 if (r)
4973 goto out;
4974
4975 if (vram_lost)
4976 amdgpu_device_fill_reset_magic(tmp_adev);
4977
4978 /*
4979 * Add this ASIC back as tracked, since the reset
4980 * already completed successfully.
4981 */
4982 amdgpu_register_gpu_instance(tmp_adev);
4983
4984 if (!reset_context->hive &&
4985 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4986 amdgpu_xgmi_add_device(tmp_adev);
4987
4988 r = amdgpu_device_ip_late_init(tmp_adev);
4989 if (r)
4990 goto out;
4991
4992 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);
4993
4994 /*
4995 * The GPU enters a bad state once the number of faulty
4996 * pages retired by ECC reaches the threshold, and RAS
4997 * recovery is scheduled next. So add a check here to
4998 * break recovery if it indeed exceeds the bad page
4999 * threshold, and remind the user to either retire this
5000 * GPU or set a bigger bad_page_threshold value to get
5001 * past this the next time the driver is probed.
5002 *
5003 */
5004 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
5005 /* must succeed.
*/ 5006 amdgpu_ras_resume(tmp_adev); 5007 } else { 5008 r = -EINVAL; 5009 goto out; 5010 } 5011 5012 /* Update PSP FW topology after reset */ 5013 if (reset_context->hive && 5014 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5015 r = amdgpu_xgmi_update_topology( 5016 reset_context->hive, tmp_adev); 5017 } 5018 } 5019 5020 out: 5021 if (!r) { 5022 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5023 r = amdgpu_ib_ring_tests(tmp_adev); 5024 if (r) { 5025 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5026 need_full_reset = true; 5027 r = -EAGAIN; 5028 goto end; 5029 } 5030 } 5031 5032 if (!r) 5033 r = amdgpu_device_recover_vram(tmp_adev); 5034 else 5035 tmp_adev->asic_reset_res = r; 5036 } 5037 5038 end: 5039 if (need_full_reset) 5040 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5041 else 5042 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5043 return r; 5044 } 5045 5046 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5047 { 5048 5049 switch (amdgpu_asic_reset_method(adev)) { 5050 case AMD_RESET_METHOD_MODE1: 5051 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5052 break; 5053 case AMD_RESET_METHOD_MODE2: 5054 adev->mp1_state = PP_MP1_STATE_RESET; 5055 break; 5056 default: 5057 adev->mp1_state = PP_MP1_STATE_NONE; 5058 break; 5059 } 5060 } 5061 5062 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5063 { 5064 amdgpu_vf_error_trans_all(adev); 5065 adev->mp1_state = PP_MP1_STATE_NONE; 5066 } 5067 5068 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5069 { 5070 struct pci_dev *p = NULL; 5071 5072 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5073 adev->pdev->bus->number, 1); 5074 if (p) { 5075 pm_runtime_enable(&(p->dev)); 5076 pm_runtime_resume(&(p->dev)); 5077 } 5078 5079 pci_dev_put(p); 5080 } 5081 5082 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5083 { 5084 enum amd_reset_method reset_method; 5085 struct pci_dev *p = NULL; 5086 u64 expires; 5087 5088 /* 5089 * For now, only BACO and mode1 reset are confirmed 5090 * to suffer the audio issue without proper suspended. 5091 */ 5092 reset_method = amdgpu_asic_reset_method(adev); 5093 if ((reset_method != AMD_RESET_METHOD_BACO) && 5094 (reset_method != AMD_RESET_METHOD_MODE1)) 5095 return -EINVAL; 5096 5097 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5098 adev->pdev->bus->number, 1); 5099 if (!p) 5100 return -ENODEV; 5101 5102 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5103 if (!expires) 5104 /* 5105 * If we cannot get the audio device autosuspend delay, 5106 * a fixed 4S interval will be used. Considering 3S is 5107 * the audio controller default autosuspend delay setting. 5108 * 4S used here is guaranteed to cover that. 5109 */ 5110 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5111 5112 while (!pm_runtime_status_suspended(&(p->dev))) { 5113 if (!pm_runtime_suspend(&(p->dev))) 5114 break; 5115 5116 if (expires < ktime_get_mono_fast_ns()) { 5117 dev_warn(adev->dev, "failed to suspend display audio\n"); 5118 pci_dev_put(p); 5119 /* TODO: abort the succeeding gpu reset? 
*/ 5120 return -ETIMEDOUT; 5121 } 5122 } 5123 5124 pm_runtime_disable(&(p->dev)); 5125 5126 pci_dev_put(p); 5127 return 0; 5128 } 5129 5130 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5131 { 5132 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5133 5134 #if defined(CONFIG_DEBUG_FS) 5135 if (!amdgpu_sriov_vf(adev)) 5136 cancel_work(&adev->reset_work); 5137 #endif 5138 5139 if (adev->kfd.dev) 5140 cancel_work(&adev->kfd.reset_work); 5141 5142 if (amdgpu_sriov_vf(adev)) 5143 cancel_work(&adev->virt.flr_work); 5144 5145 if (con && adev->ras_enabled) 5146 cancel_work(&con->recovery_work); 5147 5148 } 5149 5150 /** 5151 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5152 * 5153 * @adev: amdgpu_device pointer 5154 * @job: which job trigger hang 5155 * @reset_context: amdgpu reset context pointer 5156 * 5157 * Attempt to reset the GPU if it has hung (all asics). 5158 * Attempt to do soft-reset or full-reset and reinitialize Asic 5159 * Returns 0 for success or an error on failure. 5160 */ 5161 5162 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5163 struct amdgpu_job *job, 5164 struct amdgpu_reset_context *reset_context) 5165 { 5166 struct list_head device_list, *device_list_handle = NULL; 5167 bool job_signaled = false; 5168 struct amdgpu_hive_info *hive = NULL; 5169 struct amdgpu_device *tmp_adev = NULL; 5170 int i, r = 0; 5171 bool need_emergency_restart = false; 5172 bool audio_suspended = false; 5173 bool gpu_reset_for_dev_remove = false; 5174 5175 gpu_reset_for_dev_remove = 5176 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5177 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5178 5179 /* 5180 * Special case: RAS triggered and full reset isn't supported 5181 */ 5182 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5183 5184 /* 5185 * Flush RAM to disk so that after reboot 5186 * the user can read log and see why the system rebooted. 5187 */ 5188 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5189 amdgpu_ras_get_context(adev)->reboot) { 5190 DRM_WARN("Emergency reboot."); 5191 5192 ksys_sync_helper(); 5193 emergency_restart(); 5194 } 5195 5196 dev_info(adev->dev, "GPU %s begin!\n", 5197 need_emergency_restart ? "jobs stop":"reset"); 5198 5199 if (!amdgpu_sriov_vf(adev)) 5200 hive = amdgpu_get_xgmi_hive(adev); 5201 if (hive) 5202 mutex_lock(&hive->hive_lock); 5203 5204 reset_context->job = job; 5205 reset_context->hive = hive; 5206 /* 5207 * Build list of devices to reset. 5208 * In case we are in XGMI hive mode, resort the device list 5209 * to put adev in the 1st position. 
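 * (list_rotate_to_front() below is what moves adev to the head of the list
 * when it is part of an XGMI hive.)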
5210 */ 5211 INIT_LIST_HEAD(&device_list); 5212 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5213 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5214 list_add_tail(&tmp_adev->reset_list, &device_list); 5215 if (gpu_reset_for_dev_remove && adev->shutdown) 5216 tmp_adev->shutdown = true; 5217 } 5218 if (!list_is_first(&adev->reset_list, &device_list)) 5219 list_rotate_to_front(&adev->reset_list, &device_list); 5220 device_list_handle = &device_list; 5221 } else { 5222 list_add_tail(&adev->reset_list, &device_list); 5223 device_list_handle = &device_list; 5224 } 5225 5226 /* We need to lock reset domain only once both for XGMI and single device */ 5227 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5228 reset_list); 5229 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5230 5231 /* block all schedulers and reset given job's ring */ 5232 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5233 5234 amdgpu_device_set_mp1_state(tmp_adev); 5235 5236 /* 5237 * Try to put the audio codec into suspend state 5238 * before gpu reset started. 5239 * 5240 * Due to the power domain of the graphics device 5241 * is shared with AZ power domain. Without this, 5242 * we may change the audio hardware from behind 5243 * the audio driver's back. That will trigger 5244 * some audio codec errors. 5245 */ 5246 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5247 audio_suspended = true; 5248 5249 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5250 5251 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5252 5253 if (!amdgpu_sriov_vf(tmp_adev)) 5254 amdgpu_amdkfd_pre_reset(tmp_adev); 5255 5256 /* 5257 * Mark these ASICs to be reseted as untracked first 5258 * And add them back after reset completed 5259 */ 5260 amdgpu_unregister_gpu_instance(tmp_adev); 5261 5262 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5263 5264 /* disable ras on ALL IPs */ 5265 if (!need_emergency_restart && 5266 amdgpu_device_ip_need_full_reset(tmp_adev)) 5267 amdgpu_ras_suspend(tmp_adev); 5268 5269 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5270 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5271 5272 if (!ring || !ring->sched.thread) 5273 continue; 5274 5275 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5276 5277 if (need_emergency_restart) 5278 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5279 } 5280 atomic_inc(&tmp_adev->gpu_reset_counter); 5281 } 5282 5283 if (need_emergency_restart) 5284 goto skip_sched_resume; 5285 5286 /* 5287 * Must check guilty signal here since after this point all old 5288 * HW fences are force signaled. 5289 * 5290 * job->base holds a reference to parent fence 5291 */ 5292 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5293 job_signaled = true; 5294 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5295 goto skip_hw_reset; 5296 } 5297 5298 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5299 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5300 if (gpu_reset_for_dev_remove) { 5301 /* Workaroud for ASICs need to disable SMC first */ 5302 amdgpu_device_smu_fini_early(tmp_adev); 5303 } 5304 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5305 /*TODO Should we stop ?*/ 5306 if (r) { 5307 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5308 r, adev_to_drm(tmp_adev)->unique); 5309 tmp_adev->asic_reset_res = r; 5310 } 5311 5312 /* 5313 * Drop all pending non scheduler resets. 
Scheduler resets 5314 * were already dropped during drm_sched_stop 5315 */ 5316 amdgpu_device_stop_pending_resets(tmp_adev); 5317 } 5318 5319 /* Actual ASIC resets if needed.*/ 5320 /* Host driver will handle XGMI hive reset for SRIOV */ 5321 if (amdgpu_sriov_vf(adev)) { 5322 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5323 if (r) 5324 adev->asic_reset_res = r; 5325 5326 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5327 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) || 5328 adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3)) 5329 amdgpu_ras_resume(adev); 5330 } else { 5331 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5332 if (r && r == -EAGAIN) 5333 goto retry; 5334 5335 if (!r && gpu_reset_for_dev_remove) 5336 goto recover_end; 5337 } 5338 5339 skip_hw_reset: 5340 5341 /* Post ASIC reset for all devs .*/ 5342 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5343 5344 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5345 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5346 5347 if (!ring || !ring->sched.thread) 5348 continue; 5349 5350 drm_sched_start(&ring->sched, true); 5351 } 5352 5353 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3)) 5354 amdgpu_mes_self_test(tmp_adev); 5355 5356 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 5357 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5358 5359 if (tmp_adev->asic_reset_res) 5360 r = tmp_adev->asic_reset_res; 5361 5362 tmp_adev->asic_reset_res = 0; 5363 5364 if (r) { 5365 /* bad news, how to tell it to userspace ? */ 5366 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5367 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5368 } else { 5369 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5370 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5371 DRM_WARN("smart shift update failed\n"); 5372 } 5373 } 5374 5375 skip_sched_resume: 5376 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5377 /* unlock kfd: SRIOV would do it separately */ 5378 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5379 amdgpu_amdkfd_post_reset(tmp_adev); 5380 5381 /* kfd_post_reset will do nothing if kfd device is not initialized, 5382 * need to bring up kfd here if it's not be initialized before 5383 */ 5384 if (!adev->kfd.init_complete) 5385 amdgpu_amdkfd_device_init(adev); 5386 5387 if (audio_suspended) 5388 amdgpu_device_resume_display_audio(tmp_adev); 5389 5390 amdgpu_device_unset_mp1_state(tmp_adev); 5391 5392 amdgpu_ras_set_error_query_ready(tmp_adev, true); 5393 } 5394 5395 recover_end: 5396 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5397 reset_list); 5398 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5399 5400 if (hive) { 5401 mutex_unlock(&hive->hive_lock); 5402 amdgpu_put_xgmi_hive(hive); 5403 } 5404 5405 if (r) 5406 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5407 5408 atomic_set(&adev->reset_domain->reset_res, r); 5409 return r; 5410 } 5411 5412 /** 5413 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 5414 * 5415 * @adev: amdgpu_device pointer 5416 * 5417 * Fetchs and stores in the driver the PCIE capabilities (gen speed 5418 * and lanes) of the slot the device is in. Handles APUs and 5419 * virtualized environments where PCIE config space may not be available. 
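 *
 * The results are cached as CAIL_* link speed/width support bitmasks in
 * adev->pm.pcie_gen_mask and adev->pm.pcie_mlw_mask; the amdgpu_pcie_gen_cap
 * and amdgpu_pcie_lane_cap module parameters can be used to override the
 * probed values.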
5420 */ 5421 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5422 { 5423 struct pci_dev *pdev; 5424 enum pci_bus_speed speed_cap, platform_speed_cap; 5425 enum pcie_link_width platform_link_width; 5426 5427 if (amdgpu_pcie_gen_cap) 5428 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5429 5430 if (amdgpu_pcie_lane_cap) 5431 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5432 5433 /* covers APUs as well */ 5434 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 5435 if (adev->pm.pcie_gen_mask == 0) 5436 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5437 if (adev->pm.pcie_mlw_mask == 0) 5438 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5439 return; 5440 } 5441 5442 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5443 return; 5444 5445 pcie_bandwidth_available(adev->pdev, NULL, 5446 &platform_speed_cap, &platform_link_width); 5447 5448 if (adev->pm.pcie_gen_mask == 0) { 5449 /* asic caps */ 5450 pdev = adev->pdev; 5451 speed_cap = pcie_get_speed_cap(pdev); 5452 if (speed_cap == PCI_SPEED_UNKNOWN) { 5453 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5454 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5455 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5456 } else { 5457 if (speed_cap == PCIE_SPEED_32_0GT) 5458 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5459 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5460 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5461 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5462 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5463 else if (speed_cap == PCIE_SPEED_16_0GT) 5464 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5465 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5466 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5467 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5468 else if (speed_cap == PCIE_SPEED_8_0GT) 5469 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5470 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5471 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5472 else if (speed_cap == PCIE_SPEED_5_0GT) 5473 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5474 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5475 else 5476 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5477 } 5478 /* platform caps */ 5479 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5480 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5481 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5482 } else { 5483 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5484 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5485 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5486 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5487 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5488 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5489 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5490 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5491 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5492 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5493 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5494 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5495 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5496 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5497 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5498 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5499 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5500 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5501 else 5502 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5503 5504 } 5505 } 5506 if (adev->pm.pcie_mlw_mask == 0) { 5507 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) 
{ 5508 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5509 } else { 5510 switch (platform_link_width) { 5511 case PCIE_LNK_X32: 5512 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5513 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5514 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5515 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5516 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5517 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5518 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5519 break; 5520 case PCIE_LNK_X16: 5521 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5522 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5523 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5524 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5525 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5526 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5527 break; 5528 case PCIE_LNK_X12: 5529 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5530 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5531 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5532 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5533 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5534 break; 5535 case PCIE_LNK_X8: 5536 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5537 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5538 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5539 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5540 break; 5541 case PCIE_LNK_X4: 5542 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5543 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5544 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5545 break; 5546 case PCIE_LNK_X2: 5547 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5548 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5549 break; 5550 case PCIE_LNK_X1: 5551 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5552 break; 5553 default: 5554 break; 5555 } 5556 } 5557 } 5558 } 5559 5560 /** 5561 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 5562 * 5563 * @adev: amdgpu_device pointer 5564 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 5565 * 5566 * Return true if @peer_adev can access (DMA) @adev through the PCIe 5567 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 5568 * @peer_adev. 5569 */ 5570 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 5571 struct amdgpu_device *peer_adev) 5572 { 5573 #ifdef CONFIG_HSA_AMD_P2P 5574 uint64_t address_mask = peer_adev->dev->dma_mask ? 
5575 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 5576 resource_size_t aper_limit = 5577 adev->gmc.aper_base + adev->gmc.aper_size - 1; 5578 bool p2p_access = 5579 !adev->gmc.xgmi.connected_to_cpu && 5580 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 5581 5582 return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size && 5583 adev->gmc.real_vram_size == adev->gmc.visible_vram_size && 5584 !(adev->gmc.aper_base & address_mask || 5585 aper_limit & address_mask)); 5586 #else 5587 return false; 5588 #endif 5589 } 5590 5591 int amdgpu_device_baco_enter(struct drm_device *dev) 5592 { 5593 struct amdgpu_device *adev = drm_to_adev(dev); 5594 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5595 5596 if (!amdgpu_device_supports_baco(dev)) 5597 return -ENOTSUPP; 5598 5599 if (ras && adev->ras_enabled && 5600 adev->nbio.funcs->enable_doorbell_interrupt) 5601 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5602 5603 return amdgpu_dpm_baco_enter(adev); 5604 } 5605 5606 int amdgpu_device_baco_exit(struct drm_device *dev) 5607 { 5608 struct amdgpu_device *adev = drm_to_adev(dev); 5609 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5610 int ret = 0; 5611 5612 if (!amdgpu_device_supports_baco(dev)) 5613 return -ENOTSUPP; 5614 5615 ret = amdgpu_dpm_baco_exit(adev); 5616 if (ret) 5617 return ret; 5618 5619 if (ras && adev->ras_enabled && 5620 adev->nbio.funcs->enable_doorbell_interrupt) 5621 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5622 5623 if (amdgpu_passthrough(adev) && 5624 adev->nbio.funcs->clear_doorbell_interrupt) 5625 adev->nbio.funcs->clear_doorbell_interrupt(adev); 5626 5627 return 0; 5628 } 5629 5630 /** 5631 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5632 * @pdev: PCI device struct 5633 * @state: PCI channel state 5634 * 5635 * Description: Called when a PCI error is detected. 5636 * 5637 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
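 *
 * These PCI error callbacks are wired up through a struct pci_error_handlers
 * instance in the PCI driver registration (illustrative sketch; the
 * example_pci_err_handler name is made up, the real table lives in
 * amdgpu_drv.c):
 *
 *   static const struct pci_error_handlers example_pci_err_handler = {
 *           .error_detected = amdgpu_pci_error_detected,
 *           .mmio_enabled   = amdgpu_pci_mmio_enabled,
 *           .slot_reset     = amdgpu_pci_slot_reset,
 *           .resume         = amdgpu_pci_resume,
 *   };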

/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called when a PCI error is detected.
 *
 * Return: PCI_ERS_RESULT_CAN_RECOVER, PCI_ERS_RESULT_NEED_RESET or
 * PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		DRM_WARN("No support for XGMI hive yet...");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Locking adev->reset_domain->sem will prevent any external access
		 * to GPU during PCI error recovery
		 */
		amdgpu_device_lock_reset_domain(adev->reset_domain);
		amdgpu_device_set_mp1_state(adev);

		/*
		 * Block any work scheduling as we do for regular GPU reset
		 * for the duration of the recovery
		 */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, NULL);
		}
		atomic_inc(&adev->gpu_reset_counter);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{
	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/* This is called only if amdgpu_pci_error_detected() returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Reads/writes to the device still
	 * work, so there is no need to reset the slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the PCI error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for the ASIC to come out of reset */
	msleep(500);

	/* Restore PCI config space */
	amdgpu_device_load_pci_state(pdev);

	/* confirm the ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d\n", r);
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's
 * OK to resume normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		drm_sched_start(&ring->sched, true);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
}

/**
 * amdgpu_device_cache_pci_state - cache the PCI config space state
 *
 * @pdev: PCI device struct
 *
 * Save the PCI config space and keep a kernel copy of it in
 * adev->pci_state so that it can be restored after a reset.
 * Returns true on success, false on failure.
 */
bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

/**
 * amdgpu_device_load_pci_state - restore the cached PCI config space state
 *
 * @pdev: PCI device struct
 *
 * Load the PCI config space state previously cached by
 * amdgpu_device_cache_pci_state() and restore it to the device.
 * Returns true on success, false if no state was cached or the load failed.
 */
bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

/**
 * amdgpu_device_flush_hdp - flush the HDP (Host Data Path) cache
 *
 * @adev: amdgpu_device pointer
 * @ring: optional ring on which to emit the flush, may be NULL
 *
 * Flush the HDP cache, either by emitting a flush packet on @ring or
 * through MMIO. This is a no-op on APUs (unless running in passthrough)
 * and on devices whose XGMI link is connected to the CPU.
 */
void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

/**
 * amdgpu_device_invalidate_hdp - invalidate the HDP (Host Data Path) cache
 *
 * @adev: amdgpu_device pointer
 * @ring: ring associated with the invalidation, may be NULL
 *
 * Invalidate the HDP cache. This is a no-op on APUs (unless running in
 * passthrough) and on devices whose XGMI link is connected to the CPU.
 */
void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. This helps preserve the error context when an error occurs.
 * Compared to a simple hang, the system stays stable, at least for SSH
 * access, so it should then be trivial to inspect the hardware state and
 * see what is going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc.),
 * clears all CPU mappings to the device and disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 * flush any in-flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

/**
 * amdgpu_device_pcie_port_rreg - read a PCIe port register
 *
 * @adev: amdgpu_device pointer
 * @reg: register offset
 *
 * Read a register in the PCIe port register space through the NBIO
 * index/data register pair. Returns the register value.
 */
u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

/**
 * amdgpu_device_pcie_port_wreg - write a PCIe port register
 *
 * @adev: amdgpu_device pointer
 * @reg: register offset
 * @v: value to write
 *
 * Write a register in the PCIe port register space through the NBIO
 * index/data register pair.
 */
void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
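
/*
 * Typical use of the PCIe port accessors above is a read-modify-write
 * sequence, for example (illustrative only, REG and FIELD_MASK are
 * placeholder names, not real definitions):
 *
 *	u32 tmp = amdgpu_device_pcie_port_rreg(adev, REG);
 *	tmp |= FIELD_MASK;
 *	amdgpu_device_pcie_port_wreg(adev, REG, tmp);
 */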

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!adev->ip_versions[DCE_HWIP][0] ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

/**
 * amdgpu_device_wait_on_rreg - poll a register until it reaches a value
 *
 * @adev: amdgpu_device pointer
 * @inst: instance number, used only in the timeout warning
 * @reg_addr: register offset to poll
 * @reg_name: register name, used only in the timeout warning
 * @expected_value: value to wait for
 * @mask: mask applied to the register value before comparing
 *
 * Poll @reg_addr until (value & @mask) == @expected_value. The timeout
 * counter restarts whenever the register value changes.
 * Returns 0 on success or -ETIMEDOUT on timeout.
 */
uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
		uint32_t inst, uint32_t reg_addr, char reg_name[],
		uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				  inst, reg_name, (uint32_t)expected_value,
				  (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}
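
/*
 * Example use of amdgpu_device_wait_on_rreg() (illustrative only; the
 * register offset and mask below are placeholder names, not real
 * definitions):
 *
 *	r = amdgpu_device_wait_on_rreg(adev, 0, regFOO_STATUS, "FOO_STATUS",
 *				       0, FOO_STATUS__BUSY_MASK);
 *	if (r)
 *		return r;
 */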