1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 #include <linux/iommu.h> 34 #include <linux/pci.h> 35 #include <linux/devcoredump.h> 36 #include <generated/utsrelease.h> 37 #include <linux/pci-p2pdma.h> 38 #include <linux/apple-gmux.h> 39 40 #include <drm/drm_aperture.h> 41 #include <drm/drm_atomic_helper.h> 42 #include <drm/drm_crtc_helper.h> 43 #include <drm/drm_fb_helper.h> 44 #include <drm/drm_probe_helper.h> 45 #include <drm/amdgpu_drm.h> 46 #include <linux/vgaarb.h> 47 #include <linux/vga_switcheroo.h> 48 #include <linux/efi.h> 49 #include "amdgpu.h" 50 #include "amdgpu_trace.h" 51 #include "amdgpu_i2c.h" 52 #include "atom.h" 53 #include "amdgpu_atombios.h" 54 #include "amdgpu_atomfirmware.h" 55 #include "amd_pcie.h" 56 #ifdef CONFIG_DRM_AMDGPU_SI 57 #include "si.h" 58 #endif 59 #ifdef CONFIG_DRM_AMDGPU_CIK 60 #include "cik.h" 61 #endif 62 #include "vi.h" 63 #include "soc15.h" 64 #include "nv.h" 65 #include "bif/bif_4_1_d.h" 66 #include <linux/firmware.h> 67 #include "amdgpu_vf_error.h" 68 69 #include "amdgpu_amdkfd.h" 70 #include "amdgpu_pm.h" 71 72 #include "amdgpu_xgmi.h" 73 #include "amdgpu_ras.h" 74 #include "amdgpu_pmu.h" 75 #include "amdgpu_fru_eeprom.h" 76 #include "amdgpu_reset.h" 77 78 #include <linux/suspend.h> 79 #include <drm/task_barrier.h> 80 #include <linux/pm_runtime.h> 81 82 #include <drm/drm_drv.h> 83 84 #if IS_ENABLED(CONFIG_X86) 85 #include <asm/intel-family.h> 86 #endif 87 88 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 89 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 90 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 91 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 95 96 #define AMDGPU_RESUME_MS 2000 97 #define AMDGPU_MAX_RETRY_LIMIT 2 98 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 99 100 static const struct drm_driver amdgpu_kms_driver; 101 102 const char *amdgpu_asic_name[] = { 103 "TAHITI", 104 "PITCAIRN", 105 "VERDE", 106 "OLAND", 107 "HAINAN", 108 "BONAIRE", 109 "KAVERI", 110 
"KABINI", 111 "HAWAII", 112 "MULLINS", 113 "TOPAZ", 114 "TONGA", 115 "FIJI", 116 "CARRIZO", 117 "STONEY", 118 "POLARIS10", 119 "POLARIS11", 120 "POLARIS12", 121 "VEGAM", 122 "VEGA10", 123 "VEGA12", 124 "VEGA20", 125 "RAVEN", 126 "ARCTURUS", 127 "RENOIR", 128 "ALDEBARAN", 129 "NAVI10", 130 "CYAN_SKILLFISH", 131 "NAVI14", 132 "NAVI12", 133 "SIENNA_CICHLID", 134 "NAVY_FLOUNDER", 135 "VANGOGH", 136 "DIMGREY_CAVEFISH", 137 "BEIGE_GOBY", 138 "YELLOW_CARP", 139 "IP DISCOVERY", 140 "LAST", 141 }; 142 143 /** 144 * DOC: pcie_replay_count 145 * 146 * The amdgpu driver provides a sysfs API for reporting the total number 147 * of PCIe replays (NAKs) 148 * The file pcie_replay_count is used for this and returns the total 149 * number of replays as a sum of the NAKs generated and NAKs received 150 */ 151 152 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 153 struct device_attribute *attr, char *buf) 154 { 155 struct drm_device *ddev = dev_get_drvdata(dev); 156 struct amdgpu_device *adev = drm_to_adev(ddev); 157 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 158 159 return sysfs_emit(buf, "%llu\n", cnt); 160 } 161 162 static DEVICE_ATTR(pcie_replay_count, 0444, 163 amdgpu_device_get_pcie_replay_count, NULL); 164 165 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 166 167 168 /** 169 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 170 * 171 * @dev: drm_device pointer 172 * 173 * Returns true if the device is a dGPU with ATPX power control, 174 * otherwise return false. 175 */ 176 bool amdgpu_device_supports_px(struct drm_device *dev) 177 { 178 struct amdgpu_device *adev = drm_to_adev(dev); 179 180 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 181 return true; 182 return false; 183 } 184 185 /** 186 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 187 * 188 * @dev: drm_device pointer 189 * 190 * Returns true if the device is a dGPU with ACPI power control, 191 * otherwise return false. 192 */ 193 bool amdgpu_device_supports_boco(struct drm_device *dev) 194 { 195 struct amdgpu_device *adev = drm_to_adev(dev); 196 197 if (adev->has_pr3 || 198 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 199 return true; 200 return false; 201 } 202 203 /** 204 * amdgpu_device_supports_baco - Does the device support BACO 205 * 206 * @dev: drm_device pointer 207 * 208 * Returns true if the device supporte BACO, 209 * otherwise return false. 210 */ 211 bool amdgpu_device_supports_baco(struct drm_device *dev) 212 { 213 struct amdgpu_device *adev = drm_to_adev(dev); 214 215 return amdgpu_asic_supports_baco(adev); 216 } 217 218 /** 219 * amdgpu_device_supports_smart_shift - Is the device dGPU with 220 * smart shift support 221 * 222 * @dev: drm_device pointer 223 * 224 * Returns true if the device is a dGPU with Smart Shift support, 225 * otherwise returns false. 
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size in bytes; @buf must be at least @size bytes long
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}
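/*
 * Illustrative note (not part of the driver): reading one dword at VRAM
 * offset @pos through the MM window boils down to
 *
 *	uint32_t val;
 *
 *	amdgpu_device_mm_access(adev, pos, &val, sizeof(val), false);
 *
 * The helper programs MM_INDEX with the low 31 address bits (plus bit 31
 * set) and MM_INDEX_HI with the upper address bits, and only rewrites
 * MM_INDEX_HI when an access crosses a 2 GB boundary. Both @pos and @size
 * must be dword aligned, as the BUG_ON() above enforces.
 */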
/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size in bytes; @buf must be at least @size bytes long
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes that have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure the HDP write cache flush happens without any
			 * reordering after the system memory contents are sent over
			 * the PCIe bus.
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure the HDP read cache is invalidated before issuing
			 * a read to the PCIe device.
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size in bytes; @buf must be at least @size bytes long
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}
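/*
 * Illustrative sketch only, never called by the driver: how a caller could
 * read a dword back from VRAM with amdgpu_device_vram_access(). The offset
 * used here is hypothetical; real callers pass an address they own.
 */
static void __maybe_unused amdgpu_device_vram_access_example(struct amdgpu_device *adev)
{
	uint32_t val = 0;

	/* Read 4 bytes at VRAM offset 0x1000: the helper goes through the
	 * CPU-visible aperture where possible and falls back to the
	 * MM_INDEX/MM_DATA window for whatever remains.
	 */
	amdgpu_device_vram_access(adev, 0x1000, &val, sizeof(val), false);
	dev_info(adev->dev, "VRAM dword at 0x1000: 0x%08x\n", val);
}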
/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with byte offset helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with byte offset helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
480 */ 481 void amdgpu_device_wreg(struct amdgpu_device *adev, 482 uint32_t reg, uint32_t v, 483 uint32_t acc_flags) 484 { 485 if (amdgpu_device_skip_hw_access(adev)) 486 return; 487 488 if ((reg * 4) < adev->rmmio_size) { 489 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 490 amdgpu_sriov_runtime(adev) && 491 down_read_trylock(&adev->reset_domain->sem)) { 492 amdgpu_kiq_wreg(adev, reg, v); 493 up_read(&adev->reset_domain->sem); 494 } else { 495 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 496 } 497 } else { 498 adev->pcie_wreg(adev, reg * 4, v); 499 } 500 501 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 502 } 503 504 /** 505 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 506 * 507 * @adev: amdgpu_device pointer 508 * @reg: mmio/rlc register 509 * @v: value to write 510 * 511 * this function is invoked only for the debugfs register access 512 */ 513 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 514 uint32_t reg, uint32_t v, 515 uint32_t xcc_id) 516 { 517 if (amdgpu_device_skip_hw_access(adev)) 518 return; 519 520 if (amdgpu_sriov_fullaccess(adev) && 521 adev->gfx.rlc.funcs && 522 adev->gfx.rlc.funcs->is_rlcg_access_range) { 523 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 524 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 525 } else if ((reg * 4) >= adev->rmmio_size) { 526 adev->pcie_wreg(adev, reg * 4, v); 527 } else { 528 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 529 } 530 } 531 532 /** 533 * amdgpu_device_indirect_rreg - read an indirect register 534 * 535 * @adev: amdgpu_device pointer 536 * @reg_addr: indirect register address to read from 537 * 538 * Returns the value of indirect register @reg_addr 539 */ 540 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 541 u32 reg_addr) 542 { 543 unsigned long flags, pcie_index, pcie_data; 544 void __iomem *pcie_index_offset; 545 void __iomem *pcie_data_offset; 546 u32 r; 547 548 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 549 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 550 551 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 552 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 553 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 554 555 writel(reg_addr, pcie_index_offset); 556 readl(pcie_index_offset); 557 r = readl(pcie_data_offset); 558 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 559 560 return r; 561 } 562 563 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 564 u64 reg_addr) 565 { 566 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 567 u32 r; 568 void __iomem *pcie_index_offset; 569 void __iomem *pcie_index_hi_offset; 570 void __iomem *pcie_data_offset; 571 572 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 573 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 574 if (adev->nbio.funcs->get_pcie_index_hi_offset) 575 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 576 else 577 pcie_index_hi = 0; 578 579 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 580 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 581 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 582 if (pcie_index_hi != 0) 583 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 584 pcie_index_hi * 4; 585 586 writel(reg_addr, pcie_index_offset); 587 readl(pcie_index_offset); 588 if (pcie_index_hi != 0) { 589 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 590 readl(pcie_index_hi_offset); 591 
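		/* the readl() above flushes the posted write of the high index
		 * bits, so the data port read below uses the full 64-bit address
		 */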
} 592 r = readl(pcie_data_offset); 593 594 /* clear the high bits */ 595 if (pcie_index_hi != 0) { 596 writel(0, pcie_index_hi_offset); 597 readl(pcie_index_hi_offset); 598 } 599 600 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 601 602 return r; 603 } 604 605 /** 606 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 607 * 608 * @adev: amdgpu_device pointer 609 * @reg_addr: indirect register address to read from 610 * 611 * Returns the value of indirect register @reg_addr 612 */ 613 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 614 u32 reg_addr) 615 { 616 unsigned long flags, pcie_index, pcie_data; 617 void __iomem *pcie_index_offset; 618 void __iomem *pcie_data_offset; 619 u64 r; 620 621 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 622 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 623 624 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 625 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 626 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 627 628 /* read low 32 bits */ 629 writel(reg_addr, pcie_index_offset); 630 readl(pcie_index_offset); 631 r = readl(pcie_data_offset); 632 /* read high 32 bits */ 633 writel(reg_addr + 4, pcie_index_offset); 634 readl(pcie_index_offset); 635 r |= ((u64)readl(pcie_data_offset) << 32); 636 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 637 638 return r; 639 } 640 641 /** 642 * amdgpu_device_indirect_wreg - write an indirect register address 643 * 644 * @adev: amdgpu_device pointer 645 * @reg_addr: indirect register offset 646 * @reg_data: indirect register data 647 * 648 */ 649 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 650 u32 reg_addr, u32 reg_data) 651 { 652 unsigned long flags, pcie_index, pcie_data; 653 void __iomem *pcie_index_offset; 654 void __iomem *pcie_data_offset; 655 656 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 657 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 658 659 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 660 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 661 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 662 663 writel(reg_addr, pcie_index_offset); 664 readl(pcie_index_offset); 665 writel(reg_data, pcie_data_offset); 666 readl(pcie_data_offset); 667 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 668 } 669 670 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 671 u64 reg_addr, u32 reg_data) 672 { 673 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 674 void __iomem *pcie_index_offset; 675 void __iomem *pcie_index_hi_offset; 676 void __iomem *pcie_data_offset; 677 678 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 679 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 680 if (adev->nbio.funcs->get_pcie_index_hi_offset) 681 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 682 else 683 pcie_index_hi = 0; 684 685 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 686 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 687 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 688 if (pcie_index_hi != 0) 689 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 690 pcie_index_hi * 4; 691 692 writel(reg_addr, pcie_index_offset); 693 readl(pcie_index_offset); 694 if (pcie_index_hi != 0) { 695 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 696 readl(pcie_index_hi_offset); 697 } 698 writel(reg_data, pcie_data_offset); 699 readl(pcie_data_offset); 700 701 /* clear 
the high bits */ 702 if (pcie_index_hi != 0) { 703 writel(0, pcie_index_hi_offset); 704 readl(pcie_index_hi_offset); 705 } 706 707 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 708 } 709 710 /** 711 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 712 * 713 * @adev: amdgpu_device pointer 714 * @reg_addr: indirect register offset 715 * @reg_data: indirect register data 716 * 717 */ 718 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 719 u32 reg_addr, u64 reg_data) 720 { 721 unsigned long flags, pcie_index, pcie_data; 722 void __iomem *pcie_index_offset; 723 void __iomem *pcie_data_offset; 724 725 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 726 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 727 728 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 729 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 730 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 731 732 /* write low 32 bits */ 733 writel(reg_addr, pcie_index_offset); 734 readl(pcie_index_offset); 735 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 736 readl(pcie_data_offset); 737 /* write high 32 bits */ 738 writel(reg_addr + 4, pcie_index_offset); 739 readl(pcie_index_offset); 740 writel((u32)(reg_data >> 32), pcie_data_offset); 741 readl(pcie_data_offset); 742 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 743 } 744 745 /** 746 * amdgpu_device_get_rev_id - query device rev_id 747 * 748 * @adev: amdgpu_device pointer 749 * 750 * Return device rev_id 751 */ 752 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 753 { 754 return adev->nbio.funcs->get_rev_id(adev); 755 } 756 757 /** 758 * amdgpu_invalid_rreg - dummy reg read function 759 * 760 * @adev: amdgpu_device pointer 761 * @reg: offset of register 762 * 763 * Dummy register read function. Used for register blocks 764 * that certain asics don't have (all asics). 765 * Returns the value in the register. 766 */ 767 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 768 { 769 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 770 BUG(); 771 return 0; 772 } 773 774 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) 775 { 776 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 777 BUG(); 778 return 0; 779 } 780 781 /** 782 * amdgpu_invalid_wreg - dummy reg write function 783 * 784 * @adev: amdgpu_device pointer 785 * @reg: offset of register 786 * @v: value to write to the register 787 * 788 * Dummy register read function. Used for register blocks 789 * that certain asics don't have (all asics). 790 */ 791 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 792 { 793 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 794 reg, v); 795 BUG(); 796 } 797 798 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) 799 { 800 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n", 801 reg, v); 802 BUG(); 803 } 804 805 /** 806 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 807 * 808 * @adev: amdgpu_device pointer 809 * @reg: offset of register 810 * 811 * Dummy register read function. Used for register blocks 812 * that certain asics don't have (all asics). 813 * Returns the value in the register. 
814 */ 815 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 816 { 817 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 818 BUG(); 819 return 0; 820 } 821 822 /** 823 * amdgpu_invalid_wreg64 - dummy reg write function 824 * 825 * @adev: amdgpu_device pointer 826 * @reg: offset of register 827 * @v: value to write to the register 828 * 829 * Dummy register read function. Used for register blocks 830 * that certain asics don't have (all asics). 831 */ 832 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 833 { 834 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 835 reg, v); 836 BUG(); 837 } 838 839 /** 840 * amdgpu_block_invalid_rreg - dummy reg read function 841 * 842 * @adev: amdgpu_device pointer 843 * @block: offset of instance 844 * @reg: offset of register 845 * 846 * Dummy register read function. Used for register blocks 847 * that certain asics don't have (all asics). 848 * Returns the value in the register. 849 */ 850 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 851 uint32_t block, uint32_t reg) 852 { 853 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 854 reg, block); 855 BUG(); 856 return 0; 857 } 858 859 /** 860 * amdgpu_block_invalid_wreg - dummy reg write function 861 * 862 * @adev: amdgpu_device pointer 863 * @block: offset of instance 864 * @reg: offset of register 865 * @v: value to write to the register 866 * 867 * Dummy register read function. Used for register blocks 868 * that certain asics don't have (all asics). 869 */ 870 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 871 uint32_t block, 872 uint32_t reg, uint32_t v) 873 { 874 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 875 reg, block, v); 876 BUG(); 877 } 878 879 /** 880 * amdgpu_device_asic_init - Wrapper for atom asic_init 881 * 882 * @adev: amdgpu_device pointer 883 * 884 * Does any asic specific work and then calls atom asic init. 885 */ 886 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 887 { 888 amdgpu_asic_pre_asic_init(adev); 889 890 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) || 891 adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) 892 return amdgpu_atomfirmware_asic_init(adev, true); 893 else 894 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 895 } 896 897 /** 898 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page 899 * 900 * @adev: amdgpu_device pointer 901 * 902 * Allocates a scratch page of VRAM for use by various things in the 903 * driver. 904 */ 905 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev) 906 { 907 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE, 908 AMDGPU_GEM_DOMAIN_VRAM | 909 AMDGPU_GEM_DOMAIN_GTT, 910 &adev->mem_scratch.robj, 911 &adev->mem_scratch.gpu_addr, 912 (void **)&adev->mem_scratch.ptr); 913 } 914 915 /** 916 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page 917 * 918 * @adev: amdgpu_device pointer 919 * 920 * Frees the VRAM scratch page. 921 */ 922 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev) 923 { 924 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL); 925 } 926 927 /** 928 * amdgpu_device_program_register_sequence - program an array of registers. 
929 * 930 * @adev: amdgpu_device pointer 931 * @registers: pointer to the register array 932 * @array_size: size of the register array 933 * 934 * Programs an array or registers with and or masks. 935 * This is a helper for setting golden registers. 936 */ 937 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 938 const u32 *registers, 939 const u32 array_size) 940 { 941 u32 tmp, reg, and_mask, or_mask; 942 int i; 943 944 if (array_size % 3) 945 return; 946 947 for (i = 0; i < array_size; i += 3) { 948 reg = registers[i + 0]; 949 and_mask = registers[i + 1]; 950 or_mask = registers[i + 2]; 951 952 if (and_mask == 0xffffffff) { 953 tmp = or_mask; 954 } else { 955 tmp = RREG32(reg); 956 tmp &= ~and_mask; 957 if (adev->family >= AMDGPU_FAMILY_AI) 958 tmp |= (or_mask & and_mask); 959 else 960 tmp |= or_mask; 961 } 962 WREG32(reg, tmp); 963 } 964 } 965 966 /** 967 * amdgpu_device_pci_config_reset - reset the GPU 968 * 969 * @adev: amdgpu_device pointer 970 * 971 * Resets the GPU using the pci config reset sequence. 972 * Only applicable to asics prior to vega10. 973 */ 974 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 975 { 976 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 977 } 978 979 /** 980 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 981 * 982 * @adev: amdgpu_device pointer 983 * 984 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 985 */ 986 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 987 { 988 return pci_reset_function(adev->pdev); 989 } 990 991 /* 992 * amdgpu_device_wb_*() 993 * Writeback is the method by which the GPU updates special pages in memory 994 * with the status of certain GPU events (fences, ring pointers,etc.). 995 */ 996 997 /** 998 * amdgpu_device_wb_fini - Disable Writeback and free memory 999 * 1000 * @adev: amdgpu_device pointer 1001 * 1002 * Disables Writeback and frees the Writeback memory (all asics). 1003 * Used at driver shutdown. 1004 */ 1005 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1006 { 1007 if (adev->wb.wb_obj) { 1008 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1009 &adev->wb.gpu_addr, 1010 (void **)&adev->wb.wb); 1011 adev->wb.wb_obj = NULL; 1012 } 1013 } 1014 1015 /** 1016 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1017 * 1018 * @adev: amdgpu_device pointer 1019 * 1020 * Initializes writeback and allocates writeback memory (all asics). 1021 * Used at driver startup. 1022 * Returns 0 on success or an -error on failure. 1023 */ 1024 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1025 { 1026 int r; 1027 1028 if (adev->wb.wb_obj == NULL) { 1029 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1030 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1031 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1032 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1033 (void **)&adev->wb.wb); 1034 if (r) { 1035 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1036 return r; 1037 } 1038 1039 adev->wb.num_wb = AMDGPU_MAX_WB; 1040 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1041 1042 /* clear wb memory */ 1043 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1044 } 1045 1046 return 0; 1047 } 1048 1049 /** 1050 * amdgpu_device_wb_get - Allocate a wb entry 1051 * 1052 * @adev: amdgpu_device pointer 1053 * @wb: wb index 1054 * 1055 * Allocate a wb slot for use by the driver (all asics). 1056 * Returns 0 on success or -EINVAL on failure. 
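 *
 * Illustrative usage (a sketch, not a required pattern): the returned index
 * is a dword offset, so the GPU and CPU views of the slot are
 *
 *	u32 wb;
 *
 *	if (!amdgpu_device_wb_get(adev, &wb)) {
 *		u64 wb_gpu_addr = adev->wb.gpu_addr + (wb * 4);
 *		u32 cpu_val = adev->wb.wb[wb];
 *		...
 *		amdgpu_device_wb_free(adev, wb);
 *	}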
1057 */ 1058 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1059 { 1060 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1061 1062 if (offset < adev->wb.num_wb) { 1063 __set_bit(offset, adev->wb.used); 1064 *wb = offset << 3; /* convert to dw offset */ 1065 return 0; 1066 } else { 1067 return -EINVAL; 1068 } 1069 } 1070 1071 /** 1072 * amdgpu_device_wb_free - Free a wb entry 1073 * 1074 * @adev: amdgpu_device pointer 1075 * @wb: wb index 1076 * 1077 * Free a wb slot allocated for use by the driver (all asics) 1078 */ 1079 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1080 { 1081 wb >>= 3; 1082 if (wb < adev->wb.num_wb) 1083 __clear_bit(wb, adev->wb.used); 1084 } 1085 1086 /** 1087 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1088 * 1089 * @adev: amdgpu_device pointer 1090 * 1091 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1092 * to fail, but if any of the BARs is not accessible after the size we abort 1093 * driver loading by returning -ENODEV. 1094 */ 1095 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1096 { 1097 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1098 struct pci_bus *root; 1099 struct resource *res; 1100 unsigned int i; 1101 u16 cmd; 1102 int r; 1103 1104 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1105 return 0; 1106 1107 /* Bypass for VF */ 1108 if (amdgpu_sriov_vf(adev)) 1109 return 0; 1110 1111 /* skip if the bios has already enabled large BAR */ 1112 if (adev->gmc.real_vram_size && 1113 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1114 return 0; 1115 1116 /* Check if the root BUS has 64bit memory resources */ 1117 root = adev->pdev->bus; 1118 while (root->parent) 1119 root = root->parent; 1120 1121 pci_bus_for_each_resource(root, res, i) { 1122 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1123 res->start > 0x100000000ull) 1124 break; 1125 } 1126 1127 /* Trying to resize is pointless without a root hub window above 4GB */ 1128 if (!res) 1129 return 0; 1130 1131 /* Limit the BAR size to what is available */ 1132 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1133 rbar_size); 1134 1135 /* Disable memory decoding while we change the BAR addresses and size */ 1136 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1137 pci_write_config_word(adev->pdev, PCI_COMMAND, 1138 cmd & ~PCI_COMMAND_MEMORY); 1139 1140 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1141 amdgpu_doorbell_fini(adev); 1142 if (adev->asic_type >= CHIP_BONAIRE) 1143 pci_release_resource(adev->pdev, 2); 1144 1145 pci_release_resource(adev->pdev, 0); 1146 1147 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1148 if (r == -ENOSPC) 1149 DRM_INFO("Not enough PCI address space for a large BAR."); 1150 else if (r && r != -ENOTSUPP) 1151 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1152 1153 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1154 1155 /* When the doorbell or fb BAR isn't available we have no chance of 1156 * using the device. 
1157 */ 1158 r = amdgpu_doorbell_init(adev); 1159 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1160 return -ENODEV; 1161 1162 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1163 1164 return 0; 1165 } 1166 1167 static bool amdgpu_device_read_bios(struct amdgpu_device *adev) 1168 { 1169 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) 1170 return false; 1171 1172 return true; 1173 } 1174 1175 /* 1176 * GPU helpers function. 1177 */ 1178 /** 1179 * amdgpu_device_need_post - check if the hw need post or not 1180 * 1181 * @adev: amdgpu_device pointer 1182 * 1183 * Check if the asic has been initialized (all asics) at driver startup 1184 * or post is needed if hw reset is performed. 1185 * Returns true if need or false if not. 1186 */ 1187 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1188 { 1189 uint32_t reg; 1190 1191 if (amdgpu_sriov_vf(adev)) 1192 return false; 1193 1194 if (!amdgpu_device_read_bios(adev)) 1195 return false; 1196 1197 if (amdgpu_passthrough(adev)) { 1198 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1199 * some old smc fw still need driver do vPost otherwise gpu hang, while 1200 * those smc fw version above 22.15 doesn't have this flaw, so we force 1201 * vpost executed for smc version below 22.15 1202 */ 1203 if (adev->asic_type == CHIP_FIJI) { 1204 int err; 1205 uint32_t fw_ver; 1206 1207 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1208 /* force vPost if error occured */ 1209 if (err) 1210 return true; 1211 1212 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1213 if (fw_ver < 0x00160e00) 1214 return true; 1215 } 1216 } 1217 1218 /* Don't post if we need to reset whole hive on init */ 1219 if (adev->gmc.xgmi.pending_reset) 1220 return false; 1221 1222 if (adev->has_hw_reset) { 1223 adev->has_hw_reset = false; 1224 return true; 1225 } 1226 1227 /* bios scratch used on CIK+ */ 1228 if (adev->asic_type >= CHIP_BONAIRE) 1229 return amdgpu_atombios_scratch_need_asic_init(adev); 1230 1231 /* check MEM_SIZE for older asics */ 1232 reg = amdgpu_asic_get_config_memsize(adev); 1233 1234 if ((reg != 0) && (reg != 0xffffffff)) 1235 return false; 1236 1237 return true; 1238 } 1239 1240 /* 1241 * On APUs with >= 64GB white flickering has been observed w/ SG enabled. 1242 * Disable S/G on such systems until we have a proper fix. 1243 * https://gitlab.freedesktop.org/drm/amd/-/issues/2354 1244 * https://gitlab.freedesktop.org/drm/amd/-/issues/2735 1245 */ 1246 bool amdgpu_sg_display_supported(struct amdgpu_device *adev) 1247 { 1248 switch (amdgpu_sg_display) { 1249 case -1: 1250 break; 1251 case 0: 1252 return false; 1253 case 1: 1254 return true; 1255 default: 1256 return false; 1257 } 1258 if ((totalram_pages() << (PAGE_SHIFT - 10)) + 1259 (adev->gmc.real_vram_size / 1024) >= 64000000) { 1260 DRM_WARN("Disabling S/G due to >=64GB RAM\n"); 1261 return false; 1262 } 1263 return true; 1264 } 1265 1266 /* 1267 * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic 1268 * speed switching. Until we have confirmation from Intel that a specific host 1269 * supports it, it's safer that we keep it disabled for all. 
1270 * 1271 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 1272 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 1273 */ 1274 bool amdgpu_device_pcie_dynamic_switching_supported(void) 1275 { 1276 #if IS_ENABLED(CONFIG_X86) 1277 struct cpuinfo_x86 *c = &cpu_data(0); 1278 1279 if (c->x86_vendor == X86_VENDOR_INTEL) 1280 return false; 1281 #endif 1282 return true; 1283 } 1284 1285 /** 1286 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1287 * 1288 * @adev: amdgpu_device pointer 1289 * 1290 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1291 * be set for this device. 1292 * 1293 * Returns true if it should be used or false if not. 1294 */ 1295 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1296 { 1297 switch (amdgpu_aspm) { 1298 case -1: 1299 break; 1300 case 0: 1301 return false; 1302 case 1: 1303 return true; 1304 default: 1305 return false; 1306 } 1307 return pcie_aspm_enabled(adev->pdev); 1308 } 1309 1310 bool amdgpu_device_aspm_support_quirk(void) 1311 { 1312 #if IS_ENABLED(CONFIG_X86) 1313 struct cpuinfo_x86 *c = &cpu_data(0); 1314 1315 return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE); 1316 #else 1317 return true; 1318 #endif 1319 } 1320 1321 /* if we get transitioned to only one device, take VGA back */ 1322 /** 1323 * amdgpu_device_vga_set_decode - enable/disable vga decode 1324 * 1325 * @pdev: PCI device pointer 1326 * @state: enable/disable vga decode 1327 * 1328 * Enable/disable vga decode (all asics). 1329 * Returns VGA resource flags. 1330 */ 1331 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1332 bool state) 1333 { 1334 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1335 1336 amdgpu_asic_set_vga_state(adev, state); 1337 if (state) 1338 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1339 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1340 else 1341 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1342 } 1343 1344 /** 1345 * amdgpu_device_check_block_size - validate the vm block size 1346 * 1347 * @adev: amdgpu_device pointer 1348 * 1349 * Validates the vm block size specified via module parameter. 1350 * The vm block size defines number of bits in page table versus page directory, 1351 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1352 * page table and the remaining bits are in the page directory. 1353 */ 1354 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1355 { 1356 /* defines number of bits in page table versus page directory, 1357 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1358 * page table and the remaining bits are in the page directory 1359 */ 1360 if (amdgpu_vm_block_size == -1) 1361 return; 1362 1363 if (amdgpu_vm_block_size < 9) { 1364 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1365 amdgpu_vm_block_size); 1366 amdgpu_vm_block_size = -1; 1367 } 1368 } 1369 1370 /** 1371 * amdgpu_device_check_vm_size - validate the vm size 1372 * 1373 * @adev: amdgpu_device pointer 1374 * 1375 * Validates the vm size in GB specified via module parameter. 1376 * The VM size is the size of the GPU virtual memory space in GB. 
1377 */ 1378 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1379 { 1380 /* no need to check the default value */ 1381 if (amdgpu_vm_size == -1) 1382 return; 1383 1384 if (amdgpu_vm_size < 1) { 1385 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1386 amdgpu_vm_size); 1387 amdgpu_vm_size = -1; 1388 } 1389 } 1390 1391 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1392 { 1393 struct sysinfo si; 1394 bool is_os_64 = (sizeof(void *) == 8); 1395 uint64_t total_memory; 1396 uint64_t dram_size_seven_GB = 0x1B8000000; 1397 uint64_t dram_size_three_GB = 0xB8000000; 1398 1399 if (amdgpu_smu_memory_pool_size == 0) 1400 return; 1401 1402 if (!is_os_64) { 1403 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1404 goto def_value; 1405 } 1406 si_meminfo(&si); 1407 total_memory = (uint64_t)si.totalram * si.mem_unit; 1408 1409 if ((amdgpu_smu_memory_pool_size == 1) || 1410 (amdgpu_smu_memory_pool_size == 2)) { 1411 if (total_memory < dram_size_three_GB) 1412 goto def_value1; 1413 } else if ((amdgpu_smu_memory_pool_size == 4) || 1414 (amdgpu_smu_memory_pool_size == 8)) { 1415 if (total_memory < dram_size_seven_GB) 1416 goto def_value1; 1417 } else { 1418 DRM_WARN("Smu memory pool size not supported\n"); 1419 goto def_value; 1420 } 1421 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1422 1423 return; 1424 1425 def_value1: 1426 DRM_WARN("No enough system memory\n"); 1427 def_value: 1428 adev->pm.smu_prv_buffer_size = 0; 1429 } 1430 1431 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1432 { 1433 if (!(adev->flags & AMD_IS_APU) || 1434 adev->asic_type < CHIP_RAVEN) 1435 return 0; 1436 1437 switch (adev->asic_type) { 1438 case CHIP_RAVEN: 1439 if (adev->pdev->device == 0x15dd) 1440 adev->apu_flags |= AMD_APU_IS_RAVEN; 1441 if (adev->pdev->device == 0x15d8) 1442 adev->apu_flags |= AMD_APU_IS_PICASSO; 1443 break; 1444 case CHIP_RENOIR: 1445 if ((adev->pdev->device == 0x1636) || 1446 (adev->pdev->device == 0x164c)) 1447 adev->apu_flags |= AMD_APU_IS_RENOIR; 1448 else 1449 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1450 break; 1451 case CHIP_VANGOGH: 1452 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1453 break; 1454 case CHIP_YELLOW_CARP: 1455 break; 1456 case CHIP_CYAN_SKILLFISH: 1457 if ((adev->pdev->device == 0x13FE) || 1458 (adev->pdev->device == 0x143F)) 1459 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1460 break; 1461 default: 1462 break; 1463 } 1464 1465 return 0; 1466 } 1467 1468 /** 1469 * amdgpu_device_check_arguments - validate module params 1470 * 1471 * @adev: amdgpu_device pointer 1472 * 1473 * Validates certain module parameters and updates 1474 * the associated values used by the driver (all asics). 
1475 */ 1476 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1477 { 1478 if (amdgpu_sched_jobs < 4) { 1479 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1480 amdgpu_sched_jobs); 1481 amdgpu_sched_jobs = 4; 1482 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 1483 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1484 amdgpu_sched_jobs); 1485 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1486 } 1487 1488 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1489 /* gart size must be greater or equal to 32M */ 1490 dev_warn(adev->dev, "gart size (%d) too small\n", 1491 amdgpu_gart_size); 1492 amdgpu_gart_size = -1; 1493 } 1494 1495 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1496 /* gtt size must be greater or equal to 32M */ 1497 dev_warn(adev->dev, "gtt size (%d) too small\n", 1498 amdgpu_gtt_size); 1499 amdgpu_gtt_size = -1; 1500 } 1501 1502 /* valid range is between 4 and 9 inclusive */ 1503 if (amdgpu_vm_fragment_size != -1 && 1504 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1505 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1506 amdgpu_vm_fragment_size = -1; 1507 } 1508 1509 if (amdgpu_sched_hw_submission < 2) { 1510 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1511 amdgpu_sched_hw_submission); 1512 amdgpu_sched_hw_submission = 2; 1513 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1514 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1515 amdgpu_sched_hw_submission); 1516 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1517 } 1518 1519 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 1520 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 1521 amdgpu_reset_method = -1; 1522 } 1523 1524 amdgpu_device_check_smu_prv_buffer_size(adev); 1525 1526 amdgpu_device_check_vm_size(adev); 1527 1528 amdgpu_device_check_block_size(adev); 1529 1530 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1531 1532 return 0; 1533 } 1534 1535 /** 1536 * amdgpu_switcheroo_set_state - set switcheroo state 1537 * 1538 * @pdev: pci dev pointer 1539 * @state: vga_switcheroo state 1540 * 1541 * Callback for the switcheroo driver. Suspends or resumes 1542 * the asics before or after it is powered up using ACPI methods. 
1543 */ 1544 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1545 enum vga_switcheroo_state state) 1546 { 1547 struct drm_device *dev = pci_get_drvdata(pdev); 1548 int r; 1549 1550 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1551 return; 1552 1553 if (state == VGA_SWITCHEROO_ON) { 1554 pr_info("switched on\n"); 1555 /* don't suspend or resume card normally */ 1556 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1557 1558 pci_set_power_state(pdev, PCI_D0); 1559 amdgpu_device_load_pci_state(pdev); 1560 r = pci_enable_device(pdev); 1561 if (r) 1562 DRM_WARN("pci_enable_device failed (%d)\n", r); 1563 amdgpu_device_resume(dev, true); 1564 1565 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1566 } else { 1567 pr_info("switched off\n"); 1568 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1569 amdgpu_device_suspend(dev, true); 1570 amdgpu_device_cache_pci_state(pdev); 1571 /* Shut down the device */ 1572 pci_disable_device(pdev); 1573 pci_set_power_state(pdev, PCI_D3cold); 1574 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1575 } 1576 } 1577 1578 /** 1579 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1580 * 1581 * @pdev: pci dev pointer 1582 * 1583 * Callback for the switcheroo driver. Check of the switcheroo 1584 * state can be changed. 1585 * Returns true if the state can be changed, false if not. 1586 */ 1587 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1588 { 1589 struct drm_device *dev = pci_get_drvdata(pdev); 1590 1591 /* 1592 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1593 * locking inversion with the driver load path. And the access here is 1594 * completely racy anyway. So don't bother with locking for now. 1595 */ 1596 return atomic_read(&dev->open_count) == 0; 1597 } 1598 1599 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1600 .set_gpu_state = amdgpu_switcheroo_set_state, 1601 .reprobe = NULL, 1602 .can_switch = amdgpu_switcheroo_can_switch, 1603 }; 1604 1605 /** 1606 * amdgpu_device_ip_set_clockgating_state - set the CG state 1607 * 1608 * @dev: amdgpu_device pointer 1609 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1610 * @state: clockgating state (gate or ungate) 1611 * 1612 * Sets the requested clockgating state for all instances of 1613 * the hardware IP specified. 1614 * Returns the error code from the last instance. 1615 */ 1616 int amdgpu_device_ip_set_clockgating_state(void *dev, 1617 enum amd_ip_block_type block_type, 1618 enum amd_clockgating_state state) 1619 { 1620 struct amdgpu_device *adev = dev; 1621 int i, r = 0; 1622 1623 for (i = 0; i < adev->num_ip_blocks; i++) { 1624 if (!adev->ip_blocks[i].status.valid) 1625 continue; 1626 if (adev->ip_blocks[i].version->type != block_type) 1627 continue; 1628 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1629 continue; 1630 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1631 (void *)adev, state); 1632 if (r) 1633 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1634 adev->ip_blocks[i].version->funcs->name, r); 1635 } 1636 return r; 1637 } 1638 1639 /** 1640 * amdgpu_device_ip_set_powergating_state - set the PG state 1641 * 1642 * @dev: amdgpu_device pointer 1643 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1644 * @state: powergating state (gate or ungate) 1645 * 1646 * Sets the requested powergating state for all instances of 1647 * the hardware IP specified. 1648 * Returns the error code from the last instance. 
1649 */ 1650 int amdgpu_device_ip_set_powergating_state(void *dev, 1651 enum amd_ip_block_type block_type, 1652 enum amd_powergating_state state) 1653 { 1654 struct amdgpu_device *adev = dev; 1655 int i, r = 0; 1656 1657 for (i = 0; i < adev->num_ip_blocks; i++) { 1658 if (!adev->ip_blocks[i].status.valid) 1659 continue; 1660 if (adev->ip_blocks[i].version->type != block_type) 1661 continue; 1662 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 1663 continue; 1664 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 1665 (void *)adev, state); 1666 if (r) 1667 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 1668 adev->ip_blocks[i].version->funcs->name, r); 1669 } 1670 return r; 1671 } 1672 1673 /** 1674 * amdgpu_device_ip_get_clockgating_state - get the CG state 1675 * 1676 * @adev: amdgpu_device pointer 1677 * @flags: clockgating feature flags 1678 * 1679 * Walks the list of IPs on the device and updates the clockgating 1680 * flags for each IP. 1681 * Updates @flags with the feature flags for each hardware IP where 1682 * clockgating is enabled. 1683 */ 1684 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 1685 u64 *flags) 1686 { 1687 int i; 1688 1689 for (i = 0; i < adev->num_ip_blocks; i++) { 1690 if (!adev->ip_blocks[i].status.valid) 1691 continue; 1692 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 1693 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 1694 } 1695 } 1696 1697 /** 1698 * amdgpu_device_ip_wait_for_idle - wait for idle 1699 * 1700 * @adev: amdgpu_device pointer 1701 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1702 * 1703 * Waits for the request hardware IP to be idle. 1704 * Returns 0 for success or a negative error code on failure. 1705 */ 1706 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 1707 enum amd_ip_block_type block_type) 1708 { 1709 int i, r; 1710 1711 for (i = 0; i < adev->num_ip_blocks; i++) { 1712 if (!adev->ip_blocks[i].status.valid) 1713 continue; 1714 if (adev->ip_blocks[i].version->type == block_type) { 1715 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 1716 if (r) 1717 return r; 1718 break; 1719 } 1720 } 1721 return 0; 1722 1723 } 1724 1725 /** 1726 * amdgpu_device_ip_is_idle - is the hardware IP idle 1727 * 1728 * @adev: amdgpu_device pointer 1729 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1730 * 1731 * Check if the hardware IP is idle or not. 1732 * Returns true if it the IP is idle, false if not. 1733 */ 1734 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 1735 enum amd_ip_block_type block_type) 1736 { 1737 int i; 1738 1739 for (i = 0; i < adev->num_ip_blocks; i++) { 1740 if (!adev->ip_blocks[i].status.valid) 1741 continue; 1742 if (adev->ip_blocks[i].version->type == block_type) 1743 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 1744 } 1745 return true; 1746 1747 } 1748 1749 /** 1750 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 1751 * 1752 * @adev: amdgpu_device pointer 1753 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 1754 * 1755 * Returns a pointer to the hardware IP block structure 1756 * if it exists for the asic, otherwise NULL. 
1757 */ 1758 struct amdgpu_ip_block * 1759 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 1760 enum amd_ip_block_type type) 1761 { 1762 int i; 1763 1764 for (i = 0; i < adev->num_ip_blocks; i++) 1765 if (adev->ip_blocks[i].version->type == type) 1766 return &adev->ip_blocks[i]; 1767 1768 return NULL; 1769 } 1770 1771 /** 1772 * amdgpu_device_ip_block_version_cmp 1773 * 1774 * @adev: amdgpu_device pointer 1775 * @type: enum amd_ip_block_type 1776 * @major: major version 1777 * @minor: minor version 1778 * 1779 * return 0 if equal or greater 1780 * return 1 if smaller or the ip_block doesn't exist 1781 */ 1782 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 1783 enum amd_ip_block_type type, 1784 u32 major, u32 minor) 1785 { 1786 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 1787 1788 if (ip_block && ((ip_block->version->major > major) || 1789 ((ip_block->version->major == major) && 1790 (ip_block->version->minor >= minor)))) 1791 return 0; 1792 1793 return 1; 1794 } 1795 1796 /** 1797 * amdgpu_device_ip_block_add 1798 * 1799 * @adev: amdgpu_device pointer 1800 * @ip_block_version: pointer to the IP to add 1801 * 1802 * Adds the IP block driver information to the collection of IPs 1803 * on the asic. 1804 */ 1805 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 1806 const struct amdgpu_ip_block_version *ip_block_version) 1807 { 1808 if (!ip_block_version) 1809 return -EINVAL; 1810 1811 switch (ip_block_version->type) { 1812 case AMD_IP_BLOCK_TYPE_VCN: 1813 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 1814 return 0; 1815 break; 1816 case AMD_IP_BLOCK_TYPE_JPEG: 1817 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 1818 return 0; 1819 break; 1820 default: 1821 break; 1822 } 1823 1824 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 1825 ip_block_version->funcs->name); 1826 1827 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 1828 1829 return 0; 1830 } 1831 1832 /** 1833 * amdgpu_device_enable_virtual_display - enable virtual display feature 1834 * 1835 * @adev: amdgpu_device pointer 1836 * 1837 * Enabled the virtual display feature if the user has enabled it via 1838 * the module parameter virtual_display. This feature provides a virtual 1839 * display hardware on headless boards or in virtualized environments. 1840 * This function parses and validates the configuration string specified by 1841 * the user and configues the virtual display configuration (number of 1842 * virtual connectors, crtcs, etc.) specified. 
1843 */ 1844 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 1845 { 1846 adev->enable_virtual_display = false; 1847 1848 if (amdgpu_virtual_display) { 1849 const char *pci_address_name = pci_name(adev->pdev); 1850 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 1851 1852 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 1853 pciaddstr_tmp = pciaddstr; 1854 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 1855 pciaddname = strsep(&pciaddname_tmp, ","); 1856 if (!strcmp("all", pciaddname) 1857 || !strcmp(pci_address_name, pciaddname)) { 1858 long num_crtc; 1859 int res = -1; 1860 1861 adev->enable_virtual_display = true; 1862 1863 if (pciaddname_tmp) 1864 res = kstrtol(pciaddname_tmp, 10, 1865 &num_crtc); 1866 1867 if (!res) { 1868 if (num_crtc < 1) 1869 num_crtc = 1; 1870 if (num_crtc > 6) 1871 num_crtc = 6; 1872 adev->mode_info.num_crtc = num_crtc; 1873 } else { 1874 adev->mode_info.num_crtc = 1; 1875 } 1876 break; 1877 } 1878 } 1879 1880 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 1881 amdgpu_virtual_display, pci_address_name, 1882 adev->enable_virtual_display, adev->mode_info.num_crtc); 1883 1884 kfree(pciaddstr); 1885 } 1886 } 1887 1888 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 1889 { 1890 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 1891 adev->mode_info.num_crtc = 1; 1892 adev->enable_virtual_display = true; 1893 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 1894 adev->enable_virtual_display, adev->mode_info.num_crtc); 1895 } 1896 } 1897 1898 /** 1899 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 1900 * 1901 * @adev: amdgpu_device pointer 1902 * 1903 * Parses the asic configuration parameters specified in the gpu info 1904 * firmware and makes them availale to the driver for use in configuring 1905 * the asic. 1906 * Returns 0 on success, -EINVAL on failure. 1907 */ 1908 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 1909 { 1910 const char *chip_name; 1911 char fw_name[40]; 1912 int err; 1913 const struct gpu_info_firmware_header_v1_0 *hdr; 1914 1915 adev->firmware.gpu_info_fw = NULL; 1916 1917 if (adev->mman.discovery_bin) { 1918 /* 1919 * FIXME: The bounding box is still needed by Navi12, so 1920 * temporarily read it from gpu_info firmware. Should be dropped 1921 * when DAL no longer needs it. 
1922 */ 1923 if (adev->asic_type != CHIP_NAVI12) 1924 return 0; 1925 } 1926 1927 switch (adev->asic_type) { 1928 default: 1929 return 0; 1930 case CHIP_VEGA10: 1931 chip_name = "vega10"; 1932 break; 1933 case CHIP_VEGA12: 1934 chip_name = "vega12"; 1935 break; 1936 case CHIP_RAVEN: 1937 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1938 chip_name = "raven2"; 1939 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1940 chip_name = "picasso"; 1941 else 1942 chip_name = "raven"; 1943 break; 1944 case CHIP_ARCTURUS: 1945 chip_name = "arcturus"; 1946 break; 1947 case CHIP_NAVI12: 1948 chip_name = "navi12"; 1949 break; 1950 } 1951 1952 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 1953 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name); 1954 if (err) { 1955 dev_err(adev->dev, 1956 "Failed to get gpu_info firmware \"%s\"\n", 1957 fw_name); 1958 goto out; 1959 } 1960 1961 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 1962 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 1963 1964 switch (hdr->version_major) { 1965 case 1: 1966 { 1967 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 1968 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 1969 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1970 1971 /* 1972 * Should be dropped when DAL no longer needs it. 1973 */ 1974 if (adev->asic_type == CHIP_NAVI12) 1975 goto parse_soc_bounding_box; 1976 1977 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 1978 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 1979 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 1980 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 1981 adev->gfx.config.max_texture_channel_caches = 1982 le32_to_cpu(gpu_info_fw->gc_num_tccs); 1983 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 1984 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 1985 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 1986 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 1987 adev->gfx.config.double_offchip_lds_buf = 1988 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 1989 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 1990 adev->gfx.cu_info.max_waves_per_simd = 1991 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 1992 adev->gfx.cu_info.max_scratch_slots_per_cu = 1993 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 1994 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 1995 if (hdr->version_minor >= 1) { 1996 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 1997 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 1998 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1999 adev->gfx.config.num_sc_per_sh = 2000 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2001 adev->gfx.config.num_packer_per_sc = 2002 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2003 } 2004 2005 parse_soc_bounding_box: 2006 /* 2007 * soc bounding box info is not integrated in discovery table, 2008 * we always need to parse it from gpu info firmware if needed.
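 *
 * Layering assumed by the casts in this function (an inference from the
 * code here, not a statement about the firmware format): the payload at
 * ucode_array_offset_bytes is reinterpreted as the widest struct the header
 * allows, roughly:
 *
 *   version_minor == 0  ->  struct gpu_info_firmware_v1_0
 *   version_minor >= 1  ->  struct gpu_info_firmware_v1_1 (adds num_sc_per_sh,
 *                           num_packer_per_sc)
 *   version_minor == 2  ->  struct gpu_info_firmware_v1_2 (adds
 *                           soc_bounding_box)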
2009 */ 2010 if (hdr->version_minor == 2) { 2011 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2012 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2013 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2014 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2015 } 2016 break; 2017 } 2018 default: 2019 dev_err(adev->dev, 2020 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2021 err = -EINVAL; 2022 goto out; 2023 } 2024 out: 2025 return err; 2026 } 2027 2028 /** 2029 * amdgpu_device_ip_early_init - run early init for hardware IPs 2030 * 2031 * @adev: amdgpu_device pointer 2032 * 2033 * Early initialization pass for hardware IPs. The hardware IPs that make 2034 * up each asic are discovered each IP's early_init callback is run. This 2035 * is the first stage in initializing the asic. 2036 * Returns 0 on success, negative error code on failure. 2037 */ 2038 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2039 { 2040 struct drm_device *dev = adev_to_drm(adev); 2041 struct pci_dev *parent; 2042 int i, r; 2043 bool total; 2044 2045 amdgpu_device_enable_virtual_display(adev); 2046 2047 if (amdgpu_sriov_vf(adev)) { 2048 r = amdgpu_virt_request_full_gpu(adev, true); 2049 if (r) 2050 return r; 2051 } 2052 2053 switch (adev->asic_type) { 2054 #ifdef CONFIG_DRM_AMDGPU_SI 2055 case CHIP_VERDE: 2056 case CHIP_TAHITI: 2057 case CHIP_PITCAIRN: 2058 case CHIP_OLAND: 2059 case CHIP_HAINAN: 2060 adev->family = AMDGPU_FAMILY_SI; 2061 r = si_set_ip_blocks(adev); 2062 if (r) 2063 return r; 2064 break; 2065 #endif 2066 #ifdef CONFIG_DRM_AMDGPU_CIK 2067 case CHIP_BONAIRE: 2068 case CHIP_HAWAII: 2069 case CHIP_KAVERI: 2070 case CHIP_KABINI: 2071 case CHIP_MULLINS: 2072 if (adev->flags & AMD_IS_APU) 2073 adev->family = AMDGPU_FAMILY_KV; 2074 else 2075 adev->family = AMDGPU_FAMILY_CI; 2076 2077 r = cik_set_ip_blocks(adev); 2078 if (r) 2079 return r; 2080 break; 2081 #endif 2082 case CHIP_TOPAZ: 2083 case CHIP_TONGA: 2084 case CHIP_FIJI: 2085 case CHIP_POLARIS10: 2086 case CHIP_POLARIS11: 2087 case CHIP_POLARIS12: 2088 case CHIP_VEGAM: 2089 case CHIP_CARRIZO: 2090 case CHIP_STONEY: 2091 if (adev->flags & AMD_IS_APU) 2092 adev->family = AMDGPU_FAMILY_CZ; 2093 else 2094 adev->family = AMDGPU_FAMILY_VI; 2095 2096 r = vi_set_ip_blocks(adev); 2097 if (r) 2098 return r; 2099 break; 2100 default: 2101 r = amdgpu_discovery_set_ip_blocks(adev); 2102 if (r) 2103 return r; 2104 break; 2105 } 2106 2107 if (amdgpu_has_atpx() && 2108 (amdgpu_is_atpx_hybrid() || 2109 amdgpu_has_atpx_dgpu_power_cntl()) && 2110 ((adev->flags & AMD_IS_APU) == 0) && 2111 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev))) 2112 adev->flags |= AMD_IS_PX; 2113 2114 if (!(adev->flags & AMD_IS_APU)) { 2115 parent = pci_upstream_bridge(adev->pdev); 2116 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2117 } 2118 2119 2120 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2121 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2122 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2123 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2124 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2125 2126 total = true; 2127 for (i = 0; i < adev->num_ip_blocks; i++) { 2128 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2129 DRM_WARN("disabled ip block: %d <%s>\n", 2130 i, adev->ip_blocks[i].version->funcs->name); 2131 adev->ip_blocks[i].status.valid = false; 2132 } else { 2133 if (adev->ip_blocks[i].version->funcs->early_init) { 2134 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2135 if (r == -ENOENT) { 2136 adev->ip_blocks[i].status.valid = false; 2137 } else if (r) { 2138 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2139 adev->ip_blocks[i].version->funcs->name, r); 2140 total = false; 2141 } else { 2142 adev->ip_blocks[i].status.valid = true; 2143 } 2144 } else { 2145 adev->ip_blocks[i].status.valid = true; 2146 } 2147 } 2148 /* get the vbios after the asic_funcs are set up */ 2149 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2150 r = amdgpu_device_parse_gpu_info_fw(adev); 2151 if (r) 2152 return r; 2153 2154 /* Read BIOS */ 2155 if (amdgpu_device_read_bios(adev)) { 2156 if (!amdgpu_get_bios(adev)) 2157 return -EINVAL; 2158 2159 r = amdgpu_atombios_init(adev); 2160 if (r) { 2161 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2162 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2163 return r; 2164 } 2165 } 2166 2167 /*get pf2vf msg info at it's earliest time*/ 2168 if (amdgpu_sriov_vf(adev)) 2169 amdgpu_virt_init_data_exchange(adev); 2170 2171 } 2172 } 2173 if (!total) 2174 return -ENODEV; 2175 2176 amdgpu_amdkfd_device_probe(adev); 2177 adev->cg_flags &= amdgpu_cg_mask; 2178 adev->pg_flags &= amdgpu_pg_mask; 2179 2180 return 0; 2181 } 2182 2183 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2184 { 2185 int i, r; 2186 2187 for (i = 0; i < adev->num_ip_blocks; i++) { 2188 if (!adev->ip_blocks[i].status.sw) 2189 continue; 2190 if (adev->ip_blocks[i].status.hw) 2191 continue; 2192 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2193 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2194 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2195 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2196 if (r) { 2197 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2198 adev->ip_blocks[i].version->funcs->name, r); 2199 return r; 2200 } 2201 adev->ip_blocks[i].status.hw = true; 2202 } 2203 } 2204 2205 return 0; 2206 } 2207 2208 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2209 { 2210 int i, r; 2211 2212 for (i = 0; i < adev->num_ip_blocks; i++) { 2213 if (!adev->ip_blocks[i].status.sw) 2214 continue; 2215 if (adev->ip_blocks[i].status.hw) 2216 continue; 2217 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2218 if (r) { 2219 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2220 adev->ip_blocks[i].version->funcs->name, r); 2221 return r; 2222 } 2223 adev->ip_blocks[i].status.hw = true; 2224 } 2225 2226 return 0; 2227 } 2228 2229 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2230 { 2231 int r = 0; 2232 int i; 2233 uint32_t smu_version; 2234 2235 if (adev->asic_type >= CHIP_VEGA10) { 2236 for (i = 0; i < adev->num_ip_blocks; i++) { 2237 if 
(adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2238 continue; 2239 2240 if (!adev->ip_blocks[i].status.sw) 2241 continue; 2242 2243 /* no need to do the fw loading again if already done*/ 2244 if (adev->ip_blocks[i].status.hw == true) 2245 break; 2246 2247 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2248 r = adev->ip_blocks[i].version->funcs->resume(adev); 2249 if (r) { 2250 DRM_ERROR("resume of IP block <%s> failed %d\n", 2251 adev->ip_blocks[i].version->funcs->name, r); 2252 return r; 2253 } 2254 } else { 2255 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2256 if (r) { 2257 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2258 adev->ip_blocks[i].version->funcs->name, r); 2259 return r; 2260 } 2261 } 2262 2263 adev->ip_blocks[i].status.hw = true; 2264 break; 2265 } 2266 } 2267 2268 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2269 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2270 2271 return r; 2272 } 2273 2274 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2275 { 2276 long timeout; 2277 int r, i; 2278 2279 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2280 struct amdgpu_ring *ring = adev->rings[i]; 2281 2282 /* No need to setup the GPU scheduler for rings that don't need it */ 2283 if (!ring || ring->no_scheduler) 2284 continue; 2285 2286 switch (ring->funcs->type) { 2287 case AMDGPU_RING_TYPE_GFX: 2288 timeout = adev->gfx_timeout; 2289 break; 2290 case AMDGPU_RING_TYPE_COMPUTE: 2291 timeout = adev->compute_timeout; 2292 break; 2293 case AMDGPU_RING_TYPE_SDMA: 2294 timeout = adev->sdma_timeout; 2295 break; 2296 default: 2297 timeout = adev->video_timeout; 2298 break; 2299 } 2300 2301 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2302 ring->num_hw_submission, 0, 2303 timeout, adev->reset_domain->wq, 2304 ring->sched_score, ring->name, 2305 adev->dev); 2306 if (r) { 2307 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2308 ring->name); 2309 return r; 2310 } 2311 } 2312 2313 amdgpu_xcp_update_partition_sched_list(adev); 2314 2315 return 0; 2316 } 2317 2318 2319 /** 2320 * amdgpu_device_ip_init - run init for hardware IPs 2321 * 2322 * @adev: amdgpu_device pointer 2323 * 2324 * Main initialization pass for hardware IPs. The list of all the hardware 2325 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2326 * are run. sw_init initializes the software state associated with each IP 2327 * and hw_init initializes the hardware associated with each IP. 2328 * Returns 0 on success, negative error code on failure. 
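 *
 * A rough sketch of the ordering implemented below (descriptive only, not a
 * contract):
 *
 *   for each valid block:  sw_init()
 *     COMMON block:        hw_init() immediately after its sw_init()
 *     GMC block:           mem scratch init, hw_init(), wb init, optional CSA
 *   IB pool + ucode BO allocation
 *   hw init phase 1        (COMMON, IH; PSP when running SR-IOV)
 *   firmware loading       (PSP hw init/resume, SMU firmware)
 *   hw init phase 2        (all remaining blocks)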
2329 */ 2330 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2331 { 2332 int i, r; 2333 2334 r = amdgpu_ras_init(adev); 2335 if (r) 2336 return r; 2337 2338 for (i = 0; i < adev->num_ip_blocks; i++) { 2339 if (!adev->ip_blocks[i].status.valid) 2340 continue; 2341 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2342 if (r) { 2343 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2344 adev->ip_blocks[i].version->funcs->name, r); 2345 goto init_failed; 2346 } 2347 adev->ip_blocks[i].status.sw = true; 2348 2349 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2350 /* need to do common hw init early so everything is set up for gmc */ 2351 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2352 if (r) { 2353 DRM_ERROR("hw_init %d failed %d\n", i, r); 2354 goto init_failed; 2355 } 2356 adev->ip_blocks[i].status.hw = true; 2357 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2358 /* need to do gmc hw init early so we can allocate gpu mem */ 2359 /* Try to reserve bad pages early */ 2360 if (amdgpu_sriov_vf(adev)) 2361 amdgpu_virt_exchange_data(adev); 2362 2363 r = amdgpu_device_mem_scratch_init(adev); 2364 if (r) { 2365 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2366 goto init_failed; 2367 } 2368 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2369 if (r) { 2370 DRM_ERROR("hw_init %d failed %d\n", i, r); 2371 goto init_failed; 2372 } 2373 r = amdgpu_device_wb_init(adev); 2374 if (r) { 2375 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2376 goto init_failed; 2377 } 2378 adev->ip_blocks[i].status.hw = true; 2379 2380 /* right after GMC hw init, we create CSA */ 2381 if (adev->gfx.mcbp) { 2382 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2383 AMDGPU_GEM_DOMAIN_VRAM | 2384 AMDGPU_GEM_DOMAIN_GTT, 2385 AMDGPU_CSA_SIZE); 2386 if (r) { 2387 DRM_ERROR("allocate CSA failed %d\n", r); 2388 goto init_failed; 2389 } 2390 } 2391 } 2392 } 2393 2394 if (amdgpu_sriov_vf(adev)) 2395 amdgpu_virt_init_data_exchange(adev); 2396 2397 r = amdgpu_ib_pool_init(adev); 2398 if (r) { 2399 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2400 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2401 goto init_failed; 2402 } 2403 2404 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2405 if (r) 2406 goto init_failed; 2407 2408 r = amdgpu_device_ip_hw_init_phase1(adev); 2409 if (r) 2410 goto init_failed; 2411 2412 r = amdgpu_device_fw_loading(adev); 2413 if (r) 2414 goto init_failed; 2415 2416 r = amdgpu_device_ip_hw_init_phase2(adev); 2417 if (r) 2418 goto init_failed; 2419 2420 /* 2421 * retired pages will be loaded from eeprom and reserved here, 2422 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2423 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2424 * for I2C communication which only true at this point. 2425 * 2426 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2427 * failure from bad gpu situation and stop amdgpu init process 2428 * accordingly. For other failed cases, it will still release all 2429 * the resource and print error message, rather than returning one 2430 * negative value to upper level. 
2431 * 2432 * Note: theoretically, this should be called before all vram allocations 2433 * to protect retired pages from being abused 2434 */ 2435 r = amdgpu_ras_recovery_init(adev); 2436 if (r) 2437 goto init_failed; 2438 2439 /** 2440 * In case of XGMI grab extra reference for reset domain for this device 2441 */ 2442 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2443 if (amdgpu_xgmi_add_device(adev) == 0) { 2444 if (!amdgpu_sriov_vf(adev)) { 2445 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2446 2447 if (WARN_ON(!hive)) { 2448 r = -ENOENT; 2449 goto init_failed; 2450 } 2451 2452 if (!hive->reset_domain || 2453 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2454 r = -ENOENT; 2455 amdgpu_put_xgmi_hive(hive); 2456 goto init_failed; 2457 } 2458 2459 /* Drop the early temporary reset domain we created for device */ 2460 amdgpu_reset_put_reset_domain(adev->reset_domain); 2461 adev->reset_domain = hive->reset_domain; 2462 amdgpu_put_xgmi_hive(hive); 2463 } 2464 } 2465 } 2466 2467 r = amdgpu_device_init_schedulers(adev); 2468 if (r) 2469 goto init_failed; 2470 2471 /* Don't init kfd if the whole hive needs to be reset during init */ 2472 if (!adev->gmc.xgmi.pending_reset) { 2473 kgd2kfd_init_zone_device(adev); 2474 amdgpu_amdkfd_device_init(adev); 2475 } 2476 2477 amdgpu_fru_get_product_info(adev); 2478 2479 init_failed: 2480 2481 return r; 2482 } 2483 2484 /** 2485 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2486 * 2487 * @adev: amdgpu_device pointer 2488 * 2489 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2490 * this function before a GPU reset. If the value is retained after a 2491 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 2492 */ 2493 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2494 { 2495 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2496 } 2497 2498 /** 2499 * amdgpu_device_check_vram_lost - check if vram is valid 2500 * 2501 * @adev: amdgpu_device pointer 2502 * 2503 * Checks the reset magic value written to the gart pointer in VRAM. 2504 * The driver calls this after a GPU reset to see if the contents of 2505 * VRAM are lost or not. 2506 * Returns true if vram is lost, false if not. 2507 */ 2508 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2509 { 2510 if (memcmp(adev->gart.ptr, adev->reset_magic, 2511 AMDGPU_RESET_MAGIC_NUM)) 2512 return true; 2513 2514 if (!amdgpu_in_reset(adev)) 2515 return false; 2516 2517 /* 2518 * For all ASICs with baco/mode1 reset, the VRAM is 2519 * always assumed to be lost. 2520 */ 2521 switch (amdgpu_asic_reset_method(adev)) { 2522 case AMD_RESET_METHOD_BACO: 2523 case AMD_RESET_METHOD_MODE1: 2524 return true; 2525 default: 2526 return false; 2527 } 2528 } 2529 2530 /** 2531 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2532 * 2533 * @adev: amdgpu_device pointer 2534 * @state: clockgating state (gate or ungate) 2535 * 2536 * The list of all the hardware IPs that make up the asic is walked and the 2537 * set_clockgating_state callbacks are run. 2538 * The late initialization pass enables clockgating for hardware IPs; 2539 * the fini or suspend pass disables it. 2540 * Returns 0 on success, negative error code on failure.
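 *
 * Typical pairing within this file, shown only to illustrate the @state
 * argument:
 *
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);    // late init
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);  // fini / suspend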
2541 */ 2542 2543 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2544 enum amd_clockgating_state state) 2545 { 2546 int i, j, r; 2547 2548 if (amdgpu_emu_mode == 1) 2549 return 0; 2550 2551 for (j = 0; j < adev->num_ip_blocks; j++) { 2552 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2553 if (!adev->ip_blocks[i].status.late_initialized) 2554 continue; 2555 /* skip CG for GFX, SDMA on S0ix */ 2556 if (adev->in_s0ix && 2557 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2558 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2559 continue; 2560 /* skip CG for VCE/UVD, it's handled specially */ 2561 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2562 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2563 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2564 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2565 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2566 /* enable clockgating to save power */ 2567 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2568 state); 2569 if (r) { 2570 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2571 adev->ip_blocks[i].version->funcs->name, r); 2572 return r; 2573 } 2574 } 2575 } 2576 2577 return 0; 2578 } 2579 2580 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2581 enum amd_powergating_state state) 2582 { 2583 int i, j, r; 2584 2585 if (amdgpu_emu_mode == 1) 2586 return 0; 2587 2588 for (j = 0; j < adev->num_ip_blocks; j++) { 2589 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2590 if (!adev->ip_blocks[i].status.late_initialized) 2591 continue; 2592 /* skip PG for GFX, SDMA on S0ix */ 2593 if (adev->in_s0ix && 2594 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2595 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2596 continue; 2597 /* skip CG for VCE/UVD, it's handled specially */ 2598 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2599 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2600 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2601 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2602 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2603 /* enable powergating to save power */ 2604 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2605 state); 2606 if (r) { 2607 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2608 adev->ip_blocks[i].version->funcs->name, r); 2609 return r; 2610 } 2611 } 2612 } 2613 return 0; 2614 } 2615 2616 static int amdgpu_device_enable_mgpu_fan_boost(void) 2617 { 2618 struct amdgpu_gpu_instance *gpu_ins; 2619 struct amdgpu_device *adev; 2620 int i, ret = 0; 2621 2622 mutex_lock(&mgpu_info.mutex); 2623 2624 /* 2625 * MGPU fan boost feature should be enabled 2626 * only when there are two or more dGPUs in 2627 * the system 2628 */ 2629 if (mgpu_info.num_dgpu < 2) 2630 goto out; 2631 2632 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2633 gpu_ins = &(mgpu_info.gpu_ins[i]); 2634 adev = gpu_ins->adev; 2635 if (!(adev->flags & AMD_IS_APU) && 2636 !gpu_ins->mgpu_fan_enabled) { 2637 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2638 if (ret) 2639 break; 2640 2641 gpu_ins->mgpu_fan_enabled = 1; 2642 } 2643 } 2644 2645 out: 2646 mutex_unlock(&mgpu_info.mutex); 2647 2648 return ret; 2649 } 2650 2651 /** 2652 * amdgpu_device_ip_late_init - run late init for hardware IPs 2653 * 2654 * @adev: 
amdgpu_device pointer 2655 * 2656 * Late initialization pass for hardware IPs. The list of all the hardware 2657 * IPs that make up the asic is walked and the late_init callbacks are run. 2658 * late_init covers any special initialization that an IP requires 2659 * after all of the IPs have been initialized or something that needs to happen 2660 * late in the init process. 2661 * Returns 0 on success, negative error code on failure. 2662 */ 2663 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2664 { 2665 struct amdgpu_gpu_instance *gpu_instance; 2666 int i = 0, r; 2667 2668 for (i = 0; i < adev->num_ip_blocks; i++) { 2669 if (!adev->ip_blocks[i].status.hw) 2670 continue; 2671 if (adev->ip_blocks[i].version->funcs->late_init) { 2672 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2673 if (r) { 2674 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2675 adev->ip_blocks[i].version->funcs->name, r); 2676 return r; 2677 } 2678 } 2679 adev->ip_blocks[i].status.late_initialized = true; 2680 } 2681 2682 r = amdgpu_ras_late_init(adev); 2683 if (r) { 2684 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 2685 return r; 2686 } 2687 2688 amdgpu_ras_set_error_query_ready(adev, true); 2689 2690 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2691 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2692 2693 amdgpu_device_fill_reset_magic(adev); 2694 2695 r = amdgpu_device_enable_mgpu_fan_boost(); 2696 if (r) 2697 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2698 2699 /* For passthrough configuration on arcturus and aldebaran, enable special handling of SBR */ 2700 if (amdgpu_passthrough(adev) && 2701 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 2702 adev->asic_type == CHIP_ALDEBARAN)) 2703 amdgpu_dpm_handle_passthrough_sbr(adev, true); 2704 2705 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2706 mutex_lock(&mgpu_info.mutex); 2707 2708 /* 2709 * Reset device p-state to low as this was booted with high. 2710 * 2711 * This should be performed only after all devices from the same 2712 * hive get initialized. 2713 * 2714 * However, it's unknown in advance how many devices are in the hive, 2715 * as they are counted one by one during device initialization. 2716 * 2717 * So we wait until all XGMI interlinked devices are initialized. 2718 * This may bring some delays as those devices may come from 2719 * different hives. But that should be OK.
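 *
 * Illustrative example (numbers chosen only for the example): on a hive
 * with num_physical_nodes == 4, the pstate request below is issued only
 * once mgpu_info.num_dgpu has also reached 4, i.e. after the last hive
 * member has finished its own late init.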
2720 */ 2721 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2722 for (i = 0; i < mgpu_info.num_gpu; i++) { 2723 gpu_instance = &(mgpu_info.gpu_ins[i]); 2724 if (gpu_instance->adev->flags & AMD_IS_APU) 2725 continue; 2726 2727 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2728 AMDGPU_XGMI_PSTATE_MIN); 2729 if (r) { 2730 DRM_ERROR("pstate setting failed (%d).\n", r); 2731 break; 2732 } 2733 } 2734 } 2735 2736 mutex_unlock(&mgpu_info.mutex); 2737 } 2738 2739 return 0; 2740 } 2741 2742 /** 2743 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2744 * 2745 * @adev: amdgpu_device pointer 2746 * 2747 * For ASICs need to disable SMC first 2748 */ 2749 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2750 { 2751 int i, r; 2752 2753 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) 2754 return; 2755 2756 for (i = 0; i < adev->num_ip_blocks; i++) { 2757 if (!adev->ip_blocks[i].status.hw) 2758 continue; 2759 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2760 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2761 /* XXX handle errors */ 2762 if (r) { 2763 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2764 adev->ip_blocks[i].version->funcs->name, r); 2765 } 2766 adev->ip_blocks[i].status.hw = false; 2767 break; 2768 } 2769 } 2770 } 2771 2772 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2773 { 2774 int i, r; 2775 2776 for (i = 0; i < adev->num_ip_blocks; i++) { 2777 if (!adev->ip_blocks[i].version->funcs->early_fini) 2778 continue; 2779 2780 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2781 if (r) { 2782 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2783 adev->ip_blocks[i].version->funcs->name, r); 2784 } 2785 } 2786 2787 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2788 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2789 2790 amdgpu_amdkfd_suspend(adev, false); 2791 2792 /* Workaroud for ASICs need to disable SMC first */ 2793 amdgpu_device_smu_fini_early(adev); 2794 2795 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2796 if (!adev->ip_blocks[i].status.hw) 2797 continue; 2798 2799 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2800 /* XXX handle errors */ 2801 if (r) { 2802 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2803 adev->ip_blocks[i].version->funcs->name, r); 2804 } 2805 2806 adev->ip_blocks[i].status.hw = false; 2807 } 2808 2809 if (amdgpu_sriov_vf(adev)) { 2810 if (amdgpu_virt_release_full_gpu(adev, false)) 2811 DRM_ERROR("failed to release exclusive mode on fini\n"); 2812 } 2813 2814 return 0; 2815 } 2816 2817 /** 2818 * amdgpu_device_ip_fini - run fini for hardware IPs 2819 * 2820 * @adev: amdgpu_device pointer 2821 * 2822 * Main teardown pass for hardware IPs. The list of all the hardware 2823 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2824 * are run. hw_fini tears down the hardware associated with each IP 2825 * and sw_fini tears down any software state associated with each IP. 2826 * Returns 0 on success, negative error code on failure. 
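 *
 * Teardown walks the blocks in reverse registration order; e.g. a
 * hypothetical COMMON -> GMC -> GFX init order is finalized as
 * GFX -> GMC -> COMMON, with the GMC iteration additionally releasing the
 * ucode BO, CSA, writeback and IB pool below.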
2827 */ 2828 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2829 { 2830 int i, r; 2831 2832 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2833 amdgpu_virt_release_ras_err_handler_data(adev); 2834 2835 if (adev->gmc.xgmi.num_physical_nodes > 1) 2836 amdgpu_xgmi_remove_device(adev); 2837 2838 amdgpu_amdkfd_device_fini_sw(adev); 2839 2840 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2841 if (!adev->ip_blocks[i].status.sw) 2842 continue; 2843 2844 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2845 amdgpu_ucode_free_bo(adev); 2846 amdgpu_free_static_csa(&adev->virt.csa_obj); 2847 amdgpu_device_wb_fini(adev); 2848 amdgpu_device_mem_scratch_fini(adev); 2849 amdgpu_ib_pool_fini(adev); 2850 } 2851 2852 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2853 /* XXX handle errors */ 2854 if (r) { 2855 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2856 adev->ip_blocks[i].version->funcs->name, r); 2857 } 2858 adev->ip_blocks[i].status.sw = false; 2859 adev->ip_blocks[i].status.valid = false; 2860 } 2861 2862 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2863 if (!adev->ip_blocks[i].status.late_initialized) 2864 continue; 2865 if (adev->ip_blocks[i].version->funcs->late_fini) 2866 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2867 adev->ip_blocks[i].status.late_initialized = false; 2868 } 2869 2870 amdgpu_ras_fini(adev); 2871 2872 return 0; 2873 } 2874 2875 /** 2876 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2877 * 2878 * @work: work_struct. 2879 */ 2880 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2881 { 2882 struct amdgpu_device *adev = 2883 container_of(work, struct amdgpu_device, delayed_init_work.work); 2884 int r; 2885 2886 r = amdgpu_ib_ring_tests(adev); 2887 if (r) 2888 DRM_ERROR("ib ring test failed (%d).\n", r); 2889 } 2890 2891 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2892 { 2893 struct amdgpu_device *adev = 2894 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2895 2896 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2897 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2898 2899 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2900 adev->gfx.gfx_off_state = true; 2901 } 2902 2903 /** 2904 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2905 * 2906 * @adev: amdgpu_device pointer 2907 * 2908 * Main suspend function for hardware IPs. The list of all the hardware 2909 * IPs that make up the asic is walked, clockgating is disabled and the 2910 * suspend callbacks are run. suspend puts the hardware and software state 2911 * in each IP into a state suitable for suspend. 2912 * Returns 0 on success, negative error code on failure. 2913 */ 2914 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2915 { 2916 int i, r; 2917 2918 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2919 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2920 2921 /* 2922 * Per PMFW team's suggestion, driver needs to handle gfxoff 2923 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 2924 * scenario. Add the missing df cstate disablement here. 
2925 */ 2926 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 2927 dev_warn(adev->dev, "Failed to disallow df cstate"); 2928 2929 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2930 if (!adev->ip_blocks[i].status.valid) 2931 continue; 2932 2933 /* displays are handled separately */ 2934 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2935 continue; 2936 2937 /* XXX handle errors */ 2938 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2939 /* XXX handle errors */ 2940 if (r) { 2941 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2942 adev->ip_blocks[i].version->funcs->name, r); 2943 return r; 2944 } 2945 2946 adev->ip_blocks[i].status.hw = false; 2947 } 2948 2949 return 0; 2950 } 2951 2952 /** 2953 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2954 * 2955 * @adev: amdgpu_device pointer 2956 * 2957 * Main suspend function for hardware IPs. The list of all the hardware 2958 * IPs that make up the asic is walked, clockgating is disabled and the 2959 * suspend callbacks are run. suspend puts the hardware and software state 2960 * in each IP into a state suitable for suspend. 2961 * Returns 0 on success, negative error code on failure. 2962 */ 2963 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2964 { 2965 int i, r; 2966 2967 if (adev->in_s0ix) 2968 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 2969 2970 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2971 if (!adev->ip_blocks[i].status.valid) 2972 continue; 2973 /* displays are handled in phase1 */ 2974 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2975 continue; 2976 /* PSP lost connection when err_event_athub occurs */ 2977 if (amdgpu_ras_intr_triggered() && 2978 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2979 adev->ip_blocks[i].status.hw = false; 2980 continue; 2981 } 2982 2983 /* skip unnecessary suspend if we do not initialize them yet */ 2984 if (adev->gmc.xgmi.pending_reset && 2985 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2986 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 2987 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2988 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 2989 adev->ip_blocks[i].status.hw = false; 2990 continue; 2991 } 2992 2993 /* skip suspend of gfx/mes and psp for S0ix 2994 * gfx is in gfxoff state, so on resume it will exit gfxoff just 2995 * like at runtime. PSP is also part of the always on hardware 2996 * so no need to suspend it. 2997 */ 2998 if (adev->in_s0ix && 2999 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3000 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3001 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3002 continue; 3003 3004 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3005 if (adev->in_s0ix && 3006 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) && 3007 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3008 continue; 3009 3010 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3011 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3012 * from this location and RLC Autoload automatically also gets loaded 3013 * from here based on PMFW -> PSP message during re-init sequence. 3014 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3015 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3016 */ 3017 if (amdgpu_in_reset(adev) && 3018 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3019 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3020 continue; 3021 3022 /* XXX handle errors */ 3023 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3024 /* XXX handle errors */ 3025 if (r) { 3026 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3027 adev->ip_blocks[i].version->funcs->name, r); 3028 } 3029 adev->ip_blocks[i].status.hw = false; 3030 /* handle putting the SMC in the appropriate state */ 3031 if (!amdgpu_sriov_vf(adev)) { 3032 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3033 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3034 if (r) { 3035 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3036 adev->mp1_state, r); 3037 return r; 3038 } 3039 } 3040 } 3041 } 3042 3043 return 0; 3044 } 3045 3046 /** 3047 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3048 * 3049 * @adev: amdgpu_device pointer 3050 * 3051 * Main suspend function for hardware IPs. The list of all the hardware 3052 * IPs that make up the asic is walked, clockgating is disabled and the 3053 * suspend callbacks are run. suspend puts the hardware and software state 3054 * in each IP into a state suitable for suspend. 3055 * Returns 0 on success, negative error code on failure. 3056 */ 3057 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3058 { 3059 int r; 3060 3061 if (amdgpu_sriov_vf(adev)) { 3062 amdgpu_virt_fini_data_exchange(adev); 3063 amdgpu_virt_request_full_gpu(adev, false); 3064 } 3065 3066 r = amdgpu_device_ip_suspend_phase1(adev); 3067 if (r) 3068 return r; 3069 r = amdgpu_device_ip_suspend_phase2(adev); 3070 3071 if (amdgpu_sriov_vf(adev)) 3072 amdgpu_virt_release_full_gpu(adev, false); 3073 3074 return r; 3075 } 3076 3077 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3078 { 3079 int i, r; 3080 3081 static enum amd_ip_block_type ip_order[] = { 3082 AMD_IP_BLOCK_TYPE_COMMON, 3083 AMD_IP_BLOCK_TYPE_GMC, 3084 AMD_IP_BLOCK_TYPE_PSP, 3085 AMD_IP_BLOCK_TYPE_IH, 3086 }; 3087 3088 for (i = 0; i < adev->num_ip_blocks; i++) { 3089 int j; 3090 struct amdgpu_ip_block *block; 3091 3092 block = &adev->ip_blocks[i]; 3093 block->status.hw = false; 3094 3095 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3096 3097 if (block->version->type != ip_order[j] || 3098 !block->status.valid) 3099 continue; 3100 3101 r = block->version->funcs->hw_init(adev); 3102 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3103 if (r) 3104 return r; 3105 block->status.hw = true; 3106 } 3107 } 3108 3109 return 0; 3110 } 3111 3112 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3113 { 3114 int i, r; 3115 3116 static enum amd_ip_block_type ip_order[] = { 3117 AMD_IP_BLOCK_TYPE_SMC, 3118 AMD_IP_BLOCK_TYPE_DCE, 3119 AMD_IP_BLOCK_TYPE_GFX, 3120 AMD_IP_BLOCK_TYPE_SDMA, 3121 AMD_IP_BLOCK_TYPE_MES, 3122 AMD_IP_BLOCK_TYPE_UVD, 3123 AMD_IP_BLOCK_TYPE_VCE, 3124 AMD_IP_BLOCK_TYPE_VCN, 3125 AMD_IP_BLOCK_TYPE_JPEG 3126 }; 3127 3128 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3129 int j; 3130 struct amdgpu_ip_block *block; 3131 3132 for (j = 0; j < adev->num_ip_blocks; j++) { 3133 block = &adev->ip_blocks[j]; 3134 3135 if (block->version->type != ip_order[i] || 3136 !block->status.valid || 3137 block->status.hw) 3138 continue; 3139 3140 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3141 r = block->version->funcs->resume(adev); 3142 else 3143 r = block->version->funcs->hw_init(adev); 3144 3145 
DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3146 if (r) 3147 return r; 3148 block->status.hw = true; 3149 } 3150 } 3151 3152 return 0; 3153 } 3154 3155 /** 3156 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3157 * 3158 * @adev: amdgpu_device pointer 3159 * 3160 * First resume function for hardware IPs. The list of all the hardware 3161 * IPs that make up the asic is walked and the resume callbacks are run for 3162 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3163 * after a suspend and updates the software state as necessary. This 3164 * function is also used for restoring the GPU after a GPU reset. 3165 * Returns 0 on success, negative error code on failure. 3166 */ 3167 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3168 { 3169 int i, r; 3170 3171 for (i = 0; i < adev->num_ip_blocks; i++) { 3172 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3173 continue; 3174 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3175 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3176 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3177 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3178 3179 r = adev->ip_blocks[i].version->funcs->resume(adev); 3180 if (r) { 3181 DRM_ERROR("resume of IP block <%s> failed %d\n", 3182 adev->ip_blocks[i].version->funcs->name, r); 3183 return r; 3184 } 3185 adev->ip_blocks[i].status.hw = true; 3186 } 3187 } 3188 3189 return 0; 3190 } 3191 3192 /** 3193 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3194 * 3195 * @adev: amdgpu_device pointer 3196 * 3197 * First resume function for hardware IPs. The list of all the hardware 3198 * IPs that make up the asic is walked and the resume callbacks are run for 3199 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3200 * functional state after a suspend and updates the software state as 3201 * necessary. This function is also used for restoring the GPU after a GPU 3202 * reset. 3203 * Returns 0 on success, negative error code on failure. 3204 */ 3205 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3206 { 3207 int i, r; 3208 3209 for (i = 0; i < adev->num_ip_blocks; i++) { 3210 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3211 continue; 3212 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3213 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3214 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3215 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3216 continue; 3217 r = adev->ip_blocks[i].version->funcs->resume(adev); 3218 if (r) { 3219 DRM_ERROR("resume of IP block <%s> failed %d\n", 3220 adev->ip_blocks[i].version->funcs->name, r); 3221 return r; 3222 } 3223 adev->ip_blocks[i].status.hw = true; 3224 } 3225 3226 return 0; 3227 } 3228 3229 /** 3230 * amdgpu_device_ip_resume - run resume for hardware IPs 3231 * 3232 * @adev: amdgpu_device pointer 3233 * 3234 * Main resume function for hardware IPs. The hardware IPs 3235 * are split into two resume functions because they are 3236 * also used in recovering from a GPU reset and some additional 3237 * steps need to be take between them. In this case (S3/S4) they are 3238 * run sequentially. 3239 * Returns 0 on success, negative error code on failure. 
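 *
 * The call sequence below is, in order (a summary of this function, not an
 * additional requirement):
 *
 *   amdgpu_device_ip_resume_phase1(adev);  // COMMON, GMC, IH (+ PSP on SR-IOV)
 *   amdgpu_device_fw_loading(adev);        // PSP init/resume, SMU firmware
 *   amdgpu_device_ip_resume_phase2(adev);  // all remaining blocks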
3240 */ 3241 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3242 { 3243 int r; 3244 3245 r = amdgpu_device_ip_resume_phase1(adev); 3246 if (r) 3247 return r; 3248 3249 r = amdgpu_device_fw_loading(adev); 3250 if (r) 3251 return r; 3252 3253 r = amdgpu_device_ip_resume_phase2(adev); 3254 3255 return r; 3256 } 3257 3258 /** 3259 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3260 * 3261 * @adev: amdgpu_device pointer 3262 * 3263 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3264 */ 3265 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3266 { 3267 if (amdgpu_sriov_vf(adev)) { 3268 if (adev->is_atom_fw) { 3269 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3270 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3271 } else { 3272 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3273 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3274 } 3275 3276 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3277 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3278 } 3279 } 3280 3281 /** 3282 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3283 * 3284 * @asic_type: AMD asic type 3285 * 3286 * Check if there is DC (new modesetting infrastructre) support for an asic. 3287 * returns true if DC has support, false if not. 3288 */ 3289 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3290 { 3291 switch (asic_type) { 3292 #ifdef CONFIG_DRM_AMDGPU_SI 3293 case CHIP_HAINAN: 3294 #endif 3295 case CHIP_TOPAZ: 3296 /* chips with no display hardware */ 3297 return false; 3298 #if defined(CONFIG_DRM_AMD_DC) 3299 case CHIP_TAHITI: 3300 case CHIP_PITCAIRN: 3301 case CHIP_VERDE: 3302 case CHIP_OLAND: 3303 /* 3304 * We have systems in the wild with these ASICs that require 3305 * LVDS and VGA support which is not supported with DC. 3306 * 3307 * Fallback to the non-DC driver here by default so as not to 3308 * cause regressions. 3309 */ 3310 #if defined(CONFIG_DRM_AMD_DC_SI) 3311 return amdgpu_dc > 0; 3312 #else 3313 return false; 3314 #endif 3315 case CHIP_BONAIRE: 3316 case CHIP_KAVERI: 3317 case CHIP_KABINI: 3318 case CHIP_MULLINS: 3319 /* 3320 * We have systems in the wild with these ASICs that require 3321 * VGA support which is not supported with DC. 3322 * 3323 * Fallback to the non-DC driver here by default so as not to 3324 * cause regressions. 
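 *
 * Net effect of the dc module parameter on these ASICs, derived from the
 * returns below (the -1 "auto" default is an assumption about the
 * parameter's definition elsewhere in the driver):
 *
 *   dc=1             -> DC
 *   dc=0 or default  -> non-DC (this fallback)
 *
 * whereas ASICs hitting the default case below use DC unless dc=0 is set.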
3325 */ 3326 return amdgpu_dc > 0; 3327 default: 3328 return amdgpu_dc != 0; 3329 #else 3330 default: 3331 if (amdgpu_dc > 0) 3332 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3333 return false; 3334 #endif 3335 } 3336 } 3337 3338 /** 3339 * amdgpu_device_has_dc_support - check if dc is supported 3340 * 3341 * @adev: amdgpu_device pointer 3342 * 3343 * Returns true for supported, false for not supported 3344 */ 3345 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3346 { 3347 if (adev->enable_virtual_display || 3348 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3349 return false; 3350 3351 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3352 } 3353 3354 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3355 { 3356 struct amdgpu_device *adev = 3357 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3358 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3359 3360 /* It's a bug to not have a hive within this function */ 3361 if (WARN_ON(!hive)) 3362 return; 3363 3364 /* 3365 * Use task barrier to synchronize all xgmi reset works across the 3366 * hive. task_barrier_enter and task_barrier_exit will block 3367 * until all the threads running the xgmi reset works reach 3368 * those points. task_barrier_full will do both blocks. 3369 */ 3370 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3371 3372 task_barrier_enter(&hive->tb); 3373 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3374 3375 if (adev->asic_reset_res) 3376 goto fail; 3377 3378 task_barrier_exit(&hive->tb); 3379 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3380 3381 if (adev->asic_reset_res) 3382 goto fail; 3383 3384 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3385 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3386 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3387 } else { 3388 3389 task_barrier_full(&hive->tb); 3390 adev->asic_reset_res = amdgpu_asic_reset(adev); 3391 } 3392 3393 fail: 3394 if (adev->asic_reset_res) 3395 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3396 adev->asic_reset_res, adev_to_drm(adev)->unique); 3397 amdgpu_put_xgmi_hive(hive); 3398 } 3399 3400 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3401 { 3402 char *input = amdgpu_lockup_timeout; 3403 char *timeout_setting = NULL; 3404 int index = 0; 3405 long timeout; 3406 int ret = 0; 3407 3408 /* 3409 * By default timeout for non compute jobs is 10000 3410 * and 60000 for compute jobs. 3411 * In SR-IOV or passthrough mode, timeout for compute 3412 * jobs are 60000 by default. 3413 */ 3414 adev->gfx_timeout = msecs_to_jiffies(10000); 3415 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3416 if (amdgpu_sriov_vf(adev)) 3417 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3418 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3419 else 3420 adev->compute_timeout = msecs_to_jiffies(60000); 3421 3422 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3423 while ((timeout_setting = strsep(&input, ",")) && 3424 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3425 ret = kstrtol(timeout_setting, 0, &timeout); 3426 if (ret) 3427 return ret; 3428 3429 if (timeout == 0) { 3430 index++; 3431 continue; 3432 } else if (timeout < 0) { 3433 timeout = MAX_SCHEDULE_TIMEOUT; 3434 dev_warn(adev->dev, "lockup timeout disabled"); 3435 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3436 } else { 3437 timeout = msecs_to_jiffies(timeout); 3438 } 3439 3440 switch (index++) { 3441 case 0: 3442 adev->gfx_timeout = timeout; 3443 break; 3444 case 1: 3445 adev->compute_timeout = timeout; 3446 break; 3447 case 2: 3448 adev->sdma_timeout = timeout; 3449 break; 3450 case 3: 3451 adev->video_timeout = timeout; 3452 break; 3453 default: 3454 break; 3455 } 3456 } 3457 /* 3458 * There is only one value specified and 3459 * it should apply to all non-compute jobs. 3460 */ 3461 if (index == 1) { 3462 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3463 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3464 adev->compute_timeout = adev->gfx_timeout; 3465 } 3466 } 3467 3468 return ret; 3469 } 3470 3471 /** 3472 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3473 * 3474 * @adev: amdgpu_device pointer 3475 * 3476 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3477 */ 3478 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3479 { 3480 struct iommu_domain *domain; 3481 3482 domain = iommu_get_domain_for_dev(adev->dev); 3483 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3484 adev->ram_is_direct_mapped = true; 3485 } 3486 3487 static const struct attribute *amdgpu_dev_attributes[] = { 3488 &dev_attr_pcie_replay_count.attr, 3489 NULL 3490 }; 3491 3492 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 3493 { 3494 if (amdgpu_mcbp == 1) 3495 adev->gfx.mcbp = true; 3496 else if (amdgpu_mcbp == 0) 3497 adev->gfx.mcbp = false; 3498 else if ((adev->ip_versions[GC_HWIP][0] >= IP_VERSION(9, 0, 0)) && 3499 (adev->ip_versions[GC_HWIP][0] < IP_VERSION(10, 0, 0)) && 3500 adev->gfx.num_gfx_rings) 3501 adev->gfx.mcbp = true; 3502 3503 if (amdgpu_sriov_vf(adev)) 3504 adev->gfx.mcbp = true; 3505 3506 if (adev->gfx.mcbp) 3507 DRM_INFO("MCBP is enabled\n"); 3508 } 3509 3510 /** 3511 * amdgpu_device_init - initialize the driver 3512 * 3513 * @adev: amdgpu_device pointer 3514 * @flags: driver flags 3515 * 3516 * Initializes the driver info and hw (all asics). 3517 * Returns 0 for success or an error on failure. 3518 * Called at driver startup. 
3519 */ 3520 int amdgpu_device_init(struct amdgpu_device *adev, 3521 uint32_t flags) 3522 { 3523 struct drm_device *ddev = adev_to_drm(adev); 3524 struct pci_dev *pdev = adev->pdev; 3525 int r, i; 3526 bool px = false; 3527 u32 max_MBps; 3528 int tmp; 3529 3530 adev->shutdown = false; 3531 adev->flags = flags; 3532 3533 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3534 adev->asic_type = amdgpu_force_asic_type; 3535 else 3536 adev->asic_type = flags & AMD_ASIC_MASK; 3537 3538 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3539 if (amdgpu_emu_mode == 1) 3540 adev->usec_timeout *= 10; 3541 adev->gmc.gart_size = 512 * 1024 * 1024; 3542 adev->accel_working = false; 3543 adev->num_rings = 0; 3544 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3545 adev->mman.buffer_funcs = NULL; 3546 adev->mman.buffer_funcs_ring = NULL; 3547 adev->vm_manager.vm_pte_funcs = NULL; 3548 adev->vm_manager.vm_pte_num_scheds = 0; 3549 adev->gmc.gmc_funcs = NULL; 3550 adev->harvest_ip_mask = 0x0; 3551 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3552 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3553 3554 adev->smc_rreg = &amdgpu_invalid_rreg; 3555 adev->smc_wreg = &amdgpu_invalid_wreg; 3556 adev->pcie_rreg = &amdgpu_invalid_rreg; 3557 adev->pcie_wreg = &amdgpu_invalid_wreg; 3558 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 3559 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 3560 adev->pciep_rreg = &amdgpu_invalid_rreg; 3561 adev->pciep_wreg = &amdgpu_invalid_wreg; 3562 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3563 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3564 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3565 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3566 adev->didt_rreg = &amdgpu_invalid_rreg; 3567 adev->didt_wreg = &amdgpu_invalid_wreg; 3568 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3569 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3570 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3571 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3572 3573 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3574 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3575 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3576 3577 /* mutex initialization are all done here so we 3578 * can recall function without having locking issues 3579 */ 3580 mutex_init(&adev->firmware.mutex); 3581 mutex_init(&adev->pm.mutex); 3582 mutex_init(&adev->gfx.gpu_clock_mutex); 3583 mutex_init(&adev->srbm_mutex); 3584 mutex_init(&adev->gfx.pipe_reserve_mutex); 3585 mutex_init(&adev->gfx.gfx_off_mutex); 3586 mutex_init(&adev->gfx.partition_mutex); 3587 mutex_init(&adev->grbm_idx_mutex); 3588 mutex_init(&adev->mn_lock); 3589 mutex_init(&adev->virt.vf_errors.lock); 3590 hash_init(adev->mn_hash); 3591 mutex_init(&adev->psp.mutex); 3592 mutex_init(&adev->notifier_lock); 3593 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3594 mutex_init(&adev->benchmark_mutex); 3595 3596 amdgpu_device_init_apu_flags(adev); 3597 3598 r = amdgpu_device_check_arguments(adev); 3599 if (r) 3600 return r; 3601 3602 spin_lock_init(&adev->mmio_idx_lock); 3603 spin_lock_init(&adev->smc_idx_lock); 3604 spin_lock_init(&adev->pcie_idx_lock); 3605 spin_lock_init(&adev->uvd_ctx_idx_lock); 3606 spin_lock_init(&adev->didt_idx_lock); 3607 spin_lock_init(&adev->gc_cac_idx_lock); 3608 spin_lock_init(&adev->se_cac_idx_lock); 3609 spin_lock_init(&adev->audio_endpt_idx_lock); 3610 spin_lock_init(&adev->mm_stats.lock); 3611 3612 
INIT_LIST_HEAD(&adev->shadow_list); 3613 mutex_init(&adev->shadow_list_lock); 3614 3615 INIT_LIST_HEAD(&adev->reset_list); 3616 3617 INIT_LIST_HEAD(&adev->ras_list); 3618 3619 INIT_DELAYED_WORK(&adev->delayed_init_work, 3620 amdgpu_device_delayed_init_work_handler); 3621 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3622 amdgpu_device_delay_enable_gfx_off); 3623 3624 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3625 3626 adev->gfx.gfx_off_req_count = 1; 3627 adev->gfx.gfx_off_residency = 0; 3628 adev->gfx.gfx_off_entrycount = 0; 3629 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3630 3631 atomic_set(&adev->throttling_logging_enabled, 1); 3632 /* 3633 * If throttling continues, logging will be performed every minute 3634 * to avoid log flooding. "-1" is subtracted since the thermal 3635 * throttling interrupt comes every second. Thus, the total logging 3636 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 3637 * for throttling interrupt) = 60 seconds. 3638 */ 3639 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3640 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3641 3642 /* Registers mapping */ 3643 /* TODO: block userspace mapping of io register */ 3644 if (adev->asic_type >= CHIP_BONAIRE) { 3645 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3646 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3647 } else { 3648 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3649 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3650 } 3651 3652 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 3653 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 3654 3655 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3656 if (!adev->rmmio) 3657 return -ENOMEM; 3658 3659 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3660 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); 3661 3662 /* 3663 * Reset domain needs to be present early, before the XGMI hive is discovered 3664 * (if any) and initialized, to use the reset sem and in_gpu reset flag 3665 * early on during init and before calling to RREG32.
3666 */ 3667 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3668 if (!adev->reset_domain) 3669 return -ENOMEM; 3670 3671 /* detect hw virtualization here */ 3672 amdgpu_detect_virtualization(adev); 3673 3674 amdgpu_device_get_pcie_info(adev); 3675 3676 r = amdgpu_device_get_job_timeout_settings(adev); 3677 if (r) { 3678 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3679 return r; 3680 } 3681 3682 /* early init functions */ 3683 r = amdgpu_device_ip_early_init(adev); 3684 if (r) 3685 return r; 3686 3687 amdgpu_device_set_mcbp(adev); 3688 3689 /* Get rid of things like offb */ 3690 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 3691 if (r) 3692 return r; 3693 3694 /* Enable TMZ based on IP_VERSION */ 3695 amdgpu_gmc_tmz_set(adev); 3696 3697 amdgpu_gmc_noretry_set(adev); 3698 /* Need to get xgmi info early to decide the reset behavior*/ 3699 if (adev->gmc.xgmi.supported) { 3700 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3701 if (r) 3702 return r; 3703 } 3704 3705 /* enable PCIE atomic ops */ 3706 if (amdgpu_sriov_vf(adev)) { 3707 if (adev->virt.fw_reserve.p_pf2vf) 3708 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3709 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3710 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3711 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a 3712 * internal path natively support atomics, set have_atomics_support to true. 3713 */ 3714 } else if ((adev->flags & AMD_IS_APU) && 3715 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) { 3716 adev->have_atomics_support = true; 3717 } else { 3718 adev->have_atomics_support = 3719 !pci_enable_atomic_ops_to_root(adev->pdev, 3720 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3721 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3722 } 3723 3724 if (!adev->have_atomics_support) 3725 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3726 3727 /* doorbell bar mapping and doorbell index init*/ 3728 amdgpu_doorbell_init(adev); 3729 3730 if (amdgpu_emu_mode == 1) { 3731 /* post the asic on emulation mode */ 3732 emu_soc_asic_init(adev); 3733 goto fence_driver_init; 3734 } 3735 3736 amdgpu_reset_init(adev); 3737 3738 /* detect if we are with an SRIOV vbios */ 3739 if (adev->bios) 3740 amdgpu_device_detect_sriov_bios(adev); 3741 3742 /* check if we need to reset the asic 3743 * E.g., driver was not cleanly unloaded previously, etc. 3744 */ 3745 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3746 if (adev->gmc.xgmi.num_physical_nodes) { 3747 dev_info(adev->dev, "Pending hive reset.\n"); 3748 adev->gmc.xgmi.pending_reset = true; 3749 /* Only need to init necessary block for SMU to handle the reset */ 3750 for (i = 0; i < adev->num_ip_blocks; i++) { 3751 if (!adev->ip_blocks[i].status.valid) 3752 continue; 3753 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3754 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3755 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3756 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3757 DRM_DEBUG("IP %s disabled for hw_init.\n", 3758 adev->ip_blocks[i].version->funcs->name); 3759 adev->ip_blocks[i].status.hw = true; 3760 } 3761 } 3762 } else { 3763 tmp = amdgpu_reset_method; 3764 /* It should do a default reset when loading or reloading the driver, 3765 * regardless of the module parameter reset_method. 
3766 */ 3767 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 3768 r = amdgpu_asic_reset(adev); 3769 amdgpu_reset_method = tmp; 3770 if (r) { 3771 dev_err(adev->dev, "asic reset on init failed\n"); 3772 goto failed; 3773 } 3774 } 3775 } 3776 3777 /* Post card if necessary */ 3778 if (amdgpu_device_need_post(adev)) { 3779 if (!adev->bios) { 3780 dev_err(adev->dev, "no vBIOS found\n"); 3781 r = -EINVAL; 3782 goto failed; 3783 } 3784 DRM_INFO("GPU posting now...\n"); 3785 r = amdgpu_device_asic_init(adev); 3786 if (r) { 3787 dev_err(adev->dev, "gpu post error!\n"); 3788 goto failed; 3789 } 3790 } 3791 3792 if (adev->bios) { 3793 if (adev->is_atom_fw) { 3794 /* Initialize clocks */ 3795 r = amdgpu_atomfirmware_get_clock_info(adev); 3796 if (r) { 3797 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3798 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3799 goto failed; 3800 } 3801 } else { 3802 /* Initialize clocks */ 3803 r = amdgpu_atombios_get_clock_info(adev); 3804 if (r) { 3805 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3806 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3807 goto failed; 3808 } 3809 /* init i2c buses */ 3810 if (!amdgpu_device_has_dc_support(adev)) 3811 amdgpu_atombios_i2c_init(adev); 3812 } 3813 } 3814 3815 fence_driver_init: 3816 /* Fence driver */ 3817 r = amdgpu_fence_driver_sw_init(adev); 3818 if (r) { 3819 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3820 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3821 goto failed; 3822 } 3823 3824 /* init the mode config */ 3825 drm_mode_config_init(adev_to_drm(adev)); 3826 3827 r = amdgpu_device_ip_init(adev); 3828 if (r) { 3829 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3830 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3831 goto release_ras_con; 3832 } 3833 3834 amdgpu_fence_driver_hw_init(adev); 3835 3836 dev_info(adev->dev, 3837 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3838 adev->gfx.config.max_shader_engines, 3839 adev->gfx.config.max_sh_per_se, 3840 adev->gfx.config.max_cu_per_sh, 3841 adev->gfx.cu_info.number); 3842 3843 adev->accel_working = true; 3844 3845 amdgpu_vm_check_compute_bug(adev); 3846 3847 /* Initialize the buffer migration limit. */ 3848 if (amdgpu_moverate >= 0) 3849 max_MBps = amdgpu_moverate; 3850 else 3851 max_MBps = 8; /* Allow 8 MB/s. */ 3852 /* Get a log2 for easy divisions. */ 3853 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3854 3855 r = amdgpu_atombios_sysfs_init(adev); 3856 if (r) 3857 drm_err(&adev->ddev, 3858 "registering atombios sysfs failed (%d).\n", r); 3859 3860 r = amdgpu_pm_sysfs_init(adev); 3861 if (r) 3862 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 3863 3864 r = amdgpu_ucode_sysfs_init(adev); 3865 if (r) { 3866 adev->ucode_sysfs_en = false; 3867 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3868 } else 3869 adev->ucode_sysfs_en = true; 3870 3871 /* 3872 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3873 * Otherwise the mgpu fan boost feature will be skipped due to the 3874 * gpu instance is counted less. 3875 */ 3876 amdgpu_register_gpu_instance(adev); 3877 3878 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3879 * explicit gating rather than handling it automatically. 
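 *
 * Illustrative sketch (editorial, simplified - the real late-init helper
 * filters block types and walks them in a specific order): "explicit
 * gating" means the per-IP set_clockgating_state callback is invoked by
 * the driver, roughly
 *
 *	if (adev->ip_blocks[i].version->funcs->set_clockgating_state)
 *		adev->ip_blocks[i].version->funcs->set_clockgating_state(
 *				(void *)adev, AMD_CG_STATE_GATE);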
3880 */ 3881 if (!adev->gmc.xgmi.pending_reset) { 3882 r = amdgpu_device_ip_late_init(adev); 3883 if (r) { 3884 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3885 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3886 goto release_ras_con; 3887 } 3888 /* must succeed. */ 3889 amdgpu_ras_resume(adev); 3890 queue_delayed_work(system_wq, &adev->delayed_init_work, 3891 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3892 } 3893 3894 if (amdgpu_sriov_vf(adev)) { 3895 amdgpu_virt_release_full_gpu(adev, true); 3896 flush_delayed_work(&adev->delayed_init_work); 3897 } 3898 3899 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3900 if (r) 3901 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3902 3903 amdgpu_fru_sysfs_init(adev); 3904 3905 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3906 r = amdgpu_pmu_init(adev); 3907 if (r) 3908 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3909 3910 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3911 if (amdgpu_device_cache_pci_state(adev->pdev)) 3912 pci_restore_state(pdev); 3913 3914 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3915 /* this will fail for cards that aren't VGA class devices, just 3916 * ignore it 3917 */ 3918 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3919 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 3920 3921 px = amdgpu_device_supports_px(ddev); 3922 3923 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 3924 apple_gmux_detect(NULL, NULL))) 3925 vga_switcheroo_register_client(adev->pdev, 3926 &amdgpu_switcheroo_ops, px); 3927 3928 if (px) 3929 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3930 3931 if (adev->gmc.xgmi.pending_reset) 3932 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3933 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3934 3935 amdgpu_device_check_iommu_direct_map(adev); 3936 3937 return 0; 3938 3939 release_ras_con: 3940 if (amdgpu_sriov_vf(adev)) 3941 amdgpu_virt_release_full_gpu(adev, true); 3942 3943 /* failed in exclusive mode due to timeout */ 3944 if (amdgpu_sriov_vf(adev) && 3945 !amdgpu_sriov_runtime(adev) && 3946 amdgpu_virt_mmio_blocked(adev) && 3947 !amdgpu_virt_wait_reset(adev)) { 3948 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3949 /* Don't send request since VF is inactive. */ 3950 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3951 adev->virt.ops = NULL; 3952 r = -EAGAIN; 3953 } 3954 amdgpu_release_ras_context(adev); 3955 3956 failed: 3957 amdgpu_vf_error_trans_all(adev); 3958 3959 return r; 3960 } 3961 3962 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 3963 { 3964 3965 /* Clear all CPU mappings pointing to this device */ 3966 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 3967 3968 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 3969 amdgpu_doorbell_fini(adev); 3970 3971 iounmap(adev->rmmio); 3972 adev->rmmio = NULL; 3973 if (adev->mman.aper_base_kaddr) 3974 iounmap(adev->mman.aper_base_kaddr); 3975 adev->mman.aper_base_kaddr = NULL; 3976 3977 /* Memory manager related */ 3978 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 3979 arch_phys_wc_del(adev->gmc.vram_mtrr); 3980 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 3981 } 3982 } 3983 3984 /** 3985 * amdgpu_device_fini_hw - tear down the driver 3986 * 3987 * @adev: amdgpu_device pointer 3988 * 3989 * Tear down the driver info (all asics). 3990 * Called at driver shutdown. 
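 *
 * Illustrative ordering note (editorial): this only quiesces the hardware
 * side; the software state is released later by amdgpu_device_fini_sw(),
 * so a removal path is expected to call
 *
 *	amdgpu_device_fini_hw(adev);
 *	...
 *	amdgpu_device_fini_sw(adev);
 *
 * in that order.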
3991 */ 3992 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 3993 { 3994 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3995 flush_delayed_work(&adev->delayed_init_work); 3996 adev->shutdown = true; 3997 3998 /* make sure IB test finished before entering exclusive mode 3999 * to avoid preemption on IB test 4000 */ 4001 if (amdgpu_sriov_vf(adev)) { 4002 amdgpu_virt_request_full_gpu(adev, false); 4003 amdgpu_virt_fini_data_exchange(adev); 4004 } 4005 4006 /* disable all interrupts */ 4007 amdgpu_irq_disable_all(adev); 4008 if (adev->mode_info.mode_config_initialized) { 4009 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4010 drm_helper_force_disable_all(adev_to_drm(adev)); 4011 else 4012 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4013 } 4014 amdgpu_fence_driver_hw_fini(adev); 4015 4016 if (adev->mman.initialized) 4017 drain_workqueue(adev->mman.bdev.wq); 4018 4019 if (adev->pm.sysfs_initialized) 4020 amdgpu_pm_sysfs_fini(adev); 4021 if (adev->ucode_sysfs_en) 4022 amdgpu_ucode_sysfs_fini(adev); 4023 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4024 amdgpu_fru_sysfs_fini(adev); 4025 4026 /* disable ras feature must before hw fini */ 4027 amdgpu_ras_pre_fini(adev); 4028 4029 amdgpu_device_ip_fini_early(adev); 4030 4031 amdgpu_irq_fini_hw(adev); 4032 4033 if (adev->mman.initialized) 4034 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4035 4036 amdgpu_gart_dummy_page_fini(adev); 4037 4038 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4039 amdgpu_device_unmap_mmio(adev); 4040 4041 } 4042 4043 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4044 { 4045 int idx; 4046 bool px; 4047 4048 amdgpu_fence_driver_sw_fini(adev); 4049 amdgpu_device_ip_fini(adev); 4050 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4051 adev->accel_working = false; 4052 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4053 4054 amdgpu_reset_fini(adev); 4055 4056 /* free i2c buses */ 4057 if (!amdgpu_device_has_dc_support(adev)) 4058 amdgpu_i2c_fini(adev); 4059 4060 if (amdgpu_emu_mode != 1) 4061 amdgpu_atombios_fini(adev); 4062 4063 kfree(adev->bios); 4064 adev->bios = NULL; 4065 4066 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4067 4068 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 4069 apple_gmux_detect(NULL, NULL))) 4070 vga_switcheroo_unregister_client(adev->pdev); 4071 4072 if (px) 4073 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4074 4075 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4076 vga_client_unregister(adev->pdev); 4077 4078 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4079 4080 iounmap(adev->rmmio); 4081 adev->rmmio = NULL; 4082 amdgpu_doorbell_fini(adev); 4083 drm_dev_exit(idx); 4084 } 4085 4086 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4087 amdgpu_pmu_fini(adev); 4088 if (adev->mman.discovery_bin) 4089 amdgpu_discovery_fini(adev); 4090 4091 amdgpu_reset_put_reset_domain(adev->reset_domain); 4092 adev->reset_domain = NULL; 4093 4094 kfree(adev->pci_state); 4095 4096 } 4097 4098 /** 4099 * amdgpu_device_evict_resources - evict device resources 4100 * @adev: amdgpu device object 4101 * 4102 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4103 * of the vram memory type. Mainly used for evicting device resources 4104 * at suspend time. 
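 *
 * Illustrative usage (editorial, trimmed from amdgpu_device_suspend()
 * below):
 *
 *	r = amdgpu_device_evict_resources(adev);
 *	if (r)
 *		return r;
 *
 * The suspend path calls this twice: once before requesting full GPU
 * access and once more after phase1 suspend, so that anything still
 * resident in VRAM at that point is evicted as well.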
4105 * 4106 */ 4107 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4108 { 4109 int ret; 4110 4111 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4112 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4113 return 0; 4114 4115 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4116 if (ret) 4117 DRM_WARN("evicting device resources failed\n"); 4118 return ret; 4119 } 4120 4121 /* 4122 * Suspend & resume. 4123 */ 4124 /** 4125 * amdgpu_device_suspend - initiate device suspend 4126 * 4127 * @dev: drm dev pointer 4128 * @fbcon : notify the fbdev of suspend 4129 * 4130 * Puts the hw in the suspend state (all asics). 4131 * Returns 0 for success or an error on failure. 4132 * Called at driver suspend. 4133 */ 4134 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4135 { 4136 struct amdgpu_device *adev = drm_to_adev(dev); 4137 int r = 0; 4138 4139 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4140 return 0; 4141 4142 adev->in_suspend = true; 4143 4144 /* Evict the majority of BOs before grabbing the full access */ 4145 r = amdgpu_device_evict_resources(adev); 4146 if (r) 4147 return r; 4148 4149 if (amdgpu_sriov_vf(adev)) { 4150 amdgpu_virt_fini_data_exchange(adev); 4151 r = amdgpu_virt_request_full_gpu(adev, false); 4152 if (r) 4153 return r; 4154 } 4155 4156 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4157 DRM_WARN("smart shift update failed\n"); 4158 4159 if (fbcon) 4160 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4161 4162 cancel_delayed_work_sync(&adev->delayed_init_work); 4163 4164 amdgpu_ras_suspend(adev); 4165 4166 amdgpu_device_ip_suspend_phase1(adev); 4167 4168 if (!adev->in_s0ix) 4169 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4170 4171 r = amdgpu_device_evict_resources(adev); 4172 if (r) 4173 return r; 4174 4175 amdgpu_fence_driver_hw_fini(adev); 4176 4177 amdgpu_device_ip_suspend_phase2(adev); 4178 4179 if (amdgpu_sriov_vf(adev)) 4180 amdgpu_virt_release_full_gpu(adev, false); 4181 4182 return 0; 4183 } 4184 4185 /** 4186 * amdgpu_device_resume - initiate device resume 4187 * 4188 * @dev: drm dev pointer 4189 * @fbcon : notify the fbdev of resume 4190 * 4191 * Bring the hw back to operating state (all asics). 4192 * Returns 0 for success or an error on failure. 4193 * Called at driver resume. 
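 *
 * Minimal caller sketch (hypothetical wrapper, not the driver's actual PM
 * callbacks): a dev_pm_ops handler would typically look up the drm_device
 * from the struct device and forward to this helper, e.g.
 *
 *	struct drm_device *drm_dev = dev_get_drvdata(dev);
 *
 *	return amdgpu_device_resume(drm_dev, true);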
4194 */ 4195 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4196 { 4197 struct amdgpu_device *adev = drm_to_adev(dev); 4198 int r = 0; 4199 4200 if (amdgpu_sriov_vf(adev)) { 4201 r = amdgpu_virt_request_full_gpu(adev, true); 4202 if (r) 4203 return r; 4204 } 4205 4206 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4207 return 0; 4208 4209 if (adev->in_s0ix) 4210 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4211 4212 /* post card */ 4213 if (amdgpu_device_need_post(adev)) { 4214 r = amdgpu_device_asic_init(adev); 4215 if (r) 4216 dev_err(adev->dev, "amdgpu asic init failed\n"); 4217 } 4218 4219 r = amdgpu_device_ip_resume(adev); 4220 4221 if (r) { 4222 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4223 goto exit; 4224 } 4225 amdgpu_fence_driver_hw_init(adev); 4226 4227 r = amdgpu_device_ip_late_init(adev); 4228 if (r) 4229 goto exit; 4230 4231 queue_delayed_work(system_wq, &adev->delayed_init_work, 4232 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4233 4234 if (!adev->in_s0ix) { 4235 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4236 if (r) 4237 goto exit; 4238 } 4239 4240 exit: 4241 if (amdgpu_sriov_vf(adev)) { 4242 amdgpu_virt_init_data_exchange(adev); 4243 amdgpu_virt_release_full_gpu(adev, true); 4244 } 4245 4246 if (r) 4247 return r; 4248 4249 /* Make sure IB tests flushed */ 4250 flush_delayed_work(&adev->delayed_init_work); 4251 4252 if (fbcon) 4253 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4254 4255 amdgpu_ras_resume(adev); 4256 4257 if (adev->mode_info.num_crtc) { 4258 /* 4259 * Most of the connector probing functions try to acquire runtime pm 4260 * refs to ensure that the GPU is powered on when connector polling is 4261 * performed. Since we're calling this from a runtime PM callback, 4262 * trying to acquire rpm refs will cause us to deadlock. 4263 * 4264 * Since we're guaranteed to be holding the rpm lock, it's safe to 4265 * temporarily disable the rpm helpers so this doesn't deadlock us. 4266 */ 4267 #ifdef CONFIG_PM 4268 dev->dev->power.disable_depth++; 4269 #endif 4270 if (!adev->dc_enabled) 4271 drm_helper_hpd_irq_event(dev); 4272 else 4273 drm_kms_helper_hotplug_event(dev); 4274 #ifdef CONFIG_PM 4275 dev->dev->power.disable_depth--; 4276 #endif 4277 } 4278 adev->in_suspend = false; 4279 4280 if (adev->enable_mes) 4281 amdgpu_mes_self_test(adev); 4282 4283 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4284 DRM_WARN("smart shift update failed\n"); 4285 4286 return 0; 4287 } 4288 4289 /** 4290 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4291 * 4292 * @adev: amdgpu_device pointer 4293 * 4294 * The list of all the hardware IPs that make up the asic is walked and 4295 * the check_soft_reset callbacks are run. check_soft_reset determines 4296 * if the asic is still hung or not. 4297 * Returns true if any of the IPs are still in a hung state, false if not. 
4298 */ 4299 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4300 { 4301 int i; 4302 bool asic_hang = false; 4303 4304 if (amdgpu_sriov_vf(adev)) 4305 return true; 4306 4307 if (amdgpu_asic_need_full_reset(adev)) 4308 return true; 4309 4310 for (i = 0; i < adev->num_ip_blocks; i++) { 4311 if (!adev->ip_blocks[i].status.valid) 4312 continue; 4313 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4314 adev->ip_blocks[i].status.hang = 4315 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4316 if (adev->ip_blocks[i].status.hang) { 4317 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4318 asic_hang = true; 4319 } 4320 } 4321 return asic_hang; 4322 } 4323 4324 /** 4325 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4326 * 4327 * @adev: amdgpu_device pointer 4328 * 4329 * The list of all the hardware IPs that make up the asic is walked and the 4330 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4331 * handles any IP specific hardware or software state changes that are 4332 * necessary for a soft reset to succeed. 4333 * Returns 0 on success, negative error code on failure. 4334 */ 4335 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4336 { 4337 int i, r = 0; 4338 4339 for (i = 0; i < adev->num_ip_blocks; i++) { 4340 if (!adev->ip_blocks[i].status.valid) 4341 continue; 4342 if (adev->ip_blocks[i].status.hang && 4343 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4344 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4345 if (r) 4346 return r; 4347 } 4348 } 4349 4350 return 0; 4351 } 4352 4353 /** 4354 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4355 * 4356 * @adev: amdgpu_device pointer 4357 * 4358 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4359 * reset is necessary to recover. 4360 * Returns true if a full asic reset is required, false if not. 4361 */ 4362 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4363 { 4364 int i; 4365 4366 if (amdgpu_asic_need_full_reset(adev)) 4367 return true; 4368 4369 for (i = 0; i < adev->num_ip_blocks; i++) { 4370 if (!adev->ip_blocks[i].status.valid) 4371 continue; 4372 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4373 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4374 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4375 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4376 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4377 if (adev->ip_blocks[i].status.hang) { 4378 dev_info(adev->dev, "Some block need full reset!\n"); 4379 return true; 4380 } 4381 } 4382 } 4383 return false; 4384 } 4385 4386 /** 4387 * amdgpu_device_ip_soft_reset - do a soft reset 4388 * 4389 * @adev: amdgpu_device pointer 4390 * 4391 * The list of all the hardware IPs that make up the asic is walked and the 4392 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4393 * IP specific hardware or software state changes that are necessary to soft 4394 * reset the IP. 4395 * Returns 0 on success, negative error code on failure. 
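 *
 * Illustrative sequence (editorial, condensed from
 * amdgpu_device_pre_asic_reset() later in this file): soft reset is
 * attempted as a pipeline and the result is re-checked afterwards, e.g.
 *
 *	amdgpu_device_ip_pre_soft_reset(adev);
 *	r = amdgpu_device_ip_soft_reset(adev);
 *	amdgpu_device_ip_post_soft_reset(adev);
 *	if (r || amdgpu_device_ip_check_soft_reset(adev))
 *		need_full_reset = true;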
4396 */ 4397 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4398 { 4399 int i, r = 0; 4400 4401 for (i = 0; i < adev->num_ip_blocks; i++) { 4402 if (!adev->ip_blocks[i].status.valid) 4403 continue; 4404 if (adev->ip_blocks[i].status.hang && 4405 adev->ip_blocks[i].version->funcs->soft_reset) { 4406 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4407 if (r) 4408 return r; 4409 } 4410 } 4411 4412 return 0; 4413 } 4414 4415 /** 4416 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4417 * 4418 * @adev: amdgpu_device pointer 4419 * 4420 * The list of all the hardware IPs that make up the asic is walked and the 4421 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4422 * handles any IP specific hardware or software state changes that are 4423 * necessary after the IP has been soft reset. 4424 * Returns 0 on success, negative error code on failure. 4425 */ 4426 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4427 { 4428 int i, r = 0; 4429 4430 for (i = 0; i < adev->num_ip_blocks; i++) { 4431 if (!adev->ip_blocks[i].status.valid) 4432 continue; 4433 if (adev->ip_blocks[i].status.hang && 4434 adev->ip_blocks[i].version->funcs->post_soft_reset) 4435 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4436 if (r) 4437 return r; 4438 } 4439 4440 return 0; 4441 } 4442 4443 /** 4444 * amdgpu_device_recover_vram - Recover some VRAM contents 4445 * 4446 * @adev: amdgpu_device pointer 4447 * 4448 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4449 * restore things like GPUVM page tables after a GPU reset where 4450 * the contents of VRAM might be lost. 4451 * 4452 * Returns: 4453 * 0 on success, negative error code on failure. 4454 */ 4455 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4456 { 4457 struct dma_fence *fence = NULL, *next = NULL; 4458 struct amdgpu_bo *shadow; 4459 struct amdgpu_bo_vm *vmbo; 4460 long r = 1, tmo; 4461 4462 if (amdgpu_sriov_runtime(adev)) 4463 tmo = msecs_to_jiffies(8000); 4464 else 4465 tmo = msecs_to_jiffies(100); 4466 4467 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4468 mutex_lock(&adev->shadow_list_lock); 4469 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4470 /* If vm is compute context or adev is APU, shadow will be NULL */ 4471 if (!vmbo->shadow) 4472 continue; 4473 shadow = vmbo->shadow; 4474 4475 /* No need to recover an evicted BO */ 4476 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4477 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4478 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4479 continue; 4480 4481 r = amdgpu_bo_restore_shadow(shadow, &next); 4482 if (r) 4483 break; 4484 4485 if (fence) { 4486 tmo = dma_fence_wait_timeout(fence, false, tmo); 4487 dma_fence_put(fence); 4488 fence = next; 4489 if (tmo == 0) { 4490 r = -ETIMEDOUT; 4491 break; 4492 } else if (tmo < 0) { 4493 r = tmo; 4494 break; 4495 } 4496 } else { 4497 fence = next; 4498 } 4499 } 4500 mutex_unlock(&adev->shadow_list_lock); 4501 4502 if (fence) 4503 tmo = dma_fence_wait_timeout(fence, false, tmo); 4504 dma_fence_put(fence); 4505 4506 if (r < 0 || tmo <= 0) { 4507 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4508 return -EIO; 4509 } 4510 4511 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4512 return 0; 4513 } 4514 4515 4516 /** 4517 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4518 * 4519 * @adev: amdgpu_device pointer 4520 * 
@from_hypervisor: request from hypervisor 4521 * 4522 * do VF FLR and reinitialize Asic 4523 * return 0 means succeeded otherwise failed 4524 */ 4525 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4526 bool from_hypervisor) 4527 { 4528 int r; 4529 struct amdgpu_hive_info *hive = NULL; 4530 int retry_limit = 0; 4531 4532 retry: 4533 amdgpu_amdkfd_pre_reset(adev); 4534 4535 if (from_hypervisor) 4536 r = amdgpu_virt_request_full_gpu(adev, true); 4537 else 4538 r = amdgpu_virt_reset_gpu(adev); 4539 if (r) 4540 return r; 4541 amdgpu_irq_gpu_reset_resume_helper(adev); 4542 4543 /* some sw clean up VF needs to do before recover */ 4544 amdgpu_virt_post_reset(adev); 4545 4546 /* Resume IP prior to SMC */ 4547 r = amdgpu_device_ip_reinit_early_sriov(adev); 4548 if (r) 4549 goto error; 4550 4551 amdgpu_virt_init_data_exchange(adev); 4552 4553 r = amdgpu_device_fw_loading(adev); 4554 if (r) 4555 return r; 4556 4557 /* now we are okay to resume SMC/CP/SDMA */ 4558 r = amdgpu_device_ip_reinit_late_sriov(adev); 4559 if (r) 4560 goto error; 4561 4562 hive = amdgpu_get_xgmi_hive(adev); 4563 /* Update PSP FW topology after reset */ 4564 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4565 r = amdgpu_xgmi_update_topology(hive, adev); 4566 4567 if (hive) 4568 amdgpu_put_xgmi_hive(hive); 4569 4570 if (!r) { 4571 r = amdgpu_ib_ring_tests(adev); 4572 4573 amdgpu_amdkfd_post_reset(adev); 4574 } 4575 4576 error: 4577 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4578 amdgpu_inc_vram_lost(adev); 4579 r = amdgpu_device_recover_vram(adev); 4580 } 4581 amdgpu_virt_release_full_gpu(adev, true); 4582 4583 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4584 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4585 retry_limit++; 4586 goto retry; 4587 } else 4588 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4589 } 4590 4591 return r; 4592 } 4593 4594 /** 4595 * amdgpu_device_has_job_running - check if there is any job in mirror list 4596 * 4597 * @adev: amdgpu_device pointer 4598 * 4599 * check if there is any job in mirror list 4600 */ 4601 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4602 { 4603 int i; 4604 struct drm_sched_job *job; 4605 4606 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4607 struct amdgpu_ring *ring = adev->rings[i]; 4608 4609 if (!ring || !ring->sched.thread) 4610 continue; 4611 4612 spin_lock(&ring->sched.job_list_lock); 4613 job = list_first_entry_or_null(&ring->sched.pending_list, 4614 struct drm_sched_job, list); 4615 spin_unlock(&ring->sched.job_list_lock); 4616 if (job) 4617 return true; 4618 } 4619 return false; 4620 } 4621 4622 /** 4623 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4624 * 4625 * @adev: amdgpu_device pointer 4626 * 4627 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4628 * a hung GPU. 
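 *
 * Illustrative caller sketch (hypothetical, simplified from the hang
 * handling paths): callers are expected to bail out early when recovery
 * is disabled, e.g.
 *
 *	if (!amdgpu_device_should_recover_gpu(adev))
 *		return;
 *
 * and otherwise go on to invoke amdgpu_device_gpu_recover().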
4629 */ 4630 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4631 { 4632 4633 if (amdgpu_gpu_recovery == 0) 4634 goto disabled; 4635 4636 /* Skip soft reset check in fatal error mode */ 4637 if (!amdgpu_ras_is_poison_mode_supported(adev)) 4638 return true; 4639 4640 if (amdgpu_sriov_vf(adev)) 4641 return true; 4642 4643 if (amdgpu_gpu_recovery == -1) { 4644 switch (adev->asic_type) { 4645 #ifdef CONFIG_DRM_AMDGPU_SI 4646 case CHIP_VERDE: 4647 case CHIP_TAHITI: 4648 case CHIP_PITCAIRN: 4649 case CHIP_OLAND: 4650 case CHIP_HAINAN: 4651 #endif 4652 #ifdef CONFIG_DRM_AMDGPU_CIK 4653 case CHIP_KAVERI: 4654 case CHIP_KABINI: 4655 case CHIP_MULLINS: 4656 #endif 4657 case CHIP_CARRIZO: 4658 case CHIP_STONEY: 4659 case CHIP_CYAN_SKILLFISH: 4660 goto disabled; 4661 default: 4662 break; 4663 } 4664 } 4665 4666 return true; 4667 4668 disabled: 4669 dev_info(adev->dev, "GPU recovery disabled.\n"); 4670 return false; 4671 } 4672 4673 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4674 { 4675 u32 i; 4676 int ret = 0; 4677 4678 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4679 4680 dev_info(adev->dev, "GPU mode1 reset\n"); 4681 4682 /* disable BM */ 4683 pci_clear_master(adev->pdev); 4684 4685 amdgpu_device_cache_pci_state(adev->pdev); 4686 4687 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4688 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4689 ret = amdgpu_dpm_mode1_reset(adev); 4690 } else { 4691 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4692 ret = psp_gpu_reset(adev); 4693 } 4694 4695 if (ret) 4696 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4697 4698 amdgpu_device_load_pci_state(adev->pdev); 4699 4700 /* wait for asic to come out of reset */ 4701 for (i = 0; i < adev->usec_timeout; i++) { 4702 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4703 4704 if (memsize != 0xffffffff) 4705 break; 4706 udelay(1); 4707 } 4708 4709 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4710 return ret; 4711 } 4712 4713 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4714 struct amdgpu_reset_context *reset_context) 4715 { 4716 int i, r = 0; 4717 struct amdgpu_job *job = NULL; 4718 bool need_full_reset = 4719 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4720 4721 if (reset_context->reset_req_dev == adev) 4722 job = reset_context->job; 4723 4724 if (amdgpu_sriov_vf(adev)) { 4725 /* stop the data exchange thread */ 4726 amdgpu_virt_fini_data_exchange(adev); 4727 } 4728 4729 amdgpu_fence_driver_isr_toggle(adev, true); 4730 4731 /* block all schedulers and reset given job's ring */ 4732 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4733 struct amdgpu_ring *ring = adev->rings[i]; 4734 4735 if (!ring || !ring->sched.thread) 4736 continue; 4737 4738 /* Clear job fence from fence drv to avoid force_completion 4739 * leave NULL and vm flush fence in fence drv 4740 */ 4741 amdgpu_fence_driver_clear_job_fences(ring); 4742 4743 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4744 amdgpu_fence_driver_force_completion(ring); 4745 } 4746 4747 amdgpu_fence_driver_isr_toggle(adev, false); 4748 4749 if (job && job->vm) 4750 drm_sched_increase_karma(&job->base); 4751 4752 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4753 /* If reset handler not implemented, continue; otherwise return */ 4754 if (r == -EOPNOTSUPP) 4755 r = 0; 4756 else 4757 return r; 4758 4759 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4760 if (!amdgpu_sriov_vf(adev)) { 4761 4762 if (!need_full_reset) 4763 
need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4764 4765 if (!need_full_reset && amdgpu_gpu_recovery && 4766 amdgpu_device_ip_check_soft_reset(adev)) { 4767 amdgpu_device_ip_pre_soft_reset(adev); 4768 r = amdgpu_device_ip_soft_reset(adev); 4769 amdgpu_device_ip_post_soft_reset(adev); 4770 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4771 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4772 need_full_reset = true; 4773 } 4774 } 4775 4776 if (need_full_reset) 4777 r = amdgpu_device_ip_suspend(adev); 4778 if (need_full_reset) 4779 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4780 else 4781 clear_bit(AMDGPU_NEED_FULL_RESET, 4782 &reset_context->flags); 4783 } 4784 4785 return r; 4786 } 4787 4788 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 4789 { 4790 int i; 4791 4792 lockdep_assert_held(&adev->reset_domain->sem); 4793 4794 for (i = 0; i < adev->num_regs; i++) { 4795 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); 4796 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], 4797 adev->reset_dump_reg_value[i]); 4798 } 4799 4800 return 0; 4801 } 4802 4803 #ifdef CONFIG_DEV_COREDUMP 4804 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, 4805 size_t count, void *data, size_t datalen) 4806 { 4807 struct drm_printer p; 4808 struct amdgpu_device *adev = data; 4809 struct drm_print_iterator iter; 4810 int i; 4811 4812 iter.data = buffer; 4813 iter.offset = 0; 4814 iter.start = offset; 4815 iter.remain = count; 4816 4817 p = drm_coredump_printer(&iter); 4818 4819 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 4820 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 4821 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 4822 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec); 4823 if (adev->reset_task_info.pid) 4824 drm_printf(&p, "process_name: %s PID: %d\n", 4825 adev->reset_task_info.process_name, 4826 adev->reset_task_info.pid); 4827 4828 if (adev->reset_vram_lost) 4829 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 4830 if (adev->num_regs) { 4831 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); 4832 4833 for (i = 0; i < adev->num_regs; i++) 4834 drm_printf(&p, "0x%08x: 0x%08x\n", 4835 adev->reset_dump_reg_list[i], 4836 adev->reset_dump_reg_value[i]); 4837 } 4838 4839 return count - iter.remain; 4840 } 4841 4842 static void amdgpu_devcoredump_free(void *data) 4843 { 4844 } 4845 4846 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) 4847 { 4848 struct drm_device *dev = adev_to_drm(adev); 4849 4850 ktime_get_ts64(&adev->reset_time); 4851 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL, 4852 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 4853 } 4854 #endif 4855 4856 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4857 struct amdgpu_reset_context *reset_context) 4858 { 4859 struct amdgpu_device *tmp_adev = NULL; 4860 bool need_full_reset, skip_hw_reset, vram_lost = false; 4861 int r = 0; 4862 bool gpu_reset_for_dev_remove = 0; 4863 4864 /* Try reset handler method first */ 4865 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4866 reset_list); 4867 amdgpu_reset_reg_dumps(tmp_adev); 4868 4869 reset_context->reset_device_list = device_list_handle; 4870 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4871 /* If reset handler not implemented, continue; otherwise return */ 4872 if (r == -EOPNOTSUPP) 4873 r = 0; 4874 else 4875 return r; 4876 4877 /* Reset handler not 
implemented, use the default method */ 4878 need_full_reset = 4879 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4880 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4881 4882 gpu_reset_for_dev_remove = 4883 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 4884 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4885 4886 /* 4887 * ASIC reset has to be done on all XGMI hive nodes ASAP 4888 * to allow proper links negotiation in FW (within 1 sec) 4889 */ 4890 if (!skip_hw_reset && need_full_reset) { 4891 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4892 /* For XGMI run all resets in parallel to speed up the process */ 4893 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4894 tmp_adev->gmc.xgmi.pending_reset = false; 4895 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4896 r = -EALREADY; 4897 } else 4898 r = amdgpu_asic_reset(tmp_adev); 4899 4900 if (r) { 4901 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4902 r, adev_to_drm(tmp_adev)->unique); 4903 break; 4904 } 4905 } 4906 4907 /* For XGMI wait for all resets to complete before proceed */ 4908 if (!r) { 4909 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4910 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4911 flush_work(&tmp_adev->xgmi_reset_work); 4912 r = tmp_adev->asic_reset_res; 4913 if (r) 4914 break; 4915 } 4916 } 4917 } 4918 } 4919 4920 if (!r && amdgpu_ras_intr_triggered()) { 4921 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4922 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && 4923 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 4924 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 4925 } 4926 4927 amdgpu_ras_intr_cleared(); 4928 } 4929 4930 /* Since the mode1 reset affects base ip blocks, the 4931 * phase1 ip blocks need to be resumed. Otherwise there 4932 * will be a BIOS signature error and the psp bootloader 4933 * can't load kdb on the next amdgpu install. 4934 */ 4935 if (gpu_reset_for_dev_remove) { 4936 list_for_each_entry(tmp_adev, device_list_handle, reset_list) 4937 amdgpu_device_ip_resume_phase1(tmp_adev); 4938 4939 goto end; 4940 } 4941 4942 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4943 if (need_full_reset) { 4944 /* post card */ 4945 r = amdgpu_device_asic_init(tmp_adev); 4946 if (r) { 4947 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4948 } else { 4949 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4950 4951 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4952 if (r) 4953 goto out; 4954 4955 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4956 #ifdef CONFIG_DEV_COREDUMP 4957 tmp_adev->reset_vram_lost = vram_lost; 4958 memset(&tmp_adev->reset_task_info, 0, 4959 sizeof(tmp_adev->reset_task_info)); 4960 if (reset_context->job && reset_context->job->vm) 4961 tmp_adev->reset_task_info = 4962 reset_context->job->vm->task_info; 4963 amdgpu_reset_capture_coredumpm(tmp_adev); 4964 #endif 4965 if (vram_lost) { 4966 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4967 amdgpu_inc_vram_lost(tmp_adev); 4968 } 4969 4970 r = amdgpu_device_fw_loading(tmp_adev); 4971 if (r) 4972 return r; 4973 4974 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4975 if (r) 4976 goto out; 4977 4978 if (vram_lost) 4979 amdgpu_device_fill_reset_magic(tmp_adev); 4980 4981 /* 4982 * Add this ASIC as tracked as reset was already 4983 * complete successfully. 
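 *
 * (Editorial note: the instance was taken off the tracking list with
 * amdgpu_unregister_gpu_instance() before the reset started; see
 * amdgpu_device_gpu_recover() later in this file.)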
4984 */ 4985 amdgpu_register_gpu_instance(tmp_adev); 4986 4987 if (!reset_context->hive && 4988 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4989 amdgpu_xgmi_add_device(tmp_adev); 4990 4991 r = amdgpu_device_ip_late_init(tmp_adev); 4992 if (r) 4993 goto out; 4994 4995 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 4996 4997 /* 4998 * The GPU enters bad state once faulty pages 4999 * by ECC has reached the threshold, and ras 5000 * recovery is scheduled next. So add one check 5001 * here to break recovery if it indeed exceeds 5002 * bad page threshold, and remind user to 5003 * retire this GPU or setting one bigger 5004 * bad_page_threshold value to fix this once 5005 * probing driver again. 5006 */ 5007 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5008 /* must succeed. */ 5009 amdgpu_ras_resume(tmp_adev); 5010 } else { 5011 r = -EINVAL; 5012 goto out; 5013 } 5014 5015 /* Update PSP FW topology after reset */ 5016 if (reset_context->hive && 5017 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5018 r = amdgpu_xgmi_update_topology( 5019 reset_context->hive, tmp_adev); 5020 } 5021 } 5022 5023 out: 5024 if (!r) { 5025 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5026 r = amdgpu_ib_ring_tests(tmp_adev); 5027 if (r) { 5028 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5029 need_full_reset = true; 5030 r = -EAGAIN; 5031 goto end; 5032 } 5033 } 5034 5035 if (!r) 5036 r = amdgpu_device_recover_vram(tmp_adev); 5037 else 5038 tmp_adev->asic_reset_res = r; 5039 } 5040 5041 end: 5042 if (need_full_reset) 5043 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5044 else 5045 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5046 return r; 5047 } 5048 5049 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5050 { 5051 5052 switch (amdgpu_asic_reset_method(adev)) { 5053 case AMD_RESET_METHOD_MODE1: 5054 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5055 break; 5056 case AMD_RESET_METHOD_MODE2: 5057 adev->mp1_state = PP_MP1_STATE_RESET; 5058 break; 5059 default: 5060 adev->mp1_state = PP_MP1_STATE_NONE; 5061 break; 5062 } 5063 } 5064 5065 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5066 { 5067 amdgpu_vf_error_trans_all(adev); 5068 adev->mp1_state = PP_MP1_STATE_NONE; 5069 } 5070 5071 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5072 { 5073 struct pci_dev *p = NULL; 5074 5075 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5076 adev->pdev->bus->number, 1); 5077 if (p) { 5078 pm_runtime_enable(&(p->dev)); 5079 pm_runtime_resume(&(p->dev)); 5080 } 5081 5082 pci_dev_put(p); 5083 } 5084 5085 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5086 { 5087 enum amd_reset_method reset_method; 5088 struct pci_dev *p = NULL; 5089 u64 expires; 5090 5091 /* 5092 * For now, only BACO and mode1 reset are confirmed 5093 * to suffer the audio issue without proper suspended. 5094 */ 5095 reset_method = amdgpu_asic_reset_method(adev); 5096 if ((reset_method != AMD_RESET_METHOD_BACO) && 5097 (reset_method != AMD_RESET_METHOD_MODE1)) 5098 return -EINVAL; 5099 5100 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5101 adev->pdev->bus->number, 1); 5102 if (!p) 5103 return -ENODEV; 5104 5105 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5106 if (!expires) 5107 /* 5108 * If we cannot get the audio device autosuspend delay, 5109 * a fixed 4S interval will be used. Considering 3S is 5110 * the audio controller default autosuspend delay setting. 
5111 * 4S used here is guaranteed to cover that. 5112 */ 5113 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5114 5115 while (!pm_runtime_status_suspended(&(p->dev))) { 5116 if (!pm_runtime_suspend(&(p->dev))) 5117 break; 5118 5119 if (expires < ktime_get_mono_fast_ns()) { 5120 dev_warn(adev->dev, "failed to suspend display audio\n"); 5121 pci_dev_put(p); 5122 /* TODO: abort the succeeding gpu reset? */ 5123 return -ETIMEDOUT; 5124 } 5125 } 5126 5127 pm_runtime_disable(&(p->dev)); 5128 5129 pci_dev_put(p); 5130 return 0; 5131 } 5132 5133 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5134 { 5135 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5136 5137 #if defined(CONFIG_DEBUG_FS) 5138 if (!amdgpu_sriov_vf(adev)) 5139 cancel_work(&adev->reset_work); 5140 #endif 5141 5142 if (adev->kfd.dev) 5143 cancel_work(&adev->kfd.reset_work); 5144 5145 if (amdgpu_sriov_vf(adev)) 5146 cancel_work(&adev->virt.flr_work); 5147 5148 if (con && adev->ras_enabled) 5149 cancel_work(&con->recovery_work); 5150 5151 } 5152 5153 /** 5154 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5155 * 5156 * @adev: amdgpu_device pointer 5157 * @job: which job trigger hang 5158 * @reset_context: amdgpu reset context pointer 5159 * 5160 * Attempt to reset the GPU if it has hung (all asics). 5161 * Attempt to do soft-reset or full-reset and reinitialize Asic 5162 * Returns 0 for success or an error on failure. 5163 */ 5164 5165 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5166 struct amdgpu_job *job, 5167 struct amdgpu_reset_context *reset_context) 5168 { 5169 struct list_head device_list, *device_list_handle = NULL; 5170 bool job_signaled = false; 5171 struct amdgpu_hive_info *hive = NULL; 5172 struct amdgpu_device *tmp_adev = NULL; 5173 int i, r = 0; 5174 bool need_emergency_restart = false; 5175 bool audio_suspended = false; 5176 bool gpu_reset_for_dev_remove = false; 5177 5178 gpu_reset_for_dev_remove = 5179 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5180 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5181 5182 /* 5183 * Special case: RAS triggered and full reset isn't supported 5184 */ 5185 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5186 5187 /* 5188 * Flush RAM to disk so that after reboot 5189 * the user can read log and see why the system rebooted. 5190 */ 5191 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5192 DRM_WARN("Emergency reboot."); 5193 5194 ksys_sync_helper(); 5195 emergency_restart(); 5196 } 5197 5198 dev_info(adev->dev, "GPU %s begin!\n", 5199 need_emergency_restart ? "jobs stop":"reset"); 5200 5201 if (!amdgpu_sriov_vf(adev)) 5202 hive = amdgpu_get_xgmi_hive(adev); 5203 if (hive) 5204 mutex_lock(&hive->hive_lock); 5205 5206 reset_context->job = job; 5207 reset_context->hive = hive; 5208 /* 5209 * Build list of devices to reset. 5210 * In case we are in XGMI hive mode, resort the device list 5211 * to put adev in the 1st position. 
5212 */ 5213 INIT_LIST_HEAD(&device_list); 5214 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5215 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5216 list_add_tail(&tmp_adev->reset_list, &device_list); 5217 if (gpu_reset_for_dev_remove && adev->shutdown) 5218 tmp_adev->shutdown = true; 5219 } 5220 if (!list_is_first(&adev->reset_list, &device_list)) 5221 list_rotate_to_front(&adev->reset_list, &device_list); 5222 device_list_handle = &device_list; 5223 } else { 5224 list_add_tail(&adev->reset_list, &device_list); 5225 device_list_handle = &device_list; 5226 } 5227 5228 /* We need to lock reset domain only once both for XGMI and single device */ 5229 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5230 reset_list); 5231 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5232 5233 /* block all schedulers and reset given job's ring */ 5234 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5235 5236 amdgpu_device_set_mp1_state(tmp_adev); 5237 5238 /* 5239 * Try to put the audio codec into suspend state 5240 * before gpu reset started. 5241 * 5242 * Due to the power domain of the graphics device 5243 * is shared with AZ power domain. Without this, 5244 * we may change the audio hardware from behind 5245 * the audio driver's back. That will trigger 5246 * some audio codec errors. 5247 */ 5248 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5249 audio_suspended = true; 5250 5251 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5252 5253 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5254 5255 if (!amdgpu_sriov_vf(tmp_adev)) 5256 amdgpu_amdkfd_pre_reset(tmp_adev); 5257 5258 /* 5259 * Mark these ASICs to be reseted as untracked first 5260 * And add them back after reset completed 5261 */ 5262 amdgpu_unregister_gpu_instance(tmp_adev); 5263 5264 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5265 5266 /* disable ras on ALL IPs */ 5267 if (!need_emergency_restart && 5268 amdgpu_device_ip_need_full_reset(tmp_adev)) 5269 amdgpu_ras_suspend(tmp_adev); 5270 5271 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5272 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5273 5274 if (!ring || !ring->sched.thread) 5275 continue; 5276 5277 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5278 5279 if (need_emergency_restart) 5280 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5281 } 5282 atomic_inc(&tmp_adev->gpu_reset_counter); 5283 } 5284 5285 if (need_emergency_restart) 5286 goto skip_sched_resume; 5287 5288 /* 5289 * Must check guilty signal here since after this point all old 5290 * HW fences are force signaled. 5291 * 5292 * job->base holds a reference to parent fence 5293 */ 5294 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5295 job_signaled = true; 5296 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5297 goto skip_hw_reset; 5298 } 5299 5300 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5301 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5302 if (gpu_reset_for_dev_remove) { 5303 /* Workaroud for ASICs need to disable SMC first */ 5304 amdgpu_device_smu_fini_early(tmp_adev); 5305 } 5306 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5307 /*TODO Should we stop ?*/ 5308 if (r) { 5309 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5310 r, adev_to_drm(tmp_adev)->unique); 5311 tmp_adev->asic_reset_res = r; 5312 } 5313 5314 /* 5315 * Drop all pending non scheduler resets. 
Scheduler resets 5316 * were already dropped during drm_sched_stop 5317 */ 5318 amdgpu_device_stop_pending_resets(tmp_adev); 5319 } 5320 5321 /* Actual ASIC resets if needed.*/ 5322 /* Host driver will handle XGMI hive reset for SRIOV */ 5323 if (amdgpu_sriov_vf(adev)) { 5324 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5325 if (r) 5326 adev->asic_reset_res = r; 5327 5328 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5329 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) || 5330 adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3)) 5331 amdgpu_ras_resume(adev); 5332 } else { 5333 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5334 if (r && r == -EAGAIN) 5335 goto retry; 5336 5337 if (!r && gpu_reset_for_dev_remove) 5338 goto recover_end; 5339 } 5340 5341 skip_hw_reset: 5342 5343 /* Post ASIC reset for all devs .*/ 5344 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5345 5346 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5347 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5348 5349 if (!ring || !ring->sched.thread) 5350 continue; 5351 5352 drm_sched_start(&ring->sched, true); 5353 } 5354 5355 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3)) 5356 amdgpu_mes_self_test(tmp_adev); 5357 5358 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 5359 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5360 5361 if (tmp_adev->asic_reset_res) 5362 r = tmp_adev->asic_reset_res; 5363 5364 tmp_adev->asic_reset_res = 0; 5365 5366 if (r) { 5367 /* bad news, how to tell it to userspace ? */ 5368 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5369 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5370 } else { 5371 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5372 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5373 DRM_WARN("smart shift update failed\n"); 5374 } 5375 } 5376 5377 skip_sched_resume: 5378 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5379 /* unlock kfd: SRIOV would do it separately */ 5380 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5381 amdgpu_amdkfd_post_reset(tmp_adev); 5382 5383 /* kfd_post_reset will do nothing if kfd device is not initialized, 5384 * need to bring up kfd here if it's not be initialized before 5385 */ 5386 if (!adev->kfd.init_complete) 5387 amdgpu_amdkfd_device_init(adev); 5388 5389 if (audio_suspended) 5390 amdgpu_device_resume_display_audio(tmp_adev); 5391 5392 amdgpu_device_unset_mp1_state(tmp_adev); 5393 5394 amdgpu_ras_set_error_query_ready(tmp_adev, true); 5395 } 5396 5397 recover_end: 5398 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5399 reset_list); 5400 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5401 5402 if (hive) { 5403 mutex_unlock(&hive->hive_lock); 5404 amdgpu_put_xgmi_hive(hive); 5405 } 5406 5407 if (r) 5408 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5409 5410 atomic_set(&adev->reset_domain->reset_res, r); 5411 return r; 5412 } 5413 5414 /** 5415 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 5416 * 5417 * @adev: amdgpu_device pointer 5418 * 5419 * Fetchs and stores in the driver the PCIE capabilities (gen speed 5420 * and lanes) of the slot the device is in. Handles APUs and 5421 * virtualized environments where PCIE config space may not be available. 
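 *
 * Illustrative consumer sketch (hypothetical check, using the CAIL_* link
 * speed masks referenced below): code that wants to know whether a gen3
 * link may be requested would test the stored mask, e.g.
 *
 *	bool supports_gen3 =
 *		adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3;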
5422 */ 5423 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5424 { 5425 struct pci_dev *pdev; 5426 enum pci_bus_speed speed_cap, platform_speed_cap; 5427 enum pcie_link_width platform_link_width; 5428 5429 if (amdgpu_pcie_gen_cap) 5430 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5431 5432 if (amdgpu_pcie_lane_cap) 5433 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5434 5435 /* covers APUs as well */ 5436 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 5437 if (adev->pm.pcie_gen_mask == 0) 5438 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5439 if (adev->pm.pcie_mlw_mask == 0) 5440 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5441 return; 5442 } 5443 5444 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5445 return; 5446 5447 pcie_bandwidth_available(adev->pdev, NULL, 5448 &platform_speed_cap, &platform_link_width); 5449 5450 if (adev->pm.pcie_gen_mask == 0) { 5451 /* asic caps */ 5452 pdev = adev->pdev; 5453 speed_cap = pcie_get_speed_cap(pdev); 5454 if (speed_cap == PCI_SPEED_UNKNOWN) { 5455 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5456 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5457 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5458 } else { 5459 if (speed_cap == PCIE_SPEED_32_0GT) 5460 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5461 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5462 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5463 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5464 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5465 else if (speed_cap == PCIE_SPEED_16_0GT) 5466 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5467 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5468 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5469 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5470 else if (speed_cap == PCIE_SPEED_8_0GT) 5471 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5472 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5473 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5474 else if (speed_cap == PCIE_SPEED_5_0GT) 5475 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5476 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5477 else 5478 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5479 } 5480 /* platform caps */ 5481 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5482 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5483 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5484 } else { 5485 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5486 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5487 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5488 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5489 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5490 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5491 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5492 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5493 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5494 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5495 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5496 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5497 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5498 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5499 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5500 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5501 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5502 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5503 else 5504 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5505 5506 } 5507 } 5508 if (adev->pm.pcie_mlw_mask == 0) { 5509 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) 
{ 5510 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5511 } else { 5512 switch (platform_link_width) { 5513 case PCIE_LNK_X32: 5514 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5515 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5516 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5517 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5518 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5519 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5520 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5521 break; 5522 case PCIE_LNK_X16: 5523 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5524 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5525 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5526 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5527 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5528 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5529 break; 5530 case PCIE_LNK_X12: 5531 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5532 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5533 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5534 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5535 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5536 break; 5537 case PCIE_LNK_X8: 5538 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5539 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5540 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5541 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5542 break; 5543 case PCIE_LNK_X4: 5544 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5545 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5546 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5547 break; 5548 case PCIE_LNK_X2: 5549 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5550 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5551 break; 5552 case PCIE_LNK_X1: 5553 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5554 break; 5555 default: 5556 break; 5557 } 5558 } 5559 } 5560 } 5561 5562 /** 5563 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 5564 * 5565 * @adev: amdgpu_device pointer 5566 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 5567 * 5568 * Return true if @peer_adev can access (DMA) @adev through the PCIe 5569 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 5570 * @peer_adev. 5571 */ 5572 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 5573 struct amdgpu_device *peer_adev) 5574 { 5575 #ifdef CONFIG_HSA_AMD_P2P 5576 uint64_t address_mask = peer_adev->dev->dma_mask ? 
5577 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 5578 resource_size_t aper_limit = 5579 adev->gmc.aper_base + adev->gmc.aper_size - 1; 5580 bool p2p_access = 5581 !adev->gmc.xgmi.connected_to_cpu && 5582 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 5583 5584 return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size && 5585 adev->gmc.real_vram_size == adev->gmc.visible_vram_size && 5586 !(adev->gmc.aper_base & address_mask || 5587 aper_limit & address_mask)); 5588 #else 5589 return false; 5590 #endif 5591 } 5592 5593 int amdgpu_device_baco_enter(struct drm_device *dev) 5594 { 5595 struct amdgpu_device *adev = drm_to_adev(dev); 5596 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5597 5598 if (!amdgpu_device_supports_baco(dev)) 5599 return -ENOTSUPP; 5600 5601 if (ras && adev->ras_enabled && 5602 adev->nbio.funcs->enable_doorbell_interrupt) 5603 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5604 5605 return amdgpu_dpm_baco_enter(adev); 5606 } 5607 5608 int amdgpu_device_baco_exit(struct drm_device *dev) 5609 { 5610 struct amdgpu_device *adev = drm_to_adev(dev); 5611 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5612 int ret = 0; 5613 5614 if (!amdgpu_device_supports_baco(dev)) 5615 return -ENOTSUPP; 5616 5617 ret = amdgpu_dpm_baco_exit(adev); 5618 if (ret) 5619 return ret; 5620 5621 if (ras && adev->ras_enabled && 5622 adev->nbio.funcs->enable_doorbell_interrupt) 5623 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5624 5625 if (amdgpu_passthrough(adev) && 5626 adev->nbio.funcs->clear_doorbell_interrupt) 5627 adev->nbio.funcs->clear_doorbell_interrupt(adev); 5628 5629 return 0; 5630 } 5631 5632 /** 5633 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5634 * @pdev: PCI device struct 5635 * @state: PCI channel state 5636 * 5637 * Description: Called when a PCI error is detected. 5638 * 5639 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
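 *
 * Illustrative hook-up sketch (hypothetical table name; the driver's real
 * table lives outside this file): these callbacks are wired into the PCI
 * core through a struct pci_error_handlers, e.g.
 *
 *	static const struct pci_error_handlers example_err_handler = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};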
int amdgpu_device_baco_exit(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	ret = amdgpu_dpm_baco_exit(adev);
	if (ret)
		return ret;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

	if (amdgpu_passthrough(adev) &&
	    adev->nbio.funcs->clear_doorbell_interrupt)
		adev->nbio.funcs->clear_doorbell_interrupt(adev);

	return 0;
}

/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called when a PCI error is detected.
 *
 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		DRM_WARN("No support for XGMI hive yet...");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Locking adev->reset_domain->sem will prevent any external access
		 * to GPU during PCI error recovery
		 */
		amdgpu_device_lock_reset_domain(adev->reset_domain);
		amdgpu_device_set_mp1_state(adev);

		/*
		 * Block any work scheduling as we do for regular GPU reset
		 * for the duration of the recovery
		 */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, NULL);
		}
		atomic_inc(&adev->gpu_reset_counter);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{
	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/* This is called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, no need to reset slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the PCI error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for asic to come out of reset */
	msleep(500);

	/* Restore PCI config space */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d\n", r);
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's
 * OK to resume normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		drm_sched_start(&ring->sched, true);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
}

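/*
 * The four callbacks above implement the PCI error recovery flow
 * (error_detected -> mmio_enabled / slot_reset -> resume).  They are
 * wired up through a struct pci_error_handlers in the driver's
 * pci_driver definition; as a rough sketch (field names come from the
 * PCI core, the actual registration lives in amdgpu_drv.c):
 *
 *   static const struct pci_error_handlers amdgpu_pci_err_handler = {
 *       .error_detected = amdgpu_pci_error_detected,
 *       .mmio_enabled   = amdgpu_pci_mmio_enabled,
 *       .slot_reset     = amdgpu_pci_slot_reset,
 *       .resume         = amdgpu_pci_resume,
 *   };
 */
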
/**
 * amdgpu_device_cache_pci_state - save the PCI config space of the device
 *
 * @pdev: PCI device struct
 *
 * Saves the current PCI configuration space and keeps a copy in
 * adev->pci_state so it can be loaded back after a reset.
 *
 * Returns true on success, false on failure.
 */
bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

/**
 * amdgpu_device_load_pci_state - restore the cached PCI config space
 *
 * @pdev: PCI device struct
 *
 * Loads the PCI configuration space previously cached by
 * amdgpu_device_cache_pci_state() and restores it to the device.
 *
 * Returns true on success, false on failure.
 */
bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

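/**
 * amdgpu_device_flush_hdp - flush the HDP (Host Data Path) cache
 *
 * @adev: amdgpu_device pointer
 * @ring: optional ring on which to emit the flush
 *
 * Flushes writes that went through the HDP path so they become visible
 * to the GPU.  Skipped on APUs (unless running in passthrough) and on
 * ASICs connected to the CPU over XGMI, where no HDP flush is needed.
 */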
void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
			     struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

/**
 * amdgpu_device_invalidate_hdp - invalidate the HDP (Host Data Path) cache
 *
 * @adev: amdgpu_device pointer
 * @ring: optional ring the invalidation is done for
 *
 * Counterpart of amdgpu_device_flush_hdp(); skipped under the same
 * conditions (APU without passthrough, or XGMI connection to the CPU).
 */
void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
				  struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. This helps preserve the error context when an error occurs.
 * Compared to a simple hang, the system stays stable at least enough for
 * SSH access, so it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs,
 *    etc.), clears all CPU mappings to the device and disallows remapping
 *    through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in-flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

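/**
 * amdgpu_device_pcie_port_rreg - read a PCIe port register
 *
 * @adev: amdgpu_device pointer
 * @reg: register dword offset
 *
 * Reads a PCIe port register indirectly through the NBIO index/data
 * pair, serialized by adev->pcie_idx_lock.
 *
 * Returns the register value.
 */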
u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

/**
 * amdgpu_device_pcie_port_wreg - write a PCIe port register
 *
 * @adev: amdgpu_device pointer
 * @reg: register dword offset
 * @v: value to write
 *
 * Writes a PCIe port register indirectly through the NBIO index/data
 * pair, serialized by adev->pcie_idx_lock.
 */
void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

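/**
 * amdgpu_device_has_display_hardware - check whether the ASIC has a display block
 *
 * @adev: amdgpu_device pointer
 *
 * Returns true if the ASIC has (non-harvested) display hardware, false
 * otherwise.  Older ASICs are handled by an explicit list; newer ones
 * rely on IP discovery and the DMU harvest mask.
 */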
bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!adev->ip_versions[DCE_HWIP][0] ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

/**
 * amdgpu_device_wait_on_rreg - poll a register until it reaches an expected value
 *
 * @adev: amdgpu_device pointer
 * @inst: instance number, used only for the warning message
 * @reg_addr: register offset to poll
 * @reg_name: register name, used only for the warning message
 * @expected_value: value the masked register is expected to reach
 * @mask: mask applied to the register value before comparison
 *
 * Polls @reg_addr until (value & @mask) == @expected_value, restarting
 * the timeout whenever the value changes.
 *
 * Returns 0 on success, -ETIMEDOUT if the register did not reach the
 * expected value within adev->usec_timeout iterations.
 */
uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
				    uint32_t inst, uint32_t reg_addr, char reg_name[],
				    uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				 inst, reg_name, (uint32_t)expected_value,
				 (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}

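/*
 * Illustrative usage sketch for amdgpu_device_wait_on_rreg(); the register
 * offset, name, expected value and mask below are hypothetical and not taken
 * from any real IP block:
 *
 *   ret = amdgpu_device_wait_on_rreg(adev, 0, hypothetical_status_offset,
 *                                    "HYPOTHETICAL_STATUS", 0x1, 0x1);
 *   if (ret)
 *       return ret;
 */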