/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/devcoredump.h>
#include <generated/utsrelease.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_aperture.h>
#include <drm/drm_atomic_helper.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_fb_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86)
#include <asm/intel-family.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
"KABINI", 111 "HAWAII", 112 "MULLINS", 113 "TOPAZ", 114 "TONGA", 115 "FIJI", 116 "CARRIZO", 117 "STONEY", 118 "POLARIS10", 119 "POLARIS11", 120 "POLARIS12", 121 "VEGAM", 122 "VEGA10", 123 "VEGA12", 124 "VEGA20", 125 "RAVEN", 126 "ARCTURUS", 127 "RENOIR", 128 "ALDEBARAN", 129 "NAVI10", 130 "CYAN_SKILLFISH", 131 "NAVI14", 132 "NAVI12", 133 "SIENNA_CICHLID", 134 "NAVY_FLOUNDER", 135 "VANGOGH", 136 "DIMGREY_CAVEFISH", 137 "BEIGE_GOBY", 138 "YELLOW_CARP", 139 "IP DISCOVERY", 140 "LAST", 141 }; 142 143 /** 144 * DOC: pcie_replay_count 145 * 146 * The amdgpu driver provides a sysfs API for reporting the total number 147 * of PCIe replays (NAKs) 148 * The file pcie_replay_count is used for this and returns the total 149 * number of replays as a sum of the NAKs generated and NAKs received 150 */ 151 152 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 153 struct device_attribute *attr, char *buf) 154 { 155 struct drm_device *ddev = dev_get_drvdata(dev); 156 struct amdgpu_device *adev = drm_to_adev(ddev); 157 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 158 159 return sysfs_emit(buf, "%llu\n", cnt); 160 } 161 162 static DEVICE_ATTR(pcie_replay_count, 0444, 163 amdgpu_device_get_pcie_replay_count, NULL); 164 165 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 166 167 /** 168 * DOC: product_name 169 * 170 * The amdgpu driver provides a sysfs API for reporting the product name 171 * for the device 172 * The file product_name is used for this and returns the product name 173 * as returned from the FRU. 174 * NOTE: This is only available for certain server cards 175 */ 176 177 static ssize_t amdgpu_device_get_product_name(struct device *dev, 178 struct device_attribute *attr, char *buf) 179 { 180 struct drm_device *ddev = dev_get_drvdata(dev); 181 struct amdgpu_device *adev = drm_to_adev(ddev); 182 183 return sysfs_emit(buf, "%s\n", adev->product_name); 184 } 185 186 static DEVICE_ATTR(product_name, 0444, 187 amdgpu_device_get_product_name, NULL); 188 189 /** 190 * DOC: product_number 191 * 192 * The amdgpu driver provides a sysfs API for reporting the part number 193 * for the device 194 * The file product_number is used for this and returns the part number 195 * as returned from the FRU. 196 * NOTE: This is only available for certain server cards 197 */ 198 199 static ssize_t amdgpu_device_get_product_number(struct device *dev, 200 struct device_attribute *attr, char *buf) 201 { 202 struct drm_device *ddev = dev_get_drvdata(dev); 203 struct amdgpu_device *adev = drm_to_adev(ddev); 204 205 return sysfs_emit(buf, "%s\n", adev->product_number); 206 } 207 208 static DEVICE_ATTR(product_number, 0444, 209 amdgpu_device_get_product_number, NULL); 210 211 /** 212 * DOC: serial_number 213 * 214 * The amdgpu driver provides a sysfs API for reporting the serial number 215 * for the device 216 * The file serial_number is used for this and returns the serial number 217 * as returned from the FRU. 

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * DOC: product_name
 *
 * The amdgpu driver provides a sysfs API for reporting the product name
 * for the device.
 * The file product_name is used for this and returns the product name
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_name(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->product_name);
}

static DEVICE_ATTR(product_name, 0444,
		   amdgpu_device_get_product_name, NULL);

/**
 * DOC: product_number
 *
 * The amdgpu driver provides a sysfs API for reporting the part number
 * for the device.
 * The file product_number is used for this and returns the part number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->product_number);
}

static DEVICE_ATTR(product_number, 0444,
		   amdgpu_device_get_product_number, NULL);

/**
 * DOC: serial_number
 *
 * The amdgpu driver provides a sysfs API for reporting the serial number
 * for the device.
 * The file serial_number is used for this and returns the serial number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_serial_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->serial);
}

static DEVICE_ATTR(serial_number, 0444,
		   amdgpu_device_get_serial_number, NULL);

/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise return false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the VRAM aperture to access VRAM first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM access for the rest of VRAM */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}
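
/*
 * Example (illustrative sketch, not taken from the driver): reading a single
 * dword at VRAM offset 0x1000 with the helper above. The helper uses the CPU
 * visible VRAM aperture when possible and falls back to the
 * MM_INDEX/MM_DATA path for the rest.
 *
 *	uint32_t val;
 *
 *	amdgpu_device_vram_access(adev, 0x1000, &val, sizeof(val), false);
 */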

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}
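
/*
 * Example (illustrative, hedged): most driver code does not call
 * amdgpu_device_rreg()/amdgpu_device_wreg() directly but goes through the
 * RREG32()/WREG32() style wrappers from amdgpu.h, which supply the access
 * flags, e.g.
 *
 *	tmp = RREG32(mmFOO_CNTL);
 *	tmp |= FOO_ENABLE_BIT;
 *	WREG32(mmFOO_CNTL, tmp);
 *
 * mmFOO_CNTL and FOO_ENABLE_BIT are made-up names for illustration only.
 */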

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * this function is invoked only for the debugfs register access
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if (adev->nbio.funcs->get_pcie_index_hi_offset)
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64 bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if (adev->nbio.funcs->get_pcie_index_hi_offset)
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64 bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
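
/*
 * Example (illustrative, hedged): ASIC setup code typically exposes the
 * indirect accessors above through the per-device callbacks so that the
 * RREG32_PCIE()/WREG32_PCIE() style macros work, along the lines of:
 *
 *	adev->pcie_rreg = &amdgpu_device_indirect_rreg;
 *	adev->pcie_wreg = &amdgpu_device_indirect_wreg;
 *	adev->pcie_rreg64 = &amdgpu_device_indirect_rreg64;
 *	adev->pcie_wreg64 = &amdgpu_device_indirect_wreg64;
 *
 * The exact wiring differs per ASIC; see the soc/nv/vi init code for the
 * real assignments.
 */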

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	amdgpu_asic_pre_asic_init(adev);

	if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) ||
	    adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0))
		return amdgpu_atomfirmware_asic_init(adev, true);
	else
		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}
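
/*
 * Example (illustrative sketch): a golden register list is laid out as
 * {register, AND mask, OR mask} triplets and handed to the helper above,
 * roughly like:
 *
 *	static const u32 golden_settings_example[] = {
 *		mmFOO_CNTL, 0xffffffff, 0x00000001,
 *		mmBAR_CONFIG, 0x0000ff00, 0x00003400,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, golden_settings_example,
 *						ARRAY_SIZE(golden_settings_example));
 *
 * mmFOO_CNTL/mmBAR_CONFIG and the mask values are made up for illustration;
 * see the per-ASIC golden_settings tables for real entries.
 */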

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}
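
/*
 * Example (illustrative sketch, hedged): typical users allocate a slot,
 * derive the GPU and CPU addresses from it for a ring or fence, and free
 * it on teardown, e.g.
 *
 *	u32 wb;
 *
 *	if (!amdgpu_device_wb_get(adev, &wb)) {
 *		u64 wb_gpu_addr = adev->wb.gpu_addr + (wb * 4);
 *		volatile u32 *wb_cpu_addr = &adev->wb.wb[wb];
 *
 *		... point ring/fence hardware at wb_gpu_addr ...
 *
 *		amdgpu_device_wb_free(adev, wb);
 *	}
 */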

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned int i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return false;

	return true;
}

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw needs post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if post is needed or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (!amdgpu_device_read_bios(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old smc fw still need driver do vPost otherwise gpu hang, while
		 * those smc fw version above 22.15 doesn't have this flaw, so we force
		 * vpost executed for smc version below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if an error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->gmc.xgmi.pending_reset)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * On APUs with >= 64GB of RAM, white flickering has been observed w/ SG enabled.
 * Disable S/G on such systems until we have a proper fix.
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2354
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2735
 */
bool amdgpu_sg_display_supported(struct amdgpu_device *adev)
{
	switch (amdgpu_sg_display) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	if ((totalram_pages() << (PAGE_SHIFT - 10)) +
	    (adev->gmc.real_vram_size / 1024) >= 64000000) {
		DRM_WARN("Disabling S/G due to >=64GB RAM\n");
		return false;
	}
	return true;
}

/*
 * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic
 * speed switching. Until we have confirmation from Intel that a specific host
 * supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
bool amdgpu_device_pcie_dynamic_switching_supported(void)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	return pcie_aspm_enabled(adev->pdev);
}

bool amdgpu_device_aspm_support_quirk(void)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE);
#else
	return true;
#endif
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
		bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines number of bits in page table versus page directory,
 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
 * page table and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory
	 */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if ((adev->pdev->device == 0x13FE) ||
		    (adev->pdev->device == 0x143F))
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		break;
	}

	return 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
		dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
		amdgpu_reset_method = -1;
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	return 0;
}

/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(pdev, PCI_D0);
		amdgpu_device_load_pci_state(pdev);
		r = pci_enable_device(pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
	} else {
		pr_info("switched off\n");
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(pdev);
		/* Shut down the device */
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}
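
/*
 * Example (illustrative, hedged): power management code gates or ungates the
 * clocks of one IP type roughly like this:
 *
 *	amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *					       AMD_CG_STATE_GATE);
 *
 * AMD_CG_STATE_UNGATE reverses it; the powergating variant below takes
 * AMD_PG_STATE_GATE/AMD_PG_STATE_UNGATE instead.
 */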

/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u64 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if equal or greater
 * return 1 if smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	switch (ip_block_version->type) {
	case AMD_IP_BLOCK_TYPE_VCN:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
			return 0;
		break;
	case AMD_IP_BLOCK_TYPE_JPEG:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
			return 0;
		break;
	default:
		break;
	}

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}
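
/*
 * Example (illustrative sketch, hedged): ASIC setup code registers its IP
 * blocks with the helper above and can gate features on a minimum IP
 * version, e.g.
 *
 *	r = amdgpu_device_ip_block_add(adev, &gmc_v8_0_ip_block);
 *	if (r)
 *		return r;
 *
 *	if (!amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX, 8, 0))
 *		... GFX IP is version 8.0 or newer ...
 *
 * gmc_v8_0_ip_block is just an example block; see the per-ASIC files
 * (vi.c, cik.c, soc15.c, ...) for the real set_ip_blocks sequences.
 */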
1902 */ 1903 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 1904 { 1905 adev->enable_virtual_display = false; 1906 1907 if (amdgpu_virtual_display) { 1908 const char *pci_address_name = pci_name(adev->pdev); 1909 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 1910 1911 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 1912 pciaddstr_tmp = pciaddstr; 1913 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 1914 pciaddname = strsep(&pciaddname_tmp, ","); 1915 if (!strcmp("all", pciaddname) 1916 || !strcmp(pci_address_name, pciaddname)) { 1917 long num_crtc; 1918 int res = -1; 1919 1920 adev->enable_virtual_display = true; 1921 1922 if (pciaddname_tmp) 1923 res = kstrtol(pciaddname_tmp, 10, 1924 &num_crtc); 1925 1926 if (!res) { 1927 if (num_crtc < 1) 1928 num_crtc = 1; 1929 if (num_crtc > 6) 1930 num_crtc = 6; 1931 adev->mode_info.num_crtc = num_crtc; 1932 } else { 1933 adev->mode_info.num_crtc = 1; 1934 } 1935 break; 1936 } 1937 } 1938 1939 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 1940 amdgpu_virtual_display, pci_address_name, 1941 adev->enable_virtual_display, adev->mode_info.num_crtc); 1942 1943 kfree(pciaddstr); 1944 } 1945 } 1946 1947 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 1948 { 1949 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 1950 adev->mode_info.num_crtc = 1; 1951 adev->enable_virtual_display = true; 1952 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 1953 adev->enable_virtual_display, adev->mode_info.num_crtc); 1954 } 1955 } 1956 1957 /** 1958 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 1959 * 1960 * @adev: amdgpu_device pointer 1961 * 1962 * Parses the asic configuration parameters specified in the gpu info 1963 * firmware and makes them availale to the driver for use in configuring 1964 * the asic. 1965 * Returns 0 on success, -EINVAL on failure. 1966 */ 1967 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 1968 { 1969 const char *chip_name; 1970 char fw_name[40]; 1971 int err; 1972 const struct gpu_info_firmware_header_v1_0 *hdr; 1973 1974 adev->firmware.gpu_info_fw = NULL; 1975 1976 if (adev->mman.discovery_bin) { 1977 /* 1978 * FIXME: The bounding box is still needed by Navi12, so 1979 * temporarily read it from gpu_info firmware. Should be dropped 1980 * when DAL no longer needs it. 
1981 */ 1982 if (adev->asic_type != CHIP_NAVI12) 1983 return 0; 1984 } 1985 1986 switch (adev->asic_type) { 1987 default: 1988 return 0; 1989 case CHIP_VEGA10: 1990 chip_name = "vega10"; 1991 break; 1992 case CHIP_VEGA12: 1993 chip_name = "vega12"; 1994 break; 1995 case CHIP_RAVEN: 1996 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1997 chip_name = "raven2"; 1998 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1999 chip_name = "picasso"; 2000 else 2001 chip_name = "raven"; 2002 break; 2003 case CHIP_ARCTURUS: 2004 chip_name = "arcturus"; 2005 break; 2006 case CHIP_NAVI12: 2007 chip_name = "navi12"; 2008 break; 2009 } 2010 2011 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 2012 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name); 2013 if (err) { 2014 dev_err(adev->dev, 2015 "Failed to get gpu_info firmware \"%s\"\n", 2016 fw_name); 2017 goto out; 2018 } 2019 2020 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2021 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2022 2023 switch (hdr->version_major) { 2024 case 1: 2025 { 2026 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2027 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2028 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2029 2030 /* 2031 * Should be droped when DAL no longer needs it. 2032 */ 2033 if (adev->asic_type == CHIP_NAVI12) 2034 goto parse_soc_bounding_box; 2035 2036 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2037 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2038 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2039 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2040 adev->gfx.config.max_texture_channel_caches = 2041 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2042 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2043 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2044 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2045 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2046 adev->gfx.config.double_offchip_lds_buf = 2047 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2048 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2049 adev->gfx.cu_info.max_waves_per_simd = 2050 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2051 adev->gfx.cu_info.max_scratch_slots_per_cu = 2052 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2053 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2054 if (hdr->version_minor >= 1) { 2055 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2056 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2057 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2058 adev->gfx.config.num_sc_per_sh = 2059 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2060 adev->gfx.config.num_packer_per_sc = 2061 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2062 } 2063 2064 parse_soc_bounding_box: 2065 /* 2066 * soc bounding box info is not integrated in disocovery table, 2067 * we always need to parse it from gpu info firmware if needed. 
2068 */ 2069 if (hdr->version_minor == 2) { 2070 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2071 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2072 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2073 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2074 } 2075 break; 2076 } 2077 default: 2078 dev_err(adev->dev, 2079 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2080 err = -EINVAL; 2081 goto out; 2082 } 2083 out: 2084 return err; 2085 } 2086 2087 /** 2088 * amdgpu_device_ip_early_init - run early init for hardware IPs 2089 * 2090 * @adev: amdgpu_device pointer 2091 * 2092 * Early initialization pass for hardware IPs. The hardware IPs that make 2093 * up each asic are discovered each IP's early_init callback is run. This 2094 * is the first stage in initializing the asic. 2095 * Returns 0 on success, negative error code on failure. 2096 */ 2097 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2098 { 2099 struct drm_device *dev = adev_to_drm(adev); 2100 struct pci_dev *parent; 2101 int i, r; 2102 bool total; 2103 2104 amdgpu_device_enable_virtual_display(adev); 2105 2106 if (amdgpu_sriov_vf(adev)) { 2107 r = amdgpu_virt_request_full_gpu(adev, true); 2108 if (r) 2109 return r; 2110 } 2111 2112 switch (adev->asic_type) { 2113 #ifdef CONFIG_DRM_AMDGPU_SI 2114 case CHIP_VERDE: 2115 case CHIP_TAHITI: 2116 case CHIP_PITCAIRN: 2117 case CHIP_OLAND: 2118 case CHIP_HAINAN: 2119 adev->family = AMDGPU_FAMILY_SI; 2120 r = si_set_ip_blocks(adev); 2121 if (r) 2122 return r; 2123 break; 2124 #endif 2125 #ifdef CONFIG_DRM_AMDGPU_CIK 2126 case CHIP_BONAIRE: 2127 case CHIP_HAWAII: 2128 case CHIP_KAVERI: 2129 case CHIP_KABINI: 2130 case CHIP_MULLINS: 2131 if (adev->flags & AMD_IS_APU) 2132 adev->family = AMDGPU_FAMILY_KV; 2133 else 2134 adev->family = AMDGPU_FAMILY_CI; 2135 2136 r = cik_set_ip_blocks(adev); 2137 if (r) 2138 return r; 2139 break; 2140 #endif 2141 case CHIP_TOPAZ: 2142 case CHIP_TONGA: 2143 case CHIP_FIJI: 2144 case CHIP_POLARIS10: 2145 case CHIP_POLARIS11: 2146 case CHIP_POLARIS12: 2147 case CHIP_VEGAM: 2148 case CHIP_CARRIZO: 2149 case CHIP_STONEY: 2150 if (adev->flags & AMD_IS_APU) 2151 adev->family = AMDGPU_FAMILY_CZ; 2152 else 2153 adev->family = AMDGPU_FAMILY_VI; 2154 2155 r = vi_set_ip_blocks(adev); 2156 if (r) 2157 return r; 2158 break; 2159 default: 2160 r = amdgpu_discovery_set_ip_blocks(adev); 2161 if (r) 2162 return r; 2163 break; 2164 } 2165 2166 if (amdgpu_has_atpx() && 2167 (amdgpu_is_atpx_hybrid() || 2168 amdgpu_has_atpx_dgpu_power_cntl()) && 2169 ((adev->flags & AMD_IS_APU) == 0) && 2170 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev))) 2171 adev->flags |= AMD_IS_PX; 2172 2173 if (!(adev->flags & AMD_IS_APU)) { 2174 parent = pci_upstream_bridge(adev->pdev); 2175 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2176 } 2177 2178 2179 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2180 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2181 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2182 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2183 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2184 2185 total = true; 2186 for (i = 0; i < adev->num_ip_blocks; i++) { 2187 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2188 DRM_WARN("disabled ip block: %d <%s>\n", 2189 i, adev->ip_blocks[i].version->funcs->name); 2190 adev->ip_blocks[i].status.valid = false; 2191 } else { 2192 if (adev->ip_blocks[i].version->funcs->early_init) { 2193 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2194 if (r == -ENOENT) { 2195 adev->ip_blocks[i].status.valid = false; 2196 } else if (r) { 2197 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2198 adev->ip_blocks[i].version->funcs->name, r); 2199 total = false; 2200 } else { 2201 adev->ip_blocks[i].status.valid = true; 2202 } 2203 } else { 2204 adev->ip_blocks[i].status.valid = true; 2205 } 2206 } 2207 /* get the vbios after the asic_funcs are set up */ 2208 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2209 r = amdgpu_device_parse_gpu_info_fw(adev); 2210 if (r) 2211 return r; 2212 2213 /* Read BIOS */ 2214 if (amdgpu_device_read_bios(adev)) { 2215 if (!amdgpu_get_bios(adev)) 2216 return -EINVAL; 2217 2218 r = amdgpu_atombios_init(adev); 2219 if (r) { 2220 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2221 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2222 return r; 2223 } 2224 } 2225 2226 /*get pf2vf msg info at it's earliest time*/ 2227 if (amdgpu_sriov_vf(adev)) 2228 amdgpu_virt_init_data_exchange(adev); 2229 2230 } 2231 } 2232 if (!total) 2233 return -ENODEV; 2234 2235 amdgpu_amdkfd_device_probe(adev); 2236 adev->cg_flags &= amdgpu_cg_mask; 2237 adev->pg_flags &= amdgpu_pg_mask; 2238 2239 return 0; 2240 } 2241 2242 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2243 { 2244 int i, r; 2245 2246 for (i = 0; i < adev->num_ip_blocks; i++) { 2247 if (!adev->ip_blocks[i].status.sw) 2248 continue; 2249 if (adev->ip_blocks[i].status.hw) 2250 continue; 2251 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2252 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2253 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2254 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2255 if (r) { 2256 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2257 adev->ip_blocks[i].version->funcs->name, r); 2258 return r; 2259 } 2260 adev->ip_blocks[i].status.hw = true; 2261 } 2262 } 2263 2264 return 0; 2265 } 2266 2267 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2268 { 2269 int i, r; 2270 2271 for (i = 0; i < adev->num_ip_blocks; i++) { 2272 if (!adev->ip_blocks[i].status.sw) 2273 continue; 2274 if (adev->ip_blocks[i].status.hw) 2275 continue; 2276 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2277 if (r) { 2278 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2279 adev->ip_blocks[i].version->funcs->name, r); 2280 return r; 2281 } 2282 adev->ip_blocks[i].status.hw = true; 2283 } 2284 2285 return 0; 2286 } 2287 2288 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2289 { 2290 int r = 0; 2291 int i; 2292 uint32_t smu_version; 2293 2294 if (adev->asic_type >= CHIP_VEGA10) { 2295 for (i = 0; i < adev->num_ip_blocks; i++) { 2296 if 
(adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2297 continue; 2298 2299 if (!adev->ip_blocks[i].status.sw) 2300 continue; 2301 2302 /* no need to do the fw loading again if already done*/ 2303 if (adev->ip_blocks[i].status.hw == true) 2304 break; 2305 2306 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2307 r = adev->ip_blocks[i].version->funcs->resume(adev); 2308 if (r) { 2309 DRM_ERROR("resume of IP block <%s> failed %d\n", 2310 adev->ip_blocks[i].version->funcs->name, r); 2311 return r; 2312 } 2313 } else { 2314 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2315 if (r) { 2316 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2317 adev->ip_blocks[i].version->funcs->name, r); 2318 return r; 2319 } 2320 } 2321 2322 adev->ip_blocks[i].status.hw = true; 2323 break; 2324 } 2325 } 2326 2327 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2328 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2329 2330 return r; 2331 } 2332 2333 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2334 { 2335 long timeout; 2336 int r, i; 2337 2338 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2339 struct amdgpu_ring *ring = adev->rings[i]; 2340 2341 /* No need to setup the GPU scheduler for rings that don't need it */ 2342 if (!ring || ring->no_scheduler) 2343 continue; 2344 2345 switch (ring->funcs->type) { 2346 case AMDGPU_RING_TYPE_GFX: 2347 timeout = adev->gfx_timeout; 2348 break; 2349 case AMDGPU_RING_TYPE_COMPUTE: 2350 timeout = adev->compute_timeout; 2351 break; 2352 case AMDGPU_RING_TYPE_SDMA: 2353 timeout = adev->sdma_timeout; 2354 break; 2355 default: 2356 timeout = adev->video_timeout; 2357 break; 2358 } 2359 2360 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2361 ring->num_hw_submission, 0, 2362 timeout, adev->reset_domain->wq, 2363 ring->sched_score, ring->name, 2364 adev->dev); 2365 if (r) { 2366 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2367 ring->name); 2368 return r; 2369 } 2370 } 2371 2372 amdgpu_xcp_update_partition_sched_list(adev); 2373 2374 return 0; 2375 } 2376 2377 2378 /** 2379 * amdgpu_device_ip_init - run init for hardware IPs 2380 * 2381 * @adev: amdgpu_device pointer 2382 * 2383 * Main initialization pass for hardware IPs. The list of all the hardware 2384 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2385 * are run. sw_init initializes the software state associated with each IP 2386 * and hw_init initializes the hardware associated with each IP. 2387 * Returns 0 on success, negative error code on failure. 
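 *
 * In rough outline, the sequence implemented below is:
 *   1. RAS init, then sw_init for every valid IP block (COMMON and GMC also
 *      run hw_init early so that register access and GPU memory are usable),
 *   2. IB pool and ucode buffer object creation,
 *   3. hw_init phase 1 (COMMON/IH, plus PSP under SR-IOV),
 *   4. firmware loading,
 *   5. hw_init phase 2 for the remaining blocks,
 *   6. RAS recovery init, XGMI reset-domain handling, scheduler init and
 *      KFD init.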
2388 */ 2389 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2390 { 2391 int i, r; 2392 2393 r = amdgpu_ras_init(adev); 2394 if (r) 2395 return r; 2396 2397 for (i = 0; i < adev->num_ip_blocks; i++) { 2398 if (!adev->ip_blocks[i].status.valid) 2399 continue; 2400 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2401 if (r) { 2402 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2403 adev->ip_blocks[i].version->funcs->name, r); 2404 goto init_failed; 2405 } 2406 adev->ip_blocks[i].status.sw = true; 2407 2408 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2409 /* need to do common hw init early so everything is set up for gmc */ 2410 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2411 if (r) { 2412 DRM_ERROR("hw_init %d failed %d\n", i, r); 2413 goto init_failed; 2414 } 2415 adev->ip_blocks[i].status.hw = true; 2416 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2417 /* need to do gmc hw init early so we can allocate gpu mem */ 2418 /* Try to reserve bad pages early */ 2419 if (amdgpu_sriov_vf(adev)) 2420 amdgpu_virt_exchange_data(adev); 2421 2422 r = amdgpu_device_mem_scratch_init(adev); 2423 if (r) { 2424 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2425 goto init_failed; 2426 } 2427 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2428 if (r) { 2429 DRM_ERROR("hw_init %d failed %d\n", i, r); 2430 goto init_failed; 2431 } 2432 r = amdgpu_device_wb_init(adev); 2433 if (r) { 2434 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2435 goto init_failed; 2436 } 2437 adev->ip_blocks[i].status.hw = true; 2438 2439 /* right after GMC hw init, we create CSA */ 2440 if (adev->gfx.mcbp) { 2441 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2442 AMDGPU_GEM_DOMAIN_VRAM | 2443 AMDGPU_GEM_DOMAIN_GTT, 2444 AMDGPU_CSA_SIZE); 2445 if (r) { 2446 DRM_ERROR("allocate CSA failed %d\n", r); 2447 goto init_failed; 2448 } 2449 } 2450 } 2451 } 2452 2453 if (amdgpu_sriov_vf(adev)) 2454 amdgpu_virt_init_data_exchange(adev); 2455 2456 r = amdgpu_ib_pool_init(adev); 2457 if (r) { 2458 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2459 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2460 goto init_failed; 2461 } 2462 2463 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2464 if (r) 2465 goto init_failed; 2466 2467 r = amdgpu_device_ip_hw_init_phase1(adev); 2468 if (r) 2469 goto init_failed; 2470 2471 r = amdgpu_device_fw_loading(adev); 2472 if (r) 2473 goto init_failed; 2474 2475 r = amdgpu_device_ip_hw_init_phase2(adev); 2476 if (r) 2477 goto init_failed; 2478 2479 /* 2480 * retired pages will be loaded from eeprom and reserved here, 2481 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2482 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2483 * for I2C communication which only true at this point. 2484 * 2485 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2486 * failure from bad gpu situation and stop amdgpu init process 2487 * accordingly. For other failed cases, it will still release all 2488 * the resource and print error message, rather than returning one 2489 * negative value to upper level. 
2490 * 2491 * Note: theoretically, this should be called before all vram allocations 2492 * to protect retired page from abusing 2493 */ 2494 r = amdgpu_ras_recovery_init(adev); 2495 if (r) 2496 goto init_failed; 2497 2498 /** 2499 * In case of XGMI grab extra reference for reset domain for this device 2500 */ 2501 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2502 if (amdgpu_xgmi_add_device(adev) == 0) { 2503 if (!amdgpu_sriov_vf(adev)) { 2504 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2505 2506 if (WARN_ON(!hive)) { 2507 r = -ENOENT; 2508 goto init_failed; 2509 } 2510 2511 if (!hive->reset_domain || 2512 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2513 r = -ENOENT; 2514 amdgpu_put_xgmi_hive(hive); 2515 goto init_failed; 2516 } 2517 2518 /* Drop the early temporary reset domain we created for device */ 2519 amdgpu_reset_put_reset_domain(adev->reset_domain); 2520 adev->reset_domain = hive->reset_domain; 2521 amdgpu_put_xgmi_hive(hive); 2522 } 2523 } 2524 } 2525 2526 r = amdgpu_device_init_schedulers(adev); 2527 if (r) 2528 goto init_failed; 2529 2530 /* Don't init kfd if whole hive need to be reset during init */ 2531 if (!adev->gmc.xgmi.pending_reset) { 2532 kgd2kfd_init_zone_device(adev); 2533 amdgpu_amdkfd_device_init(adev); 2534 } 2535 2536 amdgpu_fru_get_product_info(adev); 2537 2538 init_failed: 2539 2540 return r; 2541 } 2542 2543 /** 2544 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2545 * 2546 * @adev: amdgpu_device pointer 2547 * 2548 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2549 * this function before a GPU reset. If the value is retained after a 2550 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 2551 */ 2552 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2553 { 2554 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2555 } 2556 2557 /** 2558 * amdgpu_device_check_vram_lost - check if vram is valid 2559 * 2560 * @adev: amdgpu_device pointer 2561 * 2562 * Checks the reset magic value written to the gart pointer in VRAM. 2563 * The driver calls this after a GPU reset to see if the contents of 2564 * VRAM is lost or now. 2565 * returns true if vram is lost, false if not. 2566 */ 2567 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2568 { 2569 if (memcmp(adev->gart.ptr, adev->reset_magic, 2570 AMDGPU_RESET_MAGIC_NUM)) 2571 return true; 2572 2573 if (!amdgpu_in_reset(adev)) 2574 return false; 2575 2576 /* 2577 * For all ASICs with baco/mode1 reset, the VRAM is 2578 * always assumed to be lost. 2579 */ 2580 switch (amdgpu_asic_reset_method(adev)) { 2581 case AMD_RESET_METHOD_BACO: 2582 case AMD_RESET_METHOD_MODE1: 2583 return true; 2584 default: 2585 return false; 2586 } 2587 } 2588 2589 /** 2590 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2591 * 2592 * @adev: amdgpu_device pointer 2593 * @state: clockgating state (gate or ungate) 2594 * 2595 * The list of all the hardware IPs that make up the asic is walked and the 2596 * set_clockgating_state callbacks are run. 2597 * Late initialization pass enabling clockgating for hardware IPs. 2598 * Fini or suspend, pass disabling clockgating for hardware IPs. 2599 * Returns 0 on success, negative error code on failure. 
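 *
 * The walk direction depends on @state: blocks are gated in the order they
 * were added and ungated in reverse order. UVD/VCE/VCN/JPEG are skipped here
 * because their clockgating is handled separately, and GFX/SDMA are skipped
 * while in S0ix.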
2600 */ 2601 2602 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2603 enum amd_clockgating_state state) 2604 { 2605 int i, j, r; 2606 2607 if (amdgpu_emu_mode == 1) 2608 return 0; 2609 2610 for (j = 0; j < adev->num_ip_blocks; j++) { 2611 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2612 if (!adev->ip_blocks[i].status.late_initialized) 2613 continue; 2614 /* skip CG for GFX, SDMA on S0ix */ 2615 if (adev->in_s0ix && 2616 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2617 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2618 continue; 2619 /* skip CG for VCE/UVD, it's handled specially */ 2620 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2621 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2622 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2623 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2624 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2625 /* enable clockgating to save power */ 2626 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2627 state); 2628 if (r) { 2629 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2630 adev->ip_blocks[i].version->funcs->name, r); 2631 return r; 2632 } 2633 } 2634 } 2635 2636 return 0; 2637 } 2638 2639 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2640 enum amd_powergating_state state) 2641 { 2642 int i, j, r; 2643 2644 if (amdgpu_emu_mode == 1) 2645 return 0; 2646 2647 for (j = 0; j < adev->num_ip_blocks; j++) { 2648 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2649 if (!adev->ip_blocks[i].status.late_initialized) 2650 continue; 2651 /* skip PG for GFX, SDMA on S0ix */ 2652 if (adev->in_s0ix && 2653 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2654 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2655 continue; 2656 /* skip CG for VCE/UVD, it's handled specially */ 2657 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2658 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2659 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2660 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2661 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2662 /* enable powergating to save power */ 2663 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2664 state); 2665 if (r) { 2666 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2667 adev->ip_blocks[i].version->funcs->name, r); 2668 return r; 2669 } 2670 } 2671 } 2672 return 0; 2673 } 2674 2675 static int amdgpu_device_enable_mgpu_fan_boost(void) 2676 { 2677 struct amdgpu_gpu_instance *gpu_ins; 2678 struct amdgpu_device *adev; 2679 int i, ret = 0; 2680 2681 mutex_lock(&mgpu_info.mutex); 2682 2683 /* 2684 * MGPU fan boost feature should be enabled 2685 * only when there are two or more dGPUs in 2686 * the system 2687 */ 2688 if (mgpu_info.num_dgpu < 2) 2689 goto out; 2690 2691 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2692 gpu_ins = &(mgpu_info.gpu_ins[i]); 2693 adev = gpu_ins->adev; 2694 if (!(adev->flags & AMD_IS_APU) && 2695 !gpu_ins->mgpu_fan_enabled) { 2696 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2697 if (ret) 2698 break; 2699 2700 gpu_ins->mgpu_fan_enabled = 1; 2701 } 2702 } 2703 2704 out: 2705 mutex_unlock(&mgpu_info.mutex); 2706 2707 return ret; 2708 } 2709 2710 /** 2711 * amdgpu_device_ip_late_init - run late init for hardware IPs 2712 * 2713 * @adev: 
amdgpu_device pointer 2714 * 2715 * Late initialization pass for hardware IPs. The list of all the hardware 2716 * IPs that make up the asic is walked and the late_init callbacks are run. 2717 * late_init covers any special initialization that an IP requires 2718 * after all of the IPs have been initialized or something that needs to happen 2719 * late in the init process. 2720 * Returns 0 on success, negative error code on failure. 2721 */ 2722 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2723 { 2724 struct amdgpu_gpu_instance *gpu_instance; 2725 int i = 0, r; 2726 2727 for (i = 0; i < adev->num_ip_blocks; i++) { 2728 if (!adev->ip_blocks[i].status.hw) 2729 continue; 2730 if (adev->ip_blocks[i].version->funcs->late_init) { 2731 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2732 if (r) { 2733 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2734 adev->ip_blocks[i].version->funcs->name, r); 2735 return r; 2736 } 2737 } 2738 adev->ip_blocks[i].status.late_initialized = true; 2739 } 2740 2741 r = amdgpu_ras_late_init(adev); 2742 if (r) { 2743 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 2744 return r; 2745 } 2746 2747 amdgpu_ras_set_error_query_ready(adev, true); 2748 2749 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2750 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2751 2752 amdgpu_device_fill_reset_magic(adev); 2753 2754 r = amdgpu_device_enable_mgpu_fan_boost(); 2755 if (r) 2756 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2757 2758 /* For passthrough configuration on arcturus and aldebaran, enable special handling for SBR */ 2759 if (amdgpu_passthrough(adev) && 2760 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 2761 adev->asic_type == CHIP_ALDEBARAN)) 2762 amdgpu_dpm_handle_passthrough_sbr(adev, true); 2763 2764 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2765 mutex_lock(&mgpu_info.mutex); 2766 2767 /* 2768 * Reset device p-state to low as this was booted with high. 2769 * 2770 * This should be performed only after all devices from the same 2771 * hive get initialized. 2772 * 2773 * However, the number of devices in the hive is not known in advance; 2774 * it is counted one by one as the devices are initialized. 2775 * 2776 * So, we wait until all XGMI interlinked devices are initialized. 2777 * This may bring some delays as those devices may come from 2778 * different hives. But that should be OK.
2779 */ 2780 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2781 for (i = 0; i < mgpu_info.num_gpu; i++) { 2782 gpu_instance = &(mgpu_info.gpu_ins[i]); 2783 if (gpu_instance->adev->flags & AMD_IS_APU) 2784 continue; 2785 2786 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2787 AMDGPU_XGMI_PSTATE_MIN); 2788 if (r) { 2789 DRM_ERROR("pstate setting failed (%d).\n", r); 2790 break; 2791 } 2792 } 2793 } 2794 2795 mutex_unlock(&mgpu_info.mutex); 2796 } 2797 2798 return 0; 2799 } 2800 2801 /** 2802 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2803 * 2804 * @adev: amdgpu_device pointer 2805 * 2806 * For ASICs need to disable SMC first 2807 */ 2808 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2809 { 2810 int i, r; 2811 2812 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) 2813 return; 2814 2815 for (i = 0; i < adev->num_ip_blocks; i++) { 2816 if (!adev->ip_blocks[i].status.hw) 2817 continue; 2818 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2819 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2820 /* XXX handle errors */ 2821 if (r) { 2822 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2823 adev->ip_blocks[i].version->funcs->name, r); 2824 } 2825 adev->ip_blocks[i].status.hw = false; 2826 break; 2827 } 2828 } 2829 } 2830 2831 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2832 { 2833 int i, r; 2834 2835 for (i = 0; i < adev->num_ip_blocks; i++) { 2836 if (!adev->ip_blocks[i].version->funcs->early_fini) 2837 continue; 2838 2839 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2840 if (r) { 2841 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2842 adev->ip_blocks[i].version->funcs->name, r); 2843 } 2844 } 2845 2846 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2847 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2848 2849 amdgpu_amdkfd_suspend(adev, false); 2850 2851 /* Workaroud for ASICs need to disable SMC first */ 2852 amdgpu_device_smu_fini_early(adev); 2853 2854 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2855 if (!adev->ip_blocks[i].status.hw) 2856 continue; 2857 2858 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2859 /* XXX handle errors */ 2860 if (r) { 2861 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2862 adev->ip_blocks[i].version->funcs->name, r); 2863 } 2864 2865 adev->ip_blocks[i].status.hw = false; 2866 } 2867 2868 if (amdgpu_sriov_vf(adev)) { 2869 if (amdgpu_virt_release_full_gpu(adev, false)) 2870 DRM_ERROR("failed to release exclusive mode on fini\n"); 2871 } 2872 2873 return 0; 2874 } 2875 2876 /** 2877 * amdgpu_device_ip_fini - run fini for hardware IPs 2878 * 2879 * @adev: amdgpu_device pointer 2880 * 2881 * Main teardown pass for hardware IPs. The list of all the hardware 2882 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2883 * are run. hw_fini tears down the hardware associated with each IP 2884 * and sw_fini tears down any software state associated with each IP. 2885 * Returns 0 on success, negative error code on failure. 
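 *
 * Note that in the current teardown flow the hw_fini callbacks have already
 * been run from amdgpu_device_ip_fini_early() (with the SMC handled first by
 * amdgpu_device_smu_fini_early()); what remains here is sw_fini and
 * late_fini, executed in reverse order of initialization.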
2886 */ 2887 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2888 { 2889 int i, r; 2890 2891 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2892 amdgpu_virt_release_ras_err_handler_data(adev); 2893 2894 if (adev->gmc.xgmi.num_physical_nodes > 1) 2895 amdgpu_xgmi_remove_device(adev); 2896 2897 amdgpu_amdkfd_device_fini_sw(adev); 2898 2899 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2900 if (!adev->ip_blocks[i].status.sw) 2901 continue; 2902 2903 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2904 amdgpu_ucode_free_bo(adev); 2905 amdgpu_free_static_csa(&adev->virt.csa_obj); 2906 amdgpu_device_wb_fini(adev); 2907 amdgpu_device_mem_scratch_fini(adev); 2908 amdgpu_ib_pool_fini(adev); 2909 } 2910 2911 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2912 /* XXX handle errors */ 2913 if (r) { 2914 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2915 adev->ip_blocks[i].version->funcs->name, r); 2916 } 2917 adev->ip_blocks[i].status.sw = false; 2918 adev->ip_blocks[i].status.valid = false; 2919 } 2920 2921 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2922 if (!adev->ip_blocks[i].status.late_initialized) 2923 continue; 2924 if (adev->ip_blocks[i].version->funcs->late_fini) 2925 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2926 adev->ip_blocks[i].status.late_initialized = false; 2927 } 2928 2929 amdgpu_ras_fini(adev); 2930 2931 return 0; 2932 } 2933 2934 /** 2935 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2936 * 2937 * @work: work_struct. 2938 */ 2939 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2940 { 2941 struct amdgpu_device *adev = 2942 container_of(work, struct amdgpu_device, delayed_init_work.work); 2943 int r; 2944 2945 r = amdgpu_ib_ring_tests(adev); 2946 if (r) 2947 DRM_ERROR("ib ring test failed (%d).\n", r); 2948 } 2949 2950 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2951 { 2952 struct amdgpu_device *adev = 2953 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2954 2955 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2956 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2957 2958 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2959 adev->gfx.gfx_off_state = true; 2960 } 2961 2962 /** 2963 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2964 * 2965 * @adev: amdgpu_device pointer 2966 * 2967 * Main suspend function for hardware IPs. The list of all the hardware 2968 * IPs that make up the asic is walked, clockgating is disabled and the 2969 * suspend callbacks are run. suspend puts the hardware and software state 2970 * in each IP into a state suitable for suspend. 2971 * Returns 0 on success, negative error code on failure. 2972 */ 2973 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2974 { 2975 int i, r; 2976 2977 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2978 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2979 2980 /* 2981 * Per PMFW team's suggestion, driver needs to handle gfxoff 2982 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 2983 * scenario. Add the missing df cstate disablement here. 
2984 */ 2985 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 2986 dev_warn(adev->dev, "Failed to disallow df cstate"); 2987 2988 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2989 if (!adev->ip_blocks[i].status.valid) 2990 continue; 2991 2992 /* displays are handled separately */ 2993 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2994 continue; 2995 2996 /* XXX handle errors */ 2997 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2998 /* XXX handle errors */ 2999 if (r) { 3000 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3001 adev->ip_blocks[i].version->funcs->name, r); 3002 return r; 3003 } 3004 3005 adev->ip_blocks[i].status.hw = false; 3006 } 3007 3008 return 0; 3009 } 3010 3011 /** 3012 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3013 * 3014 * @adev: amdgpu_device pointer 3015 * 3016 * Main suspend function for hardware IPs. The list of all the hardware 3017 * IPs that make up the asic is walked, clockgating is disabled and the 3018 * suspend callbacks are run. suspend puts the hardware and software state 3019 * in each IP into a state suitable for suspend. 3020 * Returns 0 on success, negative error code on failure. 3021 */ 3022 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3023 { 3024 int i, r; 3025 3026 if (adev->in_s0ix) 3027 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3028 3029 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3030 if (!adev->ip_blocks[i].status.valid) 3031 continue; 3032 /* displays are handled in phase1 */ 3033 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3034 continue; 3035 /* PSP lost connection when err_event_athub occurs */ 3036 if (amdgpu_ras_intr_triggered() && 3037 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3038 adev->ip_blocks[i].status.hw = false; 3039 continue; 3040 } 3041 3042 /* skip unnecessary suspend if we do not initialize them yet */ 3043 if (adev->gmc.xgmi.pending_reset && 3044 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3045 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3046 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3047 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3048 adev->ip_blocks[i].status.hw = false; 3049 continue; 3050 } 3051 3052 /* skip suspend of gfx/mes and psp for S0ix 3053 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3054 * like at runtime. PSP is also part of the always on hardware 3055 * so no need to suspend it. 3056 */ 3057 if (adev->in_s0ix && 3058 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3059 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3060 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3061 continue; 3062 3063 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3064 if (adev->in_s0ix && 3065 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) && 3066 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3067 continue; 3068 3069 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3070 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3071 * from this location and RLC Autoload automatically also gets loaded 3072 * from here based on PMFW -> PSP message during re-init sequence. 3073 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3074 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3075 */ 3076 if (amdgpu_in_reset(adev) && 3077 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3078 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3079 continue; 3080 3081 /* XXX handle errors */ 3082 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3083 /* XXX handle errors */ 3084 if (r) { 3085 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3086 adev->ip_blocks[i].version->funcs->name, r); 3087 } 3088 adev->ip_blocks[i].status.hw = false; 3089 /* handle putting the SMC in the appropriate state */ 3090 if (!amdgpu_sriov_vf(adev)) { 3091 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3092 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3093 if (r) { 3094 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3095 adev->mp1_state, r); 3096 return r; 3097 } 3098 } 3099 } 3100 } 3101 3102 return 0; 3103 } 3104 3105 /** 3106 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3107 * 3108 * @adev: amdgpu_device pointer 3109 * 3110 * Main suspend function for hardware IPs. The list of all the hardware 3111 * IPs that make up the asic is walked, clockgating is disabled and the 3112 * suspend callbacks are run. suspend puts the hardware and software state 3113 * in each IP into a state suitable for suspend. 3114 * Returns 0 on success, negative error code on failure. 3115 */ 3116 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3117 { 3118 int r; 3119 3120 if (amdgpu_sriov_vf(adev)) { 3121 amdgpu_virt_fini_data_exchange(adev); 3122 amdgpu_virt_request_full_gpu(adev, false); 3123 } 3124 3125 r = amdgpu_device_ip_suspend_phase1(adev); 3126 if (r) 3127 return r; 3128 r = amdgpu_device_ip_suspend_phase2(adev); 3129 3130 if (amdgpu_sriov_vf(adev)) 3131 amdgpu_virt_release_full_gpu(adev, false); 3132 3133 return r; 3134 } 3135 3136 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3137 { 3138 int i, r; 3139 3140 static enum amd_ip_block_type ip_order[] = { 3141 AMD_IP_BLOCK_TYPE_COMMON, 3142 AMD_IP_BLOCK_TYPE_GMC, 3143 AMD_IP_BLOCK_TYPE_PSP, 3144 AMD_IP_BLOCK_TYPE_IH, 3145 }; 3146 3147 for (i = 0; i < adev->num_ip_blocks; i++) { 3148 int j; 3149 struct amdgpu_ip_block *block; 3150 3151 block = &adev->ip_blocks[i]; 3152 block->status.hw = false; 3153 3154 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3155 3156 if (block->version->type != ip_order[j] || 3157 !block->status.valid) 3158 continue; 3159 3160 r = block->version->funcs->hw_init(adev); 3161 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3162 if (r) 3163 return r; 3164 block->status.hw = true; 3165 } 3166 } 3167 3168 return 0; 3169 } 3170 3171 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3172 { 3173 int i, r; 3174 3175 static enum amd_ip_block_type ip_order[] = { 3176 AMD_IP_BLOCK_TYPE_SMC, 3177 AMD_IP_BLOCK_TYPE_DCE, 3178 AMD_IP_BLOCK_TYPE_GFX, 3179 AMD_IP_BLOCK_TYPE_SDMA, 3180 AMD_IP_BLOCK_TYPE_MES, 3181 AMD_IP_BLOCK_TYPE_UVD, 3182 AMD_IP_BLOCK_TYPE_VCE, 3183 AMD_IP_BLOCK_TYPE_VCN, 3184 AMD_IP_BLOCK_TYPE_JPEG 3185 }; 3186 3187 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3188 int j; 3189 struct amdgpu_ip_block *block; 3190 3191 for (j = 0; j < adev->num_ip_blocks; j++) { 3192 block = &adev->ip_blocks[j]; 3193 3194 if (block->version->type != ip_order[i] || 3195 !block->status.valid || 3196 block->status.hw) 3197 continue; 3198 3199 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3200 r = block->version->funcs->resume(adev); 3201 else 3202 r = block->version->funcs->hw_init(adev); 3203 3204 
DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3205 if (r) 3206 return r; 3207 block->status.hw = true; 3208 } 3209 } 3210 3211 return 0; 3212 } 3213 3214 /** 3215 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3216 * 3217 * @adev: amdgpu_device pointer 3218 * 3219 * First resume function for hardware IPs. The list of all the hardware 3220 * IPs that make up the asic is walked and the resume callbacks are run for 3221 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3222 * after a suspend and updates the software state as necessary. This 3223 * function is also used for restoring the GPU after a GPU reset. 3224 * Returns 0 on success, negative error code on failure. 3225 */ 3226 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3227 { 3228 int i, r; 3229 3230 for (i = 0; i < adev->num_ip_blocks; i++) { 3231 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3232 continue; 3233 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3234 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3235 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3236 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3237 3238 r = adev->ip_blocks[i].version->funcs->resume(adev); 3239 if (r) { 3240 DRM_ERROR("resume of IP block <%s> failed %d\n", 3241 adev->ip_blocks[i].version->funcs->name, r); 3242 return r; 3243 } 3244 adev->ip_blocks[i].status.hw = true; 3245 } 3246 } 3247 3248 return 0; 3249 } 3250 3251 /** 3252 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3253 * 3254 * @adev: amdgpu_device pointer 3255 * 3256 * First resume function for hardware IPs. The list of all the hardware 3257 * IPs that make up the asic is walked and the resume callbacks are run for 3258 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3259 * functional state after a suspend and updates the software state as 3260 * necessary. This function is also used for restoring the GPU after a GPU 3261 * reset. 3262 * Returns 0 on success, negative error code on failure. 3263 */ 3264 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3265 { 3266 int i, r; 3267 3268 for (i = 0; i < adev->num_ip_blocks; i++) { 3269 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3270 continue; 3271 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3272 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3273 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3274 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3275 continue; 3276 r = adev->ip_blocks[i].version->funcs->resume(adev); 3277 if (r) { 3278 DRM_ERROR("resume of IP block <%s> failed %d\n", 3279 adev->ip_blocks[i].version->funcs->name, r); 3280 return r; 3281 } 3282 adev->ip_blocks[i].status.hw = true; 3283 } 3284 3285 return 0; 3286 } 3287 3288 /** 3289 * amdgpu_device_ip_resume - run resume for hardware IPs 3290 * 3291 * @adev: amdgpu_device pointer 3292 * 3293 * Main resume function for hardware IPs. The hardware IPs 3294 * are split into two resume functions because they are 3295 * also used in recovering from a GPU reset and some additional 3296 * steps need to be take between them. In this case (S3/S4) they are 3297 * run sequentially. 3298 * Returns 0 on success, negative error code on failure. 
3299 */ 3300 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3301 { 3302 int r; 3303 3304 if (!adev->in_s0ix) { 3305 r = amdgpu_amdkfd_resume_iommu(adev); 3306 if (r) 3307 return r; 3308 } 3309 3310 r = amdgpu_device_ip_resume_phase1(adev); 3311 if (r) 3312 return r; 3313 3314 r = amdgpu_device_fw_loading(adev); 3315 if (r) 3316 return r; 3317 3318 r = amdgpu_device_ip_resume_phase2(adev); 3319 3320 return r; 3321 } 3322 3323 /** 3324 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3325 * 3326 * @adev: amdgpu_device pointer 3327 * 3328 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3329 */ 3330 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3331 { 3332 if (amdgpu_sriov_vf(adev)) { 3333 if (adev->is_atom_fw) { 3334 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3335 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3336 } else { 3337 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3338 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3339 } 3340 3341 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3342 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3343 } 3344 } 3345 3346 /** 3347 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3348 * 3349 * @asic_type: AMD asic type 3350 * 3351 * Check if there is DC (new modesetting infrastructre) support for an asic. 3352 * returns true if DC has support, false if not. 3353 */ 3354 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3355 { 3356 switch (asic_type) { 3357 #ifdef CONFIG_DRM_AMDGPU_SI 3358 case CHIP_HAINAN: 3359 #endif 3360 case CHIP_TOPAZ: 3361 /* chips with no display hardware */ 3362 return false; 3363 #if defined(CONFIG_DRM_AMD_DC) 3364 case CHIP_TAHITI: 3365 case CHIP_PITCAIRN: 3366 case CHIP_VERDE: 3367 case CHIP_OLAND: 3368 /* 3369 * We have systems in the wild with these ASICs that require 3370 * LVDS and VGA support which is not supported with DC. 3371 * 3372 * Fallback to the non-DC driver here by default so as not to 3373 * cause regressions. 3374 */ 3375 #if defined(CONFIG_DRM_AMD_DC_SI) 3376 return amdgpu_dc > 0; 3377 #else 3378 return false; 3379 #endif 3380 case CHIP_BONAIRE: 3381 case CHIP_KAVERI: 3382 case CHIP_KABINI: 3383 case CHIP_MULLINS: 3384 /* 3385 * We have systems in the wild with these ASICs that require 3386 * VGA support which is not supported with DC. 3387 * 3388 * Fallback to the non-DC driver here by default so as not to 3389 * cause regressions. 
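 *
 * (In practice this means DC is only used on these chips when it is
 * explicitly requested, e.g. by booting with amdgpu.dc=1.)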
3390 */ 3391 return amdgpu_dc > 0; 3392 default: 3393 return amdgpu_dc != 0; 3394 #else 3395 default: 3396 if (amdgpu_dc > 0) 3397 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3398 return false; 3399 #endif 3400 } 3401 } 3402 3403 /** 3404 * amdgpu_device_has_dc_support - check if dc is supported 3405 * 3406 * @adev: amdgpu_device pointer 3407 * 3408 * Returns true for supported, false for not supported 3409 */ 3410 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3411 { 3412 if (adev->enable_virtual_display || 3413 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3414 return false; 3415 3416 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3417 } 3418 3419 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3420 { 3421 struct amdgpu_device *adev = 3422 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3423 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3424 3425 /* It's a bug to not have a hive within this function */ 3426 if (WARN_ON(!hive)) 3427 return; 3428 3429 /* 3430 * Use task barrier to synchronize all xgmi reset works across the 3431 * hive. task_barrier_enter and task_barrier_exit will block 3432 * until all the threads running the xgmi reset works reach 3433 * those points. task_barrier_full will do both blocks. 3434 */ 3435 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3436 3437 task_barrier_enter(&hive->tb); 3438 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3439 3440 if (adev->asic_reset_res) 3441 goto fail; 3442 3443 task_barrier_exit(&hive->tb); 3444 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3445 3446 if (adev->asic_reset_res) 3447 goto fail; 3448 3449 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3450 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3451 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3452 } else { 3453 3454 task_barrier_full(&hive->tb); 3455 adev->asic_reset_res = amdgpu_asic_reset(adev); 3456 } 3457 3458 fail: 3459 if (adev->asic_reset_res) 3460 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3461 adev->asic_reset_res, adev_to_drm(adev)->unique); 3462 amdgpu_put_xgmi_hive(hive); 3463 } 3464 3465 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3466 { 3467 char *input = amdgpu_lockup_timeout; 3468 char *timeout_setting = NULL; 3469 int index = 0; 3470 long timeout; 3471 int ret = 0; 3472 3473 /* 3474 * By default timeout for non compute jobs is 10000 3475 * and 60000 for compute jobs. 3476 * In SR-IOV or passthrough mode, timeout for compute 3477 * jobs are 60000 by default. 3478 */ 3479 adev->gfx_timeout = msecs_to_jiffies(10000); 3480 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3481 if (amdgpu_sriov_vf(adev)) 3482 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3483 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3484 else 3485 adev->compute_timeout = msecs_to_jiffies(60000); 3486 3487 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3488 while ((timeout_setting = strsep(&input, ",")) && 3489 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3490 ret = kstrtol(timeout_setting, 0, &timeout); 3491 if (ret) 3492 return ret; 3493 3494 if (timeout == 0) { 3495 index++; 3496 continue; 3497 } else if (timeout < 0) { 3498 timeout = MAX_SCHEDULE_TIMEOUT; 3499 dev_warn(adev->dev, "lockup timeout disabled"); 3500 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3501 } else { 3502 timeout = msecs_to_jiffies(timeout); 3503 } 3504 3505 switch (index++) { 3506 case 0: 3507 adev->gfx_timeout = timeout; 3508 break; 3509 case 1: 3510 adev->compute_timeout = timeout; 3511 break; 3512 case 2: 3513 adev->sdma_timeout = timeout; 3514 break; 3515 case 3: 3516 adev->video_timeout = timeout; 3517 break; 3518 default: 3519 break; 3520 } 3521 } 3522 /* 3523 * There is only one value specified and 3524 * it should apply to all non-compute jobs. 3525 */ 3526 if (index == 1) { 3527 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3528 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3529 adev->compute_timeout = adev->gfx_timeout; 3530 } 3531 } 3532 3533 return ret; 3534 } 3535 3536 /** 3537 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3538 * 3539 * @adev: amdgpu_device pointer 3540 * 3541 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3542 */ 3543 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3544 { 3545 struct iommu_domain *domain; 3546 3547 domain = iommu_get_domain_for_dev(adev->dev); 3548 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3549 adev->ram_is_direct_mapped = true; 3550 } 3551 3552 static const struct attribute *amdgpu_dev_attributes[] = { 3553 &dev_attr_product_name.attr, 3554 &dev_attr_product_number.attr, 3555 &dev_attr_serial_number.attr, 3556 &dev_attr_pcie_replay_count.attr, 3557 NULL 3558 }; 3559 3560 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 3561 { 3562 if (amdgpu_mcbp == 1) 3563 adev->gfx.mcbp = true; 3564 3565 if ((adev->ip_versions[GC_HWIP][0] >= IP_VERSION(9, 0, 0)) && 3566 (adev->ip_versions[GC_HWIP][0] < IP_VERSION(10, 0, 0)) && 3567 adev->gfx.num_gfx_rings) 3568 adev->gfx.mcbp = true; 3569 3570 if (amdgpu_sriov_vf(adev)) 3571 adev->gfx.mcbp = true; 3572 3573 if (adev->gfx.mcbp) 3574 DRM_INFO("MCBP is enabled\n"); 3575 } 3576 3577 /** 3578 * amdgpu_device_init - initialize the driver 3579 * 3580 * @adev: amdgpu_device pointer 3581 * @flags: driver flags 3582 * 3583 * Initializes the driver info and hw (all asics). 3584 * Returns 0 for success or an error on failure. 3585 * Called at driver startup. 
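 *
 * The @flags value carries the ASIC type in its AMD_ASIC_MASK bits together
 * with feature bits such as AMD_IS_APU; the ASIC type may still be
 * overridden via amdgpu_force_asic_type, as done at the top of the function.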
3586 */ 3587 int amdgpu_device_init(struct amdgpu_device *adev, 3588 uint32_t flags) 3589 { 3590 struct drm_device *ddev = adev_to_drm(adev); 3591 struct pci_dev *pdev = adev->pdev; 3592 int r, i; 3593 bool px = false; 3594 u32 max_MBps; 3595 int tmp; 3596 3597 adev->shutdown = false; 3598 adev->flags = flags; 3599 3600 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3601 adev->asic_type = amdgpu_force_asic_type; 3602 else 3603 adev->asic_type = flags & AMD_ASIC_MASK; 3604 3605 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3606 if (amdgpu_emu_mode == 1) 3607 adev->usec_timeout *= 10; 3608 adev->gmc.gart_size = 512 * 1024 * 1024; 3609 adev->accel_working = false; 3610 adev->num_rings = 0; 3611 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3612 adev->mman.buffer_funcs = NULL; 3613 adev->mman.buffer_funcs_ring = NULL; 3614 adev->vm_manager.vm_pte_funcs = NULL; 3615 adev->vm_manager.vm_pte_num_scheds = 0; 3616 adev->gmc.gmc_funcs = NULL; 3617 adev->harvest_ip_mask = 0x0; 3618 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3619 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3620 3621 adev->smc_rreg = &amdgpu_invalid_rreg; 3622 adev->smc_wreg = &amdgpu_invalid_wreg; 3623 adev->pcie_rreg = &amdgpu_invalid_rreg; 3624 adev->pcie_wreg = &amdgpu_invalid_wreg; 3625 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 3626 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 3627 adev->pciep_rreg = &amdgpu_invalid_rreg; 3628 adev->pciep_wreg = &amdgpu_invalid_wreg; 3629 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3630 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3631 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3632 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3633 adev->didt_rreg = &amdgpu_invalid_rreg; 3634 adev->didt_wreg = &amdgpu_invalid_wreg; 3635 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3636 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3637 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3638 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3639 3640 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3641 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3642 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3643 3644 /* mutex initialization are all done here so we 3645 * can recall function without having locking issues 3646 */ 3647 mutex_init(&adev->firmware.mutex); 3648 mutex_init(&adev->pm.mutex); 3649 mutex_init(&adev->gfx.gpu_clock_mutex); 3650 mutex_init(&adev->srbm_mutex); 3651 mutex_init(&adev->gfx.pipe_reserve_mutex); 3652 mutex_init(&adev->gfx.gfx_off_mutex); 3653 mutex_init(&adev->gfx.partition_mutex); 3654 mutex_init(&adev->grbm_idx_mutex); 3655 mutex_init(&adev->mn_lock); 3656 mutex_init(&adev->virt.vf_errors.lock); 3657 hash_init(adev->mn_hash); 3658 mutex_init(&adev->psp.mutex); 3659 mutex_init(&adev->notifier_lock); 3660 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3661 mutex_init(&adev->benchmark_mutex); 3662 3663 amdgpu_device_init_apu_flags(adev); 3664 3665 r = amdgpu_device_check_arguments(adev); 3666 if (r) 3667 return r; 3668 3669 spin_lock_init(&adev->mmio_idx_lock); 3670 spin_lock_init(&adev->smc_idx_lock); 3671 spin_lock_init(&adev->pcie_idx_lock); 3672 spin_lock_init(&adev->uvd_ctx_idx_lock); 3673 spin_lock_init(&adev->didt_idx_lock); 3674 spin_lock_init(&adev->gc_cac_idx_lock); 3675 spin_lock_init(&adev->se_cac_idx_lock); 3676 spin_lock_init(&adev->audio_endpt_idx_lock); 3677 spin_lock_init(&adev->mm_stats.lock); 3678 3679 
INIT_LIST_HEAD(&adev->shadow_list); 3680 mutex_init(&adev->shadow_list_lock); 3681 3682 INIT_LIST_HEAD(&adev->reset_list); 3683 3684 INIT_LIST_HEAD(&adev->ras_list); 3685 3686 INIT_DELAYED_WORK(&adev->delayed_init_work, 3687 amdgpu_device_delayed_init_work_handler); 3688 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3689 amdgpu_device_delay_enable_gfx_off); 3690 3691 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3692 3693 adev->gfx.gfx_off_req_count = 1; 3694 adev->gfx.gfx_off_residency = 0; 3695 adev->gfx.gfx_off_entrycount = 0; 3696 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3697 3698 atomic_set(&adev->throttling_logging_enabled, 1); 3699 /* 3700 * If throttling continues, logging will be performed every minute 3701 * to avoid log flooding. "-1" is subtracted since the thermal 3702 * throttling interrupt comes every second. Thus, the total logging 3703 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 3704 * for the throttling interrupt) = 60 seconds. 3705 */ 3706 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3707 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3708 3709 /* Registers mapping */ 3710 /* TODO: block userspace mapping of io register */ 3711 if (adev->asic_type >= CHIP_BONAIRE) { 3712 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3713 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3714 } else { 3715 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3716 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3717 } 3718 3719 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 3720 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 3721 3722 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3723 if (!adev->rmmio) 3724 return -ENOMEM; 3725 3726 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3727 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); 3728 3729 /* 3730 * The reset domain needs to be present early, before the XGMI hive is 3731 * discovered (if any) and initialized, so the reset sem and in_gpu reset 3732 * flag can be used early on during init and before calling RREG32.
3733 */ 3734 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3735 if (!adev->reset_domain) 3736 return -ENOMEM; 3737 3738 /* detect hw virtualization here */ 3739 amdgpu_detect_virtualization(adev); 3740 3741 amdgpu_device_get_pcie_info(adev); 3742 3743 r = amdgpu_device_get_job_timeout_settings(adev); 3744 if (r) { 3745 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3746 return r; 3747 } 3748 3749 /* early init functions */ 3750 r = amdgpu_device_ip_early_init(adev); 3751 if (r) 3752 return r; 3753 3754 amdgpu_device_set_mcbp(adev); 3755 3756 /* Get rid of things like offb */ 3757 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 3758 if (r) 3759 return r; 3760 3761 /* Enable TMZ based on IP_VERSION */ 3762 amdgpu_gmc_tmz_set(adev); 3763 3764 amdgpu_gmc_noretry_set(adev); 3765 /* Need to get xgmi info early to decide the reset behavior*/ 3766 if (adev->gmc.xgmi.supported) { 3767 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3768 if (r) 3769 return r; 3770 } 3771 3772 /* enable PCIE atomic ops */ 3773 if (amdgpu_sriov_vf(adev)) { 3774 if (adev->virt.fw_reserve.p_pf2vf) 3775 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3776 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3777 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3778 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a 3779 * internal path natively support atomics, set have_atomics_support to true. 3780 */ 3781 } else if ((adev->flags & AMD_IS_APU) && 3782 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) { 3783 adev->have_atomics_support = true; 3784 } else { 3785 adev->have_atomics_support = 3786 !pci_enable_atomic_ops_to_root(adev->pdev, 3787 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3788 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3789 } 3790 3791 if (!adev->have_atomics_support) 3792 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3793 3794 /* doorbell bar mapping and doorbell index init*/ 3795 amdgpu_doorbell_init(adev); 3796 3797 if (amdgpu_emu_mode == 1) { 3798 /* post the asic on emulation mode */ 3799 emu_soc_asic_init(adev); 3800 goto fence_driver_init; 3801 } 3802 3803 amdgpu_reset_init(adev); 3804 3805 /* detect if we are with an SRIOV vbios */ 3806 if (adev->bios) 3807 amdgpu_device_detect_sriov_bios(adev); 3808 3809 /* check if we need to reset the asic 3810 * E.g., driver was not cleanly unloaded previously, etc. 3811 */ 3812 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3813 if (adev->gmc.xgmi.num_physical_nodes) { 3814 dev_info(adev->dev, "Pending hive reset.\n"); 3815 adev->gmc.xgmi.pending_reset = true; 3816 /* Only need to init necessary block for SMU to handle the reset */ 3817 for (i = 0; i < adev->num_ip_blocks; i++) { 3818 if (!adev->ip_blocks[i].status.valid) 3819 continue; 3820 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3821 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3822 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3823 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3824 DRM_DEBUG("IP %s disabled for hw_init.\n", 3825 adev->ip_blocks[i].version->funcs->name); 3826 adev->ip_blocks[i].status.hw = true; 3827 } 3828 } 3829 } else { 3830 tmp = amdgpu_reset_method; 3831 /* It should do a default reset when loading or reloading the driver, 3832 * regardless of the module parameter reset_method. 
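 * To that end the module parameter is saved in tmp, temporarily forced
 * to AMD_RESET_METHOD_NONE so that amdgpu_asic_reset() falls back to
 * the ASIC's default method, and restored once the reset has run.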
3833 */ 3834 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 3835 r = amdgpu_asic_reset(adev); 3836 amdgpu_reset_method = tmp; 3837 if (r) { 3838 dev_err(adev->dev, "asic reset on init failed\n"); 3839 goto failed; 3840 } 3841 } 3842 } 3843 3844 /* Post card if necessary */ 3845 if (amdgpu_device_need_post(adev)) { 3846 if (!adev->bios) { 3847 dev_err(adev->dev, "no vBIOS found\n"); 3848 r = -EINVAL; 3849 goto failed; 3850 } 3851 DRM_INFO("GPU posting now...\n"); 3852 r = amdgpu_device_asic_init(adev); 3853 if (r) { 3854 dev_err(adev->dev, "gpu post error!\n"); 3855 goto failed; 3856 } 3857 } 3858 3859 if (adev->bios) { 3860 if (adev->is_atom_fw) { 3861 /* Initialize clocks */ 3862 r = amdgpu_atomfirmware_get_clock_info(adev); 3863 if (r) { 3864 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3865 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3866 goto failed; 3867 } 3868 } else { 3869 /* Initialize clocks */ 3870 r = amdgpu_atombios_get_clock_info(adev); 3871 if (r) { 3872 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3873 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3874 goto failed; 3875 } 3876 /* init i2c buses */ 3877 if (!amdgpu_device_has_dc_support(adev)) 3878 amdgpu_atombios_i2c_init(adev); 3879 } 3880 } 3881 3882 fence_driver_init: 3883 /* Fence driver */ 3884 r = amdgpu_fence_driver_sw_init(adev); 3885 if (r) { 3886 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3887 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3888 goto failed; 3889 } 3890 3891 /* init the mode config */ 3892 drm_mode_config_init(adev_to_drm(adev)); 3893 3894 r = amdgpu_device_ip_init(adev); 3895 if (r) { 3896 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3897 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3898 goto release_ras_con; 3899 } 3900 3901 amdgpu_fence_driver_hw_init(adev); 3902 3903 dev_info(adev->dev, 3904 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3905 adev->gfx.config.max_shader_engines, 3906 adev->gfx.config.max_sh_per_se, 3907 adev->gfx.config.max_cu_per_sh, 3908 adev->gfx.cu_info.number); 3909 3910 adev->accel_working = true; 3911 3912 amdgpu_vm_check_compute_bug(adev); 3913 3914 /* Initialize the buffer migration limit. */ 3915 if (amdgpu_moverate >= 0) 3916 max_MBps = amdgpu_moverate; 3917 else 3918 max_MBps = 8; /* Allow 8 MB/s. */ 3919 /* Get a log2 for easy divisions. */ 3920 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3921 3922 r = amdgpu_atombios_sysfs_init(adev); 3923 if (r) 3924 drm_err(&adev->ddev, 3925 "registering atombios sysfs failed (%d).\n", r); 3926 3927 r = amdgpu_pm_sysfs_init(adev); 3928 if (r) 3929 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 3930 3931 r = amdgpu_ucode_sysfs_init(adev); 3932 if (r) { 3933 adev->ucode_sysfs_en = false; 3934 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3935 } else 3936 adev->ucode_sysfs_en = true; 3937 3938 /* 3939 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3940 * Otherwise the mgpu fan boost feature will be skipped due to the 3941 * gpu instance is counted less. 3942 */ 3943 amdgpu_register_gpu_instance(adev); 3944 3945 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3946 * explicit gating rather than handling it automatically. 
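 * When an XGMI hive reset is still pending, late init is skipped here
 * and the deferred reset work is scheduled further below instead.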
3947 */ 3948 if (!adev->gmc.xgmi.pending_reset) { 3949 r = amdgpu_device_ip_late_init(adev); 3950 if (r) { 3951 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3952 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3953 goto release_ras_con; 3954 } 3955 /* must succeed. */ 3956 amdgpu_ras_resume(adev); 3957 queue_delayed_work(system_wq, &adev->delayed_init_work, 3958 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3959 } 3960 3961 if (amdgpu_sriov_vf(adev)) { 3962 amdgpu_virt_release_full_gpu(adev, true); 3963 flush_delayed_work(&adev->delayed_init_work); 3964 } 3965 3966 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3967 if (r) 3968 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3969 3970 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3971 r = amdgpu_pmu_init(adev); 3972 if (r) 3973 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3974 3975 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3976 if (amdgpu_device_cache_pci_state(adev->pdev)) 3977 pci_restore_state(pdev); 3978 3979 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3980 /* this will fail for cards that aren't VGA class devices, just 3981 * ignore it 3982 */ 3983 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3984 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 3985 3986 px = amdgpu_device_supports_px(ddev); 3987 3988 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 3989 apple_gmux_detect(NULL, NULL))) 3990 vga_switcheroo_register_client(adev->pdev, 3991 &amdgpu_switcheroo_ops, px); 3992 3993 if (px) 3994 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3995 3996 if (adev->gmc.xgmi.pending_reset) 3997 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3998 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3999 4000 amdgpu_device_check_iommu_direct_map(adev); 4001 4002 return 0; 4003 4004 release_ras_con: 4005 if (amdgpu_sriov_vf(adev)) 4006 amdgpu_virt_release_full_gpu(adev, true); 4007 4008 /* failed in exclusive mode due to timeout */ 4009 if (amdgpu_sriov_vf(adev) && 4010 !amdgpu_sriov_runtime(adev) && 4011 amdgpu_virt_mmio_blocked(adev) && 4012 !amdgpu_virt_wait_reset(adev)) { 4013 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4014 /* Don't send request since VF is inactive. */ 4015 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4016 adev->virt.ops = NULL; 4017 r = -EAGAIN; 4018 } 4019 amdgpu_release_ras_context(adev); 4020 4021 failed: 4022 amdgpu_vf_error_trans_all(adev); 4023 4024 return r; 4025 } 4026 4027 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4028 { 4029 4030 /* Clear all CPU mappings pointing to this device */ 4031 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4032 4033 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4034 amdgpu_doorbell_fini(adev); 4035 4036 iounmap(adev->rmmio); 4037 adev->rmmio = NULL; 4038 if (adev->mman.aper_base_kaddr) 4039 iounmap(adev->mman.aper_base_kaddr); 4040 adev->mman.aper_base_kaddr = NULL; 4041 4042 /* Memory manager related */ 4043 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4044 arch_phys_wc_del(adev->gmc.vram_mtrr); 4045 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4046 } 4047 } 4048 4049 /** 4050 * amdgpu_device_fini_hw - tear down the driver 4051 * 4052 * @adev: amdgpu_device pointer 4053 * 4054 * Tear down the driver info (all asics). 4055 * Called at driver shutdown. 
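 * This is the hardware half of a two stage teardown; the remaining
 * software state is released later by amdgpu_device_fini_sw().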
4056 */ 4057 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4058 { 4059 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4060 flush_delayed_work(&adev->delayed_init_work); 4061 adev->shutdown = true; 4062 4063 /* make sure IB test finished before entering exclusive mode 4064 * to avoid preemption on IB test 4065 */ 4066 if (amdgpu_sriov_vf(adev)) { 4067 amdgpu_virt_request_full_gpu(adev, false); 4068 amdgpu_virt_fini_data_exchange(adev); 4069 } 4070 4071 /* disable all interrupts */ 4072 amdgpu_irq_disable_all(adev); 4073 if (adev->mode_info.mode_config_initialized) { 4074 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4075 drm_helper_force_disable_all(adev_to_drm(adev)); 4076 else 4077 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4078 } 4079 amdgpu_fence_driver_hw_fini(adev); 4080 4081 if (adev->mman.initialized) 4082 drain_workqueue(adev->mman.bdev.wq); 4083 4084 if (adev->pm.sysfs_initialized) 4085 amdgpu_pm_sysfs_fini(adev); 4086 if (adev->ucode_sysfs_en) 4087 amdgpu_ucode_sysfs_fini(adev); 4088 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4089 4090 /* disable ras feature must before hw fini */ 4091 amdgpu_ras_pre_fini(adev); 4092 4093 amdgpu_device_ip_fini_early(adev); 4094 4095 amdgpu_irq_fini_hw(adev); 4096 4097 if (adev->mman.initialized) 4098 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4099 4100 amdgpu_gart_dummy_page_fini(adev); 4101 4102 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4103 amdgpu_device_unmap_mmio(adev); 4104 4105 } 4106 4107 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4108 { 4109 int idx; 4110 bool px; 4111 4112 amdgpu_fence_driver_sw_fini(adev); 4113 amdgpu_device_ip_fini(adev); 4114 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4115 adev->accel_working = false; 4116 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4117 4118 amdgpu_reset_fini(adev); 4119 4120 /* free i2c buses */ 4121 if (!amdgpu_device_has_dc_support(adev)) 4122 amdgpu_i2c_fini(adev); 4123 4124 if (amdgpu_emu_mode != 1) 4125 amdgpu_atombios_fini(adev); 4126 4127 kfree(adev->bios); 4128 adev->bios = NULL; 4129 4130 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4131 4132 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 4133 apple_gmux_detect(NULL, NULL))) 4134 vga_switcheroo_unregister_client(adev->pdev); 4135 4136 if (px) 4137 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4138 4139 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4140 vga_client_unregister(adev->pdev); 4141 4142 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4143 4144 iounmap(adev->rmmio); 4145 adev->rmmio = NULL; 4146 amdgpu_doorbell_fini(adev); 4147 drm_dev_exit(idx); 4148 } 4149 4150 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4151 amdgpu_pmu_fini(adev); 4152 if (adev->mman.discovery_bin) 4153 amdgpu_discovery_fini(adev); 4154 4155 amdgpu_reset_put_reset_domain(adev->reset_domain); 4156 adev->reset_domain = NULL; 4157 4158 kfree(adev->pci_state); 4159 4160 } 4161 4162 /** 4163 * amdgpu_device_evict_resources - evict device resources 4164 * @adev: amdgpu device object 4165 * 4166 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4167 * of the vram memory type. Mainly used for evicting device resources 4168 * at suspend time. 
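 * Returns 0 directly for APUs entering S3 or S0ix, otherwise the result
 * of evicting the TTM_PL_VRAM resources.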
4169 * 4170 */ 4171 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4172 { 4173 int ret; 4174 4175 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4176 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4177 return 0; 4178 4179 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4180 if (ret) 4181 DRM_WARN("evicting device resources failed\n"); 4182 return ret; 4183 } 4184 4185 /* 4186 * Suspend & resume. 4187 */ 4188 /** 4189 * amdgpu_device_suspend - initiate device suspend 4190 * 4191 * @dev: drm dev pointer 4192 * @fbcon : notify the fbdev of suspend 4193 * 4194 * Puts the hw in the suspend state (all asics). 4195 * Returns 0 for success or an error on failure. 4196 * Called at driver suspend. 4197 */ 4198 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4199 { 4200 struct amdgpu_device *adev = drm_to_adev(dev); 4201 int r = 0; 4202 4203 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4204 return 0; 4205 4206 adev->in_suspend = true; 4207 4208 /* Evict the majority of BOs before grabbing the full access */ 4209 r = amdgpu_device_evict_resources(adev); 4210 if (r) 4211 return r; 4212 4213 if (amdgpu_sriov_vf(adev)) { 4214 amdgpu_virt_fini_data_exchange(adev); 4215 r = amdgpu_virt_request_full_gpu(adev, false); 4216 if (r) 4217 return r; 4218 } 4219 4220 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4221 DRM_WARN("smart shift update failed\n"); 4222 4223 if (fbcon) 4224 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4225 4226 cancel_delayed_work_sync(&adev->delayed_init_work); 4227 4228 amdgpu_ras_suspend(adev); 4229 4230 amdgpu_device_ip_suspend_phase1(adev); 4231 4232 if (!adev->in_s0ix) 4233 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4234 4235 r = amdgpu_device_evict_resources(adev); 4236 if (r) 4237 return r; 4238 4239 amdgpu_fence_driver_hw_fini(adev); 4240 4241 amdgpu_device_ip_suspend_phase2(adev); 4242 4243 if (amdgpu_sriov_vf(adev)) 4244 amdgpu_virt_release_full_gpu(adev, false); 4245 4246 return 0; 4247 } 4248 4249 /** 4250 * amdgpu_device_resume - initiate device resume 4251 * 4252 * @dev: drm dev pointer 4253 * @fbcon : notify the fbdev of resume 4254 * 4255 * Bring the hw back to operating state (all asics). 4256 * Returns 0 for success or an error on failure. 4257 * Called at driver resume. 
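 * Under SR-IOV, full GPU access is requested from the host before the
 * hardware is touched and released again once resume (or its error
 * path) has finished.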
4258 */ 4259 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4260 { 4261 struct amdgpu_device *adev = drm_to_adev(dev); 4262 int r = 0; 4263 4264 if (amdgpu_sriov_vf(adev)) { 4265 r = amdgpu_virt_request_full_gpu(adev, true); 4266 if (r) 4267 return r; 4268 } 4269 4270 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4271 return 0; 4272 4273 if (adev->in_s0ix) 4274 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4275 4276 /* post card */ 4277 if (amdgpu_device_need_post(adev)) { 4278 r = amdgpu_device_asic_init(adev); 4279 if (r) 4280 dev_err(adev->dev, "amdgpu asic init failed\n"); 4281 } 4282 4283 r = amdgpu_device_ip_resume(adev); 4284 4285 if (r) { 4286 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4287 goto exit; 4288 } 4289 amdgpu_fence_driver_hw_init(adev); 4290 4291 r = amdgpu_device_ip_late_init(adev); 4292 if (r) 4293 goto exit; 4294 4295 queue_delayed_work(system_wq, &adev->delayed_init_work, 4296 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4297 4298 if (!adev->in_s0ix) { 4299 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4300 if (r) 4301 goto exit; 4302 } 4303 4304 exit: 4305 if (amdgpu_sriov_vf(adev)) { 4306 amdgpu_virt_init_data_exchange(adev); 4307 amdgpu_virt_release_full_gpu(adev, true); 4308 } 4309 4310 if (r) 4311 return r; 4312 4313 /* Make sure IB tests flushed */ 4314 flush_delayed_work(&adev->delayed_init_work); 4315 4316 if (fbcon) 4317 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4318 4319 amdgpu_ras_resume(adev); 4320 4321 if (adev->mode_info.num_crtc) { 4322 /* 4323 * Most of the connector probing functions try to acquire runtime pm 4324 * refs to ensure that the GPU is powered on when connector polling is 4325 * performed. Since we're calling this from a runtime PM callback, 4326 * trying to acquire rpm refs will cause us to deadlock. 4327 * 4328 * Since we're guaranteed to be holding the rpm lock, it's safe to 4329 * temporarily disable the rpm helpers so this doesn't deadlock us. 4330 */ 4331 #ifdef CONFIG_PM 4332 dev->dev->power.disable_depth++; 4333 #endif 4334 if (!adev->dc_enabled) 4335 drm_helper_hpd_irq_event(dev); 4336 else 4337 drm_kms_helper_hotplug_event(dev); 4338 #ifdef CONFIG_PM 4339 dev->dev->power.disable_depth--; 4340 #endif 4341 } 4342 adev->in_suspend = false; 4343 4344 if (adev->enable_mes) 4345 amdgpu_mes_self_test(adev); 4346 4347 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4348 DRM_WARN("smart shift update failed\n"); 4349 4350 return 0; 4351 } 4352 4353 /** 4354 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4355 * 4356 * @adev: amdgpu_device pointer 4357 * 4358 * The list of all the hardware IPs that make up the asic is walked and 4359 * the check_soft_reset callbacks are run. check_soft_reset determines 4360 * if the asic is still hung or not. 4361 * Returns true if any of the IPs are still in a hung state, false if not. 
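 * Together with the pre_soft_reset, soft_reset and post_soft_reset
 * helpers below, this forms the per-IP soft reset path that
 * amdgpu_device_pre_asic_reset() tries before falling back to a full
 * ASIC reset.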
4362 */ 4363 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4364 { 4365 int i; 4366 bool asic_hang = false; 4367 4368 if (amdgpu_sriov_vf(adev)) 4369 return true; 4370 4371 if (amdgpu_asic_need_full_reset(adev)) 4372 return true; 4373 4374 for (i = 0; i < adev->num_ip_blocks; i++) { 4375 if (!adev->ip_blocks[i].status.valid) 4376 continue; 4377 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4378 adev->ip_blocks[i].status.hang = 4379 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4380 if (adev->ip_blocks[i].status.hang) { 4381 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4382 asic_hang = true; 4383 } 4384 } 4385 return asic_hang; 4386 } 4387 4388 /** 4389 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4390 * 4391 * @adev: amdgpu_device pointer 4392 * 4393 * The list of all the hardware IPs that make up the asic is walked and the 4394 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4395 * handles any IP specific hardware or software state changes that are 4396 * necessary for a soft reset to succeed. 4397 * Returns 0 on success, negative error code on failure. 4398 */ 4399 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4400 { 4401 int i, r = 0; 4402 4403 for (i = 0; i < adev->num_ip_blocks; i++) { 4404 if (!adev->ip_blocks[i].status.valid) 4405 continue; 4406 if (adev->ip_blocks[i].status.hang && 4407 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4408 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4409 if (r) 4410 return r; 4411 } 4412 } 4413 4414 return 0; 4415 } 4416 4417 /** 4418 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4419 * 4420 * @adev: amdgpu_device pointer 4421 * 4422 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4423 * reset is necessary to recover. 4424 * Returns true if a full asic reset is required, false if not. 4425 */ 4426 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4427 { 4428 int i; 4429 4430 if (amdgpu_asic_need_full_reset(adev)) 4431 return true; 4432 4433 for (i = 0; i < adev->num_ip_blocks; i++) { 4434 if (!adev->ip_blocks[i].status.valid) 4435 continue; 4436 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4437 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4438 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4439 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4440 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4441 if (adev->ip_blocks[i].status.hang) { 4442 dev_info(adev->dev, "Some block need full reset!\n"); 4443 return true; 4444 } 4445 } 4446 } 4447 return false; 4448 } 4449 4450 /** 4451 * amdgpu_device_ip_soft_reset - do a soft reset 4452 * 4453 * @adev: amdgpu_device pointer 4454 * 4455 * The list of all the hardware IPs that make up the asic is walked and the 4456 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4457 * IP specific hardware or software state changes that are necessary to soft 4458 * reset the IP. 4459 * Returns 0 on success, negative error code on failure. 
4460 */ 4461 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4462 { 4463 int i, r = 0; 4464 4465 for (i = 0; i < adev->num_ip_blocks; i++) { 4466 if (!adev->ip_blocks[i].status.valid) 4467 continue; 4468 if (adev->ip_blocks[i].status.hang && 4469 adev->ip_blocks[i].version->funcs->soft_reset) { 4470 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4471 if (r) 4472 return r; 4473 } 4474 } 4475 4476 return 0; 4477 } 4478 4479 /** 4480 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4481 * 4482 * @adev: amdgpu_device pointer 4483 * 4484 * The list of all the hardware IPs that make up the asic is walked and the 4485 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4486 * handles any IP specific hardware or software state changes that are 4487 * necessary after the IP has been soft reset. 4488 * Returns 0 on success, negative error code on failure. 4489 */ 4490 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4491 { 4492 int i, r = 0; 4493 4494 for (i = 0; i < adev->num_ip_blocks; i++) { 4495 if (!adev->ip_blocks[i].status.valid) 4496 continue; 4497 if (adev->ip_blocks[i].status.hang && 4498 adev->ip_blocks[i].version->funcs->post_soft_reset) 4499 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4500 if (r) 4501 return r; 4502 } 4503 4504 return 0; 4505 } 4506 4507 /** 4508 * amdgpu_device_recover_vram - Recover some VRAM contents 4509 * 4510 * @adev: amdgpu_device pointer 4511 * 4512 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4513 * restore things like GPUVM page tables after a GPU reset where 4514 * the contents of VRAM might be lost. 4515 * 4516 * Returns: 4517 * 0 on success, negative error code on failure. 4518 */ 4519 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4520 { 4521 struct dma_fence *fence = NULL, *next = NULL; 4522 struct amdgpu_bo *shadow; 4523 struct amdgpu_bo_vm *vmbo; 4524 long r = 1, tmo; 4525 4526 if (amdgpu_sriov_runtime(adev)) 4527 tmo = msecs_to_jiffies(8000); 4528 else 4529 tmo = msecs_to_jiffies(100); 4530 4531 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4532 mutex_lock(&adev->shadow_list_lock); 4533 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4534 /* If vm is compute context or adev is APU, shadow will be NULL */ 4535 if (!vmbo->shadow) 4536 continue; 4537 shadow = vmbo->shadow; 4538 4539 /* No need to recover an evicted BO */ 4540 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4541 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4542 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4543 continue; 4544 4545 r = amdgpu_bo_restore_shadow(shadow, &next); 4546 if (r) 4547 break; 4548 4549 if (fence) { 4550 tmo = dma_fence_wait_timeout(fence, false, tmo); 4551 dma_fence_put(fence); 4552 fence = next; 4553 if (tmo == 0) { 4554 r = -ETIMEDOUT; 4555 break; 4556 } else if (tmo < 0) { 4557 r = tmo; 4558 break; 4559 } 4560 } else { 4561 fence = next; 4562 } 4563 } 4564 mutex_unlock(&adev->shadow_list_lock); 4565 4566 if (fence) 4567 tmo = dma_fence_wait_timeout(fence, false, tmo); 4568 dma_fence_put(fence); 4569 4570 if (r < 0 || tmo <= 0) { 4571 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4572 return -EIO; 4573 } 4574 4575 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4576 return 0; 4577 } 4578 4579 4580 /** 4581 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4582 * 4583 * @adev: amdgpu_device pointer 4584 * 
@from_hypervisor: request from hypervisor 4585 * 4586 * do VF FLR and reinitialize Asic 4587 * return 0 means succeeded otherwise failed 4588 */ 4589 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4590 bool from_hypervisor) 4591 { 4592 int r; 4593 struct amdgpu_hive_info *hive = NULL; 4594 int retry_limit = 0; 4595 4596 retry: 4597 amdgpu_amdkfd_pre_reset(adev); 4598 4599 if (from_hypervisor) 4600 r = amdgpu_virt_request_full_gpu(adev, true); 4601 else 4602 r = amdgpu_virt_reset_gpu(adev); 4603 if (r) 4604 return r; 4605 4606 /* some sw clean up VF needs to do before recover */ 4607 amdgpu_virt_post_reset(adev); 4608 4609 /* Resume IP prior to SMC */ 4610 r = amdgpu_device_ip_reinit_early_sriov(adev); 4611 if (r) 4612 goto error; 4613 4614 amdgpu_virt_init_data_exchange(adev); 4615 4616 r = amdgpu_device_fw_loading(adev); 4617 if (r) 4618 return r; 4619 4620 /* now we are okay to resume SMC/CP/SDMA */ 4621 r = amdgpu_device_ip_reinit_late_sriov(adev); 4622 if (r) 4623 goto error; 4624 4625 hive = amdgpu_get_xgmi_hive(adev); 4626 /* Update PSP FW topology after reset */ 4627 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4628 r = amdgpu_xgmi_update_topology(hive, adev); 4629 4630 if (hive) 4631 amdgpu_put_xgmi_hive(hive); 4632 4633 if (!r) { 4634 amdgpu_irq_gpu_reset_resume_helper(adev); 4635 r = amdgpu_ib_ring_tests(adev); 4636 4637 amdgpu_amdkfd_post_reset(adev); 4638 } 4639 4640 error: 4641 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4642 amdgpu_inc_vram_lost(adev); 4643 r = amdgpu_device_recover_vram(adev); 4644 } 4645 amdgpu_virt_release_full_gpu(adev, true); 4646 4647 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4648 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4649 retry_limit++; 4650 goto retry; 4651 } else 4652 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4653 } 4654 4655 return r; 4656 } 4657 4658 /** 4659 * amdgpu_device_has_job_running - check if there is any job in mirror list 4660 * 4661 * @adev: amdgpu_device pointer 4662 * 4663 * check if there is any job in mirror list 4664 */ 4665 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4666 { 4667 int i; 4668 struct drm_sched_job *job; 4669 4670 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4671 struct amdgpu_ring *ring = adev->rings[i]; 4672 4673 if (!ring || !ring->sched.thread) 4674 continue; 4675 4676 spin_lock(&ring->sched.job_list_lock); 4677 job = list_first_entry_or_null(&ring->sched.pending_list, 4678 struct drm_sched_job, list); 4679 spin_unlock(&ring->sched.job_list_lock); 4680 if (job) 4681 return true; 4682 } 4683 return false; 4684 } 4685 4686 /** 4687 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4688 * 4689 * @adev: amdgpu_device pointer 4690 * 4691 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4692 * a hung GPU. 
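 * A value of 0 disables recovery outright and the default of -1 enables
 * it except on the ASICs listed below; SR-IOV VFs and devices without
 * RAS poison mode support otherwise always opt in.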
4693 */ 4694 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4695 { 4696 4697 if (amdgpu_gpu_recovery == 0) 4698 goto disabled; 4699 4700 /* Skip soft reset check in fatal error mode */ 4701 if (!amdgpu_ras_is_poison_mode_supported(adev)) 4702 return true; 4703 4704 if (amdgpu_sriov_vf(adev)) 4705 return true; 4706 4707 if (amdgpu_gpu_recovery == -1) { 4708 switch (adev->asic_type) { 4709 #ifdef CONFIG_DRM_AMDGPU_SI 4710 case CHIP_VERDE: 4711 case CHIP_TAHITI: 4712 case CHIP_PITCAIRN: 4713 case CHIP_OLAND: 4714 case CHIP_HAINAN: 4715 #endif 4716 #ifdef CONFIG_DRM_AMDGPU_CIK 4717 case CHIP_KAVERI: 4718 case CHIP_KABINI: 4719 case CHIP_MULLINS: 4720 #endif 4721 case CHIP_CARRIZO: 4722 case CHIP_STONEY: 4723 case CHIP_CYAN_SKILLFISH: 4724 goto disabled; 4725 default: 4726 break; 4727 } 4728 } 4729 4730 return true; 4731 4732 disabled: 4733 dev_info(adev->dev, "GPU recovery disabled.\n"); 4734 return false; 4735 } 4736 4737 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4738 { 4739 u32 i; 4740 int ret = 0; 4741 4742 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4743 4744 dev_info(adev->dev, "GPU mode1 reset\n"); 4745 4746 /* disable BM */ 4747 pci_clear_master(adev->pdev); 4748 4749 amdgpu_device_cache_pci_state(adev->pdev); 4750 4751 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4752 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4753 ret = amdgpu_dpm_mode1_reset(adev); 4754 } else { 4755 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4756 ret = psp_gpu_reset(adev); 4757 } 4758 4759 if (ret) 4760 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4761 4762 amdgpu_device_load_pci_state(adev->pdev); 4763 4764 /* wait for asic to come out of reset */ 4765 for (i = 0; i < adev->usec_timeout; i++) { 4766 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4767 4768 if (memsize != 0xffffffff) 4769 break; 4770 udelay(1); 4771 } 4772 4773 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4774 return ret; 4775 } 4776 4777 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4778 struct amdgpu_reset_context *reset_context) 4779 { 4780 int i, r = 0; 4781 struct amdgpu_job *job = NULL; 4782 bool need_full_reset = 4783 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4784 4785 if (reset_context->reset_req_dev == adev) 4786 job = reset_context->job; 4787 4788 if (amdgpu_sriov_vf(adev)) { 4789 /* stop the data exchange thread */ 4790 amdgpu_virt_fini_data_exchange(adev); 4791 } 4792 4793 amdgpu_fence_driver_isr_toggle(adev, true); 4794 4795 /* block all schedulers and reset given job's ring */ 4796 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4797 struct amdgpu_ring *ring = adev->rings[i]; 4798 4799 if (!ring || !ring->sched.thread) 4800 continue; 4801 4802 /* Clear job fence from fence drv to avoid force_completion 4803 * leave NULL and vm flush fence in fence drv 4804 */ 4805 amdgpu_fence_driver_clear_job_fences(ring); 4806 4807 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4808 amdgpu_fence_driver_force_completion(ring); 4809 } 4810 4811 amdgpu_fence_driver_isr_toggle(adev, false); 4812 4813 if (job && job->vm) 4814 drm_sched_increase_karma(&job->base); 4815 4816 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4817 /* If reset handler not implemented, continue; otherwise return */ 4818 if (r == -EOPNOTSUPP) 4819 r = 0; 4820 else 4821 return r; 4822 4823 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4824 if (!amdgpu_sriov_vf(adev)) { 4825 4826 if (!need_full_reset) 4827 
need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4828 4829 if (!need_full_reset && amdgpu_gpu_recovery && 4830 amdgpu_device_ip_check_soft_reset(adev)) { 4831 amdgpu_device_ip_pre_soft_reset(adev); 4832 r = amdgpu_device_ip_soft_reset(adev); 4833 amdgpu_device_ip_post_soft_reset(adev); 4834 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4835 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4836 need_full_reset = true; 4837 } 4838 } 4839 4840 if (need_full_reset) 4841 r = amdgpu_device_ip_suspend(adev); 4842 if (need_full_reset) 4843 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4844 else 4845 clear_bit(AMDGPU_NEED_FULL_RESET, 4846 &reset_context->flags); 4847 } 4848 4849 return r; 4850 } 4851 4852 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 4853 { 4854 int i; 4855 4856 lockdep_assert_held(&adev->reset_domain->sem); 4857 4858 for (i = 0; i < adev->num_regs; i++) { 4859 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); 4860 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], 4861 adev->reset_dump_reg_value[i]); 4862 } 4863 4864 return 0; 4865 } 4866 4867 #ifdef CONFIG_DEV_COREDUMP 4868 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, 4869 size_t count, void *data, size_t datalen) 4870 { 4871 struct drm_printer p; 4872 struct amdgpu_device *adev = data; 4873 struct drm_print_iterator iter; 4874 int i; 4875 4876 iter.data = buffer; 4877 iter.offset = 0; 4878 iter.start = offset; 4879 iter.remain = count; 4880 4881 p = drm_coredump_printer(&iter); 4882 4883 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 4884 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 4885 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 4886 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec); 4887 if (adev->reset_task_info.pid) 4888 drm_printf(&p, "process_name: %s PID: %d\n", 4889 adev->reset_task_info.process_name, 4890 adev->reset_task_info.pid); 4891 4892 if (adev->reset_vram_lost) 4893 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 4894 if (adev->num_regs) { 4895 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); 4896 4897 for (i = 0; i < adev->num_regs; i++) 4898 drm_printf(&p, "0x%08x: 0x%08x\n", 4899 adev->reset_dump_reg_list[i], 4900 adev->reset_dump_reg_value[i]); 4901 } 4902 4903 return count - iter.remain; 4904 } 4905 4906 static void amdgpu_devcoredump_free(void *data) 4907 { 4908 } 4909 4910 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) 4911 { 4912 struct drm_device *dev = adev_to_drm(adev); 4913 4914 ktime_get_ts64(&adev->reset_time); 4915 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL, 4916 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 4917 } 4918 #endif 4919 4920 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4921 struct amdgpu_reset_context *reset_context) 4922 { 4923 struct amdgpu_device *tmp_adev = NULL; 4924 bool need_full_reset, skip_hw_reset, vram_lost = false; 4925 int r = 0; 4926 bool gpu_reset_for_dev_remove = 0; 4927 4928 /* Try reset handler method first */ 4929 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4930 reset_list); 4931 amdgpu_reset_reg_dumps(tmp_adev); 4932 4933 reset_context->reset_device_list = device_list_handle; 4934 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4935 /* If reset handler not implemented, continue; otherwise return */ 4936 if (r == -EOPNOTSUPP) 4937 r = 0; 4938 else 4939 return r; 4940 4941 /* Reset handler not 
implemented, use the default method */ 4942 need_full_reset = 4943 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4944 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4945 4946 gpu_reset_for_dev_remove = 4947 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 4948 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4949 4950 /* 4951 * ASIC reset has to be done on all XGMI hive nodes ASAP 4952 * to allow proper links negotiation in FW (within 1 sec) 4953 */ 4954 if (!skip_hw_reset && need_full_reset) { 4955 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4956 /* For XGMI run all resets in parallel to speed up the process */ 4957 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4958 tmp_adev->gmc.xgmi.pending_reset = false; 4959 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4960 r = -EALREADY; 4961 } else 4962 r = amdgpu_asic_reset(tmp_adev); 4963 4964 if (r) { 4965 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4966 r, adev_to_drm(tmp_adev)->unique); 4967 break; 4968 } 4969 } 4970 4971 /* For XGMI wait for all resets to complete before proceed */ 4972 if (!r) { 4973 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4974 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4975 flush_work(&tmp_adev->xgmi_reset_work); 4976 r = tmp_adev->asic_reset_res; 4977 if (r) 4978 break; 4979 } 4980 } 4981 } 4982 } 4983 4984 if (!r && amdgpu_ras_intr_triggered()) { 4985 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4986 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && 4987 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 4988 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 4989 } 4990 4991 amdgpu_ras_intr_cleared(); 4992 } 4993 4994 /* Since the mode1 reset affects base ip blocks, the 4995 * phase1 ip blocks need to be resumed. Otherwise there 4996 * will be a BIOS signature error and the psp bootloader 4997 * can't load kdb on the next amdgpu install. 
4998 */ 4999 if (gpu_reset_for_dev_remove) { 5000 list_for_each_entry(tmp_adev, device_list_handle, reset_list) 5001 amdgpu_device_ip_resume_phase1(tmp_adev); 5002 5003 goto end; 5004 } 5005 5006 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5007 if (need_full_reset) { 5008 /* post card */ 5009 r = amdgpu_device_asic_init(tmp_adev); 5010 if (r) { 5011 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5012 } else { 5013 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5014 r = amdgpu_amdkfd_resume_iommu(tmp_adev); 5015 if (r) 5016 goto out; 5017 5018 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5019 if (r) 5020 goto out; 5021 5022 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5023 #ifdef CONFIG_DEV_COREDUMP 5024 tmp_adev->reset_vram_lost = vram_lost; 5025 memset(&tmp_adev->reset_task_info, 0, 5026 sizeof(tmp_adev->reset_task_info)); 5027 if (reset_context->job && reset_context->job->vm) 5028 tmp_adev->reset_task_info = 5029 reset_context->job->vm->task_info; 5030 amdgpu_reset_capture_coredumpm(tmp_adev); 5031 #endif 5032 if (vram_lost) { 5033 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5034 amdgpu_inc_vram_lost(tmp_adev); 5035 } 5036 5037 r = amdgpu_device_fw_loading(tmp_adev); 5038 if (r) 5039 return r; 5040 5041 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5042 if (r) 5043 goto out; 5044 5045 if (vram_lost) 5046 amdgpu_device_fill_reset_magic(tmp_adev); 5047 5048 /* 5049 * Add this ASIC as tracked as reset was already 5050 * complete successfully. 5051 */ 5052 amdgpu_register_gpu_instance(tmp_adev); 5053 5054 if (!reset_context->hive && 5055 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5056 amdgpu_xgmi_add_device(tmp_adev); 5057 5058 r = amdgpu_device_ip_late_init(tmp_adev); 5059 if (r) 5060 goto out; 5061 5062 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5063 5064 /* 5065 * The GPU enters bad state once faulty pages 5066 * by ECC has reached the threshold, and ras 5067 * recovery is scheduled next. So add one check 5068 * here to break recovery if it indeed exceeds 5069 * bad page threshold, and remind user to 5070 * retire this GPU or setting one bigger 5071 * bad_page_threshold value to fix this once 5072 * probing driver again. 5073 */ 5074 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5075 /* must succeed. 
*/ 5076 amdgpu_ras_resume(tmp_adev); 5077 } else { 5078 r = -EINVAL; 5079 goto out; 5080 } 5081 5082 /* Update PSP FW topology after reset */ 5083 if (reset_context->hive && 5084 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5085 r = amdgpu_xgmi_update_topology( 5086 reset_context->hive, tmp_adev); 5087 } 5088 } 5089 5090 out: 5091 if (!r) { 5092 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5093 r = amdgpu_ib_ring_tests(tmp_adev); 5094 if (r) { 5095 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5096 need_full_reset = true; 5097 r = -EAGAIN; 5098 goto end; 5099 } 5100 } 5101 5102 if (!r) 5103 r = amdgpu_device_recover_vram(tmp_adev); 5104 else 5105 tmp_adev->asic_reset_res = r; 5106 } 5107 5108 end: 5109 if (need_full_reset) 5110 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5111 else 5112 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5113 return r; 5114 } 5115 5116 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5117 { 5118 5119 switch (amdgpu_asic_reset_method(adev)) { 5120 case AMD_RESET_METHOD_MODE1: 5121 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5122 break; 5123 case AMD_RESET_METHOD_MODE2: 5124 adev->mp1_state = PP_MP1_STATE_RESET; 5125 break; 5126 default: 5127 adev->mp1_state = PP_MP1_STATE_NONE; 5128 break; 5129 } 5130 } 5131 5132 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5133 { 5134 amdgpu_vf_error_trans_all(adev); 5135 adev->mp1_state = PP_MP1_STATE_NONE; 5136 } 5137 5138 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5139 { 5140 struct pci_dev *p = NULL; 5141 5142 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5143 adev->pdev->bus->number, 1); 5144 if (p) { 5145 pm_runtime_enable(&(p->dev)); 5146 pm_runtime_resume(&(p->dev)); 5147 } 5148 5149 pci_dev_put(p); 5150 } 5151 5152 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5153 { 5154 enum amd_reset_method reset_method; 5155 struct pci_dev *p = NULL; 5156 u64 expires; 5157 5158 /* 5159 * For now, only BACO and mode1 reset are confirmed 5160 * to suffer the audio issue without proper suspended. 5161 */ 5162 reset_method = amdgpu_asic_reset_method(adev); 5163 if ((reset_method != AMD_RESET_METHOD_BACO) && 5164 (reset_method != AMD_RESET_METHOD_MODE1)) 5165 return -EINVAL; 5166 5167 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5168 adev->pdev->bus->number, 1); 5169 if (!p) 5170 return -ENODEV; 5171 5172 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5173 if (!expires) 5174 /* 5175 * If we cannot get the audio device autosuspend delay, 5176 * a fixed 4S interval will be used. Considering 3S is 5177 * the audio controller default autosuspend delay setting. 5178 * 4S used here is guaranteed to cover that. 5179 */ 5180 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5181 5182 while (!pm_runtime_status_suspended(&(p->dev))) { 5183 if (!pm_runtime_suspend(&(p->dev))) 5184 break; 5185 5186 if (expires < ktime_get_mono_fast_ns()) { 5187 dev_warn(adev->dev, "failed to suspend display audio\n"); 5188 pci_dev_put(p); 5189 /* TODO: abort the succeeding gpu reset? 
*/ 5190 return -ETIMEDOUT; 5191 } 5192 } 5193 5194 pm_runtime_disable(&(p->dev)); 5195 5196 pci_dev_put(p); 5197 return 0; 5198 } 5199 5200 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5201 { 5202 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5203 5204 #if defined(CONFIG_DEBUG_FS) 5205 if (!amdgpu_sriov_vf(adev)) 5206 cancel_work(&adev->reset_work); 5207 #endif 5208 5209 if (adev->kfd.dev) 5210 cancel_work(&adev->kfd.reset_work); 5211 5212 if (amdgpu_sriov_vf(adev)) 5213 cancel_work(&adev->virt.flr_work); 5214 5215 if (con && adev->ras_enabled) 5216 cancel_work(&con->recovery_work); 5217 5218 } 5219 5220 /** 5221 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5222 * 5223 * @adev: amdgpu_device pointer 5224 * @job: which job trigger hang 5225 * @reset_context: amdgpu reset context pointer 5226 * 5227 * Attempt to reset the GPU if it has hung (all asics). 5228 * Attempt to do soft-reset or full-reset and reinitialize Asic 5229 * Returns 0 for success or an error on failure. 5230 */ 5231 5232 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5233 struct amdgpu_job *job, 5234 struct amdgpu_reset_context *reset_context) 5235 { 5236 struct list_head device_list, *device_list_handle = NULL; 5237 bool job_signaled = false; 5238 struct amdgpu_hive_info *hive = NULL; 5239 struct amdgpu_device *tmp_adev = NULL; 5240 int i, r = 0; 5241 bool need_emergency_restart = false; 5242 bool audio_suspended = false; 5243 bool gpu_reset_for_dev_remove = false; 5244 5245 gpu_reset_for_dev_remove = 5246 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5247 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5248 5249 /* 5250 * Special case: RAS triggered and full reset isn't supported 5251 */ 5252 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5253 5254 /* 5255 * Flush RAM to disk so that after reboot 5256 * the user can read log and see why the system rebooted. 5257 */ 5258 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5259 DRM_WARN("Emergency reboot."); 5260 5261 ksys_sync_helper(); 5262 emergency_restart(); 5263 } 5264 5265 dev_info(adev->dev, "GPU %s begin!\n", 5266 need_emergency_restart ? "jobs stop":"reset"); 5267 5268 if (!amdgpu_sriov_vf(adev)) 5269 hive = amdgpu_get_xgmi_hive(adev); 5270 if (hive) 5271 mutex_lock(&hive->hive_lock); 5272 5273 reset_context->job = job; 5274 reset_context->hive = hive; 5275 /* 5276 * Build list of devices to reset. 5277 * In case we are in XGMI hive mode, resort the device list 5278 * to put adev in the 1st position. 
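 * The first entry of the list is the one used to lock the reset domain
 * and to drive the reset handler for the whole hive, so the device that
 * actually hit the hang has to lead the list.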
5279 */ 5280 INIT_LIST_HEAD(&device_list); 5281 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5282 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5283 list_add_tail(&tmp_adev->reset_list, &device_list); 5284 if (gpu_reset_for_dev_remove && adev->shutdown) 5285 tmp_adev->shutdown = true; 5286 } 5287 if (!list_is_first(&adev->reset_list, &device_list)) 5288 list_rotate_to_front(&adev->reset_list, &device_list); 5289 device_list_handle = &device_list; 5290 } else { 5291 list_add_tail(&adev->reset_list, &device_list); 5292 device_list_handle = &device_list; 5293 } 5294 5295 /* We need to lock reset domain only once both for XGMI and single device */ 5296 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5297 reset_list); 5298 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5299 5300 /* block all schedulers and reset given job's ring */ 5301 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5302 5303 amdgpu_device_set_mp1_state(tmp_adev); 5304 5305 /* 5306 * Try to put the audio codec into suspend state 5307 * before gpu reset started. 5308 * 5309 * Due to the power domain of the graphics device 5310 * is shared with AZ power domain. Without this, 5311 * we may change the audio hardware from behind 5312 * the audio driver's back. That will trigger 5313 * some audio codec errors. 5314 */ 5315 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5316 audio_suspended = true; 5317 5318 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5319 5320 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5321 5322 if (!amdgpu_sriov_vf(tmp_adev)) 5323 amdgpu_amdkfd_pre_reset(tmp_adev); 5324 5325 /* 5326 * Mark these ASICs to be reseted as untracked first 5327 * And add them back after reset completed 5328 */ 5329 amdgpu_unregister_gpu_instance(tmp_adev); 5330 5331 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5332 5333 /* disable ras on ALL IPs */ 5334 if (!need_emergency_restart && 5335 amdgpu_device_ip_need_full_reset(tmp_adev)) 5336 amdgpu_ras_suspend(tmp_adev); 5337 5338 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5339 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5340 5341 if (!ring || !ring->sched.thread) 5342 continue; 5343 5344 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5345 5346 if (need_emergency_restart) 5347 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5348 } 5349 atomic_inc(&tmp_adev->gpu_reset_counter); 5350 } 5351 5352 if (need_emergency_restart) 5353 goto skip_sched_resume; 5354 5355 /* 5356 * Must check guilty signal here since after this point all old 5357 * HW fences are force signaled. 5358 * 5359 * job->base holds a reference to parent fence 5360 */ 5361 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5362 job_signaled = true; 5363 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5364 goto skip_hw_reset; 5365 } 5366 5367 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5368 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5369 if (gpu_reset_for_dev_remove) { 5370 /* Workaroud for ASICs need to disable SMC first */ 5371 amdgpu_device_smu_fini_early(tmp_adev); 5372 } 5373 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5374 /*TODO Should we stop ?*/ 5375 if (r) { 5376 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5377 r, adev_to_drm(tmp_adev)->unique); 5378 tmp_adev->asic_reset_res = r; 5379 } 5380 5381 /* 5382 * Drop all pending non scheduler resets. 
Scheduler resets 5383 * were already dropped during drm_sched_stop 5384 */ 5385 amdgpu_device_stop_pending_resets(tmp_adev); 5386 } 5387 5388 /* Actual ASIC resets if needed.*/ 5389 /* Host driver will handle XGMI hive reset for SRIOV */ 5390 if (amdgpu_sriov_vf(adev)) { 5391 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5392 if (r) 5393 adev->asic_reset_res = r; 5394 5395 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5396 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) || 5397 adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3)) 5398 amdgpu_ras_resume(adev); 5399 } else { 5400 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5401 if (r && r == -EAGAIN) 5402 goto retry; 5403 5404 if (!r && gpu_reset_for_dev_remove) 5405 goto recover_end; 5406 } 5407 5408 skip_hw_reset: 5409 5410 /* Post ASIC reset for all devs .*/ 5411 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5412 5413 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5414 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5415 5416 if (!ring || !ring->sched.thread) 5417 continue; 5418 5419 drm_sched_start(&ring->sched, true); 5420 } 5421 5422 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3)) 5423 amdgpu_mes_self_test(tmp_adev); 5424 5425 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 5426 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5427 5428 if (tmp_adev->asic_reset_res) 5429 r = tmp_adev->asic_reset_res; 5430 5431 tmp_adev->asic_reset_res = 0; 5432 5433 if (r) { 5434 /* bad news, how to tell it to userspace ? */ 5435 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5436 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5437 } else { 5438 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5439 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5440 DRM_WARN("smart shift update failed\n"); 5441 } 5442 } 5443 5444 skip_sched_resume: 5445 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5446 /* unlock kfd: SRIOV would do it separately */ 5447 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5448 amdgpu_amdkfd_post_reset(tmp_adev); 5449 5450 /* kfd_post_reset will do nothing if kfd device is not initialized, 5451 * need to bring up kfd here if it's not be initialized before 5452 */ 5453 if (!adev->kfd.init_complete) 5454 amdgpu_amdkfd_device_init(adev); 5455 5456 if (audio_suspended) 5457 amdgpu_device_resume_display_audio(tmp_adev); 5458 5459 amdgpu_device_unset_mp1_state(tmp_adev); 5460 5461 amdgpu_ras_set_error_query_ready(tmp_adev, true); 5462 } 5463 5464 recover_end: 5465 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5466 reset_list); 5467 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5468 5469 if (hive) { 5470 mutex_unlock(&hive->hive_lock); 5471 amdgpu_put_xgmi_hive(hive); 5472 } 5473 5474 if (r) 5475 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5476 5477 atomic_set(&adev->reset_domain->reset_res, r); 5478 return r; 5479 } 5480 5481 /** 5482 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 5483 * 5484 * @adev: amdgpu_device pointer 5485 * 5486 * Fetchs and stores in the driver the PCIE capabilities (gen speed 5487 * and lanes) of the slot the device is in. Handles APUs and 5488 * virtualized environments where PCIE config space may not be available. 
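 * The amdgpu_pcie_gen_cap and amdgpu_pcie_lane_cap module parameters,
 * when set, take precedence over detection. Otherwise the gen mask is
 * built from both the device cap (pcie_get_speed_cap()) and the
 * platform cap (pcie_bandwidth_available()); e.g. a Gen3 capable ASIC
 * in a Gen4 capable slot ends up with the ASIC GEN1-GEN3 bits plus the
 * platform GEN1-GEN4 bits set in adev->pm.pcie_gen_mask.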
5489 */ 5490 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5491 { 5492 struct pci_dev *pdev; 5493 enum pci_bus_speed speed_cap, platform_speed_cap; 5494 enum pcie_link_width platform_link_width; 5495 5496 if (amdgpu_pcie_gen_cap) 5497 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5498 5499 if (amdgpu_pcie_lane_cap) 5500 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5501 5502 /* covers APUs as well */ 5503 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 5504 if (adev->pm.pcie_gen_mask == 0) 5505 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5506 if (adev->pm.pcie_mlw_mask == 0) 5507 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5508 return; 5509 } 5510 5511 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5512 return; 5513 5514 pcie_bandwidth_available(adev->pdev, NULL, 5515 &platform_speed_cap, &platform_link_width); 5516 5517 if (adev->pm.pcie_gen_mask == 0) { 5518 /* asic caps */ 5519 pdev = adev->pdev; 5520 speed_cap = pcie_get_speed_cap(pdev); 5521 if (speed_cap == PCI_SPEED_UNKNOWN) { 5522 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5523 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5524 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5525 } else { 5526 if (speed_cap == PCIE_SPEED_32_0GT) 5527 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5528 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5529 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5530 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5531 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5532 else if (speed_cap == PCIE_SPEED_16_0GT) 5533 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5534 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5535 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5536 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5537 else if (speed_cap == PCIE_SPEED_8_0GT) 5538 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5539 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5540 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5541 else if (speed_cap == PCIE_SPEED_5_0GT) 5542 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5543 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5544 else 5545 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5546 } 5547 /* platform caps */ 5548 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5549 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5550 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5551 } else { 5552 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5553 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5554 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5555 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5556 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5557 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5558 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5559 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5560 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5561 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5562 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5563 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5564 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5565 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5566 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5567 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5568 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5569 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5570 else 5571 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5572 5573 } 5574 } 5575 if (adev->pm.pcie_mlw_mask == 0) { 5576 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) 
{ 5577 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5578 } else { 5579 switch (platform_link_width) { 5580 case PCIE_LNK_X32: 5581 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5582 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5583 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5584 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5585 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5586 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5587 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5588 break; 5589 case PCIE_LNK_X16: 5590 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5591 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5592 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5593 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5594 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5595 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5596 break; 5597 case PCIE_LNK_X12: 5598 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5599 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5600 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5601 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5602 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5603 break; 5604 case PCIE_LNK_X8: 5605 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5606 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5607 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5608 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5609 break; 5610 case PCIE_LNK_X4: 5611 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5612 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5613 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5614 break; 5615 case PCIE_LNK_X2: 5616 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5617 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5618 break; 5619 case PCIE_LNK_X1: 5620 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5621 break; 5622 default: 5623 break; 5624 } 5625 } 5626 } 5627 } 5628 5629 /** 5630 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 5631 * 5632 * @adev: amdgpu_device pointer 5633 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 5634 * 5635 * Return true if @peer_adev can access (DMA) @adev through the PCIe 5636 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 5637 * @peer_adev. 5638 */ 5639 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 5640 struct amdgpu_device *peer_adev) 5641 { 5642 #ifdef CONFIG_HSA_AMD_P2P 5643 uint64_t address_mask = peer_adev->dev->dma_mask ? 

int amdgpu_device_baco_enter(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

	return amdgpu_dpm_baco_enter(adev);
}

int amdgpu_device_baco_exit(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	ret = amdgpu_dpm_baco_exit(adev);
	if (ret)
		return ret;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

	if (amdgpu_passthrough(adev) &&
	    adev->nbio.funcs->clear_doorbell_interrupt)
		adev->nbio.funcs->clear_doorbell_interrupt(adev);

	return 0;
}
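
/*
 * Illustrative sketch (hypothetical helper, not part of the driver): a BACO
 * based reset is essentially an enter/exit cycle of the two helpers above.
 */
static int __maybe_unused amdgpu_device_example_baco_cycle(struct drm_device *dev)
{
	int r;

	r = amdgpu_device_baco_enter(dev);
	if (r)
		return r;

	/* Chip is now powered off while the bus stays active; power it back up. */
	return amdgpu_device_baco_exit(dev);
}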

/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called when a PCI error is detected.
 *
 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		DRM_WARN("No support for XGMI hive yet...");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Locking adev->reset_domain->sem will prevent any external access
		 * to GPU during PCI error recovery
		 */
		amdgpu_device_lock_reset_domain(adev->reset_domain);
		amdgpu_device_set_mp1_state(adev);

		/*
		 * Block any work scheduling as we do for regular GPU reset
		 * for the duration of the recovery
		 */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, NULL);
		}
		atomic_inc(&adev->gpu_reset_counter);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{
	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/* This is called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, no need to reset slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}
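
/*
 * Illustrative sketch for the TODO above (hypothetical helper, not part of the
 * driver): once the mmio_enabled callback runs, register reads work again, so
 * a post-mortem dump could look like this for any register offset of interest.
 */
static void __maybe_unused amdgpu_device_example_dump_reg(struct amdgpu_device *adev,
							  u32 reg)
{
	DRM_INFO("post-error dump: reg 0x%08x = 0x%08x\n", reg, RREG32(reg));
}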

/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the pci error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for asic to come out of reset */
	msleep(500);

	/* Restore PCI config space */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d\n", r);
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's OK to
 * resume normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		drm_sched_start(&ring->sched, true);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
}
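
/*
 * Illustrative sketch (not part of this file): the four callbacks above are
 * the standard PCI error recovery hooks and are wired into the PCI driver
 * through a struct pci_error_handlers, roughly like this; the amdgpu driver
 * does the actual hookup elsewhere (amdgpu_drv.c).
 */
static const struct pci_error_handlers __maybe_unused amdgpu_example_pci_err_handler = {
	.error_detected	= amdgpu_pci_error_detected,
	.mmio_enabled	= amdgpu_pci_mmio_enabled,
	.slot_reset	= amdgpu_pci_slot_reset,
	.resume		= amdgpu_pci_resume,
};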

bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}
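
/*
 * Illustrative sketch (hypothetical helper, not part of the driver): the
 * config space is typically cached once while the device is known to be
 * healthy, so that the recovery paths above (e.g. amdgpu_pci_slot_reset())
 * can later restore a known-good copy with amdgpu_device_load_pci_state().
 */
static void __maybe_unused amdgpu_device_example_snapshot_config(struct amdgpu_device *adev)
{
	if (!amdgpu_device_cache_pci_state(adev->pdev))
		DRM_WARN("no PCI config snapshot available for error recovery\n");
}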

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
			     struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
				  struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It helps to preserve the error context when an error occurs.
 * Compared to a simple hang, the system will stay stable at least for SSH
 * access, so it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc),
 *    clears all CPU mappings to the device and disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
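
/*
 * Illustrative sketch (hypothetical helper and register offset, not part of
 * the driver): the two helpers above implement the usual index/data access
 * pattern, so a read-modify-write of a PCIe port register looks like this.
 * Note that the index/data lock is dropped between the read and the write,
 * so the sequence is not atomic with respect to other accessors.
 */
static void __maybe_unused amdgpu_device_example_pcie_port_rmw(struct amdgpu_device *adev,
							       u32 reg, u32 clr, u32 set)
{
	u32 v = amdgpu_device_pcie_port_rreg(adev, reg);

	v = (v & ~clr) | set;
	amdgpu_device_pcie_port_wreg(adev, reg, v);
}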

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!adev->ip_versions[DCE_HWIP][0] ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
				    uint32_t inst, uint32_t reg_addr, char reg_name[],
				    uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				 inst, reg_name, (uint32_t)expected_value,
				 (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}
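
/*
 * Illustrative sketch (hypothetical helper, register name and bit layout, not
 * part of the driver): polling for a status bit with the helper above; a zero
 * return means the masked value matched before the timeout expired.
 */
static uint32_t __maybe_unused amdgpu_device_example_wait_ready(struct amdgpu_device *adev,
								uint32_t status_reg)
{
	/* Wait until bit 0 of the (hypothetical) status register reads as 1. */
	return amdgpu_device_wait_on_rreg(adev, 0, status_reg, "EXAMPLE_STATUS",
					  0x1, 0x1);
}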