/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/devcoredump.h>
#include <generated/utsrelease.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_aperture.h>
#include <drm/drm_atomic_helper.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_fb_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86)
#include <asm/intel-family.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs)
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
		amdgpu_device_get_pcie_replay_count, NULL);

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * DOC: product_name
 *
 * The amdgpu driver provides a sysfs API for reporting the product name
 * for the device
 * The file product_name is used for this and returns the product name
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_name(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->product_name);
}

static DEVICE_ATTR(product_name, S_IRUGO,
		amdgpu_device_get_product_name, NULL);

/**
 * DOC: product_number
 *
 * The amdgpu driver provides a sysfs API for reporting the part number
 * for the device
 * The file product_number is used for this and returns the part number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->product_number);
}

static DEVICE_ATTR(product_number, S_IRUGO,
		amdgpu_device_get_product_number, NULL);

/**
 * DOC: serial_number
 *
 * The amdgpu driver provides a sysfs API for reporting the serial number
 * for the device
 * The file serial_number is used for this and returns the serial number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_serial_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->serial);
}

static DEVICE_ATTR(serial_number, S_IRUGO,
		amdgpu_device_get_serial_number, NULL);

/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise return false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}
/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the VRAM aperture to access VRAM first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of VRAM */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}
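
/*
 * Usage sketch (illustrative only, not called anywhere in this file): reading
 * a few dwords back from VRAM with the helper above. "vram_offset" is a
 * placeholder for any dword-aligned offset; amdgpu_device_vram_access()
 * prefers the CPU-visible aperture and falls back to MM_INDEX/MM_DATA for
 * whatever the aperture cannot cover.
 *
 *	uint32_t tmp[4];
 *
 *	amdgpu_device_vram_access(adev, vram_offset, tmp, sizeof(tmp), false);
 */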
/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 *
 * this function is invoked only for the debugfs register access
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}
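
/*
 * Usage sketch (illustrative only): most callers do not use
 * amdgpu_device_rreg()/amdgpu_device_wreg() directly but go through the
 * RREG32()/WREG32() style macros from amdgpu.h, which expect an "adev" in
 * scope. A typical read-modify-write of a hypothetical register offset
 * SOME_REG with a hypothetical SOME_ENABLE_BIT looks roughly like:
 *
 *	uint32_t tmp;
 *
 *	tmp = RREG32(SOME_REG);
 *	tmp |= SOME_ENABLE_BIT;
 *	WREG32(SOME_REG, tmp);
 */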

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if (adev->nbio.funcs->get_pcie_index_hi_offset)
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if (adev->nbio.funcs->get_pcie_index_hi_offset)
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	amdgpu_asic_pre_asic_init(adev);

	if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) ||
	    adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0))
		return amdgpu_atomfirmware_asic_init(adev, true);
	else
		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}
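
/*
 * Golden register tables handed to amdgpu_device_program_register_sequence()
 * below are flat arrays of {offset, AND mask, OR mask} triplets. A purely
 * illustrative (made-up) table and call would look like:
 *
 *	static const u32 example_golden_settings[] = {
 *		mmSOME_REG, 0xffffffff, 0x00000001,
 *		mmOTHER_REG, 0x0000ff00, 0x00003400,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *						ARRAY_SIZE(example_golden_settings));
 *
 * mmSOME_REG and mmOTHER_REG are placeholders, not real register offsets.
 */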

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or an -error on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}
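
/*
 * Usage sketch (illustrative only): ring and IH code typically grabs a
 * writeback slot at init time and returns it on teardown, roughly:
 *
 *	u32 wb_idx;
 *	int r;
 *
 *	r = amdgpu_device_wb_get(adev, &wb_idx);
 *	if (r)
 *		return r;
 *	...
 *	amdgpu_device_wb_free(adev, wb_idx);
 *
 * The index handed back by amdgpu_device_wb_get() below is a dword offset
 * into adev->wb.wb.
 */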

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) {
		return false;
	}

	return true;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw needs post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if post is needed or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (!amdgpu_device_read_bios(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* For FIJI: In whole-GPU pass-through virtualization, after a VM
		 * reboot some old SMC firmware still needs the driver to do a vPost,
		 * otherwise the GPU hangs. SMC firmware versions above 22.15 don't
		 * have this flaw, so we force vPost for SMC versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if an error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->gmc.xgmi.pending_reset)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic
 * speed switching. Until we have confirmation from Intel that a specific host
 * supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
bool amdgpu_device_pcie_dynamic_switching_supported(void)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	return pcie_aspm_enabled(adev->pdev);
}

bool amdgpu_device_aspm_support_quirk(void)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE);
#else
	return true;
#endif
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
		bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines number of bits in page table versus page directory,
 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
 * page table and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if ((adev->pdev->device == 0x13FE) ||
		    (adev->pdev->device == 0x143F))
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		break;
	}

	return 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
		dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
		amdgpu_reset_method = -1;
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	return 0;
}

/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(pdev, PCI_D0);
		amdgpu_device_load_pci_state(pdev);
		r = pci_enable_device(pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
	} else {
		pr_info("switched off\n");
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(pdev);
		/* Shut down the device */
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}
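
/*
 * Usage sketch (illustrative only): IP blocks and the power management code
 * gate or ungate other blocks through the helpers above/below, e.g.
 *
 *	amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *					       AMD_CG_STATE_GATE);
 *	amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 *					       AMD_PG_STATE_UNGATE);
 *
 * The first parameter is a void *, so the amdgpu_device pointer is passed
 * directly.
 */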

/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u64 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if equal or greater
 * return 1 if smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	switch (ip_block_version->type) {
	case AMD_IP_BLOCK_TYPE_VCN:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
			return 0;
		break;
	case AMD_IP_BLOCK_TYPE_JPEG:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
			return 0;
		break;
	default:
		break;
	}

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}
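
/*
 * The virtual_display module parameter parsed by
 * amdgpu_device_enable_virtual_display() below is a semicolon-separated list
 * of PCI addresses, each optionally followed by a CRTC count (illustrative
 * values only):
 *
 *	virtual_display=0000:04:00.0,2;0000:05:00.0,1
 *	virtual_display=all,1
 *
 * "all" enables virtual display on every amdgpu device; the CRTC count is
 * clamped to the 1..6 range by the parser.
 */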

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		const char *pci_address_name = pci_name(adev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}

void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
		adev->mode_info.num_crtc = 1;
		adev->enable_virtual_display = true;
		DRM_INFO("virtual_display:%d, num_crtc:%d\n",
			 adev->enable_virtual_display, adev->mode_info.num_crtc);
	}
}

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	char fw_name[40];
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	if (adev->mman.discovery_bin) {
1953 */ 1954 if (adev->asic_type != CHIP_NAVI12) 1955 return 0; 1956 } 1957 1958 switch (adev->asic_type) { 1959 default: 1960 return 0; 1961 case CHIP_VEGA10: 1962 chip_name = "vega10"; 1963 break; 1964 case CHIP_VEGA12: 1965 chip_name = "vega12"; 1966 break; 1967 case CHIP_RAVEN: 1968 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1969 chip_name = "raven2"; 1970 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1971 chip_name = "picasso"; 1972 else 1973 chip_name = "raven"; 1974 break; 1975 case CHIP_ARCTURUS: 1976 chip_name = "arcturus"; 1977 break; 1978 case CHIP_NAVI12: 1979 chip_name = "navi12"; 1980 break; 1981 } 1982 1983 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 1984 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name); 1985 if (err) { 1986 dev_err(adev->dev, 1987 "Failed to get gpu_info firmware \"%s\"\n", 1988 fw_name); 1989 goto out; 1990 } 1991 1992 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 1993 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 1994 1995 switch (hdr->version_major) { 1996 case 1: 1997 { 1998 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 1999 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2000 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2001 2002 /* 2003 * Should be droped when DAL no longer needs it. 2004 */ 2005 if (adev->asic_type == CHIP_NAVI12) 2006 goto parse_soc_bounding_box; 2007 2008 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2009 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2010 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2011 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2012 adev->gfx.config.max_texture_channel_caches = 2013 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2014 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2015 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2016 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2017 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2018 adev->gfx.config.double_offchip_lds_buf = 2019 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2020 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2021 adev->gfx.cu_info.max_waves_per_simd = 2022 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2023 adev->gfx.cu_info.max_scratch_slots_per_cu = 2024 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2025 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2026 if (hdr->version_minor >= 1) { 2027 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2028 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2029 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2030 adev->gfx.config.num_sc_per_sh = 2031 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2032 adev->gfx.config.num_packer_per_sc = 2033 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2034 } 2035 2036 parse_soc_bounding_box: 2037 /* 2038 * soc bounding box info is not integrated in disocovery table, 2039 * we always need to parse it from gpu info firmware if needed. 
2040 */ 2041 if (hdr->version_minor == 2) { 2042 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2043 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2044 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2045 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2046 } 2047 break; 2048 } 2049 default: 2050 dev_err(adev->dev, 2051 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2052 err = -EINVAL; 2053 goto out; 2054 } 2055 out: 2056 return err; 2057 } 2058 2059 /** 2060 * amdgpu_device_ip_early_init - run early init for hardware IPs 2061 * 2062 * @adev: amdgpu_device pointer 2063 * 2064 * Early initialization pass for hardware IPs. The hardware IPs that make 2065 * up each asic are discovered each IP's early_init callback is run. This 2066 * is the first stage in initializing the asic. 2067 * Returns 0 on success, negative error code on failure. 2068 */ 2069 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2070 { 2071 struct drm_device *dev = adev_to_drm(adev); 2072 struct pci_dev *parent; 2073 int i, r; 2074 bool total; 2075 2076 amdgpu_device_enable_virtual_display(adev); 2077 2078 if (amdgpu_sriov_vf(adev)) { 2079 r = amdgpu_virt_request_full_gpu(adev, true); 2080 if (r) 2081 return r; 2082 } 2083 2084 switch (adev->asic_type) { 2085 #ifdef CONFIG_DRM_AMDGPU_SI 2086 case CHIP_VERDE: 2087 case CHIP_TAHITI: 2088 case CHIP_PITCAIRN: 2089 case CHIP_OLAND: 2090 case CHIP_HAINAN: 2091 adev->family = AMDGPU_FAMILY_SI; 2092 r = si_set_ip_blocks(adev); 2093 if (r) 2094 return r; 2095 break; 2096 #endif 2097 #ifdef CONFIG_DRM_AMDGPU_CIK 2098 case CHIP_BONAIRE: 2099 case CHIP_HAWAII: 2100 case CHIP_KAVERI: 2101 case CHIP_KABINI: 2102 case CHIP_MULLINS: 2103 if (adev->flags & AMD_IS_APU) 2104 adev->family = AMDGPU_FAMILY_KV; 2105 else 2106 adev->family = AMDGPU_FAMILY_CI; 2107 2108 r = cik_set_ip_blocks(adev); 2109 if (r) 2110 return r; 2111 break; 2112 #endif 2113 case CHIP_TOPAZ: 2114 case CHIP_TONGA: 2115 case CHIP_FIJI: 2116 case CHIP_POLARIS10: 2117 case CHIP_POLARIS11: 2118 case CHIP_POLARIS12: 2119 case CHIP_VEGAM: 2120 case CHIP_CARRIZO: 2121 case CHIP_STONEY: 2122 if (adev->flags & AMD_IS_APU) 2123 adev->family = AMDGPU_FAMILY_CZ; 2124 else 2125 adev->family = AMDGPU_FAMILY_VI; 2126 2127 r = vi_set_ip_blocks(adev); 2128 if (r) 2129 return r; 2130 break; 2131 default: 2132 r = amdgpu_discovery_set_ip_blocks(adev); 2133 if (r) 2134 return r; 2135 break; 2136 } 2137 2138 if (amdgpu_has_atpx() && 2139 (amdgpu_is_atpx_hybrid() || 2140 amdgpu_has_atpx_dgpu_power_cntl()) && 2141 ((adev->flags & AMD_IS_APU) == 0) && 2142 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev))) 2143 adev->flags |= AMD_IS_PX; 2144 2145 if (!(adev->flags & AMD_IS_APU)) { 2146 parent = pci_upstream_bridge(adev->pdev); 2147 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2148 } 2149 2150 2151 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2152 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2153 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2154 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2155 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2156 2157 total = true; 2158 for (i = 0; i < adev->num_ip_blocks; i++) { 2159 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2160 DRM_WARN("disabled ip block: %d <%s>\n", 2161 i, adev->ip_blocks[i].version->funcs->name); 2162 adev->ip_blocks[i].status.valid = false; 2163 } else { 2164 if (adev->ip_blocks[i].version->funcs->early_init) { 2165 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2166 if (r == -ENOENT) { 2167 adev->ip_blocks[i].status.valid = false; 2168 } else if (r) { 2169 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2170 adev->ip_blocks[i].version->funcs->name, r); 2171 total = false; 2172 } else { 2173 adev->ip_blocks[i].status.valid = true; 2174 } 2175 } else { 2176 adev->ip_blocks[i].status.valid = true; 2177 } 2178 } 2179 /* get the vbios after the asic_funcs are set up */ 2180 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2181 r = amdgpu_device_parse_gpu_info_fw(adev); 2182 if (r) 2183 return r; 2184 2185 /* Read BIOS */ 2186 if (amdgpu_device_read_bios(adev)) { 2187 if (!amdgpu_get_bios(adev)) 2188 return -EINVAL; 2189 2190 r = amdgpu_atombios_init(adev); 2191 if (r) { 2192 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2193 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2194 return r; 2195 } 2196 } 2197 2198 /*get pf2vf msg info at it's earliest time*/ 2199 if (amdgpu_sriov_vf(adev)) 2200 amdgpu_virt_init_data_exchange(adev); 2201 2202 } 2203 } 2204 if (!total) 2205 return -ENODEV; 2206 2207 amdgpu_amdkfd_device_probe(adev); 2208 adev->cg_flags &= amdgpu_cg_mask; 2209 adev->pg_flags &= amdgpu_pg_mask; 2210 2211 return 0; 2212 } 2213 2214 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2215 { 2216 int i, r; 2217 2218 for (i = 0; i < adev->num_ip_blocks; i++) { 2219 if (!adev->ip_blocks[i].status.sw) 2220 continue; 2221 if (adev->ip_blocks[i].status.hw) 2222 continue; 2223 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2224 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2225 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2226 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2227 if (r) { 2228 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2229 adev->ip_blocks[i].version->funcs->name, r); 2230 return r; 2231 } 2232 adev->ip_blocks[i].status.hw = true; 2233 } 2234 } 2235 2236 return 0; 2237 } 2238 2239 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2240 { 2241 int i, r; 2242 2243 for (i = 0; i < adev->num_ip_blocks; i++) { 2244 if (!adev->ip_blocks[i].status.sw) 2245 continue; 2246 if (adev->ip_blocks[i].status.hw) 2247 continue; 2248 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2249 if (r) { 2250 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2251 adev->ip_blocks[i].version->funcs->name, r); 2252 return r; 2253 } 2254 adev->ip_blocks[i].status.hw = true; 2255 } 2256 2257 return 0; 2258 } 2259 2260 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2261 { 2262 int r = 0; 2263 int i; 2264 uint32_t smu_version; 2265 2266 if (adev->asic_type >= CHIP_VEGA10) { 2267 for (i = 0; i < adev->num_ip_blocks; i++) { 2268 if 
(adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2269 continue; 2270 2271 if (!adev->ip_blocks[i].status.sw) 2272 continue; 2273 2274 /* no need to do the fw loading again if already done*/ 2275 if (adev->ip_blocks[i].status.hw == true) 2276 break; 2277 2278 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2279 r = adev->ip_blocks[i].version->funcs->resume(adev); 2280 if (r) { 2281 DRM_ERROR("resume of IP block <%s> failed %d\n", 2282 adev->ip_blocks[i].version->funcs->name, r); 2283 return r; 2284 } 2285 } else { 2286 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2287 if (r) { 2288 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2289 adev->ip_blocks[i].version->funcs->name, r); 2290 return r; 2291 } 2292 } 2293 2294 adev->ip_blocks[i].status.hw = true; 2295 break; 2296 } 2297 } 2298 2299 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2300 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2301 2302 return r; 2303 } 2304 2305 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2306 { 2307 long timeout; 2308 int r, i; 2309 2310 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2311 struct amdgpu_ring *ring = adev->rings[i]; 2312 2313 /* No need to setup the GPU scheduler for rings that don't need it */ 2314 if (!ring || ring->no_scheduler) 2315 continue; 2316 2317 switch (ring->funcs->type) { 2318 case AMDGPU_RING_TYPE_GFX: 2319 timeout = adev->gfx_timeout; 2320 break; 2321 case AMDGPU_RING_TYPE_COMPUTE: 2322 timeout = adev->compute_timeout; 2323 break; 2324 case AMDGPU_RING_TYPE_SDMA: 2325 timeout = adev->sdma_timeout; 2326 break; 2327 default: 2328 timeout = adev->video_timeout; 2329 break; 2330 } 2331 2332 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2333 ring->num_hw_submission, 0, 2334 timeout, adev->reset_domain->wq, 2335 ring->sched_score, ring->name, 2336 adev->dev); 2337 if (r) { 2338 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2339 ring->name); 2340 return r; 2341 } 2342 } 2343 2344 amdgpu_xcp_update_partition_sched_list(adev); 2345 2346 return 0; 2347 } 2348 2349 2350 /** 2351 * amdgpu_device_ip_init - run init for hardware IPs 2352 * 2353 * @adev: amdgpu_device pointer 2354 * 2355 * Main initialization pass for hardware IPs. The list of all the hardware 2356 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2357 * are run. sw_init initializes the software state associated with each IP 2358 * and hw_init initializes the hardware associated with each IP. 2359 * Returns 0 on success, negative error code on failure. 
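 *
 * As a rough sketch of the ordering implemented below: sw_init runs for every
 * valid block (COMMON and GMC additionally do their hw_init early so that GPU
 * memory is usable), the IB pool and ucode BO are then created, followed by
 * hw_init phase 1 (COMMON/IH, plus PSP under SR-IOV), firmware loading, and
 * hw_init phase 2 for the remaining blocks.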
2360 */ 2361 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2362 { 2363 int i, r; 2364 2365 r = amdgpu_ras_init(adev); 2366 if (r) 2367 return r; 2368 2369 for (i = 0; i < adev->num_ip_blocks; i++) { 2370 if (!adev->ip_blocks[i].status.valid) 2371 continue; 2372 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2373 if (r) { 2374 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2375 adev->ip_blocks[i].version->funcs->name, r); 2376 goto init_failed; 2377 } 2378 adev->ip_blocks[i].status.sw = true; 2379 2380 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2381 /* need to do common hw init early so everything is set up for gmc */ 2382 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2383 if (r) { 2384 DRM_ERROR("hw_init %d failed %d\n", i, r); 2385 goto init_failed; 2386 } 2387 adev->ip_blocks[i].status.hw = true; 2388 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2389 /* need to do gmc hw init early so we can allocate gpu mem */ 2390 /* Try to reserve bad pages early */ 2391 if (amdgpu_sriov_vf(adev)) 2392 amdgpu_virt_exchange_data(adev); 2393 2394 r = amdgpu_device_mem_scratch_init(adev); 2395 if (r) { 2396 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2397 goto init_failed; 2398 } 2399 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2400 if (r) { 2401 DRM_ERROR("hw_init %d failed %d\n", i, r); 2402 goto init_failed; 2403 } 2404 r = amdgpu_device_wb_init(adev); 2405 if (r) { 2406 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2407 goto init_failed; 2408 } 2409 adev->ip_blocks[i].status.hw = true; 2410 2411 /* right after GMC hw init, we create CSA */ 2412 if (adev->gfx.mcbp) { 2413 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2414 AMDGPU_GEM_DOMAIN_VRAM | 2415 AMDGPU_GEM_DOMAIN_GTT, 2416 AMDGPU_CSA_SIZE); 2417 if (r) { 2418 DRM_ERROR("allocate CSA failed %d\n", r); 2419 goto init_failed; 2420 } 2421 } 2422 } 2423 } 2424 2425 if (amdgpu_sriov_vf(adev)) 2426 amdgpu_virt_init_data_exchange(adev); 2427 2428 r = amdgpu_ib_pool_init(adev); 2429 if (r) { 2430 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2431 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2432 goto init_failed; 2433 } 2434 2435 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2436 if (r) 2437 goto init_failed; 2438 2439 r = amdgpu_device_ip_hw_init_phase1(adev); 2440 if (r) 2441 goto init_failed; 2442 2443 r = amdgpu_device_fw_loading(adev); 2444 if (r) 2445 goto init_failed; 2446 2447 r = amdgpu_device_ip_hw_init_phase2(adev); 2448 if (r) 2449 goto init_failed; 2450 2451 /* 2452 * retired pages will be loaded from eeprom and reserved here, 2453 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2454 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2455 * for I2C communication which only true at this point. 2456 * 2457 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2458 * failure from bad gpu situation and stop amdgpu init process 2459 * accordingly. For other failed cases, it will still release all 2460 * the resource and print error message, rather than returning one 2461 * negative value to upper level. 
2462 * 2463 * Note: theoretically, this should be called before all vram allocations 2464 * to keep retired pages from being allocated and used again 2465 */ 2466 r = amdgpu_ras_recovery_init(adev); 2467 if (r) 2468 goto init_failed; 2469 2470 /* 2471 * In the XGMI case, grab an extra reference on the hive's reset domain for this device 2472 */ 2473 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2474 if (amdgpu_xgmi_add_device(adev) == 0) { 2475 if (!amdgpu_sriov_vf(adev)) { 2476 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2477 2478 if (WARN_ON(!hive)) { 2479 r = -ENOENT; 2480 goto init_failed; 2481 } 2482 2483 if (!hive->reset_domain || 2484 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2485 r = -ENOENT; 2486 amdgpu_put_xgmi_hive(hive); 2487 goto init_failed; 2488 } 2489 2490 /* Drop the early temporary reset domain we created for device */ 2491 amdgpu_reset_put_reset_domain(adev->reset_domain); 2492 adev->reset_domain = hive->reset_domain; 2493 amdgpu_put_xgmi_hive(hive); 2494 } 2495 } 2496 } 2497 2498 r = amdgpu_device_init_schedulers(adev); 2499 if (r) 2500 goto init_failed; 2501 2502 /* Don't init kfd if the whole hive needs to be reset during init */ 2503 if (!adev->gmc.xgmi.pending_reset) { 2504 kgd2kfd_init_zone_device(adev); 2505 amdgpu_amdkfd_device_init(adev); 2506 } 2507 2508 amdgpu_fru_get_product_info(adev); 2509 2510 init_failed: 2511 2512 return r; 2513 } 2514 2515 /** 2516 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2517 * 2518 * @adev: amdgpu_device pointer 2519 * 2520 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2521 * this function before a GPU reset. If the value is retained after a 2522 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 2523 */ 2524 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2525 { 2526 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2527 } 2528 2529 /** 2530 * amdgpu_device_check_vram_lost - check if vram is valid 2531 * 2532 * @adev: amdgpu_device pointer 2533 * 2534 * Checks the reset magic value written to the gart pointer in VRAM. 2535 * The driver calls this after a GPU reset to see if the contents of 2536 * VRAM is lost or not. 2537 * Returns true if vram is lost, false if not. 2538 */ 2539 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2540 { 2541 if (memcmp(adev->gart.ptr, adev->reset_magic, 2542 AMDGPU_RESET_MAGIC_NUM)) 2543 return true; 2544 2545 if (!amdgpu_in_reset(adev)) 2546 return false; 2547 2548 /* 2549 * For all ASICs with baco/mode1 reset, the VRAM is 2550 * always assumed to be lost. 2551 */ 2552 switch (amdgpu_asic_reset_method(adev)) { 2553 case AMD_RESET_METHOD_BACO: 2554 case AMD_RESET_METHOD_MODE1: 2555 return true; 2556 default: 2557 return false; 2558 } 2559 } 2560 2561 /** 2562 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2563 * 2564 * @adev: amdgpu_device pointer 2565 * @state: clockgating state (gate or ungate) 2566 * 2567 * The list of all the hardware IPs that make up the asic is walked and the 2568 * set_clockgating_state callbacks are run. 2569 * During the late init pass this enables clockgating for the hardware IPs; 2570 * during the fini or suspend pass it disables clockgating. 2571 * Returns 0 on success, negative error code on failure.
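 *
 * Note, based on the index computation in the loop below: blocks are visited
 * in registration order when gating and in reverse order when ungating.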
2572 */ 2573 2574 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2575 enum amd_clockgating_state state) 2576 { 2577 int i, j, r; 2578 2579 if (amdgpu_emu_mode == 1) 2580 return 0; 2581 2582 for (j = 0; j < adev->num_ip_blocks; j++) { 2583 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2584 if (!adev->ip_blocks[i].status.late_initialized) 2585 continue; 2586 /* skip CG for GFX, SDMA on S0ix */ 2587 if (adev->in_s0ix && 2588 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2589 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2590 continue; 2591 /* skip CG for VCE/UVD, it's handled specially */ 2592 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2593 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2594 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2595 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2596 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2597 /* enable clockgating to save power */ 2598 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2599 state); 2600 if (r) { 2601 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2602 adev->ip_blocks[i].version->funcs->name, r); 2603 return r; 2604 } 2605 } 2606 } 2607 2608 return 0; 2609 } 2610 2611 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2612 enum amd_powergating_state state) 2613 { 2614 int i, j, r; 2615 2616 if (amdgpu_emu_mode == 1) 2617 return 0; 2618 2619 for (j = 0; j < adev->num_ip_blocks; j++) { 2620 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2621 if (!adev->ip_blocks[i].status.late_initialized) 2622 continue; 2623 /* skip PG for GFX, SDMA on S0ix */ 2624 if (adev->in_s0ix && 2625 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2626 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2627 continue; 2628 /* skip CG for VCE/UVD, it's handled specially */ 2629 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2630 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2631 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2632 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2633 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2634 /* enable powergating to save power */ 2635 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2636 state); 2637 if (r) { 2638 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2639 adev->ip_blocks[i].version->funcs->name, r); 2640 return r; 2641 } 2642 } 2643 } 2644 return 0; 2645 } 2646 2647 static int amdgpu_device_enable_mgpu_fan_boost(void) 2648 { 2649 struct amdgpu_gpu_instance *gpu_ins; 2650 struct amdgpu_device *adev; 2651 int i, ret = 0; 2652 2653 mutex_lock(&mgpu_info.mutex); 2654 2655 /* 2656 * MGPU fan boost feature should be enabled 2657 * only when there are two or more dGPUs in 2658 * the system 2659 */ 2660 if (mgpu_info.num_dgpu < 2) 2661 goto out; 2662 2663 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2664 gpu_ins = &(mgpu_info.gpu_ins[i]); 2665 adev = gpu_ins->adev; 2666 if (!(adev->flags & AMD_IS_APU) && 2667 !gpu_ins->mgpu_fan_enabled) { 2668 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2669 if (ret) 2670 break; 2671 2672 gpu_ins->mgpu_fan_enabled = 1; 2673 } 2674 } 2675 2676 out: 2677 mutex_unlock(&mgpu_info.mutex); 2678 2679 return ret; 2680 } 2681 2682 /** 2683 * amdgpu_device_ip_late_init - run late init for hardware IPs 2684 * 2685 * @adev: 
amdgpu_device pointer 2686 * 2687 * Late initialization pass for hardware IPs. The list of all the hardware 2688 * IPs that make up the asic is walked and the late_init callbacks are run. 2689 * late_init covers any special initialization that an IP requires 2690 * after all of the IPs have been initialized or something that needs to happen 2691 * late in the init process. 2692 * Returns 0 on success, negative error code on failure. 2693 */ 2694 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2695 { 2696 struct amdgpu_gpu_instance *gpu_instance; 2697 int i = 0, r; 2698 2699 for (i = 0; i < adev->num_ip_blocks; i++) { 2700 if (!adev->ip_blocks[i].status.hw) 2701 continue; 2702 if (adev->ip_blocks[i].version->funcs->late_init) { 2703 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2704 if (r) { 2705 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2706 adev->ip_blocks[i].version->funcs->name, r); 2707 return r; 2708 } 2709 } 2710 adev->ip_blocks[i].status.late_initialized = true; 2711 } 2712 2713 r = amdgpu_ras_late_init(adev); 2714 if (r) { 2715 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 2716 return r; 2717 } 2718 2719 amdgpu_ras_set_error_query_ready(adev, true); 2720 2721 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2722 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2723 2724 amdgpu_device_fill_reset_magic(adev); 2725 2726 r = amdgpu_device_enable_mgpu_fan_boost(); 2727 if (r) 2728 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2729 2730 /* For passthrough configurations on arcturus and aldebaran, enable special handling for SBR */ 2731 if (amdgpu_passthrough(adev) && 2732 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 2733 adev->asic_type == CHIP_ALDEBARAN)) 2734 amdgpu_dpm_handle_passthrough_sbr(adev, true); 2735 2736 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2737 mutex_lock(&mgpu_info.mutex); 2738 2739 /* 2740 * Reset device p-state to low as this was booted with high. 2741 * 2742 * This should be performed only after all devices from the same 2743 * hive get initialized. 2744 * 2745 * However, the number of devices in a hive is not known in advance; 2746 * it is counted one by one as the devices initialize. 2747 * 2748 * So we wait until all XGMI interlinked devices have initialized. 2749 * This may bring some delays as those devices may come from 2750 * different hives. But that should be OK.
2751 */ 2752 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2753 for (i = 0; i < mgpu_info.num_gpu; i++) { 2754 gpu_instance = &(mgpu_info.gpu_ins[i]); 2755 if (gpu_instance->adev->flags & AMD_IS_APU) 2756 continue; 2757 2758 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2759 AMDGPU_XGMI_PSTATE_MIN); 2760 if (r) { 2761 DRM_ERROR("pstate setting failed (%d).\n", r); 2762 break; 2763 } 2764 } 2765 } 2766 2767 mutex_unlock(&mgpu_info.mutex); 2768 } 2769 2770 return 0; 2771 } 2772 2773 /** 2774 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2775 * 2776 * @adev: amdgpu_device pointer 2777 * 2778 * For ASICs need to disable SMC first 2779 */ 2780 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2781 { 2782 int i, r; 2783 2784 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) 2785 return; 2786 2787 for (i = 0; i < adev->num_ip_blocks; i++) { 2788 if (!adev->ip_blocks[i].status.hw) 2789 continue; 2790 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2791 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2792 /* XXX handle errors */ 2793 if (r) { 2794 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2795 adev->ip_blocks[i].version->funcs->name, r); 2796 } 2797 adev->ip_blocks[i].status.hw = false; 2798 break; 2799 } 2800 } 2801 } 2802 2803 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2804 { 2805 int i, r; 2806 2807 for (i = 0; i < adev->num_ip_blocks; i++) { 2808 if (!adev->ip_blocks[i].version->funcs->early_fini) 2809 continue; 2810 2811 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2812 if (r) { 2813 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2814 adev->ip_blocks[i].version->funcs->name, r); 2815 } 2816 } 2817 2818 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2819 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2820 2821 amdgpu_amdkfd_suspend(adev, false); 2822 2823 /* Workaroud for ASICs need to disable SMC first */ 2824 amdgpu_device_smu_fini_early(adev); 2825 2826 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2827 if (!adev->ip_blocks[i].status.hw) 2828 continue; 2829 2830 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2831 /* XXX handle errors */ 2832 if (r) { 2833 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2834 adev->ip_blocks[i].version->funcs->name, r); 2835 } 2836 2837 adev->ip_blocks[i].status.hw = false; 2838 } 2839 2840 if (amdgpu_sriov_vf(adev)) { 2841 if (amdgpu_virt_release_full_gpu(adev, false)) 2842 DRM_ERROR("failed to release exclusive mode on fini\n"); 2843 } 2844 2845 return 0; 2846 } 2847 2848 /** 2849 * amdgpu_device_ip_fini - run fini for hardware IPs 2850 * 2851 * @adev: amdgpu_device pointer 2852 * 2853 * Main teardown pass for hardware IPs. The list of all the hardware 2854 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2855 * are run. hw_fini tears down the hardware associated with each IP 2856 * and sw_fini tears down any software state associated with each IP. 2857 * Returns 0 on success, negative error code on failure. 
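 *
 * For orientation: the loops below walk the IP list in reverse order, and when
 * the GMC block is reached the ucode BO, static CSA, writeback memory, scratch
 * memory and IB pool created during init are released first.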
2858 */ 2859 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2860 { 2861 int i, r; 2862 2863 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2864 amdgpu_virt_release_ras_err_handler_data(adev); 2865 2866 if (adev->gmc.xgmi.num_physical_nodes > 1) 2867 amdgpu_xgmi_remove_device(adev); 2868 2869 amdgpu_amdkfd_device_fini_sw(adev); 2870 2871 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2872 if (!adev->ip_blocks[i].status.sw) 2873 continue; 2874 2875 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2876 amdgpu_ucode_free_bo(adev); 2877 amdgpu_free_static_csa(&adev->virt.csa_obj); 2878 amdgpu_device_wb_fini(adev); 2879 amdgpu_device_mem_scratch_fini(adev); 2880 amdgpu_ib_pool_fini(adev); 2881 } 2882 2883 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2884 /* XXX handle errors */ 2885 if (r) { 2886 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2887 adev->ip_blocks[i].version->funcs->name, r); 2888 } 2889 adev->ip_blocks[i].status.sw = false; 2890 adev->ip_blocks[i].status.valid = false; 2891 } 2892 2893 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2894 if (!adev->ip_blocks[i].status.late_initialized) 2895 continue; 2896 if (adev->ip_blocks[i].version->funcs->late_fini) 2897 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2898 adev->ip_blocks[i].status.late_initialized = false; 2899 } 2900 2901 amdgpu_ras_fini(adev); 2902 2903 return 0; 2904 } 2905 2906 /** 2907 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2908 * 2909 * @work: work_struct. 2910 */ 2911 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2912 { 2913 struct amdgpu_device *adev = 2914 container_of(work, struct amdgpu_device, delayed_init_work.work); 2915 int r; 2916 2917 r = amdgpu_ib_ring_tests(adev); 2918 if (r) 2919 DRM_ERROR("ib ring test failed (%d).\n", r); 2920 } 2921 2922 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2923 { 2924 struct amdgpu_device *adev = 2925 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2926 2927 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2928 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2929 2930 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2931 adev->gfx.gfx_off_state = true; 2932 } 2933 2934 /** 2935 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2936 * 2937 * @adev: amdgpu_device pointer 2938 * 2939 * Main suspend function for hardware IPs. The list of all the hardware 2940 * IPs that make up the asic is walked, clockgating is disabled and the 2941 * suspend callbacks are run. suspend puts the hardware and software state 2942 * in each IP into a state suitable for suspend. 2943 * Returns 0 on success, negative error code on failure. 2944 */ 2945 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2946 { 2947 int i, r; 2948 2949 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2950 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2951 2952 /* 2953 * Per PMFW team's suggestion, driver needs to handle gfxoff 2954 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 2955 * scenario. Add the missing df cstate disablement here. 
2956 */ 2957 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 2958 dev_warn(adev->dev, "Failed to disallow df cstate"); 2959 2960 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2961 if (!adev->ip_blocks[i].status.valid) 2962 continue; 2963 2964 /* displays are handled separately */ 2965 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2966 continue; 2967 2968 /* XXX handle errors */ 2969 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2970 /* XXX handle errors */ 2971 if (r) { 2972 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2973 adev->ip_blocks[i].version->funcs->name, r); 2974 return r; 2975 } 2976 2977 adev->ip_blocks[i].status.hw = false; 2978 } 2979 2980 return 0; 2981 } 2982 2983 /** 2984 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2985 * 2986 * @adev: amdgpu_device pointer 2987 * 2988 * Main suspend function for hardware IPs. The list of all the hardware 2989 * IPs that make up the asic is walked, clockgating is disabled and the 2990 * suspend callbacks are run. suspend puts the hardware and software state 2991 * in each IP into a state suitable for suspend. 2992 * Returns 0 on success, negative error code on failure. 2993 */ 2994 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2995 { 2996 int i, r; 2997 2998 if (adev->in_s0ix) 2999 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3000 3001 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3002 if (!adev->ip_blocks[i].status.valid) 3003 continue; 3004 /* displays are handled in phase1 */ 3005 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3006 continue; 3007 /* PSP lost connection when err_event_athub occurs */ 3008 if (amdgpu_ras_intr_triggered() && 3009 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3010 adev->ip_blocks[i].status.hw = false; 3011 continue; 3012 } 3013 3014 /* skip unnecessary suspend if we do not initialize them yet */ 3015 if (adev->gmc.xgmi.pending_reset && 3016 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3017 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3018 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3019 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3020 adev->ip_blocks[i].status.hw = false; 3021 continue; 3022 } 3023 3024 /* skip suspend of gfx/mes and psp for S0ix 3025 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3026 * like at runtime. PSP is also part of the always on hardware 3027 * so no need to suspend it. 3028 */ 3029 if (adev->in_s0ix && 3030 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3031 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3032 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3033 continue; 3034 3035 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3036 if (adev->in_s0ix && 3037 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) && 3038 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3039 continue; 3040 3041 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3042 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3043 * from this location and RLC Autoload automatically also gets loaded 3044 * from here based on PMFW -> PSP message during re-init sequence. 3045 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3046 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3047 */ 3048 if (amdgpu_in_reset(adev) && 3049 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3050 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3051 continue; 3052 3053 /* XXX handle errors */ 3054 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3055 /* XXX handle errors */ 3056 if (r) { 3057 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3058 adev->ip_blocks[i].version->funcs->name, r); 3059 } 3060 adev->ip_blocks[i].status.hw = false; 3061 /* handle putting the SMC in the appropriate state */ 3062 if (!amdgpu_sriov_vf(adev)) { 3063 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3064 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3065 if (r) { 3066 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3067 adev->mp1_state, r); 3068 return r; 3069 } 3070 } 3071 } 3072 } 3073 3074 return 0; 3075 } 3076 3077 /** 3078 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3079 * 3080 * @adev: amdgpu_device pointer 3081 * 3082 * Main suspend function for hardware IPs. The list of all the hardware 3083 * IPs that make up the asic is walked, clockgating is disabled and the 3084 * suspend callbacks are run. suspend puts the hardware and software state 3085 * in each IP into a state suitable for suspend. 3086 * Returns 0 on success, negative error code on failure. 3087 */ 3088 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3089 { 3090 int r; 3091 3092 if (amdgpu_sriov_vf(adev)) { 3093 amdgpu_virt_fini_data_exchange(adev); 3094 amdgpu_virt_request_full_gpu(adev, false); 3095 } 3096 3097 r = amdgpu_device_ip_suspend_phase1(adev); 3098 if (r) 3099 return r; 3100 r = amdgpu_device_ip_suspend_phase2(adev); 3101 3102 if (amdgpu_sriov_vf(adev)) 3103 amdgpu_virt_release_full_gpu(adev, false); 3104 3105 return r; 3106 } 3107 3108 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3109 { 3110 int i, r; 3111 3112 static enum amd_ip_block_type ip_order[] = { 3113 AMD_IP_BLOCK_TYPE_COMMON, 3114 AMD_IP_BLOCK_TYPE_GMC, 3115 AMD_IP_BLOCK_TYPE_PSP, 3116 AMD_IP_BLOCK_TYPE_IH, 3117 }; 3118 3119 for (i = 0; i < adev->num_ip_blocks; i++) { 3120 int j; 3121 struct amdgpu_ip_block *block; 3122 3123 block = &adev->ip_blocks[i]; 3124 block->status.hw = false; 3125 3126 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3127 3128 if (block->version->type != ip_order[j] || 3129 !block->status.valid) 3130 continue; 3131 3132 r = block->version->funcs->hw_init(adev); 3133 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3134 if (r) 3135 return r; 3136 block->status.hw = true; 3137 } 3138 } 3139 3140 return 0; 3141 } 3142 3143 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3144 { 3145 int i, r; 3146 3147 static enum amd_ip_block_type ip_order[] = { 3148 AMD_IP_BLOCK_TYPE_SMC, 3149 AMD_IP_BLOCK_TYPE_DCE, 3150 AMD_IP_BLOCK_TYPE_GFX, 3151 AMD_IP_BLOCK_TYPE_SDMA, 3152 AMD_IP_BLOCK_TYPE_MES, 3153 AMD_IP_BLOCK_TYPE_UVD, 3154 AMD_IP_BLOCK_TYPE_VCE, 3155 AMD_IP_BLOCK_TYPE_VCN, 3156 AMD_IP_BLOCK_TYPE_JPEG 3157 }; 3158 3159 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3160 int j; 3161 struct amdgpu_ip_block *block; 3162 3163 for (j = 0; j < adev->num_ip_blocks; j++) { 3164 block = &adev->ip_blocks[j]; 3165 3166 if (block->version->type != ip_order[i] || 3167 !block->status.valid || 3168 block->status.hw) 3169 continue; 3170 3171 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3172 r = block->version->funcs->resume(adev); 3173 else 3174 r = block->version->funcs->hw_init(adev); 3175 3176 
DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3177 if (r) 3178 return r; 3179 block->status.hw = true; 3180 } 3181 } 3182 3183 return 0; 3184 } 3185 3186 /** 3187 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3188 * 3189 * @adev: amdgpu_device pointer 3190 * 3191 * First resume function for hardware IPs. The list of all the hardware 3192 * IPs that make up the asic is walked and the resume callbacks are run for 3193 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3194 * after a suspend and updates the software state as necessary. This 3195 * function is also used for restoring the GPU after a GPU reset. 3196 * Returns 0 on success, negative error code on failure. 3197 */ 3198 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3199 { 3200 int i, r; 3201 3202 for (i = 0; i < adev->num_ip_blocks; i++) { 3203 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3204 continue; 3205 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3206 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3207 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3208 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3209 3210 r = adev->ip_blocks[i].version->funcs->resume(adev); 3211 if (r) { 3212 DRM_ERROR("resume of IP block <%s> failed %d\n", 3213 adev->ip_blocks[i].version->funcs->name, r); 3214 return r; 3215 } 3216 adev->ip_blocks[i].status.hw = true; 3217 } 3218 } 3219 3220 return 0; 3221 } 3222 3223 /** 3224 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3225 * 3226 * @adev: amdgpu_device pointer 3227 * 3228 * First resume function for hardware IPs. The list of all the hardware 3229 * IPs that make up the asic is walked and the resume callbacks are run for 3230 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3231 * functional state after a suspend and updates the software state as 3232 * necessary. This function is also used for restoring the GPU after a GPU 3233 * reset. 3234 * Returns 0 on success, negative error code on failure. 3235 */ 3236 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3237 { 3238 int i, r; 3239 3240 for (i = 0; i < adev->num_ip_blocks; i++) { 3241 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3242 continue; 3243 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3244 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3245 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3246 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3247 continue; 3248 r = adev->ip_blocks[i].version->funcs->resume(adev); 3249 if (r) { 3250 DRM_ERROR("resume of IP block <%s> failed %d\n", 3251 adev->ip_blocks[i].version->funcs->name, r); 3252 return r; 3253 } 3254 adev->ip_blocks[i].status.hw = true; 3255 } 3256 3257 return 0; 3258 } 3259 3260 /** 3261 * amdgpu_device_ip_resume - run resume for hardware IPs 3262 * 3263 * @adev: amdgpu_device pointer 3264 * 3265 * Main resume function for hardware IPs. The hardware IPs 3266 * are split into two resume functions because they are 3267 * are also used in in recovering from a GPU reset and some additional 3268 * steps need to be take between them. In this case (S3/S4) they are 3269 * run sequentially. 3270 * Returns 0 on success, negative error code on failure. 
3271 */ 3272 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3273 { 3274 int r; 3275 3276 if (!adev->in_s0ix) { 3277 r = amdgpu_amdkfd_resume_iommu(adev); 3278 if (r) 3279 return r; 3280 } 3281 3282 r = amdgpu_device_ip_resume_phase1(adev); 3283 if (r) 3284 return r; 3285 3286 r = amdgpu_device_fw_loading(adev); 3287 if (r) 3288 return r; 3289 3290 r = amdgpu_device_ip_resume_phase2(adev); 3291 3292 return r; 3293 } 3294 3295 /** 3296 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3297 * 3298 * @adev: amdgpu_device pointer 3299 * 3300 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3301 */ 3302 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3303 { 3304 if (amdgpu_sriov_vf(adev)) { 3305 if (adev->is_atom_fw) { 3306 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3307 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3308 } else { 3309 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3310 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3311 } 3312 3313 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3314 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3315 } 3316 } 3317 3318 /** 3319 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3320 * 3321 * @asic_type: AMD asic type 3322 * 3323 * Check if there is DC (new modesetting infrastructre) support for an asic. 3324 * returns true if DC has support, false if not. 3325 */ 3326 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3327 { 3328 switch (asic_type) { 3329 #ifdef CONFIG_DRM_AMDGPU_SI 3330 case CHIP_HAINAN: 3331 #endif 3332 case CHIP_TOPAZ: 3333 /* chips with no display hardware */ 3334 return false; 3335 #if defined(CONFIG_DRM_AMD_DC) 3336 case CHIP_TAHITI: 3337 case CHIP_PITCAIRN: 3338 case CHIP_VERDE: 3339 case CHIP_OLAND: 3340 /* 3341 * We have systems in the wild with these ASICs that require 3342 * LVDS and VGA support which is not supported with DC. 3343 * 3344 * Fallback to the non-DC driver here by default so as not to 3345 * cause regressions. 3346 */ 3347 #if defined(CONFIG_DRM_AMD_DC_SI) 3348 return amdgpu_dc > 0; 3349 #else 3350 return false; 3351 #endif 3352 case CHIP_BONAIRE: 3353 case CHIP_KAVERI: 3354 case CHIP_KABINI: 3355 case CHIP_MULLINS: 3356 /* 3357 * We have systems in the wild with these ASICs that require 3358 * VGA support which is not supported with DC. 3359 * 3360 * Fallback to the non-DC driver here by default so as not to 3361 * cause regressions. 
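 * Users who do not rely on VGA can still opt in explicitly, for example by
 * booting with amdgpu.dc=1, which is what the amdgpu_dc > 0 check below
 * honors.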
3362 */ 3363 return amdgpu_dc > 0; 3364 default: 3365 return amdgpu_dc != 0; 3366 #else 3367 default: 3368 if (amdgpu_dc > 0) 3369 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3370 "but isn't supported by ASIC, ignoring\n"); 3371 return false; 3372 #endif 3373 } 3374 } 3375 3376 /** 3377 * amdgpu_device_has_dc_support - check if dc is supported 3378 * 3379 * @adev: amdgpu_device pointer 3380 * 3381 * Returns true for supported, false for not supported 3382 */ 3383 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3384 { 3385 if (adev->enable_virtual_display || 3386 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3387 return false; 3388 3389 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3390 } 3391 3392 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3393 { 3394 struct amdgpu_device *adev = 3395 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3396 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3397 3398 /* It's a bug to not have a hive within this function */ 3399 if (WARN_ON(!hive)) 3400 return; 3401 3402 /* 3403 * Use task barrier to synchronize all xgmi reset works across the 3404 * hive. task_barrier_enter and task_barrier_exit will block 3405 * until all the threads running the xgmi reset works reach 3406 * those points. task_barrier_full will do both blocks. 3407 */ 3408 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3409 3410 task_barrier_enter(&hive->tb); 3411 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3412 3413 if (adev->asic_reset_res) 3414 goto fail; 3415 3416 task_barrier_exit(&hive->tb); 3417 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3418 3419 if (adev->asic_reset_res) 3420 goto fail; 3421 3422 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3423 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3424 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3425 } else { 3426 3427 task_barrier_full(&hive->tb); 3428 adev->asic_reset_res = amdgpu_asic_reset(adev); 3429 } 3430 3431 fail: 3432 if (adev->asic_reset_res) 3433 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3434 adev->asic_reset_res, adev_to_drm(adev)->unique); 3435 amdgpu_put_xgmi_hive(hive); 3436 } 3437 3438 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3439 { 3440 char *input = amdgpu_lockup_timeout; 3441 char *timeout_setting = NULL; 3442 int index = 0; 3443 long timeout; 3444 int ret = 0; 3445 3446 /* 3447 * By default timeout for non compute jobs is 10000 3448 * and 60000 for compute jobs. 3449 * In SR-IOV or passthrough mode, timeout for compute 3450 * jobs are 60000 by default. 3451 */ 3452 adev->gfx_timeout = msecs_to_jiffies(10000); 3453 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3454 if (amdgpu_sriov_vf(adev)) 3455 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3456 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3457 else 3458 adev->compute_timeout = msecs_to_jiffies(60000); 3459 3460 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3461 while ((timeout_setting = strsep(&input, ",")) && 3462 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3463 ret = kstrtol(timeout_setting, 0, &timeout); 3464 if (ret) 3465 return ret; 3466 3467 if (timeout == 0) { 3468 index++; 3469 continue; 3470 } else if (timeout < 0) { 3471 timeout = MAX_SCHEDULE_TIMEOUT; 3472 dev_warn(adev->dev, "lockup timeout disabled"); 3473 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3474 } else { 3475 timeout = msecs_to_jiffies(timeout); 3476 } 3477 3478 switch (index++) { 3479 case 0: 3480 adev->gfx_timeout = timeout; 3481 break; 3482 case 1: 3483 adev->compute_timeout = timeout; 3484 break; 3485 case 2: 3486 adev->sdma_timeout = timeout; 3487 break; 3488 case 3: 3489 adev->video_timeout = timeout; 3490 break; 3491 default: 3492 break; 3493 } 3494 } 3495 /* 3496 * There is only one value specified and 3497 * it should apply to all non-compute jobs. 3498 */ 3499 if (index == 1) { 3500 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3501 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3502 adev->compute_timeout = adev->gfx_timeout; 3503 } 3504 } 3505 3506 return ret; 3507 } 3508 3509 /** 3510 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3511 * 3512 * @adev: amdgpu_device pointer 3513 * 3514 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3515 */ 3516 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3517 { 3518 struct iommu_domain *domain; 3519 3520 domain = iommu_get_domain_for_dev(adev->dev); 3521 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3522 adev->ram_is_direct_mapped = true; 3523 } 3524 3525 static const struct attribute *amdgpu_dev_attributes[] = { 3526 &dev_attr_product_name.attr, 3527 &dev_attr_product_number.attr, 3528 &dev_attr_serial_number.attr, 3529 &dev_attr_pcie_replay_count.attr, 3530 NULL 3531 }; 3532 3533 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 3534 { 3535 if (amdgpu_mcbp == 1) 3536 adev->gfx.mcbp = true; 3537 3538 if ((adev->ip_versions[GC_HWIP][0] >= IP_VERSION(9, 0, 0)) && 3539 (adev->ip_versions[GC_HWIP][0] < IP_VERSION(10, 0, 0)) && 3540 adev->gfx.num_gfx_rings) 3541 adev->gfx.mcbp = true; 3542 3543 if (amdgpu_sriov_vf(adev)) 3544 adev->gfx.mcbp = true; 3545 3546 if (adev->gfx.mcbp) 3547 DRM_INFO("MCBP is enabled\n"); 3548 } 3549 3550 /** 3551 * amdgpu_device_init - initialize the driver 3552 * 3553 * @adev: amdgpu_device pointer 3554 * @flags: driver flags 3555 * 3556 * Initializes the driver info and hw (all asics). 3557 * Returns 0 for success or an error on failure. 3558 * Called at driver startup. 
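 *
 * In broad strokes, the body below performs: mutex/spinlock and work item
 * setup, MMIO mapping, reset domain creation, IP early init, an optional ASIC
 * reset and vBIOS post, clock info parsing, fence driver and IP init, and
 * finally sysfs, switcheroo and late init registration.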
3559 */ 3560 int amdgpu_device_init(struct amdgpu_device *adev, 3561 uint32_t flags) 3562 { 3563 struct drm_device *ddev = adev_to_drm(adev); 3564 struct pci_dev *pdev = adev->pdev; 3565 int r, i; 3566 bool px = false; 3567 u32 max_MBps; 3568 int tmp; 3569 3570 adev->shutdown = false; 3571 adev->flags = flags; 3572 3573 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3574 adev->asic_type = amdgpu_force_asic_type; 3575 else 3576 adev->asic_type = flags & AMD_ASIC_MASK; 3577 3578 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3579 if (amdgpu_emu_mode == 1) 3580 adev->usec_timeout *= 10; 3581 adev->gmc.gart_size = 512 * 1024 * 1024; 3582 adev->accel_working = false; 3583 adev->num_rings = 0; 3584 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3585 adev->mman.buffer_funcs = NULL; 3586 adev->mman.buffer_funcs_ring = NULL; 3587 adev->vm_manager.vm_pte_funcs = NULL; 3588 adev->vm_manager.vm_pte_num_scheds = 0; 3589 adev->gmc.gmc_funcs = NULL; 3590 adev->harvest_ip_mask = 0x0; 3591 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3592 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3593 3594 adev->smc_rreg = &amdgpu_invalid_rreg; 3595 adev->smc_wreg = &amdgpu_invalid_wreg; 3596 adev->pcie_rreg = &amdgpu_invalid_rreg; 3597 adev->pcie_wreg = &amdgpu_invalid_wreg; 3598 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 3599 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 3600 adev->pciep_rreg = &amdgpu_invalid_rreg; 3601 adev->pciep_wreg = &amdgpu_invalid_wreg; 3602 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3603 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3604 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3605 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3606 adev->didt_rreg = &amdgpu_invalid_rreg; 3607 adev->didt_wreg = &amdgpu_invalid_wreg; 3608 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3609 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3610 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3611 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3612 3613 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3614 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3615 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3616 3617 /* mutex initialization are all done here so we 3618 * can recall function without having locking issues */ 3619 mutex_init(&adev->firmware.mutex); 3620 mutex_init(&adev->pm.mutex); 3621 mutex_init(&adev->gfx.gpu_clock_mutex); 3622 mutex_init(&adev->srbm_mutex); 3623 mutex_init(&adev->gfx.pipe_reserve_mutex); 3624 mutex_init(&adev->gfx.gfx_off_mutex); 3625 mutex_init(&adev->gfx.partition_mutex); 3626 mutex_init(&adev->grbm_idx_mutex); 3627 mutex_init(&adev->mn_lock); 3628 mutex_init(&adev->virt.vf_errors.lock); 3629 hash_init(adev->mn_hash); 3630 mutex_init(&adev->psp.mutex); 3631 mutex_init(&adev->notifier_lock); 3632 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3633 mutex_init(&adev->benchmark_mutex); 3634 3635 amdgpu_device_init_apu_flags(adev); 3636 3637 r = amdgpu_device_check_arguments(adev); 3638 if (r) 3639 return r; 3640 3641 spin_lock_init(&adev->mmio_idx_lock); 3642 spin_lock_init(&adev->smc_idx_lock); 3643 spin_lock_init(&adev->pcie_idx_lock); 3644 spin_lock_init(&adev->uvd_ctx_idx_lock); 3645 spin_lock_init(&adev->didt_idx_lock); 3646 spin_lock_init(&adev->gc_cac_idx_lock); 3647 spin_lock_init(&adev->se_cac_idx_lock); 3648 spin_lock_init(&adev->audio_endpt_idx_lock); 3649 spin_lock_init(&adev->mm_stats.lock); 3650 3651 
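/* lists, delayed work and work handlers used later for init, reset and gfxoff handling */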
INIT_LIST_HEAD(&adev->shadow_list); 3652 mutex_init(&adev->shadow_list_lock); 3653 3654 INIT_LIST_HEAD(&adev->reset_list); 3655 3656 INIT_LIST_HEAD(&adev->ras_list); 3657 3658 INIT_DELAYED_WORK(&adev->delayed_init_work, 3659 amdgpu_device_delayed_init_work_handler); 3660 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3661 amdgpu_device_delay_enable_gfx_off); 3662 3663 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3664 3665 adev->gfx.gfx_off_req_count = 1; 3666 adev->gfx.gfx_off_residency = 0; 3667 adev->gfx.gfx_off_entrycount = 0; 3668 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3669 3670 atomic_set(&adev->throttling_logging_enabled, 1); 3671 /* 3672 * If throttling continues, logging will be performed every minute 3673 * to avoid log flooding. "-1" is subtracted since the thermal 3674 * throttling interrupt comes every second. Thus, the total logging 3675 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 3676 * for the throttling interrupt) = 60 seconds. 3677 */ 3678 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3679 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3680 3681 /* Registers mapping */ 3682 /* TODO: block userspace mapping of io register */ 3683 if (adev->asic_type >= CHIP_BONAIRE) { 3684 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3685 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3686 } else { 3687 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3688 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3689 } 3690 3691 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 3692 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 3693 3694 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3695 if (adev->rmmio == NULL) { 3696 return -ENOMEM; 3697 } 3698 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3699 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3700 3701 /* 3702 * The reset domain needs to be present early, before the XGMI hive is 3703 * discovered (if any) and initialized, in order to use the reset sem and 3704 * in_gpu reset flag early on during init and before calling RREG32.
3705 */ 3706 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3707 if (!adev->reset_domain) 3708 return -ENOMEM; 3709 3710 /* detect hw virtualization here */ 3711 amdgpu_detect_virtualization(adev); 3712 3713 amdgpu_device_get_pcie_info(adev); 3714 3715 r = amdgpu_device_get_job_timeout_settings(adev); 3716 if (r) { 3717 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3718 return r; 3719 } 3720 3721 /* early init functions */ 3722 r = amdgpu_device_ip_early_init(adev); 3723 if (r) 3724 return r; 3725 3726 amdgpu_device_set_mcbp(adev); 3727 3728 /* Get rid of things like offb */ 3729 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 3730 if (r) 3731 return r; 3732 3733 /* Enable TMZ based on IP_VERSION */ 3734 amdgpu_gmc_tmz_set(adev); 3735 3736 amdgpu_gmc_noretry_set(adev); 3737 /* Need to get xgmi info early to decide the reset behavior*/ 3738 if (adev->gmc.xgmi.supported) { 3739 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3740 if (r) 3741 return r; 3742 } 3743 3744 /* enable PCIE atomic ops */ 3745 if (amdgpu_sriov_vf(adev)) { 3746 if (adev->virt.fw_reserve.p_pf2vf) 3747 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3748 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3749 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3750 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a 3751 * internal path natively support atomics, set have_atomics_support to true. 3752 */ 3753 } else if ((adev->flags & AMD_IS_APU) && 3754 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) { 3755 adev->have_atomics_support = true; 3756 } else { 3757 adev->have_atomics_support = 3758 !pci_enable_atomic_ops_to_root(adev->pdev, 3759 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3760 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3761 } 3762 3763 if (!adev->have_atomics_support) 3764 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3765 3766 /* doorbell bar mapping and doorbell index init*/ 3767 amdgpu_doorbell_init(adev); 3768 3769 if (amdgpu_emu_mode == 1) { 3770 /* post the asic on emulation mode */ 3771 emu_soc_asic_init(adev); 3772 goto fence_driver_init; 3773 } 3774 3775 amdgpu_reset_init(adev); 3776 3777 /* detect if we are with an SRIOV vbios */ 3778 if (adev->bios) 3779 amdgpu_device_detect_sriov_bios(adev); 3780 3781 /* check if we need to reset the asic 3782 * E.g., driver was not cleanly unloaded previously, etc. 3783 */ 3784 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3785 if (adev->gmc.xgmi.num_physical_nodes) { 3786 dev_info(adev->dev, "Pending hive reset.\n"); 3787 adev->gmc.xgmi.pending_reset = true; 3788 /* Only need to init necessary block for SMU to handle the reset */ 3789 for (i = 0; i < adev->num_ip_blocks; i++) { 3790 if (!adev->ip_blocks[i].status.valid) 3791 continue; 3792 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3793 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3794 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3795 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3796 DRM_DEBUG("IP %s disabled for hw_init.\n", 3797 adev->ip_blocks[i].version->funcs->name); 3798 adev->ip_blocks[i].status.hw = true; 3799 } 3800 } 3801 } else { 3802 tmp = amdgpu_reset_method; 3803 /* It should do a default reset when loading or reloading the driver, 3804 * regardless of the module parameter reset_method. 
3805 */ 3806 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 3807 r = amdgpu_asic_reset(adev); 3808 amdgpu_reset_method = tmp; 3809 if (r) { 3810 dev_err(adev->dev, "asic reset on init failed\n"); 3811 goto failed; 3812 } 3813 } 3814 } 3815 3816 /* Post card if necessary */ 3817 if (amdgpu_device_need_post(adev)) { 3818 if (!adev->bios) { 3819 dev_err(adev->dev, "no vBIOS found\n"); 3820 r = -EINVAL; 3821 goto failed; 3822 } 3823 DRM_INFO("GPU posting now...\n"); 3824 r = amdgpu_device_asic_init(adev); 3825 if (r) { 3826 dev_err(adev->dev, "gpu post error!\n"); 3827 goto failed; 3828 } 3829 } 3830 3831 if (adev->bios) { 3832 if (adev->is_atom_fw) { 3833 /* Initialize clocks */ 3834 r = amdgpu_atomfirmware_get_clock_info(adev); 3835 if (r) { 3836 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3837 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3838 goto failed; 3839 } 3840 } else { 3841 /* Initialize clocks */ 3842 r = amdgpu_atombios_get_clock_info(adev); 3843 if (r) { 3844 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3845 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3846 goto failed; 3847 } 3848 /* init i2c buses */ 3849 if (!amdgpu_device_has_dc_support(adev)) 3850 amdgpu_atombios_i2c_init(adev); 3851 } 3852 } 3853 3854 fence_driver_init: 3855 /* Fence driver */ 3856 r = amdgpu_fence_driver_sw_init(adev); 3857 if (r) { 3858 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3859 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3860 goto failed; 3861 } 3862 3863 /* init the mode config */ 3864 drm_mode_config_init(adev_to_drm(adev)); 3865 3866 r = amdgpu_device_ip_init(adev); 3867 if (r) { 3868 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3869 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3870 goto release_ras_con; 3871 } 3872 3873 amdgpu_fence_driver_hw_init(adev); 3874 3875 dev_info(adev->dev, 3876 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3877 adev->gfx.config.max_shader_engines, 3878 adev->gfx.config.max_sh_per_se, 3879 adev->gfx.config.max_cu_per_sh, 3880 adev->gfx.cu_info.number); 3881 3882 adev->accel_working = true; 3883 3884 amdgpu_vm_check_compute_bug(adev); 3885 3886 /* Initialize the buffer migration limit. */ 3887 if (amdgpu_moverate >= 0) 3888 max_MBps = amdgpu_moverate; 3889 else 3890 max_MBps = 8; /* Allow 8 MB/s. */ 3891 /* Get a log2 for easy divisions. */ 3892 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3893 3894 r = amdgpu_atombios_sysfs_init(adev); 3895 if (r) 3896 drm_err(&adev->ddev, 3897 "registering atombios sysfs failed (%d).\n", r); 3898 3899 r = amdgpu_pm_sysfs_init(adev); 3900 if (r) 3901 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 3902 3903 r = amdgpu_ucode_sysfs_init(adev); 3904 if (r) { 3905 adev->ucode_sysfs_en = false; 3906 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3907 } else 3908 adev->ucode_sysfs_en = true; 3909 3910 /* 3911 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3912 * Otherwise the mgpu fan boost feature will be skipped due to the 3913 * gpu instance is counted less. 3914 */ 3915 amdgpu_register_gpu_instance(adev); 3916 3917 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3918 * explicit gating rather than handling it automatically. 
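 *
 * For reference, a per-IP late_init callback that performs such explicit
 * gating would look roughly like the sketch below (foo_* names are
 * placeholders for a real IP block; the prototype follows struct
 * amd_ip_funcs):
 *
 *   static int foo_late_init(void *handle)
 *   {
 *           struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 *
 *           if (!adev->cg_flags)
 *                   return 0;
 *           return foo_set_clockgating_state(handle, AMD_CG_STATE_GATE);
 *   }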
3919 */ 3920 if (!adev->gmc.xgmi.pending_reset) { 3921 r = amdgpu_device_ip_late_init(adev); 3922 if (r) { 3923 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3924 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3925 goto release_ras_con; 3926 } 3927 /* must succeed. */ 3928 amdgpu_ras_resume(adev); 3929 queue_delayed_work(system_wq, &adev->delayed_init_work, 3930 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3931 } 3932 3933 if (amdgpu_sriov_vf(adev)) { 3934 amdgpu_virt_release_full_gpu(adev, true); 3935 flush_delayed_work(&adev->delayed_init_work); 3936 } 3937 3938 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3939 if (r) 3940 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3941 3942 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3943 r = amdgpu_pmu_init(adev); 3944 if (r) 3945 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3946 3947 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3948 if (amdgpu_device_cache_pci_state(adev->pdev)) 3949 pci_restore_state(pdev); 3950 3951 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3952 /* this will fail for cards that aren't VGA class devices, just 3953 * ignore it */ 3954 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3955 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 3956 3957 px = amdgpu_device_supports_px(ddev); 3958 3959 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 3960 apple_gmux_detect(NULL, NULL))) 3961 vga_switcheroo_register_client(adev->pdev, 3962 &amdgpu_switcheroo_ops, px); 3963 3964 if (px) 3965 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3966 3967 if (adev->gmc.xgmi.pending_reset) 3968 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3969 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3970 3971 amdgpu_device_check_iommu_direct_map(adev); 3972 3973 return 0; 3974 3975 release_ras_con: 3976 if (amdgpu_sriov_vf(adev)) 3977 amdgpu_virt_release_full_gpu(adev, true); 3978 3979 /* failed in exclusive mode due to timeout */ 3980 if (amdgpu_sriov_vf(adev) && 3981 !amdgpu_sriov_runtime(adev) && 3982 amdgpu_virt_mmio_blocked(adev) && 3983 !amdgpu_virt_wait_reset(adev)) { 3984 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3985 /* Don't send request since VF is inactive. */ 3986 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3987 adev->virt.ops = NULL; 3988 r = -EAGAIN; 3989 } 3990 amdgpu_release_ras_context(adev); 3991 3992 failed: 3993 amdgpu_vf_error_trans_all(adev); 3994 3995 return r; 3996 } 3997 3998 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 3999 { 4000 4001 /* Clear all CPU mappings pointing to this device */ 4002 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4003 4004 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4005 amdgpu_doorbell_fini(adev); 4006 4007 iounmap(adev->rmmio); 4008 adev->rmmio = NULL; 4009 if (adev->mman.aper_base_kaddr) 4010 iounmap(adev->mman.aper_base_kaddr); 4011 adev->mman.aper_base_kaddr = NULL; 4012 4013 /* Memory manager related */ 4014 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4015 arch_phys_wc_del(adev->gmc.vram_mtrr); 4016 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4017 } 4018 } 4019 4020 /** 4021 * amdgpu_device_fini_hw - tear down the driver 4022 * 4023 * @adev: amdgpu_device pointer 4024 * 4025 * Tear down the driver info (all asics). 4026 * Called at driver shutdown. 
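 *
 * Teardown is split into a hardware half and a software half; a remove path
 * is expected to call them in roughly this order (simplified sketch, the
 * real sequencing lives in the PCI remove/unplug code):
 *
 *   amdgpu_device_fini_hw(adev);   // quiesce hw, disable irqs, unmap MMIO if unplugged
 *   ...                            // drop DRM references, wait for remaining users
 *   amdgpu_device_fini_sw(adev);   // free sw state, release the reset domain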
4027 */ 4028 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4029 { 4030 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4031 flush_delayed_work(&adev->delayed_init_work); 4032 adev->shutdown = true; 4033 4034 /* make sure IB test finished before entering exclusive mode 4035 * to avoid preemption on IB test 4036 * */ 4037 if (amdgpu_sriov_vf(adev)) { 4038 amdgpu_virt_request_full_gpu(adev, false); 4039 amdgpu_virt_fini_data_exchange(adev); 4040 } 4041 4042 /* disable all interrupts */ 4043 amdgpu_irq_disable_all(adev); 4044 if (adev->mode_info.mode_config_initialized) { 4045 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4046 drm_helper_force_disable_all(adev_to_drm(adev)); 4047 else 4048 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4049 } 4050 amdgpu_fence_driver_hw_fini(adev); 4051 4052 if (adev->mman.initialized) 4053 drain_workqueue(adev->mman.bdev.wq); 4054 4055 if (adev->pm.sysfs_initialized) 4056 amdgpu_pm_sysfs_fini(adev); 4057 if (adev->ucode_sysfs_en) 4058 amdgpu_ucode_sysfs_fini(adev); 4059 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4060 4061 /* disable ras feature must before hw fini */ 4062 amdgpu_ras_pre_fini(adev); 4063 4064 amdgpu_device_ip_fini_early(adev); 4065 4066 amdgpu_irq_fini_hw(adev); 4067 4068 if (adev->mman.initialized) 4069 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4070 4071 amdgpu_gart_dummy_page_fini(adev); 4072 4073 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4074 amdgpu_device_unmap_mmio(adev); 4075 4076 } 4077 4078 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4079 { 4080 int idx; 4081 bool px; 4082 4083 amdgpu_fence_driver_sw_fini(adev); 4084 amdgpu_device_ip_fini(adev); 4085 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4086 adev->accel_working = false; 4087 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4088 4089 amdgpu_reset_fini(adev); 4090 4091 /* free i2c buses */ 4092 if (!amdgpu_device_has_dc_support(adev)) 4093 amdgpu_i2c_fini(adev); 4094 4095 if (amdgpu_emu_mode != 1) 4096 amdgpu_atombios_fini(adev); 4097 4098 kfree(adev->bios); 4099 adev->bios = NULL; 4100 4101 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4102 4103 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 4104 apple_gmux_detect(NULL, NULL))) 4105 vga_switcheroo_unregister_client(adev->pdev); 4106 4107 if (px) 4108 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4109 4110 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4111 vga_client_unregister(adev->pdev); 4112 4113 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4114 4115 iounmap(adev->rmmio); 4116 adev->rmmio = NULL; 4117 amdgpu_doorbell_fini(adev); 4118 drm_dev_exit(idx); 4119 } 4120 4121 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4122 amdgpu_pmu_fini(adev); 4123 if (adev->mman.discovery_bin) 4124 amdgpu_discovery_fini(adev); 4125 4126 amdgpu_reset_put_reset_domain(adev->reset_domain); 4127 adev->reset_domain = NULL; 4128 4129 kfree(adev->pci_state); 4130 4131 } 4132 4133 /** 4134 * amdgpu_device_evict_resources - evict device resources 4135 * @adev: amdgpu device object 4136 * 4137 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4138 * of the vram memory type. Mainly used for evicting device resources 4139 * at suspend time. 
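 *
 * In the suspend path below this is called twice: once before the IP blocks
 * are suspended, to move the bulk of the BOs out while the GPU is still
 * fully operational, and once after phase 1, to pick up buffers that only
 * became idle during the first suspend phase.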
4140 * 4141 */ 4142 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4143 { 4144 int ret; 4145 4146 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4147 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4148 return 0; 4149 4150 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4151 if (ret) 4152 DRM_WARN("evicting device resources failed\n"); 4153 return ret; 4154 } 4155 4156 /* 4157 * Suspend & resume. 4158 */ 4159 /** 4160 * amdgpu_device_suspend - initiate device suspend 4161 * 4162 * @dev: drm dev pointer 4163 * @fbcon : notify the fbdev of suspend 4164 * 4165 * Puts the hw in the suspend state (all asics). 4166 * Returns 0 for success or an error on failure. 4167 * Called at driver suspend. 4168 */ 4169 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4170 { 4171 struct amdgpu_device *adev = drm_to_adev(dev); 4172 int r = 0; 4173 4174 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4175 return 0; 4176 4177 adev->in_suspend = true; 4178 4179 /* Evict the majority of BOs before grabbing the full access */ 4180 r = amdgpu_device_evict_resources(adev); 4181 if (r) 4182 return r; 4183 4184 if (amdgpu_sriov_vf(adev)) { 4185 amdgpu_virt_fini_data_exchange(adev); 4186 r = amdgpu_virt_request_full_gpu(adev, false); 4187 if (r) 4188 return r; 4189 } 4190 4191 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4192 DRM_WARN("smart shift update failed\n"); 4193 4194 if (fbcon) 4195 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4196 4197 cancel_delayed_work_sync(&adev->delayed_init_work); 4198 4199 amdgpu_ras_suspend(adev); 4200 4201 amdgpu_device_ip_suspend_phase1(adev); 4202 4203 if (!adev->in_s0ix) 4204 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4205 4206 r = amdgpu_device_evict_resources(adev); 4207 if (r) 4208 return r; 4209 4210 amdgpu_fence_driver_hw_fini(adev); 4211 4212 amdgpu_device_ip_suspend_phase2(adev); 4213 4214 if (amdgpu_sriov_vf(adev)) 4215 amdgpu_virt_release_full_gpu(adev, false); 4216 4217 return 0; 4218 } 4219 4220 /** 4221 * amdgpu_device_resume - initiate device resume 4222 * 4223 * @dev: drm dev pointer 4224 * @fbcon : notify the fbdev of resume 4225 * 4226 * Bring the hw back to operating state (all asics). 4227 * Returns 0 for success or an error on failure. 4228 * Called at driver resume. 
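 *
 * A typical system-resume wrapper simply forwards the DRM device, e.g.
 * (illustrative sketch; the real dev_pm_ops wrappers live in amdgpu_drv.c):
 *
 *   static int example_pmops_resume(struct device *dev)
 *   {
 *           struct drm_device *drm_dev = dev_get_drvdata(dev);
 *
 *           return amdgpu_device_resume(drm_dev, true);
 *   }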
4229 */ 4230 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4231 { 4232 struct amdgpu_device *adev = drm_to_adev(dev); 4233 int r = 0; 4234 4235 if (amdgpu_sriov_vf(adev)) { 4236 r = amdgpu_virt_request_full_gpu(adev, true); 4237 if (r) 4238 return r; 4239 } 4240 4241 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4242 return 0; 4243 4244 if (adev->in_s0ix) 4245 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4246 4247 /* post card */ 4248 if (amdgpu_device_need_post(adev)) { 4249 r = amdgpu_device_asic_init(adev); 4250 if (r) 4251 dev_err(adev->dev, "amdgpu asic init failed\n"); 4252 } 4253 4254 r = amdgpu_device_ip_resume(adev); 4255 4256 if (r) { 4257 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4258 goto exit; 4259 } 4260 amdgpu_fence_driver_hw_init(adev); 4261 4262 r = amdgpu_device_ip_late_init(adev); 4263 if (r) 4264 goto exit; 4265 4266 queue_delayed_work(system_wq, &adev->delayed_init_work, 4267 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4268 4269 if (!adev->in_s0ix) { 4270 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4271 if (r) 4272 goto exit; 4273 } 4274 4275 exit: 4276 if (amdgpu_sriov_vf(adev)) { 4277 amdgpu_virt_init_data_exchange(adev); 4278 amdgpu_virt_release_full_gpu(adev, true); 4279 } 4280 4281 if (r) 4282 return r; 4283 4284 /* Make sure IB tests flushed */ 4285 flush_delayed_work(&adev->delayed_init_work); 4286 4287 if (fbcon) 4288 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4289 4290 amdgpu_ras_resume(adev); 4291 4292 if (adev->mode_info.num_crtc) { 4293 /* 4294 * Most of the connector probing functions try to acquire runtime pm 4295 * refs to ensure that the GPU is powered on when connector polling is 4296 * performed. Since we're calling this from a runtime PM callback, 4297 * trying to acquire rpm refs will cause us to deadlock. 4298 * 4299 * Since we're guaranteed to be holding the rpm lock, it's safe to 4300 * temporarily disable the rpm helpers so this doesn't deadlock us. 4301 */ 4302 #ifdef CONFIG_PM 4303 dev->dev->power.disable_depth++; 4304 #endif 4305 if (!adev->dc_enabled) 4306 drm_helper_hpd_irq_event(dev); 4307 else 4308 drm_kms_helper_hotplug_event(dev); 4309 #ifdef CONFIG_PM 4310 dev->dev->power.disable_depth--; 4311 #endif 4312 } 4313 adev->in_suspend = false; 4314 4315 if (adev->enable_mes) 4316 amdgpu_mes_self_test(adev); 4317 4318 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4319 DRM_WARN("smart shift update failed\n"); 4320 4321 return 0; 4322 } 4323 4324 /** 4325 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4326 * 4327 * @adev: amdgpu_device pointer 4328 * 4329 * The list of all the hardware IPs that make up the asic is walked and 4330 * the check_soft_reset callbacks are run. check_soft_reset determines 4331 * if the asic is still hung or not. 4332 * Returns true if any of the IPs are still in a hung state, false if not. 
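 *
 * The per-IP callback has the bool-returning amd_ip_funcs prototype; a block
 * that cannot detect hangs simply leaves it NULL. A minimal sketch (foo_*
 * names and the status register are hypothetical):
 *
 *   static bool foo_check_soft_reset(void *handle)
 *   {
 *           struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 *
 *           return !!(RREG32(mmFOO_STATUS) & FOO_STATUS__BUSY_MASK);
 *   }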
4333 */ 4334 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4335 { 4336 int i; 4337 bool asic_hang = false; 4338 4339 if (amdgpu_sriov_vf(adev)) 4340 return true; 4341 4342 if (amdgpu_asic_need_full_reset(adev)) 4343 return true; 4344 4345 for (i = 0; i < adev->num_ip_blocks; i++) { 4346 if (!adev->ip_blocks[i].status.valid) 4347 continue; 4348 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4349 adev->ip_blocks[i].status.hang = 4350 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4351 if (adev->ip_blocks[i].status.hang) { 4352 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4353 asic_hang = true; 4354 } 4355 } 4356 return asic_hang; 4357 } 4358 4359 /** 4360 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4361 * 4362 * @adev: amdgpu_device pointer 4363 * 4364 * The list of all the hardware IPs that make up the asic is walked and the 4365 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4366 * handles any IP specific hardware or software state changes that are 4367 * necessary for a soft reset to succeed. 4368 * Returns 0 on success, negative error code on failure. 4369 */ 4370 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4371 { 4372 int i, r = 0; 4373 4374 for (i = 0; i < adev->num_ip_blocks; i++) { 4375 if (!adev->ip_blocks[i].status.valid) 4376 continue; 4377 if (adev->ip_blocks[i].status.hang && 4378 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4379 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4380 if (r) 4381 return r; 4382 } 4383 } 4384 4385 return 0; 4386 } 4387 4388 /** 4389 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4390 * 4391 * @adev: amdgpu_device pointer 4392 * 4393 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4394 * reset is necessary to recover. 4395 * Returns true if a full asic reset is required, false if not. 4396 */ 4397 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4398 { 4399 int i; 4400 4401 if (amdgpu_asic_need_full_reset(adev)) 4402 return true; 4403 4404 for (i = 0; i < adev->num_ip_blocks; i++) { 4405 if (!adev->ip_blocks[i].status.valid) 4406 continue; 4407 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4408 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4409 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4410 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4411 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4412 if (adev->ip_blocks[i].status.hang) { 4413 dev_info(adev->dev, "Some block need full reset!\n"); 4414 return true; 4415 } 4416 } 4417 } 4418 return false; 4419 } 4420 4421 /** 4422 * amdgpu_device_ip_soft_reset - do a soft reset 4423 * 4424 * @adev: amdgpu_device pointer 4425 * 4426 * The list of all the hardware IPs that make up the asic is walked and the 4427 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4428 * IP specific hardware or software state changes that are necessary to soft 4429 * reset the IP. 4430 * Returns 0 on success, negative error code on failure. 
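 *
 * Together with the pre/post helpers above and below, this forms the
 * three-step soft-reset attempt driven from amdgpu_device_pre_asic_reset():
 *
 *   amdgpu_device_ip_pre_soft_reset(adev);
 *   r = amdgpu_device_ip_soft_reset(adev);
 *   amdgpu_device_ip_post_soft_reset(adev);
 *   if (r || amdgpu_device_ip_check_soft_reset(adev))
 *           need_full_reset = true;   // fall back to a full ASIC reset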
4431 */ 4432 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4433 { 4434 int i, r = 0; 4435 4436 for (i = 0; i < adev->num_ip_blocks; i++) { 4437 if (!adev->ip_blocks[i].status.valid) 4438 continue; 4439 if (adev->ip_blocks[i].status.hang && 4440 adev->ip_blocks[i].version->funcs->soft_reset) { 4441 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4442 if (r) 4443 return r; 4444 } 4445 } 4446 4447 return 0; 4448 } 4449 4450 /** 4451 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4452 * 4453 * @adev: amdgpu_device pointer 4454 * 4455 * The list of all the hardware IPs that make up the asic is walked and the 4456 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4457 * handles any IP specific hardware or software state changes that are 4458 * necessary after the IP has been soft reset. 4459 * Returns 0 on success, negative error code on failure. 4460 */ 4461 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4462 { 4463 int i, r = 0; 4464 4465 for (i = 0; i < adev->num_ip_blocks; i++) { 4466 if (!adev->ip_blocks[i].status.valid) 4467 continue; 4468 if (adev->ip_blocks[i].status.hang && 4469 adev->ip_blocks[i].version->funcs->post_soft_reset) 4470 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4471 if (r) 4472 return r; 4473 } 4474 4475 return 0; 4476 } 4477 4478 /** 4479 * amdgpu_device_recover_vram - Recover some VRAM contents 4480 * 4481 * @adev: amdgpu_device pointer 4482 * 4483 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4484 * restore things like GPUVM page tables after a GPU reset where 4485 * the contents of VRAM might be lost. 4486 * 4487 * Returns: 4488 * 0 on success, negative error code on failure. 4489 */ 4490 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4491 { 4492 struct dma_fence *fence = NULL, *next = NULL; 4493 struct amdgpu_bo *shadow; 4494 struct amdgpu_bo_vm *vmbo; 4495 long r = 1, tmo; 4496 4497 if (amdgpu_sriov_runtime(adev)) 4498 tmo = msecs_to_jiffies(8000); 4499 else 4500 tmo = msecs_to_jiffies(100); 4501 4502 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4503 mutex_lock(&adev->shadow_list_lock); 4504 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4505 /* If vm is compute context or adev is APU, shadow will be NULL */ 4506 if (!vmbo->shadow) 4507 continue; 4508 shadow = vmbo->shadow; 4509 4510 /* No need to recover an evicted BO */ 4511 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4512 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4513 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4514 continue; 4515 4516 r = amdgpu_bo_restore_shadow(shadow, &next); 4517 if (r) 4518 break; 4519 4520 if (fence) { 4521 tmo = dma_fence_wait_timeout(fence, false, tmo); 4522 dma_fence_put(fence); 4523 fence = next; 4524 if (tmo == 0) { 4525 r = -ETIMEDOUT; 4526 break; 4527 } else if (tmo < 0) { 4528 r = tmo; 4529 break; 4530 } 4531 } else { 4532 fence = next; 4533 } 4534 } 4535 mutex_unlock(&adev->shadow_list_lock); 4536 4537 if (fence) 4538 tmo = dma_fence_wait_timeout(fence, false, tmo); 4539 dma_fence_put(fence); 4540 4541 if (r < 0 || tmo <= 0) { 4542 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4543 return -EIO; 4544 } 4545 4546 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4547 return 0; 4548 } 4549 4550 4551 /** 4552 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4553 * 4554 * @adev: amdgpu_device pointer 4555 * 
@from_hypervisor: request from hypervisor 4556 * 4557 * do VF FLR and reinitialize Asic 4558 * return 0 means succeeded otherwise failed 4559 */ 4560 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4561 bool from_hypervisor) 4562 { 4563 int r; 4564 struct amdgpu_hive_info *hive = NULL; 4565 int retry_limit = 0; 4566 4567 retry: 4568 amdgpu_amdkfd_pre_reset(adev); 4569 4570 if (from_hypervisor) 4571 r = amdgpu_virt_request_full_gpu(adev, true); 4572 else 4573 r = amdgpu_virt_reset_gpu(adev); 4574 if (r) 4575 return r; 4576 4577 /* Resume IP prior to SMC */ 4578 r = amdgpu_device_ip_reinit_early_sriov(adev); 4579 if (r) 4580 goto error; 4581 4582 amdgpu_virt_init_data_exchange(adev); 4583 4584 r = amdgpu_device_fw_loading(adev); 4585 if (r) 4586 return r; 4587 4588 /* now we are okay to resume SMC/CP/SDMA */ 4589 r = amdgpu_device_ip_reinit_late_sriov(adev); 4590 if (r) 4591 goto error; 4592 4593 hive = amdgpu_get_xgmi_hive(adev); 4594 /* Update PSP FW topology after reset */ 4595 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4596 r = amdgpu_xgmi_update_topology(hive, adev); 4597 4598 if (hive) 4599 amdgpu_put_xgmi_hive(hive); 4600 4601 if (!r) { 4602 amdgpu_irq_gpu_reset_resume_helper(adev); 4603 r = amdgpu_ib_ring_tests(adev); 4604 4605 amdgpu_amdkfd_post_reset(adev); 4606 } 4607 4608 error: 4609 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4610 amdgpu_inc_vram_lost(adev); 4611 r = amdgpu_device_recover_vram(adev); 4612 } 4613 amdgpu_virt_release_full_gpu(adev, true); 4614 4615 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4616 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4617 retry_limit++; 4618 goto retry; 4619 } else 4620 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4621 } 4622 4623 return r; 4624 } 4625 4626 /** 4627 * amdgpu_device_has_job_running - check if there is any job in mirror list 4628 * 4629 * @adev: amdgpu_device pointer 4630 * 4631 * check if there is any job in mirror list 4632 */ 4633 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4634 { 4635 int i; 4636 struct drm_sched_job *job; 4637 4638 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4639 struct amdgpu_ring *ring = adev->rings[i]; 4640 4641 if (!ring || !ring->sched.thread) 4642 continue; 4643 4644 spin_lock(&ring->sched.job_list_lock); 4645 job = list_first_entry_or_null(&ring->sched.pending_list, 4646 struct drm_sched_job, list); 4647 spin_unlock(&ring->sched.job_list_lock); 4648 if (job) 4649 return true; 4650 } 4651 return false; 4652 } 4653 4654 /** 4655 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4656 * 4657 * @adev: amdgpu_device pointer 4658 * 4659 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4660 * a hung GPU. 
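 *
 * The decision follows the amdgpu_gpu_recovery module parameter: 0 disables
 * recovery, -1 selects the per-ASIC default (which disables recovery on the
 * legacy SI/CIK/Carrizo-class parts listed in the switch below), and any
 * other value enables it. Unless recovery is disabled outright, SR-IOV VFs
 * and devices without RAS poison-mode support always report true. A typical
 * caller checks this before starting a reset, e.g. (simplified sketch):
 *
 *   if (amdgpu_device_should_recover_gpu(adev))
 *           r = amdgpu_device_gpu_recover(adev, job, &reset_context);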
4661 */ 4662 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4663 { 4664 4665 if (amdgpu_gpu_recovery == 0) 4666 goto disabled; 4667 4668 /* Skip soft reset check in fatal error mode */ 4669 if (!amdgpu_ras_is_poison_mode_supported(adev)) 4670 return true; 4671 4672 if (amdgpu_sriov_vf(adev)) 4673 return true; 4674 4675 if (amdgpu_gpu_recovery == -1) { 4676 switch (adev->asic_type) { 4677 #ifdef CONFIG_DRM_AMDGPU_SI 4678 case CHIP_VERDE: 4679 case CHIP_TAHITI: 4680 case CHIP_PITCAIRN: 4681 case CHIP_OLAND: 4682 case CHIP_HAINAN: 4683 #endif 4684 #ifdef CONFIG_DRM_AMDGPU_CIK 4685 case CHIP_KAVERI: 4686 case CHIP_KABINI: 4687 case CHIP_MULLINS: 4688 #endif 4689 case CHIP_CARRIZO: 4690 case CHIP_STONEY: 4691 case CHIP_CYAN_SKILLFISH: 4692 goto disabled; 4693 default: 4694 break; 4695 } 4696 } 4697 4698 return true; 4699 4700 disabled: 4701 dev_info(adev->dev, "GPU recovery disabled.\n"); 4702 return false; 4703 } 4704 4705 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4706 { 4707 u32 i; 4708 int ret = 0; 4709 4710 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4711 4712 dev_info(adev->dev, "GPU mode1 reset\n"); 4713 4714 /* disable BM */ 4715 pci_clear_master(adev->pdev); 4716 4717 amdgpu_device_cache_pci_state(adev->pdev); 4718 4719 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4720 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4721 ret = amdgpu_dpm_mode1_reset(adev); 4722 } else { 4723 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4724 ret = psp_gpu_reset(adev); 4725 } 4726 4727 if (ret) 4728 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4729 4730 amdgpu_device_load_pci_state(adev->pdev); 4731 4732 /* wait for asic to come out of reset */ 4733 for (i = 0; i < adev->usec_timeout; i++) { 4734 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4735 4736 if (memsize != 0xffffffff) 4737 break; 4738 udelay(1); 4739 } 4740 4741 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4742 return ret; 4743 } 4744 4745 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4746 struct amdgpu_reset_context *reset_context) 4747 { 4748 int i, r = 0; 4749 struct amdgpu_job *job = NULL; 4750 bool need_full_reset = 4751 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4752 4753 if (reset_context->reset_req_dev == adev) 4754 job = reset_context->job; 4755 4756 if (amdgpu_sriov_vf(adev)) { 4757 /* stop the data exchange thread */ 4758 amdgpu_virt_fini_data_exchange(adev); 4759 } 4760 4761 amdgpu_fence_driver_isr_toggle(adev, true); 4762 4763 /* block all schedulers and reset given job's ring */ 4764 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4765 struct amdgpu_ring *ring = adev->rings[i]; 4766 4767 if (!ring || !ring->sched.thread) 4768 continue; 4769 4770 /*clear job fence from fence drv to avoid force_completion 4771 *leave NULL and vm flush fence in fence drv */ 4772 amdgpu_fence_driver_clear_job_fences(ring); 4773 4774 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4775 amdgpu_fence_driver_force_completion(ring); 4776 } 4777 4778 amdgpu_fence_driver_isr_toggle(adev, false); 4779 4780 if (job && job->vm) 4781 drm_sched_increase_karma(&job->base); 4782 4783 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4784 /* If reset handler not implemented, continue; otherwise return */ 4785 if (r == -ENOSYS) 4786 r = 0; 4787 else 4788 return r; 4789 4790 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4791 if (!amdgpu_sriov_vf(adev)) { 4792 4793 if (!need_full_reset) 4794 need_full_reset = 
amdgpu_device_ip_need_full_reset(adev); 4795 4796 if (!need_full_reset && amdgpu_gpu_recovery && 4797 amdgpu_device_ip_check_soft_reset(adev)) { 4798 amdgpu_device_ip_pre_soft_reset(adev); 4799 r = amdgpu_device_ip_soft_reset(adev); 4800 amdgpu_device_ip_post_soft_reset(adev); 4801 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4802 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4803 need_full_reset = true; 4804 } 4805 } 4806 4807 if (need_full_reset) 4808 r = amdgpu_device_ip_suspend(adev); 4809 if (need_full_reset) 4810 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4811 else 4812 clear_bit(AMDGPU_NEED_FULL_RESET, 4813 &reset_context->flags); 4814 } 4815 4816 return r; 4817 } 4818 4819 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 4820 { 4821 int i; 4822 4823 lockdep_assert_held(&adev->reset_domain->sem); 4824 4825 for (i = 0; i < adev->num_regs; i++) { 4826 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); 4827 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], 4828 adev->reset_dump_reg_value[i]); 4829 } 4830 4831 return 0; 4832 } 4833 4834 #ifdef CONFIG_DEV_COREDUMP 4835 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, 4836 size_t count, void *data, size_t datalen) 4837 { 4838 struct drm_printer p; 4839 struct amdgpu_device *adev = data; 4840 struct drm_print_iterator iter; 4841 int i; 4842 4843 iter.data = buffer; 4844 iter.offset = 0; 4845 iter.start = offset; 4846 iter.remain = count; 4847 4848 p = drm_coredump_printer(&iter); 4849 4850 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 4851 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 4852 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 4853 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec); 4854 if (adev->reset_task_info.pid) 4855 drm_printf(&p, "process_name: %s PID: %d\n", 4856 adev->reset_task_info.process_name, 4857 adev->reset_task_info.pid); 4858 4859 if (adev->reset_vram_lost) 4860 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 4861 if (adev->num_regs) { 4862 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); 4863 4864 for (i = 0; i < adev->num_regs; i++) 4865 drm_printf(&p, "0x%08x: 0x%08x\n", 4866 adev->reset_dump_reg_list[i], 4867 adev->reset_dump_reg_value[i]); 4868 } 4869 4870 return count - iter.remain; 4871 } 4872 4873 static void amdgpu_devcoredump_free(void *data) 4874 { 4875 } 4876 4877 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) 4878 { 4879 struct drm_device *dev = adev_to_drm(adev); 4880 4881 ktime_get_ts64(&adev->reset_time); 4882 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL, 4883 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 4884 } 4885 #endif 4886 4887 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4888 struct amdgpu_reset_context *reset_context) 4889 { 4890 struct amdgpu_device *tmp_adev = NULL; 4891 bool need_full_reset, skip_hw_reset, vram_lost = false; 4892 int r = 0; 4893 bool gpu_reset_for_dev_remove = 0; 4894 4895 /* Try reset handler method first */ 4896 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4897 reset_list); 4898 amdgpu_reset_reg_dumps(tmp_adev); 4899 4900 reset_context->reset_device_list = device_list_handle; 4901 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4902 /* If reset handler not implemented, continue; otherwise return */ 4903 if (r == -ENOSYS) 4904 r = 0; 4905 else 4906 return r; 4907 4908 /* Reset handler not implemented, use the 
default method */
	need_full_reset =
		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
	skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);

	gpu_reset_for_dev_remove =
		test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
			test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);

	/*
	 * ASIC reset has to be done on all XGMI hive nodes ASAP
	 * to allow proper link negotiation in FW (within 1 sec).
	 */
	if (!skip_hw_reset && need_full_reset) {
		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
			/* For XGMI run all resets in parallel to speed up the process */
			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
				tmp_adev->gmc.xgmi.pending_reset = false;
				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
					r = -EALREADY;
			} else
				r = amdgpu_asic_reset(tmp_adev);

			if (r) {
				dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
					r, adev_to_drm(tmp_adev)->unique);
				break;
			}
		}

		/* For XGMI wait for all resets to complete before proceeding */
		if (!r) {
			list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
					flush_work(&tmp_adev->xgmi_reset_work);
					r = tmp_adev->asic_reset_res;
					if (r)
						break;
				}
			}
		}
	}

	if (!r && amdgpu_ras_intr_triggered()) {
		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
			if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops &&
			    tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
				tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev);
		}

		amdgpu_ras_intr_cleared();
	}

	/* Since the mode1 reset affects base IP blocks, the
	 * phase 1 IP blocks need to be resumed. Otherwise there
	 * will be a BIOS signature error and the PSP bootloader
	 * can't load the kdb on the next amdgpu driver load.
4965 */ 4966 if (gpu_reset_for_dev_remove) { 4967 list_for_each_entry(tmp_adev, device_list_handle, reset_list) 4968 amdgpu_device_ip_resume_phase1(tmp_adev); 4969 4970 goto end; 4971 } 4972 4973 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4974 if (need_full_reset) { 4975 /* post card */ 4976 r = amdgpu_device_asic_init(tmp_adev); 4977 if (r) { 4978 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4979 } else { 4980 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4981 r = amdgpu_amdkfd_resume_iommu(tmp_adev); 4982 if (r) 4983 goto out; 4984 4985 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4986 if (r) 4987 goto out; 4988 4989 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4990 #ifdef CONFIG_DEV_COREDUMP 4991 tmp_adev->reset_vram_lost = vram_lost; 4992 memset(&tmp_adev->reset_task_info, 0, 4993 sizeof(tmp_adev->reset_task_info)); 4994 if (reset_context->job && reset_context->job->vm) 4995 tmp_adev->reset_task_info = 4996 reset_context->job->vm->task_info; 4997 amdgpu_reset_capture_coredumpm(tmp_adev); 4998 #endif 4999 if (vram_lost) { 5000 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5001 amdgpu_inc_vram_lost(tmp_adev); 5002 } 5003 5004 r = amdgpu_device_fw_loading(tmp_adev); 5005 if (r) 5006 return r; 5007 5008 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5009 if (r) 5010 goto out; 5011 5012 if (vram_lost) 5013 amdgpu_device_fill_reset_magic(tmp_adev); 5014 5015 /* 5016 * Add this ASIC as tracked as reset was already 5017 * complete successfully. 5018 */ 5019 amdgpu_register_gpu_instance(tmp_adev); 5020 5021 if (!reset_context->hive && 5022 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5023 amdgpu_xgmi_add_device(tmp_adev); 5024 5025 r = amdgpu_device_ip_late_init(tmp_adev); 5026 if (r) 5027 goto out; 5028 5029 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5030 5031 /* 5032 * The GPU enters bad state once faulty pages 5033 * by ECC has reached the threshold, and ras 5034 * recovery is scheduled next. So add one check 5035 * here to break recovery if it indeed exceeds 5036 * bad page threshold, and remind user to 5037 * retire this GPU or setting one bigger 5038 * bad_page_threshold value to fix this once 5039 * probing driver again. 5040 */ 5041 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5042 /* must succeed. 
*/ 5043 amdgpu_ras_resume(tmp_adev); 5044 } else { 5045 r = -EINVAL; 5046 goto out; 5047 } 5048 5049 /* Update PSP FW topology after reset */ 5050 if (reset_context->hive && 5051 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5052 r = amdgpu_xgmi_update_topology( 5053 reset_context->hive, tmp_adev); 5054 } 5055 } 5056 5057 out: 5058 if (!r) { 5059 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5060 r = amdgpu_ib_ring_tests(tmp_adev); 5061 if (r) { 5062 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5063 need_full_reset = true; 5064 r = -EAGAIN; 5065 goto end; 5066 } 5067 } 5068 5069 if (!r) 5070 r = amdgpu_device_recover_vram(tmp_adev); 5071 else 5072 tmp_adev->asic_reset_res = r; 5073 } 5074 5075 end: 5076 if (need_full_reset) 5077 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5078 else 5079 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5080 return r; 5081 } 5082 5083 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5084 { 5085 5086 switch (amdgpu_asic_reset_method(adev)) { 5087 case AMD_RESET_METHOD_MODE1: 5088 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5089 break; 5090 case AMD_RESET_METHOD_MODE2: 5091 adev->mp1_state = PP_MP1_STATE_RESET; 5092 break; 5093 default: 5094 adev->mp1_state = PP_MP1_STATE_NONE; 5095 break; 5096 } 5097 } 5098 5099 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5100 { 5101 amdgpu_vf_error_trans_all(adev); 5102 adev->mp1_state = PP_MP1_STATE_NONE; 5103 } 5104 5105 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5106 { 5107 struct pci_dev *p = NULL; 5108 5109 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5110 adev->pdev->bus->number, 1); 5111 if (p) { 5112 pm_runtime_enable(&(p->dev)); 5113 pm_runtime_resume(&(p->dev)); 5114 } 5115 5116 pci_dev_put(p); 5117 } 5118 5119 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5120 { 5121 enum amd_reset_method reset_method; 5122 struct pci_dev *p = NULL; 5123 u64 expires; 5124 5125 /* 5126 * For now, only BACO and mode1 reset are confirmed 5127 * to suffer the audio issue without proper suspended. 5128 */ 5129 reset_method = amdgpu_asic_reset_method(adev); 5130 if ((reset_method != AMD_RESET_METHOD_BACO) && 5131 (reset_method != AMD_RESET_METHOD_MODE1)) 5132 return -EINVAL; 5133 5134 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5135 adev->pdev->bus->number, 1); 5136 if (!p) 5137 return -ENODEV; 5138 5139 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5140 if (!expires) 5141 /* 5142 * If we cannot get the audio device autosuspend delay, 5143 * a fixed 4S interval will be used. Considering 3S is 5144 * the audio controller default autosuspend delay setting. 5145 * 4S used here is guaranteed to cover that. 5146 */ 5147 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5148 5149 while (!pm_runtime_status_suspended(&(p->dev))) { 5150 if (!pm_runtime_suspend(&(p->dev))) 5151 break; 5152 5153 if (expires < ktime_get_mono_fast_ns()) { 5154 dev_warn(adev->dev, "failed to suspend display audio\n"); 5155 pci_dev_put(p); 5156 /* TODO: abort the succeeding gpu reset? 
*/ 5157 return -ETIMEDOUT; 5158 } 5159 } 5160 5161 pm_runtime_disable(&(p->dev)); 5162 5163 pci_dev_put(p); 5164 return 0; 5165 } 5166 5167 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5168 { 5169 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5170 5171 #if defined(CONFIG_DEBUG_FS) 5172 if (!amdgpu_sriov_vf(adev)) 5173 cancel_work(&adev->reset_work); 5174 #endif 5175 5176 if (adev->kfd.dev) 5177 cancel_work(&adev->kfd.reset_work); 5178 5179 if (amdgpu_sriov_vf(adev)) 5180 cancel_work(&adev->virt.flr_work); 5181 5182 if (con && adev->ras_enabled) 5183 cancel_work(&con->recovery_work); 5184 5185 } 5186 5187 /** 5188 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5189 * 5190 * @adev: amdgpu_device pointer 5191 * @job: which job trigger hang 5192 * @reset_context: amdgpu reset context pointer 5193 * 5194 * Attempt to reset the GPU if it has hung (all asics). 5195 * Attempt to do soft-reset or full-reset and reinitialize Asic 5196 * Returns 0 for success or an error on failure. 5197 */ 5198 5199 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5200 struct amdgpu_job *job, 5201 struct amdgpu_reset_context *reset_context) 5202 { 5203 struct list_head device_list, *device_list_handle = NULL; 5204 bool job_signaled = false; 5205 struct amdgpu_hive_info *hive = NULL; 5206 struct amdgpu_device *tmp_adev = NULL; 5207 int i, r = 0; 5208 bool need_emergency_restart = false; 5209 bool audio_suspended = false; 5210 bool gpu_reset_for_dev_remove = false; 5211 5212 gpu_reset_for_dev_remove = 5213 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5214 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5215 5216 /* 5217 * Special case: RAS triggered and full reset isn't supported 5218 */ 5219 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5220 5221 /* 5222 * Flush RAM to disk so that after reboot 5223 * the user can read log and see why the system rebooted. 5224 */ 5225 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5226 DRM_WARN("Emergency reboot."); 5227 5228 ksys_sync_helper(); 5229 emergency_restart(); 5230 } 5231 5232 dev_info(adev->dev, "GPU %s begin!\n", 5233 need_emergency_restart ? "jobs stop":"reset"); 5234 5235 if (!amdgpu_sriov_vf(adev)) 5236 hive = amdgpu_get_xgmi_hive(adev); 5237 if (hive) 5238 mutex_lock(&hive->hive_lock); 5239 5240 reset_context->job = job; 5241 reset_context->hive = hive; 5242 /* 5243 * Build list of devices to reset. 5244 * In case we are in XGMI hive mode, resort the device list 5245 * to put adev in the 1st position. 
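 *
 * list_rotate_to_front() below is what guarantees this ordering, so the
 * device that actually reported the hang is reset and re-initialized before
 * the other members of its hive.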
5246 */ 5247 INIT_LIST_HEAD(&device_list); 5248 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5249 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5250 list_add_tail(&tmp_adev->reset_list, &device_list); 5251 if (gpu_reset_for_dev_remove && adev->shutdown) 5252 tmp_adev->shutdown = true; 5253 } 5254 if (!list_is_first(&adev->reset_list, &device_list)) 5255 list_rotate_to_front(&adev->reset_list, &device_list); 5256 device_list_handle = &device_list; 5257 } else { 5258 list_add_tail(&adev->reset_list, &device_list); 5259 device_list_handle = &device_list; 5260 } 5261 5262 /* We need to lock reset domain only once both for XGMI and single device */ 5263 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5264 reset_list); 5265 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5266 5267 /* block all schedulers and reset given job's ring */ 5268 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5269 5270 amdgpu_device_set_mp1_state(tmp_adev); 5271 5272 /* 5273 * Try to put the audio codec into suspend state 5274 * before gpu reset started. 5275 * 5276 * Due to the power domain of the graphics device 5277 * is shared with AZ power domain. Without this, 5278 * we may change the audio hardware from behind 5279 * the audio driver's back. That will trigger 5280 * some audio codec errors. 5281 */ 5282 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5283 audio_suspended = true; 5284 5285 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5286 5287 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5288 5289 if (!amdgpu_sriov_vf(tmp_adev)) 5290 amdgpu_amdkfd_pre_reset(tmp_adev); 5291 5292 /* 5293 * Mark these ASICs to be reseted as untracked first 5294 * And add them back after reset completed 5295 */ 5296 amdgpu_unregister_gpu_instance(tmp_adev); 5297 5298 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5299 5300 /* disable ras on ALL IPs */ 5301 if (!need_emergency_restart && 5302 amdgpu_device_ip_need_full_reset(tmp_adev)) 5303 amdgpu_ras_suspend(tmp_adev); 5304 5305 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5306 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5307 5308 if (!ring || !ring->sched.thread) 5309 continue; 5310 5311 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5312 5313 if (need_emergency_restart) 5314 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5315 } 5316 atomic_inc(&tmp_adev->gpu_reset_counter); 5317 } 5318 5319 if (need_emergency_restart) 5320 goto skip_sched_resume; 5321 5322 /* 5323 * Must check guilty signal here since after this point all old 5324 * HW fences are force signaled. 5325 * 5326 * job->base holds a reference to parent fence 5327 */ 5328 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5329 job_signaled = true; 5330 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5331 goto skip_hw_reset; 5332 } 5333 5334 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5335 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5336 if (gpu_reset_for_dev_remove) { 5337 /* Workaroud for ASICs need to disable SMC first */ 5338 amdgpu_device_smu_fini_early(tmp_adev); 5339 } 5340 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5341 /*TODO Should we stop ?*/ 5342 if (r) { 5343 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5344 r, adev_to_drm(tmp_adev)->unique); 5345 tmp_adev->asic_reset_res = r; 5346 } 5347 5348 /* 5349 * Drop all pending non scheduler resets. 
Scheduler resets 5350 * were already dropped during drm_sched_stop 5351 */ 5352 amdgpu_device_stop_pending_resets(tmp_adev); 5353 } 5354 5355 /* Actual ASIC resets if needed.*/ 5356 /* Host driver will handle XGMI hive reset for SRIOV */ 5357 if (amdgpu_sriov_vf(adev)) { 5358 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5359 if (r) 5360 adev->asic_reset_res = r; 5361 5362 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5363 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) || 5364 adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3)) 5365 amdgpu_ras_resume(adev); 5366 } else { 5367 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5368 if (r && r == -EAGAIN) 5369 goto retry; 5370 5371 if (!r && gpu_reset_for_dev_remove) 5372 goto recover_end; 5373 } 5374 5375 skip_hw_reset: 5376 5377 /* Post ASIC reset for all devs .*/ 5378 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5379 5380 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5381 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5382 5383 if (!ring || !ring->sched.thread) 5384 continue; 5385 5386 drm_sched_start(&ring->sched, true); 5387 } 5388 5389 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3)) 5390 amdgpu_mes_self_test(tmp_adev); 5391 5392 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) { 5393 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5394 } 5395 5396 if (tmp_adev->asic_reset_res) 5397 r = tmp_adev->asic_reset_res; 5398 5399 tmp_adev->asic_reset_res = 0; 5400 5401 if (r) { 5402 /* bad news, how to tell it to userspace ? */ 5403 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5404 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5405 } else { 5406 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5407 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5408 DRM_WARN("smart shift update failed\n"); 5409 } 5410 } 5411 5412 skip_sched_resume: 5413 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5414 /* unlock kfd: SRIOV would do it separately */ 5415 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5416 amdgpu_amdkfd_post_reset(tmp_adev); 5417 5418 /* kfd_post_reset will do nothing if kfd device is not initialized, 5419 * need to bring up kfd here if it's not be initialized before 5420 */ 5421 if (!adev->kfd.init_complete) 5422 amdgpu_amdkfd_device_init(adev); 5423 5424 if (audio_suspended) 5425 amdgpu_device_resume_display_audio(tmp_adev); 5426 5427 amdgpu_device_unset_mp1_state(tmp_adev); 5428 5429 amdgpu_ras_set_error_query_ready(tmp_adev, true); 5430 } 5431 5432 recover_end: 5433 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5434 reset_list); 5435 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5436 5437 if (hive) { 5438 mutex_unlock(&hive->hive_lock); 5439 amdgpu_put_xgmi_hive(hive); 5440 } 5441 5442 if (r) 5443 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5444 5445 atomic_set(&adev->reset_domain->reset_res, r); 5446 return r; 5447 } 5448 5449 /** 5450 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 5451 * 5452 * @adev: amdgpu_device pointer 5453 * 5454 * Fetchs and stores in the driver the PCIE capabilities (gen speed 5455 * and lanes) of the slot the device is in. Handles APUs and 5456 * virtualized environments where PCIE config space may not be available. 
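 *
 * The computed pcie_gen_mask is composed of CAIL_*_LINK_SPEED_SUPPORT flags
 * from amd_pcie.h, with the ASIC capabilities in the low word and the
 * platform capabilities in the high word. Both masks can be forced via the
 * amdgpu.pcie_gen_cap and amdgpu.pcie_lane_cap module parameters, which are
 * checked first in the function body, e.g. (illustrative value only,
 * advertising Gen1-Gen3 on both the ASIC and the platform side):
 *
 *   modprobe amdgpu pcie_gen_cap=0x00070007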
5457 */ 5458 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5459 { 5460 struct pci_dev *pdev; 5461 enum pci_bus_speed speed_cap, platform_speed_cap; 5462 enum pcie_link_width platform_link_width; 5463 5464 if (amdgpu_pcie_gen_cap) 5465 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5466 5467 if (amdgpu_pcie_lane_cap) 5468 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5469 5470 /* covers APUs as well */ 5471 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 5472 if (adev->pm.pcie_gen_mask == 0) 5473 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5474 if (adev->pm.pcie_mlw_mask == 0) 5475 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5476 return; 5477 } 5478 5479 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5480 return; 5481 5482 pcie_bandwidth_available(adev->pdev, NULL, 5483 &platform_speed_cap, &platform_link_width); 5484 5485 if (adev->pm.pcie_gen_mask == 0) { 5486 /* asic caps */ 5487 pdev = adev->pdev; 5488 speed_cap = pcie_get_speed_cap(pdev); 5489 if (speed_cap == PCI_SPEED_UNKNOWN) { 5490 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5491 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5492 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5493 } else { 5494 if (speed_cap == PCIE_SPEED_32_0GT) 5495 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5496 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5497 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5498 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5499 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5500 else if (speed_cap == PCIE_SPEED_16_0GT) 5501 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5502 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5503 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5504 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5505 else if (speed_cap == PCIE_SPEED_8_0GT) 5506 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5507 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5508 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5509 else if (speed_cap == PCIE_SPEED_5_0GT) 5510 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5511 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5512 else 5513 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5514 } 5515 /* platform caps */ 5516 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5517 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5518 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5519 } else { 5520 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5521 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5522 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5523 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5524 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5525 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5526 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5527 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5528 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5529 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5530 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5531 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5532 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5533 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5534 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5535 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5536 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5537 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5538 else 5539 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5540 5541 } 5542 } 5543 if (adev->pm.pcie_mlw_mask == 0) { 5544 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) 
{ 5545 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5546 } else { 5547 switch (platform_link_width) { 5548 case PCIE_LNK_X32: 5549 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5550 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5551 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5552 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5553 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5554 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5555 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5556 break; 5557 case PCIE_LNK_X16: 5558 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5559 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5560 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5561 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5562 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5563 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5564 break; 5565 case PCIE_LNK_X12: 5566 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5567 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5568 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5569 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5570 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5571 break; 5572 case PCIE_LNK_X8: 5573 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5574 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5575 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5576 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5577 break; 5578 case PCIE_LNK_X4: 5579 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5580 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5581 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5582 break; 5583 case PCIE_LNK_X2: 5584 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5585 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5586 break; 5587 case PCIE_LNK_X1: 5588 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5589 break; 5590 default: 5591 break; 5592 } 5593 } 5594 } 5595 } 5596 5597 /** 5598 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 5599 * 5600 * @adev: amdgpu_device pointer 5601 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 5602 * 5603 * Return true if @peer_adev can access (DMA) @adev through the PCIe 5604 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 5605 * @peer_adev. 5606 */ 5607 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 5608 struct amdgpu_device *peer_adev) 5609 { 5610 #ifdef CONFIG_HSA_AMD_P2P 5611 uint64_t address_mask = peer_adev->dev->dma_mask ? 
5612 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 5613 resource_size_t aper_limit = 5614 adev->gmc.aper_base + adev->gmc.aper_size - 1; 5615 bool p2p_access = 5616 !adev->gmc.xgmi.connected_to_cpu && 5617 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 5618 5619 return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size && 5620 adev->gmc.real_vram_size == adev->gmc.visible_vram_size && 5621 !(adev->gmc.aper_base & address_mask || 5622 aper_limit & address_mask)); 5623 #else 5624 return false; 5625 #endif 5626 } 5627 5628 int amdgpu_device_baco_enter(struct drm_device *dev) 5629 { 5630 struct amdgpu_device *adev = drm_to_adev(dev); 5631 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5632 5633 if (!amdgpu_device_supports_baco(dev)) 5634 return -ENOTSUPP; 5635 5636 if (ras && adev->ras_enabled && 5637 adev->nbio.funcs->enable_doorbell_interrupt) 5638 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5639 5640 return amdgpu_dpm_baco_enter(adev); 5641 } 5642 5643 int amdgpu_device_baco_exit(struct drm_device *dev) 5644 { 5645 struct amdgpu_device *adev = drm_to_adev(dev); 5646 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5647 int ret = 0; 5648 5649 if (!amdgpu_device_supports_baco(dev)) 5650 return -ENOTSUPP; 5651 5652 ret = amdgpu_dpm_baco_exit(adev); 5653 if (ret) 5654 return ret; 5655 5656 if (ras && adev->ras_enabled && 5657 adev->nbio.funcs->enable_doorbell_interrupt) 5658 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5659 5660 if (amdgpu_passthrough(adev) && 5661 adev->nbio.funcs->clear_doorbell_interrupt) 5662 adev->nbio.funcs->clear_doorbell_interrupt(adev); 5663 5664 return 0; 5665 } 5666 5667 /** 5668 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5669 * @pdev: PCI device struct 5670 * @state: PCI channel state 5671 * 5672 * Description: Called when a PCI error is detected. 5673 * 5674 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
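 *
 * These callbacks are wired up through a struct pci_error_handlers in the
 * PCI driver registration, roughly like the sketch below (illustrative; the
 * real table lives with the pci_driver definition in amdgpu_drv.c):
 *
 *   static const struct pci_error_handlers example_pci_err_handlers = {
 *           .error_detected = amdgpu_pci_error_detected,
 *           .mmio_enabled   = amdgpu_pci_mmio_enabled,
 *           .slot_reset     = amdgpu_pci_slot_reset,
 *           .resume         = amdgpu_pci_resume,
 *   };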
int amdgpu_device_baco_exit(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	ret = amdgpu_dpm_baco_exit(adev);
	if (ret)
		return ret;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

	if (amdgpu_passthrough(adev) &&
	    adev->nbio.funcs->clear_doorbell_interrupt)
		adev->nbio.funcs->clear_doorbell_interrupt(adev);

	return 0;
}

/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called when a PCI error is detected.
 *
 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		DRM_WARN("No support for XGMI hive yet...");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Locking adev->reset_domain->sem will prevent any external access
		 * to GPU during PCI error recovery
		 */
		amdgpu_device_lock_reset_domain(adev->reset_domain);
		amdgpu_device_set_mp1_state(adev);

		/*
		 * Block any work scheduling as we do for regular GPU reset
		 * for the duration of the recovery
		 */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, NULL);
		}
		atomic_inc(&adev->gpu_reset_counter);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{

	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/* This is called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, no need to reset slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the PCI error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for asic to come out of reset */
	msleep(500);

	/* Restore PCI config space */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d", r);
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's
 * OK to resume normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		drm_sched_start(&ring->sched, true);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
}

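/**
 * amdgpu_device_cache_pci_state - cache the PCI config space of the device
 *
 * @pdev: PCI device struct
 *
 * Save the current PCI configuration space and keep a kernel copy of it
 * in adev->pci_state so that it can be re-applied later, for example
 * after the PCI error recovery path has reset the slot.
 *
 * Returns true on success, false if the state could not be saved or stored.
 */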
bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

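/**
 * amdgpu_device_load_pci_state - restore the cached PCI config space
 *
 * @pdev: PCI device struct
 *
 * Load the configuration space previously cached by
 * amdgpu_device_cache_pci_state() and write it back to the device.
 *
 * Returns true on success, false if no cached state exists or the load failed.
 */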
bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

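/**
 * amdgpu_device_flush_hdp - flush the HDP write cache
 *
 * @adev: amdgpu_device pointer
 * @ring: optional ring on which to emit the flush
 *
 * Flush the Host Data Path (HDP) so that CPU writes become visible to the
 * GPU. On x86-64 the flush is skipped for APUs that are not running in
 * passthrough, and it is always skipped when the GPU is connected to the
 * CPU over XGMI, where the flush is not needed. If @ring provides an
 * emit_hdp_flush callback the flush is emitted on the ring, otherwise the
 * ASIC-level helper is used.
 */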
void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It helps to preserve the error context when an error occurs.
 * Compared to a simple hang, the system will stay stable at least for SSH
 * access. It should then be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs,
 *    etc), clears all CPU mappings to the device and disallows remappings
 *    through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in-flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!adev->ip_versions[DCE_HWIP][0] ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

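/**
 * amdgpu_device_wait_on_rreg - poll a register until it reaches a value
 *
 * @adev: amdgpu_device pointer
 * @inst: instance number, used only in the timeout warning
 * @reg_addr: register offset to read
 * @reg_name: register name, used only in the timeout warning
 * @expected_value: value the masked register is expected to reach
 * @mask: bits of the register to compare
 *
 * Busy-poll @reg_addr until (value & @mask) == @expected_value. The timeout
 * (adev->usec_timeout iterations) restarts whenever the read value changes.
 *
 * Returns 0 on success or -ETIMEDOUT if the value was not reached in time.
 *
 * Illustrative example (REG_FOO is a placeholder, not a real register):
 *
 *	r = amdgpu_device_wait_on_rreg(adev, 0, REG_FOO, "REG_FOO", 0x1, 0x1);
 *
 * waits for bit 0 of REG_FOO to read back as 1.
 */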
uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
				uint32_t inst, uint32_t reg_addr, char reg_name[],
				uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				 inst, reg_name, (uint32_t)expected_value,
				 (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}