/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/devcoredump.h>
#include <generated/utsrelease.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_aperture.h>
#include <drm/drm_atomic_helper.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_fb_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
"CARRIZO", 113 "STONEY", 114 "POLARIS10", 115 "POLARIS11", 116 "POLARIS12", 117 "VEGAM", 118 "VEGA10", 119 "VEGA12", 120 "VEGA20", 121 "RAVEN", 122 "ARCTURUS", 123 "RENOIR", 124 "ALDEBARAN", 125 "NAVI10", 126 "CYAN_SKILLFISH", 127 "NAVI14", 128 "NAVI12", 129 "SIENNA_CICHLID", 130 "NAVY_FLOUNDER", 131 "VANGOGH", 132 "DIMGREY_CAVEFISH", 133 "BEIGE_GOBY", 134 "YELLOW_CARP", 135 "IP DISCOVERY", 136 "LAST", 137 }; 138 139 /** 140 * DOC: pcie_replay_count 141 * 142 * The amdgpu driver provides a sysfs API for reporting the total number 143 * of PCIe replays (NAKs) 144 * The file pcie_replay_count is used for this and returns the total 145 * number of replays as a sum of the NAKs generated and NAKs received 146 */ 147 148 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 149 struct device_attribute *attr, char *buf) 150 { 151 struct drm_device *ddev = dev_get_drvdata(dev); 152 struct amdgpu_device *adev = drm_to_adev(ddev); 153 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 154 155 return sysfs_emit(buf, "%llu\n", cnt); 156 } 157 158 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 159 amdgpu_device_get_pcie_replay_count, NULL); 160 161 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 162 163 /** 164 * DOC: product_name 165 * 166 * The amdgpu driver provides a sysfs API for reporting the product name 167 * for the device 168 * The file product_name is used for this and returns the product name 169 * as returned from the FRU. 170 * NOTE: This is only available for certain server cards 171 */ 172 173 static ssize_t amdgpu_device_get_product_name(struct device *dev, 174 struct device_attribute *attr, char *buf) 175 { 176 struct drm_device *ddev = dev_get_drvdata(dev); 177 struct amdgpu_device *adev = drm_to_adev(ddev); 178 179 return sysfs_emit(buf, "%s\n", adev->product_name); 180 } 181 182 static DEVICE_ATTR(product_name, S_IRUGO, 183 amdgpu_device_get_product_name, NULL); 184 185 /** 186 * DOC: product_number 187 * 188 * The amdgpu driver provides a sysfs API for reporting the part number 189 * for the device 190 * The file product_number is used for this and returns the part number 191 * as returned from the FRU. 192 * NOTE: This is only available for certain server cards 193 */ 194 195 static ssize_t amdgpu_device_get_product_number(struct device *dev, 196 struct device_attribute *attr, char *buf) 197 { 198 struct drm_device *ddev = dev_get_drvdata(dev); 199 struct amdgpu_device *adev = drm_to_adev(ddev); 200 201 return sysfs_emit(buf, "%s\n", adev->product_number); 202 } 203 204 static DEVICE_ATTR(product_number, S_IRUGO, 205 amdgpu_device_get_product_number, NULL); 206 207 /** 208 * DOC: serial_number 209 * 210 * The amdgpu driver provides a sysfs API for reporting the serial number 211 * for the device 212 * The file serial_number is used for this and returns the serial number 213 * as returned from the FRU. 
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_serial_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->serial);
}

static DEVICE_ATTR(serial_number, S_IRUGO,
		amdgpu_device_get_serial_number, NULL);

/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise return false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
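 *
 * Depending on the register offset and the virtualization state, an access
 * is routed either through direct MMIO, through the KIQ ring (when running
 * as an SR-IOV VF at runtime), or through the indirect PCIE index/data pair
 * for offsets beyond the MMIO BAR.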
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 *
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 *
 */
/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 *
 * this function is invoked only for the debugfs register access
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_mm_rdoorbell - read a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (CIK).
 */
u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return readl(adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell - write a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (CIK).
 */
void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (index < adev->doorbell.num_doorbells) {
		writel(v, adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (index < adev->doorbell.num_doorbells) {
		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 pcie_index, u32 pcie_data,
				u32 reg_addr)
{
	unsigned long flags;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 pcie_index, u32 pcie_data,
				  u32 reg_addr)
{
	unsigned long flags;
	u64 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 pcie_index, u32 pcie_data,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 pcie_index, u32 pcie_data,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	amdgpu_asic_pre_asic_init(adev);

	if (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0))
		return amdgpu_atomfirmware_asic_init(adev, true);
	else
		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * GPU doorbell aperture helper functions.
 */
/**
 * amdgpu_device_doorbell_init - Init doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Init doorbell driver information (CIK)
 * Returns 0 on success, error on failure.
 */
static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
{

	/* No doorbell on SI hardware generation */
	if (adev->asic_type < CHIP_BONAIRE) {
		adev->doorbell.base = 0;
		adev->doorbell.size = 0;
		adev->doorbell.num_doorbells = 0;
		adev->doorbell.ptr = NULL;
		return 0;
	}

	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
		return -EINVAL;

	amdgpu_asic_init_doorbell_index(adev);

	/* doorbell bar mapping */
	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
	adev->doorbell.size = pci_resource_len(adev->pdev, 2);

	if (adev->enable_mes) {
		adev->doorbell.num_doorbells =
			adev->doorbell.size / sizeof(u32);
	} else {
		adev->doorbell.num_doorbells =
			min_t(u32, adev->doorbell.size / sizeof(u32),
			      adev->doorbell_index.max_assignment + 1);
		if (adev->doorbell.num_doorbells == 0)
			return -EINVAL;

		/* For Vega, reserve and map two pages on the doorbell BAR since the
		 * SDMA paging queue doorbell uses the second page. The
		 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
		 * doorbells are in the first page. So with the paging queue enabled,
		 * num_doorbells has to grow by one page (0x400 dwords).
		 */
		if (adev->asic_type >= CHIP_VEGA10)
			adev->doorbell.num_doorbells += 0x400;
	}

	adev->doorbell.ptr = ioremap(adev->doorbell.base,
				     adev->doorbell.num_doorbells *
				     sizeof(u32));
	if (adev->doorbell.ptr == NULL)
		return -ENOMEM;

	return 0;
}

/**
 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down doorbell driver information (CIK)
 */
static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
{
	iounmap(adev->doorbell.ptr);
	adev->doorbell.ptr = NULL;
}


/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned i;
	u16 cmd;
	int r;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_device_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_device_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw needs post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if post is needed or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old SMC firmware still needs the driver to do a vPost, otherwise
		 * the GPU hangs. SMC firmware versions above 22.15 don't have this flaw,
		 * so we force vPost to be executed for SMC versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;
			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if an error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->gmc.xgmi.pending_reset)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	return pcie_aspm_enabled(adev->pdev);
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
		bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines number of bits in page table versus page directory,
 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
 * page table and the remaining bits are in the page directory.
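 * For example, with the minimum block size of 9 a last-level page table
 * covers 2^9 pages * 4KB = 2MB of GPU virtual address space.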
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if ((adev->pdev->device == 0x13FE) ||
		    (adev->pdev->device == 0x143F))
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		break;
	}

	return 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
		dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
		amdgpu_reset_method = -1;
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	return 0;
}

/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(pdev, PCI_D0);
		amdgpu_device_load_pci_state(pdev);
		r = pci_enable_device(pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
	} else {
		pr_info("switched off\n");
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(pdev);
		/* Shut down the device */
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
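 *
 * Example (illustrative call, not taken from this file):
 *   amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 *					     AMD_PG_STATE_GATE);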
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u64 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
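 *
 * Example (illustrative call, not taken from this file):
 *   struct amdgpu_ip_block *gfx_block =
 *	   amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);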
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if equal or greater
 * return 1 if smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	switch (ip_block_version->type) {
	case AMD_IP_BLOCK_TYPE_VCN:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
			return 0;
		break;
	case AMD_IP_BLOCK_TYPE_JPEG:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
			return 0;
		break;
	default:
		break;
	}

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
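 *
 * The string is a semicolon separated list of entries of the form
 * <pci address>[,<number of crtcs>], for example (illustrative values):
 *   amdgpu.virtual_display=0000:03:00.0,2  or  amdgpu.virtual_display=all,1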
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		const char *pci_address_name = pci_name(adev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}

void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
		adev->mode_info.num_crtc = 1;
		adev->enable_virtual_display = true;
		DRM_INFO("virtual_display:%d, num_crtc:%d\n",
			 adev->enable_virtual_display, adev->mode_info.num_crtc);
	}
}

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	char fw_name[40];
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	if (adev->mman.discovery_bin) {
		/*
		 * FIXME: The bounding box is still needed by Navi12, so
		 * temporarily read it from gpu_info firmware. Should be dropped
		 * when DAL no longer needs it.
1959 */ 1960 if (adev->asic_type != CHIP_NAVI12) 1961 return 0; 1962 } 1963 1964 switch (adev->asic_type) { 1965 default: 1966 return 0; 1967 case CHIP_VEGA10: 1968 chip_name = "vega10"; 1969 break; 1970 case CHIP_VEGA12: 1971 chip_name = "vega12"; 1972 break; 1973 case CHIP_RAVEN: 1974 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1975 chip_name = "raven2"; 1976 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1977 chip_name = "picasso"; 1978 else 1979 chip_name = "raven"; 1980 break; 1981 case CHIP_ARCTURUS: 1982 chip_name = "arcturus"; 1983 break; 1984 case CHIP_NAVI12: 1985 chip_name = "navi12"; 1986 break; 1987 } 1988 1989 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 1990 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name); 1991 if (err) { 1992 dev_err(adev->dev, 1993 "Failed to get gpu_info firmware \"%s\"\n", 1994 fw_name); 1995 goto out; 1996 } 1997 1998 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 1999 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2000 2001 switch (hdr->version_major) { 2002 case 1: 2003 { 2004 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2005 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2006 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2007 2008 /* 2009 * Should be droped when DAL no longer needs it. 2010 */ 2011 if (adev->asic_type == CHIP_NAVI12) 2012 goto parse_soc_bounding_box; 2013 2014 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2015 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2016 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2017 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2018 adev->gfx.config.max_texture_channel_caches = 2019 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2020 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2021 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2022 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2023 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2024 adev->gfx.config.double_offchip_lds_buf = 2025 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2026 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2027 adev->gfx.cu_info.max_waves_per_simd = 2028 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2029 adev->gfx.cu_info.max_scratch_slots_per_cu = 2030 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2031 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2032 if (hdr->version_minor >= 1) { 2033 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2034 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2035 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2036 adev->gfx.config.num_sc_per_sh = 2037 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2038 adev->gfx.config.num_packer_per_sc = 2039 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2040 } 2041 2042 parse_soc_bounding_box: 2043 /* 2044 * soc bounding box info is not integrated in disocovery table, 2045 * we always need to parse it from gpu info firmware if needed. 
2046 */ 2047 if (hdr->version_minor == 2) { 2048 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2049 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2050 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2051 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2052 } 2053 break; 2054 } 2055 default: 2056 dev_err(adev->dev, 2057 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2058 err = -EINVAL; 2059 goto out; 2060 } 2061 out: 2062 return err; 2063 } 2064 2065 /** 2066 * amdgpu_device_ip_early_init - run early init for hardware IPs 2067 * 2068 * @adev: amdgpu_device pointer 2069 * 2070 * Early initialization pass for hardware IPs. The hardware IPs that make 2071 * up each asic are discovered each IP's early_init callback is run. This 2072 * is the first stage in initializing the asic. 2073 * Returns 0 on success, negative error code on failure. 2074 */ 2075 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2076 { 2077 struct drm_device *dev = adev_to_drm(adev); 2078 struct pci_dev *parent; 2079 int i, r; 2080 bool total; 2081 2082 amdgpu_device_enable_virtual_display(adev); 2083 2084 if (amdgpu_sriov_vf(adev)) { 2085 r = amdgpu_virt_request_full_gpu(adev, true); 2086 if (r) 2087 return r; 2088 } 2089 2090 switch (adev->asic_type) { 2091 #ifdef CONFIG_DRM_AMDGPU_SI 2092 case CHIP_VERDE: 2093 case CHIP_TAHITI: 2094 case CHIP_PITCAIRN: 2095 case CHIP_OLAND: 2096 case CHIP_HAINAN: 2097 adev->family = AMDGPU_FAMILY_SI; 2098 r = si_set_ip_blocks(adev); 2099 if (r) 2100 return r; 2101 break; 2102 #endif 2103 #ifdef CONFIG_DRM_AMDGPU_CIK 2104 case CHIP_BONAIRE: 2105 case CHIP_HAWAII: 2106 case CHIP_KAVERI: 2107 case CHIP_KABINI: 2108 case CHIP_MULLINS: 2109 if (adev->flags & AMD_IS_APU) 2110 adev->family = AMDGPU_FAMILY_KV; 2111 else 2112 adev->family = AMDGPU_FAMILY_CI; 2113 2114 r = cik_set_ip_blocks(adev); 2115 if (r) 2116 return r; 2117 break; 2118 #endif 2119 case CHIP_TOPAZ: 2120 case CHIP_TONGA: 2121 case CHIP_FIJI: 2122 case CHIP_POLARIS10: 2123 case CHIP_POLARIS11: 2124 case CHIP_POLARIS12: 2125 case CHIP_VEGAM: 2126 case CHIP_CARRIZO: 2127 case CHIP_STONEY: 2128 if (adev->flags & AMD_IS_APU) 2129 adev->family = AMDGPU_FAMILY_CZ; 2130 else 2131 adev->family = AMDGPU_FAMILY_VI; 2132 2133 r = vi_set_ip_blocks(adev); 2134 if (r) 2135 return r; 2136 break; 2137 default: 2138 r = amdgpu_discovery_set_ip_blocks(adev); 2139 if (r) 2140 return r; 2141 break; 2142 } 2143 2144 if (amdgpu_has_atpx() && 2145 (amdgpu_is_atpx_hybrid() || 2146 amdgpu_has_atpx_dgpu_power_cntl()) && 2147 ((adev->flags & AMD_IS_APU) == 0) && 2148 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev))) 2149 adev->flags |= AMD_IS_PX; 2150 2151 if (!(adev->flags & AMD_IS_APU)) { 2152 parent = pci_upstream_bridge(adev->pdev); 2153 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2154 } 2155 2156 amdgpu_amdkfd_device_probe(adev); 2157 2158 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2159 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2160 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2161 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2162 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2163 2164 total = true; 2165 for (i = 0; i < adev->num_ip_blocks; i++) { 2166 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2167 DRM_ERROR("disabled ip block: %d <%s>\n", 2168 i, adev->ip_blocks[i].version->funcs->name); 2169 adev->ip_blocks[i].status.valid = false; 2170 } else { 2171 if (adev->ip_blocks[i].version->funcs->early_init) { 2172 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2173 if (r == -ENOENT) { 2174 adev->ip_blocks[i].status.valid = false; 2175 } else if (r) { 2176 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2177 adev->ip_blocks[i].version->funcs->name, r); 2178 total = false; 2179 } else { 2180 adev->ip_blocks[i].status.valid = true; 2181 } 2182 } else { 2183 adev->ip_blocks[i].status.valid = true; 2184 } 2185 } 2186 /* get the vbios after the asic_funcs are set up */ 2187 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2188 r = amdgpu_device_parse_gpu_info_fw(adev); 2189 if (r) 2190 return r; 2191 2192 /* Read BIOS */ 2193 if (!amdgpu_get_bios(adev)) 2194 return -EINVAL; 2195 2196 r = amdgpu_atombios_init(adev); 2197 if (r) { 2198 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2199 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2200 return r; 2201 } 2202 2203 /*get pf2vf msg info at it's earliest time*/ 2204 if (amdgpu_sriov_vf(adev)) 2205 amdgpu_virt_init_data_exchange(adev); 2206 2207 } 2208 } 2209 if (!total) 2210 return -ENODEV; 2211 2212 adev->cg_flags &= amdgpu_cg_mask; 2213 adev->pg_flags &= amdgpu_pg_mask; 2214 2215 return 0; 2216 } 2217 2218 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2219 { 2220 int i, r; 2221 2222 for (i = 0; i < adev->num_ip_blocks; i++) { 2223 if (!adev->ip_blocks[i].status.sw) 2224 continue; 2225 if (adev->ip_blocks[i].status.hw) 2226 continue; 2227 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2228 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2229 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2230 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2231 if (r) { 2232 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2233 adev->ip_blocks[i].version->funcs->name, r); 2234 return r; 2235 } 2236 adev->ip_blocks[i].status.hw = true; 2237 } 2238 } 2239 2240 return 0; 2241 } 2242 2243 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2244 { 2245 int i, r; 2246 2247 for (i = 0; i < adev->num_ip_blocks; i++) { 2248 if (!adev->ip_blocks[i].status.sw) 2249 continue; 2250 if (adev->ip_blocks[i].status.hw) 2251 continue; 2252 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2253 if (r) { 2254 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2255 adev->ip_blocks[i].version->funcs->name, r); 2256 return r; 2257 } 2258 adev->ip_blocks[i].status.hw = true; 2259 } 2260 2261 return 0; 2262 } 2263 2264 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2265 { 2266 int r = 0; 2267 int i; 2268 uint32_t smu_version; 2269 2270 if (adev->asic_type >= CHIP_VEGA10) { 2271 for (i = 0; i < adev->num_ip_blocks; i++) { 2272 if (adev->ip_blocks[i].version->type != 
AMD_IP_BLOCK_TYPE_PSP) 2273 continue; 2274 2275 if (!adev->ip_blocks[i].status.sw) 2276 continue; 2277 2278 /* no need to do the fw loading again if already done*/ 2279 if (adev->ip_blocks[i].status.hw == true) 2280 break; 2281 2282 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2283 r = adev->ip_blocks[i].version->funcs->resume(adev); 2284 if (r) { 2285 DRM_ERROR("resume of IP block <%s> failed %d\n", 2286 adev->ip_blocks[i].version->funcs->name, r); 2287 return r; 2288 } 2289 } else { 2290 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2291 if (r) { 2292 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2293 adev->ip_blocks[i].version->funcs->name, r); 2294 return r; 2295 } 2296 } 2297 2298 adev->ip_blocks[i].status.hw = true; 2299 break; 2300 } 2301 } 2302 2303 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2304 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2305 2306 return r; 2307 } 2308 2309 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2310 { 2311 long timeout; 2312 int r, i; 2313 2314 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2315 struct amdgpu_ring *ring = adev->rings[i]; 2316 2317 /* No need to setup the GPU scheduler for rings that don't need it */ 2318 if (!ring || ring->no_scheduler) 2319 continue; 2320 2321 switch (ring->funcs->type) { 2322 case AMDGPU_RING_TYPE_GFX: 2323 timeout = adev->gfx_timeout; 2324 break; 2325 case AMDGPU_RING_TYPE_COMPUTE: 2326 timeout = adev->compute_timeout; 2327 break; 2328 case AMDGPU_RING_TYPE_SDMA: 2329 timeout = adev->sdma_timeout; 2330 break; 2331 default: 2332 timeout = adev->video_timeout; 2333 break; 2334 } 2335 2336 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2337 ring->num_hw_submission, amdgpu_job_hang_limit, 2338 timeout, adev->reset_domain->wq, 2339 ring->sched_score, ring->name, 2340 adev->dev); 2341 if (r) { 2342 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2343 ring->name); 2344 return r; 2345 } 2346 } 2347 2348 return 0; 2349 } 2350 2351 2352 /** 2353 * amdgpu_device_ip_init - run init for hardware IPs 2354 * 2355 * @adev: amdgpu_device pointer 2356 * 2357 * Main initialization pass for hardware IPs. The list of all the hardware 2358 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2359 * are run. sw_init initializes the software state associated with each IP 2360 * and hw_init initializes the hardware associated with each IP. 2361 * Returns 0 on success, negative error code on failure. 
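 *
 * As a rough sketch of the ordering implemented below (descriptive, not a
 * stable contract): sw_init runs for every valid block, with COMMON and GMC
 * also getting an early hw_init so GPU memory can be allocated; then the IB
 * pool and ucode BO are created, hw_init runs in two phases around firmware
 * loading, RAS recovery and the ring schedulers are set up, and KFD is
 * initialized unless a hive-wide reset is still pending.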
2362 */ 2363 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2364 { 2365 int i, r; 2366 2367 r = amdgpu_ras_init(adev); 2368 if (r) 2369 return r; 2370 2371 for (i = 0; i < adev->num_ip_blocks; i++) { 2372 if (!adev->ip_blocks[i].status.valid) 2373 continue; 2374 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2375 if (r) { 2376 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2377 adev->ip_blocks[i].version->funcs->name, r); 2378 goto init_failed; 2379 } 2380 adev->ip_blocks[i].status.sw = true; 2381 2382 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2383 /* need to do common hw init early so everything is set up for gmc */ 2384 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2385 if (r) { 2386 DRM_ERROR("hw_init %d failed %d\n", i, r); 2387 goto init_failed; 2388 } 2389 adev->ip_blocks[i].status.hw = true; 2390 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2391 /* need to do gmc hw init early so we can allocate gpu mem */ 2392 /* Try to reserve bad pages early */ 2393 if (amdgpu_sriov_vf(adev)) 2394 amdgpu_virt_exchange_data(adev); 2395 2396 r = amdgpu_device_mem_scratch_init(adev); 2397 if (r) { 2398 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2399 goto init_failed; 2400 } 2401 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2402 if (r) { 2403 DRM_ERROR("hw_init %d failed %d\n", i, r); 2404 goto init_failed; 2405 } 2406 r = amdgpu_device_wb_init(adev); 2407 if (r) { 2408 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2409 goto init_failed; 2410 } 2411 adev->ip_blocks[i].status.hw = true; 2412 2413 /* right after GMC hw init, we create CSA */ 2414 if (amdgpu_mcbp) { 2415 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2416 AMDGPU_GEM_DOMAIN_VRAM | 2417 AMDGPU_GEM_DOMAIN_GTT, 2418 AMDGPU_CSA_SIZE); 2419 if (r) { 2420 DRM_ERROR("allocate CSA failed %d\n", r); 2421 goto init_failed; 2422 } 2423 } 2424 } 2425 } 2426 2427 if (amdgpu_sriov_vf(adev)) 2428 amdgpu_virt_init_data_exchange(adev); 2429 2430 r = amdgpu_ib_pool_init(adev); 2431 if (r) { 2432 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2433 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2434 goto init_failed; 2435 } 2436 2437 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2438 if (r) 2439 goto init_failed; 2440 2441 r = amdgpu_device_ip_hw_init_phase1(adev); 2442 if (r) 2443 goto init_failed; 2444 2445 r = amdgpu_device_fw_loading(adev); 2446 if (r) 2447 goto init_failed; 2448 2449 r = amdgpu_device_ip_hw_init_phase2(adev); 2450 if (r) 2451 goto init_failed; 2452 2453 /* 2454 * retired pages will be loaded from eeprom and reserved here, 2455 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2456 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2457 * for I2C communication which only true at this point. 2458 * 2459 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2460 * failure from bad gpu situation and stop amdgpu init process 2461 * accordingly. For other failed cases, it will still release all 2462 * the resource and print error message, rather than returning one 2463 * negative value to upper level. 
2464 * 2465 * Note: theoretically, this should be called before all vram allocations 2466 * to protect retired page from abusing 2467 */ 2468 r = amdgpu_ras_recovery_init(adev); 2469 if (r) 2470 goto init_failed; 2471 2472 /** 2473 * In case of XGMI grab extra reference for reset domain for this device 2474 */ 2475 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2476 if (amdgpu_xgmi_add_device(adev) == 0) { 2477 if (!amdgpu_sriov_vf(adev)) { 2478 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2479 2480 if (WARN_ON(!hive)) { 2481 r = -ENOENT; 2482 goto init_failed; 2483 } 2484 2485 if (!hive->reset_domain || 2486 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2487 r = -ENOENT; 2488 amdgpu_put_xgmi_hive(hive); 2489 goto init_failed; 2490 } 2491 2492 /* Drop the early temporary reset domain we created for device */ 2493 amdgpu_reset_put_reset_domain(adev->reset_domain); 2494 adev->reset_domain = hive->reset_domain; 2495 amdgpu_put_xgmi_hive(hive); 2496 } 2497 } 2498 } 2499 2500 r = amdgpu_device_init_schedulers(adev); 2501 if (r) 2502 goto init_failed; 2503 2504 /* Don't init kfd if whole hive need to be reset during init */ 2505 if (!adev->gmc.xgmi.pending_reset) 2506 amdgpu_amdkfd_device_init(adev); 2507 2508 amdgpu_fru_get_product_info(adev); 2509 2510 init_failed: 2511 if (amdgpu_sriov_vf(adev)) 2512 amdgpu_virt_release_full_gpu(adev, true); 2513 2514 return r; 2515 } 2516 2517 /** 2518 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2519 * 2520 * @adev: amdgpu_device pointer 2521 * 2522 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2523 * this function before a GPU reset. If the value is retained after a 2524 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 2525 */ 2526 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2527 { 2528 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2529 } 2530 2531 /** 2532 * amdgpu_device_check_vram_lost - check if vram is valid 2533 * 2534 * @adev: amdgpu_device pointer 2535 * 2536 * Checks the reset magic value written to the gart pointer in VRAM. 2537 * The driver calls this after a GPU reset to see if the contents of 2538 * VRAM is lost or now. 2539 * returns true if vram is lost, false if not. 2540 */ 2541 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2542 { 2543 if (memcmp(adev->gart.ptr, adev->reset_magic, 2544 AMDGPU_RESET_MAGIC_NUM)) 2545 return true; 2546 2547 if (!amdgpu_in_reset(adev)) 2548 return false; 2549 2550 /* 2551 * For all ASICs with baco/mode1 reset, the VRAM is 2552 * always assumed to be lost. 2553 */ 2554 switch (amdgpu_asic_reset_method(adev)) { 2555 case AMD_RESET_METHOD_BACO: 2556 case AMD_RESET_METHOD_MODE1: 2557 return true; 2558 default: 2559 return false; 2560 } 2561 } 2562 2563 /** 2564 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2565 * 2566 * @adev: amdgpu_device pointer 2567 * @state: clockgating state (gate or ungate) 2568 * 2569 * The list of all the hardware IPs that make up the asic is walked and the 2570 * set_clockgating_state callbacks are run. 2571 * Late initialization pass enabling clockgating for hardware IPs. 2572 * Fini or suspend, pass disabling clockgating for hardware IPs. 2573 * Returns 0 on success, negative error code on failure. 
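 *
 * Illustrative usage, mirroring the calls made elsewhere in this file:
 * amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE) during late init and
 * amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE) on fini/suspend.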
2574 */ 2575 2576 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2577 enum amd_clockgating_state state) 2578 { 2579 int i, j, r; 2580 2581 if (amdgpu_emu_mode == 1) 2582 return 0; 2583 2584 for (j = 0; j < adev->num_ip_blocks; j++) { 2585 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2586 if (!adev->ip_blocks[i].status.late_initialized) 2587 continue; 2588 /* skip CG for GFX, SDMA on S0ix */ 2589 if (adev->in_s0ix && 2590 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2591 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2592 continue; 2593 /* skip CG for VCE/UVD, it's handled specially */ 2594 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2595 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2596 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2597 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2598 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2599 /* enable clockgating to save power */ 2600 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2601 state); 2602 if (r) { 2603 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2604 adev->ip_blocks[i].version->funcs->name, r); 2605 return r; 2606 } 2607 } 2608 } 2609 2610 return 0; 2611 } 2612 2613 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2614 enum amd_powergating_state state) 2615 { 2616 int i, j, r; 2617 2618 if (amdgpu_emu_mode == 1) 2619 return 0; 2620 2621 for (j = 0; j < adev->num_ip_blocks; j++) { 2622 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2623 if (!adev->ip_blocks[i].status.late_initialized) 2624 continue; 2625 /* skip PG for GFX, SDMA on S0ix */ 2626 if (adev->in_s0ix && 2627 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2628 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2629 continue; 2630 /* skip CG for VCE/UVD, it's handled specially */ 2631 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2632 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2633 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2634 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2635 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2636 /* enable powergating to save power */ 2637 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2638 state); 2639 if (r) { 2640 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2641 adev->ip_blocks[i].version->funcs->name, r); 2642 return r; 2643 } 2644 } 2645 } 2646 return 0; 2647 } 2648 2649 static int amdgpu_device_enable_mgpu_fan_boost(void) 2650 { 2651 struct amdgpu_gpu_instance *gpu_ins; 2652 struct amdgpu_device *adev; 2653 int i, ret = 0; 2654 2655 mutex_lock(&mgpu_info.mutex); 2656 2657 /* 2658 * MGPU fan boost feature should be enabled 2659 * only when there are two or more dGPUs in 2660 * the system 2661 */ 2662 if (mgpu_info.num_dgpu < 2) 2663 goto out; 2664 2665 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2666 gpu_ins = &(mgpu_info.gpu_ins[i]); 2667 adev = gpu_ins->adev; 2668 if (!(adev->flags & AMD_IS_APU) && 2669 !gpu_ins->mgpu_fan_enabled) { 2670 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2671 if (ret) 2672 break; 2673 2674 gpu_ins->mgpu_fan_enabled = 1; 2675 } 2676 } 2677 2678 out: 2679 mutex_unlock(&mgpu_info.mutex); 2680 2681 return ret; 2682 } 2683 2684 /** 2685 * amdgpu_device_ip_late_init - run late init for hardware IPs 2686 * 2687 * @adev: 
amdgpu_device pointer 2688 * 2689 * Late initialization pass for hardware IPs. The list of all the hardware 2690 * IPs that make up the asic is walked and the late_init callbacks are run. 2691 * late_init covers any special initialization that an IP requires 2692 * after all of the IPs have been initialized or something that needs to happen 2693 * late in the init process. 2694 * Returns 0 on success, negative error code on failure. 2695 */ 2696 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2697 { 2698 struct amdgpu_gpu_instance *gpu_instance; 2699 int i = 0, r; 2700 2701 for (i = 0; i < adev->num_ip_blocks; i++) { 2702 if (!adev->ip_blocks[i].status.hw) 2703 continue; 2704 if (adev->ip_blocks[i].version->funcs->late_init) { 2705 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2706 if (r) { 2707 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2708 adev->ip_blocks[i].version->funcs->name, r); 2709 return r; 2710 } 2711 } 2712 adev->ip_blocks[i].status.late_initialized = true; 2713 } 2714 2715 r = amdgpu_ras_late_init(adev); 2716 if (r) { 2717 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 2718 return r; 2719 } 2720 2721 amdgpu_ras_set_error_query_ready(adev, true); 2722 2723 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2724 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2725 2726 amdgpu_device_fill_reset_magic(adev); 2727 2728 r = amdgpu_device_enable_mgpu_fan_boost(); 2729 if (r) 2730 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2731 2732 /* For passthrough configuration on arcturus and aldebaran, enable special handling of SBR */ 2733 if (amdgpu_passthrough(adev) && ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 2734 adev->asic_type == CHIP_ALDEBARAN)) 2735 amdgpu_dpm_handle_passthrough_sbr(adev, true); 2736 2737 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2738 mutex_lock(&mgpu_info.mutex); 2739 2740 /* 2741 * Reset device p-state to low as it was booted with high. 2742 * 2743 * This should be performed only after all devices from the same 2744 * hive get initialized. 2745 * 2746 * However, the number of devices in the hive is not known in advance; 2747 * it is counted one by one as the devices initialize. 2748 * 2749 * So we wait until all XGMI interlinked devices have initialized. 2750 * This may bring some delay as those devices may come from 2751 * different hives. But that should be OK.
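 * (Concretely, the check below only proceeds once the number of registered
 * dGPUs matches this device's XGMI node count, so the intent is that the
 * last device of the hive to reach late init lowers the p-state for all of
 * them.)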
2752 */ 2753 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2754 for (i = 0; i < mgpu_info.num_gpu; i++) { 2755 gpu_instance = &(mgpu_info.gpu_ins[i]); 2756 if (gpu_instance->adev->flags & AMD_IS_APU) 2757 continue; 2758 2759 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2760 AMDGPU_XGMI_PSTATE_MIN); 2761 if (r) { 2762 DRM_ERROR("pstate setting failed (%d).\n", r); 2763 break; 2764 } 2765 } 2766 } 2767 2768 mutex_unlock(&mgpu_info.mutex); 2769 } 2770 2771 return 0; 2772 } 2773 2774 /** 2775 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2776 * 2777 * @adev: amdgpu_device pointer 2778 * 2779 * For ASICs need to disable SMC first 2780 */ 2781 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2782 { 2783 int i, r; 2784 2785 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) 2786 return; 2787 2788 for (i = 0; i < adev->num_ip_blocks; i++) { 2789 if (!adev->ip_blocks[i].status.hw) 2790 continue; 2791 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2792 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2793 /* XXX handle errors */ 2794 if (r) { 2795 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2796 adev->ip_blocks[i].version->funcs->name, r); 2797 } 2798 adev->ip_blocks[i].status.hw = false; 2799 break; 2800 } 2801 } 2802 } 2803 2804 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2805 { 2806 int i, r; 2807 2808 for (i = 0; i < adev->num_ip_blocks; i++) { 2809 if (!adev->ip_blocks[i].version->funcs->early_fini) 2810 continue; 2811 2812 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2813 if (r) { 2814 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2815 adev->ip_blocks[i].version->funcs->name, r); 2816 } 2817 } 2818 2819 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2820 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2821 2822 amdgpu_amdkfd_suspend(adev, false); 2823 2824 /* Workaroud for ASICs need to disable SMC first */ 2825 amdgpu_device_smu_fini_early(adev); 2826 2827 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2828 if (!adev->ip_blocks[i].status.hw) 2829 continue; 2830 2831 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2832 /* XXX handle errors */ 2833 if (r) { 2834 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2835 adev->ip_blocks[i].version->funcs->name, r); 2836 } 2837 2838 adev->ip_blocks[i].status.hw = false; 2839 } 2840 2841 if (amdgpu_sriov_vf(adev)) { 2842 if (amdgpu_virt_release_full_gpu(adev, false)) 2843 DRM_ERROR("failed to release exclusive mode on fini\n"); 2844 } 2845 2846 return 0; 2847 } 2848 2849 /** 2850 * amdgpu_device_ip_fini - run fini for hardware IPs 2851 * 2852 * @adev: amdgpu_device pointer 2853 * 2854 * Main teardown pass for hardware IPs. The list of all the hardware 2855 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2856 * are run. hw_fini tears down the hardware associated with each IP 2857 * and sw_fini tears down any software state associated with each IP. 2858 * Returns 0 on success, negative error code on failure. 
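 *
 * As a sketch of the ordering below: XGMI and KFD teardown happen first,
 * then sw_fini walks the IP list in reverse order (freeing the ucode BO,
 * static CSA, writeback and scratch memory when the GMC block is reached),
 * and late_fini runs last for every block that completed late init.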
2859 */ 2860 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2861 { 2862 int i, r; 2863 2864 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2865 amdgpu_virt_release_ras_err_handler_data(adev); 2866 2867 if (adev->gmc.xgmi.num_physical_nodes > 1) 2868 amdgpu_xgmi_remove_device(adev); 2869 2870 amdgpu_amdkfd_device_fini_sw(adev); 2871 2872 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2873 if (!adev->ip_blocks[i].status.sw) 2874 continue; 2875 2876 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2877 amdgpu_ucode_free_bo(adev); 2878 amdgpu_free_static_csa(&adev->virt.csa_obj); 2879 amdgpu_device_wb_fini(adev); 2880 amdgpu_device_mem_scratch_fini(adev); 2881 amdgpu_ib_pool_fini(adev); 2882 } 2883 2884 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2885 /* XXX handle errors */ 2886 if (r) { 2887 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2888 adev->ip_blocks[i].version->funcs->name, r); 2889 } 2890 adev->ip_blocks[i].status.sw = false; 2891 adev->ip_blocks[i].status.valid = false; 2892 } 2893 2894 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2895 if (!adev->ip_blocks[i].status.late_initialized) 2896 continue; 2897 if (adev->ip_blocks[i].version->funcs->late_fini) 2898 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2899 adev->ip_blocks[i].status.late_initialized = false; 2900 } 2901 2902 amdgpu_ras_fini(adev); 2903 2904 return 0; 2905 } 2906 2907 /** 2908 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2909 * 2910 * @work: work_struct. 2911 */ 2912 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2913 { 2914 struct amdgpu_device *adev = 2915 container_of(work, struct amdgpu_device, delayed_init_work.work); 2916 int r; 2917 2918 r = amdgpu_ib_ring_tests(adev); 2919 if (r) 2920 DRM_ERROR("ib ring test failed (%d).\n", r); 2921 } 2922 2923 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2924 { 2925 struct amdgpu_device *adev = 2926 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2927 2928 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2929 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2930 2931 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2932 adev->gfx.gfx_off_state = true; 2933 } 2934 2935 /** 2936 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2937 * 2938 * @adev: amdgpu_device pointer 2939 * 2940 * Main suspend function for hardware IPs. The list of all the hardware 2941 * IPs that make up the asic is walked, clockgating is disabled and the 2942 * suspend callbacks are run. suspend puts the hardware and software state 2943 * in each IP into a state suitable for suspend. 2944 * Returns 0 on success, negative error code on failure. 2945 */ 2946 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2947 { 2948 int i, r; 2949 2950 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2951 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2952 2953 /* 2954 * Per PMFW team's suggestion, driver needs to handle gfxoff 2955 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 2956 * scenario. Add the missing df cstate disablement here. 
2957 */ 2958 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 2959 dev_warn(adev->dev, "Failed to disallow df cstate"); 2960 2961 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2962 if (!adev->ip_blocks[i].status.valid) 2963 continue; 2964 2965 /* displays are handled separately */ 2966 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2967 continue; 2968 2969 /* XXX handle errors */ 2970 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2971 /* XXX handle errors */ 2972 if (r) { 2973 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2974 adev->ip_blocks[i].version->funcs->name, r); 2975 return r; 2976 } 2977 2978 adev->ip_blocks[i].status.hw = false; 2979 } 2980 2981 return 0; 2982 } 2983 2984 /** 2985 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2986 * 2987 * @adev: amdgpu_device pointer 2988 * 2989 * Main suspend function for hardware IPs. The list of all the hardware 2990 * IPs that make up the asic is walked, clockgating is disabled and the 2991 * suspend callbacks are run. suspend puts the hardware and software state 2992 * in each IP into a state suitable for suspend. 2993 * Returns 0 on success, negative error code on failure. 2994 */ 2995 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2996 { 2997 int i, r; 2998 2999 if (adev->in_s0ix) 3000 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3001 3002 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3003 if (!adev->ip_blocks[i].status.valid) 3004 continue; 3005 /* displays are handled in phase1 */ 3006 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3007 continue; 3008 /* PSP lost connection when err_event_athub occurs */ 3009 if (amdgpu_ras_intr_triggered() && 3010 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3011 adev->ip_blocks[i].status.hw = false; 3012 continue; 3013 } 3014 3015 /* skip unnecessary suspend if we do not initialize them yet */ 3016 if (adev->gmc.xgmi.pending_reset && 3017 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3018 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3019 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3020 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3021 adev->ip_blocks[i].status.hw = false; 3022 continue; 3023 } 3024 3025 /* skip suspend of gfx/mes and psp for S0ix 3026 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3027 * like at runtime. PSP is also part of the always on hardware 3028 * so no need to suspend it. 3029 */ 3030 if (adev->in_s0ix && 3031 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3032 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3033 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3034 continue; 3035 3036 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3037 if (adev->in_s0ix && 3038 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) && 3039 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3040 continue; 3041 3042 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3043 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3044 * from this location and RLC Autoload automatically also gets loaded 3045 * from here based on PMFW -> PSP message during re-init sequence. 3046 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3047 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3048 */ 3049 if (amdgpu_in_reset(adev) && 3050 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3051 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3052 continue; 3053 3054 /* XXX handle errors */ 3055 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3056 /* XXX handle errors */ 3057 if (r) { 3058 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3059 adev->ip_blocks[i].version->funcs->name, r); 3060 } 3061 adev->ip_blocks[i].status.hw = false; 3062 /* handle putting the SMC in the appropriate state */ 3063 if(!amdgpu_sriov_vf(adev)){ 3064 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3065 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3066 if (r) { 3067 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3068 adev->mp1_state, r); 3069 return r; 3070 } 3071 } 3072 } 3073 } 3074 3075 return 0; 3076 } 3077 3078 /** 3079 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3080 * 3081 * @adev: amdgpu_device pointer 3082 * 3083 * Main suspend function for hardware IPs. The list of all the hardware 3084 * IPs that make up the asic is walked, clockgating is disabled and the 3085 * suspend callbacks are run. suspend puts the hardware and software state 3086 * in each IP into a state suitable for suspend. 3087 * Returns 0 on success, negative error code on failure. 3088 */ 3089 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3090 { 3091 int r; 3092 3093 if (amdgpu_sriov_vf(adev)) { 3094 amdgpu_virt_fini_data_exchange(adev); 3095 amdgpu_virt_request_full_gpu(adev, false); 3096 } 3097 3098 r = amdgpu_device_ip_suspend_phase1(adev); 3099 if (r) 3100 return r; 3101 r = amdgpu_device_ip_suspend_phase2(adev); 3102 3103 if (amdgpu_sriov_vf(adev)) 3104 amdgpu_virt_release_full_gpu(adev, false); 3105 3106 return r; 3107 } 3108 3109 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3110 { 3111 int i, r; 3112 3113 static enum amd_ip_block_type ip_order[] = { 3114 AMD_IP_BLOCK_TYPE_COMMON, 3115 AMD_IP_BLOCK_TYPE_GMC, 3116 AMD_IP_BLOCK_TYPE_PSP, 3117 AMD_IP_BLOCK_TYPE_IH, 3118 }; 3119 3120 for (i = 0; i < adev->num_ip_blocks; i++) { 3121 int j; 3122 struct amdgpu_ip_block *block; 3123 3124 block = &adev->ip_blocks[i]; 3125 block->status.hw = false; 3126 3127 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3128 3129 if (block->version->type != ip_order[j] || 3130 !block->status.valid) 3131 continue; 3132 3133 r = block->version->funcs->hw_init(adev); 3134 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3135 if (r) 3136 return r; 3137 block->status.hw = true; 3138 } 3139 } 3140 3141 return 0; 3142 } 3143 3144 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3145 { 3146 int i, r; 3147 3148 static enum amd_ip_block_type ip_order[] = { 3149 AMD_IP_BLOCK_TYPE_SMC, 3150 AMD_IP_BLOCK_TYPE_DCE, 3151 AMD_IP_BLOCK_TYPE_GFX, 3152 AMD_IP_BLOCK_TYPE_SDMA, 3153 AMD_IP_BLOCK_TYPE_UVD, 3154 AMD_IP_BLOCK_TYPE_VCE, 3155 AMD_IP_BLOCK_TYPE_VCN 3156 }; 3157 3158 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3159 int j; 3160 struct amdgpu_ip_block *block; 3161 3162 for (j = 0; j < adev->num_ip_blocks; j++) { 3163 block = &adev->ip_blocks[j]; 3164 3165 if (block->version->type != ip_order[i] || 3166 !block->status.valid || 3167 block->status.hw) 3168 continue; 3169 3170 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3171 r = block->version->funcs->resume(adev); 3172 else 3173 r = block->version->funcs->hw_init(adev); 3174 3175 DRM_INFO("RE-INIT-late: %s %s\n", 
block->version->funcs->name, r?"failed":"succeeded"); 3176 if (r) 3177 return r; 3178 block->status.hw = true; 3179 } 3180 } 3181 3182 return 0; 3183 } 3184 3185 /** 3186 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3187 * 3188 * @adev: amdgpu_device pointer 3189 * 3190 * First resume function for hardware IPs. The list of all the hardware 3191 * IPs that make up the asic is walked and the resume callbacks are run for 3192 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3193 * after a suspend and updates the software state as necessary. This 3194 * function is also used for restoring the GPU after a GPU reset. 3195 * Returns 0 on success, negative error code on failure. 3196 */ 3197 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3198 { 3199 int i, r; 3200 3201 for (i = 0; i < adev->num_ip_blocks; i++) { 3202 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3203 continue; 3204 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3205 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3206 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3207 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3208 3209 r = adev->ip_blocks[i].version->funcs->resume(adev); 3210 if (r) { 3211 DRM_ERROR("resume of IP block <%s> failed %d\n", 3212 adev->ip_blocks[i].version->funcs->name, r); 3213 return r; 3214 } 3215 adev->ip_blocks[i].status.hw = true; 3216 } 3217 } 3218 3219 return 0; 3220 } 3221 3222 /** 3223 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3224 * 3225 * @adev: amdgpu_device pointer 3226 * 3227 * Second resume function for hardware IPs. The list of all the hardware 3228 * IPs that make up the asic is walked and the resume callbacks are run for 3229 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3230 * functional state after a suspend and updates the software state as 3231 * necessary. This function is also used for restoring the GPU after a GPU 3232 * reset. 3233 * Returns 0 on success, negative error code on failure. 3234 */ 3235 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3236 { 3237 int i, r; 3238 3239 for (i = 0; i < adev->num_ip_blocks; i++) { 3240 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3241 continue; 3242 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3243 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3244 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3245 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3246 continue; 3247 r = adev->ip_blocks[i].version->funcs->resume(adev); 3248 if (r) { 3249 DRM_ERROR("resume of IP block <%s> failed %d\n", 3250 adev->ip_blocks[i].version->funcs->name, r); 3251 return r; 3252 } 3253 adev->ip_blocks[i].status.hw = true; 3254 } 3255 3256 return 0; 3257 } 3258 3259 /** 3260 * amdgpu_device_ip_resume - run resume for hardware IPs 3261 * 3262 * @adev: amdgpu_device pointer 3263 * 3264 * Main resume function for hardware IPs. The hardware IPs 3265 * are split into two resume functions because they are 3266 * also used in recovering from a GPU reset, and some additional 3267 * steps need to be taken between them. In this case (S3/S4) they are 3268 * run sequentially. 3269 * Returns 0 on success, negative error code on failure.
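 *
 * As implemented below, the sequence is roughly: resume the KFD IOMMU state,
 * run phase 1 (COMMON, GMC, IH, plus PSP when running as an SR-IOV VF),
 * reload firmware, then run phase 2 for the remaining blocks.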
3270 */ 3271 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3272 { 3273 int r; 3274 3275 r = amdgpu_amdkfd_resume_iommu(adev); 3276 if (r) 3277 return r; 3278 3279 r = amdgpu_device_ip_resume_phase1(adev); 3280 if (r) 3281 return r; 3282 3283 r = amdgpu_device_fw_loading(adev); 3284 if (r) 3285 return r; 3286 3287 r = amdgpu_device_ip_resume_phase2(adev); 3288 3289 return r; 3290 } 3291 3292 /** 3293 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3294 * 3295 * @adev: amdgpu_device pointer 3296 * 3297 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3298 */ 3299 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3300 { 3301 if (amdgpu_sriov_vf(adev)) { 3302 if (adev->is_atom_fw) { 3303 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3304 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3305 } else { 3306 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3307 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3308 } 3309 3310 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3311 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3312 } 3313 } 3314 3315 /** 3316 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3317 * 3318 * @asic_type: AMD asic type 3319 * 3320 * Check if there is DC (new modesetting infrastructre) support for an asic. 3321 * returns true if DC has support, false if not. 3322 */ 3323 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3324 { 3325 switch (asic_type) { 3326 #ifdef CONFIG_DRM_AMDGPU_SI 3327 case CHIP_HAINAN: 3328 #endif 3329 case CHIP_TOPAZ: 3330 /* chips with no display hardware */ 3331 return false; 3332 #if defined(CONFIG_DRM_AMD_DC) 3333 case CHIP_TAHITI: 3334 case CHIP_PITCAIRN: 3335 case CHIP_VERDE: 3336 case CHIP_OLAND: 3337 /* 3338 * We have systems in the wild with these ASICs that require 3339 * LVDS and VGA support which is not supported with DC. 3340 * 3341 * Fallback to the non-DC driver here by default so as not to 3342 * cause regressions. 3343 */ 3344 #if defined(CONFIG_DRM_AMD_DC_SI) 3345 return amdgpu_dc > 0; 3346 #else 3347 return false; 3348 #endif 3349 case CHIP_BONAIRE: 3350 case CHIP_KAVERI: 3351 case CHIP_KABINI: 3352 case CHIP_MULLINS: 3353 /* 3354 * We have systems in the wild with these ASICs that require 3355 * VGA support which is not supported with DC. 3356 * 3357 * Fallback to the non-DC driver here by default so as not to 3358 * cause regressions. 
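 * (A positive amdgpu_dc means DC was explicitly requested via kernel
 * parameter, as the non-DC branch below also notes, so this fallback is
 * only the default and can still be overridden.)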
3359 */ 3360 return amdgpu_dc > 0; 3361 default: 3362 return amdgpu_dc != 0; 3363 #else 3364 default: 3365 if (amdgpu_dc > 0) 3366 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3367 "but isn't supported by ASIC, ignoring\n"); 3368 return false; 3369 #endif 3370 } 3371 } 3372 3373 /** 3374 * amdgpu_device_has_dc_support - check if dc is supported 3375 * 3376 * @adev: amdgpu_device pointer 3377 * 3378 * Returns true for supported, false for not supported 3379 */ 3380 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3381 { 3382 if (adev->enable_virtual_display || 3383 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3384 return false; 3385 3386 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3387 } 3388 3389 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3390 { 3391 struct amdgpu_device *adev = 3392 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3393 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3394 3395 /* It's a bug to not have a hive within this function */ 3396 if (WARN_ON(!hive)) 3397 return; 3398 3399 /* 3400 * Use task barrier to synchronize all xgmi reset works across the 3401 * hive. task_barrier_enter and task_barrier_exit will block 3402 * until all the threads running the xgmi reset works reach 3403 * those points. task_barrier_full will do both blocks. 3404 */ 3405 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3406 3407 task_barrier_enter(&hive->tb); 3408 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3409 3410 if (adev->asic_reset_res) 3411 goto fail; 3412 3413 task_barrier_exit(&hive->tb); 3414 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3415 3416 if (adev->asic_reset_res) 3417 goto fail; 3418 3419 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3420 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3421 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3422 } else { 3423 3424 task_barrier_full(&hive->tb); 3425 adev->asic_reset_res = amdgpu_asic_reset(adev); 3426 } 3427 3428 fail: 3429 if (adev->asic_reset_res) 3430 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3431 adev->asic_reset_res, adev_to_drm(adev)->unique); 3432 amdgpu_put_xgmi_hive(hive); 3433 } 3434 3435 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3436 { 3437 char *input = amdgpu_lockup_timeout; 3438 char *timeout_setting = NULL; 3439 int index = 0; 3440 long timeout; 3441 int ret = 0; 3442 3443 /* 3444 * By default timeout for non compute jobs is 10000 3445 * and 60000 for compute jobs. 3446 * In SR-IOV or passthrough mode, timeout for compute 3447 * jobs are 60000 by default. 3448 */ 3449 adev->gfx_timeout = msecs_to_jiffies(10000); 3450 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3451 if (amdgpu_sriov_vf(adev)) 3452 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3453 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3454 else 3455 adev->compute_timeout = msecs_to_jiffies(60000); 3456 3457 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3458 while ((timeout_setting = strsep(&input, ",")) && 3459 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3460 ret = kstrtol(timeout_setting, 0, &timeout); 3461 if (ret) 3462 return ret; 3463 3464 if (timeout == 0) { 3465 index++; 3466 continue; 3467 } else if (timeout < 0) { 3468 timeout = MAX_SCHEDULE_TIMEOUT; 3469 dev_warn(adev->dev, "lockup timeout disabled"); 3470 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3471 } else { 3472 timeout = msecs_to_jiffies(timeout); 3473 } 3474 3475 switch (index++) { 3476 case 0: 3477 adev->gfx_timeout = timeout; 3478 break; 3479 case 1: 3480 adev->compute_timeout = timeout; 3481 break; 3482 case 2: 3483 adev->sdma_timeout = timeout; 3484 break; 3485 case 3: 3486 adev->video_timeout = timeout; 3487 break; 3488 default: 3489 break; 3490 } 3491 } 3492 /* 3493 * There is only one value specified and 3494 * it should apply to all non-compute jobs. 3495 */ 3496 if (index == 1) { 3497 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3498 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3499 adev->compute_timeout = adev->gfx_timeout; 3500 } 3501 } 3502 3503 return ret; 3504 } 3505 3506 /** 3507 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3508 * 3509 * @adev: amdgpu_device pointer 3510 * 3511 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3512 */ 3513 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3514 { 3515 struct iommu_domain *domain; 3516 3517 domain = iommu_get_domain_for_dev(adev->dev); 3518 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3519 adev->ram_is_direct_mapped = true; 3520 } 3521 3522 static const struct attribute *amdgpu_dev_attributes[] = { 3523 &dev_attr_product_name.attr, 3524 &dev_attr_product_number.attr, 3525 &dev_attr_serial_number.attr, 3526 &dev_attr_pcie_replay_count.attr, 3527 NULL 3528 }; 3529 3530 /** 3531 * amdgpu_device_init - initialize the driver 3532 * 3533 * @adev: amdgpu_device pointer 3534 * @flags: driver flags 3535 * 3536 * Initializes the driver info and hw (all asics). 3537 * Returns 0 for success or an error on failure. 3538 * Called at driver startup. 
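 *
 * At a high level (a descriptive sketch of the body below, not a stable
 * contract): register access callbacks, locks and work items are set up
 * first, then the reset domain, virtualization detection and early IP init
 * run, the card is posted and clock info is read from the vBIOS if needed,
 * the fence driver and IP blocks are initialized, and finally sysfs entries,
 * late init and the delayed IB-test work are registered.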
3539 */ 3540 int amdgpu_device_init(struct amdgpu_device *adev, 3541 uint32_t flags) 3542 { 3543 struct drm_device *ddev = adev_to_drm(adev); 3544 struct pci_dev *pdev = adev->pdev; 3545 int r, i; 3546 bool px = false; 3547 u32 max_MBps; 3548 3549 adev->shutdown = false; 3550 adev->flags = flags; 3551 3552 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3553 adev->asic_type = amdgpu_force_asic_type; 3554 else 3555 adev->asic_type = flags & AMD_ASIC_MASK; 3556 3557 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3558 if (amdgpu_emu_mode == 1) 3559 adev->usec_timeout *= 10; 3560 adev->gmc.gart_size = 512 * 1024 * 1024; 3561 adev->accel_working = false; 3562 adev->num_rings = 0; 3563 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3564 adev->mman.buffer_funcs = NULL; 3565 adev->mman.buffer_funcs_ring = NULL; 3566 adev->vm_manager.vm_pte_funcs = NULL; 3567 adev->vm_manager.vm_pte_num_scheds = 0; 3568 adev->gmc.gmc_funcs = NULL; 3569 adev->harvest_ip_mask = 0x0; 3570 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3571 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3572 3573 adev->smc_rreg = &amdgpu_invalid_rreg; 3574 adev->smc_wreg = &amdgpu_invalid_wreg; 3575 adev->pcie_rreg = &amdgpu_invalid_rreg; 3576 adev->pcie_wreg = &amdgpu_invalid_wreg; 3577 adev->pciep_rreg = &amdgpu_invalid_rreg; 3578 adev->pciep_wreg = &amdgpu_invalid_wreg; 3579 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3580 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3581 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3582 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3583 adev->didt_rreg = &amdgpu_invalid_rreg; 3584 adev->didt_wreg = &amdgpu_invalid_wreg; 3585 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3586 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3587 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3588 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3589 3590 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3591 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3592 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3593 3594 /* mutex initialization are all done here so we 3595 * can recall function without having locking issues */ 3596 mutex_init(&adev->firmware.mutex); 3597 mutex_init(&adev->pm.mutex); 3598 mutex_init(&adev->gfx.gpu_clock_mutex); 3599 mutex_init(&adev->srbm_mutex); 3600 mutex_init(&adev->gfx.pipe_reserve_mutex); 3601 mutex_init(&adev->gfx.gfx_off_mutex); 3602 mutex_init(&adev->grbm_idx_mutex); 3603 mutex_init(&adev->mn_lock); 3604 mutex_init(&adev->virt.vf_errors.lock); 3605 hash_init(adev->mn_hash); 3606 mutex_init(&adev->psp.mutex); 3607 mutex_init(&adev->notifier_lock); 3608 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3609 mutex_init(&adev->benchmark_mutex); 3610 3611 amdgpu_device_init_apu_flags(adev); 3612 3613 r = amdgpu_device_check_arguments(adev); 3614 if (r) 3615 return r; 3616 3617 spin_lock_init(&adev->mmio_idx_lock); 3618 spin_lock_init(&adev->smc_idx_lock); 3619 spin_lock_init(&adev->pcie_idx_lock); 3620 spin_lock_init(&adev->uvd_ctx_idx_lock); 3621 spin_lock_init(&adev->didt_idx_lock); 3622 spin_lock_init(&adev->gc_cac_idx_lock); 3623 spin_lock_init(&adev->se_cac_idx_lock); 3624 spin_lock_init(&adev->audio_endpt_idx_lock); 3625 spin_lock_init(&adev->mm_stats.lock); 3626 3627 INIT_LIST_HEAD(&adev->shadow_list); 3628 mutex_init(&adev->shadow_list_lock); 3629 3630 INIT_LIST_HEAD(&adev->reset_list); 3631 3632 INIT_LIST_HEAD(&adev->ras_list); 3633 3634 
INIT_DELAYED_WORK(&adev->delayed_init_work, 3635 amdgpu_device_delayed_init_work_handler); 3636 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3637 amdgpu_device_delay_enable_gfx_off); 3638 3639 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3640 3641 adev->gfx.gfx_off_req_count = 1; 3642 adev->gfx.gfx_off_residency = 0; 3643 adev->gfx.gfx_off_entrycount = 0; 3644 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3645 3646 atomic_set(&adev->throttling_logging_enabled, 1); 3647 /* 3648 * If throttling continues, logging will be performed every minute 3649 * to avoid log flooding. "-1" is subtracted since the thermal 3650 * throttling interrupt comes every second. Thus, the total logging 3651 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 3652 * for throttling interrupt) = 60 seconds. 3653 */ 3654 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3655 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3656 3657 /* Registers mapping */ 3658 /* TODO: block userspace mapping of io register */ 3659 if (adev->asic_type >= CHIP_BONAIRE) { 3660 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3661 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3662 } else { 3663 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3664 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3665 } 3666 3667 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 3668 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 3669 3670 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3671 if (adev->rmmio == NULL) { 3672 return -ENOMEM; 3673 } 3674 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3675 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3676 3677 amdgpu_device_get_pcie_info(adev); 3678 3679 if (amdgpu_mcbp) 3680 DRM_INFO("MCBP is enabled\n"); 3681 3682 /* 3683 * Reset domain needs to be present early, before the XGMI hive (if any) is 3684 * discovered and initialized, so the reset sem and in_gpu reset flag can be 3685 * used early on during init and before the first call to RREG32.
3686 */ 3687 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3688 if (!adev->reset_domain) 3689 return -ENOMEM; 3690 3691 /* detect hw virtualization here */ 3692 amdgpu_detect_virtualization(adev); 3693 3694 r = amdgpu_device_get_job_timeout_settings(adev); 3695 if (r) { 3696 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3697 return r; 3698 } 3699 3700 /* early init functions */ 3701 r = amdgpu_device_ip_early_init(adev); 3702 if (r) 3703 return r; 3704 3705 /* Get rid of things like offb */ 3706 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 3707 if (r) 3708 return r; 3709 3710 /* Enable TMZ based on IP_VERSION */ 3711 amdgpu_gmc_tmz_set(adev); 3712 3713 amdgpu_gmc_noretry_set(adev); 3714 /* Need to get xgmi info early to decide the reset behavior*/ 3715 if (adev->gmc.xgmi.supported) { 3716 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3717 if (r) 3718 return r; 3719 } 3720 3721 /* enable PCIE atomic ops */ 3722 if (amdgpu_sriov_vf(adev)) 3723 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3724 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3725 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3726 else 3727 adev->have_atomics_support = 3728 !pci_enable_atomic_ops_to_root(adev->pdev, 3729 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3730 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3731 if (!adev->have_atomics_support) 3732 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3733 3734 /* doorbell bar mapping and doorbell index init*/ 3735 amdgpu_device_doorbell_init(adev); 3736 3737 if (amdgpu_emu_mode == 1) { 3738 /* post the asic on emulation mode */ 3739 emu_soc_asic_init(adev); 3740 goto fence_driver_init; 3741 } 3742 3743 amdgpu_reset_init(adev); 3744 3745 /* detect if we are with an SRIOV vbios */ 3746 amdgpu_device_detect_sriov_bios(adev); 3747 3748 /* check if we need to reset the asic 3749 * E.g., driver was not cleanly unloaded previously, etc. 
3750 */ 3751 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3752 if (adev->gmc.xgmi.num_physical_nodes) { 3753 dev_info(adev->dev, "Pending hive reset.\n"); 3754 adev->gmc.xgmi.pending_reset = true; 3755 /* Only need to init necessary block for SMU to handle the reset */ 3756 for (i = 0; i < adev->num_ip_blocks; i++) { 3757 if (!adev->ip_blocks[i].status.valid) 3758 continue; 3759 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3760 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3761 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3762 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3763 DRM_DEBUG("IP %s disabled for hw_init.\n", 3764 adev->ip_blocks[i].version->funcs->name); 3765 adev->ip_blocks[i].status.hw = true; 3766 } 3767 } 3768 } else { 3769 r = amdgpu_asic_reset(adev); 3770 if (r) { 3771 dev_err(adev->dev, "asic reset on init failed\n"); 3772 goto failed; 3773 } 3774 } 3775 } 3776 3777 /* Post card if necessary */ 3778 if (amdgpu_device_need_post(adev)) { 3779 if (!adev->bios) { 3780 dev_err(adev->dev, "no vBIOS found\n"); 3781 r = -EINVAL; 3782 goto failed; 3783 } 3784 DRM_INFO("GPU posting now...\n"); 3785 r = amdgpu_device_asic_init(adev); 3786 if (r) { 3787 dev_err(adev->dev, "gpu post error!\n"); 3788 goto failed; 3789 } 3790 } 3791 3792 if (adev->is_atom_fw) { 3793 /* Initialize clocks */ 3794 r = amdgpu_atomfirmware_get_clock_info(adev); 3795 if (r) { 3796 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3797 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3798 goto failed; 3799 } 3800 } else { 3801 /* Initialize clocks */ 3802 r = amdgpu_atombios_get_clock_info(adev); 3803 if (r) { 3804 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3805 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3806 goto failed; 3807 } 3808 /* init i2c buses */ 3809 if (!amdgpu_device_has_dc_support(adev)) 3810 amdgpu_atombios_i2c_init(adev); 3811 } 3812 3813 fence_driver_init: 3814 /* Fence driver */ 3815 r = amdgpu_fence_driver_sw_init(adev); 3816 if (r) { 3817 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3818 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3819 goto failed; 3820 } 3821 3822 /* init the mode config */ 3823 drm_mode_config_init(adev_to_drm(adev)); 3824 3825 r = amdgpu_device_ip_init(adev); 3826 if (r) { 3827 /* failed in exclusive mode due to timeout */ 3828 if (amdgpu_sriov_vf(adev) && 3829 !amdgpu_sriov_runtime(adev) && 3830 amdgpu_virt_mmio_blocked(adev) && 3831 !amdgpu_virt_wait_reset(adev)) { 3832 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3833 /* Don't send request since VF is inactive. */ 3834 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3835 adev->virt.ops = NULL; 3836 r = -EAGAIN; 3837 goto release_ras_con; 3838 } 3839 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3840 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3841 goto release_ras_con; 3842 } 3843 3844 amdgpu_fence_driver_hw_init(adev); 3845 3846 dev_info(adev->dev, 3847 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3848 adev->gfx.config.max_shader_engines, 3849 adev->gfx.config.max_sh_per_se, 3850 adev->gfx.config.max_cu_per_sh, 3851 adev->gfx.cu_info.number); 3852 3853 adev->accel_working = true; 3854 3855 amdgpu_vm_check_compute_bug(adev); 3856 3857 /* Initialize the buffer migration limit. 
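* The limit comes from the amdgpu_moverate module parameter when that is set and otherwise defaults to 8 MB/s; it is stored as a log2 value so later throttling math can use shifts instead of divisions.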
*/ 3858 if (amdgpu_moverate >= 0) 3859 max_MBps = amdgpu_moverate; 3860 else 3861 max_MBps = 8; /* Allow 8 MB/s. */ 3862 /* Get a log2 for easy divisions. */ 3863 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3864 3865 r = amdgpu_pm_sysfs_init(adev); 3866 if (r) 3867 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 3868 3869 r = amdgpu_ucode_sysfs_init(adev); 3870 if (r) { 3871 adev->ucode_sysfs_en = false; 3872 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3873 } else 3874 adev->ucode_sysfs_en = true; 3875 3876 r = amdgpu_psp_sysfs_init(adev); 3877 if (r) { 3878 adev->psp_sysfs_en = false; 3879 if (!amdgpu_sriov_vf(adev)) 3880 DRM_ERROR("Creating psp sysfs failed\n"); 3881 } else 3882 adev->psp_sysfs_en = true; 3883 3884 /* 3885 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3886 * Otherwise the mgpu fan boost feature will be skipped due to the 3887 * gpu instance is counted less. 3888 */ 3889 amdgpu_register_gpu_instance(adev); 3890 3891 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3892 * explicit gating rather than handling it automatically. 3893 */ 3894 if (!adev->gmc.xgmi.pending_reset) { 3895 r = amdgpu_device_ip_late_init(adev); 3896 if (r) { 3897 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3898 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3899 goto release_ras_con; 3900 } 3901 /* must succeed. */ 3902 amdgpu_ras_resume(adev); 3903 queue_delayed_work(system_wq, &adev->delayed_init_work, 3904 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3905 } 3906 3907 if (amdgpu_sriov_vf(adev)) 3908 flush_delayed_work(&adev->delayed_init_work); 3909 3910 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3911 if (r) 3912 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3913 3914 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3915 r = amdgpu_pmu_init(adev); 3916 if (r) 3917 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3918 3919 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3920 if (amdgpu_device_cache_pci_state(adev->pdev)) 3921 pci_restore_state(pdev); 3922 3923 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3924 /* this will fail for cards that aren't VGA class devices, just 3925 * ignore it */ 3926 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3927 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 3928 3929 px = amdgpu_device_supports_px(ddev); 3930 3931 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 3932 apple_gmux_detect(NULL, NULL))) 3933 vga_switcheroo_register_client(adev->pdev, 3934 &amdgpu_switcheroo_ops, px); 3935 3936 if (px) 3937 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3938 3939 if (adev->gmc.xgmi.pending_reset) 3940 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3941 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3942 3943 amdgpu_device_check_iommu_direct_map(adev); 3944 3945 return 0; 3946 3947 release_ras_con: 3948 amdgpu_release_ras_context(adev); 3949 3950 failed: 3951 amdgpu_vf_error_trans_all(adev); 3952 3953 return r; 3954 } 3955 3956 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 3957 { 3958 3959 /* Clear all CPU mappings pointing to this device */ 3960 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 3961 3962 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 3963 amdgpu_device_doorbell_fini(adev); 3964 3965 iounmap(adev->rmmio); 3966 adev->rmmio = NULL; 3967 if (adev->mman.aper_base_kaddr) 3968 
iounmap(adev->mman.aper_base_kaddr); 3969 adev->mman.aper_base_kaddr = NULL; 3970 3971 /* Memory manager related */ 3972 if (!adev->gmc.xgmi.connected_to_cpu) { 3973 arch_phys_wc_del(adev->gmc.vram_mtrr); 3974 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 3975 } 3976 } 3977 3978 /** 3979 * amdgpu_device_fini_hw - tear down the driver 3980 * 3981 * @adev: amdgpu_device pointer 3982 * 3983 * Tear down the driver info (all asics). 3984 * Called at driver shutdown. 3985 */ 3986 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 3987 { 3988 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3989 flush_delayed_work(&adev->delayed_init_work); 3990 adev->shutdown = true; 3991 3992 /* make sure IB test finished before entering exclusive mode 3993 * to avoid preemption on IB test 3994 * */ 3995 if (amdgpu_sriov_vf(adev)) { 3996 amdgpu_virt_request_full_gpu(adev, false); 3997 amdgpu_virt_fini_data_exchange(adev); 3998 } 3999 4000 /* disable all interrupts */ 4001 amdgpu_irq_disable_all(adev); 4002 if (adev->mode_info.mode_config_initialized){ 4003 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4004 drm_helper_force_disable_all(adev_to_drm(adev)); 4005 else 4006 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4007 } 4008 amdgpu_fence_driver_hw_fini(adev); 4009 4010 if (adev->mman.initialized) 4011 drain_workqueue(adev->mman.bdev.wq); 4012 4013 if (adev->pm.sysfs_initialized) 4014 amdgpu_pm_sysfs_fini(adev); 4015 if (adev->ucode_sysfs_en) 4016 amdgpu_ucode_sysfs_fini(adev); 4017 if (adev->psp_sysfs_en) 4018 amdgpu_psp_sysfs_fini(adev); 4019 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4020 4021 /* disable ras feature must before hw fini */ 4022 amdgpu_ras_pre_fini(adev); 4023 4024 amdgpu_device_ip_fini_early(adev); 4025 4026 amdgpu_irq_fini_hw(adev); 4027 4028 if (adev->mman.initialized) 4029 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4030 4031 amdgpu_gart_dummy_page_fini(adev); 4032 4033 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4034 amdgpu_device_unmap_mmio(adev); 4035 4036 } 4037 4038 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4039 { 4040 int idx; 4041 bool px; 4042 4043 amdgpu_fence_driver_sw_fini(adev); 4044 amdgpu_device_ip_fini(adev); 4045 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4046 adev->accel_working = false; 4047 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4048 4049 amdgpu_reset_fini(adev); 4050 4051 /* free i2c buses */ 4052 if (!amdgpu_device_has_dc_support(adev)) 4053 amdgpu_i2c_fini(adev); 4054 4055 if (amdgpu_emu_mode != 1) 4056 amdgpu_atombios_fini(adev); 4057 4058 kfree(adev->bios); 4059 adev->bios = NULL; 4060 4061 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4062 4063 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 4064 apple_gmux_detect(NULL, NULL))) 4065 vga_switcheroo_unregister_client(adev->pdev); 4066 4067 if (px) 4068 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4069 4070 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4071 vga_client_unregister(adev->pdev); 4072 4073 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4074 4075 iounmap(adev->rmmio); 4076 adev->rmmio = NULL; 4077 amdgpu_device_doorbell_fini(adev); 4078 drm_dev_exit(idx); 4079 } 4080 4081 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4082 amdgpu_pmu_fini(adev); 4083 if (adev->mman.discovery_bin) 4084 amdgpu_discovery_fini(adev); 4085 4086 amdgpu_reset_put_reset_domain(adev->reset_domain); 4087 adev->reset_domain = NULL; 4088 4089 kfree(adev->pci_state); 4090 4091 } 4092 4093 /** 4094 * 
amdgpu_device_evict_resources - evict device resources 4095 * @adev: amdgpu device object 4096 * 4097 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4098 * of the vram memory type. Mainly used for evicting device resources 4099 * at suspend time. 4100 * 4101 */ 4102 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4103 { 4104 int ret; 4105 4106 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4107 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4108 return 0; 4109 4110 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4111 if (ret) 4112 DRM_WARN("evicting device resources failed\n"); 4113 return ret; 4114 } 4115 4116 /* 4117 * Suspend & resume. 4118 */ 4119 /** 4120 * amdgpu_device_suspend - initiate device suspend 4121 * 4122 * @dev: drm dev pointer 4123 * @fbcon : notify the fbdev of suspend 4124 * 4125 * Puts the hw in the suspend state (all asics). 4126 * Returns 0 for success or an error on failure. 4127 * Called at driver suspend. 4128 */ 4129 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4130 { 4131 struct amdgpu_device *adev = drm_to_adev(dev); 4132 int r = 0; 4133 4134 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4135 return 0; 4136 4137 adev->in_suspend = true; 4138 4139 /* Evict the majority of BOs before grabbing the full access */ 4140 r = amdgpu_device_evict_resources(adev); 4141 if (r) 4142 return r; 4143 4144 if (amdgpu_sriov_vf(adev)) { 4145 amdgpu_virt_fini_data_exchange(adev); 4146 r = amdgpu_virt_request_full_gpu(adev, false); 4147 if (r) 4148 return r; 4149 } 4150 4151 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4152 DRM_WARN("smart shift update failed\n"); 4153 4154 if (fbcon) 4155 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4156 4157 cancel_delayed_work_sync(&adev->delayed_init_work); 4158 4159 amdgpu_ras_suspend(adev); 4160 4161 amdgpu_device_ip_suspend_phase1(adev); 4162 4163 if (!adev->in_s0ix) 4164 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4165 4166 r = amdgpu_device_evict_resources(adev); 4167 if (r) 4168 return r; 4169 4170 amdgpu_fence_driver_hw_fini(adev); 4171 4172 amdgpu_device_ip_suspend_phase2(adev); 4173 4174 if (amdgpu_sriov_vf(adev)) 4175 amdgpu_virt_release_full_gpu(adev, false); 4176 4177 return 0; 4178 } 4179 4180 /** 4181 * amdgpu_device_resume - initiate device resume 4182 * 4183 * @dev: drm dev pointer 4184 * @fbcon : notify the fbdev of resume 4185 * 4186 * Bring the hw back to operating state (all asics). 4187 * Returns 0 for success or an error on failure. 4188 * Called at driver resume. 
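* The resume path re-posts the ASIC when needed, resumes the IP blocks and the fence driver, runs late init, and queues delayed_init_work for the remaining deferred work.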
4189 */ 4190 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4191 { 4192 struct amdgpu_device *adev = drm_to_adev(dev); 4193 int r = 0; 4194 4195 if (amdgpu_sriov_vf(adev)) { 4196 r = amdgpu_virt_request_full_gpu(adev, true); 4197 if (r) 4198 return r; 4199 } 4200 4201 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4202 return 0; 4203 4204 if (adev->in_s0ix) 4205 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4206 4207 /* post card */ 4208 if (amdgpu_device_need_post(adev)) { 4209 r = amdgpu_device_asic_init(adev); 4210 if (r) 4211 dev_err(adev->dev, "amdgpu asic init failed\n"); 4212 } 4213 4214 r = amdgpu_device_ip_resume(adev); 4215 4216 if (r) { 4217 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4218 goto exit; 4219 } 4220 amdgpu_fence_driver_hw_init(adev); 4221 4222 r = amdgpu_device_ip_late_init(adev); 4223 if (r) 4224 goto exit; 4225 4226 queue_delayed_work(system_wq, &adev->delayed_init_work, 4227 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4228 4229 if (!adev->in_s0ix) { 4230 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4231 if (r) 4232 goto exit; 4233 } 4234 4235 exit: 4236 if (amdgpu_sriov_vf(adev)) { 4237 amdgpu_virt_init_data_exchange(adev); 4238 amdgpu_virt_release_full_gpu(adev, true); 4239 } 4240 4241 if (r) 4242 return r; 4243 4244 /* Make sure IB tests flushed */ 4245 flush_delayed_work(&adev->delayed_init_work); 4246 4247 if (fbcon) 4248 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4249 4250 amdgpu_ras_resume(adev); 4251 4252 if (adev->mode_info.num_crtc) { 4253 /* 4254 * Most of the connector probing functions try to acquire runtime pm 4255 * refs to ensure that the GPU is powered on when connector polling is 4256 * performed. Since we're calling this from a runtime PM callback, 4257 * trying to acquire rpm refs will cause us to deadlock. 4258 * 4259 * Since we're guaranteed to be holding the rpm lock, it's safe to 4260 * temporarily disable the rpm helpers so this doesn't deadlock us. 4261 */ 4262 #ifdef CONFIG_PM 4263 dev->dev->power.disable_depth++; 4264 #endif 4265 if (!adev->dc_enabled) 4266 drm_helper_hpd_irq_event(dev); 4267 else 4268 drm_kms_helper_hotplug_event(dev); 4269 #ifdef CONFIG_PM 4270 dev->dev->power.disable_depth--; 4271 #endif 4272 } 4273 adev->in_suspend = false; 4274 4275 if (adev->enable_mes) 4276 amdgpu_mes_self_test(adev); 4277 4278 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4279 DRM_WARN("smart shift update failed\n"); 4280 4281 return 0; 4282 } 4283 4284 /** 4285 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4286 * 4287 * @adev: amdgpu_device pointer 4288 * 4289 * The list of all the hardware IPs that make up the asic is walked and 4290 * the check_soft_reset callbacks are run. check_soft_reset determines 4291 * if the asic is still hung or not. 4292 * Returns true if any of the IPs are still in a hung state, false if not. 
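* SR-IOV VFs and ASICs that require a full reset are reported as hung without walking the individual IP blocks.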
4293 */ 4294 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4295 { 4296 int i; 4297 bool asic_hang = false; 4298 4299 if (amdgpu_sriov_vf(adev)) 4300 return true; 4301 4302 if (amdgpu_asic_need_full_reset(adev)) 4303 return true; 4304 4305 for (i = 0; i < adev->num_ip_blocks; i++) { 4306 if (!adev->ip_blocks[i].status.valid) 4307 continue; 4308 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4309 adev->ip_blocks[i].status.hang = 4310 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4311 if (adev->ip_blocks[i].status.hang) { 4312 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4313 asic_hang = true; 4314 } 4315 } 4316 return asic_hang; 4317 } 4318 4319 /** 4320 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4321 * 4322 * @adev: amdgpu_device pointer 4323 * 4324 * The list of all the hardware IPs that make up the asic is walked and the 4325 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4326 * handles any IP specific hardware or software state changes that are 4327 * necessary for a soft reset to succeed. 4328 * Returns 0 on success, negative error code on failure. 4329 */ 4330 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4331 { 4332 int i, r = 0; 4333 4334 for (i = 0; i < adev->num_ip_blocks; i++) { 4335 if (!adev->ip_blocks[i].status.valid) 4336 continue; 4337 if (adev->ip_blocks[i].status.hang && 4338 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4339 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4340 if (r) 4341 return r; 4342 } 4343 } 4344 4345 return 0; 4346 } 4347 4348 /** 4349 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4350 * 4351 * @adev: amdgpu_device pointer 4352 * 4353 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4354 * reset is necessary to recover. 4355 * Returns true if a full asic reset is required, false if not. 4356 */ 4357 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4358 { 4359 int i; 4360 4361 if (amdgpu_asic_need_full_reset(adev)) 4362 return true; 4363 4364 for (i = 0; i < adev->num_ip_blocks; i++) { 4365 if (!adev->ip_blocks[i].status.valid) 4366 continue; 4367 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4368 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4369 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4370 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4371 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4372 if (adev->ip_blocks[i].status.hang) { 4373 dev_info(adev->dev, "Some block need full reset!\n"); 4374 return true; 4375 } 4376 } 4377 } 4378 return false; 4379 } 4380 4381 /** 4382 * amdgpu_device_ip_soft_reset - do a soft reset 4383 * 4384 * @adev: amdgpu_device pointer 4385 * 4386 * The list of all the hardware IPs that make up the asic is walked and the 4387 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4388 * IP specific hardware or software state changes that are necessary to soft 4389 * reset the IP. 4390 * Returns 0 on success, negative error code on failure. 
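* Only blocks flagged as hung by check_soft_reset and providing a soft_reset callback are touched.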
4391 */ 4392 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4393 { 4394 int i, r = 0; 4395 4396 for (i = 0; i < adev->num_ip_blocks; i++) { 4397 if (!adev->ip_blocks[i].status.valid) 4398 continue; 4399 if (adev->ip_blocks[i].status.hang && 4400 adev->ip_blocks[i].version->funcs->soft_reset) { 4401 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4402 if (r) 4403 return r; 4404 } 4405 } 4406 4407 return 0; 4408 } 4409 4410 /** 4411 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4412 * 4413 * @adev: amdgpu_device pointer 4414 * 4415 * The list of all the hardware IPs that make up the asic is walked and the 4416 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4417 * handles any IP specific hardware or software state changes that are 4418 * necessary after the IP has been soft reset. 4419 * Returns 0 on success, negative error code on failure. 4420 */ 4421 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4422 { 4423 int i, r = 0; 4424 4425 for (i = 0; i < adev->num_ip_blocks; i++) { 4426 if (!adev->ip_blocks[i].status.valid) 4427 continue; 4428 if (adev->ip_blocks[i].status.hang && 4429 adev->ip_blocks[i].version->funcs->post_soft_reset) 4430 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4431 if (r) 4432 return r; 4433 } 4434 4435 return 0; 4436 } 4437 4438 /** 4439 * amdgpu_device_recover_vram - Recover some VRAM contents 4440 * 4441 * @adev: amdgpu_device pointer 4442 * 4443 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4444 * restore things like GPUVM page tables after a GPU reset where 4445 * the contents of VRAM might be lost. 4446 * 4447 * Returns: 4448 * 0 on success, negative error code on failure. 4449 */ 4450 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4451 { 4452 struct dma_fence *fence = NULL, *next = NULL; 4453 struct amdgpu_bo *shadow; 4454 struct amdgpu_bo_vm *vmbo; 4455 long r = 1, tmo; 4456 4457 if (amdgpu_sriov_runtime(adev)) 4458 tmo = msecs_to_jiffies(8000); 4459 else 4460 tmo = msecs_to_jiffies(100); 4461 4462 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4463 mutex_lock(&adev->shadow_list_lock); 4464 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4465 shadow = &vmbo->bo; 4466 /* No need to recover an evicted BO */ 4467 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4468 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4469 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4470 continue; 4471 4472 r = amdgpu_bo_restore_shadow(shadow, &next); 4473 if (r) 4474 break; 4475 4476 if (fence) { 4477 tmo = dma_fence_wait_timeout(fence, false, tmo); 4478 dma_fence_put(fence); 4479 fence = next; 4480 if (tmo == 0) { 4481 r = -ETIMEDOUT; 4482 break; 4483 } else if (tmo < 0) { 4484 r = tmo; 4485 break; 4486 } 4487 } else { 4488 fence = next; 4489 } 4490 } 4491 mutex_unlock(&adev->shadow_list_lock); 4492 4493 if (fence) 4494 tmo = dma_fence_wait_timeout(fence, false, tmo); 4495 dma_fence_put(fence); 4496 4497 if (r < 0 || tmo <= 0) { 4498 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4499 return -EIO; 4500 } 4501 4502 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4503 return 0; 4504 } 4505 4506 4507 /** 4508 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4509 * 4510 * @adev: amdgpu_device pointer 4511 * @from_hypervisor: request from hypervisor 4512 * 4513 * do VF FLR and reinitialize Asic 4514 * return 0 means succeeded 
otherwise failed 4515 */ 4516 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4517 bool from_hypervisor) 4518 { 4519 int r; 4520 struct amdgpu_hive_info *hive = NULL; 4521 int retry_limit = 0; 4522 4523 retry: 4524 amdgpu_amdkfd_pre_reset(adev); 4525 4526 if (from_hypervisor) 4527 r = amdgpu_virt_request_full_gpu(adev, true); 4528 else 4529 r = amdgpu_virt_reset_gpu(adev); 4530 if (r) 4531 return r; 4532 4533 /* Resume IP prior to SMC */ 4534 r = amdgpu_device_ip_reinit_early_sriov(adev); 4535 if (r) 4536 goto error; 4537 4538 amdgpu_virt_init_data_exchange(adev); 4539 4540 r = amdgpu_device_fw_loading(adev); 4541 if (r) 4542 return r; 4543 4544 /* now we are okay to resume SMC/CP/SDMA */ 4545 r = amdgpu_device_ip_reinit_late_sriov(adev); 4546 if (r) 4547 goto error; 4548 4549 hive = amdgpu_get_xgmi_hive(adev); 4550 /* Update PSP FW topology after reset */ 4551 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4552 r = amdgpu_xgmi_update_topology(hive, adev); 4553 4554 if (hive) 4555 amdgpu_put_xgmi_hive(hive); 4556 4557 if (!r) { 4558 amdgpu_irq_gpu_reset_resume_helper(adev); 4559 r = amdgpu_ib_ring_tests(adev); 4560 4561 amdgpu_amdkfd_post_reset(adev); 4562 } 4563 4564 error: 4565 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4566 amdgpu_inc_vram_lost(adev); 4567 r = amdgpu_device_recover_vram(adev); 4568 } 4569 amdgpu_virt_release_full_gpu(adev, true); 4570 4571 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4572 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4573 retry_limit++; 4574 goto retry; 4575 } else 4576 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4577 } 4578 4579 return r; 4580 } 4581 4582 /** 4583 * amdgpu_device_has_job_running - check if there is any job in mirror list 4584 * 4585 * @adev: amdgpu_device pointer 4586 * 4587 * check if there is any job in mirror list 4588 */ 4589 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4590 { 4591 int i; 4592 struct drm_sched_job *job; 4593 4594 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4595 struct amdgpu_ring *ring = adev->rings[i]; 4596 4597 if (!ring || !ring->sched.thread) 4598 continue; 4599 4600 spin_lock(&ring->sched.job_list_lock); 4601 job = list_first_entry_or_null(&ring->sched.pending_list, 4602 struct drm_sched_job, list); 4603 spin_unlock(&ring->sched.job_list_lock); 4604 if (job) 4605 return true; 4606 } 4607 return false; 4608 } 4609 4610 /** 4611 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4612 * 4613 * @adev: amdgpu_device pointer 4614 * 4615 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4616 * a hung GPU. 
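* With amdgpu_gpu_recovery set to -1 (auto), recovery stays disabled for the older ASICs listed in the switch below.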
4617 */ 4618 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4619 { 4620 4621 if (amdgpu_gpu_recovery == 0) 4622 goto disabled; 4623 4624 /* Skip soft reset check in fatal error mode */ 4625 if (!amdgpu_ras_is_poison_mode_supported(adev)) 4626 return true; 4627 4628 if (amdgpu_sriov_vf(adev)) 4629 return true; 4630 4631 if (amdgpu_gpu_recovery == -1) { 4632 switch (adev->asic_type) { 4633 #ifdef CONFIG_DRM_AMDGPU_SI 4634 case CHIP_VERDE: 4635 case CHIP_TAHITI: 4636 case CHIP_PITCAIRN: 4637 case CHIP_OLAND: 4638 case CHIP_HAINAN: 4639 #endif 4640 #ifdef CONFIG_DRM_AMDGPU_CIK 4641 case CHIP_KAVERI: 4642 case CHIP_KABINI: 4643 case CHIP_MULLINS: 4644 #endif 4645 case CHIP_CARRIZO: 4646 case CHIP_STONEY: 4647 case CHIP_CYAN_SKILLFISH: 4648 goto disabled; 4649 default: 4650 break; 4651 } 4652 } 4653 4654 return true; 4655 4656 disabled: 4657 dev_info(adev->dev, "GPU recovery disabled.\n"); 4658 return false; 4659 } 4660 4661 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4662 { 4663 u32 i; 4664 int ret = 0; 4665 4666 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4667 4668 dev_info(adev->dev, "GPU mode1 reset\n"); 4669 4670 /* disable BM */ 4671 pci_clear_master(adev->pdev); 4672 4673 amdgpu_device_cache_pci_state(adev->pdev); 4674 4675 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4676 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4677 ret = amdgpu_dpm_mode1_reset(adev); 4678 } else { 4679 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4680 ret = psp_gpu_reset(adev); 4681 } 4682 4683 if (ret) 4684 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4685 4686 amdgpu_device_load_pci_state(adev->pdev); 4687 4688 /* wait for asic to come out of reset */ 4689 for (i = 0; i < adev->usec_timeout; i++) { 4690 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4691 4692 if (memsize != 0xffffffff) 4693 break; 4694 udelay(1); 4695 } 4696 4697 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4698 return ret; 4699 } 4700 4701 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4702 struct amdgpu_reset_context *reset_context) 4703 { 4704 int i, r = 0; 4705 struct amdgpu_job *job = NULL; 4706 bool need_full_reset = 4707 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4708 4709 if (reset_context->reset_req_dev == adev) 4710 job = reset_context->job; 4711 4712 if (amdgpu_sriov_vf(adev)) { 4713 /* stop the data exchange thread */ 4714 amdgpu_virt_fini_data_exchange(adev); 4715 } 4716 4717 amdgpu_fence_driver_isr_toggle(adev, true); 4718 4719 /* block all schedulers and reset given job's ring */ 4720 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4721 struct amdgpu_ring *ring = adev->rings[i]; 4722 4723 if (!ring || !ring->sched.thread) 4724 continue; 4725 4726 /*clear job fence from fence drv to avoid force_completion 4727 *leave NULL and vm flush fence in fence drv */ 4728 amdgpu_fence_driver_clear_job_fences(ring); 4729 4730 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4731 amdgpu_fence_driver_force_completion(ring); 4732 } 4733 4734 amdgpu_fence_driver_isr_toggle(adev, false); 4735 4736 if (job && job->vm) 4737 drm_sched_increase_karma(&job->base); 4738 4739 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4740 /* If reset handler not implemented, continue; otherwise return */ 4741 if (r == -ENOSYS) 4742 r = 0; 4743 else 4744 return r; 4745 4746 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4747 if (!amdgpu_sriov_vf(adev)) { 4748 4749 if (!need_full_reset) 4750 need_full_reset = 
amdgpu_device_ip_need_full_reset(adev); 4751 4752 if (!need_full_reset && amdgpu_gpu_recovery && 4753 amdgpu_device_ip_check_soft_reset(adev)) { 4754 amdgpu_device_ip_pre_soft_reset(adev); 4755 r = amdgpu_device_ip_soft_reset(adev); 4756 amdgpu_device_ip_post_soft_reset(adev); 4757 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4758 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4759 need_full_reset = true; 4760 } 4761 } 4762 4763 if (need_full_reset) 4764 r = amdgpu_device_ip_suspend(adev); 4765 if (need_full_reset) 4766 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4767 else 4768 clear_bit(AMDGPU_NEED_FULL_RESET, 4769 &reset_context->flags); 4770 } 4771 4772 return r; 4773 } 4774 4775 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 4776 { 4777 int i; 4778 4779 lockdep_assert_held(&adev->reset_domain->sem); 4780 4781 for (i = 0; i < adev->num_regs; i++) { 4782 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); 4783 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], 4784 adev->reset_dump_reg_value[i]); 4785 } 4786 4787 return 0; 4788 } 4789 4790 #ifdef CONFIG_DEV_COREDUMP 4791 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, 4792 size_t count, void *data, size_t datalen) 4793 { 4794 struct drm_printer p; 4795 struct amdgpu_device *adev = data; 4796 struct drm_print_iterator iter; 4797 int i; 4798 4799 iter.data = buffer; 4800 iter.offset = 0; 4801 iter.start = offset; 4802 iter.remain = count; 4803 4804 p = drm_coredump_printer(&iter); 4805 4806 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 4807 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 4808 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 4809 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec); 4810 if (adev->reset_task_info.pid) 4811 drm_printf(&p, "process_name: %s PID: %d\n", 4812 adev->reset_task_info.process_name, 4813 adev->reset_task_info.pid); 4814 4815 if (adev->reset_vram_lost) 4816 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 4817 if (adev->num_regs) { 4818 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); 4819 4820 for (i = 0; i < adev->num_regs; i++) 4821 drm_printf(&p, "0x%08x: 0x%08x\n", 4822 adev->reset_dump_reg_list[i], 4823 adev->reset_dump_reg_value[i]); 4824 } 4825 4826 return count - iter.remain; 4827 } 4828 4829 static void amdgpu_devcoredump_free(void *data) 4830 { 4831 } 4832 4833 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) 4834 { 4835 struct drm_device *dev = adev_to_drm(adev); 4836 4837 ktime_get_ts64(&adev->reset_time); 4838 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL, 4839 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 4840 } 4841 #endif 4842 4843 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4844 struct amdgpu_reset_context *reset_context) 4845 { 4846 struct amdgpu_device *tmp_adev = NULL; 4847 bool need_full_reset, skip_hw_reset, vram_lost = false; 4848 int r = 0; 4849 bool gpu_reset_for_dev_remove = 0; 4850 4851 /* Try reset handler method first */ 4852 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4853 reset_list); 4854 amdgpu_reset_reg_dumps(tmp_adev); 4855 4856 reset_context->reset_device_list = device_list_handle; 4857 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4858 /* If reset handler not implemented, continue; otherwise return */ 4859 if (r == -ENOSYS) 4860 r = 0; 4861 else 4862 return r; 4863 4864 /* Reset handler not implemented, use the 
default method */ 4865 need_full_reset = 4866 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4867 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4868 4869 gpu_reset_for_dev_remove = 4870 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 4871 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4872 4873 /* 4874 * ASIC reset has to be done on all XGMI hive nodes ASAP 4875 * to allow proper links negotiation in FW (within 1 sec) 4876 */ 4877 if (!skip_hw_reset && need_full_reset) { 4878 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4879 /* For XGMI run all resets in parallel to speed up the process */ 4880 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4881 tmp_adev->gmc.xgmi.pending_reset = false; 4882 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4883 r = -EALREADY; 4884 } else 4885 r = amdgpu_asic_reset(tmp_adev); 4886 4887 if (r) { 4888 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4889 r, adev_to_drm(tmp_adev)->unique); 4890 break; 4891 } 4892 } 4893 4894 /* For XGMI wait for all resets to complete before proceed */ 4895 if (!r) { 4896 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4897 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4898 flush_work(&tmp_adev->xgmi_reset_work); 4899 r = tmp_adev->asic_reset_res; 4900 if (r) 4901 break; 4902 } 4903 } 4904 } 4905 } 4906 4907 if (!r && amdgpu_ras_intr_triggered()) { 4908 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4909 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && 4910 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 4911 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 4912 } 4913 4914 amdgpu_ras_intr_cleared(); 4915 } 4916 4917 /* Since the mode1 reset affects base ip blocks, the 4918 * phase1 ip blocks need to be resumed. Otherwise there 4919 * will be a BIOS signature error and the psp bootloader 4920 * can't load kdb on the next amdgpu install. 
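* This early phase1 resume is only taken on the device-remove path; the normal recovery path below re-posts the ASIC and resumes all IP blocks instead.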
4921 */ 4922 if (gpu_reset_for_dev_remove) { 4923 list_for_each_entry(tmp_adev, device_list_handle, reset_list) 4924 amdgpu_device_ip_resume_phase1(tmp_adev); 4925 4926 goto end; 4927 } 4928 4929 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4930 if (need_full_reset) { 4931 /* post card */ 4932 r = amdgpu_device_asic_init(tmp_adev); 4933 if (r) { 4934 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4935 } else { 4936 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4937 r = amdgpu_amdkfd_resume_iommu(tmp_adev); 4938 if (r) 4939 goto out; 4940 4941 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4942 if (r) 4943 goto out; 4944 4945 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4946 #ifdef CONFIG_DEV_COREDUMP 4947 tmp_adev->reset_vram_lost = vram_lost; 4948 memset(&tmp_adev->reset_task_info, 0, 4949 sizeof(tmp_adev->reset_task_info)); 4950 if (reset_context->job && reset_context->job->vm) 4951 tmp_adev->reset_task_info = 4952 reset_context->job->vm->task_info; 4953 amdgpu_reset_capture_coredumpm(tmp_adev); 4954 #endif 4955 if (vram_lost) { 4956 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4957 amdgpu_inc_vram_lost(tmp_adev); 4958 } 4959 4960 r = amdgpu_device_fw_loading(tmp_adev); 4961 if (r) 4962 return r; 4963 4964 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4965 if (r) 4966 goto out; 4967 4968 if (vram_lost) 4969 amdgpu_device_fill_reset_magic(tmp_adev); 4970 4971 /* 4972 * Add this ASIC as tracked as reset was already 4973 * complete successfully. 4974 */ 4975 amdgpu_register_gpu_instance(tmp_adev); 4976 4977 if (!reset_context->hive && 4978 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4979 amdgpu_xgmi_add_device(tmp_adev); 4980 4981 r = amdgpu_device_ip_late_init(tmp_adev); 4982 if (r) 4983 goto out; 4984 4985 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 4986 4987 /* 4988 * The GPU enters bad state once faulty pages 4989 * by ECC has reached the threshold, and ras 4990 * recovery is scheduled next. So add one check 4991 * here to break recovery if it indeed exceeds 4992 * bad page threshold, and remind user to 4993 * retire this GPU or setting one bigger 4994 * bad_page_threshold value to fix this once 4995 * probing driver again. 4996 */ 4997 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 4998 /* must succeed. 
*/ 4999 amdgpu_ras_resume(tmp_adev); 5000 } else { 5001 r = -EINVAL; 5002 goto out; 5003 } 5004 5005 /* Update PSP FW topology after reset */ 5006 if (reset_context->hive && 5007 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5008 r = amdgpu_xgmi_update_topology( 5009 reset_context->hive, tmp_adev); 5010 } 5011 } 5012 5013 out: 5014 if (!r) { 5015 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5016 r = amdgpu_ib_ring_tests(tmp_adev); 5017 if (r) { 5018 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5019 need_full_reset = true; 5020 r = -EAGAIN; 5021 goto end; 5022 } 5023 } 5024 5025 if (!r) 5026 r = amdgpu_device_recover_vram(tmp_adev); 5027 else 5028 tmp_adev->asic_reset_res = r; 5029 } 5030 5031 end: 5032 if (need_full_reset) 5033 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5034 else 5035 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5036 return r; 5037 } 5038 5039 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5040 { 5041 5042 switch (amdgpu_asic_reset_method(adev)) { 5043 case AMD_RESET_METHOD_MODE1: 5044 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5045 break; 5046 case AMD_RESET_METHOD_MODE2: 5047 adev->mp1_state = PP_MP1_STATE_RESET; 5048 break; 5049 default: 5050 adev->mp1_state = PP_MP1_STATE_NONE; 5051 break; 5052 } 5053 } 5054 5055 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5056 { 5057 amdgpu_vf_error_trans_all(adev); 5058 adev->mp1_state = PP_MP1_STATE_NONE; 5059 } 5060 5061 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5062 { 5063 struct pci_dev *p = NULL; 5064 5065 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5066 adev->pdev->bus->number, 1); 5067 if (p) { 5068 pm_runtime_enable(&(p->dev)); 5069 pm_runtime_resume(&(p->dev)); 5070 } 5071 5072 pci_dev_put(p); 5073 } 5074 5075 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5076 { 5077 enum amd_reset_method reset_method; 5078 struct pci_dev *p = NULL; 5079 u64 expires; 5080 5081 /* 5082 * For now, only BACO and mode1 reset are confirmed 5083 * to suffer the audio issue without proper suspended. 5084 */ 5085 reset_method = amdgpu_asic_reset_method(adev); 5086 if ((reset_method != AMD_RESET_METHOD_BACO) && 5087 (reset_method != AMD_RESET_METHOD_MODE1)) 5088 return -EINVAL; 5089 5090 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5091 adev->pdev->bus->number, 1); 5092 if (!p) 5093 return -ENODEV; 5094 5095 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5096 if (!expires) 5097 /* 5098 * If we cannot get the audio device autosuspend delay, 5099 * a fixed 4S interval will be used. Considering 3S is 5100 * the audio controller default autosuspend delay setting. 5101 * 4S used here is guaranteed to cover that. 5102 */ 5103 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5104 5105 while (!pm_runtime_status_suspended(&(p->dev))) { 5106 if (!pm_runtime_suspend(&(p->dev))) 5107 break; 5108 5109 if (expires < ktime_get_mono_fast_ns()) { 5110 dev_warn(adev->dev, "failed to suspend display audio\n"); 5111 pci_dev_put(p); 5112 /* TODO: abort the succeeding gpu reset? 
*/ 5113 return -ETIMEDOUT; 5114 } 5115 } 5116 5117 pm_runtime_disable(&(p->dev)); 5118 5119 pci_dev_put(p); 5120 return 0; 5121 } 5122 5123 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5124 { 5125 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5126 5127 #if defined(CONFIG_DEBUG_FS) 5128 if (!amdgpu_sriov_vf(adev)) 5129 cancel_work(&adev->reset_work); 5130 #endif 5131 5132 if (adev->kfd.dev) 5133 cancel_work(&adev->kfd.reset_work); 5134 5135 if (amdgpu_sriov_vf(adev)) 5136 cancel_work(&adev->virt.flr_work); 5137 5138 if (con && adev->ras_enabled) 5139 cancel_work(&con->recovery_work); 5140 5141 } 5142 5143 /** 5144 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5145 * 5146 * @adev: amdgpu_device pointer 5147 * @job: which job trigger hang 5148 * 5149 * Attempt to reset the GPU if it has hung (all asics). 5150 * Attempt to do soft-reset or full-reset and reinitialize Asic 5151 * Returns 0 for success or an error on failure. 5152 */ 5153 5154 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5155 struct amdgpu_job *job, 5156 struct amdgpu_reset_context *reset_context) 5157 { 5158 struct list_head device_list, *device_list_handle = NULL; 5159 bool job_signaled = false; 5160 struct amdgpu_hive_info *hive = NULL; 5161 struct amdgpu_device *tmp_adev = NULL; 5162 int i, r = 0; 5163 bool need_emergency_restart = false; 5164 bool audio_suspended = false; 5165 bool gpu_reset_for_dev_remove = false; 5166 5167 gpu_reset_for_dev_remove = 5168 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5169 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5170 5171 /* 5172 * Special case: RAS triggered and full reset isn't supported 5173 */ 5174 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5175 5176 /* 5177 * Flush RAM to disk so that after reboot 5178 * the user can read log and see why the system rebooted. 5179 */ 5180 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5181 DRM_WARN("Emergency reboot."); 5182 5183 ksys_sync_helper(); 5184 emergency_restart(); 5185 } 5186 5187 dev_info(adev->dev, "GPU %s begin!\n", 5188 need_emergency_restart ? "jobs stop":"reset"); 5189 5190 if (!amdgpu_sriov_vf(adev)) 5191 hive = amdgpu_get_xgmi_hive(adev); 5192 if (hive) 5193 mutex_lock(&hive->hive_lock); 5194 5195 reset_context->job = job; 5196 reset_context->hive = hive; 5197 /* 5198 * Build list of devices to reset. 5199 * In case we are in XGMI hive mode, resort the device list 5200 * to put adev in the 1st position. 
5201 */ 5202 INIT_LIST_HEAD(&device_list); 5203 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5204 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5205 list_add_tail(&tmp_adev->reset_list, &device_list); 5206 if (gpu_reset_for_dev_remove && adev->shutdown) 5207 tmp_adev->shutdown = true; 5208 } 5209 if (!list_is_first(&adev->reset_list, &device_list)) 5210 list_rotate_to_front(&adev->reset_list, &device_list); 5211 device_list_handle = &device_list; 5212 } else { 5213 list_add_tail(&adev->reset_list, &device_list); 5214 device_list_handle = &device_list; 5215 } 5216 5217 /* We need to lock reset domain only once both for XGMI and single device */ 5218 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5219 reset_list); 5220 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5221 5222 /* block all schedulers and reset given job's ring */ 5223 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5224 5225 amdgpu_device_set_mp1_state(tmp_adev); 5226 5227 /* 5228 * Try to put the audio codec into suspend state 5229 * before gpu reset started. 5230 * 5231 * The power domain of the graphics device 5232 * is shared with the AZ power domain. Without this, 5233 * we may change the audio hardware from behind 5234 * the audio driver's back. That will trigger 5235 * some audio codec errors. 5236 */ 5237 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5238 audio_suspended = true; 5239 5240 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5241 5242 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5243 5244 if (!amdgpu_sriov_vf(tmp_adev)) 5245 amdgpu_amdkfd_pre_reset(tmp_adev); 5246 5247 /* 5248 * Mark these ASICs to be reset as untracked first 5249 * and add them back after the reset completes 5250 */ 5251 amdgpu_unregister_gpu_instance(tmp_adev); 5252 5253 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5254 5255 /* disable ras on ALL IPs */ 5256 if (!need_emergency_restart && 5257 amdgpu_device_ip_need_full_reset(tmp_adev)) 5258 amdgpu_ras_suspend(tmp_adev); 5259 5260 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5261 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5262 5263 if (!ring || !ring->sched.thread) 5264 continue; 5265 5266 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5267 5268 if (need_emergency_restart) 5269 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5270 } 5271 atomic_inc(&tmp_adev->gpu_reset_counter); 5272 } 5273 5274 if (need_emergency_restart) 5275 goto skip_sched_resume; 5276 5277 /* 5278 * Must check guilty signal here since after this point all old 5279 * HW fences are force signaled. 5280 * 5281 * job->base holds a reference to parent fence 5282 */ 5283 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5284 job_signaled = true; 5285 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5286 goto skip_hw_reset; 5287 } 5288 5289 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5290 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5291 if (gpu_reset_for_dev_remove) { 5292 /* Workaround for ASICs that need to disable SMC first */ 5293 amdgpu_device_smu_fini_early(tmp_adev); 5294 } 5295 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5296 /* TODO: Should we stop? */ 5297 if (r) { 5298 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5299 r, adev_to_drm(tmp_adev)->unique); 5300 tmp_adev->asic_reset_res = r; 5301 } 5302 5303 /* 5304 * Drop all pending non-scheduler resets.
Scheduler resets 5305 * were already dropped during drm_sched_stop 5306 */ 5307 amdgpu_device_stop_pending_resets(tmp_adev); 5308 } 5309 5310 /* Actual ASIC resets if needed.*/ 5311 /* Host driver will handle XGMI hive reset for SRIOV */ 5312 if (amdgpu_sriov_vf(adev)) { 5313 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5314 if (r) 5315 adev->asic_reset_res = r; 5316 5317 /* Aldebaran supports ras in SRIOV, so need resume ras during reset */ 5318 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2)) 5319 amdgpu_ras_resume(adev); 5320 } else { 5321 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5322 if (r && r == -EAGAIN) 5323 goto retry; 5324 5325 if (!r && gpu_reset_for_dev_remove) 5326 goto recover_end; 5327 } 5328 5329 skip_hw_reset: 5330 5331 /* Post ASIC reset for all devs .*/ 5332 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5333 5334 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5335 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5336 5337 if (!ring || !ring->sched.thread) 5338 continue; 5339 5340 drm_sched_start(&ring->sched, true); 5341 } 5342 5343 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3)) 5344 amdgpu_mes_self_test(tmp_adev); 5345 5346 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) { 5347 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5348 } 5349 5350 if (tmp_adev->asic_reset_res) 5351 r = tmp_adev->asic_reset_res; 5352 5353 tmp_adev->asic_reset_res = 0; 5354 5355 if (r) { 5356 /* bad news, how to tell it to userspace ? */ 5357 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5358 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5359 } else { 5360 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5361 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5362 DRM_WARN("smart shift update failed\n"); 5363 } 5364 } 5365 5366 skip_sched_resume: 5367 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5368 /* unlock kfd: SRIOV would do it separately */ 5369 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5370 amdgpu_amdkfd_post_reset(tmp_adev); 5371 5372 /* kfd_post_reset will do nothing if kfd device is not initialized, 5373 * need to bring up kfd here if it's not be initialized before 5374 */ 5375 if (!adev->kfd.init_complete) 5376 amdgpu_amdkfd_device_init(adev); 5377 5378 if (audio_suspended) 5379 amdgpu_device_resume_display_audio(tmp_adev); 5380 5381 amdgpu_device_unset_mp1_state(tmp_adev); 5382 5383 amdgpu_ras_set_error_query_ready(tmp_adev, true); 5384 } 5385 5386 recover_end: 5387 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5388 reset_list); 5389 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5390 5391 if (hive) { 5392 mutex_unlock(&hive->hive_lock); 5393 amdgpu_put_xgmi_hive(hive); 5394 } 5395 5396 if (r) 5397 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5398 5399 atomic_set(&adev->reset_domain->reset_res, r); 5400 return r; 5401 } 5402 5403 /** 5404 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 5405 * 5406 * @adev: amdgpu_device pointer 5407 * 5408 * Fetchs and stores in the driver the PCIE capabilities (gen speed 5409 * and lanes) of the slot the device is in. Handles APUs and 5410 * virtualized environments where PCIE config space may not be available. 
5411 */ 5412 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5413 { 5414 struct pci_dev *pdev; 5415 enum pci_bus_speed speed_cap, platform_speed_cap; 5416 enum pcie_link_width platform_link_width; 5417 5418 if (amdgpu_pcie_gen_cap) 5419 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5420 5421 if (amdgpu_pcie_lane_cap) 5422 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5423 5424 /* covers APUs as well */ 5425 if (pci_is_root_bus(adev->pdev->bus)) { 5426 if (adev->pm.pcie_gen_mask == 0) 5427 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5428 if (adev->pm.pcie_mlw_mask == 0) 5429 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5430 return; 5431 } 5432 5433 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5434 return; 5435 5436 pcie_bandwidth_available(adev->pdev, NULL, 5437 &platform_speed_cap, &platform_link_width); 5438 5439 if (adev->pm.pcie_gen_mask == 0) { 5440 /* asic caps */ 5441 pdev = adev->pdev; 5442 speed_cap = pcie_get_speed_cap(pdev); 5443 if (speed_cap == PCI_SPEED_UNKNOWN) { 5444 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5445 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5446 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5447 } else { 5448 if (speed_cap == PCIE_SPEED_32_0GT) 5449 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5450 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5451 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5452 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5453 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5454 else if (speed_cap == PCIE_SPEED_16_0GT) 5455 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5456 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5457 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5458 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5459 else if (speed_cap == PCIE_SPEED_8_0GT) 5460 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5461 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5462 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5463 else if (speed_cap == PCIE_SPEED_5_0GT) 5464 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5465 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5466 else 5467 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5468 } 5469 /* platform caps */ 5470 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5471 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5472 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5473 } else { 5474 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5475 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5476 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5477 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5478 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5479 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5480 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5481 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5482 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5483 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5484 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5485 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5486 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5487 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5488 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5489 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5490 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5491 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5492 else 5493 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5494 5495 } 5496 } 5497 if (adev->pm.pcie_mlw_mask == 0) { 5498 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 5499 adev->pm.pcie_mlw_mask 
|= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5500 } else { 5501 switch (platform_link_width) { 5502 case PCIE_LNK_X32: 5503 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5504 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5505 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5506 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5507 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5508 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5509 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5510 break; 5511 case PCIE_LNK_X16: 5512 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5513 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5514 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5515 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5516 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5517 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5518 break; 5519 case PCIE_LNK_X12: 5520 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5521 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5522 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5523 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5524 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5525 break; 5526 case PCIE_LNK_X8: 5527 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5528 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5529 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5530 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5531 break; 5532 case PCIE_LNK_X4: 5533 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5534 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5535 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5536 break; 5537 case PCIE_LNK_X2: 5538 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5539 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5540 break; 5541 case PCIE_LNK_X1: 5542 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5543 break; 5544 default: 5545 break; 5546 } 5547 } 5548 } 5549 } 5550 5551 /** 5552 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 5553 * 5554 * @adev: amdgpu_device pointer 5555 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 5556 * 5557 * Return true if @peer_adev can access (DMA) @adev through the PCIe 5558 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 5559 * @peer_adev. 5560 */ 5561 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 5562 struct amdgpu_device *peer_adev) 5563 { 5564 #ifdef CONFIG_HSA_AMD_P2P 5565 uint64_t address_mask = peer_adev->dev->dma_mask ? 
5566 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 5567 resource_size_t aper_limit = 5568 adev->gmc.aper_base + adev->gmc.aper_size - 1; 5569 bool p2p_access = 5570 !adev->gmc.xgmi.connected_to_cpu && 5571 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 5572 5573 return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size && 5574 adev->gmc.real_vram_size == adev->gmc.visible_vram_size && 5575 !(adev->gmc.aper_base & address_mask || 5576 aper_limit & address_mask)); 5577 #else 5578 return false; 5579 #endif 5580 } 5581 5582 int amdgpu_device_baco_enter(struct drm_device *dev) 5583 { 5584 struct amdgpu_device *adev = drm_to_adev(dev); 5585 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5586 5587 if (!amdgpu_device_supports_baco(dev)) 5588 return -ENOTSUPP; 5589 5590 if (ras && adev->ras_enabled && 5591 adev->nbio.funcs->enable_doorbell_interrupt) 5592 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5593 5594 return amdgpu_dpm_baco_enter(adev); 5595 } 5596 5597 int amdgpu_device_baco_exit(struct drm_device *dev) 5598 { 5599 struct amdgpu_device *adev = drm_to_adev(dev); 5600 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5601 int ret = 0; 5602 5603 if (!amdgpu_device_supports_baco(dev)) 5604 return -ENOTSUPP; 5605 5606 ret = amdgpu_dpm_baco_exit(adev); 5607 if (ret) 5608 return ret; 5609 5610 if (ras && adev->ras_enabled && 5611 adev->nbio.funcs->enable_doorbell_interrupt) 5612 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5613 5614 if (amdgpu_passthrough(adev) && 5615 adev->nbio.funcs->clear_doorbell_interrupt) 5616 adev->nbio.funcs->clear_doorbell_interrupt(adev); 5617 5618 return 0; 5619 } 5620 5621 /** 5622 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5623 * @pdev: PCI device struct 5624 * @state: PCI channel state 5625 * 5626 * Description: Called when a PCI error is detected. 5627 * 5628 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
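* PCI_ERS_RESULT_CAN_RECOVER is returned when the channel state is pci_channel_io_normal.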

int amdgpu_device_baco_enter(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

	return amdgpu_dpm_baco_enter(adev);
}

int amdgpu_device_baco_exit(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	ret = amdgpu_dpm_baco_exit(adev);
	if (ret)
		return ret;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

	if (amdgpu_passthrough(adev) &&
	    adev->nbio.funcs->clear_doorbell_interrupt)
		adev->nbio.funcs->clear_doorbell_interrupt(adev);

	return 0;
}
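
/*
 * For context (sketch, not a definitive call chain): BACO ("Bus Active,
 * Chip Off") enter/exit is typically driven from the driver's runtime
 * power-management path, roughly:
 *
 *	runtime suspend  ->  amdgpu_device_baco_enter(drm_dev);
 *	runtime resume   ->  amdgpu_device_baco_exit(drm_dev);
 *
 * where drm_dev is the caller's struct drm_device. The doorbell-interrupt
 * toggling above keeps RAS doorbell handling quiet while the chip is
 * powered down.
 */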

/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called by the PCI error recovery core when an error is
 * detected on this device's PCI channel.
 *
 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		DRM_WARN("No support for XGMI hive yet...");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Locking adev->reset_domain->sem will prevent any external access
		 * to the GPU during PCI error recovery
		 */
		amdgpu_device_lock_reset_domain(adev->reset_domain);
		amdgpu_device_set_mp1_state(adev);

		/*
		 * Block any work scheduling, as we do for a regular GPU reset,
		 * for the duration of the recovery
		 */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, NULL);
		}
		atomic_inc(&adev->gpu_reset_counter);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{
	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/* This is called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Reads/writes to the device still
	 * work, so there is no need to reset the slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the PCI error recovery
 * code after the PCI slot has been reset, just before normal
 * operation resumes.
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for asic to come out of reset */
	msleep(500);

	/* Restore PCI config space */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d\n", r);
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it is
 * OK to resume normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		drm_sched_start(&ring->sched, true);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
}
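
/*
 * For reference (sketch; the actual definition lives in amdgpu_drv.c):
 * the four callbacks above are handed to the PCI core through a
 * struct pci_error_handlers attached to the driver's struct pci_driver,
 * along the lines of:
 *
 *	static const struct pci_error_handlers amdgpu_pci_err_handler = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};
 *
 * The PCI error recovery core then invokes them in the order
 * error_detected -> (mmio_enabled or slot_reset) -> resume.
 */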

/* Save the current PCI config space and cache it for restore after a reset */
bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state\n");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

/* Load the previously cached PCI config space back into the device */
bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
			     struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
				  struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring the hardware to some kind of halt state so that no one can touch
 * it any more. This helps preserve the error context when an error occurs.
 * Compared to a simple hang, the system stays stable at least for SSH
 * access, so it should be trivial to inspect the hardware state and see
 * what is going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs,
 *    etc.), clears all CPU mappings to the device and disallows remapping
 *    through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in-flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

/* Read a PCIe port register through the NBIO index/data register pair */
u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

/* Write a PCIe port register through the NBIO index/data register pair */
void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang, or a reference to the
 * current gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}
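
/*
 * Caller pattern (sketch only): submission code that wants its gang to
 * become the active one treats a non-NULL return as "wait for, or depend
 * on, this fence first" and retries once it has signaled, e.g.:
 *
 *	struct dma_fence *old;
 *
 *	old = amdgpu_device_switch_gang(adev, my_gang_fence);
 *	if (old) {
 *		// add 'old' as a dependency (or wait for it), then retry
 *		dma_fence_put(old);
 *	}
 *
 * my_gang_fence is a placeholder for the submitter's gang-leader fence;
 * the returned reference must be dropped by the caller.
 */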

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!adev->ip_versions[DCE_HWIP][0] ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}
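
/*
 * Usage sketch (illustrative): callers use this to skip display-related
 * setup on headless parts, roughly:
 *
 *	if (!amdgpu_device_has_display_hardware(adev))
 *		return 0;	// nothing to initialize for display
 *
 * For pre-discovery ASICs the answer comes from the explicit per-chip
 * lists above; for newer parts it is derived from the discovered DCE IP
 * version and the DMU harvest mask.
 */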