/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/devcoredump.h>
#include <generated/utsrelease.h>
#include <linux/pci-p2pdma.h>

#include <drm/drm_aperture.h>
#include <drm/drm_atomic_helper.h>
#include <drm/drm_fb_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
"POLARIS12", 115 "VEGAM", 116 "VEGA10", 117 "VEGA12", 118 "VEGA20", 119 "RAVEN", 120 "ARCTURUS", 121 "RENOIR", 122 "ALDEBARAN", 123 "NAVI10", 124 "CYAN_SKILLFISH", 125 "NAVI14", 126 "NAVI12", 127 "SIENNA_CICHLID", 128 "NAVY_FLOUNDER", 129 "VANGOGH", 130 "DIMGREY_CAVEFISH", 131 "BEIGE_GOBY", 132 "YELLOW_CARP", 133 "IP DISCOVERY", 134 "LAST", 135 }; 136 137 /** 138 * DOC: pcie_replay_count 139 * 140 * The amdgpu driver provides a sysfs API for reporting the total number 141 * of PCIe replays (NAKs) 142 * The file pcie_replay_count is used for this and returns the total 143 * number of replays as a sum of the NAKs generated and NAKs received 144 */ 145 146 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 147 struct device_attribute *attr, char *buf) 148 { 149 struct drm_device *ddev = dev_get_drvdata(dev); 150 struct amdgpu_device *adev = drm_to_adev(ddev); 151 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 152 153 return sysfs_emit(buf, "%llu\n", cnt); 154 } 155 156 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 157 amdgpu_device_get_pcie_replay_count, NULL); 158 159 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 160 161 /** 162 * DOC: product_name 163 * 164 * The amdgpu driver provides a sysfs API for reporting the product name 165 * for the device 166 * The file serial_number is used for this and returns the product name 167 * as returned from the FRU. 168 * NOTE: This is only available for certain server cards 169 */ 170 171 static ssize_t amdgpu_device_get_product_name(struct device *dev, 172 struct device_attribute *attr, char *buf) 173 { 174 struct drm_device *ddev = dev_get_drvdata(dev); 175 struct amdgpu_device *adev = drm_to_adev(ddev); 176 177 return sysfs_emit(buf, "%s\n", adev->product_name); 178 } 179 180 static DEVICE_ATTR(product_name, S_IRUGO, 181 amdgpu_device_get_product_name, NULL); 182 183 /** 184 * DOC: product_number 185 * 186 * The amdgpu driver provides a sysfs API for reporting the part number 187 * for the device 188 * The file serial_number is used for this and returns the part number 189 * as returned from the FRU. 190 * NOTE: This is only available for certain server cards 191 */ 192 193 static ssize_t amdgpu_device_get_product_number(struct device *dev, 194 struct device_attribute *attr, char *buf) 195 { 196 struct drm_device *ddev = dev_get_drvdata(dev); 197 struct amdgpu_device *adev = drm_to_adev(ddev); 198 199 return sysfs_emit(buf, "%s\n", adev->product_number); 200 } 201 202 static DEVICE_ATTR(product_number, S_IRUGO, 203 amdgpu_device_get_product_number, NULL); 204 205 /** 206 * DOC: serial_number 207 * 208 * The amdgpu driver provides a sysfs API for reporting the serial number 209 * for the device 210 * The file serial_number is used for this and returns the serial number 211 * as returned from the FRU. 212 * NOTE: This is only available for certain server cards 213 */ 214 215 static ssize_t amdgpu_device_get_serial_number(struct device *dev, 216 struct device_attribute *attr, char *buf) 217 { 218 struct drm_device *ddev = dev_get_drvdata(dev); 219 struct amdgpu_device *adev = drm_to_adev(ddev); 220 221 return sysfs_emit(buf, "%s\n", adev->serial); 222 } 223 224 static DEVICE_ATTR(serial_number, S_IRUGO, 225 amdgpu_device_get_serial_number, NULL); 226 227 /** 228 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 229 * 230 * @dev: drm_device pointer 231 * 232 * Returns true if the device is a dGPU with ATPX power control, 233 * otherwise return false. 
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise return false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram via the vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of the vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
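 *
 * Summary of the dispatch implemented below: offsets that fall within the
 * MMIO BAR are read with readl() (or through the KIQ when running as an
 * SR-IOV guest at runtime), while offsets beyond rmmio_size fall back to
 * the indirect PCIe index/data pair via adev->pcie_rreg().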
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 *
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 *
 */
/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 *
 * This function is invoked only for the debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_mm_rdoorbell - read a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (CIK).
 */
u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return readl(adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell - write a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (CIK).
 */
void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (index < adev->doorbell.num_doorbells) {
		writel(v, adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (index < adev->doorbell.num_doorbells) {
		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 pcie_index, u32 pcie_data,
				u32 reg_addr)
{
	unsigned long flags;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64 bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 pcie_index, u32 pcie_data,
				  u32 reg_addr)
{
	unsigned long flags;
	u64 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 pcie_index, u32 pcie_data,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64 bit indirect register address
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 pcie_index, u32 pcie_data,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	amdgpu_asic_pre_asic_init(adev);

	if (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0))
		return amdgpu_atomfirmware_asic_init(adev, true);
	else
		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * GPU doorbell aperture helper functions.
 */
/**
 * amdgpu_device_doorbell_init - Init doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Init doorbell driver information (CIK)
 * Returns 0 on success, error on failure.
 */
static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
{

	/* No doorbell on SI hardware generation */
	if (adev->asic_type < CHIP_BONAIRE) {
		adev->doorbell.base = 0;
		adev->doorbell.size = 0;
		adev->doorbell.num_doorbells = 0;
		adev->doorbell.ptr = NULL;
		return 0;
	}

	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
		return -EINVAL;

	amdgpu_asic_init_doorbell_index(adev);

	/* doorbell bar mapping */
	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
	adev->doorbell.size = pci_resource_len(adev->pdev, 2);

	if (adev->enable_mes) {
		adev->doorbell.num_doorbells =
			adev->doorbell.size / sizeof(u32);
	} else {
		adev->doorbell.num_doorbells =
			min_t(u32, adev->doorbell.size / sizeof(u32),
			      adev->doorbell_index.max_assignment + 1);
		if (adev->doorbell.num_doorbells == 0)
			return -EINVAL;

		/* For Vega, reserve and map two pages on doorbell BAR since SDMA
		 * paging queue doorbell use the second page. The
		 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
		 * doorbells are in the first page. So with paging queue enabled,
		 * the max num_doorbells should + 1 page (0x400 in dword)
		 */
		if (adev->asic_type >= CHIP_VEGA10)
			adev->doorbell.num_doorbells += 0x400;
	}

	adev->doorbell.ptr = ioremap(adev->doorbell.base,
				     adev->doorbell.num_doorbells *
				     sizeof(u32));
	if (adev->doorbell.ptr == NULL)
		return -ENOMEM;

	return 0;
}

/**
 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down doorbell driver information (CIK)
 */
static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
{
	iounmap(adev->doorbell.ptr);
	adev->doorbell.ptr = NULL;
}



/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or an -error on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned i;
	u16 cmd;
	int r;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_device_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_device_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw needs to be posted or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if a post is needed or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old smc fw still need the driver to do a vPost, otherwise the gpu hangs,
		 * while smc fw versions above 22.15 don't have this flaw, so we force
		 * vPost to be executed for smc versions below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;
			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if an error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->gmc.xgmi.pending_reset)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	return pcie_aspm_enabled(adev->pdev);
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
		bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in the page table versus the
 * page directory; a page is 4KB so we have 12 bits of offset, a minimum of
 * 9 bits in the page table and the remaining bits are in the page directory.
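 *
 * Worked example (derived from the sizes above): with the minimum block
 * size of 9 bits, each page table maps 2^(9 + 12) bytes, i.e. 2MB of
 * address space per page directory entry.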
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if ((adev->pdev->device == 0x13FE) ||
		    (adev->pdev->device == 0x143F))
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		break;
	}

	return 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
		dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
		amdgpu_reset_method = -1;
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	return 0;
}

/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(pdev, PCI_D0);
		amdgpu_device_load_pci_state(pdev);
		r = pci_enable_device(pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
	} else {
		pr_info("switched off\n");
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(pdev);
		/* Shut down the device */
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u64 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if equal or greater
 * return 1 if smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	switch (ip_block_version->type) {
	case AMD_IP_BLOCK_TYPE_VCN:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
			return 0;
		break;
	case AMD_IP_BLOCK_TYPE_JPEG:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
			return 0;
		break;
	default:
		break;
	}

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		const char *pci_address_name = pci_name(adev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}

void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
		adev->mode_info.num_crtc = 1;
		adev->enable_virtual_display = true;
		DRM_INFO("virtual_display:%d, num_crtc:%d\n",
			 adev->enable_virtual_display, adev->mode_info.num_crtc);
	}
}

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	char fw_name[40];
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	if (adev->mman.discovery_bin) {
		/*
		 * FIXME: The bounding box is still needed by Navi12, so
		 * temporarily read it from gpu_info firmware. Should be dropped
		 * when DAL no longer needs it.
1957 */
1958 if (adev->asic_type != CHIP_NAVI12)
1959 return 0;
1960 }
1961
1962 switch (adev->asic_type) {
1963 default:
1964 return 0;
1965 case CHIP_VEGA10:
1966 chip_name = "vega10";
1967 break;
1968 case CHIP_VEGA12:
1969 chip_name = "vega12";
1970 break;
1971 case CHIP_RAVEN:
1972 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1973 chip_name = "raven2";
1974 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1975 chip_name = "picasso";
1976 else
1977 chip_name = "raven";
1978 break;
1979 case CHIP_ARCTURUS:
1980 chip_name = "arcturus";
1981 break;
1982 case CHIP_NAVI12:
1983 chip_name = "navi12";
1984 break;
1985 }
1986
1987 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1988 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name);
1989 if (err) {
1990 dev_err(adev->dev,
1991 "Failed to get gpu_info firmware \"%s\"\n",
1992 fw_name);
1993 goto out;
1994 }
1995
1996 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1997 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1998
1999 switch (hdr->version_major) {
2000 case 1:
2001 {
2002 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
2003 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
2004 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2005
2006 /*
2007 * Should be dropped when DAL no longer needs it.
2008 */
2009 if (adev->asic_type == CHIP_NAVI12)
2010 goto parse_soc_bounding_box;
2011
2012 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
2013 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
2014 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
2015 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
2016 adev->gfx.config.max_texture_channel_caches =
2017 le32_to_cpu(gpu_info_fw->gc_num_tccs);
2018 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
2019 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
2020 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
2021 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
2022 adev->gfx.config.double_offchip_lds_buf =
2023 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
2024 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
2025 adev->gfx.cu_info.max_waves_per_simd =
2026 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
2027 adev->gfx.cu_info.max_scratch_slots_per_cu =
2028 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2029 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
2030 if (hdr->version_minor >= 1) {
2031 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2032 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2033 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2034 adev->gfx.config.num_sc_per_sh =
2035 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2036 adev->gfx.config.num_packer_per_sc =
2037 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2038 }
2039
2040 parse_soc_bounding_box:
2041 /*
2042 * soc bounding box info is not integrated in the discovery table,
2043 * we always need to parse it from gpu info firmware if needed.
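 * Only the v1.2 gpu_info header actually carries the SOC bounding box, so
 * the block just below is skipped for older firmware images.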
2044 */ 2045 if (hdr->version_minor == 2) { 2046 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2047 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2048 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2049 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2050 } 2051 break; 2052 } 2053 default: 2054 dev_err(adev->dev, 2055 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2056 err = -EINVAL; 2057 goto out; 2058 } 2059 out: 2060 return err; 2061 } 2062 2063 /** 2064 * amdgpu_device_ip_early_init - run early init for hardware IPs 2065 * 2066 * @adev: amdgpu_device pointer 2067 * 2068 * Early initialization pass for hardware IPs. The hardware IPs that make 2069 * up each asic are discovered each IP's early_init callback is run. This 2070 * is the first stage in initializing the asic. 2071 * Returns 0 on success, negative error code on failure. 2072 */ 2073 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2074 { 2075 struct drm_device *dev = adev_to_drm(adev); 2076 struct pci_dev *parent; 2077 int i, r; 2078 2079 amdgpu_device_enable_virtual_display(adev); 2080 2081 if (amdgpu_sriov_vf(adev)) { 2082 r = amdgpu_virt_request_full_gpu(adev, true); 2083 if (r) 2084 return r; 2085 } 2086 2087 switch (adev->asic_type) { 2088 #ifdef CONFIG_DRM_AMDGPU_SI 2089 case CHIP_VERDE: 2090 case CHIP_TAHITI: 2091 case CHIP_PITCAIRN: 2092 case CHIP_OLAND: 2093 case CHIP_HAINAN: 2094 adev->family = AMDGPU_FAMILY_SI; 2095 r = si_set_ip_blocks(adev); 2096 if (r) 2097 return r; 2098 break; 2099 #endif 2100 #ifdef CONFIG_DRM_AMDGPU_CIK 2101 case CHIP_BONAIRE: 2102 case CHIP_HAWAII: 2103 case CHIP_KAVERI: 2104 case CHIP_KABINI: 2105 case CHIP_MULLINS: 2106 if (adev->flags & AMD_IS_APU) 2107 adev->family = AMDGPU_FAMILY_KV; 2108 else 2109 adev->family = AMDGPU_FAMILY_CI; 2110 2111 r = cik_set_ip_blocks(adev); 2112 if (r) 2113 return r; 2114 break; 2115 #endif 2116 case CHIP_TOPAZ: 2117 case CHIP_TONGA: 2118 case CHIP_FIJI: 2119 case CHIP_POLARIS10: 2120 case CHIP_POLARIS11: 2121 case CHIP_POLARIS12: 2122 case CHIP_VEGAM: 2123 case CHIP_CARRIZO: 2124 case CHIP_STONEY: 2125 if (adev->flags & AMD_IS_APU) 2126 adev->family = AMDGPU_FAMILY_CZ; 2127 else 2128 adev->family = AMDGPU_FAMILY_VI; 2129 2130 r = vi_set_ip_blocks(adev); 2131 if (r) 2132 return r; 2133 break; 2134 default: 2135 r = amdgpu_discovery_set_ip_blocks(adev); 2136 if (r) 2137 return r; 2138 break; 2139 } 2140 2141 if (amdgpu_has_atpx() && 2142 (amdgpu_is_atpx_hybrid() || 2143 amdgpu_has_atpx_dgpu_power_cntl()) && 2144 ((adev->flags & AMD_IS_APU) == 0) && 2145 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev))) 2146 adev->flags |= AMD_IS_PX; 2147 2148 if (!(adev->flags & AMD_IS_APU)) { 2149 parent = pci_upstream_bridge(adev->pdev); 2150 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2151 } 2152 2153 amdgpu_amdkfd_device_probe(adev); 2154 2155 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2156 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2157 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2158 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2159 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2160 2161 for (i = 0; i < adev->num_ip_blocks; i++) { 2162 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2163 DRM_ERROR("disabled ip block: %d <%s>\n", 2164 i, adev->ip_blocks[i].version->funcs->name); 2165 adev->ip_blocks[i].status.valid = false; 2166 } else { 2167 if (adev->ip_blocks[i].version->funcs->early_init) { 2168 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2169 if (r == -ENOENT) { 2170 adev->ip_blocks[i].status.valid = false; 2171 } else if (r) { 2172 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2173 adev->ip_blocks[i].version->funcs->name, r); 2174 return r; 2175 } else { 2176 adev->ip_blocks[i].status.valid = true; 2177 } 2178 } else { 2179 adev->ip_blocks[i].status.valid = true; 2180 } 2181 } 2182 /* get the vbios after the asic_funcs are set up */ 2183 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2184 r = amdgpu_device_parse_gpu_info_fw(adev); 2185 if (r) 2186 return r; 2187 2188 /* Read BIOS */ 2189 if (!amdgpu_get_bios(adev)) 2190 return -EINVAL; 2191 2192 r = amdgpu_atombios_init(adev); 2193 if (r) { 2194 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2195 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2196 return r; 2197 } 2198 2199 /*get pf2vf msg info at it's earliest time*/ 2200 if (amdgpu_sriov_vf(adev)) 2201 amdgpu_virt_init_data_exchange(adev); 2202 2203 } 2204 } 2205 2206 adev->cg_flags &= amdgpu_cg_mask; 2207 adev->pg_flags &= amdgpu_pg_mask; 2208 2209 return 0; 2210 } 2211 2212 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2213 { 2214 int i, r; 2215 2216 for (i = 0; i < adev->num_ip_blocks; i++) { 2217 if (!adev->ip_blocks[i].status.sw) 2218 continue; 2219 if (adev->ip_blocks[i].status.hw) 2220 continue; 2221 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2222 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2223 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2224 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2225 if (r) { 2226 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2227 adev->ip_blocks[i].version->funcs->name, r); 2228 return r; 2229 } 2230 adev->ip_blocks[i].status.hw = true; 2231 } 2232 } 2233 2234 return 0; 2235 } 2236 2237 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2238 { 2239 int i, r; 2240 2241 for (i = 0; i < adev->num_ip_blocks; i++) { 2242 if (!adev->ip_blocks[i].status.sw) 2243 continue; 2244 if (adev->ip_blocks[i].status.hw) 2245 continue; 2246 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2247 if (r) { 2248 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2249 adev->ip_blocks[i].version->funcs->name, r); 2250 return r; 2251 } 2252 adev->ip_blocks[i].status.hw = true; 2253 } 2254 2255 return 0; 2256 } 2257 2258 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2259 { 2260 int r = 0; 2261 int i; 2262 uint32_t smu_version; 2263 2264 if (adev->asic_type >= CHIP_VEGA10) { 2265 for (i = 0; i < adev->num_ip_blocks; i++) { 2266 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2267 continue; 2268 2269 if 
(!adev->ip_blocks[i].status.sw) 2270 continue; 2271 2272 /* no need to do the fw loading again if already done*/ 2273 if (adev->ip_blocks[i].status.hw == true) 2274 break; 2275 2276 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2277 r = adev->ip_blocks[i].version->funcs->resume(adev); 2278 if (r) { 2279 DRM_ERROR("resume of IP block <%s> failed %d\n", 2280 adev->ip_blocks[i].version->funcs->name, r); 2281 return r; 2282 } 2283 } else { 2284 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2285 if (r) { 2286 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2287 adev->ip_blocks[i].version->funcs->name, r); 2288 return r; 2289 } 2290 } 2291 2292 adev->ip_blocks[i].status.hw = true; 2293 break; 2294 } 2295 } 2296 2297 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2298 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2299 2300 return r; 2301 } 2302 2303 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2304 { 2305 long timeout; 2306 int r, i; 2307 2308 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2309 struct amdgpu_ring *ring = adev->rings[i]; 2310 2311 /* No need to setup the GPU scheduler for rings that don't need it */ 2312 if (!ring || ring->no_scheduler) 2313 continue; 2314 2315 switch (ring->funcs->type) { 2316 case AMDGPU_RING_TYPE_GFX: 2317 timeout = adev->gfx_timeout; 2318 break; 2319 case AMDGPU_RING_TYPE_COMPUTE: 2320 timeout = adev->compute_timeout; 2321 break; 2322 case AMDGPU_RING_TYPE_SDMA: 2323 timeout = adev->sdma_timeout; 2324 break; 2325 default: 2326 timeout = adev->video_timeout; 2327 break; 2328 } 2329 2330 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2331 ring->num_hw_submission, amdgpu_job_hang_limit, 2332 timeout, adev->reset_domain->wq, 2333 ring->sched_score, ring->name, 2334 adev->dev); 2335 if (r) { 2336 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2337 ring->name); 2338 return r; 2339 } 2340 } 2341 2342 return 0; 2343 } 2344 2345 2346 /** 2347 * amdgpu_device_ip_init - run init for hardware IPs 2348 * 2349 * @adev: amdgpu_device pointer 2350 * 2351 * Main initialization pass for hardware IPs. The list of all the hardware 2352 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2353 * are run. sw_init initializes the software state associated with each IP 2354 * and hw_init initializes the hardware associated with each IP. 2355 * Returns 0 on success, negative error code on failure. 
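 * The hw_init half is itself staged: COMMON and GMC are brought up inside
 * the sw_init loop so that GPU memory is usable early, phase 1 then handles
 * IH (and PSP when running SR-IOV), firmware is loaded, and phase 2
 * initializes the remaining blocks.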
2356 */ 2357 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2358 { 2359 int i, r; 2360 2361 r = amdgpu_ras_init(adev); 2362 if (r) 2363 return r; 2364 2365 for (i = 0; i < adev->num_ip_blocks; i++) { 2366 if (!adev->ip_blocks[i].status.valid) 2367 continue; 2368 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2369 if (r) { 2370 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2371 adev->ip_blocks[i].version->funcs->name, r); 2372 goto init_failed; 2373 } 2374 adev->ip_blocks[i].status.sw = true; 2375 2376 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2377 /* need to do common hw init early so everything is set up for gmc */ 2378 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2379 if (r) { 2380 DRM_ERROR("hw_init %d failed %d\n", i, r); 2381 goto init_failed; 2382 } 2383 adev->ip_blocks[i].status.hw = true; 2384 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2385 /* need to do gmc hw init early so we can allocate gpu mem */ 2386 /* Try to reserve bad pages early */ 2387 if (amdgpu_sriov_vf(adev)) 2388 amdgpu_virt_exchange_data(adev); 2389 2390 r = amdgpu_device_mem_scratch_init(adev); 2391 if (r) { 2392 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2393 goto init_failed; 2394 } 2395 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2396 if (r) { 2397 DRM_ERROR("hw_init %d failed %d\n", i, r); 2398 goto init_failed; 2399 } 2400 r = amdgpu_device_wb_init(adev); 2401 if (r) { 2402 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2403 goto init_failed; 2404 } 2405 adev->ip_blocks[i].status.hw = true; 2406 2407 /* right after GMC hw init, we create CSA */ 2408 if (amdgpu_mcbp) { 2409 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2410 AMDGPU_GEM_DOMAIN_VRAM | 2411 AMDGPU_GEM_DOMAIN_GTT, 2412 AMDGPU_CSA_SIZE); 2413 if (r) { 2414 DRM_ERROR("allocate CSA failed %d\n", r); 2415 goto init_failed; 2416 } 2417 } 2418 } 2419 } 2420 2421 if (amdgpu_sriov_vf(adev)) 2422 amdgpu_virt_init_data_exchange(adev); 2423 2424 r = amdgpu_ib_pool_init(adev); 2425 if (r) { 2426 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2427 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2428 goto init_failed; 2429 } 2430 2431 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2432 if (r) 2433 goto init_failed; 2434 2435 r = amdgpu_device_ip_hw_init_phase1(adev); 2436 if (r) 2437 goto init_failed; 2438 2439 r = amdgpu_device_fw_loading(adev); 2440 if (r) 2441 goto init_failed; 2442 2443 r = amdgpu_device_ip_hw_init_phase2(adev); 2444 if (r) 2445 goto init_failed; 2446 2447 /* 2448 * retired pages will be loaded from eeprom and reserved here, 2449 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2450 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2451 * for I2C communication which only true at this point. 2452 * 2453 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2454 * failure from bad gpu situation and stop amdgpu init process 2455 * accordingly. For other failed cases, it will still release all 2456 * the resource and print error message, rather than returning one 2457 * negative value to upper level. 
2458 *
2459 * Note: theoretically, this should be called before all vram allocations
2460 * to protect the retired pages from being abused
2461 */
2462 r = amdgpu_ras_recovery_init(adev);
2463 if (r)
2464 goto init_failed;
2465
2466 /*
2467 * In case of XGMI, grab an extra reference on the reset domain for this device
2468 */
2469 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2470 if (amdgpu_xgmi_add_device(adev) == 0) {
2471 if (!amdgpu_sriov_vf(adev)) {
2472 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2473
2474 if (WARN_ON(!hive)) {
2475 r = -ENOENT;
2476 goto init_failed;
2477 }
2478
2479 if (!hive->reset_domain ||
2480 !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
2481 r = -ENOENT;
2482 amdgpu_put_xgmi_hive(hive);
2483 goto init_failed;
2484 }
2485
2486 /* Drop the early temporary reset domain we created for this device */
2487 amdgpu_reset_put_reset_domain(adev->reset_domain);
2488 adev->reset_domain = hive->reset_domain;
2489 amdgpu_put_xgmi_hive(hive);
2490 }
2491 }
2492 }
2493
2494 r = amdgpu_device_init_schedulers(adev);
2495 if (r)
2496 goto init_failed;
2497
2498 /* Don't init kfd if the whole hive needs to be reset during init */
2499 if (!adev->gmc.xgmi.pending_reset)
2500 amdgpu_amdkfd_device_init(adev);
2501
2502 amdgpu_fru_get_product_info(adev);
2503
2504 init_failed:
2505 if (amdgpu_sriov_vf(adev))
2506 amdgpu_virt_release_full_gpu(adev, true);
2507
2508 return r;
2509 }
2510
2511 /**
2512 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2513 *
2514 * @adev: amdgpu_device pointer
2515 *
2516 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2517 * this function before a GPU reset. If the value is retained after a
2518 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2519 */
2520 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2521 {
2522 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2523 }
2524
2525 /**
2526 * amdgpu_device_check_vram_lost - check if vram is valid
2527 *
2528 * @adev: amdgpu_device pointer
2529 *
2530 * Checks the reset magic value written to the gart pointer in VRAM.
2531 * The driver calls this after a GPU reset to see if the contents of
2532 * VRAM are lost or not.
2533 * returns true if vram is lost, false if not.
2534 */
2535 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2536 {
2537 if (memcmp(adev->gart.ptr, adev->reset_magic,
2538 AMDGPU_RESET_MAGIC_NUM))
2539 return true;
2540
2541 if (!amdgpu_in_reset(adev))
2542 return false;
2543
2544 /*
2545 * For all ASICs with baco/mode1 reset, the VRAM is
2546 * always assumed to be lost.
2547 */
2548 switch (amdgpu_asic_reset_method(adev)) {
2549 case AMD_RESET_METHOD_BACO:
2550 case AMD_RESET_METHOD_MODE1:
2551 return true;
2552 default:
2553 return false;
2554 }
2555 }
2556
2557 /**
2558 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2559 *
2560 * @adev: amdgpu_device pointer
2561 * @state: clockgating state (gate or ungate)
2562 *
2563 * The list of all the hardware IPs that make up the asic is walked and the
2564 * set_clockgating_state callbacks are run.
2565 * The late initialization pass enables clockgating for hardware IPs.
2566 * The fini or suspend pass disables clockgating for hardware IPs.
2567 * Returns 0 on success, negative error code on failure.
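 * Note that the walk direction follows @state: gating is applied in IP list
 * order, while ungating walks the list in reverse so blocks are ungated in
 * the opposite order to which they were gated.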
2568 */ 2569 2570 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2571 enum amd_clockgating_state state) 2572 { 2573 int i, j, r; 2574 2575 if (amdgpu_emu_mode == 1) 2576 return 0; 2577 2578 for (j = 0; j < adev->num_ip_blocks; j++) { 2579 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2580 if (!adev->ip_blocks[i].status.late_initialized) 2581 continue; 2582 /* skip CG for GFX, SDMA on S0ix */ 2583 if (adev->in_s0ix && 2584 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2585 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2586 continue; 2587 /* skip CG for VCE/UVD, it's handled specially */ 2588 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2589 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2590 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2591 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2592 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2593 /* enable clockgating to save power */ 2594 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2595 state); 2596 if (r) { 2597 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2598 adev->ip_blocks[i].version->funcs->name, r); 2599 return r; 2600 } 2601 } 2602 } 2603 2604 return 0; 2605 } 2606 2607 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2608 enum amd_powergating_state state) 2609 { 2610 int i, j, r; 2611 2612 if (amdgpu_emu_mode == 1) 2613 return 0; 2614 2615 for (j = 0; j < adev->num_ip_blocks; j++) { 2616 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2617 if (!adev->ip_blocks[i].status.late_initialized) 2618 continue; 2619 /* skip PG for GFX, SDMA on S0ix */ 2620 if (adev->in_s0ix && 2621 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2622 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2623 continue; 2624 /* skip CG for VCE/UVD, it's handled specially */ 2625 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2626 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2627 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2628 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2629 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2630 /* enable powergating to save power */ 2631 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2632 state); 2633 if (r) { 2634 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2635 adev->ip_blocks[i].version->funcs->name, r); 2636 return r; 2637 } 2638 } 2639 } 2640 return 0; 2641 } 2642 2643 static int amdgpu_device_enable_mgpu_fan_boost(void) 2644 { 2645 struct amdgpu_gpu_instance *gpu_ins; 2646 struct amdgpu_device *adev; 2647 int i, ret = 0; 2648 2649 mutex_lock(&mgpu_info.mutex); 2650 2651 /* 2652 * MGPU fan boost feature should be enabled 2653 * only when there are two or more dGPUs in 2654 * the system 2655 */ 2656 if (mgpu_info.num_dgpu < 2) 2657 goto out; 2658 2659 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2660 gpu_ins = &(mgpu_info.gpu_ins[i]); 2661 adev = gpu_ins->adev; 2662 if (!(adev->flags & AMD_IS_APU) && 2663 !gpu_ins->mgpu_fan_enabled) { 2664 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2665 if (ret) 2666 break; 2667 2668 gpu_ins->mgpu_fan_enabled = 1; 2669 } 2670 } 2671 2672 out: 2673 mutex_unlock(&mgpu_info.mutex); 2674 2675 return ret; 2676 } 2677 2678 /** 2679 * amdgpu_device_ip_late_init - run late init for hardware IPs 2680 * 2681 * @adev: 
amdgpu_device pointer
2682 *
2683 * Late initialization pass for hardware IPs. The list of all the hardware
2684 * IPs that make up the asic is walked and the late_init callbacks are run.
2685 * late_init covers any special initialization that an IP requires
2686 * after all of the IPs have been initialized or something that needs to happen
2687 * late in the init process.
2688 * Returns 0 on success, negative error code on failure.
2689 */
2690 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2691 {
2692 struct amdgpu_gpu_instance *gpu_instance;
2693 int i = 0, r;
2694
2695 for (i = 0; i < adev->num_ip_blocks; i++) {
2696 if (!adev->ip_blocks[i].status.hw)
2697 continue;
2698 if (adev->ip_blocks[i].version->funcs->late_init) {
2699 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2700 if (r) {
2701 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2702 adev->ip_blocks[i].version->funcs->name, r);
2703 return r;
2704 }
2705 }
2706 adev->ip_blocks[i].status.late_initialized = true;
2707 }
2708
2709 r = amdgpu_ras_late_init(adev);
2710 if (r) {
2711 DRM_ERROR("amdgpu_ras_late_init failed %d", r);
2712 return r;
2713 }
2714
2715 amdgpu_ras_set_error_query_ready(adev, true);
2716
2717 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2718 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2719
2720 amdgpu_device_fill_reset_magic(adev);
2721
2722 r = amdgpu_device_enable_mgpu_fan_boost();
2723 if (r)
2724 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2725
2726 /* For passthrough configurations on Arcturus and Aldebaran, enable special SBR handling */
2727 if (amdgpu_passthrough(adev) && ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
2728 adev->asic_type == CHIP_ALDEBARAN))
2729 amdgpu_dpm_handle_passthrough_sbr(adev, true);
2730
2731 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2732 mutex_lock(&mgpu_info.mutex);
2733
2734 /*
2735 * Reset the device p-state to low, as it was booted at high.
2736 *
2737 * This should be performed only after all devices from the same
2738 * hive get initialized.
2739 *
2740 * However, the number of devices in the hive is not known in
2741 * advance; it is counted one by one as the devices initialize.
2742 *
2743 * So we wait until all XGMI interlinked devices are initialized.
2744 * This may add some delay, as those devices may come from
2745 * different hives. But that should be OK.
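 * mgpu_info.num_dgpu acts as that rendezvous counter: once it matches
 * gmc.xgmi.num_physical_nodes, every dGPU instance below is asked to drop
 * to AMDGPU_XGMI_PSTATE_MIN.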
2746 */ 2747 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2748 for (i = 0; i < mgpu_info.num_gpu; i++) { 2749 gpu_instance = &(mgpu_info.gpu_ins[i]); 2750 if (gpu_instance->adev->flags & AMD_IS_APU) 2751 continue; 2752 2753 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2754 AMDGPU_XGMI_PSTATE_MIN); 2755 if (r) { 2756 DRM_ERROR("pstate setting failed (%d).\n", r); 2757 break; 2758 } 2759 } 2760 } 2761 2762 mutex_unlock(&mgpu_info.mutex); 2763 } 2764 2765 return 0; 2766 } 2767 2768 /** 2769 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2770 * 2771 * @adev: amdgpu_device pointer 2772 * 2773 * For ASICs need to disable SMC first 2774 */ 2775 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2776 { 2777 int i, r; 2778 2779 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) 2780 return; 2781 2782 for (i = 0; i < adev->num_ip_blocks; i++) { 2783 if (!adev->ip_blocks[i].status.hw) 2784 continue; 2785 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2786 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2787 /* XXX handle errors */ 2788 if (r) { 2789 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2790 adev->ip_blocks[i].version->funcs->name, r); 2791 } 2792 adev->ip_blocks[i].status.hw = false; 2793 break; 2794 } 2795 } 2796 } 2797 2798 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2799 { 2800 int i, r; 2801 2802 for (i = 0; i < adev->num_ip_blocks; i++) { 2803 if (!adev->ip_blocks[i].version->funcs->early_fini) 2804 continue; 2805 2806 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2807 if (r) { 2808 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2809 adev->ip_blocks[i].version->funcs->name, r); 2810 } 2811 } 2812 2813 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2814 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2815 2816 amdgpu_amdkfd_suspend(adev, false); 2817 2818 /* Workaroud for ASICs need to disable SMC first */ 2819 amdgpu_device_smu_fini_early(adev); 2820 2821 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2822 if (!adev->ip_blocks[i].status.hw) 2823 continue; 2824 2825 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2826 /* XXX handle errors */ 2827 if (r) { 2828 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2829 adev->ip_blocks[i].version->funcs->name, r); 2830 } 2831 2832 adev->ip_blocks[i].status.hw = false; 2833 } 2834 2835 if (amdgpu_sriov_vf(adev)) { 2836 if (amdgpu_virt_release_full_gpu(adev, false)) 2837 DRM_ERROR("failed to release exclusive mode on fini\n"); 2838 } 2839 2840 return 0; 2841 } 2842 2843 /** 2844 * amdgpu_device_ip_fini - run fini for hardware IPs 2845 * 2846 * @adev: amdgpu_device pointer 2847 * 2848 * Main teardown pass for hardware IPs. The list of all the hardware 2849 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2850 * are run. hw_fini tears down the hardware associated with each IP 2851 * and sw_fini tears down any software state associated with each IP. 2852 * Returns 0 on success, negative error code on failure. 
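 * Teardown runs in reverse IP order; when the GMC block is reached, the
 * ucode BO, the static CSA, writeback, the memory scratch area and the IB
 * pool that were created during init are released as well.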
2853 */ 2854 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2855 { 2856 int i, r; 2857 2858 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2859 amdgpu_virt_release_ras_err_handler_data(adev); 2860 2861 if (adev->gmc.xgmi.num_physical_nodes > 1) 2862 amdgpu_xgmi_remove_device(adev); 2863 2864 amdgpu_amdkfd_device_fini_sw(adev); 2865 2866 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2867 if (!adev->ip_blocks[i].status.sw) 2868 continue; 2869 2870 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2871 amdgpu_ucode_free_bo(adev); 2872 amdgpu_free_static_csa(&adev->virt.csa_obj); 2873 amdgpu_device_wb_fini(adev); 2874 amdgpu_device_mem_scratch_fini(adev); 2875 amdgpu_ib_pool_fini(adev); 2876 } 2877 2878 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2879 /* XXX handle errors */ 2880 if (r) { 2881 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2882 adev->ip_blocks[i].version->funcs->name, r); 2883 } 2884 adev->ip_blocks[i].status.sw = false; 2885 adev->ip_blocks[i].status.valid = false; 2886 } 2887 2888 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2889 if (!adev->ip_blocks[i].status.late_initialized) 2890 continue; 2891 if (adev->ip_blocks[i].version->funcs->late_fini) 2892 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2893 adev->ip_blocks[i].status.late_initialized = false; 2894 } 2895 2896 amdgpu_ras_fini(adev); 2897 2898 return 0; 2899 } 2900 2901 /** 2902 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2903 * 2904 * @work: work_struct. 2905 */ 2906 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2907 { 2908 struct amdgpu_device *adev = 2909 container_of(work, struct amdgpu_device, delayed_init_work.work); 2910 int r; 2911 2912 r = amdgpu_ib_ring_tests(adev); 2913 if (r) 2914 DRM_ERROR("ib ring test failed (%d).\n", r); 2915 } 2916 2917 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2918 { 2919 struct amdgpu_device *adev = 2920 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2921 2922 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2923 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2924 2925 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2926 adev->gfx.gfx_off_state = true; 2927 } 2928 2929 /** 2930 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2931 * 2932 * @adev: amdgpu_device pointer 2933 * 2934 * Main suspend function for hardware IPs. The list of all the hardware 2935 * IPs that make up the asic is walked, clockgating is disabled and the 2936 * suspend callbacks are run. suspend puts the hardware and software state 2937 * in each IP into a state suitable for suspend. 2938 * Returns 0 on success, negative error code on failure. 2939 */ 2940 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2941 { 2942 int i, r; 2943 2944 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2945 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2946 2947 /* 2948 * Per PMFW team's suggestion, driver needs to handle gfxoff 2949 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 2950 * scenario. Add the missing df cstate disablement here. 
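 * Phase 1 otherwise only quiesces the display (DCE) blocks; every other IP
 * is left running until amdgpu_device_ip_suspend_phase2().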
2951 */ 2952 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 2953 dev_warn(adev->dev, "Failed to disallow df cstate"); 2954 2955 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2956 if (!adev->ip_blocks[i].status.valid) 2957 continue; 2958 2959 /* displays are handled separately */ 2960 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2961 continue; 2962 2963 /* XXX handle errors */ 2964 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2965 /* XXX handle errors */ 2966 if (r) { 2967 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2968 adev->ip_blocks[i].version->funcs->name, r); 2969 return r; 2970 } 2971 2972 adev->ip_blocks[i].status.hw = false; 2973 } 2974 2975 return 0; 2976 } 2977 2978 /** 2979 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2980 * 2981 * @adev: amdgpu_device pointer 2982 * 2983 * Main suspend function for hardware IPs. The list of all the hardware 2984 * IPs that make up the asic is walked, clockgating is disabled and the 2985 * suspend callbacks are run. suspend puts the hardware and software state 2986 * in each IP into a state suitable for suspend. 2987 * Returns 0 on success, negative error code on failure. 2988 */ 2989 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2990 { 2991 int i, r; 2992 2993 if (adev->in_s0ix) 2994 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 2995 2996 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2997 if (!adev->ip_blocks[i].status.valid) 2998 continue; 2999 /* displays are handled in phase1 */ 3000 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3001 continue; 3002 /* PSP lost connection when err_event_athub occurs */ 3003 if (amdgpu_ras_intr_triggered() && 3004 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3005 adev->ip_blocks[i].status.hw = false; 3006 continue; 3007 } 3008 3009 /* skip unnecessary suspend if we do not initialize them yet */ 3010 if (adev->gmc.xgmi.pending_reset && 3011 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3012 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3013 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3014 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3015 adev->ip_blocks[i].status.hw = false; 3016 continue; 3017 } 3018 3019 /* skip suspend of gfx/mes and psp for S0ix 3020 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3021 * like at runtime. PSP is also part of the always on hardware 3022 * so no need to suspend it. 
3023 */ 3024 if (adev->in_s0ix && 3025 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3026 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3027 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3028 continue; 3029 3030 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3031 if (adev->in_s0ix && 3032 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) && 3033 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3034 continue; 3035 3036 /* XXX handle errors */ 3037 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3038 /* XXX handle errors */ 3039 if (r) { 3040 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3041 adev->ip_blocks[i].version->funcs->name, r); 3042 } 3043 adev->ip_blocks[i].status.hw = false; 3044 /* handle putting the SMC in the appropriate state */ 3045 if(!amdgpu_sriov_vf(adev)){ 3046 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3047 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3048 if (r) { 3049 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3050 adev->mp1_state, r); 3051 return r; 3052 } 3053 } 3054 } 3055 } 3056 3057 return 0; 3058 } 3059 3060 /** 3061 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3062 * 3063 * @adev: amdgpu_device pointer 3064 * 3065 * Main suspend function for hardware IPs. The list of all the hardware 3066 * IPs that make up the asic is walked, clockgating is disabled and the 3067 * suspend callbacks are run. suspend puts the hardware and software state 3068 * in each IP into a state suitable for suspend. 3069 * Returns 0 on success, negative error code on failure. 3070 */ 3071 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3072 { 3073 int r; 3074 3075 if (amdgpu_sriov_vf(adev)) { 3076 amdgpu_virt_fini_data_exchange(adev); 3077 amdgpu_virt_request_full_gpu(adev, false); 3078 } 3079 3080 r = amdgpu_device_ip_suspend_phase1(adev); 3081 if (r) 3082 return r; 3083 r = amdgpu_device_ip_suspend_phase2(adev); 3084 3085 if (amdgpu_sriov_vf(adev)) 3086 amdgpu_virt_release_full_gpu(adev, false); 3087 3088 return r; 3089 } 3090 3091 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3092 { 3093 int i, r; 3094 3095 static enum amd_ip_block_type ip_order[] = { 3096 AMD_IP_BLOCK_TYPE_COMMON, 3097 AMD_IP_BLOCK_TYPE_GMC, 3098 AMD_IP_BLOCK_TYPE_PSP, 3099 AMD_IP_BLOCK_TYPE_IH, 3100 }; 3101 3102 for (i = 0; i < adev->num_ip_blocks; i++) { 3103 int j; 3104 struct amdgpu_ip_block *block; 3105 3106 block = &adev->ip_blocks[i]; 3107 block->status.hw = false; 3108 3109 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3110 3111 if (block->version->type != ip_order[j] || 3112 !block->status.valid) 3113 continue; 3114 3115 r = block->version->funcs->hw_init(adev); 3116 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3117 if (r) 3118 return r; 3119 block->status.hw = true; 3120 } 3121 } 3122 3123 return 0; 3124 } 3125 3126 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3127 { 3128 int i, r; 3129 3130 static enum amd_ip_block_type ip_order[] = { 3131 AMD_IP_BLOCK_TYPE_SMC, 3132 AMD_IP_BLOCK_TYPE_DCE, 3133 AMD_IP_BLOCK_TYPE_GFX, 3134 AMD_IP_BLOCK_TYPE_SDMA, 3135 AMD_IP_BLOCK_TYPE_UVD, 3136 AMD_IP_BLOCK_TYPE_VCE, 3137 AMD_IP_BLOCK_TYPE_VCN 3138 }; 3139 3140 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3141 int j; 3142 struct amdgpu_ip_block *block; 3143 3144 for (j = 0; j < adev->num_ip_blocks; j++) { 3145 block = &adev->ip_blocks[j]; 3146 3147 if 
(block->version->type != ip_order[i] || 3148 !block->status.valid || 3149 block->status.hw) 3150 continue; 3151 3152 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3153 r = block->version->funcs->resume(adev); 3154 else 3155 r = block->version->funcs->hw_init(adev); 3156 3157 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3158 if (r) 3159 return r; 3160 block->status.hw = true; 3161 } 3162 } 3163 3164 return 0; 3165 } 3166 3167 /** 3168 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3169 * 3170 * @adev: amdgpu_device pointer 3171 * 3172 * First resume function for hardware IPs. The list of all the hardware 3173 * IPs that make up the asic is walked and the resume callbacks are run for 3174 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3175 * after a suspend and updates the software state as necessary. This 3176 * function is also used for restoring the GPU after a GPU reset. 3177 * Returns 0 on success, negative error code on failure. 3178 */ 3179 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3180 { 3181 int i, r; 3182 3183 for (i = 0; i < adev->num_ip_blocks; i++) { 3184 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3185 continue; 3186 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3187 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3188 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3189 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3190 3191 r = adev->ip_blocks[i].version->funcs->resume(adev); 3192 if (r) { 3193 DRM_ERROR("resume of IP block <%s> failed %d\n", 3194 adev->ip_blocks[i].version->funcs->name, r); 3195 return r; 3196 } 3197 adev->ip_blocks[i].status.hw = true; 3198 } 3199 } 3200 3201 return 0; 3202 } 3203 3204 /** 3205 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3206 * 3207 * @adev: amdgpu_device pointer 3208 * 3209 * First resume function for hardware IPs. The list of all the hardware 3210 * IPs that make up the asic is walked and the resume callbacks are run for 3211 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3212 * functional state after a suspend and updates the software state as 3213 * necessary. This function is also used for restoring the GPU after a GPU 3214 * reset. 3215 * Returns 0 on success, negative error code on failure. 3216 */ 3217 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3218 { 3219 int i, r; 3220 3221 for (i = 0; i < adev->num_ip_blocks; i++) { 3222 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3223 continue; 3224 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3225 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3226 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3227 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3228 continue; 3229 r = adev->ip_blocks[i].version->funcs->resume(adev); 3230 if (r) { 3231 DRM_ERROR("resume of IP block <%s> failed %d\n", 3232 adev->ip_blocks[i].version->funcs->name, r); 3233 return r; 3234 } 3235 adev->ip_blocks[i].status.hw = true; 3236 } 3237 3238 return 0; 3239 } 3240 3241 /** 3242 * amdgpu_device_ip_resume - run resume for hardware IPs 3243 * 3244 * @adev: amdgpu_device pointer 3245 * 3246 * Main resume function for hardware IPs. 
The hardware IPs
3247 * are split into two resume functions because they are also used
3248 * in recovering from a GPU reset and some additional
3249 * steps need to be taken between them. In this case (S3/S4) they are
3250 * run sequentially.
3251 * Returns 0 on success, negative error code on failure.
3252 */
3253 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3254 {
3255 int r;
3256
3257 r = amdgpu_amdkfd_resume_iommu(adev);
3258 if (r)
3259 return r;
3260
3261 r = amdgpu_device_ip_resume_phase1(adev);
3262 if (r)
3263 return r;
3264
3265 r = amdgpu_device_fw_loading(adev);
3266 if (r)
3267 return r;
3268
3269 r = amdgpu_device_ip_resume_phase2(adev);
3270
3271 return r;
3272 }
3273
3274 /**
3275 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3276 *
3277 * @adev: amdgpu_device pointer
3278 *
3279 * Query the VBIOS data tables to determine if the board supports SR-IOV.
3280 */
3281 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3282 {
3283 if (amdgpu_sriov_vf(adev)) {
3284 if (adev->is_atom_fw) {
3285 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3286 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3287 } else {
3288 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3289 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3290 }
3291
3292 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3293 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3294 }
3295 }
3296
3297 /**
3298 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3299 *
3300 * @asic_type: AMD asic type
3301 *
3302 * Check if there is DC (new modesetting infrastructure) support for an asic.
3303 * returns true if DC has support, false if not.
3304 */
3305 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3306 {
3307 switch (asic_type) {
3308 #ifdef CONFIG_DRM_AMDGPU_SI
3309 case CHIP_HAINAN:
3310 #endif
3311 case CHIP_TOPAZ:
3312 /* chips with no display hardware */
3313 return false;
3314 #if defined(CONFIG_DRM_AMD_DC)
3315 case CHIP_TAHITI:
3316 case CHIP_PITCAIRN:
3317 case CHIP_VERDE:
3318 case CHIP_OLAND:
3319 /*
3320 * We have systems in the wild with these ASICs that require
3321 * LVDS and VGA support which is not supported with DC.
3322 *
3323 * Fall back to the non-DC driver here by default so as not to
3324 * cause regressions.
3325 */
3326 #if defined(CONFIG_DRM_AMD_DC_SI)
3327 return amdgpu_dc > 0;
3328 #else
3329 return false;
3330 #endif
3331 case CHIP_BONAIRE:
3332 case CHIP_KAVERI:
3333 case CHIP_KABINI:
3334 case CHIP_MULLINS:
3335 /*
3336 * We have systems in the wild with these ASICs that require
3337 * VGA support which is not supported with DC.
3338 *
3339 * Fall back to the non-DC driver here by default so as not to
3340 * cause regressions.
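 * As with the SI parts above, passing amdgpu.dc=1 on the kernel command
 * line still allows DC to be opted into explicitly on these chips.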
3341 */ 3342 return amdgpu_dc > 0; 3343 default: 3344 return amdgpu_dc != 0; 3345 #else 3346 default: 3347 if (amdgpu_dc > 0) 3348 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3349 "but isn't supported by ASIC, ignoring\n"); 3350 return false; 3351 #endif 3352 } 3353 } 3354 3355 /** 3356 * amdgpu_device_has_dc_support - check if dc is supported 3357 * 3358 * @adev: amdgpu_device pointer 3359 * 3360 * Returns true for supported, false for not supported 3361 */ 3362 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3363 { 3364 if (adev->enable_virtual_display || 3365 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3366 return false; 3367 3368 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3369 } 3370 3371 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3372 { 3373 struct amdgpu_device *adev = 3374 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3375 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3376 3377 /* It's a bug to not have a hive within this function */ 3378 if (WARN_ON(!hive)) 3379 return; 3380 3381 /* 3382 * Use task barrier to synchronize all xgmi reset works across the 3383 * hive. task_barrier_enter and task_barrier_exit will block 3384 * until all the threads running the xgmi reset works reach 3385 * those points. task_barrier_full will do both blocks. 3386 */ 3387 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3388 3389 task_barrier_enter(&hive->tb); 3390 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3391 3392 if (adev->asic_reset_res) 3393 goto fail; 3394 3395 task_barrier_exit(&hive->tb); 3396 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3397 3398 if (adev->asic_reset_res) 3399 goto fail; 3400 3401 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3402 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3403 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3404 } else { 3405 3406 task_barrier_full(&hive->tb); 3407 adev->asic_reset_res = amdgpu_asic_reset(adev); 3408 } 3409 3410 fail: 3411 if (adev->asic_reset_res) 3412 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3413 adev->asic_reset_res, adev_to_drm(adev)->unique); 3414 amdgpu_put_xgmi_hive(hive); 3415 } 3416 3417 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3418 { 3419 char *input = amdgpu_lockup_timeout; 3420 char *timeout_setting = NULL; 3421 int index = 0; 3422 long timeout; 3423 int ret = 0; 3424 3425 /* 3426 * By default timeout for non compute jobs is 10000 3427 * and 60000 for compute jobs. 3428 * In SR-IOV or passthrough mode, timeout for compute 3429 * jobs are 60000 by default. 3430 */ 3431 adev->gfx_timeout = msecs_to_jiffies(10000); 3432 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3433 if (amdgpu_sriov_vf(adev)) 3434 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3435 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3436 else 3437 adev->compute_timeout = msecs_to_jiffies(60000); 3438 3439 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3440 while ((timeout_setting = strsep(&input, ",")) && 3441 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3442 ret = kstrtol(timeout_setting, 0, &timeout); 3443 if (ret) 3444 return ret; 3445 3446 if (timeout == 0) { 3447 index++; 3448 continue; 3449 } else if (timeout < 0) { 3450 timeout = MAX_SCHEDULE_TIMEOUT; 3451 dev_warn(adev->dev, "lockup timeout disabled"); 3452 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3453 } else { 3454 timeout = msecs_to_jiffies(timeout); 3455 } 3456 3457 switch (index++) { 3458 case 0: 3459 adev->gfx_timeout = timeout; 3460 break; 3461 case 1: 3462 adev->compute_timeout = timeout; 3463 break; 3464 case 2: 3465 adev->sdma_timeout = timeout; 3466 break; 3467 case 3: 3468 adev->video_timeout = timeout; 3469 break; 3470 default: 3471 break; 3472 } 3473 } 3474 /* 3475 * There is only one value specified and 3476 * it should apply to all non-compute jobs. 3477 */ 3478 if (index == 1) { 3479 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3480 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3481 adev->compute_timeout = adev->gfx_timeout; 3482 } 3483 } 3484 3485 return ret; 3486 } 3487 3488 /** 3489 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3490 * 3491 * @adev: amdgpu_device pointer 3492 * 3493 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3494 */ 3495 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3496 { 3497 struct iommu_domain *domain; 3498 3499 domain = iommu_get_domain_for_dev(adev->dev); 3500 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3501 adev->ram_is_direct_mapped = true; 3502 } 3503 3504 static const struct attribute *amdgpu_dev_attributes[] = { 3505 &dev_attr_product_name.attr, 3506 &dev_attr_product_number.attr, 3507 &dev_attr_serial_number.attr, 3508 &dev_attr_pcie_replay_count.attr, 3509 NULL 3510 }; 3511 3512 /** 3513 * amdgpu_device_init - initialize the driver 3514 * 3515 * @adev: amdgpu_device pointer 3516 * @flags: driver flags 3517 * 3518 * Initializes the driver info and hw (all asics). 3519 * Returns 0 for success or an error on failure. 3520 * Called at driver startup. 
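 * Roughly, the sequence below is: set up defaults, locks and work items,
 * map the register BAR, run early IP init, decide whether the ASIC needs a
 * reset or a vBIOS post, initialize clocks and the fence driver, run
 * amdgpu_device_ip_init(), and finally register the sysfs/pm interfaces and
 * run late init.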
3521 */ 3522 int amdgpu_device_init(struct amdgpu_device *adev, 3523 uint32_t flags) 3524 { 3525 struct drm_device *ddev = adev_to_drm(adev); 3526 struct pci_dev *pdev = adev->pdev; 3527 int r, i; 3528 bool px = false; 3529 u32 max_MBps; 3530 3531 adev->shutdown = false; 3532 adev->flags = flags; 3533 3534 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3535 adev->asic_type = amdgpu_force_asic_type; 3536 else 3537 adev->asic_type = flags & AMD_ASIC_MASK; 3538 3539 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3540 if (amdgpu_emu_mode == 1) 3541 adev->usec_timeout *= 10; 3542 adev->gmc.gart_size = 512 * 1024 * 1024; 3543 adev->accel_working = false; 3544 adev->num_rings = 0; 3545 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3546 adev->mman.buffer_funcs = NULL; 3547 adev->mman.buffer_funcs_ring = NULL; 3548 adev->vm_manager.vm_pte_funcs = NULL; 3549 adev->vm_manager.vm_pte_num_scheds = 0; 3550 adev->gmc.gmc_funcs = NULL; 3551 adev->harvest_ip_mask = 0x0; 3552 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3553 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3554 3555 adev->smc_rreg = &amdgpu_invalid_rreg; 3556 adev->smc_wreg = &amdgpu_invalid_wreg; 3557 adev->pcie_rreg = &amdgpu_invalid_rreg; 3558 adev->pcie_wreg = &amdgpu_invalid_wreg; 3559 adev->pciep_rreg = &amdgpu_invalid_rreg; 3560 adev->pciep_wreg = &amdgpu_invalid_wreg; 3561 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3562 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3563 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3564 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3565 adev->didt_rreg = &amdgpu_invalid_rreg; 3566 adev->didt_wreg = &amdgpu_invalid_wreg; 3567 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3568 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3569 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3570 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3571 3572 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3573 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3574 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3575 3576 /* mutex initialization are all done here so we 3577 * can recall function without having locking issues */ 3578 mutex_init(&adev->firmware.mutex); 3579 mutex_init(&adev->pm.mutex); 3580 mutex_init(&adev->gfx.gpu_clock_mutex); 3581 mutex_init(&adev->srbm_mutex); 3582 mutex_init(&adev->gfx.pipe_reserve_mutex); 3583 mutex_init(&adev->gfx.gfx_off_mutex); 3584 mutex_init(&adev->grbm_idx_mutex); 3585 mutex_init(&adev->mn_lock); 3586 mutex_init(&adev->virt.vf_errors.lock); 3587 hash_init(adev->mn_hash); 3588 mutex_init(&adev->psp.mutex); 3589 mutex_init(&adev->notifier_lock); 3590 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3591 mutex_init(&adev->benchmark_mutex); 3592 3593 amdgpu_device_init_apu_flags(adev); 3594 3595 r = amdgpu_device_check_arguments(adev); 3596 if (r) 3597 return r; 3598 3599 spin_lock_init(&adev->mmio_idx_lock); 3600 spin_lock_init(&adev->smc_idx_lock); 3601 spin_lock_init(&adev->pcie_idx_lock); 3602 spin_lock_init(&adev->uvd_ctx_idx_lock); 3603 spin_lock_init(&adev->didt_idx_lock); 3604 spin_lock_init(&adev->gc_cac_idx_lock); 3605 spin_lock_init(&adev->se_cac_idx_lock); 3606 spin_lock_init(&adev->audio_endpt_idx_lock); 3607 spin_lock_init(&adev->mm_stats.lock); 3608 3609 INIT_LIST_HEAD(&adev->shadow_list); 3610 mutex_init(&adev->shadow_list_lock); 3611 3612 INIT_LIST_HEAD(&adev->reset_list); 3613 3614 INIT_LIST_HEAD(&adev->ras_list); 3615 3616 
INIT_DELAYED_WORK(&adev->delayed_init_work,
3617 amdgpu_device_delayed_init_work_handler);
3618 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3619 amdgpu_device_delay_enable_gfx_off);
3620
3621 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3622
3623 adev->gfx.gfx_off_req_count = 1;
3624 adev->gfx.gfx_off_residency = 0;
3625 adev->gfx.gfx_off_entrycount = 0;
3626 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3627
3628 atomic_set(&adev->throttling_logging_enabled, 1);
3629 /*
3630 * If throttling continues, logging will be performed every minute
3631 * to avoid log flooding. "-1" is subtracted since the thermal
3632 * throttling interrupt comes every second. Thus, the total logging
3633 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3634 * for the throttling interrupt) = 60 seconds.
3635 */
3636 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3637 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3638
3639 /* Registers mapping */
3640 /* TODO: block userspace mapping of io register */
3641 if (adev->asic_type >= CHIP_BONAIRE) {
3642 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3643 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3644 } else {
3645 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3646 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3647 }
3648
3649 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
3650 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
3651
3652 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3653 if (adev->rmmio == NULL) {
3654 return -ENOMEM;
3655 }
3656 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3657 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3658
3659 amdgpu_device_get_pcie_info(adev);
3660
3661 if (amdgpu_mcbp)
3662 DRM_INFO("MCBP is enabled\n");
3663
3664 /*
3665 * The reset domain needs to be present early, before any XGMI hive is
3666 * discovered and initialized, so that the reset semaphore and in-GPU-reset
3667 * flag can be used early during init and before the first RREG32 call.
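 * A single-device domain is created here as a placeholder; if the device
 * later turns out to be part of an XGMI hive, amdgpu_device_ip_init()
 * drops it and adopts the hive-wide reset domain instead.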
3668 */ 3669 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3670 if (!adev->reset_domain) 3671 return -ENOMEM; 3672 3673 /* detect hw virtualization here */ 3674 amdgpu_detect_virtualization(adev); 3675 3676 r = amdgpu_device_get_job_timeout_settings(adev); 3677 if (r) { 3678 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3679 return r; 3680 } 3681 3682 /* early init functions */ 3683 r = amdgpu_device_ip_early_init(adev); 3684 if (r) 3685 return r; 3686 3687 /* Get rid of things like offb */ 3688 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 3689 if (r) 3690 return r; 3691 3692 /* Enable TMZ based on IP_VERSION */ 3693 amdgpu_gmc_tmz_set(adev); 3694 3695 amdgpu_gmc_noretry_set(adev); 3696 /* Need to get xgmi info early to decide the reset behavior*/ 3697 if (adev->gmc.xgmi.supported) { 3698 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3699 if (r) 3700 return r; 3701 } 3702 3703 /* enable PCIE atomic ops */ 3704 if (amdgpu_sriov_vf(adev)) 3705 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3706 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3707 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3708 else 3709 adev->have_atomics_support = 3710 !pci_enable_atomic_ops_to_root(adev->pdev, 3711 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3712 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3713 if (!adev->have_atomics_support) 3714 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3715 3716 /* doorbell bar mapping and doorbell index init*/ 3717 amdgpu_device_doorbell_init(adev); 3718 3719 if (amdgpu_emu_mode == 1) { 3720 /* post the asic on emulation mode */ 3721 emu_soc_asic_init(adev); 3722 goto fence_driver_init; 3723 } 3724 3725 amdgpu_reset_init(adev); 3726 3727 /* detect if we are with an SRIOV vbios */ 3728 amdgpu_device_detect_sriov_bios(adev); 3729 3730 /* check if we need to reset the asic 3731 * E.g., driver was not cleanly unloaded previously, etc. 
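 * For devices in an XGMI hive the reset is only flagged as pending here, so
 * that the whole hive can be reset together once every node has been
 * probed; standalone parts are reset immediately below.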
3732 */ 3733 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3734 if (adev->gmc.xgmi.num_physical_nodes) { 3735 dev_info(adev->dev, "Pending hive reset.\n"); 3736 adev->gmc.xgmi.pending_reset = true; 3737 /* Only need to init necessary block for SMU to handle the reset */ 3738 for (i = 0; i < adev->num_ip_blocks; i++) { 3739 if (!adev->ip_blocks[i].status.valid) 3740 continue; 3741 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3742 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3743 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3744 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3745 DRM_DEBUG("IP %s disabled for hw_init.\n", 3746 adev->ip_blocks[i].version->funcs->name); 3747 adev->ip_blocks[i].status.hw = true; 3748 } 3749 } 3750 } else { 3751 r = amdgpu_asic_reset(adev); 3752 if (r) { 3753 dev_err(adev->dev, "asic reset on init failed\n"); 3754 goto failed; 3755 } 3756 } 3757 } 3758 3759 pci_enable_pcie_error_reporting(adev->pdev); 3760 3761 /* Post card if necessary */ 3762 if (amdgpu_device_need_post(adev)) { 3763 if (!adev->bios) { 3764 dev_err(adev->dev, "no vBIOS found\n"); 3765 r = -EINVAL; 3766 goto failed; 3767 } 3768 DRM_INFO("GPU posting now...\n"); 3769 r = amdgpu_device_asic_init(adev); 3770 if (r) { 3771 dev_err(adev->dev, "gpu post error!\n"); 3772 goto failed; 3773 } 3774 } 3775 3776 if (adev->is_atom_fw) { 3777 /* Initialize clocks */ 3778 r = amdgpu_atomfirmware_get_clock_info(adev); 3779 if (r) { 3780 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3781 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3782 goto failed; 3783 } 3784 } else { 3785 /* Initialize clocks */ 3786 r = amdgpu_atombios_get_clock_info(adev); 3787 if (r) { 3788 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3789 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3790 goto failed; 3791 } 3792 /* init i2c buses */ 3793 if (!amdgpu_device_has_dc_support(adev)) 3794 amdgpu_atombios_i2c_init(adev); 3795 } 3796 3797 fence_driver_init: 3798 /* Fence driver */ 3799 r = amdgpu_fence_driver_sw_init(adev); 3800 if (r) { 3801 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3802 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3803 goto failed; 3804 } 3805 3806 /* init the mode config */ 3807 drm_mode_config_init(adev_to_drm(adev)); 3808 3809 r = amdgpu_device_ip_init(adev); 3810 if (r) { 3811 /* failed in exclusive mode due to timeout */ 3812 if (amdgpu_sriov_vf(adev) && 3813 !amdgpu_sriov_runtime(adev) && 3814 amdgpu_virt_mmio_blocked(adev) && 3815 !amdgpu_virt_wait_reset(adev)) { 3816 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3817 /* Don't send request since VF is inactive. */ 3818 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3819 adev->virt.ops = NULL; 3820 r = -EAGAIN; 3821 goto release_ras_con; 3822 } 3823 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3824 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3825 goto release_ras_con; 3826 } 3827 3828 amdgpu_fence_driver_hw_init(adev); 3829 3830 dev_info(adev->dev, 3831 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3832 adev->gfx.config.max_shader_engines, 3833 adev->gfx.config.max_sh_per_se, 3834 adev->gfx.config.max_cu_per_sh, 3835 adev->gfx.cu_info.number); 3836 3837 adev->accel_working = true; 3838 3839 amdgpu_vm_check_compute_bug(adev); 3840 3841 /* Initialize the buffer migration limit. 
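 * (amdgpu.moverate is given in MB/s, with 8 MB/s as the default; the value
 * is stored as a log2 so that later divisions become cheap shifts.)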
*/ 3842 if (amdgpu_moverate >= 0) 3843 max_MBps = amdgpu_moverate; 3844 else 3845 max_MBps = 8; /* Allow 8 MB/s. */ 3846 /* Get a log2 for easy divisions. */ 3847 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3848 3849 r = amdgpu_pm_sysfs_init(adev); 3850 if (r) { 3851 adev->pm_sysfs_en = false; 3852 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3853 } else 3854 adev->pm_sysfs_en = true; 3855 3856 r = amdgpu_ucode_sysfs_init(adev); 3857 if (r) { 3858 adev->ucode_sysfs_en = false; 3859 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3860 } else 3861 adev->ucode_sysfs_en = true; 3862 3863 r = amdgpu_psp_sysfs_init(adev); 3864 if (r) { 3865 adev->psp_sysfs_en = false; 3866 if (!amdgpu_sriov_vf(adev)) 3867 DRM_ERROR("Creating psp sysfs failed\n"); 3868 } else 3869 adev->psp_sysfs_en = true; 3870 3871 /* 3872 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3873 * Otherwise the mgpu fan boost feature will be skipped due to the 3874 * gpu instance is counted less. 3875 */ 3876 amdgpu_register_gpu_instance(adev); 3877 3878 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3879 * explicit gating rather than handling it automatically. 3880 */ 3881 if (!adev->gmc.xgmi.pending_reset) { 3882 r = amdgpu_device_ip_late_init(adev); 3883 if (r) { 3884 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3885 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3886 goto release_ras_con; 3887 } 3888 /* must succeed. */ 3889 amdgpu_ras_resume(adev); 3890 queue_delayed_work(system_wq, &adev->delayed_init_work, 3891 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3892 } 3893 3894 if (amdgpu_sriov_vf(adev)) 3895 flush_delayed_work(&adev->delayed_init_work); 3896 3897 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3898 if (r) 3899 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3900 3901 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3902 r = amdgpu_pmu_init(adev); 3903 if (r) 3904 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3905 3906 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3907 if (amdgpu_device_cache_pci_state(adev->pdev)) 3908 pci_restore_state(pdev); 3909 3910 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3911 /* this will fail for cards that aren't VGA class devices, just 3912 * ignore it */ 3913 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3914 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 3915 3916 if (amdgpu_device_supports_px(ddev)) { 3917 px = true; 3918 vga_switcheroo_register_client(adev->pdev, 3919 &amdgpu_switcheroo_ops, px); 3920 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3921 } 3922 3923 if (adev->gmc.xgmi.pending_reset) 3924 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3925 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3926 3927 amdgpu_device_check_iommu_direct_map(adev); 3928 3929 return 0; 3930 3931 release_ras_con: 3932 amdgpu_release_ras_context(adev); 3933 3934 failed: 3935 amdgpu_vf_error_trans_all(adev); 3936 3937 return r; 3938 } 3939 3940 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 3941 { 3942 3943 /* Clear all CPU mappings pointing to this device */ 3944 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 3945 3946 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 3947 amdgpu_device_doorbell_fini(adev); 3948 3949 iounmap(adev->rmmio); 3950 adev->rmmio = NULL; 3951 if (adev->mman.aper_base_kaddr) 3952 
iounmap(adev->mman.aper_base_kaddr); 3953 adev->mman.aper_base_kaddr = NULL; 3954 3955 /* Memory manager related */ 3956 if (!adev->gmc.xgmi.connected_to_cpu) { 3957 arch_phys_wc_del(adev->gmc.vram_mtrr); 3958 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 3959 } 3960 } 3961 3962 /** 3963 * amdgpu_device_fini_hw - tear down the driver 3964 * 3965 * @adev: amdgpu_device pointer 3966 * 3967 * Tear down the driver info (all asics). 3968 * Called at driver shutdown. 3969 */ 3970 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 3971 { 3972 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3973 flush_delayed_work(&adev->delayed_init_work); 3974 adev->shutdown = true; 3975 3976 /* make sure IB test finished before entering exclusive mode 3977 * to avoid preemption on IB test 3978 * */ 3979 if (amdgpu_sriov_vf(adev)) { 3980 amdgpu_virt_request_full_gpu(adev, false); 3981 amdgpu_virt_fini_data_exchange(adev); 3982 } 3983 3984 /* disable all interrupts */ 3985 amdgpu_irq_disable_all(adev); 3986 if (adev->mode_info.mode_config_initialized){ 3987 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 3988 drm_helper_force_disable_all(adev_to_drm(adev)); 3989 else 3990 drm_atomic_helper_shutdown(adev_to_drm(adev)); 3991 } 3992 amdgpu_fence_driver_hw_fini(adev); 3993 3994 if (adev->mman.initialized) 3995 drain_workqueue(adev->mman.bdev.wq); 3996 3997 if (adev->pm_sysfs_en) 3998 amdgpu_pm_sysfs_fini(adev); 3999 if (adev->ucode_sysfs_en) 4000 amdgpu_ucode_sysfs_fini(adev); 4001 if (adev->psp_sysfs_en) 4002 amdgpu_psp_sysfs_fini(adev); 4003 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4004 4005 /* disable ras feature must before hw fini */ 4006 amdgpu_ras_pre_fini(adev); 4007 4008 amdgpu_device_ip_fini_early(adev); 4009 4010 amdgpu_irq_fini_hw(adev); 4011 4012 if (adev->mman.initialized) 4013 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4014 4015 amdgpu_gart_dummy_page_fini(adev); 4016 4017 amdgpu_device_unmap_mmio(adev); 4018 4019 } 4020 4021 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4022 { 4023 int idx; 4024 4025 amdgpu_fence_driver_sw_fini(adev); 4026 amdgpu_device_ip_fini(adev); 4027 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4028 adev->accel_working = false; 4029 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4030 4031 amdgpu_reset_fini(adev); 4032 4033 /* free i2c buses */ 4034 if (!amdgpu_device_has_dc_support(adev)) 4035 amdgpu_i2c_fini(adev); 4036 4037 if (amdgpu_emu_mode != 1) 4038 amdgpu_atombios_fini(adev); 4039 4040 kfree(adev->bios); 4041 adev->bios = NULL; 4042 if (amdgpu_device_supports_px(adev_to_drm(adev))) { 4043 vga_switcheroo_unregister_client(adev->pdev); 4044 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4045 } 4046 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4047 vga_client_unregister(adev->pdev); 4048 4049 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4050 4051 iounmap(adev->rmmio); 4052 adev->rmmio = NULL; 4053 amdgpu_device_doorbell_fini(adev); 4054 drm_dev_exit(idx); 4055 } 4056 4057 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4058 amdgpu_pmu_fini(adev); 4059 if (adev->mman.discovery_bin) 4060 amdgpu_discovery_fini(adev); 4061 4062 amdgpu_reset_put_reset_domain(adev->reset_domain); 4063 adev->reset_domain = NULL; 4064 4065 kfree(adev->pci_state); 4066 4067 } 4068 4069 /** 4070 * amdgpu_device_evict_resources - evict device resources 4071 * @adev: amdgpu device object 4072 * 4073 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4074 * of the vram memory type. 
Mainly used for evicting device resources 4075 * at suspend time. 4076 * 4077 */ 4078 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4079 { 4080 int ret; 4081 4082 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4083 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4084 return 0; 4085 4086 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4087 if (ret) 4088 DRM_WARN("evicting device resources failed\n"); 4089 return ret; 4090 } 4091 4092 /* 4093 * Suspend & resume. 4094 */ 4095 /** 4096 * amdgpu_device_suspend - initiate device suspend 4097 * 4098 * @dev: drm dev pointer 4099 * @fbcon : notify the fbdev of suspend 4100 * 4101 * Puts the hw in the suspend state (all asics). 4102 * Returns 0 for success or an error on failure. 4103 * Called at driver suspend. 4104 */ 4105 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4106 { 4107 struct amdgpu_device *adev = drm_to_adev(dev); 4108 int r = 0; 4109 4110 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4111 return 0; 4112 4113 adev->in_suspend = true; 4114 4115 /* Evict the majority of BOs before grabbing the full access */ 4116 r = amdgpu_device_evict_resources(adev); 4117 if (r) 4118 return r; 4119 4120 if (amdgpu_sriov_vf(adev)) { 4121 amdgpu_virt_fini_data_exchange(adev); 4122 r = amdgpu_virt_request_full_gpu(adev, false); 4123 if (r) 4124 return r; 4125 } 4126 4127 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4128 DRM_WARN("smart shift update failed\n"); 4129 4130 drm_kms_helper_poll_disable(dev); 4131 4132 if (fbcon) 4133 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4134 4135 cancel_delayed_work_sync(&adev->delayed_init_work); 4136 4137 amdgpu_ras_suspend(adev); 4138 4139 amdgpu_device_ip_suspend_phase1(adev); 4140 4141 if (!adev->in_s0ix) 4142 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4143 4144 r = amdgpu_device_evict_resources(adev); 4145 if (r) 4146 return r; 4147 4148 amdgpu_fence_driver_hw_fini(adev); 4149 4150 amdgpu_device_ip_suspend_phase2(adev); 4151 4152 if (amdgpu_sriov_vf(adev)) 4153 amdgpu_virt_release_full_gpu(adev, false); 4154 4155 return 0; 4156 } 4157 4158 /** 4159 * amdgpu_device_resume - initiate device resume 4160 * 4161 * @dev: drm dev pointer 4162 * @fbcon : notify the fbdev of resume 4163 * 4164 * Bring the hw back to operating state (all asics). 4165 * Returns 0 for success or an error on failure. 4166 * Called at driver resume. 
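 * The resume sequence below roughly mirrors suspend in reverse: full GPU
 * access is requested for SR-IOV, the vBIOS is re-posted if needed, the IP
 * blocks and fence driver are brought back up, late init runs and the
 * delayed init work is queued before display state is restored.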
4167 */ 4168 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4169 { 4170 struct amdgpu_device *adev = drm_to_adev(dev); 4171 int r = 0; 4172 4173 if (amdgpu_sriov_vf(adev)) { 4174 r = amdgpu_virt_request_full_gpu(adev, true); 4175 if (r) 4176 return r; 4177 } 4178 4179 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4180 return 0; 4181 4182 if (adev->in_s0ix) 4183 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4184 4185 /* post card */ 4186 if (amdgpu_device_need_post(adev)) { 4187 r = amdgpu_device_asic_init(adev); 4188 if (r) 4189 dev_err(adev->dev, "amdgpu asic init failed\n"); 4190 } 4191 4192 r = amdgpu_device_ip_resume(adev); 4193 4194 if (r) { 4195 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4196 goto exit; 4197 } 4198 amdgpu_fence_driver_hw_init(adev); 4199 4200 r = amdgpu_device_ip_late_init(adev); 4201 if (r) 4202 goto exit; 4203 4204 queue_delayed_work(system_wq, &adev->delayed_init_work, 4205 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4206 4207 if (!adev->in_s0ix) { 4208 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4209 if (r) 4210 goto exit; 4211 } 4212 4213 exit: 4214 if (amdgpu_sriov_vf(adev)) { 4215 amdgpu_virt_init_data_exchange(adev); 4216 amdgpu_virt_release_full_gpu(adev, true); 4217 } 4218 4219 if (r) 4220 return r; 4221 4222 /* Make sure IB tests flushed */ 4223 flush_delayed_work(&adev->delayed_init_work); 4224 4225 if (fbcon) 4226 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4227 4228 drm_kms_helper_poll_enable(dev); 4229 4230 amdgpu_ras_resume(adev); 4231 4232 if (adev->mode_info.num_crtc) { 4233 /* 4234 * Most of the connector probing functions try to acquire runtime pm 4235 * refs to ensure that the GPU is powered on when connector polling is 4236 * performed. Since we're calling this from a runtime PM callback, 4237 * trying to acquire rpm refs will cause us to deadlock. 4238 * 4239 * Since we're guaranteed to be holding the rpm lock, it's safe to 4240 * temporarily disable the rpm helpers so this doesn't deadlock us. 4241 */ 4242 #ifdef CONFIG_PM 4243 dev->dev->power.disable_depth++; 4244 #endif 4245 if (!adev->dc_enabled) 4246 drm_helper_hpd_irq_event(dev); 4247 else 4248 drm_kms_helper_hotplug_event(dev); 4249 #ifdef CONFIG_PM 4250 dev->dev->power.disable_depth--; 4251 #endif 4252 } 4253 adev->in_suspend = false; 4254 4255 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4256 DRM_WARN("smart shift update failed\n"); 4257 4258 return 0; 4259 } 4260 4261 /** 4262 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4263 * 4264 * @adev: amdgpu_device pointer 4265 * 4266 * The list of all the hardware IPs that make up the asic is walked and 4267 * the check_soft_reset callbacks are run. check_soft_reset determines 4268 * if the asic is still hung or not. 4269 * Returns true if any of the IPs are still in a hung state, false if not. 
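 * Note that SR-IOV VFs and ASICs that report amdgpu_asic_need_full_reset()
 * are always treated as hung, so the caller falls back to a full reset.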
4270 */ 4271 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4272 { 4273 int i; 4274 bool asic_hang = false; 4275 4276 if (amdgpu_sriov_vf(adev)) 4277 return true; 4278 4279 if (amdgpu_asic_need_full_reset(adev)) 4280 return true; 4281 4282 for (i = 0; i < adev->num_ip_blocks; i++) { 4283 if (!adev->ip_blocks[i].status.valid) 4284 continue; 4285 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4286 adev->ip_blocks[i].status.hang = 4287 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4288 if (adev->ip_blocks[i].status.hang) { 4289 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4290 asic_hang = true; 4291 } 4292 } 4293 return asic_hang; 4294 } 4295 4296 /** 4297 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4298 * 4299 * @adev: amdgpu_device pointer 4300 * 4301 * The list of all the hardware IPs that make up the asic is walked and the 4302 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4303 * handles any IP specific hardware or software state changes that are 4304 * necessary for a soft reset to succeed. 4305 * Returns 0 on success, negative error code on failure. 4306 */ 4307 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4308 { 4309 int i, r = 0; 4310 4311 for (i = 0; i < adev->num_ip_blocks; i++) { 4312 if (!adev->ip_blocks[i].status.valid) 4313 continue; 4314 if (adev->ip_blocks[i].status.hang && 4315 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4316 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4317 if (r) 4318 return r; 4319 } 4320 } 4321 4322 return 0; 4323 } 4324 4325 /** 4326 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4327 * 4328 * @adev: amdgpu_device pointer 4329 * 4330 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4331 * reset is necessary to recover. 4332 * Returns true if a full asic reset is required, false if not. 4333 */ 4334 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4335 { 4336 int i; 4337 4338 if (amdgpu_asic_need_full_reset(adev)) 4339 return true; 4340 4341 for (i = 0; i < adev->num_ip_blocks; i++) { 4342 if (!adev->ip_blocks[i].status.valid) 4343 continue; 4344 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4345 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4346 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4347 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4348 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4349 if (adev->ip_blocks[i].status.hang) { 4350 dev_info(adev->dev, "Some block need full reset!\n"); 4351 return true; 4352 } 4353 } 4354 } 4355 return false; 4356 } 4357 4358 /** 4359 * amdgpu_device_ip_soft_reset - do a soft reset 4360 * 4361 * @adev: amdgpu_device pointer 4362 * 4363 * The list of all the hardware IPs that make up the asic is walked and the 4364 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4365 * IP specific hardware or software state changes that are necessary to soft 4366 * reset the IP. 4367 * Returns 0 on success, negative error code on failure. 
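 * Only blocks flagged as hung by amdgpu_device_ip_check_soft_reset() and
 * that implement a soft_reset callback are touched here.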
4368 */ 4369 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4370 { 4371 int i, r = 0; 4372 4373 for (i = 0; i < adev->num_ip_blocks; i++) { 4374 if (!adev->ip_blocks[i].status.valid) 4375 continue; 4376 if (adev->ip_blocks[i].status.hang && 4377 adev->ip_blocks[i].version->funcs->soft_reset) { 4378 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4379 if (r) 4380 return r; 4381 } 4382 } 4383 4384 return 0; 4385 } 4386 4387 /** 4388 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4389 * 4390 * @adev: amdgpu_device pointer 4391 * 4392 * The list of all the hardware IPs that make up the asic is walked and the 4393 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4394 * handles any IP specific hardware or software state changes that are 4395 * necessary after the IP has been soft reset. 4396 * Returns 0 on success, negative error code on failure. 4397 */ 4398 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4399 { 4400 int i, r = 0; 4401 4402 for (i = 0; i < adev->num_ip_blocks; i++) { 4403 if (!adev->ip_blocks[i].status.valid) 4404 continue; 4405 if (adev->ip_blocks[i].status.hang && 4406 adev->ip_blocks[i].version->funcs->post_soft_reset) 4407 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4408 if (r) 4409 return r; 4410 } 4411 4412 return 0; 4413 } 4414 4415 /** 4416 * amdgpu_device_recover_vram - Recover some VRAM contents 4417 * 4418 * @adev: amdgpu_device pointer 4419 * 4420 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4421 * restore things like GPUVM page tables after a GPU reset where 4422 * the contents of VRAM might be lost. 4423 * 4424 * Returns: 4425 * 0 on success, negative error code on failure. 4426 */ 4427 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4428 { 4429 struct dma_fence *fence = NULL, *next = NULL; 4430 struct amdgpu_bo *shadow; 4431 struct amdgpu_bo_vm *vmbo; 4432 long r = 1, tmo; 4433 4434 if (amdgpu_sriov_runtime(adev)) 4435 tmo = msecs_to_jiffies(8000); 4436 else 4437 tmo = msecs_to_jiffies(100); 4438 4439 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4440 mutex_lock(&adev->shadow_list_lock); 4441 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4442 shadow = &vmbo->bo; 4443 /* No need to recover an evicted BO */ 4444 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4445 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4446 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4447 continue; 4448 4449 r = amdgpu_bo_restore_shadow(shadow, &next); 4450 if (r) 4451 break; 4452 4453 if (fence) { 4454 tmo = dma_fence_wait_timeout(fence, false, tmo); 4455 dma_fence_put(fence); 4456 fence = next; 4457 if (tmo == 0) { 4458 r = -ETIMEDOUT; 4459 break; 4460 } else if (tmo < 0) { 4461 r = tmo; 4462 break; 4463 } 4464 } else { 4465 fence = next; 4466 } 4467 } 4468 mutex_unlock(&adev->shadow_list_lock); 4469 4470 if (fence) 4471 tmo = dma_fence_wait_timeout(fence, false, tmo); 4472 dma_fence_put(fence); 4473 4474 if (r < 0 || tmo <= 0) { 4475 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4476 return -EIO; 4477 } 4478 4479 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4480 return 0; 4481 } 4482 4483 4484 /** 4485 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4486 * 4487 * @adev: amdgpu_device pointer 4488 * @from_hypervisor: request from hypervisor 4489 * 4490 * do VF FLR and reinitialize Asic 4491 * return 0 means succeeded 
otherwise failed 4492 */ 4493 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4494 bool from_hypervisor) 4495 { 4496 int r; 4497 struct amdgpu_hive_info *hive = NULL; 4498 int retry_limit = 0; 4499 4500 retry: 4501 amdgpu_amdkfd_pre_reset(adev); 4502 4503 if (from_hypervisor) 4504 r = amdgpu_virt_request_full_gpu(adev, true); 4505 else 4506 r = amdgpu_virt_reset_gpu(adev); 4507 if (r) 4508 return r; 4509 4510 /* Resume IP prior to SMC */ 4511 r = amdgpu_device_ip_reinit_early_sriov(adev); 4512 if (r) 4513 goto error; 4514 4515 amdgpu_virt_init_data_exchange(adev); 4516 4517 r = amdgpu_device_fw_loading(adev); 4518 if (r) 4519 return r; 4520 4521 /* now we are okay to resume SMC/CP/SDMA */ 4522 r = amdgpu_device_ip_reinit_late_sriov(adev); 4523 if (r) 4524 goto error; 4525 4526 hive = amdgpu_get_xgmi_hive(adev); 4527 /* Update PSP FW topology after reset */ 4528 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4529 r = amdgpu_xgmi_update_topology(hive, adev); 4530 4531 if (hive) 4532 amdgpu_put_xgmi_hive(hive); 4533 4534 if (!r) { 4535 amdgpu_irq_gpu_reset_resume_helper(adev); 4536 r = amdgpu_ib_ring_tests(adev); 4537 4538 amdgpu_amdkfd_post_reset(adev); 4539 } 4540 4541 error: 4542 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4543 amdgpu_inc_vram_lost(adev); 4544 r = amdgpu_device_recover_vram(adev); 4545 } 4546 amdgpu_virt_release_full_gpu(adev, true); 4547 4548 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4549 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4550 retry_limit++; 4551 goto retry; 4552 } else 4553 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4554 } 4555 4556 return r; 4557 } 4558 4559 /** 4560 * amdgpu_device_has_job_running - check if there is any job in mirror list 4561 * 4562 * @adev: amdgpu_device pointer 4563 * 4564 * check if there is any job in mirror list 4565 */ 4566 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4567 { 4568 int i; 4569 struct drm_sched_job *job; 4570 4571 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4572 struct amdgpu_ring *ring = adev->rings[i]; 4573 4574 if (!ring || !ring->sched.thread) 4575 continue; 4576 4577 spin_lock(&ring->sched.job_list_lock); 4578 job = list_first_entry_or_null(&ring->sched.pending_list, 4579 struct drm_sched_job, list); 4580 spin_unlock(&ring->sched.job_list_lock); 4581 if (job) 4582 return true; 4583 } 4584 return false; 4585 } 4586 4587 /** 4588 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4589 * 4590 * @adev: amdgpu_device pointer 4591 * 4592 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4593 * a hung GPU. 
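 * amdgpu_gpu_recovery follows the module parameter: 0 disables recovery
 * entirely, -1 selects the per-ASIC default (disabled on a few older chips),
 * and SR-IOV VFs otherwise always attempt recovery.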
4594 */ 4595 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4596 { 4597 4598 if (amdgpu_gpu_recovery == 0) 4599 goto disabled; 4600 4601 /* Skip soft reset check in fatal error mode */ 4602 if (!amdgpu_ras_is_poison_mode_supported(adev)) 4603 return true; 4604 4605 if (amdgpu_sriov_vf(adev)) 4606 return true; 4607 4608 if (amdgpu_gpu_recovery == -1) { 4609 switch (adev->asic_type) { 4610 #ifdef CONFIG_DRM_AMDGPU_SI 4611 case CHIP_VERDE: 4612 case CHIP_TAHITI: 4613 case CHIP_PITCAIRN: 4614 case CHIP_OLAND: 4615 case CHIP_HAINAN: 4616 #endif 4617 #ifdef CONFIG_DRM_AMDGPU_CIK 4618 case CHIP_KAVERI: 4619 case CHIP_KABINI: 4620 case CHIP_MULLINS: 4621 #endif 4622 case CHIP_CARRIZO: 4623 case CHIP_STONEY: 4624 case CHIP_CYAN_SKILLFISH: 4625 goto disabled; 4626 default: 4627 break; 4628 } 4629 } 4630 4631 return true; 4632 4633 disabled: 4634 dev_info(adev->dev, "GPU recovery disabled.\n"); 4635 return false; 4636 } 4637 4638 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4639 { 4640 u32 i; 4641 int ret = 0; 4642 4643 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4644 4645 dev_info(adev->dev, "GPU mode1 reset\n"); 4646 4647 /* disable BM */ 4648 pci_clear_master(adev->pdev); 4649 4650 amdgpu_device_cache_pci_state(adev->pdev); 4651 4652 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4653 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4654 ret = amdgpu_dpm_mode1_reset(adev); 4655 } else { 4656 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4657 ret = psp_gpu_reset(adev); 4658 } 4659 4660 if (ret) 4661 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4662 4663 amdgpu_device_load_pci_state(adev->pdev); 4664 4665 /* wait for asic to come out of reset */ 4666 for (i = 0; i < adev->usec_timeout; i++) { 4667 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4668 4669 if (memsize != 0xffffffff) 4670 break; 4671 udelay(1); 4672 } 4673 4674 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4675 return ret; 4676 } 4677 4678 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4679 struct amdgpu_reset_context *reset_context) 4680 { 4681 int i, r = 0; 4682 struct amdgpu_job *job = NULL; 4683 bool need_full_reset = 4684 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4685 4686 if (reset_context->reset_req_dev == adev) 4687 job = reset_context->job; 4688 4689 if (amdgpu_sriov_vf(adev)) { 4690 /* stop the data exchange thread */ 4691 amdgpu_virt_fini_data_exchange(adev); 4692 } 4693 4694 amdgpu_fence_driver_isr_toggle(adev, true); 4695 4696 /* block all schedulers and reset given job's ring */ 4697 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4698 struct amdgpu_ring *ring = adev->rings[i]; 4699 4700 if (!ring || !ring->sched.thread) 4701 continue; 4702 4703 /*clear job fence from fence drv to avoid force_completion 4704 *leave NULL and vm flush fence in fence drv */ 4705 amdgpu_fence_driver_clear_job_fences(ring); 4706 4707 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4708 amdgpu_fence_driver_force_completion(ring); 4709 } 4710 4711 amdgpu_fence_driver_isr_toggle(adev, false); 4712 4713 if (job && job->vm) 4714 drm_sched_increase_karma(&job->base); 4715 4716 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4717 /* If reset handler not implemented, continue; otherwise return */ 4718 if (r == -ENOSYS) 4719 r = 0; 4720 else 4721 return r; 4722 4723 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4724 if (!amdgpu_sriov_vf(adev)) { 4725 4726 if (!need_full_reset) 4727 need_full_reset = 
amdgpu_device_ip_need_full_reset(adev); 4728 4729 if (!need_full_reset && amdgpu_gpu_recovery && 4730 amdgpu_device_ip_check_soft_reset(adev)) { 4731 amdgpu_device_ip_pre_soft_reset(adev); 4732 r = amdgpu_device_ip_soft_reset(adev); 4733 amdgpu_device_ip_post_soft_reset(adev); 4734 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4735 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4736 need_full_reset = true; 4737 } 4738 } 4739 4740 if (need_full_reset) 4741 r = amdgpu_device_ip_suspend(adev); 4742 if (need_full_reset) 4743 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4744 else 4745 clear_bit(AMDGPU_NEED_FULL_RESET, 4746 &reset_context->flags); 4747 } 4748 4749 return r; 4750 } 4751 4752 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 4753 { 4754 int i; 4755 4756 lockdep_assert_held(&adev->reset_domain->sem); 4757 4758 for (i = 0; i < adev->num_regs; i++) { 4759 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); 4760 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], 4761 adev->reset_dump_reg_value[i]); 4762 } 4763 4764 return 0; 4765 } 4766 4767 #ifdef CONFIG_DEV_COREDUMP 4768 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, 4769 size_t count, void *data, size_t datalen) 4770 { 4771 struct drm_printer p; 4772 struct amdgpu_device *adev = data; 4773 struct drm_print_iterator iter; 4774 int i; 4775 4776 iter.data = buffer; 4777 iter.offset = 0; 4778 iter.start = offset; 4779 iter.remain = count; 4780 4781 p = drm_coredump_printer(&iter); 4782 4783 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 4784 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 4785 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 4786 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec); 4787 if (adev->reset_task_info.pid) 4788 drm_printf(&p, "process_name: %s PID: %d\n", 4789 adev->reset_task_info.process_name, 4790 adev->reset_task_info.pid); 4791 4792 if (adev->reset_vram_lost) 4793 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 4794 if (adev->num_regs) { 4795 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); 4796 4797 for (i = 0; i < adev->num_regs; i++) 4798 drm_printf(&p, "0x%08x: 0x%08x\n", 4799 adev->reset_dump_reg_list[i], 4800 adev->reset_dump_reg_value[i]); 4801 } 4802 4803 return count - iter.remain; 4804 } 4805 4806 static void amdgpu_devcoredump_free(void *data) 4807 { 4808 } 4809 4810 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) 4811 { 4812 struct drm_device *dev = adev_to_drm(adev); 4813 4814 ktime_get_ts64(&adev->reset_time); 4815 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL, 4816 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 4817 } 4818 #endif 4819 4820 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4821 struct amdgpu_reset_context *reset_context) 4822 { 4823 struct amdgpu_device *tmp_adev = NULL; 4824 bool need_full_reset, skip_hw_reset, vram_lost = false; 4825 int r = 0; 4826 bool gpu_reset_for_dev_remove = 0; 4827 4828 /* Try reset handler method first */ 4829 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4830 reset_list); 4831 amdgpu_reset_reg_dumps(tmp_adev); 4832 4833 reset_context->reset_device_list = device_list_handle; 4834 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4835 /* If reset handler not implemented, continue; otherwise return */ 4836 if (r == -ENOSYS) 4837 r = 0; 4838 else 4839 return r; 4840 4841 /* Reset handler not implemented, use the 
default method */ 4842 need_full_reset = 4843 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4844 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4845 4846 gpu_reset_for_dev_remove = 4847 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 4848 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4849 4850 /* 4851 * ASIC reset has to be done on all XGMI hive nodes ASAP 4852 * to allow proper links negotiation in FW (within 1 sec) 4853 */ 4854 if (!skip_hw_reset && need_full_reset) { 4855 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4856 /* For XGMI run all resets in parallel to speed up the process */ 4857 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4858 tmp_adev->gmc.xgmi.pending_reset = false; 4859 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4860 r = -EALREADY; 4861 } else 4862 r = amdgpu_asic_reset(tmp_adev); 4863 4864 if (r) { 4865 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4866 r, adev_to_drm(tmp_adev)->unique); 4867 break; 4868 } 4869 } 4870 4871 /* For XGMI wait for all resets to complete before proceed */ 4872 if (!r) { 4873 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4874 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4875 flush_work(&tmp_adev->xgmi_reset_work); 4876 r = tmp_adev->asic_reset_res; 4877 if (r) 4878 break; 4879 } 4880 } 4881 } 4882 } 4883 4884 if (!r && amdgpu_ras_intr_triggered()) { 4885 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4886 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && 4887 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 4888 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 4889 } 4890 4891 amdgpu_ras_intr_cleared(); 4892 } 4893 4894 /* Since the mode1 reset affects base ip blocks, the 4895 * phase1 ip blocks need to be resumed. Otherwise there 4896 * will be a BIOS signature error and the psp bootloader 4897 * can't load kdb on the next amdgpu install. 
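	 * This early-resume path is only taken when the reset was requested as
	 * part of device removal (gpu_reset_for_dev_remove); the normal
	 * recovery path continues below.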
	 */
	if (gpu_reset_for_dev_remove) {
		list_for_each_entry(tmp_adev, device_list_handle, reset_list)
			amdgpu_device_ip_resume_phase1(tmp_adev);

		goto end;
	}

	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		if (need_full_reset) {
			/* post card */
			r = amdgpu_device_asic_init(tmp_adev);
			if (r) {
				dev_warn(tmp_adev->dev, "asic atom init failed!");
			} else {
				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
				r = amdgpu_amdkfd_resume_iommu(tmp_adev);
				if (r)
					goto out;

				r = amdgpu_device_ip_resume_phase1(tmp_adev);
				if (r)
					goto out;

				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
#ifdef CONFIG_DEV_COREDUMP
				tmp_adev->reset_vram_lost = vram_lost;
				memset(&tmp_adev->reset_task_info, 0,
				       sizeof(tmp_adev->reset_task_info));
				if (reset_context->job && reset_context->job->vm)
					tmp_adev->reset_task_info =
						reset_context->job->vm->task_info;
				amdgpu_reset_capture_coredumpm(tmp_adev);
#endif
				if (vram_lost) {
					DRM_INFO("VRAM is lost due to GPU reset!\n");
					amdgpu_inc_vram_lost(tmp_adev);
				}

				r = amdgpu_device_fw_loading(tmp_adev);
				if (r)
					return r;

				r = amdgpu_device_ip_resume_phase2(tmp_adev);
				if (r)
					goto out;

				if (vram_lost)
					amdgpu_device_fill_reset_magic(tmp_adev);

				/*
				 * Add this ASIC back to the tracked list now
				 * that the reset has completed successfully.
				 */
				amdgpu_register_gpu_instance(tmp_adev);

				if (!reset_context->hive &&
				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
					amdgpu_xgmi_add_device(tmp_adev);

				r = amdgpu_device_ip_late_init(tmp_adev);
				if (r)
					goto out;

				drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);

				/*
				 * The GPU enters a bad state once the number of
				 * faulty pages flagged by ECC reaches the
				 * threshold, and RAS recovery is scheduled next.
				 * Break recovery here if the bad page threshold
				 * is indeed exceeded, and remind the user to
				 * either retire this GPU or raise
				 * bad_page_threshold before probing the driver
				 * again.
				 */
				if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
					/* must succeed.
*/ 4976 amdgpu_ras_resume(tmp_adev); 4977 } else { 4978 r = -EINVAL; 4979 goto out; 4980 } 4981 4982 /* Update PSP FW topology after reset */ 4983 if (reset_context->hive && 4984 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4985 r = amdgpu_xgmi_update_topology( 4986 reset_context->hive, tmp_adev); 4987 } 4988 } 4989 4990 out: 4991 if (!r) { 4992 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 4993 r = amdgpu_ib_ring_tests(tmp_adev); 4994 if (r) { 4995 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 4996 need_full_reset = true; 4997 r = -EAGAIN; 4998 goto end; 4999 } 5000 } 5001 5002 if (!r) 5003 r = amdgpu_device_recover_vram(tmp_adev); 5004 else 5005 tmp_adev->asic_reset_res = r; 5006 } 5007 5008 end: 5009 if (need_full_reset) 5010 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5011 else 5012 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5013 return r; 5014 } 5015 5016 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5017 { 5018 5019 switch (amdgpu_asic_reset_method(adev)) { 5020 case AMD_RESET_METHOD_MODE1: 5021 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5022 break; 5023 case AMD_RESET_METHOD_MODE2: 5024 adev->mp1_state = PP_MP1_STATE_RESET; 5025 break; 5026 default: 5027 adev->mp1_state = PP_MP1_STATE_NONE; 5028 break; 5029 } 5030 } 5031 5032 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5033 { 5034 amdgpu_vf_error_trans_all(adev); 5035 adev->mp1_state = PP_MP1_STATE_NONE; 5036 } 5037 5038 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5039 { 5040 struct pci_dev *p = NULL; 5041 5042 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5043 adev->pdev->bus->number, 1); 5044 if (p) { 5045 pm_runtime_enable(&(p->dev)); 5046 pm_runtime_resume(&(p->dev)); 5047 } 5048 5049 pci_dev_put(p); 5050 } 5051 5052 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5053 { 5054 enum amd_reset_method reset_method; 5055 struct pci_dev *p = NULL; 5056 u64 expires; 5057 5058 /* 5059 * For now, only BACO and mode1 reset are confirmed 5060 * to suffer the audio issue without proper suspended. 5061 */ 5062 reset_method = amdgpu_asic_reset_method(adev); 5063 if ((reset_method != AMD_RESET_METHOD_BACO) && 5064 (reset_method != AMD_RESET_METHOD_MODE1)) 5065 return -EINVAL; 5066 5067 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5068 adev->pdev->bus->number, 1); 5069 if (!p) 5070 return -ENODEV; 5071 5072 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5073 if (!expires) 5074 /* 5075 * If we cannot get the audio device autosuspend delay, 5076 * a fixed 4S interval will be used. Considering 3S is 5077 * the audio controller default autosuspend delay setting. 5078 * 4S used here is guaranteed to cover that. 5079 */ 5080 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5081 5082 while (!pm_runtime_status_suspended(&(p->dev))) { 5083 if (!pm_runtime_suspend(&(p->dev))) 5084 break; 5085 5086 if (expires < ktime_get_mono_fast_ns()) { 5087 dev_warn(adev->dev, "failed to suspend display audio\n"); 5088 pci_dev_put(p); 5089 /* TODO: abort the succeeding gpu reset? 
*/ 5090 return -ETIMEDOUT; 5091 } 5092 } 5093 5094 pm_runtime_disable(&(p->dev)); 5095 5096 pci_dev_put(p); 5097 return 0; 5098 } 5099 5100 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5101 { 5102 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5103 5104 #if defined(CONFIG_DEBUG_FS) 5105 if (!amdgpu_sriov_vf(adev)) 5106 cancel_work(&adev->reset_work); 5107 #endif 5108 5109 if (adev->kfd.dev) 5110 cancel_work(&adev->kfd.reset_work); 5111 5112 if (amdgpu_sriov_vf(adev)) 5113 cancel_work(&adev->virt.flr_work); 5114 5115 if (con && adev->ras_enabled) 5116 cancel_work(&con->recovery_work); 5117 5118 } 5119 5120 /** 5121 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5122 * 5123 * @adev: amdgpu_device pointer 5124 * @job: which job trigger hang 5125 * 5126 * Attempt to reset the GPU if it has hung (all asics). 5127 * Attempt to do soft-reset or full-reset and reinitialize Asic 5128 * Returns 0 for success or an error on failure. 5129 */ 5130 5131 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5132 struct amdgpu_job *job, 5133 struct amdgpu_reset_context *reset_context) 5134 { 5135 struct list_head device_list, *device_list_handle = NULL; 5136 bool job_signaled = false; 5137 struct amdgpu_hive_info *hive = NULL; 5138 struct amdgpu_device *tmp_adev = NULL; 5139 int i, r = 0; 5140 bool need_emergency_restart = false; 5141 bool audio_suspended = false; 5142 bool gpu_reset_for_dev_remove = false; 5143 5144 gpu_reset_for_dev_remove = 5145 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5146 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5147 5148 /* 5149 * Special case: RAS triggered and full reset isn't supported 5150 */ 5151 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5152 5153 /* 5154 * Flush RAM to disk so that after reboot 5155 * the user can read log and see why the system rebooted. 5156 */ 5157 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5158 DRM_WARN("Emergency reboot."); 5159 5160 ksys_sync_helper(); 5161 emergency_restart(); 5162 } 5163 5164 dev_info(adev->dev, "GPU %s begin!\n", 5165 need_emergency_restart ? "jobs stop":"reset"); 5166 5167 if (!amdgpu_sriov_vf(adev)) 5168 hive = amdgpu_get_xgmi_hive(adev); 5169 if (hive) 5170 mutex_lock(&hive->hive_lock); 5171 5172 reset_context->job = job; 5173 reset_context->hive = hive; 5174 /* 5175 * Build list of devices to reset. 5176 * In case we are in XGMI hive mode, resort the device list 5177 * to put adev in the 1st position. 
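	 * The device that triggered the reset is rotated to the front; the
	 * reset domain is then locked once, through the first entry of this
	 * list, for both the XGMI and single-device cases.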
	 */
	INIT_LIST_HEAD(&device_list);
	if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			list_add_tail(&tmp_adev->reset_list, &device_list);
			if (gpu_reset_for_dev_remove && adev->shutdown)
				tmp_adev->shutdown = true;
		}
		if (!list_is_first(&adev->reset_list, &device_list))
			list_rotate_to_front(&adev->reset_list, &device_list);
		device_list_handle = &device_list;
	} else {
		list_add_tail(&adev->reset_list, &device_list);
		device_list_handle = &device_list;
	}

	/* We need to lock reset domain only once both for XGMI and single device */
	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
				    reset_list);
	amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);

	/* block all schedulers and reset given job's ring */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {

		amdgpu_device_set_mp1_state(tmp_adev);

		/*
		 * Try to put the audio codec into suspend state
		 * before the GPU reset starts.
		 *
		 * The power domain of the graphics device is shared
		 * with the AZ power domain. Without this, we may
		 * change the audio hardware from behind the audio
		 * driver's back and trigger audio codec errors.
		 */
		if (!amdgpu_device_suspend_display_audio(tmp_adev))
			audio_suspended = true;

		amdgpu_ras_set_error_query_ready(tmp_adev, false);

		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);

		if (!amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_pre_reset(tmp_adev);

		/*
		 * Mark these ASICs as untracked first, and add them
		 * back once the reset has completed.
		 */
		amdgpu_unregister_gpu_instance(tmp_adev);

		drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);

		/* disable ras on ALL IPs */
		if (!need_emergency_restart &&
		    amdgpu_device_ip_need_full_reset(tmp_adev))
			amdgpu_ras_suspend(tmp_adev);

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, job ? &job->base : NULL);

			if (need_emergency_restart)
				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
		}
		atomic_inc(&tmp_adev->gpu_reset_counter);
	}

	if (need_emergency_restart)
		goto skip_sched_resume;

	/*
	 * Must check guilty signal here since after this point all old
	 * HW fences are force signaled.
	 *
	 * job->base holds a reference to parent fence
	 */
	if (job && dma_fence_is_signaled(&job->hw_fence)) {
		job_signaled = true;
		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
		goto skip_hw_reset;
	}

retry:	/* Rest of adevs pre asic reset from XGMI hive. */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		if (gpu_reset_for_dev_remove) {
			/* Workaround for ASICs that need to disable the SMC first */
			amdgpu_device_smu_fini_early(tmp_adev);
		}
		r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
		/* TODO: should we stop here on error? */
		if (r) {
			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
				r, adev_to_drm(tmp_adev)->unique);
			tmp_adev->asic_reset_res = r;
		}

		/*
		 * Drop all pending non-scheduler resets. Scheduler resets
		 * were already dropped during drm_sched_stop().
		 */
		amdgpu_device_stop_pending_resets(tmp_adev);
	}

	/* Actual ASIC resets if needed. */
	/* Host driver will handle XGMI hive reset for SRIOV */
	if (amdgpu_sriov_vf(adev)) {
		r = amdgpu_device_reset_sriov(adev, job ? false : true);
		if (r)
			adev->asic_reset_res = r;

		/* Aldebaran supports ras in SRIOV, so need resume ras during reset */
		if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))
			amdgpu_ras_resume(adev);
	} else {
		r = amdgpu_do_asic_reset(device_list_handle, reset_context);
		if (r && r == -EAGAIN)
			goto retry;

		if (!r && gpu_reset_for_dev_remove)
			goto recover_end;
	}

skip_hw_reset:

	/* Post ASIC reset for all devs. */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_start(&ring->sched, true);
		}

		if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3))
			amdgpu_mes_self_test(tmp_adev);

		if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) {
			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
		}

		if (tmp_adev->asic_reset_res)
			r = tmp_adev->asic_reset_res;

		tmp_adev->asic_reset_res = 0;

		if (r) {
			/* bad news, how to tell it to userspace ? */
			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
			if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
				DRM_WARN("smart shift update failed\n");
		}
	}

skip_sched_resume:
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		/* unlock kfd: SRIOV would do it separately */
		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);

		/* kfd_post_reset will do nothing if kfd device is not initialized,
		 * so bring up kfd here if it was not initialized before
		 */
		if (!adev->kfd.init_complete)
			amdgpu_amdkfd_device_init(adev);

		if (audio_suspended)
			amdgpu_device_resume_display_audio(tmp_adev);

		amdgpu_device_unset_mp1_state(tmp_adev);

		amdgpu_ras_set_error_query_ready(tmp_adev, true);
	}

recover_end:
	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
				    reset_list);
	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);

	if (hive) {
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}

	if (r)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);

	atomic_set(&adev->reset_domain->reset_res, r);
	return r;
}

/**
 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
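 * The probed values can be overridden with the amdgpu_pcie_gen_cap and
 * amdgpu_pcie_lane_cap module parameters, which are applied first below.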
5388 */ 5389 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5390 { 5391 struct pci_dev *pdev; 5392 enum pci_bus_speed speed_cap, platform_speed_cap; 5393 enum pcie_link_width platform_link_width; 5394 5395 if (amdgpu_pcie_gen_cap) 5396 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5397 5398 if (amdgpu_pcie_lane_cap) 5399 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5400 5401 /* covers APUs as well */ 5402 if (pci_is_root_bus(adev->pdev->bus)) { 5403 if (adev->pm.pcie_gen_mask == 0) 5404 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5405 if (adev->pm.pcie_mlw_mask == 0) 5406 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5407 return; 5408 } 5409 5410 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5411 return; 5412 5413 pcie_bandwidth_available(adev->pdev, NULL, 5414 &platform_speed_cap, &platform_link_width); 5415 5416 if (adev->pm.pcie_gen_mask == 0) { 5417 /* asic caps */ 5418 pdev = adev->pdev; 5419 speed_cap = pcie_get_speed_cap(pdev); 5420 if (speed_cap == PCI_SPEED_UNKNOWN) { 5421 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5422 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5423 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5424 } else { 5425 if (speed_cap == PCIE_SPEED_32_0GT) 5426 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5427 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5428 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5429 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5430 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5431 else if (speed_cap == PCIE_SPEED_16_0GT) 5432 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5433 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5434 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5435 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5436 else if (speed_cap == PCIE_SPEED_8_0GT) 5437 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5438 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5439 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5440 else if (speed_cap == PCIE_SPEED_5_0GT) 5441 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5442 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5443 else 5444 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5445 } 5446 /* platform caps */ 5447 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5448 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5449 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5450 } else { 5451 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5452 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5453 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5454 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5455 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5456 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5457 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5458 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5459 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5460 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5461 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5462 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5463 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5464 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5465 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5466 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5467 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5468 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5469 else 5470 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5471 5472 } 5473 } 5474 if (adev->pm.pcie_mlw_mask == 0) { 5475 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 5476 adev->pm.pcie_mlw_mask 
|= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5477 } else { 5478 switch (platform_link_width) { 5479 case PCIE_LNK_X32: 5480 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5481 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5482 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5483 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5484 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5485 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5486 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5487 break; 5488 case PCIE_LNK_X16: 5489 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5490 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5491 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5492 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5493 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5494 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5495 break; 5496 case PCIE_LNK_X12: 5497 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5498 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5499 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5500 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5501 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5502 break; 5503 case PCIE_LNK_X8: 5504 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5505 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5506 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5507 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5508 break; 5509 case PCIE_LNK_X4: 5510 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5511 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5512 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5513 break; 5514 case PCIE_LNK_X2: 5515 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5516 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5517 break; 5518 case PCIE_LNK_X1: 5519 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5520 break; 5521 default: 5522 break; 5523 } 5524 } 5525 } 5526 } 5527 5528 /** 5529 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 5530 * 5531 * @adev: amdgpu_device pointer 5532 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 5533 * 5534 * Return true if @peer_adev can access (DMA) @adev through the PCIe 5535 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 5536 * @peer_adev. 5537 */ 5538 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 5539 struct amdgpu_device *peer_adev) 5540 { 5541 #ifdef CONFIG_HSA_AMD_P2P 5542 uint64_t address_mask = peer_adev->dev->dma_mask ? 
5543 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 5544 resource_size_t aper_limit = 5545 adev->gmc.aper_base + adev->gmc.aper_size - 1; 5546 bool p2p_access = 5547 !adev->gmc.xgmi.connected_to_cpu && 5548 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 5549 5550 return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size && 5551 adev->gmc.real_vram_size == adev->gmc.visible_vram_size && 5552 !(adev->gmc.aper_base & address_mask || 5553 aper_limit & address_mask)); 5554 #else 5555 return false; 5556 #endif 5557 } 5558 5559 int amdgpu_device_baco_enter(struct drm_device *dev) 5560 { 5561 struct amdgpu_device *adev = drm_to_adev(dev); 5562 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5563 5564 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5565 return -ENOTSUPP; 5566 5567 if (ras && adev->ras_enabled && 5568 adev->nbio.funcs->enable_doorbell_interrupt) 5569 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5570 5571 return amdgpu_dpm_baco_enter(adev); 5572 } 5573 5574 int amdgpu_device_baco_exit(struct drm_device *dev) 5575 { 5576 struct amdgpu_device *adev = drm_to_adev(dev); 5577 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5578 int ret = 0; 5579 5580 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5581 return -ENOTSUPP; 5582 5583 ret = amdgpu_dpm_baco_exit(adev); 5584 if (ret) 5585 return ret; 5586 5587 if (ras && adev->ras_enabled && 5588 adev->nbio.funcs->enable_doorbell_interrupt) 5589 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5590 5591 if (amdgpu_passthrough(adev) && 5592 adev->nbio.funcs->clear_doorbell_interrupt) 5593 adev->nbio.funcs->clear_doorbell_interrupt(adev); 5594 5595 return 0; 5596 } 5597 5598 /** 5599 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5600 * @pdev: PCI device struct 5601 * @state: PCI channel state 5602 * 5603 * Description: Called when a PCI error is detected. 5604 * 5605 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
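 * pci_channel_io_normal is reported as recoverable, pci_channel_io_frozen
 * stops all schedulers and locks the reset domain before requesting a slot
 * reset, and pci_channel_io_perm_failure asks for the device to be
 * disconnected.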
5606 */ 5607 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 5608 { 5609 struct drm_device *dev = pci_get_drvdata(pdev); 5610 struct amdgpu_device *adev = drm_to_adev(dev); 5611 int i; 5612 5613 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 5614 5615 if (adev->gmc.xgmi.num_physical_nodes > 1) { 5616 DRM_WARN("No support for XGMI hive yet..."); 5617 return PCI_ERS_RESULT_DISCONNECT; 5618 } 5619 5620 adev->pci_channel_state = state; 5621 5622 switch (state) { 5623 case pci_channel_io_normal: 5624 return PCI_ERS_RESULT_CAN_RECOVER; 5625 /* Fatal error, prepare for slot reset */ 5626 case pci_channel_io_frozen: 5627 /* 5628 * Locking adev->reset_domain->sem will prevent any external access 5629 * to GPU during PCI error recovery 5630 */ 5631 amdgpu_device_lock_reset_domain(adev->reset_domain); 5632 amdgpu_device_set_mp1_state(adev); 5633 5634 /* 5635 * Block any work scheduling as we do for regular GPU reset 5636 * for the duration of the recovery 5637 */ 5638 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5639 struct amdgpu_ring *ring = adev->rings[i]; 5640 5641 if (!ring || !ring->sched.thread) 5642 continue; 5643 5644 drm_sched_stop(&ring->sched, NULL); 5645 } 5646 atomic_inc(&adev->gpu_reset_counter); 5647 return PCI_ERS_RESULT_NEED_RESET; 5648 case pci_channel_io_perm_failure: 5649 /* Permanent error, prepare for device removal */ 5650 return PCI_ERS_RESULT_DISCONNECT; 5651 } 5652 5653 return PCI_ERS_RESULT_NEED_RESET; 5654 } 5655 5656 /** 5657 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 5658 * @pdev: pointer to PCI device 5659 */ 5660 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 5661 { 5662 5663 DRM_INFO("PCI error: mmio enabled callback!!\n"); 5664 5665 /* TODO - dump whatever for debugging purposes */ 5666 5667 /* This called only if amdgpu_pci_error_detected returns 5668 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 5669 * works, no need to reset slot. 5670 */ 5671 5672 return PCI_ERS_RESULT_RECOVERED; 5673 } 5674 5675 /** 5676 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 5677 * @pdev: PCI device struct 5678 * 5679 * Description: This routine is called by the pci error recovery 5680 * code after the PCI slot has been reset, just before we 5681 * should resume normal operations. 
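 * The cached PCI config space is restored, the ASIC is polled until it
 * responds again, and then a full reset is performed with the hardware
 * reset step itself skipped (AMDGPU_SKIP_HW_RESET).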
5682 */ 5683 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 5684 { 5685 struct drm_device *dev = pci_get_drvdata(pdev); 5686 struct amdgpu_device *adev = drm_to_adev(dev); 5687 int r, i; 5688 struct amdgpu_reset_context reset_context; 5689 u32 memsize; 5690 struct list_head device_list; 5691 5692 DRM_INFO("PCI error: slot reset callback!!\n"); 5693 5694 memset(&reset_context, 0, sizeof(reset_context)); 5695 5696 INIT_LIST_HEAD(&device_list); 5697 list_add_tail(&adev->reset_list, &device_list); 5698 5699 /* wait for asic to come out of reset */ 5700 msleep(500); 5701 5702 /* Restore PCI confspace */ 5703 amdgpu_device_load_pci_state(pdev); 5704 5705 /* confirm ASIC came out of reset */ 5706 for (i = 0; i < adev->usec_timeout; i++) { 5707 memsize = amdgpu_asic_get_config_memsize(adev); 5708 5709 if (memsize != 0xffffffff) 5710 break; 5711 udelay(1); 5712 } 5713 if (memsize == 0xffffffff) { 5714 r = -ETIME; 5715 goto out; 5716 } 5717 5718 reset_context.method = AMD_RESET_METHOD_NONE; 5719 reset_context.reset_req_dev = adev; 5720 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 5721 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 5722 5723 adev->no_hw_access = true; 5724 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 5725 adev->no_hw_access = false; 5726 if (r) 5727 goto out; 5728 5729 r = amdgpu_do_asic_reset(&device_list, &reset_context); 5730 5731 out: 5732 if (!r) { 5733 if (amdgpu_device_cache_pci_state(adev->pdev)) 5734 pci_restore_state(adev->pdev); 5735 5736 DRM_INFO("PCIe error recovery succeeded\n"); 5737 } else { 5738 DRM_ERROR("PCIe error recovery failed, err:%d", r); 5739 amdgpu_device_unset_mp1_state(adev); 5740 amdgpu_device_unlock_reset_domain(adev->reset_domain); 5741 } 5742 5743 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 5744 } 5745 5746 /** 5747 * amdgpu_pci_resume() - resume normal ops after PCI reset 5748 * @pdev: pointer to PCI device 5749 * 5750 * Called when the error recovery driver tells us that its 5751 * OK to resume normal operation. 
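 * Only the pci_channel_io_frozen case needs work here: the schedulers that
 * were stopped in amdgpu_pci_error_detected() are restarted and the reset
 * domain is unlocked.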
/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it is OK to
 * resume normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		drm_sched_start(&ring->sched, true);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
}

bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
			     struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
				  struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}
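/*
 * Illustrative sketch (not part of this file): the four PCI error callbacks
 * defined above are meant to be hung off the driver's struct pci_driver
 * through a struct pci_error_handlers; in the real driver that wiring lives
 * in the driver registration code (amdgpu_drv.c). The name below is for
 * illustration only.
 */
static const struct pci_error_handlers amdgpu_pci_err_handler_example __maybe_unused = {
	.error_detected	= amdgpu_pci_error_detected,
	.mmio_enabled	= amdgpu_pci_mmio_enabled,
	.slot_reset	= amdgpu_pci_slot_reset,
	.resume		= amdgpu_pci_resume,
};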
/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It helps to maintain error context when an error occurs.
 * Compared to a simple hang, the system will stay stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs,
 *    etc.), clears all CPU mappings to the device and disallows remappings
 *    through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in-flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!adev->ip_versions[DCE_HWIP][0] ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}
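/*
 * Illustrative sketch (hypothetical helper, not part of the driver): how a
 * submission path might use amdgpu_device_switch_gang() above. When a
 * different, still-running gang leader is returned, the caller has to wait
 * for it and drop the reference before its own gang can take over.
 */
static int __maybe_unused amdgpu_device_example_join_gang(struct amdgpu_device *adev,
							  struct dma_fence *gang)
{
	struct dma_fence *old = amdgpu_device_switch_gang(adev, gang);
	long r = 0;

	if (old) {
		/* Wait (interruptibly) for the previous gang leader to finish. */
		r = dma_fence_wait(old, true);
		dma_fence_put(old);
	}

	return r < 0 ? r : 0;
}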