1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 #include <linux/iommu.h> 34 #include <linux/pci.h> 35 #include <linux/devcoredump.h> 36 #include <generated/utsrelease.h> 37 #include <linux/pci-p2pdma.h> 38 39 #include <drm/drm_atomic_helper.h> 40 #include <drm/drm_crtc_helper.h> 41 #include <drm/drm_fb_helper.h> 42 #include <drm/drm_probe_helper.h> 43 #include <drm/amdgpu_drm.h> 44 #include <linux/vgaarb.h> 45 #include <linux/vga_switcheroo.h> 46 #include <linux/efi.h> 47 #include "amdgpu.h" 48 #include "amdgpu_trace.h" 49 #include "amdgpu_i2c.h" 50 #include "atom.h" 51 #include "amdgpu_atombios.h" 52 #include "amdgpu_atomfirmware.h" 53 #include "amd_pcie.h" 54 #ifdef CONFIG_DRM_AMDGPU_SI 55 #include "si.h" 56 #endif 57 #ifdef CONFIG_DRM_AMDGPU_CIK 58 #include "cik.h" 59 #endif 60 #include "vi.h" 61 #include "soc15.h" 62 #include "nv.h" 63 #include "bif/bif_4_1_d.h" 64 #include <linux/firmware.h> 65 #include "amdgpu_vf_error.h" 66 67 #include "amdgpu_amdkfd.h" 68 #include "amdgpu_pm.h" 69 70 #include "amdgpu_xgmi.h" 71 #include "amdgpu_ras.h" 72 #include "amdgpu_pmu.h" 73 #include "amdgpu_fru_eeprom.h" 74 #include "amdgpu_reset.h" 75 76 #include <linux/suspend.h> 77 #include <drm/task_barrier.h> 78 #include <linux/pm_runtime.h> 79 80 #include <drm/drm_drv.h> 81 82 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 83 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 84 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 85 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 86 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 87 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 88 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 89 90 #define AMDGPU_RESUME_MS 2000 91 #define AMDGPU_MAX_RETRY_LIMIT 2 92 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 93 94 const char *amdgpu_asic_name[] = { 95 "TAHITI", 96 "PITCAIRN", 97 "VERDE", 98 "OLAND", 99 "HAINAN", 100 "BONAIRE", 101 "KAVERI", 102 "KABINI", 103 "HAWAII", 104 "MULLINS", 105 "TOPAZ", 106 "TONGA", 107 "FIJI", 108 "CARRIZO", 109 "STONEY", 110 "POLARIS10", 111 "POLARIS11", 112 "POLARIS12", 113 "VEGAM", 114 "VEGA10", 115 "VEGA12", 116 
"VEGA20", 117 "RAVEN", 118 "ARCTURUS", 119 "RENOIR", 120 "ALDEBARAN", 121 "NAVI10", 122 "CYAN_SKILLFISH", 123 "NAVI14", 124 "NAVI12", 125 "SIENNA_CICHLID", 126 "NAVY_FLOUNDER", 127 "VANGOGH", 128 "DIMGREY_CAVEFISH", 129 "BEIGE_GOBY", 130 "YELLOW_CARP", 131 "IP DISCOVERY", 132 "LAST", 133 }; 134 135 /** 136 * DOC: pcie_replay_count 137 * 138 * The amdgpu driver provides a sysfs API for reporting the total number 139 * of PCIe replays (NAKs) 140 * The file pcie_replay_count is used for this and returns the total 141 * number of replays as a sum of the NAKs generated and NAKs received 142 */ 143 144 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 145 struct device_attribute *attr, char *buf) 146 { 147 struct drm_device *ddev = dev_get_drvdata(dev); 148 struct amdgpu_device *adev = drm_to_adev(ddev); 149 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 150 151 return sysfs_emit(buf, "%llu\n", cnt); 152 } 153 154 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 155 amdgpu_device_get_pcie_replay_count, NULL); 156 157 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 158 159 /** 160 * DOC: product_name 161 * 162 * The amdgpu driver provides a sysfs API for reporting the product name 163 * for the device 164 * The file serial_number is used for this and returns the product name 165 * as returned from the FRU. 166 * NOTE: This is only available for certain server cards 167 */ 168 169 static ssize_t amdgpu_device_get_product_name(struct device *dev, 170 struct device_attribute *attr, char *buf) 171 { 172 struct drm_device *ddev = dev_get_drvdata(dev); 173 struct amdgpu_device *adev = drm_to_adev(ddev); 174 175 return sysfs_emit(buf, "%s\n", adev->product_name); 176 } 177 178 static DEVICE_ATTR(product_name, S_IRUGO, 179 amdgpu_device_get_product_name, NULL); 180 181 /** 182 * DOC: product_number 183 * 184 * The amdgpu driver provides a sysfs API for reporting the part number 185 * for the device 186 * The file serial_number is used for this and returns the part number 187 * as returned from the FRU. 188 * NOTE: This is only available for certain server cards 189 */ 190 191 static ssize_t amdgpu_device_get_product_number(struct device *dev, 192 struct device_attribute *attr, char *buf) 193 { 194 struct drm_device *ddev = dev_get_drvdata(dev); 195 struct amdgpu_device *adev = drm_to_adev(ddev); 196 197 return sysfs_emit(buf, "%s\n", adev->product_number); 198 } 199 200 static DEVICE_ATTR(product_number, S_IRUGO, 201 amdgpu_device_get_product_number, NULL); 202 203 /** 204 * DOC: serial_number 205 * 206 * The amdgpu driver provides a sysfs API for reporting the serial number 207 * for the device 208 * The file serial_number is used for this and returns the serial number 209 * as returned from the FRU. 210 * NOTE: This is only available for certain server cards 211 */ 212 213 static ssize_t amdgpu_device_get_serial_number(struct device *dev, 214 struct device_attribute *attr, char *buf) 215 { 216 struct drm_device *ddev = dev_get_drvdata(dev); 217 struct amdgpu_device *adev = drm_to_adev(ddev); 218 219 return sysfs_emit(buf, "%s\n", adev->serial); 220 } 221 222 static DEVICE_ATTR(serial_number, S_IRUGO, 223 amdgpu_device_get_serial_number, NULL); 224 225 /** 226 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 227 * 228 * @dev: drm_device pointer 229 * 230 * Returns true if the device is a dGPU with ATPX power control, 231 * otherwise return false. 
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise return false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by the VRAM aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes that have been transferred.
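 *
 * Hedged example of the intended calling pattern (buffer and offset are
 * made up for illustration):
 *
 *   u32 tmp[4];
 *   size_t done = amdgpu_device_aper_access(adev, 0x1000, tmp,
 *                                           sizeof(tmp), false);
 *
 * done may be smaller than @size (or 0 on !CONFIG_64BIT or when the
 * aperture is not CPU visible), in which case the caller is expected to
 * fall back to amdgpu_device_mm_access() for the remainder, exactly as
 * amdgpu_device_vram_access() does below.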
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the VRAM aperture to access VRAM first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of VRAM */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore; if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
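 *
 * Illustrative note: most code does not call this helper directly but uses
 * the register macros from amdgpu.h (assumed here to expand to this
 * function), e.g.
 *
 *   u32 val = RREG32(mmMM_INDEX);           // honours the KIQ/SR-IOV path
 *   u32 raw = RREG32_NO_KIQ(mmMM_INDEX);    // plain MMIO read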
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */
/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
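 *
 * Illustrative note: callers normally go through the WREG32()/WREG32_NO_KIQ()
 * macros from amdgpu.h (assumed to wrap this helper), e.g.
 *
 *   WREG32(mmMM_INDEX, 0);            // routed through KIQ when required
 *   WREG32_NO_KIQ(mmMM_INDEX, 0);     // direct MMIO write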
532 */ 533 void amdgpu_device_wreg(struct amdgpu_device *adev, 534 uint32_t reg, uint32_t v, 535 uint32_t acc_flags) 536 { 537 if (amdgpu_device_skip_hw_access(adev)) 538 return; 539 540 if ((reg * 4) < adev->rmmio_size) { 541 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 542 amdgpu_sriov_runtime(adev) && 543 down_read_trylock(&adev->reset_domain->sem)) { 544 amdgpu_kiq_wreg(adev, reg, v); 545 up_read(&adev->reset_domain->sem); 546 } else { 547 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 548 } 549 } else { 550 adev->pcie_wreg(adev, reg * 4, v); 551 } 552 553 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 554 } 555 556 /** 557 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 558 * 559 * @adev: amdgpu_device pointer 560 * @reg: mmio/rlc register 561 * @v: value to write 562 * 563 * this function is invoked only for the debugfs register access 564 */ 565 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 566 uint32_t reg, uint32_t v) 567 { 568 if (amdgpu_device_skip_hw_access(adev)) 569 return; 570 571 if (amdgpu_sriov_fullaccess(adev) && 572 adev->gfx.rlc.funcs && 573 adev->gfx.rlc.funcs->is_rlcg_access_range) { 574 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 575 return amdgpu_sriov_wreg(adev, reg, v, 0, 0); 576 } else if ((reg * 4) >= adev->rmmio_size) { 577 adev->pcie_wreg(adev, reg * 4, v); 578 } else { 579 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 580 } 581 } 582 583 /** 584 * amdgpu_mm_rdoorbell - read a doorbell dword 585 * 586 * @adev: amdgpu_device pointer 587 * @index: doorbell index 588 * 589 * Returns the value in the doorbell aperture at the 590 * requested doorbell index (CIK). 591 */ 592 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) 593 { 594 if (amdgpu_device_skip_hw_access(adev)) 595 return 0; 596 597 if (index < adev->doorbell.num_doorbells) { 598 return readl(adev->doorbell.ptr + index); 599 } else { 600 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 601 return 0; 602 } 603 } 604 605 /** 606 * amdgpu_mm_wdoorbell - write a doorbell dword 607 * 608 * @adev: amdgpu_device pointer 609 * @index: doorbell index 610 * @v: value to write 611 * 612 * Writes @v to the doorbell aperture at the 613 * requested doorbell index (CIK). 614 */ 615 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) 616 { 617 if (amdgpu_device_skip_hw_access(adev)) 618 return; 619 620 if (index < adev->doorbell.num_doorbells) { 621 writel(v, adev->doorbell.ptr + index); 622 } else { 623 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 624 } 625 } 626 627 /** 628 * amdgpu_mm_rdoorbell64 - read a doorbell Qword 629 * 630 * @adev: amdgpu_device pointer 631 * @index: doorbell index 632 * 633 * Returns the value in the doorbell aperture at the 634 * requested doorbell index (VEGA10+). 635 */ 636 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) 637 { 638 if (amdgpu_device_skip_hw_access(adev)) 639 return 0; 640 641 if (index < adev->doorbell.num_doorbells) { 642 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); 643 } else { 644 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 645 return 0; 646 } 647 } 648 649 /** 650 * amdgpu_mm_wdoorbell64 - write a doorbell Qword 651 * 652 * @adev: amdgpu_device pointer 653 * @index: doorbell index 654 * @v: value to write 655 * 656 * Writes @v to the doorbell aperture at the 657 * requested doorbell index (VEGA10+). 
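 *
 * Hedged usage sketch (the ring fields are assumed from the ring code, not
 * from this excerpt): ring backends typically publish a new write pointer
 * with something like
 *
 *   amdgpu_mm_wdoorbell64(adev, ring->doorbell_index, ring->wptr);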
658 */ 659 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) 660 { 661 if (amdgpu_device_skip_hw_access(adev)) 662 return; 663 664 if (index < adev->doorbell.num_doorbells) { 665 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); 666 } else { 667 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 668 } 669 } 670 671 /** 672 * amdgpu_device_indirect_rreg - read an indirect register 673 * 674 * @adev: amdgpu_device pointer 675 * @pcie_index: mmio register offset 676 * @pcie_data: mmio register offset 677 * @reg_addr: indirect register address to read from 678 * 679 * Returns the value of indirect register @reg_addr 680 */ 681 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 682 u32 pcie_index, u32 pcie_data, 683 u32 reg_addr) 684 { 685 unsigned long flags; 686 u32 r; 687 void __iomem *pcie_index_offset; 688 void __iomem *pcie_data_offset; 689 690 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 691 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 692 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 693 694 writel(reg_addr, pcie_index_offset); 695 readl(pcie_index_offset); 696 r = readl(pcie_data_offset); 697 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 698 699 return r; 700 } 701 702 /** 703 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 704 * 705 * @adev: amdgpu_device pointer 706 * @pcie_index: mmio register offset 707 * @pcie_data: mmio register offset 708 * @reg_addr: indirect register address to read from 709 * 710 * Returns the value of indirect register @reg_addr 711 */ 712 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 713 u32 pcie_index, u32 pcie_data, 714 u32 reg_addr) 715 { 716 unsigned long flags; 717 u64 r; 718 void __iomem *pcie_index_offset; 719 void __iomem *pcie_data_offset; 720 721 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 722 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 723 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 724 725 /* read low 32 bits */ 726 writel(reg_addr, pcie_index_offset); 727 readl(pcie_index_offset); 728 r = readl(pcie_data_offset); 729 /* read high 32 bits */ 730 writel(reg_addr + 4, pcie_index_offset); 731 readl(pcie_index_offset); 732 r |= ((u64)readl(pcie_data_offset) << 32); 733 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 734 735 return r; 736 } 737 738 /** 739 * amdgpu_device_indirect_wreg - write an indirect register address 740 * 741 * @adev: amdgpu_device pointer 742 * @pcie_index: mmio register offset 743 * @pcie_data: mmio register offset 744 * @reg_addr: indirect register offset 745 * @reg_data: indirect register data 746 * 747 */ 748 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 749 u32 pcie_index, u32 pcie_data, 750 u32 reg_addr, u32 reg_data) 751 { 752 unsigned long flags; 753 void __iomem *pcie_index_offset; 754 void __iomem *pcie_data_offset; 755 756 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 757 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 758 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 759 760 writel(reg_addr, pcie_index_offset); 761 readl(pcie_index_offset); 762 writel(reg_data, pcie_data_offset); 763 readl(pcie_data_offset); 764 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 765 } 766 767 /** 768 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 769 * 770 * @adev: amdgpu_device pointer 771 * @pcie_index: mmio register offset 772 * @pcie_data: mmio register 
offset 773 * @reg_addr: indirect register offset 774 * @reg_data: indirect register data 775 * 776 */ 777 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 778 u32 pcie_index, u32 pcie_data, 779 u32 reg_addr, u64 reg_data) 780 { 781 unsigned long flags; 782 void __iomem *pcie_index_offset; 783 void __iomem *pcie_data_offset; 784 785 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 786 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 787 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 788 789 /* write low 32 bits */ 790 writel(reg_addr, pcie_index_offset); 791 readl(pcie_index_offset); 792 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 793 readl(pcie_data_offset); 794 /* write high 32 bits */ 795 writel(reg_addr + 4, pcie_index_offset); 796 readl(pcie_index_offset); 797 writel((u32)(reg_data >> 32), pcie_data_offset); 798 readl(pcie_data_offset); 799 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 800 } 801 802 /** 803 * amdgpu_invalid_rreg - dummy reg read function 804 * 805 * @adev: amdgpu_device pointer 806 * @reg: offset of register 807 * 808 * Dummy register read function. Used for register blocks 809 * that certain asics don't have (all asics). 810 * Returns the value in the register. 811 */ 812 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 813 { 814 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 815 BUG(); 816 return 0; 817 } 818 819 /** 820 * amdgpu_invalid_wreg - dummy reg write function 821 * 822 * @adev: amdgpu_device pointer 823 * @reg: offset of register 824 * @v: value to write to the register 825 * 826 * Dummy register read function. Used for register blocks 827 * that certain asics don't have (all asics). 828 */ 829 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 830 { 831 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 832 reg, v); 833 BUG(); 834 } 835 836 /** 837 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 838 * 839 * @adev: amdgpu_device pointer 840 * @reg: offset of register 841 * 842 * Dummy register read function. Used for register blocks 843 * that certain asics don't have (all asics). 844 * Returns the value in the register. 845 */ 846 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 847 { 848 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 849 BUG(); 850 return 0; 851 } 852 853 /** 854 * amdgpu_invalid_wreg64 - dummy reg write function 855 * 856 * @adev: amdgpu_device pointer 857 * @reg: offset of register 858 * @v: value to write to the register 859 * 860 * Dummy register read function. Used for register blocks 861 * that certain asics don't have (all asics). 862 */ 863 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 864 { 865 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 866 reg, v); 867 BUG(); 868 } 869 870 /** 871 * amdgpu_block_invalid_rreg - dummy reg read function 872 * 873 * @adev: amdgpu_device pointer 874 * @block: offset of instance 875 * @reg: offset of register 876 * 877 * Dummy register read function. Used for register blocks 878 * that certain asics don't have (all asics). 879 * Returns the value in the register. 
880 */ 881 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 882 uint32_t block, uint32_t reg) 883 { 884 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 885 reg, block); 886 BUG(); 887 return 0; 888 } 889 890 /** 891 * amdgpu_block_invalid_wreg - dummy reg write function 892 * 893 * @adev: amdgpu_device pointer 894 * @block: offset of instance 895 * @reg: offset of register 896 * @v: value to write to the register 897 * 898 * Dummy register read function. Used for register blocks 899 * that certain asics don't have (all asics). 900 */ 901 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 902 uint32_t block, 903 uint32_t reg, uint32_t v) 904 { 905 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 906 reg, block, v); 907 BUG(); 908 } 909 910 /** 911 * amdgpu_device_asic_init - Wrapper for atom asic_init 912 * 913 * @adev: amdgpu_device pointer 914 * 915 * Does any asic specific work and then calls atom asic init. 916 */ 917 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 918 { 919 amdgpu_asic_pre_asic_init(adev); 920 921 if (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) 922 return amdgpu_atomfirmware_asic_init(adev, true); 923 else 924 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 925 } 926 927 /** 928 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page 929 * 930 * @adev: amdgpu_device pointer 931 * 932 * Allocates a scratch page of VRAM for use by various things in the 933 * driver. 934 */ 935 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev) 936 { 937 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, 938 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM, 939 &adev->vram_scratch.robj, 940 &adev->vram_scratch.gpu_addr, 941 (void **)&adev->vram_scratch.ptr); 942 } 943 944 /** 945 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page 946 * 947 * @adev: amdgpu_device pointer 948 * 949 * Frees the VRAM scratch page. 950 */ 951 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev) 952 { 953 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL); 954 } 955 956 /** 957 * amdgpu_device_program_register_sequence - program an array of registers. 958 * 959 * @adev: amdgpu_device pointer 960 * @registers: pointer to the register array 961 * @array_size: size of the register array 962 * 963 * Programs an array or registers with and and or masks. 964 * This is a helper for setting golden registers. 965 */ 966 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 967 const u32 *registers, 968 const u32 array_size) 969 { 970 u32 tmp, reg, and_mask, or_mask; 971 int i; 972 973 if (array_size % 3) 974 return; 975 976 for (i = 0; i < array_size; i +=3) { 977 reg = registers[i + 0]; 978 and_mask = registers[i + 1]; 979 or_mask = registers[i + 2]; 980 981 if (and_mask == 0xffffffff) { 982 tmp = or_mask; 983 } else { 984 tmp = RREG32(reg); 985 tmp &= ~and_mask; 986 if (adev->family >= AMDGPU_FAMILY_AI) 987 tmp |= (or_mask & and_mask); 988 else 989 tmp |= or_mask; 990 } 991 WREG32(reg, tmp); 992 } 993 } 994 995 /** 996 * amdgpu_device_pci_config_reset - reset the GPU 997 * 998 * @adev: amdgpu_device pointer 999 * 1000 * Resets the GPU using the pci config reset sequence. 1001 * Only applicable to asics prior to vega10. 
1002 */ 1003 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 1004 { 1005 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 1006 } 1007 1008 /** 1009 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 1010 * 1011 * @adev: amdgpu_device pointer 1012 * 1013 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 1014 */ 1015 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1016 { 1017 return pci_reset_function(adev->pdev); 1018 } 1019 1020 /* 1021 * GPU doorbell aperture helpers function. 1022 */ 1023 /** 1024 * amdgpu_device_doorbell_init - Init doorbell driver information. 1025 * 1026 * @adev: amdgpu_device pointer 1027 * 1028 * Init doorbell driver information (CIK) 1029 * Returns 0 on success, error on failure. 1030 */ 1031 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) 1032 { 1033 1034 /* No doorbell on SI hardware generation */ 1035 if (adev->asic_type < CHIP_BONAIRE) { 1036 adev->doorbell.base = 0; 1037 adev->doorbell.size = 0; 1038 adev->doorbell.num_doorbells = 0; 1039 adev->doorbell.ptr = NULL; 1040 return 0; 1041 } 1042 1043 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) 1044 return -EINVAL; 1045 1046 amdgpu_asic_init_doorbell_index(adev); 1047 1048 /* doorbell bar mapping */ 1049 adev->doorbell.base = pci_resource_start(adev->pdev, 2); 1050 adev->doorbell.size = pci_resource_len(adev->pdev, 2); 1051 1052 if (adev->enable_mes) { 1053 adev->doorbell.num_doorbells = 1054 adev->doorbell.size / sizeof(u32); 1055 } else { 1056 adev->doorbell.num_doorbells = 1057 min_t(u32, adev->doorbell.size / sizeof(u32), 1058 adev->doorbell_index.max_assignment+1); 1059 if (adev->doorbell.num_doorbells == 0) 1060 return -EINVAL; 1061 1062 /* For Vega, reserve and map two pages on doorbell BAR since SDMA 1063 * paging queue doorbell use the second page. The 1064 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the 1065 * doorbells are in the first page. So with paging queue enabled, 1066 * the max num_doorbells should + 1 page (0x400 in dword) 1067 */ 1068 if (adev->asic_type >= CHIP_VEGA10) 1069 adev->doorbell.num_doorbells += 0x400; 1070 } 1071 1072 adev->doorbell.ptr = ioremap(adev->doorbell.base, 1073 adev->doorbell.num_doorbells * 1074 sizeof(u32)); 1075 if (adev->doorbell.ptr == NULL) 1076 return -ENOMEM; 1077 1078 return 0; 1079 } 1080 1081 /** 1082 * amdgpu_device_doorbell_fini - Tear down doorbell driver information. 1083 * 1084 * @adev: amdgpu_device pointer 1085 * 1086 * Tear down doorbell driver information (CIK) 1087 */ 1088 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) 1089 { 1090 iounmap(adev->doorbell.ptr); 1091 adev->doorbell.ptr = NULL; 1092 } 1093 1094 1095 1096 /* 1097 * amdgpu_device_wb_*() 1098 * Writeback is the method by which the GPU updates special pages in memory 1099 * with the status of certain GPU events (fences, ring pointers,etc.). 1100 */ 1101 1102 /** 1103 * amdgpu_device_wb_fini - Disable Writeback and free memory 1104 * 1105 * @adev: amdgpu_device pointer 1106 * 1107 * Disables Writeback and frees the Writeback memory (all asics). 1108 * Used at driver shutdown. 
1109 */ 1110 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1111 { 1112 if (adev->wb.wb_obj) { 1113 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1114 &adev->wb.gpu_addr, 1115 (void **)&adev->wb.wb); 1116 adev->wb.wb_obj = NULL; 1117 } 1118 } 1119 1120 /** 1121 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1122 * 1123 * @adev: amdgpu_device pointer 1124 * 1125 * Initializes writeback and allocates writeback memory (all asics). 1126 * Used at driver startup. 1127 * Returns 0 on success or an -error on failure. 1128 */ 1129 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1130 { 1131 int r; 1132 1133 if (adev->wb.wb_obj == NULL) { 1134 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1135 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1136 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1137 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1138 (void **)&adev->wb.wb); 1139 if (r) { 1140 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1141 return r; 1142 } 1143 1144 adev->wb.num_wb = AMDGPU_MAX_WB; 1145 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1146 1147 /* clear wb memory */ 1148 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1149 } 1150 1151 return 0; 1152 } 1153 1154 /** 1155 * amdgpu_device_wb_get - Allocate a wb entry 1156 * 1157 * @adev: amdgpu_device pointer 1158 * @wb: wb index 1159 * 1160 * Allocate a wb slot for use by the driver (all asics). 1161 * Returns 0 on success or -EINVAL on failure. 1162 */ 1163 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1164 { 1165 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1166 1167 if (offset < adev->wb.num_wb) { 1168 __set_bit(offset, adev->wb.used); 1169 *wb = offset << 3; /* convert to dw offset */ 1170 return 0; 1171 } else { 1172 return -EINVAL; 1173 } 1174 } 1175 1176 /** 1177 * amdgpu_device_wb_free - Free a wb entry 1178 * 1179 * @adev: amdgpu_device pointer 1180 * @wb: wb index 1181 * 1182 * Free a wb slot allocated for use by the driver (all asics) 1183 */ 1184 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1185 { 1186 wb >>= 3; 1187 if (wb < adev->wb.num_wb) 1188 __clear_bit(wb, adev->wb.used); 1189 } 1190 1191 /** 1192 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1193 * 1194 * @adev: amdgpu_device pointer 1195 * 1196 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1197 * to fail, but if any of the BARs is not accessible after the size we abort 1198 * driver loading by returning -ENODEV. 
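 *
 * Worked example (encoding assumed from the PCI resizable-BAR helpers):
 * pci_rebar_bytes_to_size() maps a BAR size to log2(bytes) - 20, so
 * 256 MiB -> 8 and 8 GiB -> 13; that value is then clamped below to the
 * largest size the device actually advertises via
 * pci_rebar_get_possible_sizes().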
1199 */ 1200 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1201 { 1202 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1203 struct pci_bus *root; 1204 struct resource *res; 1205 unsigned i; 1206 u16 cmd; 1207 int r; 1208 1209 /* Bypass for VF */ 1210 if (amdgpu_sriov_vf(adev)) 1211 return 0; 1212 1213 /* skip if the bios has already enabled large BAR */ 1214 if (adev->gmc.real_vram_size && 1215 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1216 return 0; 1217 1218 /* Check if the root BUS has 64bit memory resources */ 1219 root = adev->pdev->bus; 1220 while (root->parent) 1221 root = root->parent; 1222 1223 pci_bus_for_each_resource(root, res, i) { 1224 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1225 res->start > 0x100000000ull) 1226 break; 1227 } 1228 1229 /* Trying to resize is pointless without a root hub window above 4GB */ 1230 if (!res) 1231 return 0; 1232 1233 /* Limit the BAR size to what is available */ 1234 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1235 rbar_size); 1236 1237 /* Disable memory decoding while we change the BAR addresses and size */ 1238 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1239 pci_write_config_word(adev->pdev, PCI_COMMAND, 1240 cmd & ~PCI_COMMAND_MEMORY); 1241 1242 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1243 amdgpu_device_doorbell_fini(adev); 1244 if (adev->asic_type >= CHIP_BONAIRE) 1245 pci_release_resource(adev->pdev, 2); 1246 1247 pci_release_resource(adev->pdev, 0); 1248 1249 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1250 if (r == -ENOSPC) 1251 DRM_INFO("Not enough PCI address space for a large BAR."); 1252 else if (r && r != -ENOTSUPP) 1253 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1254 1255 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1256 1257 /* When the doorbell or fb BAR isn't available we have no chance of 1258 * using the device. 1259 */ 1260 r = amdgpu_device_doorbell_init(adev); 1261 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1262 return -ENODEV; 1263 1264 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1265 1266 return 0; 1267 } 1268 1269 /* 1270 * GPU helpers function. 1271 */ 1272 /** 1273 * amdgpu_device_need_post - check if the hw need post or not 1274 * 1275 * @adev: amdgpu_device pointer 1276 * 1277 * Check if the asic has been initialized (all asics) at driver startup 1278 * or post is needed if hw reset is performed. 1279 * Returns true if need or false if not. 
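 *
 * Typical call-site sketch during early init (the surrounding code is
 * assumed, not shown in this excerpt):
 *
 *   if (amdgpu_device_need_post(adev)) {
 *           r = amdgpu_device_asic_init(adev);   // runs atom asic_init
 *           if (r)
 *                   return r;
 *   }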
1280 */ 1281 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1282 { 1283 uint32_t reg; 1284 1285 if (amdgpu_sriov_vf(adev)) 1286 return false; 1287 1288 if (amdgpu_passthrough(adev)) { 1289 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1290 * some old smc fw still need driver do vPost otherwise gpu hang, while 1291 * those smc fw version above 22.15 doesn't have this flaw, so we force 1292 * vpost executed for smc version below 22.15 1293 */ 1294 if (adev->asic_type == CHIP_FIJI) { 1295 int err; 1296 uint32_t fw_ver; 1297 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1298 /* force vPost if error occured */ 1299 if (err) 1300 return true; 1301 1302 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1303 if (fw_ver < 0x00160e00) 1304 return true; 1305 } 1306 } 1307 1308 /* Don't post if we need to reset whole hive on init */ 1309 if (adev->gmc.xgmi.pending_reset) 1310 return false; 1311 1312 if (adev->has_hw_reset) { 1313 adev->has_hw_reset = false; 1314 return true; 1315 } 1316 1317 /* bios scratch used on CIK+ */ 1318 if (adev->asic_type >= CHIP_BONAIRE) 1319 return amdgpu_atombios_scratch_need_asic_init(adev); 1320 1321 /* check MEM_SIZE for older asics */ 1322 reg = amdgpu_asic_get_config_memsize(adev); 1323 1324 if ((reg != 0) && (reg != 0xffffffff)) 1325 return false; 1326 1327 return true; 1328 } 1329 1330 /** 1331 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1332 * 1333 * @adev: amdgpu_device pointer 1334 * 1335 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1336 * be set for this device. 1337 * 1338 * Returns true if it should be used or false if not. 1339 */ 1340 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1341 { 1342 switch (amdgpu_aspm) { 1343 case -1: 1344 break; 1345 case 0: 1346 return false; 1347 case 1: 1348 return true; 1349 default: 1350 return false; 1351 } 1352 return pcie_aspm_enabled(adev->pdev); 1353 } 1354 1355 /* if we get transitioned to only one device, take VGA back */ 1356 /** 1357 * amdgpu_device_vga_set_decode - enable/disable vga decode 1358 * 1359 * @pdev: PCI device pointer 1360 * @state: enable/disable vga decode 1361 * 1362 * Enable/disable vga decode (all asics). 1363 * Returns VGA resource flags. 1364 */ 1365 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1366 bool state) 1367 { 1368 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1369 amdgpu_asic_set_vga_state(adev, state); 1370 if (state) 1371 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1372 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1373 else 1374 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1375 } 1376 1377 /** 1378 * amdgpu_device_check_block_size - validate the vm block size 1379 * 1380 * @adev: amdgpu_device pointer 1381 * 1382 * Validates the vm block size specified via module parameter. 1383 * The vm block size defines number of bits in page table versus page directory, 1384 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1385 * page table and the remaining bits are in the page directory. 
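 *
 * Worked example: with 4KB pages there is a 12-bit in-page offset, so a
 * block size of 9 covers 12 + 9 = 21 bits, i.e. 2MB of virtual address
 * space per page-table block; the remaining address bits are resolved by
 * the page-directory levels.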
1386 */ 1387 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1388 { 1389 /* defines number of bits in page table versus page directory, 1390 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1391 * page table and the remaining bits are in the page directory */ 1392 if (amdgpu_vm_block_size == -1) 1393 return; 1394 1395 if (amdgpu_vm_block_size < 9) { 1396 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1397 amdgpu_vm_block_size); 1398 amdgpu_vm_block_size = -1; 1399 } 1400 } 1401 1402 /** 1403 * amdgpu_device_check_vm_size - validate the vm size 1404 * 1405 * @adev: amdgpu_device pointer 1406 * 1407 * Validates the vm size in GB specified via module parameter. 1408 * The VM size is the size of the GPU virtual memory space in GB. 1409 */ 1410 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1411 { 1412 /* no need to check the default value */ 1413 if (amdgpu_vm_size == -1) 1414 return; 1415 1416 if (amdgpu_vm_size < 1) { 1417 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1418 amdgpu_vm_size); 1419 amdgpu_vm_size = -1; 1420 } 1421 } 1422 1423 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1424 { 1425 struct sysinfo si; 1426 bool is_os_64 = (sizeof(void *) == 8); 1427 uint64_t total_memory; 1428 uint64_t dram_size_seven_GB = 0x1B8000000; 1429 uint64_t dram_size_three_GB = 0xB8000000; 1430 1431 if (amdgpu_smu_memory_pool_size == 0) 1432 return; 1433 1434 if (!is_os_64) { 1435 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1436 goto def_value; 1437 } 1438 si_meminfo(&si); 1439 total_memory = (uint64_t)si.totalram * si.mem_unit; 1440 1441 if ((amdgpu_smu_memory_pool_size == 1) || 1442 (amdgpu_smu_memory_pool_size == 2)) { 1443 if (total_memory < dram_size_three_GB) 1444 goto def_value1; 1445 } else if ((amdgpu_smu_memory_pool_size == 4) || 1446 (amdgpu_smu_memory_pool_size == 8)) { 1447 if (total_memory < dram_size_seven_GB) 1448 goto def_value1; 1449 } else { 1450 DRM_WARN("Smu memory pool size not supported\n"); 1451 goto def_value; 1452 } 1453 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1454 1455 return; 1456 1457 def_value1: 1458 DRM_WARN("No enough system memory\n"); 1459 def_value: 1460 adev->pm.smu_prv_buffer_size = 0; 1461 } 1462 1463 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1464 { 1465 if (!(adev->flags & AMD_IS_APU) || 1466 adev->asic_type < CHIP_RAVEN) 1467 return 0; 1468 1469 switch (adev->asic_type) { 1470 case CHIP_RAVEN: 1471 if (adev->pdev->device == 0x15dd) 1472 adev->apu_flags |= AMD_APU_IS_RAVEN; 1473 if (adev->pdev->device == 0x15d8) 1474 adev->apu_flags |= AMD_APU_IS_PICASSO; 1475 break; 1476 case CHIP_RENOIR: 1477 if ((adev->pdev->device == 0x1636) || 1478 (adev->pdev->device == 0x164c)) 1479 adev->apu_flags |= AMD_APU_IS_RENOIR; 1480 else 1481 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1482 break; 1483 case CHIP_VANGOGH: 1484 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1485 break; 1486 case CHIP_YELLOW_CARP: 1487 break; 1488 case CHIP_CYAN_SKILLFISH: 1489 if ((adev->pdev->device == 0x13FE) || 1490 (adev->pdev->device == 0x143F)) 1491 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1492 break; 1493 default: 1494 break; 1495 } 1496 1497 return 0; 1498 } 1499 1500 /** 1501 * amdgpu_device_check_arguments - validate module params 1502 * 1503 * @adev: amdgpu_device pointer 1504 * 1505 * Validates certain module parameters and updates 1506 * the associated values used by the driver (all asics). 
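 *
 * For example (behaviour taken from the checks below): amdgpu.sched_jobs=6
 * is not a power of two and is rounded up to 8, while amdgpu.sched_jobs=2
 * is below the minimum and is raised to 4.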
1507 */ 1508 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1509 { 1510 if (amdgpu_sched_jobs < 4) { 1511 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1512 amdgpu_sched_jobs); 1513 amdgpu_sched_jobs = 4; 1514 } else if (!is_power_of_2(amdgpu_sched_jobs)){ 1515 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1516 amdgpu_sched_jobs); 1517 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1518 } 1519 1520 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1521 /* gart size must be greater or equal to 32M */ 1522 dev_warn(adev->dev, "gart size (%d) too small\n", 1523 amdgpu_gart_size); 1524 amdgpu_gart_size = -1; 1525 } 1526 1527 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1528 /* gtt size must be greater or equal to 32M */ 1529 dev_warn(adev->dev, "gtt size (%d) too small\n", 1530 amdgpu_gtt_size); 1531 amdgpu_gtt_size = -1; 1532 } 1533 1534 /* valid range is between 4 and 9 inclusive */ 1535 if (amdgpu_vm_fragment_size != -1 && 1536 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1537 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1538 amdgpu_vm_fragment_size = -1; 1539 } 1540 1541 if (amdgpu_sched_hw_submission < 2) { 1542 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1543 amdgpu_sched_hw_submission); 1544 amdgpu_sched_hw_submission = 2; 1545 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1546 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1547 amdgpu_sched_hw_submission); 1548 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1549 } 1550 1551 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 1552 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 1553 amdgpu_reset_method = -1; 1554 } 1555 1556 amdgpu_device_check_smu_prv_buffer_size(adev); 1557 1558 amdgpu_device_check_vm_size(adev); 1559 1560 amdgpu_device_check_block_size(adev); 1561 1562 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1563 1564 return 0; 1565 } 1566 1567 /** 1568 * amdgpu_switcheroo_set_state - set switcheroo state 1569 * 1570 * @pdev: pci dev pointer 1571 * @state: vga_switcheroo state 1572 * 1573 * Callback for the switcheroo driver. Suspends or resumes 1574 * the asics before or after it is powered up using ACPI methods. 
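 *
 * These callbacks are not invoked directly by the driver; they are handed
 * to the vga_switcheroo framework via amdgpu_switcheroo_ops below, roughly
 * (registration site assumed, not part of this excerpt):
 *
 *   vga_switcheroo_register_client(adev->pdev, &amdgpu_switcheroo_ops, px);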
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(pdev, PCI_D0);
		amdgpu_device_load_pci_state(pdev);
		r = pci_enable_device(pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
	} else {
		pr_info("switched off\n");
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(pdev);
		/* Shut down the device */
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Checks whether the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
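 *
 * Hedged usage sketch (block type and state are illustrative):
 *
 *   r = amdgpu_device_ip_set_powergating_state(adev,
 *                                              AMD_IP_BLOCK_TYPE_UVD,
 *                                              AMD_PG_STATE_GATE);
 *   if (r)
 *           dev_warn(adev->dev, "UVD powergating failed (%d)\n", r);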
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u64 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
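 *
 * Minimal usage sketch (illustrative):
 *
 *   struct amdgpu_ip_block *ip =
 *           amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *   if (ip)
 *           DRM_INFO("GFX IP v%d.%d\n", ip->version->major,
 *                    ip->version->minor);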
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * Returns 0 if the IP block version is equal to or greater than @major.@minor,
 * or 1 if it is smaller or the ip_block doesn't exist.
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	switch (ip_block_version->type) {
	case AMD_IP_BLOCK_TYPE_VCN:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
			return 0;
		break;
	case AMD_IP_BLOCK_TYPE_JPEG:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
			return 0;
		break;
	default:
		break;
	}

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
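 *
 * Example of the expected string format (inferred from the parser below;
 * the PCI address is made up): entries are separated by ';' and each entry
 * is "<pci address>[,<num_crtc>]", where "all" matches any device and
 * num_crtc is clamped to 1..6, e.g.
 *
 *   amdgpu.virtual_display=0000:04:00.0,2
 *   amdgpu.virtual_display=all,1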
1875 */ 1876 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 1877 { 1878 adev->enable_virtual_display = false; 1879 1880 if (amdgpu_virtual_display) { 1881 const char *pci_address_name = pci_name(adev->pdev); 1882 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 1883 1884 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 1885 pciaddstr_tmp = pciaddstr; 1886 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 1887 pciaddname = strsep(&pciaddname_tmp, ","); 1888 if (!strcmp("all", pciaddname) 1889 || !strcmp(pci_address_name, pciaddname)) { 1890 long num_crtc; 1891 int res = -1; 1892 1893 adev->enable_virtual_display = true; 1894 1895 if (pciaddname_tmp) 1896 res = kstrtol(pciaddname_tmp, 10, 1897 &num_crtc); 1898 1899 if (!res) { 1900 if (num_crtc < 1) 1901 num_crtc = 1; 1902 if (num_crtc > 6) 1903 num_crtc = 6; 1904 adev->mode_info.num_crtc = num_crtc; 1905 } else { 1906 adev->mode_info.num_crtc = 1; 1907 } 1908 break; 1909 } 1910 } 1911 1912 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 1913 amdgpu_virtual_display, pci_address_name, 1914 adev->enable_virtual_display, adev->mode_info.num_crtc); 1915 1916 kfree(pciaddstr); 1917 } 1918 } 1919 1920 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 1921 { 1922 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 1923 adev->mode_info.num_crtc = 1; 1924 adev->enable_virtual_display = true; 1925 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 1926 adev->enable_virtual_display, adev->mode_info.num_crtc); 1927 } 1928 } 1929 1930 /** 1931 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 1932 * 1933 * @adev: amdgpu_device pointer 1934 * 1935 * Parses the asic configuration parameters specified in the gpu info 1936 * firmware and makes them availale to the driver for use in configuring 1937 * the asic. 1938 * Returns 0 on success, -EINVAL on failure. 1939 */ 1940 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 1941 { 1942 const char *chip_name; 1943 char fw_name[40]; 1944 int err; 1945 const struct gpu_info_firmware_header_v1_0 *hdr; 1946 1947 adev->firmware.gpu_info_fw = NULL; 1948 1949 if (adev->mman.discovery_bin) { 1950 /* 1951 * FIXME: The bounding box is still needed by Navi12, so 1952 * temporarily read it from gpu_info firmware. Should be dropped 1953 * when DAL no longer needs it. 
1954 */ 1955 if (adev->asic_type != CHIP_NAVI12) 1956 return 0; 1957 } 1958 1959 switch (adev->asic_type) { 1960 default: 1961 return 0; 1962 case CHIP_VEGA10: 1963 chip_name = "vega10"; 1964 break; 1965 case CHIP_VEGA12: 1966 chip_name = "vega12"; 1967 break; 1968 case CHIP_RAVEN: 1969 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1970 chip_name = "raven2"; 1971 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1972 chip_name = "picasso"; 1973 else 1974 chip_name = "raven"; 1975 break; 1976 case CHIP_ARCTURUS: 1977 chip_name = "arcturus"; 1978 break; 1979 case CHIP_NAVI12: 1980 chip_name = "navi12"; 1981 break; 1982 } 1983 1984 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 1985 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); 1986 if (err) { 1987 dev_err(adev->dev, 1988 "Failed to load gpu_info firmware \"%s\"\n", 1989 fw_name); 1990 goto out; 1991 } 1992 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); 1993 if (err) { 1994 dev_err(adev->dev, 1995 "Failed to validate gpu_info firmware \"%s\"\n", 1996 fw_name); 1997 goto out; 1998 } 1999 2000 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2001 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2002 2003 switch (hdr->version_major) { 2004 case 1: 2005 { 2006 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2007 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2008 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2009 2010 /* 2011 * Should be droped when DAL no longer needs it. 2012 */ 2013 if (adev->asic_type == CHIP_NAVI12) 2014 goto parse_soc_bounding_box; 2015 2016 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2017 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2018 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2019 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2020 adev->gfx.config.max_texture_channel_caches = 2021 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2022 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2023 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2024 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2025 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2026 adev->gfx.config.double_offchip_lds_buf = 2027 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2028 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2029 adev->gfx.cu_info.max_waves_per_simd = 2030 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2031 adev->gfx.cu_info.max_scratch_slots_per_cu = 2032 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2033 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2034 if (hdr->version_minor >= 1) { 2035 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2036 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2037 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2038 adev->gfx.config.num_sc_per_sh = 2039 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2040 adev->gfx.config.num_packer_per_sc = 2041 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2042 } 2043 2044 parse_soc_bounding_box: 2045 /* 2046 * soc bounding box info is not integrated in disocovery table, 2047 * we always need to parse it from gpu info firmware if needed. 
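 * (As an illustration: this is why Navi12 skips the early return on the IP
 * discovery path above and still reads amdgpu/navi12_gpu_info.bin, so that
 * the DAL soc bounding box can be filled in.)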
2048 */ 2049 if (hdr->version_minor == 2) { 2050 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2051 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2052 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2053 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2054 } 2055 break; 2056 } 2057 default: 2058 dev_err(adev->dev, 2059 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2060 err = -EINVAL; 2061 goto out; 2062 } 2063 out: 2064 return err; 2065 } 2066 2067 /** 2068 * amdgpu_device_ip_early_init - run early init for hardware IPs 2069 * 2070 * @adev: amdgpu_device pointer 2071 * 2072 * Early initialization pass for hardware IPs. The hardware IPs that make 2073 * up each asic are discovered each IP's early_init callback is run. This 2074 * is the first stage in initializing the asic. 2075 * Returns 0 on success, negative error code on failure. 2076 */ 2077 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2078 { 2079 struct drm_device *dev = adev_to_drm(adev); 2080 struct pci_dev *parent; 2081 int i, r; 2082 2083 amdgpu_device_enable_virtual_display(adev); 2084 2085 if (amdgpu_sriov_vf(adev)) { 2086 r = amdgpu_virt_request_full_gpu(adev, true); 2087 if (r) 2088 return r; 2089 } 2090 2091 switch (adev->asic_type) { 2092 #ifdef CONFIG_DRM_AMDGPU_SI 2093 case CHIP_VERDE: 2094 case CHIP_TAHITI: 2095 case CHIP_PITCAIRN: 2096 case CHIP_OLAND: 2097 case CHIP_HAINAN: 2098 adev->family = AMDGPU_FAMILY_SI; 2099 r = si_set_ip_blocks(adev); 2100 if (r) 2101 return r; 2102 break; 2103 #endif 2104 #ifdef CONFIG_DRM_AMDGPU_CIK 2105 case CHIP_BONAIRE: 2106 case CHIP_HAWAII: 2107 case CHIP_KAVERI: 2108 case CHIP_KABINI: 2109 case CHIP_MULLINS: 2110 if (adev->flags & AMD_IS_APU) 2111 adev->family = AMDGPU_FAMILY_KV; 2112 else 2113 adev->family = AMDGPU_FAMILY_CI; 2114 2115 r = cik_set_ip_blocks(adev); 2116 if (r) 2117 return r; 2118 break; 2119 #endif 2120 case CHIP_TOPAZ: 2121 case CHIP_TONGA: 2122 case CHIP_FIJI: 2123 case CHIP_POLARIS10: 2124 case CHIP_POLARIS11: 2125 case CHIP_POLARIS12: 2126 case CHIP_VEGAM: 2127 case CHIP_CARRIZO: 2128 case CHIP_STONEY: 2129 if (adev->flags & AMD_IS_APU) 2130 adev->family = AMDGPU_FAMILY_CZ; 2131 else 2132 adev->family = AMDGPU_FAMILY_VI; 2133 2134 r = vi_set_ip_blocks(adev); 2135 if (r) 2136 return r; 2137 break; 2138 default: 2139 r = amdgpu_discovery_set_ip_blocks(adev); 2140 if (r) 2141 return r; 2142 break; 2143 } 2144 2145 if (amdgpu_has_atpx() && 2146 (amdgpu_is_atpx_hybrid() || 2147 amdgpu_has_atpx_dgpu_power_cntl()) && 2148 ((adev->flags & AMD_IS_APU) == 0) && 2149 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev))) 2150 adev->flags |= AMD_IS_PX; 2151 2152 if (!(adev->flags & AMD_IS_APU)) { 2153 parent = pci_upstream_bridge(adev->pdev); 2154 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2155 } 2156 2157 amdgpu_amdkfd_device_probe(adev); 2158 2159 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2160 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2161 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2162 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2163 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2164 2165 for (i = 0; i < adev->num_ip_blocks; i++) { 2166 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2167 DRM_ERROR("disabled ip block: %d <%s>\n", 2168 i, adev->ip_blocks[i].version->funcs->name); 2169 adev->ip_blocks[i].status.valid = false; 2170 } else { 2171 if (adev->ip_blocks[i].version->funcs->early_init) { 2172 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2173 if (r == -ENOENT) { 2174 adev->ip_blocks[i].status.valid = false; 2175 } else if (r) { 2176 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2177 adev->ip_blocks[i].version->funcs->name, r); 2178 return r; 2179 } else { 2180 adev->ip_blocks[i].status.valid = true; 2181 } 2182 } else { 2183 adev->ip_blocks[i].status.valid = true; 2184 } 2185 } 2186 /* get the vbios after the asic_funcs are set up */ 2187 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2188 r = amdgpu_device_parse_gpu_info_fw(adev); 2189 if (r) 2190 return r; 2191 2192 /* Read BIOS */ 2193 if (!amdgpu_get_bios(adev)) 2194 return -EINVAL; 2195 2196 r = amdgpu_atombios_init(adev); 2197 if (r) { 2198 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2199 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2200 return r; 2201 } 2202 2203 /*get pf2vf msg info at it's earliest time*/ 2204 if (amdgpu_sriov_vf(adev)) 2205 amdgpu_virt_init_data_exchange(adev); 2206 2207 } 2208 } 2209 2210 adev->cg_flags &= amdgpu_cg_mask; 2211 adev->pg_flags &= amdgpu_pg_mask; 2212 2213 return 0; 2214 } 2215 2216 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2217 { 2218 int i, r; 2219 2220 for (i = 0; i < adev->num_ip_blocks; i++) { 2221 if (!adev->ip_blocks[i].status.sw) 2222 continue; 2223 if (adev->ip_blocks[i].status.hw) 2224 continue; 2225 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2226 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2227 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2228 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2229 if (r) { 2230 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2231 adev->ip_blocks[i].version->funcs->name, r); 2232 return r; 2233 } 2234 adev->ip_blocks[i].status.hw = true; 2235 } 2236 } 2237 2238 return 0; 2239 } 2240 2241 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2242 { 2243 int i, r; 2244 2245 for (i = 0; i < adev->num_ip_blocks; i++) { 2246 if (!adev->ip_blocks[i].status.sw) 2247 continue; 2248 if (adev->ip_blocks[i].status.hw) 2249 continue; 2250 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2251 if (r) { 2252 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2253 adev->ip_blocks[i].version->funcs->name, r); 2254 return r; 2255 } 2256 adev->ip_blocks[i].status.hw = true; 2257 } 2258 2259 return 0; 2260 } 2261 2262 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2263 { 2264 int r = 0; 2265 int i; 2266 uint32_t smu_version; 2267 2268 if (adev->asic_type >= CHIP_VEGA10) { 2269 for (i = 0; i < adev->num_ip_blocks; i++) { 2270 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2271 continue; 2272 2273 if 
(!adev->ip_blocks[i].status.sw) 2274 continue; 2275 2276 /* no need to do the fw loading again if already done*/ 2277 if (adev->ip_blocks[i].status.hw == true) 2278 break; 2279 2280 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2281 r = adev->ip_blocks[i].version->funcs->resume(adev); 2282 if (r) { 2283 DRM_ERROR("resume of IP block <%s> failed %d\n", 2284 adev->ip_blocks[i].version->funcs->name, r); 2285 return r; 2286 } 2287 } else { 2288 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2289 if (r) { 2290 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2291 adev->ip_blocks[i].version->funcs->name, r); 2292 return r; 2293 } 2294 } 2295 2296 adev->ip_blocks[i].status.hw = true; 2297 break; 2298 } 2299 } 2300 2301 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2302 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2303 2304 return r; 2305 } 2306 2307 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2308 { 2309 long timeout; 2310 int r, i; 2311 2312 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2313 struct amdgpu_ring *ring = adev->rings[i]; 2314 2315 /* No need to setup the GPU scheduler for rings that don't need it */ 2316 if (!ring || ring->no_scheduler) 2317 continue; 2318 2319 switch (ring->funcs->type) { 2320 case AMDGPU_RING_TYPE_GFX: 2321 timeout = adev->gfx_timeout; 2322 break; 2323 case AMDGPU_RING_TYPE_COMPUTE: 2324 timeout = adev->compute_timeout; 2325 break; 2326 case AMDGPU_RING_TYPE_SDMA: 2327 timeout = adev->sdma_timeout; 2328 break; 2329 default: 2330 timeout = adev->video_timeout; 2331 break; 2332 } 2333 2334 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2335 ring->num_hw_submission, amdgpu_job_hang_limit, 2336 timeout, adev->reset_domain->wq, 2337 ring->sched_score, ring->name, 2338 adev->dev); 2339 if (r) { 2340 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2341 ring->name); 2342 return r; 2343 } 2344 } 2345 2346 return 0; 2347 } 2348 2349 2350 /** 2351 * amdgpu_device_ip_init - run init for hardware IPs 2352 * 2353 * @adev: amdgpu_device pointer 2354 * 2355 * Main initialization pass for hardware IPs. The list of all the hardware 2356 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2357 * are run. sw_init initializes the software state associated with each IP 2358 * and hw_init initializes the hardware associated with each IP. 2359 * Returns 0 on success, negative error code on failure. 
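 *
 * A rough sketch of the ordering implemented below (informal summary, not a
 * normative contract): sw_init runs for every valid block, with COMMON and
 * GMC also getting their hw_init early so GPU memory can be allocated; then
 * the IB pool and ucode BO are created, hw_init phase 1 runs (COMMON/IH,
 * plus PSP under SR-IOV), firmware is loaded, hw_init phase 2 covers the
 * remaining blocks, and finally RAS recovery and the ring schedulers are
 * initialized.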
2360 */ 2361 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2362 { 2363 int i, r; 2364 2365 r = amdgpu_ras_init(adev); 2366 if (r) 2367 return r; 2368 2369 for (i = 0; i < adev->num_ip_blocks; i++) { 2370 if (!adev->ip_blocks[i].status.valid) 2371 continue; 2372 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2373 if (r) { 2374 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2375 adev->ip_blocks[i].version->funcs->name, r); 2376 goto init_failed; 2377 } 2378 adev->ip_blocks[i].status.sw = true; 2379 2380 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2381 /* need to do common hw init early so everything is set up for gmc */ 2382 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2383 if (r) { 2384 DRM_ERROR("hw_init %d failed %d\n", i, r); 2385 goto init_failed; 2386 } 2387 adev->ip_blocks[i].status.hw = true; 2388 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2389 /* need to do gmc hw init early so we can allocate gpu mem */ 2390 /* Try to reserve bad pages early */ 2391 if (amdgpu_sriov_vf(adev)) 2392 amdgpu_virt_exchange_data(adev); 2393 2394 r = amdgpu_device_vram_scratch_init(adev); 2395 if (r) { 2396 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2397 goto init_failed; 2398 } 2399 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2400 if (r) { 2401 DRM_ERROR("hw_init %d failed %d\n", i, r); 2402 goto init_failed; 2403 } 2404 r = amdgpu_device_wb_init(adev); 2405 if (r) { 2406 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2407 goto init_failed; 2408 } 2409 adev->ip_blocks[i].status.hw = true; 2410 2411 /* right after GMC hw init, we create CSA */ 2412 if (amdgpu_mcbp) { 2413 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2414 AMDGPU_GEM_DOMAIN_VRAM, 2415 AMDGPU_CSA_SIZE); 2416 if (r) { 2417 DRM_ERROR("allocate CSA failed %d\n", r); 2418 goto init_failed; 2419 } 2420 } 2421 } 2422 } 2423 2424 if (amdgpu_sriov_vf(adev)) 2425 amdgpu_virt_init_data_exchange(adev); 2426 2427 r = amdgpu_ib_pool_init(adev); 2428 if (r) { 2429 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2430 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2431 goto init_failed; 2432 } 2433 2434 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2435 if (r) 2436 goto init_failed; 2437 2438 r = amdgpu_device_ip_hw_init_phase1(adev); 2439 if (r) 2440 goto init_failed; 2441 2442 r = amdgpu_device_fw_loading(adev); 2443 if (r) 2444 goto init_failed; 2445 2446 r = amdgpu_device_ip_hw_init_phase2(adev); 2447 if (r) 2448 goto init_failed; 2449 2450 /* 2451 * retired pages will be loaded from eeprom and reserved here, 2452 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2453 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2454 * for I2C communication which only true at this point. 2455 * 2456 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2457 * failure from bad gpu situation and stop amdgpu init process 2458 * accordingly. For other failed cases, it will still release all 2459 * the resource and print error message, rather than returning one 2460 * negative value to upper level. 
2461 * 2462 * Note: theoretically, this should be called before all vram allocations 2463 * to protect retired pages from being reused 2464 */ 2465 r = amdgpu_ras_recovery_init(adev); 2466 if (r) 2467 goto init_failed; 2468 2469 /** 2470 * In case of XGMI, grab an extra reference to the reset domain for this device 2471 */ 2472 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2473 if (amdgpu_xgmi_add_device(adev) == 0) { 2474 if (!amdgpu_sriov_vf(adev)) { 2475 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2476 2477 if (!hive->reset_domain || 2478 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2479 r = -ENOENT; 2480 amdgpu_put_xgmi_hive(hive); 2481 goto init_failed; 2482 } 2483 2484 /* Drop the early temporary reset domain we created for this device */ 2485 amdgpu_reset_put_reset_domain(adev->reset_domain); 2486 adev->reset_domain = hive->reset_domain; 2487 amdgpu_put_xgmi_hive(hive); 2488 } 2489 } 2490 } 2491 2492 r = amdgpu_device_init_schedulers(adev); 2493 if (r) 2494 goto init_failed; 2495 2496 /* Don't init kfd if the whole hive needs to be reset during init */ 2497 if (!adev->gmc.xgmi.pending_reset) 2498 amdgpu_amdkfd_device_init(adev); 2499 2500 amdgpu_fru_get_product_info(adev); 2501 2502 init_failed: 2503 if (amdgpu_sriov_vf(adev)) 2504 amdgpu_virt_release_full_gpu(adev, true); 2505 2506 return r; 2507 } 2508 2509 /** 2510 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2511 * 2512 * @adev: amdgpu_device pointer 2513 * 2514 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2515 * this function before a GPU reset. If the value is retained after a 2516 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 2517 */ 2518 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2519 { 2520 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2521 } 2522 2523 /** 2524 * amdgpu_device_check_vram_lost - check if vram is valid 2525 * 2526 * @adev: amdgpu_device pointer 2527 * 2528 * Checks the reset magic value written to the gart pointer in VRAM. 2529 * The driver calls this after a GPU reset to see if the contents of 2530 * VRAM are lost or not. 2531 * Returns true if vram is lost, false if not. 2532 */ 2533 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2534 { 2535 if (memcmp(adev->gart.ptr, adev->reset_magic, 2536 AMDGPU_RESET_MAGIC_NUM)) 2537 return true; 2538 2539 if (!amdgpu_in_reset(adev)) 2540 return false; 2541 2542 /* 2543 * For all ASICs with baco/mode1 reset, the VRAM is 2544 * always assumed to be lost. 2545 */ 2546 switch (amdgpu_asic_reset_method(adev)) { 2547 case AMD_RESET_METHOD_BACO: 2548 case AMD_RESET_METHOD_MODE1: 2549 return true; 2550 default: 2551 return false; 2552 } 2553 } 2554 2555 /** 2556 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2557 * 2558 * @adev: amdgpu_device pointer 2559 * @state: clockgating state (gate or ungate) 2560 * 2561 * The list of all the hardware IPs that make up the asic is walked and the 2562 * set_clockgating_state callbacks are run. 2563 * The late initialization pass enables clockgating for hardware IPs; 2564 * the fini or suspend pass disables clockgating for hardware IPs. 2565 * Returns 0 on success, negative error code on failure.
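 *
 * Illustrative uses, mirroring the call sites elsewhere in this file:
 *
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);    // late init
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);  // fini/suspend
 *
 * Gating walks the IP list front to back, ungating walks it in reverse, and
 * UVD/VCE/VCN/JPEG are skipped since their clockgating is handled separately.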
2566 */ 2567 2568 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2569 enum amd_clockgating_state state) 2570 { 2571 int i, j, r; 2572 2573 if (amdgpu_emu_mode == 1) 2574 return 0; 2575 2576 for (j = 0; j < adev->num_ip_blocks; j++) { 2577 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2578 if (!adev->ip_blocks[i].status.late_initialized) 2579 continue; 2580 /* skip CG for GFX on S0ix */ 2581 if (adev->in_s0ix && 2582 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2583 continue; 2584 /* skip CG for VCE/UVD, it's handled specially */ 2585 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2586 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2587 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2588 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2589 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2590 /* enable clockgating to save power */ 2591 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2592 state); 2593 if (r) { 2594 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2595 adev->ip_blocks[i].version->funcs->name, r); 2596 return r; 2597 } 2598 } 2599 } 2600 2601 return 0; 2602 } 2603 2604 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2605 enum amd_powergating_state state) 2606 { 2607 int i, j, r; 2608 2609 if (amdgpu_emu_mode == 1) 2610 return 0; 2611 2612 for (j = 0; j < adev->num_ip_blocks; j++) { 2613 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2614 if (!adev->ip_blocks[i].status.late_initialized) 2615 continue; 2616 /* skip PG for GFX on S0ix */ 2617 if (adev->in_s0ix && 2618 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2619 continue; 2620 /* skip CG for VCE/UVD, it's handled specially */ 2621 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2622 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2623 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2624 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2625 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2626 /* enable powergating to save power */ 2627 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2628 state); 2629 if (r) { 2630 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2631 adev->ip_blocks[i].version->funcs->name, r); 2632 return r; 2633 } 2634 } 2635 } 2636 return 0; 2637 } 2638 2639 static int amdgpu_device_enable_mgpu_fan_boost(void) 2640 { 2641 struct amdgpu_gpu_instance *gpu_ins; 2642 struct amdgpu_device *adev; 2643 int i, ret = 0; 2644 2645 mutex_lock(&mgpu_info.mutex); 2646 2647 /* 2648 * MGPU fan boost feature should be enabled 2649 * only when there are two or more dGPUs in 2650 * the system 2651 */ 2652 if (mgpu_info.num_dgpu < 2) 2653 goto out; 2654 2655 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2656 gpu_ins = &(mgpu_info.gpu_ins[i]); 2657 adev = gpu_ins->adev; 2658 if (!(adev->flags & AMD_IS_APU) && 2659 !gpu_ins->mgpu_fan_enabled) { 2660 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2661 if (ret) 2662 break; 2663 2664 gpu_ins->mgpu_fan_enabled = 1; 2665 } 2666 } 2667 2668 out: 2669 mutex_unlock(&mgpu_info.mutex); 2670 2671 return ret; 2672 } 2673 2674 /** 2675 * amdgpu_device_ip_late_init - run late init for hardware IPs 2676 * 2677 * @adev: amdgpu_device pointer 2678 * 2679 * Late initialization pass for hardware IPs. 
The list of all the hardware 2680 * IPs that make up the asic is walked and the late_init callbacks are run. 2681 * late_init covers any special initialization that an IP requires 2682 * after all of the IPs have been initialized or something that needs to happen 2683 * late in the init process. 2684 * Returns 0 on success, negative error code on failure. 2685 */ 2686 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2687 { 2688 struct amdgpu_gpu_instance *gpu_instance; 2689 int i = 0, r; 2690 2691 for (i = 0; i < adev->num_ip_blocks; i++) { 2692 if (!adev->ip_blocks[i].status.hw) 2693 continue; 2694 if (adev->ip_blocks[i].version->funcs->late_init) { 2695 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2696 if (r) { 2697 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2698 adev->ip_blocks[i].version->funcs->name, r); 2699 return r; 2700 } 2701 } 2702 adev->ip_blocks[i].status.late_initialized = true; 2703 } 2704 2705 r = amdgpu_ras_late_init(adev); 2706 if (r) { 2707 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 2708 return r; 2709 } 2710 2711 amdgpu_ras_set_error_query_ready(adev, true); 2712 2713 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2714 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2715 2716 amdgpu_device_fill_reset_magic(adev); 2717 2718 r = amdgpu_device_enable_mgpu_fan_boost(); 2719 if (r) 2720 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2721 2722 /* For passthrough configurations on arcturus and aldebaran, enable special SBR handling */ 2723 if (amdgpu_passthrough(adev) && ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1)|| 2724 adev->asic_type == CHIP_ALDEBARAN )) 2725 amdgpu_dpm_handle_passthrough_sbr(adev, true); 2726 2727 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2728 mutex_lock(&mgpu_info.mutex); 2729 2730 /* 2731 * Reset the device p-state to low, as this was booted with high. 2732 * 2733 * This should be performed only after all devices from the same 2734 * hive get initialized. 2735 * 2736 * However, the number of devices in the hive is not known in advance; 2737 * it is counted one by one as the devices initialize. 2738 * 2739 * So, we wait for all XGMI interlinked devices to be initialized. 2740 * This may bring some delays as those devices may come from 2741 * different hives. But that should be OK.
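 * (Concretely, the check below only proceeds once mgpu_info.num_dgpu matches
 * gmc.xgmi.num_physical_nodes, and then requests AMDGPU_XGMI_PSTATE_MIN for
 * every non-APU device tracked in mgpu_info.)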
2742 */ 2743 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2744 for (i = 0; i < mgpu_info.num_gpu; i++) { 2745 gpu_instance = &(mgpu_info.gpu_ins[i]); 2746 if (gpu_instance->adev->flags & AMD_IS_APU) 2747 continue; 2748 2749 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2750 AMDGPU_XGMI_PSTATE_MIN); 2751 if (r) { 2752 DRM_ERROR("pstate setting failed (%d).\n", r); 2753 break; 2754 } 2755 } 2756 } 2757 2758 mutex_unlock(&mgpu_info.mutex); 2759 } 2760 2761 return 0; 2762 } 2763 2764 /** 2765 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2766 * 2767 * @adev: amdgpu_device pointer 2768 * 2769 * For ASICs need to disable SMC first 2770 */ 2771 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2772 { 2773 int i, r; 2774 2775 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) 2776 return; 2777 2778 for (i = 0; i < adev->num_ip_blocks; i++) { 2779 if (!adev->ip_blocks[i].status.hw) 2780 continue; 2781 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2782 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2783 /* XXX handle errors */ 2784 if (r) { 2785 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2786 adev->ip_blocks[i].version->funcs->name, r); 2787 } 2788 adev->ip_blocks[i].status.hw = false; 2789 break; 2790 } 2791 } 2792 } 2793 2794 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2795 { 2796 int i, r; 2797 2798 for (i = 0; i < adev->num_ip_blocks; i++) { 2799 if (!adev->ip_blocks[i].version->funcs->early_fini) 2800 continue; 2801 2802 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2803 if (r) { 2804 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2805 adev->ip_blocks[i].version->funcs->name, r); 2806 } 2807 } 2808 2809 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2810 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2811 2812 amdgpu_amdkfd_suspend(adev, false); 2813 2814 /* Workaroud for ASICs need to disable SMC first */ 2815 amdgpu_device_smu_fini_early(adev); 2816 2817 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2818 if (!adev->ip_blocks[i].status.hw) 2819 continue; 2820 2821 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2822 /* XXX handle errors */ 2823 if (r) { 2824 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2825 adev->ip_blocks[i].version->funcs->name, r); 2826 } 2827 2828 adev->ip_blocks[i].status.hw = false; 2829 } 2830 2831 if (amdgpu_sriov_vf(adev)) { 2832 if (amdgpu_virt_release_full_gpu(adev, false)) 2833 DRM_ERROR("failed to release exclusive mode on fini\n"); 2834 } 2835 2836 return 0; 2837 } 2838 2839 /** 2840 * amdgpu_device_ip_fini - run fini for hardware IPs 2841 * 2842 * @adev: amdgpu_device pointer 2843 * 2844 * Main teardown pass for hardware IPs. The list of all the hardware 2845 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2846 * are run. hw_fini tears down the hardware associated with each IP 2847 * and sw_fini tears down any software state associated with each IP. 2848 * Returns 0 on success, negative error code on failure. 
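 *
 * (Teardown walks the ip_blocks array in reverse of init order: one reverse
 * pass for sw_fini, with the GMC block also releasing the ucode BO, CSA,
 * writeback and IB pool, then a second reverse pass for late_fini, and
 * finally amdgpu_ras_fini().)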
2849 */ 2850 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2851 { 2852 int i, r; 2853 2854 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2855 amdgpu_virt_release_ras_err_handler_data(adev); 2856 2857 if (adev->gmc.xgmi.num_physical_nodes > 1) 2858 amdgpu_xgmi_remove_device(adev); 2859 2860 amdgpu_amdkfd_device_fini_sw(adev); 2861 2862 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2863 if (!adev->ip_blocks[i].status.sw) 2864 continue; 2865 2866 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2867 amdgpu_ucode_free_bo(adev); 2868 amdgpu_free_static_csa(&adev->virt.csa_obj); 2869 amdgpu_device_wb_fini(adev); 2870 amdgpu_device_vram_scratch_fini(adev); 2871 amdgpu_ib_pool_fini(adev); 2872 } 2873 2874 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2875 /* XXX handle errors */ 2876 if (r) { 2877 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2878 adev->ip_blocks[i].version->funcs->name, r); 2879 } 2880 adev->ip_blocks[i].status.sw = false; 2881 adev->ip_blocks[i].status.valid = false; 2882 } 2883 2884 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2885 if (!adev->ip_blocks[i].status.late_initialized) 2886 continue; 2887 if (adev->ip_blocks[i].version->funcs->late_fini) 2888 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2889 adev->ip_blocks[i].status.late_initialized = false; 2890 } 2891 2892 amdgpu_ras_fini(adev); 2893 2894 return 0; 2895 } 2896 2897 /** 2898 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2899 * 2900 * @work: work_struct. 2901 */ 2902 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2903 { 2904 struct amdgpu_device *adev = 2905 container_of(work, struct amdgpu_device, delayed_init_work.work); 2906 int r; 2907 2908 r = amdgpu_ib_ring_tests(adev); 2909 if (r) 2910 DRM_ERROR("ib ring test failed (%d).\n", r); 2911 } 2912 2913 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2914 { 2915 struct amdgpu_device *adev = 2916 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2917 2918 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2919 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2920 2921 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2922 adev->gfx.gfx_off_state = true; 2923 } 2924 2925 /** 2926 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2927 * 2928 * @adev: amdgpu_device pointer 2929 * 2930 * Main suspend function for hardware IPs. The list of all the hardware 2931 * IPs that make up the asic is walked, clockgating is disabled and the 2932 * suspend callbacks are run. suspend puts the hardware and software state 2933 * in each IP into a state suitable for suspend. 2934 * Returns 0 on success, negative error code on failure. 2935 */ 2936 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2937 { 2938 int i, r; 2939 2940 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2941 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2942 2943 /* 2944 * Per PMFW team's suggestion, driver needs to handle gfxoff 2945 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 2946 * scenario. Add the missing df cstate disablement here. 
2947 */ 2948 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 2949 dev_warn(adev->dev, "Failed to disallow df cstate"); 2950 2951 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2952 if (!adev->ip_blocks[i].status.valid) 2953 continue; 2954 2955 /* displays are handled separately */ 2956 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2957 continue; 2958 2959 /* XXX handle errors */ 2960 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2961 /* XXX handle errors */ 2962 if (r) { 2963 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2964 adev->ip_blocks[i].version->funcs->name, r); 2965 return r; 2966 } 2967 2968 adev->ip_blocks[i].status.hw = false; 2969 } 2970 2971 return 0; 2972 } 2973 2974 /** 2975 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2976 * 2977 * @adev: amdgpu_device pointer 2978 * 2979 * Main suspend function for hardware IPs. The list of all the hardware 2980 * IPs that make up the asic is walked, clockgating is disabled and the 2981 * suspend callbacks are run. suspend puts the hardware and software state 2982 * in each IP into a state suitable for suspend. 2983 * Returns 0 on success, negative error code on failure. 2984 */ 2985 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2986 { 2987 int i, r; 2988 2989 if (adev->in_s0ix) 2990 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 2991 2992 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2993 if (!adev->ip_blocks[i].status.valid) 2994 continue; 2995 /* displays are handled in phase1 */ 2996 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2997 continue; 2998 /* PSP lost connection when err_event_athub occurs */ 2999 if (amdgpu_ras_intr_triggered() && 3000 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3001 adev->ip_blocks[i].status.hw = false; 3002 continue; 3003 } 3004 3005 /* skip unnecessary suspend if we do not initialize them yet */ 3006 if (adev->gmc.xgmi.pending_reset && 3007 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3008 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3009 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3010 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3011 adev->ip_blocks[i].status.hw = false; 3012 continue; 3013 } 3014 3015 /* skip suspend of gfx and psp for S0ix 3016 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3017 * like at runtime. PSP is also part of the always on hardware 3018 * so no need to suspend it. 
3019 */ 3020 if (adev->in_s0ix && 3021 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3022 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)) 3023 continue; 3024 3025 /* XXX handle errors */ 3026 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3027 /* XXX handle errors */ 3028 if (r) { 3029 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3030 adev->ip_blocks[i].version->funcs->name, r); 3031 } 3032 adev->ip_blocks[i].status.hw = false; 3033 /* handle putting the SMC in the appropriate state */ 3034 if(!amdgpu_sriov_vf(adev)){ 3035 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3036 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3037 if (r) { 3038 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3039 adev->mp1_state, r); 3040 return r; 3041 } 3042 } 3043 } 3044 } 3045 3046 return 0; 3047 } 3048 3049 /** 3050 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3051 * 3052 * @adev: amdgpu_device pointer 3053 * 3054 * Main suspend function for hardware IPs. The list of all the hardware 3055 * IPs that make up the asic is walked, clockgating is disabled and the 3056 * suspend callbacks are run. suspend puts the hardware and software state 3057 * in each IP into a state suitable for suspend. 3058 * Returns 0 on success, negative error code on failure. 3059 */ 3060 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3061 { 3062 int r; 3063 3064 if (amdgpu_sriov_vf(adev)) { 3065 amdgpu_virt_fini_data_exchange(adev); 3066 amdgpu_virt_request_full_gpu(adev, false); 3067 } 3068 3069 r = amdgpu_device_ip_suspend_phase1(adev); 3070 if (r) 3071 return r; 3072 r = amdgpu_device_ip_suspend_phase2(adev); 3073 3074 if (amdgpu_sriov_vf(adev)) 3075 amdgpu_virt_release_full_gpu(adev, false); 3076 3077 return r; 3078 } 3079 3080 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3081 { 3082 int i, r; 3083 3084 static enum amd_ip_block_type ip_order[] = { 3085 AMD_IP_BLOCK_TYPE_COMMON, 3086 AMD_IP_BLOCK_TYPE_GMC, 3087 AMD_IP_BLOCK_TYPE_PSP, 3088 AMD_IP_BLOCK_TYPE_IH, 3089 }; 3090 3091 for (i = 0; i < adev->num_ip_blocks; i++) { 3092 int j; 3093 struct amdgpu_ip_block *block; 3094 3095 block = &adev->ip_blocks[i]; 3096 block->status.hw = false; 3097 3098 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3099 3100 if (block->version->type != ip_order[j] || 3101 !block->status.valid) 3102 continue; 3103 3104 r = block->version->funcs->hw_init(adev); 3105 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3106 if (r) 3107 return r; 3108 block->status.hw = true; 3109 } 3110 } 3111 3112 return 0; 3113 } 3114 3115 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3116 { 3117 int i, r; 3118 3119 static enum amd_ip_block_type ip_order[] = { 3120 AMD_IP_BLOCK_TYPE_SMC, 3121 AMD_IP_BLOCK_TYPE_DCE, 3122 AMD_IP_BLOCK_TYPE_GFX, 3123 AMD_IP_BLOCK_TYPE_SDMA, 3124 AMD_IP_BLOCK_TYPE_UVD, 3125 AMD_IP_BLOCK_TYPE_VCE, 3126 AMD_IP_BLOCK_TYPE_VCN 3127 }; 3128 3129 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3130 int j; 3131 struct amdgpu_ip_block *block; 3132 3133 for (j = 0; j < adev->num_ip_blocks; j++) { 3134 block = &adev->ip_blocks[j]; 3135 3136 if (block->version->type != ip_order[i] || 3137 !block->status.valid || 3138 block->status.hw) 3139 continue; 3140 3141 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3142 r = block->version->funcs->resume(adev); 3143 else 3144 r = block->version->funcs->hw_init(adev); 3145 3146 DRM_INFO("RE-INIT-late: %s %s\n", 
block->version->funcs->name, r?"failed":"succeeded"); 3147 if (r) 3148 return r; 3149 block->status.hw = true; 3150 } 3151 } 3152 3153 return 0; 3154 } 3155 3156 /** 3157 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3158 * 3159 * @adev: amdgpu_device pointer 3160 * 3161 * First resume function for hardware IPs. The list of all the hardware 3162 * IPs that make up the asic is walked and the resume callbacks are run for 3163 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3164 * after a suspend and updates the software state as necessary. This 3165 * function is also used for restoring the GPU after a GPU reset. 3166 * Returns 0 on success, negative error code on failure. 3167 */ 3168 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3169 { 3170 int i, r; 3171 3172 for (i = 0; i < adev->num_ip_blocks; i++) { 3173 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3174 continue; 3175 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3176 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3177 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3178 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3179 3180 r = adev->ip_blocks[i].version->funcs->resume(adev); 3181 if (r) { 3182 DRM_ERROR("resume of IP block <%s> failed %d\n", 3183 adev->ip_blocks[i].version->funcs->name, r); 3184 return r; 3185 } 3186 adev->ip_blocks[i].status.hw = true; 3187 } 3188 } 3189 3190 return 0; 3191 } 3192 3193 /** 3194 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3195 * 3196 * @adev: amdgpu_device pointer 3197 * 3198 * First resume function for hardware IPs. The list of all the hardware 3199 * IPs that make up the asic is walked and the resume callbacks are run for 3200 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3201 * functional state after a suspend and updates the software state as 3202 * necessary. This function is also used for restoring the GPU after a GPU 3203 * reset. 3204 * Returns 0 on success, negative error code on failure. 3205 */ 3206 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3207 { 3208 int i, r; 3209 3210 for (i = 0; i < adev->num_ip_blocks; i++) { 3211 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3212 continue; 3213 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3214 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3215 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3216 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3217 continue; 3218 r = adev->ip_blocks[i].version->funcs->resume(adev); 3219 if (r) { 3220 DRM_ERROR("resume of IP block <%s> failed %d\n", 3221 adev->ip_blocks[i].version->funcs->name, r); 3222 return r; 3223 } 3224 adev->ip_blocks[i].status.hw = true; 3225 3226 if (adev->in_s0ix && adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3227 /* disable gfxoff for IP resume. The gfxoff will be re-enabled in 3228 * amdgpu_device_resume() after IP resume. 3229 */ 3230 amdgpu_gfx_off_ctrl(adev, false); 3231 DRM_DEBUG("will disable gfxoff for re-initializing other blocks\n"); 3232 } 3233 3234 } 3235 3236 return 0; 3237 } 3238 3239 /** 3240 * amdgpu_device_ip_resume - run resume for hardware IPs 3241 * 3242 * @adev: amdgpu_device pointer 3243 * 3244 * Main resume function for hardware IPs. 
The hardware IPs 3245 * are split into two resume functions because they are 3246 * also used in recovering from a GPU reset, and some additional 3247 * steps need to be taken between them. In this case (S3/S4) they are 3248 * run sequentially. 3249 * Returns 0 on success, negative error code on failure. 3250 */ 3251 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3252 { 3253 int r; 3254 3255 r = amdgpu_amdkfd_resume_iommu(adev); 3256 if (r) 3257 return r; 3258 3259 r = amdgpu_device_ip_resume_phase1(adev); 3260 if (r) 3261 return r; 3262 3263 r = amdgpu_device_fw_loading(adev); 3264 if (r) 3265 return r; 3266 3267 r = amdgpu_device_ip_resume_phase2(adev); 3268 3269 return r; 3270 } 3271 3272 /** 3273 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3274 * 3275 * @adev: amdgpu_device pointer 3276 * 3277 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3278 */ 3279 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3280 { 3281 if (amdgpu_sriov_vf(adev)) { 3282 if (adev->is_atom_fw) { 3283 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3284 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3285 } else { 3286 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3287 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3288 } 3289 3290 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3291 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3292 } 3293 } 3294 3295 /** 3296 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3297 * 3298 * @asic_type: AMD asic type 3299 * 3300 * Check if there is DC (new modesetting infrastructure) support for an asic. 3301 * Returns true if DC has support, false if not. 3302 */ 3303 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3304 { 3305 switch (asic_type) { 3306 #ifdef CONFIG_DRM_AMDGPU_SI 3307 case CHIP_HAINAN: 3308 #endif 3309 case CHIP_TOPAZ: 3310 /* chips with no display hardware */ 3311 return false; 3312 #if defined(CONFIG_DRM_AMD_DC) 3313 case CHIP_TAHITI: 3314 case CHIP_PITCAIRN: 3315 case CHIP_VERDE: 3316 case CHIP_OLAND: 3317 /* 3318 * We have systems in the wild with these ASICs that require 3319 * LVDS and VGA support which is not supported with DC. 3320 * 3321 * Fall back to the non-DC driver here by default so as not to 3322 * cause regressions. 3323 */ 3324 #if defined(CONFIG_DRM_AMD_DC_SI) 3325 return amdgpu_dc > 0; 3326 #else 3327 return false; 3328 #endif 3329 case CHIP_BONAIRE: 3330 case CHIP_KAVERI: 3331 case CHIP_KABINI: 3332 case CHIP_MULLINS: 3333 /* 3334 * We have systems in the wild with these ASICs that require 3335 * VGA support which is not supported with DC. 3336 * 3337 * Fall back to the non-DC driver here by default so as not to 3338 * cause regressions.
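 * (Users who want DC on these parts can still opt in explicitly, e.g. with
 * the amdgpu.dc=1 module parameter, which is what makes amdgpu_dc > 0 in the
 * check below.)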
3339 */ 3340 return amdgpu_dc > 0; 3341 default: 3342 return amdgpu_dc != 0; 3343 #else 3344 default: 3345 if (amdgpu_dc > 0) 3346 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3347 "but isn't supported by ASIC, ignoring\n"); 3348 return false; 3349 #endif 3350 } 3351 } 3352 3353 /** 3354 * amdgpu_device_has_dc_support - check if dc is supported 3355 * 3356 * @adev: amdgpu_device pointer 3357 * 3358 * Returns true for supported, false for not supported 3359 */ 3360 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3361 { 3362 if (adev->enable_virtual_display || 3363 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3364 return false; 3365 3366 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3367 } 3368 3369 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3370 { 3371 struct amdgpu_device *adev = 3372 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3373 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3374 3375 /* It's a bug to not have a hive within this function */ 3376 if (WARN_ON(!hive)) 3377 return; 3378 3379 /* 3380 * Use task barrier to synchronize all xgmi reset works across the 3381 * hive. task_barrier_enter and task_barrier_exit will block 3382 * until all the threads running the xgmi reset works reach 3383 * those points. task_barrier_full will do both blocks. 3384 */ 3385 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3386 3387 task_barrier_enter(&hive->tb); 3388 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3389 3390 if (adev->asic_reset_res) 3391 goto fail; 3392 3393 task_barrier_exit(&hive->tb); 3394 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3395 3396 if (adev->asic_reset_res) 3397 goto fail; 3398 3399 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3400 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3401 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3402 } else { 3403 3404 task_barrier_full(&hive->tb); 3405 adev->asic_reset_res = amdgpu_asic_reset(adev); 3406 } 3407 3408 fail: 3409 if (adev->asic_reset_res) 3410 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3411 adev->asic_reset_res, adev_to_drm(adev)->unique); 3412 amdgpu_put_xgmi_hive(hive); 3413 } 3414 3415 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3416 { 3417 char *input = amdgpu_lockup_timeout; 3418 char *timeout_setting = NULL; 3419 int index = 0; 3420 long timeout; 3421 int ret = 0; 3422 3423 /* 3424 * By default timeout for non compute jobs is 10000 3425 * and 60000 for compute jobs. 3426 * In SR-IOV or passthrough mode, timeout for compute 3427 * jobs are 60000 by default. 3428 */ 3429 adev->gfx_timeout = msecs_to_jiffies(10000); 3430 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3431 if (amdgpu_sriov_vf(adev)) 3432 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3433 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3434 else 3435 adev->compute_timeout = msecs_to_jiffies(60000); 3436 3437 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3438 while ((timeout_setting = strsep(&input, ",")) && 3439 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3440 ret = kstrtol(timeout_setting, 0, &timeout); 3441 if (ret) 3442 return ret; 3443 3444 if (timeout == 0) { 3445 index++; 3446 continue; 3447 } else if (timeout < 0) { 3448 timeout = MAX_SCHEDULE_TIMEOUT; 3449 dev_warn(adev->dev, "lockup timeout disabled"); 3450 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3451 } else { 3452 timeout = msecs_to_jiffies(timeout); 3453 } 3454 3455 switch (index++) { 3456 case 0: 3457 adev->gfx_timeout = timeout; 3458 break; 3459 case 1: 3460 adev->compute_timeout = timeout; 3461 break; 3462 case 2: 3463 adev->sdma_timeout = timeout; 3464 break; 3465 case 3: 3466 adev->video_timeout = timeout; 3467 break; 3468 default: 3469 break; 3470 } 3471 } 3472 /* 3473 * There is only one value specified and 3474 * it should apply to all non-compute jobs. 3475 */ 3476 if (index == 1) { 3477 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3478 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3479 adev->compute_timeout = adev->gfx_timeout; 3480 } 3481 } 3482 3483 return ret; 3484 } 3485 3486 /** 3487 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3488 * 3489 * @adev: amdgpu_device pointer 3490 * 3491 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3492 */ 3493 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3494 { 3495 struct iommu_domain *domain; 3496 3497 domain = iommu_get_domain_for_dev(adev->dev); 3498 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3499 adev->ram_is_direct_mapped = true; 3500 } 3501 3502 static const struct attribute *amdgpu_dev_attributes[] = { 3503 &dev_attr_product_name.attr, 3504 &dev_attr_product_number.attr, 3505 &dev_attr_serial_number.attr, 3506 &dev_attr_pcie_replay_count.attr, 3507 NULL 3508 }; 3509 3510 /** 3511 * amdgpu_device_init - initialize the driver 3512 * 3513 * @adev: amdgpu_device pointer 3514 * @flags: driver flags 3515 * 3516 * Initializes the driver info and hw (all asics). 3517 * Returns 0 for success or an error on failure. 3518 * Called at driver startup. 
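 *
 * (For reference: flags & AMD_ASIC_MASK selects adev->asic_type below, unless
 * overridden by the amdgpu_force_asic_type parameter, while the remaining
 * flag bits carry chip properties such as AMD_IS_APU and, once detected,
 * AMD_IS_PX.)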
3519 */ 3520 int amdgpu_device_init(struct amdgpu_device *adev, 3521 uint32_t flags) 3522 { 3523 struct drm_device *ddev = adev_to_drm(adev); 3524 struct pci_dev *pdev = adev->pdev; 3525 int r, i; 3526 bool px = false; 3527 u32 max_MBps; 3528 3529 adev->shutdown = false; 3530 adev->flags = flags; 3531 3532 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3533 adev->asic_type = amdgpu_force_asic_type; 3534 else 3535 adev->asic_type = flags & AMD_ASIC_MASK; 3536 3537 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3538 if (amdgpu_emu_mode == 1) 3539 adev->usec_timeout *= 10; 3540 adev->gmc.gart_size = 512 * 1024 * 1024; 3541 adev->accel_working = false; 3542 adev->num_rings = 0; 3543 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3544 adev->mman.buffer_funcs = NULL; 3545 adev->mman.buffer_funcs_ring = NULL; 3546 adev->vm_manager.vm_pte_funcs = NULL; 3547 adev->vm_manager.vm_pte_num_scheds = 0; 3548 adev->gmc.gmc_funcs = NULL; 3549 adev->harvest_ip_mask = 0x0; 3550 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3551 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3552 3553 adev->smc_rreg = &amdgpu_invalid_rreg; 3554 adev->smc_wreg = &amdgpu_invalid_wreg; 3555 adev->pcie_rreg = &amdgpu_invalid_rreg; 3556 adev->pcie_wreg = &amdgpu_invalid_wreg; 3557 adev->pciep_rreg = &amdgpu_invalid_rreg; 3558 adev->pciep_wreg = &amdgpu_invalid_wreg; 3559 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3560 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3561 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3562 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3563 adev->didt_rreg = &amdgpu_invalid_rreg; 3564 adev->didt_wreg = &amdgpu_invalid_wreg; 3565 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3566 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3567 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3568 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3569 3570 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3571 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3572 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3573 3574 /* mutex initialization are all done here so we 3575 * can recall function without having locking issues */ 3576 mutex_init(&adev->firmware.mutex); 3577 mutex_init(&adev->pm.mutex); 3578 mutex_init(&adev->gfx.gpu_clock_mutex); 3579 mutex_init(&adev->srbm_mutex); 3580 mutex_init(&adev->gfx.pipe_reserve_mutex); 3581 mutex_init(&adev->gfx.gfx_off_mutex); 3582 mutex_init(&adev->grbm_idx_mutex); 3583 mutex_init(&adev->mn_lock); 3584 mutex_init(&adev->virt.vf_errors.lock); 3585 hash_init(adev->mn_hash); 3586 mutex_init(&adev->psp.mutex); 3587 mutex_init(&adev->notifier_lock); 3588 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3589 mutex_init(&adev->benchmark_mutex); 3590 3591 amdgpu_device_init_apu_flags(adev); 3592 3593 r = amdgpu_device_check_arguments(adev); 3594 if (r) 3595 return r; 3596 3597 spin_lock_init(&adev->mmio_idx_lock); 3598 spin_lock_init(&adev->smc_idx_lock); 3599 spin_lock_init(&adev->pcie_idx_lock); 3600 spin_lock_init(&adev->uvd_ctx_idx_lock); 3601 spin_lock_init(&adev->didt_idx_lock); 3602 spin_lock_init(&adev->gc_cac_idx_lock); 3603 spin_lock_init(&adev->se_cac_idx_lock); 3604 spin_lock_init(&adev->audio_endpt_idx_lock); 3605 spin_lock_init(&adev->mm_stats.lock); 3606 3607 INIT_LIST_HEAD(&adev->shadow_list); 3608 mutex_init(&adev->shadow_list_lock); 3609 3610 INIT_LIST_HEAD(&adev->reset_list); 3611 3612 INIT_LIST_HEAD(&adev->ras_list); 3613 3614 
INIT_DELAYED_WORK(&adev->delayed_init_work, 3615 amdgpu_device_delayed_init_work_handler); 3616 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3617 amdgpu_device_delay_enable_gfx_off); 3618 3619 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3620 3621 adev->gfx.gfx_off_req_count = 1; 3622 adev->gfx.gfx_off_residency = 0; 3623 adev->gfx.gfx_off_entrycount = 0; 3624 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3625 3626 atomic_set(&adev->throttling_logging_enabled, 1); 3627 /* 3628 * If throttling continues, logging will be performed every minute 3629 * to avoid log flooding. "-1" is subtracted since the thermal 3630 * throttling interrupt comes every second. Thus, the total logging 3631 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 3632 * for the throttling interrupt) = 60 seconds. 3633 */ 3634 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3635 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3636 3637 /* Registers mapping */ 3638 /* TODO: block userspace mapping of io register */ 3639 if (adev->asic_type >= CHIP_BONAIRE) { 3640 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3641 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3642 } else { 3643 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3644 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3645 } 3646 3647 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 3648 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 3649 3650 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3651 if (adev->rmmio == NULL) { 3652 return -ENOMEM; 3653 } 3654 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3655 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3656 3657 amdgpu_device_get_pcie_info(adev); 3658 3659 if (amdgpu_mcbp) 3660 DRM_INFO("MCBP is enabled\n"); 3661 3662 /* 3663 * The reset domain needs to be present early, before the XGMI hive is 3664 * discovered (if any) and initialized, so that the reset sem and in_gpu reset 3665 * flag can be used early during init and before any call to RREG32.
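 *
 * (A SINGLE_DEVICE reset domain is created just below as an early
 * placeholder; for XGMI configurations it is dropped again in
 * amdgpu_device_ip_init() in favour of the hive-wide reset domain.)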
3666 */ 3667 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3668 if (!adev->reset_domain) 3669 return -ENOMEM; 3670 3671 /* detect hw virtualization here */ 3672 amdgpu_detect_virtualization(adev); 3673 3674 r = amdgpu_device_get_job_timeout_settings(adev); 3675 if (r) { 3676 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3677 return r; 3678 } 3679 3680 /* early init functions */ 3681 r = amdgpu_device_ip_early_init(adev); 3682 if (r) 3683 return r; 3684 3685 /* Enable TMZ based on IP_VERSION */ 3686 amdgpu_gmc_tmz_set(adev); 3687 3688 amdgpu_gmc_noretry_set(adev); 3689 /* Need to get xgmi info early to decide the reset behavior*/ 3690 if (adev->gmc.xgmi.supported) { 3691 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3692 if (r) 3693 return r; 3694 } 3695 3696 /* enable PCIE atomic ops */ 3697 if (amdgpu_sriov_vf(adev)) 3698 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3699 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3700 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3701 else 3702 adev->have_atomics_support = 3703 !pci_enable_atomic_ops_to_root(adev->pdev, 3704 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3705 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3706 if (!adev->have_atomics_support) 3707 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3708 3709 /* doorbell bar mapping and doorbell index init*/ 3710 amdgpu_device_doorbell_init(adev); 3711 3712 if (amdgpu_emu_mode == 1) { 3713 /* post the asic on emulation mode */ 3714 emu_soc_asic_init(adev); 3715 goto fence_driver_init; 3716 } 3717 3718 amdgpu_reset_init(adev); 3719 3720 /* detect if we are with an SRIOV vbios */ 3721 amdgpu_device_detect_sriov_bios(adev); 3722 3723 /* check if we need to reset the asic 3724 * E.g., driver was not cleanly unloaded previously, etc. 
3725 */ 3726 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3727 if (adev->gmc.xgmi.num_physical_nodes) { 3728 dev_info(adev->dev, "Pending hive reset.\n"); 3729 adev->gmc.xgmi.pending_reset = true; 3730 /* Only need to init necessary block for SMU to handle the reset */ 3731 for (i = 0; i < adev->num_ip_blocks; i++) { 3732 if (!adev->ip_blocks[i].status.valid) 3733 continue; 3734 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3735 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3736 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3737 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3738 DRM_DEBUG("IP %s disabled for hw_init.\n", 3739 adev->ip_blocks[i].version->funcs->name); 3740 adev->ip_blocks[i].status.hw = true; 3741 } 3742 } 3743 } else { 3744 r = amdgpu_asic_reset(adev); 3745 if (r) { 3746 dev_err(adev->dev, "asic reset on init failed\n"); 3747 goto failed; 3748 } 3749 } 3750 } 3751 3752 pci_enable_pcie_error_reporting(adev->pdev); 3753 3754 /* Post card if necessary */ 3755 if (amdgpu_device_need_post(adev)) { 3756 if (!adev->bios) { 3757 dev_err(adev->dev, "no vBIOS found\n"); 3758 r = -EINVAL; 3759 goto failed; 3760 } 3761 DRM_INFO("GPU posting now...\n"); 3762 r = amdgpu_device_asic_init(adev); 3763 if (r) { 3764 dev_err(adev->dev, "gpu post error!\n"); 3765 goto failed; 3766 } 3767 } 3768 3769 if (adev->is_atom_fw) { 3770 /* Initialize clocks */ 3771 r = amdgpu_atomfirmware_get_clock_info(adev); 3772 if (r) { 3773 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3774 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3775 goto failed; 3776 } 3777 } else { 3778 /* Initialize clocks */ 3779 r = amdgpu_atombios_get_clock_info(adev); 3780 if (r) { 3781 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3782 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3783 goto failed; 3784 } 3785 /* init i2c buses */ 3786 if (!amdgpu_device_has_dc_support(adev)) 3787 amdgpu_atombios_i2c_init(adev); 3788 } 3789 3790 fence_driver_init: 3791 /* Fence driver */ 3792 r = amdgpu_fence_driver_sw_init(adev); 3793 if (r) { 3794 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3795 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3796 goto failed; 3797 } 3798 3799 /* init the mode config */ 3800 drm_mode_config_init(adev_to_drm(adev)); 3801 3802 r = amdgpu_device_ip_init(adev); 3803 if (r) { 3804 /* failed in exclusive mode due to timeout */ 3805 if (amdgpu_sriov_vf(adev) && 3806 !amdgpu_sriov_runtime(adev) && 3807 amdgpu_virt_mmio_blocked(adev) && 3808 !amdgpu_virt_wait_reset(adev)) { 3809 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3810 /* Don't send request since VF is inactive. */ 3811 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3812 adev->virt.ops = NULL; 3813 r = -EAGAIN; 3814 goto release_ras_con; 3815 } 3816 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3817 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3818 goto release_ras_con; 3819 } 3820 3821 amdgpu_fence_driver_hw_init(adev); 3822 3823 dev_info(adev->dev, 3824 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3825 adev->gfx.config.max_shader_engines, 3826 adev->gfx.config.max_sh_per_se, 3827 adev->gfx.config.max_cu_per_sh, 3828 adev->gfx.cu_info.number); 3829 3830 adev->accel_working = true; 3831 3832 amdgpu_vm_check_compute_bug(adev); 3833 3834 /* Initialize the buffer migration limit. 
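 * As a worked example of the computation below: with the default of 8 MB/s
 * (used when amdgpu_moverate is negative), log2_max_MBps = ilog2(8) = 3, so
 * later throttling math can divide by the limit with a simple shift.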
*/ 3835 if (amdgpu_moverate >= 0) 3836 max_MBps = amdgpu_moverate; 3837 else 3838 max_MBps = 8; /* Allow 8 MB/s. */ 3839 /* Get a log2 for easy divisions. */ 3840 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3841 3842 r = amdgpu_pm_sysfs_init(adev); 3843 if (r) { 3844 adev->pm_sysfs_en = false; 3845 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3846 } else 3847 adev->pm_sysfs_en = true; 3848 3849 r = amdgpu_ucode_sysfs_init(adev); 3850 if (r) { 3851 adev->ucode_sysfs_en = false; 3852 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3853 } else 3854 adev->ucode_sysfs_en = true; 3855 3856 r = amdgpu_psp_sysfs_init(adev); 3857 if (r) { 3858 adev->psp_sysfs_en = false; 3859 if (!amdgpu_sriov_vf(adev)) 3860 DRM_ERROR("Creating psp sysfs failed\n"); 3861 } else 3862 adev->psp_sysfs_en = true; 3863 3864 /* 3865 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3866 * Otherwise the mgpu fan boost feature will be skipped due to the 3867 * gpu instance is counted less. 3868 */ 3869 amdgpu_register_gpu_instance(adev); 3870 3871 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3872 * explicit gating rather than handling it automatically. 3873 */ 3874 if (!adev->gmc.xgmi.pending_reset) { 3875 r = amdgpu_device_ip_late_init(adev); 3876 if (r) { 3877 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3878 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3879 goto release_ras_con; 3880 } 3881 /* must succeed. */ 3882 amdgpu_ras_resume(adev); 3883 queue_delayed_work(system_wq, &adev->delayed_init_work, 3884 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3885 } 3886 3887 if (amdgpu_sriov_vf(adev)) 3888 flush_delayed_work(&adev->delayed_init_work); 3889 3890 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3891 if (r) 3892 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3893 3894 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3895 r = amdgpu_pmu_init(adev); 3896 if (r) 3897 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3898 3899 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3900 if (amdgpu_device_cache_pci_state(adev->pdev)) 3901 pci_restore_state(pdev); 3902 3903 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3904 /* this will fail for cards that aren't VGA class devices, just 3905 * ignore it */ 3906 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3907 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 3908 3909 if (amdgpu_device_supports_px(ddev)) { 3910 px = true; 3911 vga_switcheroo_register_client(adev->pdev, 3912 &amdgpu_switcheroo_ops, px); 3913 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3914 } 3915 3916 if (adev->gmc.xgmi.pending_reset) 3917 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3918 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3919 3920 amdgpu_device_check_iommu_direct_map(adev); 3921 3922 return 0; 3923 3924 release_ras_con: 3925 amdgpu_release_ras_context(adev); 3926 3927 failed: 3928 amdgpu_vf_error_trans_all(adev); 3929 3930 return r; 3931 } 3932 3933 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 3934 { 3935 3936 /* Clear all CPU mappings pointing to this device */ 3937 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 3938 3939 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 3940 amdgpu_device_doorbell_fini(adev); 3941 3942 iounmap(adev->rmmio); 3943 adev->rmmio = NULL; 3944 if (adev->mman.aper_base_kaddr) 3945 
iounmap(adev->mman.aper_base_kaddr); 3946 adev->mman.aper_base_kaddr = NULL; 3947 3948 /* Memory manager related */ 3949 if (!adev->gmc.xgmi.connected_to_cpu) { 3950 arch_phys_wc_del(adev->gmc.vram_mtrr); 3951 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 3952 } 3953 } 3954 3955 /** 3956 * amdgpu_device_fini_hw - tear down the driver 3957 * 3958 * @adev: amdgpu_device pointer 3959 * 3960 * Tear down the driver info (all asics). 3961 * Called at driver shutdown. 3962 */ 3963 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 3964 { 3965 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3966 flush_delayed_work(&adev->delayed_init_work); 3967 adev->shutdown = true; 3968 3969 /* make sure IB test finished before entering exclusive mode 3970 * to avoid preemption on IB test 3971 * */ 3972 if (amdgpu_sriov_vf(adev)) { 3973 amdgpu_virt_request_full_gpu(adev, false); 3974 amdgpu_virt_fini_data_exchange(adev); 3975 } 3976 3977 /* disable all interrupts */ 3978 amdgpu_irq_disable_all(adev); 3979 if (adev->mode_info.mode_config_initialized){ 3980 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 3981 drm_helper_force_disable_all(adev_to_drm(adev)); 3982 else 3983 drm_atomic_helper_shutdown(adev_to_drm(adev)); 3984 } 3985 amdgpu_fence_driver_hw_fini(adev); 3986 3987 if (adev->mman.initialized) 3988 drain_workqueue(adev->mman.bdev.wq); 3989 3990 if (adev->pm_sysfs_en) 3991 amdgpu_pm_sysfs_fini(adev); 3992 if (adev->ucode_sysfs_en) 3993 amdgpu_ucode_sysfs_fini(adev); 3994 if (adev->psp_sysfs_en) 3995 amdgpu_psp_sysfs_fini(adev); 3996 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 3997 3998 /* disable ras feature must before hw fini */ 3999 amdgpu_ras_pre_fini(adev); 4000 4001 amdgpu_device_ip_fini_early(adev); 4002 4003 amdgpu_irq_fini_hw(adev); 4004 4005 if (adev->mman.initialized) 4006 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4007 4008 amdgpu_gart_dummy_page_fini(adev); 4009 4010 amdgpu_device_unmap_mmio(adev); 4011 4012 } 4013 4014 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4015 { 4016 int idx; 4017 4018 amdgpu_fence_driver_sw_fini(adev); 4019 amdgpu_device_ip_fini(adev); 4020 release_firmware(adev->firmware.gpu_info_fw); 4021 adev->firmware.gpu_info_fw = NULL; 4022 adev->accel_working = false; 4023 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4024 4025 amdgpu_reset_fini(adev); 4026 4027 /* free i2c buses */ 4028 if (!amdgpu_device_has_dc_support(adev)) 4029 amdgpu_i2c_fini(adev); 4030 4031 if (amdgpu_emu_mode != 1) 4032 amdgpu_atombios_fini(adev); 4033 4034 kfree(adev->bios); 4035 adev->bios = NULL; 4036 if (amdgpu_device_supports_px(adev_to_drm(adev))) { 4037 vga_switcheroo_unregister_client(adev->pdev); 4038 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4039 } 4040 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4041 vga_client_unregister(adev->pdev); 4042 4043 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4044 4045 iounmap(adev->rmmio); 4046 adev->rmmio = NULL; 4047 amdgpu_device_doorbell_fini(adev); 4048 drm_dev_exit(idx); 4049 } 4050 4051 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4052 amdgpu_pmu_fini(adev); 4053 if (adev->mman.discovery_bin) 4054 amdgpu_discovery_fini(adev); 4055 4056 amdgpu_reset_put_reset_domain(adev->reset_domain); 4057 adev->reset_domain = NULL; 4058 4059 kfree(adev->pci_state); 4060 4061 } 4062 4063 /** 4064 * amdgpu_device_evict_resources - evict device resources 4065 * @adev: amdgpu device object 4066 * 4067 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4068 
* of the vram memory type. Mainly used for evicting device resources 4069 * at suspend time. 4070 * 4071 */ 4072 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4073 { 4074 int ret; 4075 4076 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4077 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4078 return 0; 4079 4080 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4081 if (ret) 4082 DRM_WARN("evicting device resources failed\n"); 4083 return ret; 4084 } 4085 4086 /* 4087 * Suspend & resume. 4088 */ 4089 /** 4090 * amdgpu_device_suspend - initiate device suspend 4091 * 4092 * @dev: drm dev pointer 4093 * @fbcon : notify the fbdev of suspend 4094 * 4095 * Puts the hw in the suspend state (all asics). 4096 * Returns 0 for success or an error on failure. 4097 * Called at driver suspend. 4098 */ 4099 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4100 { 4101 struct amdgpu_device *adev = drm_to_adev(dev); 4102 int r = 0; 4103 4104 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4105 return 0; 4106 4107 adev->in_suspend = true; 4108 4109 if (amdgpu_sriov_vf(adev)) { 4110 amdgpu_virt_fini_data_exchange(adev); 4111 r = amdgpu_virt_request_full_gpu(adev, false); 4112 if (r) 4113 return r; 4114 } 4115 4116 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4117 DRM_WARN("smart shift update failed\n"); 4118 4119 drm_kms_helper_poll_disable(dev); 4120 4121 if (fbcon) 4122 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4123 4124 cancel_delayed_work_sync(&adev->delayed_init_work); 4125 4126 amdgpu_ras_suspend(adev); 4127 4128 amdgpu_device_ip_suspend_phase1(adev); 4129 4130 if (!adev->in_s0ix) 4131 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4132 4133 r = amdgpu_device_evict_resources(adev); 4134 if (r) 4135 return r; 4136 4137 amdgpu_fence_driver_hw_fini(adev); 4138 4139 amdgpu_device_ip_suspend_phase2(adev); 4140 4141 if (amdgpu_sriov_vf(adev)) 4142 amdgpu_virt_release_full_gpu(adev, false); 4143 4144 return 0; 4145 } 4146 4147 /** 4148 * amdgpu_device_resume - initiate device resume 4149 * 4150 * @dev: drm dev pointer 4151 * @fbcon : notify the fbdev of resume 4152 * 4153 * Bring the hw back to operating state (all asics). 4154 * Returns 0 for success or an error on failure. 4155 * Called at driver resume. 
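 *
 * Illustrative sketch only (this helper is hypothetical, not part of the
 * file): a system/runtime PM callback would typically pair this with
 * amdgpu_device_suspend(), e.g.
 *
 *   static int example_pm_resume(struct device *dev)
 *   {
 *           struct drm_device *drm_dev = dev_get_drvdata(dev);
 *
 *           return amdgpu_device_resume(drm_dev, true);
 *   }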
4156 */ 4157 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4158 { 4159 struct amdgpu_device *adev = drm_to_adev(dev); 4160 int r = 0; 4161 4162 if (amdgpu_sriov_vf(adev)) { 4163 r = amdgpu_virt_request_full_gpu(adev, true); 4164 if (r) 4165 return r; 4166 } 4167 4168 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4169 return 0; 4170 4171 if (adev->in_s0ix) 4172 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4173 4174 /* post card */ 4175 if (amdgpu_device_need_post(adev)) { 4176 r = amdgpu_device_asic_init(adev); 4177 if (r) 4178 dev_err(adev->dev, "amdgpu asic init failed\n"); 4179 } 4180 4181 r = amdgpu_device_ip_resume(adev); 4182 4183 /* no matter what r is, always need to properly release full GPU */ 4184 if (amdgpu_sriov_vf(adev)) { 4185 amdgpu_virt_init_data_exchange(adev); 4186 amdgpu_virt_release_full_gpu(adev, true); 4187 } 4188 4189 if (r) { 4190 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4191 return r; 4192 } 4193 amdgpu_fence_driver_hw_init(adev); 4194 4195 r = amdgpu_device_ip_late_init(adev); 4196 if (r) 4197 return r; 4198 4199 queue_delayed_work(system_wq, &adev->delayed_init_work, 4200 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4201 4202 if (!adev->in_s0ix) { 4203 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4204 if (r) 4205 return r; 4206 } 4207 4208 /* Make sure IB tests flushed */ 4209 if (amdgpu_sriov_vf(adev)) 4210 amdgpu_irq_gpu_reset_resume_helper(adev); 4211 flush_delayed_work(&adev->delayed_init_work); 4212 4213 if (adev->in_s0ix) { 4214 /* re-enable gfxoff after IP resume. This re-enables gfxoff after 4215 * it was disabled for IP resume in amdgpu_device_ip_resume_phase2(). 4216 */ 4217 amdgpu_gfx_off_ctrl(adev, true); 4218 DRM_DEBUG("will enable gfxoff for the mission mode\n"); 4219 } 4220 if (fbcon) 4221 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4222 4223 drm_kms_helper_poll_enable(dev); 4224 4225 amdgpu_ras_resume(adev); 4226 4227 if (adev->mode_info.num_crtc) { 4228 /* 4229 * Most of the connector probing functions try to acquire runtime pm 4230 * refs to ensure that the GPU is powered on when connector polling is 4231 * performed. Since we're calling this from a runtime PM callback, 4232 * trying to acquire rpm refs will cause us to deadlock. 4233 * 4234 * Since we're guaranteed to be holding the rpm lock, it's safe to 4235 * temporarily disable the rpm helpers so this doesn't deadlock us. 4236 */ 4237 #ifdef CONFIG_PM 4238 dev->dev->power.disable_depth++; 4239 #endif 4240 if (!adev->dc_enabled) 4241 drm_helper_hpd_irq_event(dev); 4242 else 4243 drm_kms_helper_hotplug_event(dev); 4244 #ifdef CONFIG_PM 4245 dev->dev->power.disable_depth--; 4246 #endif 4247 } 4248 adev->in_suspend = false; 4249 4250 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4251 DRM_WARN("smart shift update failed\n"); 4252 4253 return 0; 4254 } 4255 4256 /** 4257 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4258 * 4259 * @adev: amdgpu_device pointer 4260 * 4261 * The list of all the hardware IPs that make up the asic is walked and 4262 * the check_soft_reset callbacks are run. check_soft_reset determines 4263 * if the asic is still hung or not. 4264 * Returns true if any of the IPs are still in a hung state, false if not. 
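 *
 * A minimal usage sketch (it mirrors the check done later in this file by
 * amdgpu_device_should_recover_gpu()):
 *
 *   if (!amdgpu_device_ip_check_soft_reset(adev)) {
 *           /* timeout fired, but no IP block reports a hang */
 *           return false;
 *   }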
4265 */ 4266 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4267 { 4268 int i; 4269 bool asic_hang = false; 4270 4271 if (amdgpu_sriov_vf(adev)) 4272 return true; 4273 4274 if (amdgpu_asic_need_full_reset(adev)) 4275 return true; 4276 4277 for (i = 0; i < adev->num_ip_blocks; i++) { 4278 if (!adev->ip_blocks[i].status.valid) 4279 continue; 4280 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4281 adev->ip_blocks[i].status.hang = 4282 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4283 if (adev->ip_blocks[i].status.hang) { 4284 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4285 asic_hang = true; 4286 } 4287 } 4288 return asic_hang; 4289 } 4290 4291 /** 4292 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4293 * 4294 * @adev: amdgpu_device pointer 4295 * 4296 * The list of all the hardware IPs that make up the asic is walked and the 4297 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4298 * handles any IP specific hardware or software state changes that are 4299 * necessary for a soft reset to succeed. 4300 * Returns 0 on success, negative error code on failure. 4301 */ 4302 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4303 { 4304 int i, r = 0; 4305 4306 for (i = 0; i < adev->num_ip_blocks; i++) { 4307 if (!adev->ip_blocks[i].status.valid) 4308 continue; 4309 if (adev->ip_blocks[i].status.hang && 4310 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4311 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4312 if (r) 4313 return r; 4314 } 4315 } 4316 4317 return 0; 4318 } 4319 4320 /** 4321 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4322 * 4323 * @adev: amdgpu_device pointer 4324 * 4325 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4326 * reset is necessary to recover. 4327 * Returns true if a full asic reset is required, false if not. 4328 */ 4329 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4330 { 4331 int i; 4332 4333 if (amdgpu_asic_need_full_reset(adev)) 4334 return true; 4335 4336 for (i = 0; i < adev->num_ip_blocks; i++) { 4337 if (!adev->ip_blocks[i].status.valid) 4338 continue; 4339 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4340 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4341 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4342 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4343 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4344 if (adev->ip_blocks[i].status.hang) { 4345 dev_info(adev->dev, "Some block need full reset!\n"); 4346 return true; 4347 } 4348 } 4349 } 4350 return false; 4351 } 4352 4353 /** 4354 * amdgpu_device_ip_soft_reset - do a soft reset 4355 * 4356 * @adev: amdgpu_device pointer 4357 * 4358 * The list of all the hardware IPs that make up the asic is walked and the 4359 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4360 * IP specific hardware or software state changes that are necessary to soft 4361 * reset the IP. 4362 * Returns 0 on success, negative error code on failure. 
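 *
 * These hooks are normally driven as a sequence; a sketch of how
 * amdgpu_device_pre_asic_reset() uses them later in this file:
 *
 *   amdgpu_device_ip_pre_soft_reset(adev);
 *   r = amdgpu_device_ip_soft_reset(adev);
 *   amdgpu_device_ip_post_soft_reset(adev);
 *   if (r || amdgpu_device_ip_check_soft_reset(adev))
 *           need_full_reset = true;   /* fall back to a full ASIC reset */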
4363 */ 4364 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4365 { 4366 int i, r = 0; 4367 4368 for (i = 0; i < adev->num_ip_blocks; i++) { 4369 if (!adev->ip_blocks[i].status.valid) 4370 continue; 4371 if (adev->ip_blocks[i].status.hang && 4372 adev->ip_blocks[i].version->funcs->soft_reset) { 4373 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4374 if (r) 4375 return r; 4376 } 4377 } 4378 4379 return 0; 4380 } 4381 4382 /** 4383 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4384 * 4385 * @adev: amdgpu_device pointer 4386 * 4387 * The list of all the hardware IPs that make up the asic is walked and the 4388 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4389 * handles any IP specific hardware or software state changes that are 4390 * necessary after the IP has been soft reset. 4391 * Returns 0 on success, negative error code on failure. 4392 */ 4393 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4394 { 4395 int i, r = 0; 4396 4397 for (i = 0; i < adev->num_ip_blocks; i++) { 4398 if (!adev->ip_blocks[i].status.valid) 4399 continue; 4400 if (adev->ip_blocks[i].status.hang && 4401 adev->ip_blocks[i].version->funcs->post_soft_reset) 4402 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4403 if (r) 4404 return r; 4405 } 4406 4407 return 0; 4408 } 4409 4410 /** 4411 * amdgpu_device_recover_vram - Recover some VRAM contents 4412 * 4413 * @adev: amdgpu_device pointer 4414 * 4415 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4416 * restore things like GPUVM page tables after a GPU reset where 4417 * the contents of VRAM might be lost. 4418 * 4419 * Returns: 4420 * 0 on success, negative error code on failure. 4421 */ 4422 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4423 { 4424 struct dma_fence *fence = NULL, *next = NULL; 4425 struct amdgpu_bo *shadow; 4426 struct amdgpu_bo_vm *vmbo; 4427 long r = 1, tmo; 4428 4429 if (amdgpu_sriov_runtime(adev)) 4430 tmo = msecs_to_jiffies(8000); 4431 else 4432 tmo = msecs_to_jiffies(100); 4433 4434 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4435 mutex_lock(&adev->shadow_list_lock); 4436 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4437 shadow = &vmbo->bo; 4438 /* No need to recover an evicted BO */ 4439 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4440 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4441 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4442 continue; 4443 4444 r = amdgpu_bo_restore_shadow(shadow, &next); 4445 if (r) 4446 break; 4447 4448 if (fence) { 4449 tmo = dma_fence_wait_timeout(fence, false, tmo); 4450 dma_fence_put(fence); 4451 fence = next; 4452 if (tmo == 0) { 4453 r = -ETIMEDOUT; 4454 break; 4455 } else if (tmo < 0) { 4456 r = tmo; 4457 break; 4458 } 4459 } else { 4460 fence = next; 4461 } 4462 } 4463 mutex_unlock(&adev->shadow_list_lock); 4464 4465 if (fence) 4466 tmo = dma_fence_wait_timeout(fence, false, tmo); 4467 dma_fence_put(fence); 4468 4469 if (r < 0 || tmo <= 0) { 4470 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4471 return -EIO; 4472 } 4473 4474 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4475 return 0; 4476 } 4477 4478 4479 /** 4480 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4481 * 4482 * @adev: amdgpu_device pointer 4483 * @from_hypervisor: request from hypervisor 4484 * 4485 * do VF FLR and reinitialize Asic 4486 * return 0 means succeeded 
otherwise failed 4487 */ 4488 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4489 bool from_hypervisor) 4490 { 4491 int r; 4492 struct amdgpu_hive_info *hive = NULL; 4493 int retry_limit = 0; 4494 4495 retry: 4496 amdgpu_amdkfd_pre_reset(adev); 4497 4498 if (from_hypervisor) 4499 r = amdgpu_virt_request_full_gpu(adev, true); 4500 else 4501 r = amdgpu_virt_reset_gpu(adev); 4502 if (r) 4503 return r; 4504 4505 /* Resume IP prior to SMC */ 4506 r = amdgpu_device_ip_reinit_early_sriov(adev); 4507 if (r) 4508 goto error; 4509 4510 amdgpu_virt_init_data_exchange(adev); 4511 4512 r = amdgpu_device_fw_loading(adev); 4513 if (r) 4514 return r; 4515 4516 /* now we are okay to resume SMC/CP/SDMA */ 4517 r = amdgpu_device_ip_reinit_late_sriov(adev); 4518 if (r) 4519 goto error; 4520 4521 hive = amdgpu_get_xgmi_hive(adev); 4522 /* Update PSP FW topology after reset */ 4523 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4524 r = amdgpu_xgmi_update_topology(hive, adev); 4525 4526 if (hive) 4527 amdgpu_put_xgmi_hive(hive); 4528 4529 if (!r) { 4530 amdgpu_irq_gpu_reset_resume_helper(adev); 4531 r = amdgpu_ib_ring_tests(adev); 4532 4533 amdgpu_amdkfd_post_reset(adev); 4534 } 4535 4536 error: 4537 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4538 amdgpu_inc_vram_lost(adev); 4539 r = amdgpu_device_recover_vram(adev); 4540 } 4541 amdgpu_virt_release_full_gpu(adev, true); 4542 4543 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4544 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4545 retry_limit++; 4546 goto retry; 4547 } else 4548 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4549 } 4550 4551 return r; 4552 } 4553 4554 /** 4555 * amdgpu_device_has_job_running - check if there is any job in mirror list 4556 * 4557 * @adev: amdgpu_device pointer 4558 * 4559 * check if there is any job in mirror list 4560 */ 4561 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4562 { 4563 int i; 4564 struct drm_sched_job *job; 4565 4566 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4567 struct amdgpu_ring *ring = adev->rings[i]; 4568 4569 if (!ring || !ring->sched.thread) 4570 continue; 4571 4572 spin_lock(&ring->sched.job_list_lock); 4573 job = list_first_entry_or_null(&ring->sched.pending_list, 4574 struct drm_sched_job, list); 4575 spin_unlock(&ring->sched.job_list_lock); 4576 if (job) 4577 return true; 4578 } 4579 return false; 4580 } 4581 4582 /** 4583 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4584 * 4585 * @adev: amdgpu_device pointer 4586 * 4587 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4588 * a hung GPU. 
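 *
 * Illustrative caller sketch (hypothetical, not taken from this file): a job
 * timeout handler would typically gate recovery on this check, e.g.
 *
 *   if (amdgpu_device_should_recover_gpu(adev))
 *           amdgpu_device_gpu_recover(adev, job, &reset_context);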
4589 */ 4590 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4591 { 4592 4593 if (amdgpu_gpu_recovery == 0) 4594 goto disabled; 4595 4596 /* Skip soft reset check in fatal error mode */ 4597 if (!amdgpu_ras_is_poison_mode_supported(adev)) 4598 return true; 4599 4600 if (!amdgpu_device_ip_check_soft_reset(adev)) { 4601 dev_info(adev->dev,"Timeout, but no hardware hang detected.\n"); 4602 return false; 4603 } 4604 4605 if (amdgpu_sriov_vf(adev)) 4606 return true; 4607 4608 if (amdgpu_gpu_recovery == -1) { 4609 switch (adev->asic_type) { 4610 #ifdef CONFIG_DRM_AMDGPU_SI 4611 case CHIP_VERDE: 4612 case CHIP_TAHITI: 4613 case CHIP_PITCAIRN: 4614 case CHIP_OLAND: 4615 case CHIP_HAINAN: 4616 #endif 4617 #ifdef CONFIG_DRM_AMDGPU_CIK 4618 case CHIP_KAVERI: 4619 case CHIP_KABINI: 4620 case CHIP_MULLINS: 4621 #endif 4622 case CHIP_CARRIZO: 4623 case CHIP_STONEY: 4624 case CHIP_CYAN_SKILLFISH: 4625 goto disabled; 4626 default: 4627 break; 4628 } 4629 } 4630 4631 return true; 4632 4633 disabled: 4634 dev_info(adev->dev, "GPU recovery disabled.\n"); 4635 return false; 4636 } 4637 4638 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4639 { 4640 u32 i; 4641 int ret = 0; 4642 4643 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4644 4645 dev_info(adev->dev, "GPU mode1 reset\n"); 4646 4647 /* disable BM */ 4648 pci_clear_master(adev->pdev); 4649 4650 amdgpu_device_cache_pci_state(adev->pdev); 4651 4652 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4653 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4654 ret = amdgpu_dpm_mode1_reset(adev); 4655 } else { 4656 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4657 ret = psp_gpu_reset(adev); 4658 } 4659 4660 if (ret) 4661 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4662 4663 amdgpu_device_load_pci_state(adev->pdev); 4664 4665 /* wait for asic to come out of reset */ 4666 for (i = 0; i < adev->usec_timeout; i++) { 4667 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4668 4669 if (memsize != 0xffffffff) 4670 break; 4671 udelay(1); 4672 } 4673 4674 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4675 return ret; 4676 } 4677 4678 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4679 struct amdgpu_reset_context *reset_context) 4680 { 4681 int i, r = 0; 4682 struct amdgpu_job *job = NULL; 4683 bool need_full_reset = 4684 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4685 4686 if (reset_context->reset_req_dev == adev) 4687 job = reset_context->job; 4688 4689 if (amdgpu_sriov_vf(adev)) { 4690 /* stop the data exchange thread */ 4691 amdgpu_virt_fini_data_exchange(adev); 4692 } 4693 4694 amdgpu_fence_driver_isr_toggle(adev, true); 4695 4696 /* block all schedulers and reset given job's ring */ 4697 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4698 struct amdgpu_ring *ring = adev->rings[i]; 4699 4700 if (!ring || !ring->sched.thread) 4701 continue; 4702 4703 /*clear job fence from fence drv to avoid force_completion 4704 *leave NULL and vm flush fence in fence drv */ 4705 amdgpu_fence_driver_clear_job_fences(ring); 4706 4707 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4708 amdgpu_fence_driver_force_completion(ring); 4709 } 4710 4711 amdgpu_fence_driver_isr_toggle(adev, false); 4712 4713 if (job && job->vm) 4714 drm_sched_increase_karma(&job->base); 4715 4716 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4717 /* If reset handler not implemented, continue; otherwise return */ 4718 if (r == -ENOSYS) 4719 r = 0; 4720 else 4721 return r; 4722 4723 /* Don't suspend 
on bare metal if we are not going to HW reset the ASIC */ 4724 if (!amdgpu_sriov_vf(adev)) { 4725 4726 if (!need_full_reset) 4727 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4728 4729 if (!need_full_reset && amdgpu_gpu_recovery) { 4730 amdgpu_device_ip_pre_soft_reset(adev); 4731 r = amdgpu_device_ip_soft_reset(adev); 4732 amdgpu_device_ip_post_soft_reset(adev); 4733 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4734 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4735 need_full_reset = true; 4736 } 4737 } 4738 4739 if (need_full_reset) 4740 r = amdgpu_device_ip_suspend(adev); 4741 if (need_full_reset) 4742 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4743 else 4744 clear_bit(AMDGPU_NEED_FULL_RESET, 4745 &reset_context->flags); 4746 } 4747 4748 return r; 4749 } 4750 4751 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 4752 { 4753 int i; 4754 4755 lockdep_assert_held(&adev->reset_domain->sem); 4756 4757 for (i = 0; i < adev->num_regs; i++) { 4758 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); 4759 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], 4760 adev->reset_dump_reg_value[i]); 4761 } 4762 4763 return 0; 4764 } 4765 4766 #ifdef CONFIG_DEV_COREDUMP 4767 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, 4768 size_t count, void *data, size_t datalen) 4769 { 4770 struct drm_printer p; 4771 struct amdgpu_device *adev = data; 4772 struct drm_print_iterator iter; 4773 int i; 4774 4775 iter.data = buffer; 4776 iter.offset = 0; 4777 iter.start = offset; 4778 iter.remain = count; 4779 4780 p = drm_coredump_printer(&iter); 4781 4782 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 4783 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 4784 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 4785 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec); 4786 if (adev->reset_task_info.pid) 4787 drm_printf(&p, "process_name: %s PID: %d\n", 4788 adev->reset_task_info.process_name, 4789 adev->reset_task_info.pid); 4790 4791 if (adev->reset_vram_lost) 4792 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 4793 if (adev->num_regs) { 4794 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); 4795 4796 for (i = 0; i < adev->num_regs; i++) 4797 drm_printf(&p, "0x%08x: 0x%08x\n", 4798 adev->reset_dump_reg_list[i], 4799 adev->reset_dump_reg_value[i]); 4800 } 4801 4802 return count - iter.remain; 4803 } 4804 4805 static void amdgpu_devcoredump_free(void *data) 4806 { 4807 } 4808 4809 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) 4810 { 4811 struct drm_device *dev = adev_to_drm(adev); 4812 4813 ktime_get_ts64(&adev->reset_time); 4814 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL, 4815 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 4816 } 4817 #endif 4818 4819 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4820 struct amdgpu_reset_context *reset_context) 4821 { 4822 struct amdgpu_device *tmp_adev = NULL; 4823 bool need_full_reset, skip_hw_reset, vram_lost = false; 4824 int r = 0; 4825 bool gpu_reset_for_dev_remove = 0; 4826 4827 /* Try reset handler method first */ 4828 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4829 reset_list); 4830 amdgpu_reset_reg_dumps(tmp_adev); 4831 4832 reset_context->reset_device_list = device_list_handle; 4833 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4834 /* If reset handler not implemented, continue; otherwise return */ 4835 if (r == 
-ENOSYS) 4836 r = 0; 4837 else 4838 return r; 4839 4840 /* Reset handler not implemented, use the default method */ 4841 need_full_reset = 4842 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4843 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4844 4845 gpu_reset_for_dev_remove = 4846 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 4847 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4848 4849 /* 4850 * ASIC reset has to be done on all XGMI hive nodes ASAP 4851 * to allow proper links negotiation in FW (within 1 sec) 4852 */ 4853 if (!skip_hw_reset && need_full_reset) { 4854 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4855 /* For XGMI run all resets in parallel to speed up the process */ 4856 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4857 tmp_adev->gmc.xgmi.pending_reset = false; 4858 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4859 r = -EALREADY; 4860 } else 4861 r = amdgpu_asic_reset(tmp_adev); 4862 4863 if (r) { 4864 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4865 r, adev_to_drm(tmp_adev)->unique); 4866 break; 4867 } 4868 } 4869 4870 /* For XGMI wait for all resets to complete before proceed */ 4871 if (!r) { 4872 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4873 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4874 flush_work(&tmp_adev->xgmi_reset_work); 4875 r = tmp_adev->asic_reset_res; 4876 if (r) 4877 break; 4878 } 4879 } 4880 } 4881 } 4882 4883 if (!r && amdgpu_ras_intr_triggered()) { 4884 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4885 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && 4886 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 4887 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 4888 } 4889 4890 amdgpu_ras_intr_cleared(); 4891 } 4892 4893 /* Since the mode1 reset affects base ip blocks, the 4894 * phase1 ip blocks need to be resumed. Otherwise there 4895 * will be a BIOS signature error and the psp bootloader 4896 * can't load kdb on the next amdgpu install. 
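 * (This is why the gpu_reset_for_dev_remove path right below resumes the
 * phase1 IP blocks on every device in the list before skipping the rest of
 * the re-init sequence.)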
4897 */ 4898 if (gpu_reset_for_dev_remove) { 4899 list_for_each_entry(tmp_adev, device_list_handle, reset_list) 4900 amdgpu_device_ip_resume_phase1(tmp_adev); 4901 4902 goto end; 4903 } 4904 4905 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4906 if (need_full_reset) { 4907 /* post card */ 4908 r = amdgpu_device_asic_init(tmp_adev); 4909 if (r) { 4910 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4911 } else { 4912 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4913 r = amdgpu_amdkfd_resume_iommu(tmp_adev); 4914 if (r) 4915 goto out; 4916 4917 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4918 if (r) 4919 goto out; 4920 4921 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4922 #ifdef CONFIG_DEV_COREDUMP 4923 tmp_adev->reset_vram_lost = vram_lost; 4924 memset(&tmp_adev->reset_task_info, 0, 4925 sizeof(tmp_adev->reset_task_info)); 4926 if (reset_context->job && reset_context->job->vm) 4927 tmp_adev->reset_task_info = 4928 reset_context->job->vm->task_info; 4929 amdgpu_reset_capture_coredumpm(tmp_adev); 4930 #endif 4931 if (vram_lost) { 4932 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4933 amdgpu_inc_vram_lost(tmp_adev); 4934 } 4935 4936 r = amdgpu_device_fw_loading(tmp_adev); 4937 if (r) 4938 return r; 4939 4940 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4941 if (r) 4942 goto out; 4943 4944 if (vram_lost) 4945 amdgpu_device_fill_reset_magic(tmp_adev); 4946 4947 /* 4948 * Add this ASIC as tracked, as the reset has already 4949 * completed successfully. 4950 */ 4951 amdgpu_register_gpu_instance(tmp_adev); 4952 4953 if (!reset_context->hive && 4954 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4955 amdgpu_xgmi_add_device(tmp_adev); 4956 4957 r = amdgpu_device_ip_late_init(tmp_adev); 4958 if (r) 4959 goto out; 4960 4961 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 4962 4963 /* 4964 * The GPU enters a bad state once the number of faulty pages 4965 * reported by ECC reaches the threshold, and RAS 4966 * recovery is scheduled next. So add one check 4967 * here to break recovery if the bad page threshold 4968 * has indeed been exceeded, and remind the user to 4969 * retire this GPU or set a bigger 4970 * bad_page_threshold value to fix this the next 4971 * time the driver is probed. 4972 */ 4973 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 4974 /* must succeed.
*/ 4975 amdgpu_ras_resume(tmp_adev); 4976 } else { 4977 r = -EINVAL; 4978 goto out; 4979 } 4980 4981 /* Update PSP FW topology after reset */ 4982 if (reset_context->hive && 4983 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4984 r = amdgpu_xgmi_update_topology( 4985 reset_context->hive, tmp_adev); 4986 } 4987 } 4988 4989 out: 4990 if (!r) { 4991 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 4992 r = amdgpu_ib_ring_tests(tmp_adev); 4993 if (r) { 4994 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 4995 need_full_reset = true; 4996 r = -EAGAIN; 4997 goto end; 4998 } 4999 } 5000 5001 if (!r) 5002 r = amdgpu_device_recover_vram(tmp_adev); 5003 else 5004 tmp_adev->asic_reset_res = r; 5005 } 5006 5007 end: 5008 if (need_full_reset) 5009 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5010 else 5011 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5012 return r; 5013 } 5014 5015 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5016 { 5017 5018 switch (amdgpu_asic_reset_method(adev)) { 5019 case AMD_RESET_METHOD_MODE1: 5020 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5021 break; 5022 case AMD_RESET_METHOD_MODE2: 5023 adev->mp1_state = PP_MP1_STATE_RESET; 5024 break; 5025 default: 5026 adev->mp1_state = PP_MP1_STATE_NONE; 5027 break; 5028 } 5029 } 5030 5031 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5032 { 5033 amdgpu_vf_error_trans_all(adev); 5034 adev->mp1_state = PP_MP1_STATE_NONE; 5035 } 5036 5037 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5038 { 5039 struct pci_dev *p = NULL; 5040 5041 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5042 adev->pdev->bus->number, 1); 5043 if (p) { 5044 pm_runtime_enable(&(p->dev)); 5045 pm_runtime_resume(&(p->dev)); 5046 } 5047 } 5048 5049 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5050 { 5051 enum amd_reset_method reset_method; 5052 struct pci_dev *p = NULL; 5053 u64 expires; 5054 5055 /* 5056 * For now, only BACO and mode1 reset are confirmed 5057 * to suffer the audio issue without proper suspended. 5058 */ 5059 reset_method = amdgpu_asic_reset_method(adev); 5060 if ((reset_method != AMD_RESET_METHOD_BACO) && 5061 (reset_method != AMD_RESET_METHOD_MODE1)) 5062 return -EINVAL; 5063 5064 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5065 adev->pdev->bus->number, 1); 5066 if (!p) 5067 return -ENODEV; 5068 5069 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5070 if (!expires) 5071 /* 5072 * If we cannot get the audio device autosuspend delay, 5073 * a fixed 4S interval will be used. Considering 3S is 5074 * the audio controller default autosuspend delay setting. 5075 * 4S used here is guaranteed to cover that. 5076 */ 5077 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5078 5079 while (!pm_runtime_status_suspended(&(p->dev))) { 5080 if (!pm_runtime_suspend(&(p->dev))) 5081 break; 5082 5083 if (expires < ktime_get_mono_fast_ns()) { 5084 dev_warn(adev->dev, "failed to suspend display audio\n"); 5085 /* TODO: abort the succeeding gpu reset? 
*/ 5086 return -ETIMEDOUT; 5087 } 5088 } 5089 5090 pm_runtime_disable(&(p->dev)); 5091 5092 return 0; 5093 } 5094 5095 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5096 { 5097 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5098 5099 #if defined(CONFIG_DEBUG_FS) 5100 if (!amdgpu_sriov_vf(adev)) 5101 cancel_work(&adev->reset_work); 5102 #endif 5103 5104 if (adev->kfd.dev) 5105 cancel_work(&adev->kfd.reset_work); 5106 5107 if (amdgpu_sriov_vf(adev)) 5108 cancel_work(&adev->virt.flr_work); 5109 5110 if (con && adev->ras_enabled) 5111 cancel_work(&con->recovery_work); 5112 5113 } 5114 5115 /** 5116 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5117 * 5118 * @adev: amdgpu_device pointer 5119 * @job: which job trigger hang 5120 * 5121 * Attempt to reset the GPU if it has hung (all asics). 5122 * Attempt to do soft-reset or full-reset and reinitialize Asic 5123 * Returns 0 for success or an error on failure. 5124 */ 5125 5126 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5127 struct amdgpu_job *job, 5128 struct amdgpu_reset_context *reset_context) 5129 { 5130 struct list_head device_list, *device_list_handle = NULL; 5131 bool job_signaled = false; 5132 struct amdgpu_hive_info *hive = NULL; 5133 struct amdgpu_device *tmp_adev = NULL; 5134 int i, r = 0; 5135 bool need_emergency_restart = false; 5136 bool audio_suspended = false; 5137 bool gpu_reset_for_dev_remove = false; 5138 5139 gpu_reset_for_dev_remove = 5140 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5141 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5142 5143 /* 5144 * Special case: RAS triggered and full reset isn't supported 5145 */ 5146 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5147 5148 /* 5149 * Flush RAM to disk so that after reboot 5150 * the user can read log and see why the system rebooted. 5151 */ 5152 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5153 DRM_WARN("Emergency reboot."); 5154 5155 ksys_sync_helper(); 5156 emergency_restart(); 5157 } 5158 5159 dev_info(adev->dev, "GPU %s begin!\n", 5160 need_emergency_restart ? "jobs stop":"reset"); 5161 5162 if (!amdgpu_sriov_vf(adev)) 5163 hive = amdgpu_get_xgmi_hive(adev); 5164 if (hive) 5165 mutex_lock(&hive->hive_lock); 5166 5167 reset_context->job = job; 5168 reset_context->hive = hive; 5169 /* 5170 * Build list of devices to reset. 5171 * In case we are in XGMI hive mode, resort the device list 5172 * to put adev in the 1st position. 
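 * (The list_rotate_to_front() call below is what puts the device that
 * triggered the reset at the head of the list.)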
5173 */ 5174 INIT_LIST_HEAD(&device_list); 5175 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5176 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5177 list_add_tail(&tmp_adev->reset_list, &device_list); 5178 if (gpu_reset_for_dev_remove && adev->shutdown) 5179 tmp_adev->shutdown = true; 5180 } 5181 if (!list_is_first(&adev->reset_list, &device_list)) 5182 list_rotate_to_front(&adev->reset_list, &device_list); 5183 device_list_handle = &device_list; 5184 } else { 5185 list_add_tail(&adev->reset_list, &device_list); 5186 device_list_handle = &device_list; 5187 } 5188 5189 /* We need to lock the reset domain only once, both for XGMI and single device */ 5190 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5191 reset_list); 5192 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5193 5194 /* block all schedulers and reset given job's ring */ 5195 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5196 5197 amdgpu_device_set_mp1_state(tmp_adev); 5198 5199 /* 5200 * Try to put the audio codec into suspend state 5201 * before the gpu reset starts. 5202 * 5203 * Because the power domain of the graphics device 5204 * is shared with the AZ power domain, without this 5205 * we may change the audio hardware from behind 5206 * the audio driver's back and trigger 5207 * audio codec errors. 5208 */ 5209 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5210 audio_suspended = true; 5211 5212 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5213 5214 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5215 5216 if (!amdgpu_sriov_vf(tmp_adev)) 5217 amdgpu_amdkfd_pre_reset(tmp_adev); 5218 5219 /* 5220 * Mark the ASICs to be reset as untracked first, 5221 * and add them back after the reset completes. 5222 */ 5223 amdgpu_unregister_gpu_instance(tmp_adev); 5224 5225 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5226 5227 /* disable ras on ALL IPs */ 5228 if (!need_emergency_restart && 5229 amdgpu_device_ip_need_full_reset(tmp_adev)) 5230 amdgpu_ras_suspend(tmp_adev); 5231 5232 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5233 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5234 5235 if (!ring || !ring->sched.thread) 5236 continue; 5237 5238 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5239 5240 if (need_emergency_restart) 5241 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5242 } 5243 atomic_inc(&tmp_adev->gpu_reset_counter); 5244 } 5245 5246 if (need_emergency_restart) 5247 goto skip_sched_resume; 5248 5249 /* 5250 * Must check the guilty signal here since after this point all old 5251 * HW fences are force signaled. 5252 * 5253 * job->base holds a reference to the parent fence 5254 */ 5255 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5256 job_signaled = true; 5257 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5258 goto skip_hw_reset; 5259 } 5260 5261 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5262 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5263 if (gpu_reset_for_dev_remove) { 5264 /* Workaround for ASICs that need to disable the SMC first */ 5265 amdgpu_device_smu_fini_early(tmp_adev); 5266 } 5267 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5268 /* TODO: should we stop here? */ 5269 if (r) { 5270 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5271 r, adev_to_drm(tmp_adev)->unique); 5272 tmp_adev->asic_reset_res = r; 5273 } 5274 5275 /* 5276 * Drop all pending non-scheduler resets.
Scheduler resets 5277 * were already dropped during drm_sched_stop. 5278 */ 5279 amdgpu_device_stop_pending_resets(tmp_adev); 5280 } 5281 5282 /* Actual ASIC resets if needed. */ 5283 /* Host driver will handle XGMI hive reset for SRIOV */ 5284 if (amdgpu_sriov_vf(adev)) { 5285 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5286 if (r) 5287 adev->asic_reset_res = r; 5288 5289 /* Aldebaran supports RAS in SRIOV, so RAS needs to be resumed during reset */ 5290 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2)) 5291 amdgpu_ras_resume(adev); 5292 } else { 5293 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5294 if (r && r == -EAGAIN) 5295 goto retry; 5296 5297 if (!r && gpu_reset_for_dev_remove) 5298 goto recover_end; 5299 } 5300 5301 skip_hw_reset: 5302 5303 /* Post ASIC reset for all devs. */ 5304 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5305 5306 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5307 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5308 5309 if (!ring || !ring->sched.thread) 5310 continue; 5311 5312 drm_sched_start(&ring->sched, true); 5313 } 5314 5315 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3)) 5316 amdgpu_mes_self_test(tmp_adev); 5317 5318 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) { 5319 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5320 } 5321 5322 if (tmp_adev->asic_reset_res) 5323 r = tmp_adev->asic_reset_res; 5324 5325 tmp_adev->asic_reset_res = 0; 5326 5327 if (r) { 5328 /* bad news, how do we tell userspace about this? */ 5329 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5330 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5331 } else { 5332 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5333 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5334 DRM_WARN("smart shift update failed\n"); 5335 } 5336 } 5337 5338 skip_sched_resume: 5339 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5340 /* unlock kfd: SRIOV would do it separately */ 5341 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5342 amdgpu_amdkfd_post_reset(tmp_adev); 5343 5344 /* kfd_post_reset will do nothing if the kfd device is not initialized, 5345 * so bring up kfd here if it was not initialized before 5346 */ 5347 if (!adev->kfd.init_complete) 5348 amdgpu_amdkfd_device_init(adev); 5349 5350 if (audio_suspended) 5351 amdgpu_device_resume_display_audio(tmp_adev); 5352 5353 amdgpu_device_unset_mp1_state(tmp_adev); 5354 5355 amdgpu_ras_set_error_query_ready(tmp_adev, true); 5356 } 5357 5358 recover_end: 5359 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5360 reset_list); 5361 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5362 5363 if (hive) { 5364 mutex_unlock(&hive->hive_lock); 5365 amdgpu_put_xgmi_hive(hive); 5366 } 5367 5368 if (r) 5369 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5370 5371 atomic_set(&adev->reset_domain->reset_res, r); 5372 return r; 5373 } 5374 5375 /** 5376 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot 5377 * 5378 * @adev: amdgpu_device pointer 5379 * 5380 * Fetches and stores in the driver the PCIE capabilities (gen speed 5381 * and lanes) of the slot the device is in. Handles APUs and 5382 * virtualized environments where PCIE config space may not be available.
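 *
 * Note (added for clarity): the amdgpu_pcie_gen_cap and amdgpu_pcie_lane_cap
 * module parameter overrides, checked first below, are applied verbatim as
 * masks when set.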
5383 */ 5384 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5385 { 5386 struct pci_dev *pdev; 5387 enum pci_bus_speed speed_cap, platform_speed_cap; 5388 enum pcie_link_width platform_link_width; 5389 5390 if (amdgpu_pcie_gen_cap) 5391 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5392 5393 if (amdgpu_pcie_lane_cap) 5394 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5395 5396 /* covers APUs as well */ 5397 if (pci_is_root_bus(adev->pdev->bus)) { 5398 if (adev->pm.pcie_gen_mask == 0) 5399 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5400 if (adev->pm.pcie_mlw_mask == 0) 5401 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5402 return; 5403 } 5404 5405 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5406 return; 5407 5408 pcie_bandwidth_available(adev->pdev, NULL, 5409 &platform_speed_cap, &platform_link_width); 5410 5411 if (adev->pm.pcie_gen_mask == 0) { 5412 /* asic caps */ 5413 pdev = adev->pdev; 5414 speed_cap = pcie_get_speed_cap(pdev); 5415 if (speed_cap == PCI_SPEED_UNKNOWN) { 5416 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5417 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5418 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5419 } else { 5420 if (speed_cap == PCIE_SPEED_32_0GT) 5421 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5422 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5423 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5424 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5425 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5426 else if (speed_cap == PCIE_SPEED_16_0GT) 5427 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5428 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5429 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5430 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5431 else if (speed_cap == PCIE_SPEED_8_0GT) 5432 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5433 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5434 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5435 else if (speed_cap == PCIE_SPEED_5_0GT) 5436 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5437 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5438 else 5439 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5440 } 5441 /* platform caps */ 5442 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5443 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5444 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5445 } else { 5446 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5447 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5448 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5449 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5450 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5451 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5452 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5453 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5454 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5455 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5456 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5457 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5458 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5459 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5460 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5461 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5462 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5463 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5464 else 5465 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5466 5467 } 5468 } 5469 if (adev->pm.pcie_mlw_mask == 0) { 5470 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 5471 adev->pm.pcie_mlw_mask 
|= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5472 } else { 5473 switch (platform_link_width) { 5474 case PCIE_LNK_X32: 5475 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5476 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5477 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5478 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5479 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5480 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5481 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5482 break; 5483 case PCIE_LNK_X16: 5484 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5485 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5486 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5487 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5488 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5489 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5490 break; 5491 case PCIE_LNK_X12: 5492 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5493 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5494 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5495 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5496 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5497 break; 5498 case PCIE_LNK_X8: 5499 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5500 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5501 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5502 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5503 break; 5504 case PCIE_LNK_X4: 5505 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5506 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5507 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5508 break; 5509 case PCIE_LNK_X2: 5510 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5511 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5512 break; 5513 case PCIE_LNK_X1: 5514 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5515 break; 5516 default: 5517 break; 5518 } 5519 } 5520 } 5521 } 5522 5523 /** 5524 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 5525 * 5526 * @adev: amdgpu_device pointer 5527 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 5528 * 5529 * Return true if @peer_adev can access (DMA) @adev through the PCIe 5530 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 5531 * @peer_adev. 5532 */ 5533 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 5534 struct amdgpu_device *peer_adev) 5535 { 5536 #ifdef CONFIG_HSA_AMD_P2P 5537 uint64_t address_mask = peer_adev->dev->dma_mask ? 
5538 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 5539 resource_size_t aper_limit = 5540 adev->gmc.aper_base + adev->gmc.aper_size - 1; 5541 bool p2p_access = 5542 !adev->gmc.xgmi.connected_to_cpu && 5543 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 5544 5545 return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size && 5546 adev->gmc.real_vram_size == adev->gmc.visible_vram_size && 5547 !(adev->gmc.aper_base & address_mask || 5548 aper_limit & address_mask)); 5549 #else 5550 return false; 5551 #endif 5552 } 5553 5554 int amdgpu_device_baco_enter(struct drm_device *dev) 5555 { 5556 struct amdgpu_device *adev = drm_to_adev(dev); 5557 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5558 5559 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5560 return -ENOTSUPP; 5561 5562 if (ras && adev->ras_enabled && 5563 adev->nbio.funcs->enable_doorbell_interrupt) 5564 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5565 5566 return amdgpu_dpm_baco_enter(adev); 5567 } 5568 5569 int amdgpu_device_baco_exit(struct drm_device *dev) 5570 { 5571 struct amdgpu_device *adev = drm_to_adev(dev); 5572 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5573 int ret = 0; 5574 5575 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5576 return -ENOTSUPP; 5577 5578 ret = amdgpu_dpm_baco_exit(adev); 5579 if (ret) 5580 return ret; 5581 5582 if (ras && adev->ras_enabled && 5583 adev->nbio.funcs->enable_doorbell_interrupt) 5584 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5585 5586 if (amdgpu_passthrough(adev) && 5587 adev->nbio.funcs->clear_doorbell_interrupt) 5588 adev->nbio.funcs->clear_doorbell_interrupt(adev); 5589 5590 return 0; 5591 } 5592 5593 /** 5594 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5595 * @pdev: PCI device struct 5596 * @state: PCI channel state 5597 * 5598 * Description: Called when a PCI error is detected. 5599 * 5600 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
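 *
 * Together with amdgpu_pci_mmio_enabled(), amdgpu_pci_slot_reset() and
 * amdgpu_pci_resume() this is meant to be plugged into a struct
 * pci_error_handlers table; an illustrative sketch (the actual registration
 * lives elsewhere in the driver):
 *
 *   static const struct pci_error_handlers example_err_handler = {
 *           .error_detected = amdgpu_pci_error_detected,
 *           .mmio_enabled   = amdgpu_pci_mmio_enabled,
 *           .slot_reset     = amdgpu_pci_slot_reset,
 *           .resume         = amdgpu_pci_resume,
 *   };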
5601 */ 5602 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 5603 { 5604 struct drm_device *dev = pci_get_drvdata(pdev); 5605 struct amdgpu_device *adev = drm_to_adev(dev); 5606 int i; 5607 5608 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 5609 5610 if (adev->gmc.xgmi.num_physical_nodes > 1) { 5611 DRM_WARN("No support for XGMI hive yet..."); 5612 return PCI_ERS_RESULT_DISCONNECT; 5613 } 5614 5615 adev->pci_channel_state = state; 5616 5617 switch (state) { 5618 case pci_channel_io_normal: 5619 return PCI_ERS_RESULT_CAN_RECOVER; 5620 /* Fatal error, prepare for slot reset */ 5621 case pci_channel_io_frozen: 5622 /* 5623 * Locking adev->reset_domain->sem will prevent any external access 5624 * to GPU during PCI error recovery 5625 */ 5626 amdgpu_device_lock_reset_domain(adev->reset_domain); 5627 amdgpu_device_set_mp1_state(adev); 5628 5629 /* 5630 * Block any work scheduling as we do for regular GPU reset 5631 * for the duration of the recovery 5632 */ 5633 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5634 struct amdgpu_ring *ring = adev->rings[i]; 5635 5636 if (!ring || !ring->sched.thread) 5637 continue; 5638 5639 drm_sched_stop(&ring->sched, NULL); 5640 } 5641 atomic_inc(&adev->gpu_reset_counter); 5642 return PCI_ERS_RESULT_NEED_RESET; 5643 case pci_channel_io_perm_failure: 5644 /* Permanent error, prepare for device removal */ 5645 return PCI_ERS_RESULT_DISCONNECT; 5646 } 5647 5648 return PCI_ERS_RESULT_NEED_RESET; 5649 } 5650 5651 /** 5652 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 5653 * @pdev: pointer to PCI device 5654 */ 5655 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 5656 { 5657 5658 DRM_INFO("PCI error: mmio enabled callback!!\n"); 5659 5660 /* TODO - dump whatever for debugging purposes */ 5661 5662 /* This called only if amdgpu_pci_error_detected returns 5663 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 5664 * works, no need to reset slot. 5665 */ 5666 5667 return PCI_ERS_RESULT_RECOVERED; 5668 } 5669 5670 /** 5671 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 5672 * @pdev: PCI device struct 5673 * 5674 * Description: This routine is called by the pci error recovery 5675 * code after the PCI slot has been reset, just before we 5676 * should resume normal operations. 
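 *
 * Return: PCI_ERS_RESULT_RECOVERED if the ASIC came back and the reset
 * sequence succeeded, PCI_ERS_RESULT_DISCONNECT otherwise.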
5677 */ 5678 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 5679 { 5680 struct drm_device *dev = pci_get_drvdata(pdev); 5681 struct amdgpu_device *adev = drm_to_adev(dev); 5682 int r, i; 5683 struct amdgpu_reset_context reset_context; 5684 u32 memsize; 5685 struct list_head device_list; 5686 5687 DRM_INFO("PCI error: slot reset callback!!\n"); 5688 5689 memset(&reset_context, 0, sizeof(reset_context)); 5690 5691 INIT_LIST_HEAD(&device_list); 5692 list_add_tail(&adev->reset_list, &device_list); 5693 5694 /* wait for asic to come out of reset */ 5695 msleep(500); 5696 5697 /* Restore PCI confspace */ 5698 amdgpu_device_load_pci_state(pdev); 5699 5700 /* confirm ASIC came out of reset */ 5701 for (i = 0; i < adev->usec_timeout; i++) { 5702 memsize = amdgpu_asic_get_config_memsize(adev); 5703 5704 if (memsize != 0xffffffff) 5705 break; 5706 udelay(1); 5707 } 5708 if (memsize == 0xffffffff) { 5709 r = -ETIME; 5710 goto out; 5711 } 5712 5713 reset_context.method = AMD_RESET_METHOD_NONE; 5714 reset_context.reset_req_dev = adev; 5715 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 5716 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 5717 5718 adev->no_hw_access = true; 5719 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 5720 adev->no_hw_access = false; 5721 if (r) 5722 goto out; 5723 5724 r = amdgpu_do_asic_reset(&device_list, &reset_context); 5725 5726 out: 5727 if (!r) { 5728 if (amdgpu_device_cache_pci_state(adev->pdev)) 5729 pci_restore_state(adev->pdev); 5730 5731 DRM_INFO("PCIe error recovery succeeded\n"); 5732 } else { 5733 DRM_ERROR("PCIe error recovery failed, err:%d", r); 5734 amdgpu_device_unset_mp1_state(adev); 5735 amdgpu_device_unlock_reset_domain(adev->reset_domain); 5736 } 5737 5738 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 5739 } 5740 5741 /** 5742 * amdgpu_pci_resume() - resume normal ops after PCI reset 5743 * @pdev: pointer to PCI device 5744 * 5745 * Called when the error recovery driver tells us that its 5746 * OK to resume normal operation. 
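 *
 * Only acts when the preceding error state was pci_channel_io_frozen: it
 * restarts the schedulers stopped in amdgpu_pci_error_detected() and unlocks
 * the reset domain taken there.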
/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's
 * OK to resume normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
        struct drm_device *dev = pci_get_drvdata(pdev);
        struct amdgpu_device *adev = drm_to_adev(dev);
        int i;

        DRM_INFO("PCI error: resume callback!!\n");

        /* Only continue execution for the case of pci_channel_io_frozen */
        if (adev->pci_channel_state != pci_channel_io_frozen)
                return;

        for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
                struct amdgpu_ring *ring = adev->rings[i];

                if (!ring || !ring->sched.thread)
                        continue;

                drm_sched_start(&ring->sched, true);
        }

        amdgpu_device_unset_mp1_state(adev);
        amdgpu_device_unlock_reset_domain(adev->reset_domain);
}

/**
 * amdgpu_device_cache_pci_state - cache the PCI configuration space state
 * @pdev: pointer to PCI device
 *
 * Saves the current PCI configuration space and keeps a copy in
 * adev->pci_state so that it can be reloaded after a reset.
 *
 * Returns true on success, false otherwise.
 */
bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
        struct drm_device *dev = pci_get_drvdata(pdev);
        struct amdgpu_device *adev = drm_to_adev(dev);
        int r;

        r = pci_save_state(pdev);
        if (!r) {
                kfree(adev->pci_state);

                adev->pci_state = pci_store_saved_state(pdev);

                if (!adev->pci_state) {
                        DRM_ERROR("Failed to store PCI saved state");
                        return false;
                }
        } else {
                DRM_WARN("Failed to save PCI state, err:%d\n", r);
                return false;
        }

        return true;
}

/**
 * amdgpu_device_load_pci_state - reload the cached PCI configuration space state
 * @pdev: pointer to PCI device
 *
 * Loads the PCI configuration space previously cached with
 * amdgpu_device_cache_pci_state() and restores it to the device.
 *
 * Returns true on success, false otherwise.
 */
bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
        struct drm_device *dev = pci_get_drvdata(pdev);
        struct amdgpu_device *adev = drm_to_adev(dev);
        int r;

        if (!adev->pci_state)
                return false;

        r = pci_load_saved_state(pdev, adev->pci_state);

        if (!r) {
                pci_restore_state(pdev);
        } else {
                DRM_WARN("Failed to load PCI state, err:%d\n", r);
                return false;
        }

        return true;
}
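/*
 * amdgpu_device_cache_pci_state() and amdgpu_device_load_pci_state() are
 * used as a pair: the configuration space is cached once the device has
 * been programmed, and the cached copy is reloaded when the device comes
 * back from a reset, as amdgpu_pci_slot_reset() above does. A minimal
 * usage sketch (illustrative only, error handling elided):
 *
 *	// remember the fully programmed config space
 *	if (amdgpu_device_cache_pci_state(adev->pdev))
 *		pci_restore_state(adev->pdev);
 *
 *	// later, after an ASIC or slot reset, put it back
 *	amdgpu_device_load_pci_state(adev->pdev);
 */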
void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
                struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
        if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
                return;
#endif
        if (adev->gmc.xgmi.connected_to_cpu)
                return;

        if (ring && ring->funcs->emit_hdp_flush)
                amdgpu_ring_emit_hdp_flush(ring);
        else
                amdgpu_asic_flush_hdp(adev, ring);
}

/**
 * amdgpu_device_invalidate_hdp - invalidate the HDP (Host Data Path) read cache
 * @adev: amdgpu_device pointer
 * @ring: ring the invalidation is requested for; may be NULL
 *
 * Skipped under the same conditions as amdgpu_device_flush_hdp().
 */
void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
                struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
        if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
                return;
#endif
        if (adev->gmc.xgmi.connected_to_cpu)
                return;

        amdgpu_asic_invalidate_hdp(adev, ring);
}

/**
 * amdgpu_in_reset - check whether the GPU is currently in reset
 * @adev: amdgpu_device pointer
 *
 * Returns non-zero while a GPU reset is in progress in the reset domain
 * that @adev belongs to.
 */
int amdgpu_in_reset(struct amdgpu_device *adev)
{
        return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It helps to preserve the error context when an error occurs.
 * Compared to a simple hang, the system will stay stable, at least for SSH
 * access, so it should be trivial to inspect the hardware state and see
 * what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc),
 *    clears all CPU mappings to the device and disallows remappings through
 *    page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
        struct pci_dev *pdev = adev->pdev;
        struct drm_device *ddev = adev_to_drm(adev);

        drm_dev_unplug(ddev);

        amdgpu_irq_disable_all(adev);

        amdgpu_fence_driver_hw_fini(adev);

        adev->no_hw_access = true;

        amdgpu_device_unmap_mmio(adev);

        pci_disable_device(pdev);
        pci_wait_for_pending_transaction(pdev);
}

/**
 * amdgpu_device_pcie_port_rreg - read a PCIe port register
 * @adev: amdgpu_device pointer
 * @reg: register offset (in dwords)
 *
 * Reads a PCIe port register through the NBIO index/data register pair.
 *
 * Returns the register value.
 */
u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
                u32 reg)
{
        unsigned long flags, address, data;
        u32 r;

        address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
        data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

        spin_lock_irqsave(&adev->pcie_idx_lock, flags);
        WREG32(address, reg * 4);
        (void)RREG32(address);
        r = RREG32(data);
        spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
        return r;
}

/**
 * amdgpu_device_pcie_port_wreg - write a PCIe port register
 * @adev: amdgpu_device pointer
 * @reg: register offset (in dwords)
 * @v: value to write
 *
 * Writes a PCIe port register through the NBIO index/data register pair.
 */
void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
                u32 reg, u32 v)
{
        unsigned long flags, address, data;

        address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
        data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

        spin_lock_irqsave(&adev->pcie_idx_lock, flags);
        WREG32(address, reg * 4);
        (void)RREG32(address);
        WREG32(data, v);
        (void)RREG32(data);
        spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
                struct dma_fence *gang)
{
        struct dma_fence *old = NULL;

        do {
                dma_fence_put(old);
                rcu_read_lock();
                old = dma_fence_get_rcu_safe(&adev->gang_submit);
                rcu_read_unlock();

                if (old == gang)
                        break;

                if (!dma_fence_is_signaled(old))
                        return old;

        } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
                         old, gang) != old);

        dma_fence_put(old);
        return NULL;
}

/**
 * amdgpu_device_has_display_hardware - check whether the ASIC has display hardware
 * @adev: amdgpu_device pointer
 *
 * Returns true if the ASIC has usable (non-harvested) display hardware,
 * false otherwise.
 */
bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
        switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
        case CHIP_HAINAN:
#endif
        case CHIP_TOPAZ:
                /* chips with no display hardware */
                return false;
#ifdef CONFIG_DRM_AMDGPU_SI
        case CHIP_TAHITI:
        case CHIP_PITCAIRN:
        case CHIP_VERDE:
        case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
        case CHIP_BONAIRE:
        case CHIP_HAWAII:
        case CHIP_KAVERI:
        case CHIP_KABINI:
        case CHIP_MULLINS:
#endif
        case CHIP_TONGA:
        case CHIP_FIJI:
        case CHIP_POLARIS10:
        case CHIP_POLARIS11:
        case CHIP_POLARIS12:
        case CHIP_VEGAM:
        case CHIP_CARRIZO:
        case CHIP_STONEY:
                /* chips with display hardware */
                return true;
        default:
                /* IP discovery */
                if (!adev->ip_versions[DCE_HWIP][0] ||
                    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
                        return false;
                return true;
        }
}
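/*
 * Illustrative usage note for amdgpu_device_switch_gang() above (a sketch
 * only, not the driver's actual submission path): a caller that needs its
 * own gang to become active can treat the returned fence as a dependency,
 * waiting for the previous gang leader to signal before trying again:
 *
 *	struct dma_fence *old;
 *
 *	while ((old = amdgpu_device_switch_gang(adev, gang))) {
 *		dma_fence_wait(old, false);
 *		dma_fence_put(old);
 *	}
 */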