1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 #include <linux/iommu.h> 34 #include <linux/pci.h> 35 36 #include <drm/drm_atomic_helper.h> 37 #include <drm/drm_probe_helper.h> 38 #include <drm/amdgpu_drm.h> 39 #include <linux/vgaarb.h> 40 #include <linux/vga_switcheroo.h> 41 #include <linux/efi.h> 42 #include "amdgpu.h" 43 #include "amdgpu_trace.h" 44 #include "amdgpu_i2c.h" 45 #include "atom.h" 46 #include "amdgpu_atombios.h" 47 #include "amdgpu_atomfirmware.h" 48 #include "amd_pcie.h" 49 #ifdef CONFIG_DRM_AMDGPU_SI 50 #include "si.h" 51 #endif 52 #ifdef CONFIG_DRM_AMDGPU_CIK 53 #include "cik.h" 54 #endif 55 #include "vi.h" 56 #include "soc15.h" 57 #include "nv.h" 58 #include "bif/bif_4_1_d.h" 59 #include <linux/firmware.h> 60 #include "amdgpu_vf_error.h" 61 62 #include "amdgpu_amdkfd.h" 63 #include "amdgpu_pm.h" 64 65 #include "amdgpu_xgmi.h" 66 #include "amdgpu_ras.h" 67 #include "amdgpu_pmu.h" 68 #include "amdgpu_fru_eeprom.h" 69 #include "amdgpu_reset.h" 70 71 #include <linux/suspend.h> 72 #include <drm/task_barrier.h> 73 #include <linux/pm_runtime.h> 74 75 #include <drm/drm_drv.h> 76 77 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 78 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 79 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 80 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 81 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 82 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 83 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 84 85 #define AMDGPU_RESUME_MS 2000 86 #define AMDGPU_MAX_RETRY_LIMIT 2 87 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 88 89 const char *amdgpu_asic_name[] = { 90 "TAHITI", 91 "PITCAIRN", 92 "VERDE", 93 "OLAND", 94 "HAINAN", 95 "BONAIRE", 96 "KAVERI", 97 "KABINI", 98 "HAWAII", 99 "MULLINS", 100 "TOPAZ", 101 "TONGA", 102 "FIJI", 103 "CARRIZO", 104 "STONEY", 105 "POLARIS10", 106 "POLARIS11", 107 "POLARIS12", 108 "VEGAM", 109 "VEGA10", 110 "VEGA12", 111 "VEGA20", 112 "RAVEN", 113 "ARCTURUS", 114 "RENOIR", 115 "ALDEBARAN", 116 "NAVI10", 117 "CYAN_SKILLFISH", 118 "NAVI14", 119 "NAVI12", 120 "SIENNA_CICHLID", 121 "NAVY_FLOUNDER", 122 
"VANGOGH", 123 "DIMGREY_CAVEFISH", 124 "BEIGE_GOBY", 125 "YELLOW_CARP", 126 "IP DISCOVERY", 127 "LAST", 128 }; 129 130 /** 131 * DOC: pcie_replay_count 132 * 133 * The amdgpu driver provides a sysfs API for reporting the total number 134 * of PCIe replays (NAKs) 135 * The file pcie_replay_count is used for this and returns the total 136 * number of replays as a sum of the NAKs generated and NAKs received 137 */ 138 139 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 140 struct device_attribute *attr, char *buf) 141 { 142 struct drm_device *ddev = dev_get_drvdata(dev); 143 struct amdgpu_device *adev = drm_to_adev(ddev); 144 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 145 146 return sysfs_emit(buf, "%llu\n", cnt); 147 } 148 149 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 150 amdgpu_device_get_pcie_replay_count, NULL); 151 152 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 153 154 /** 155 * DOC: product_name 156 * 157 * The amdgpu driver provides a sysfs API for reporting the product name 158 * for the device 159 * The file serial_number is used for this and returns the product name 160 * as returned from the FRU. 161 * NOTE: This is only available for certain server cards 162 */ 163 164 static ssize_t amdgpu_device_get_product_name(struct device *dev, 165 struct device_attribute *attr, char *buf) 166 { 167 struct drm_device *ddev = dev_get_drvdata(dev); 168 struct amdgpu_device *adev = drm_to_adev(ddev); 169 170 return sysfs_emit(buf, "%s\n", adev->product_name); 171 } 172 173 static DEVICE_ATTR(product_name, S_IRUGO, 174 amdgpu_device_get_product_name, NULL); 175 176 /** 177 * DOC: product_number 178 * 179 * The amdgpu driver provides a sysfs API for reporting the part number 180 * for the device 181 * The file serial_number is used for this and returns the part number 182 * as returned from the FRU. 183 * NOTE: This is only available for certain server cards 184 */ 185 186 static ssize_t amdgpu_device_get_product_number(struct device *dev, 187 struct device_attribute *attr, char *buf) 188 { 189 struct drm_device *ddev = dev_get_drvdata(dev); 190 struct amdgpu_device *adev = drm_to_adev(ddev); 191 192 return sysfs_emit(buf, "%s\n", adev->product_number); 193 } 194 195 static DEVICE_ATTR(product_number, S_IRUGO, 196 amdgpu_device_get_product_number, NULL); 197 198 /** 199 * DOC: serial_number 200 * 201 * The amdgpu driver provides a sysfs API for reporting the serial number 202 * for the device 203 * The file serial_number is used for this and returns the serial number 204 * as returned from the FRU. 205 * NOTE: This is only available for certain server cards 206 */ 207 208 static ssize_t amdgpu_device_get_serial_number(struct device *dev, 209 struct device_attribute *attr, char *buf) 210 { 211 struct drm_device *ddev = dev_get_drvdata(dev); 212 struct amdgpu_device *adev = drm_to_adev(ddev); 213 214 return sysfs_emit(buf, "%s\n", adev->serial); 215 } 216 217 static DEVICE_ATTR(serial_number, S_IRUGO, 218 amdgpu_device_get_serial_number, NULL); 219 220 /** 221 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 222 * 223 * @dev: drm_device pointer 224 * 225 * Returns true if the device is a dGPU with ATPX power control, 226 * otherwise return false. 
/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise returns false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}
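/*
 * Illustrative sketch (not part of the driver): reading a single dword out of
 * VRAM through the MM_INDEX/MM_DATA window above. The helper name and offset
 * are hypothetical; real callers normally go through
 * amdgpu_device_vram_access(), which prefers the CPU-visible aperture and only
 * falls back to this register window for the non-visible part.
 */
static inline uint32_t amdgpu_device_example_peek_vram(struct amdgpu_device *adev,
						       loff_t offset)
{
	uint32_t value = 0;

	/* @offset and sizeof(value) must both be 4-byte aligned */
	amdgpu_device_mm_access(adev, offset, &value, sizeof(value), false);

	return value;
}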
/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}
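/*
 * Illustrative sketch (not part of the driver): a read-modify-write of a dword
 * register through amdgpu_device_rreg()/amdgpu_device_wreg() below (declared
 * in amdgpu.h). The helper name, mask and register offset are hypothetical;
 * real driver code usually goes through the RREG32()/WREG32() wrappers and the
 * per-IP register offset macros instead.
 */
static inline void amdgpu_device_example_rmw(struct amdgpu_device *adev,
					     uint32_t reg, uint32_t mask,
					     uint32_t bits)
{
	uint32_t tmp;

	/* @reg is a dword offset; 0 means no special access flags */
	tmp = amdgpu_device_rreg(adev, reg, 0);
	tmp = (tmp & ~mask) | (bits & mask);
	amdgpu_device_wreg(adev, reg, tmp, 0);
}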
/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
527 */ 528 void amdgpu_device_wreg(struct amdgpu_device *adev, 529 uint32_t reg, uint32_t v, 530 uint32_t acc_flags) 531 { 532 if (amdgpu_device_skip_hw_access(adev)) 533 return; 534 535 if ((reg * 4) < adev->rmmio_size) { 536 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 537 amdgpu_sriov_runtime(adev) && 538 down_read_trylock(&adev->reset_domain->sem)) { 539 amdgpu_kiq_wreg(adev, reg, v); 540 up_read(&adev->reset_domain->sem); 541 } else { 542 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 543 } 544 } else { 545 adev->pcie_wreg(adev, reg * 4, v); 546 } 547 548 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 549 } 550 551 /** 552 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 553 * 554 * @adev: amdgpu_device pointer 555 * @reg: mmio/rlc register 556 * @v: value to write 557 * 558 * this function is invoked only for the debugfs register access 559 */ 560 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 561 uint32_t reg, uint32_t v) 562 { 563 if (amdgpu_device_skip_hw_access(adev)) 564 return; 565 566 if (amdgpu_sriov_fullaccess(adev) && 567 adev->gfx.rlc.funcs && 568 adev->gfx.rlc.funcs->is_rlcg_access_range) { 569 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 570 return amdgpu_sriov_wreg(adev, reg, v, 0, 0); 571 } else if ((reg * 4) >= adev->rmmio_size) { 572 adev->pcie_wreg(adev, reg * 4, v); 573 } else { 574 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 575 } 576 } 577 578 /** 579 * amdgpu_mm_rdoorbell - read a doorbell dword 580 * 581 * @adev: amdgpu_device pointer 582 * @index: doorbell index 583 * 584 * Returns the value in the doorbell aperture at the 585 * requested doorbell index (CIK). 586 */ 587 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) 588 { 589 if (amdgpu_device_skip_hw_access(adev)) 590 return 0; 591 592 if (index < adev->doorbell.num_doorbells) { 593 return readl(adev->doorbell.ptr + index); 594 } else { 595 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 596 return 0; 597 } 598 } 599 600 /** 601 * amdgpu_mm_wdoorbell - write a doorbell dword 602 * 603 * @adev: amdgpu_device pointer 604 * @index: doorbell index 605 * @v: value to write 606 * 607 * Writes @v to the doorbell aperture at the 608 * requested doorbell index (CIK). 609 */ 610 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) 611 { 612 if (amdgpu_device_skip_hw_access(adev)) 613 return; 614 615 if (index < adev->doorbell.num_doorbells) { 616 writel(v, adev->doorbell.ptr + index); 617 } else { 618 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 619 } 620 } 621 622 /** 623 * amdgpu_mm_rdoorbell64 - read a doorbell Qword 624 * 625 * @adev: amdgpu_device pointer 626 * @index: doorbell index 627 * 628 * Returns the value in the doorbell aperture at the 629 * requested doorbell index (VEGA10+). 630 */ 631 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) 632 { 633 if (amdgpu_device_skip_hw_access(adev)) 634 return 0; 635 636 if (index < adev->doorbell.num_doorbells) { 637 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); 638 } else { 639 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 640 return 0; 641 } 642 } 643 644 /** 645 * amdgpu_mm_wdoorbell64 - write a doorbell Qword 646 * 647 * @adev: amdgpu_device pointer 648 * @index: doorbell index 649 * @v: value to write 650 * 651 * Writes @v to the doorbell aperture at the 652 * requested doorbell index (VEGA10+). 
653 */ 654 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) 655 { 656 if (amdgpu_device_skip_hw_access(adev)) 657 return; 658 659 if (index < adev->doorbell.num_doorbells) { 660 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); 661 } else { 662 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 663 } 664 } 665 666 /** 667 * amdgpu_device_indirect_rreg - read an indirect register 668 * 669 * @adev: amdgpu_device pointer 670 * @pcie_index: mmio register offset 671 * @pcie_data: mmio register offset 672 * @reg_addr: indirect register address to read from 673 * 674 * Returns the value of indirect register @reg_addr 675 */ 676 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 677 u32 pcie_index, u32 pcie_data, 678 u32 reg_addr) 679 { 680 unsigned long flags; 681 u32 r; 682 void __iomem *pcie_index_offset; 683 void __iomem *pcie_data_offset; 684 685 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 686 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 687 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 688 689 writel(reg_addr, pcie_index_offset); 690 readl(pcie_index_offset); 691 r = readl(pcie_data_offset); 692 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 693 694 return r; 695 } 696 697 /** 698 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 699 * 700 * @adev: amdgpu_device pointer 701 * @pcie_index: mmio register offset 702 * @pcie_data: mmio register offset 703 * @reg_addr: indirect register address to read from 704 * 705 * Returns the value of indirect register @reg_addr 706 */ 707 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 708 u32 pcie_index, u32 pcie_data, 709 u32 reg_addr) 710 { 711 unsigned long flags; 712 u64 r; 713 void __iomem *pcie_index_offset; 714 void __iomem *pcie_data_offset; 715 716 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 717 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 718 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 719 720 /* read low 32 bits */ 721 writel(reg_addr, pcie_index_offset); 722 readl(pcie_index_offset); 723 r = readl(pcie_data_offset); 724 /* read high 32 bits */ 725 writel(reg_addr + 4, pcie_index_offset); 726 readl(pcie_index_offset); 727 r |= ((u64)readl(pcie_data_offset) << 32); 728 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 729 730 return r; 731 } 732 733 /** 734 * amdgpu_device_indirect_wreg - write an indirect register address 735 * 736 * @adev: amdgpu_device pointer 737 * @pcie_index: mmio register offset 738 * @pcie_data: mmio register offset 739 * @reg_addr: indirect register offset 740 * @reg_data: indirect register data 741 * 742 */ 743 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 744 u32 pcie_index, u32 pcie_data, 745 u32 reg_addr, u32 reg_data) 746 { 747 unsigned long flags; 748 void __iomem *pcie_index_offset; 749 void __iomem *pcie_data_offset; 750 751 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 752 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 753 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 754 755 writel(reg_addr, pcie_index_offset); 756 readl(pcie_index_offset); 757 writel(reg_data, pcie_data_offset); 758 readl(pcie_data_offset); 759 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 760 } 761 762 /** 763 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 764 * 765 * @adev: amdgpu_device pointer 766 * @pcie_index: mmio register offset 767 * @pcie_data: mmio register 
offset 768 * @reg_addr: indirect register offset 769 * @reg_data: indirect register data 770 * 771 */ 772 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 773 u32 pcie_index, u32 pcie_data, 774 u32 reg_addr, u64 reg_data) 775 { 776 unsigned long flags; 777 void __iomem *pcie_index_offset; 778 void __iomem *pcie_data_offset; 779 780 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 781 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 782 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 783 784 /* write low 32 bits */ 785 writel(reg_addr, pcie_index_offset); 786 readl(pcie_index_offset); 787 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 788 readl(pcie_data_offset); 789 /* write high 32 bits */ 790 writel(reg_addr + 4, pcie_index_offset); 791 readl(pcie_index_offset); 792 writel((u32)(reg_data >> 32), pcie_data_offset); 793 readl(pcie_data_offset); 794 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 795 } 796 797 /** 798 * amdgpu_invalid_rreg - dummy reg read function 799 * 800 * @adev: amdgpu_device pointer 801 * @reg: offset of register 802 * 803 * Dummy register read function. Used for register blocks 804 * that certain asics don't have (all asics). 805 * Returns the value in the register. 806 */ 807 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 808 { 809 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 810 BUG(); 811 return 0; 812 } 813 814 /** 815 * amdgpu_invalid_wreg - dummy reg write function 816 * 817 * @adev: amdgpu_device pointer 818 * @reg: offset of register 819 * @v: value to write to the register 820 * 821 * Dummy register read function. Used for register blocks 822 * that certain asics don't have (all asics). 823 */ 824 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 825 { 826 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 827 reg, v); 828 BUG(); 829 } 830 831 /** 832 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 833 * 834 * @adev: amdgpu_device pointer 835 * @reg: offset of register 836 * 837 * Dummy register read function. Used for register blocks 838 * that certain asics don't have (all asics). 839 * Returns the value in the register. 840 */ 841 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 842 { 843 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 844 BUG(); 845 return 0; 846 } 847 848 /** 849 * amdgpu_invalid_wreg64 - dummy reg write function 850 * 851 * @adev: amdgpu_device pointer 852 * @reg: offset of register 853 * @v: value to write to the register 854 * 855 * Dummy register read function. Used for register blocks 856 * that certain asics don't have (all asics). 857 */ 858 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 859 { 860 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 861 reg, v); 862 BUG(); 863 } 864 865 /** 866 * amdgpu_block_invalid_rreg - dummy reg read function 867 * 868 * @adev: amdgpu_device pointer 869 * @block: offset of instance 870 * @reg: offset of register 871 * 872 * Dummy register read function. Used for register blocks 873 * that certain asics don't have (all asics). 874 * Returns the value in the register. 
875 */ 876 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 877 uint32_t block, uint32_t reg) 878 { 879 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 880 reg, block); 881 BUG(); 882 return 0; 883 } 884 885 /** 886 * amdgpu_block_invalid_wreg - dummy reg write function 887 * 888 * @adev: amdgpu_device pointer 889 * @block: offset of instance 890 * @reg: offset of register 891 * @v: value to write to the register 892 * 893 * Dummy register read function. Used for register blocks 894 * that certain asics don't have (all asics). 895 */ 896 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 897 uint32_t block, 898 uint32_t reg, uint32_t v) 899 { 900 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 901 reg, block, v); 902 BUG(); 903 } 904 905 /** 906 * amdgpu_device_asic_init - Wrapper for atom asic_init 907 * 908 * @adev: amdgpu_device pointer 909 * 910 * Does any asic specific work and then calls atom asic init. 911 */ 912 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 913 { 914 amdgpu_asic_pre_asic_init(adev); 915 916 if (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) 917 return amdgpu_atomfirmware_asic_init(adev, true); 918 else 919 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 920 } 921 922 /** 923 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page 924 * 925 * @adev: amdgpu_device pointer 926 * 927 * Allocates a scratch page of VRAM for use by various things in the 928 * driver. 929 */ 930 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev) 931 { 932 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, 933 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM, 934 &adev->vram_scratch.robj, 935 &adev->vram_scratch.gpu_addr, 936 (void **)&adev->vram_scratch.ptr); 937 } 938 939 /** 940 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page 941 * 942 * @adev: amdgpu_device pointer 943 * 944 * Frees the VRAM scratch page. 945 */ 946 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev) 947 { 948 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL); 949 } 950 951 /** 952 * amdgpu_device_program_register_sequence - program an array of registers. 953 * 954 * @adev: amdgpu_device pointer 955 * @registers: pointer to the register array 956 * @array_size: size of the register array 957 * 958 * Programs an array or registers with and and or masks. 959 * This is a helper for setting golden registers. 960 */ 961 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 962 const u32 *registers, 963 const u32 array_size) 964 { 965 u32 tmp, reg, and_mask, or_mask; 966 int i; 967 968 if (array_size % 3) 969 return; 970 971 for (i = 0; i < array_size; i +=3) { 972 reg = registers[i + 0]; 973 and_mask = registers[i + 1]; 974 or_mask = registers[i + 2]; 975 976 if (and_mask == 0xffffffff) { 977 tmp = or_mask; 978 } else { 979 tmp = RREG32(reg); 980 tmp &= ~and_mask; 981 if (adev->family >= AMDGPU_FAMILY_AI) 982 tmp |= (or_mask & and_mask); 983 else 984 tmp |= or_mask; 985 } 986 WREG32(reg, tmp); 987 } 988 } 989 990 /** 991 * amdgpu_device_pci_config_reset - reset the GPU 992 * 993 * @adev: amdgpu_device pointer 994 * 995 * Resets the GPU using the pci config reset sequence. 996 * Only applicable to asics prior to vega10. 
997 */ 998 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 999 { 1000 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 1001 } 1002 1003 /** 1004 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 1005 * 1006 * @adev: amdgpu_device pointer 1007 * 1008 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 1009 */ 1010 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1011 { 1012 return pci_reset_function(adev->pdev); 1013 } 1014 1015 /* 1016 * GPU doorbell aperture helpers function. 1017 */ 1018 /** 1019 * amdgpu_device_doorbell_init - Init doorbell driver information. 1020 * 1021 * @adev: amdgpu_device pointer 1022 * 1023 * Init doorbell driver information (CIK) 1024 * Returns 0 on success, error on failure. 1025 */ 1026 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) 1027 { 1028 1029 /* No doorbell on SI hardware generation */ 1030 if (adev->asic_type < CHIP_BONAIRE) { 1031 adev->doorbell.base = 0; 1032 adev->doorbell.size = 0; 1033 adev->doorbell.num_doorbells = 0; 1034 adev->doorbell.ptr = NULL; 1035 return 0; 1036 } 1037 1038 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) 1039 return -EINVAL; 1040 1041 amdgpu_asic_init_doorbell_index(adev); 1042 1043 /* doorbell bar mapping */ 1044 adev->doorbell.base = pci_resource_start(adev->pdev, 2); 1045 adev->doorbell.size = pci_resource_len(adev->pdev, 2); 1046 1047 if (adev->enable_mes) { 1048 adev->doorbell.num_doorbells = 1049 adev->doorbell.size / sizeof(u32); 1050 } else { 1051 adev->doorbell.num_doorbells = 1052 min_t(u32, adev->doorbell.size / sizeof(u32), 1053 adev->doorbell_index.max_assignment+1); 1054 if (adev->doorbell.num_doorbells == 0) 1055 return -EINVAL; 1056 1057 /* For Vega, reserve and map two pages on doorbell BAR since SDMA 1058 * paging queue doorbell use the second page. The 1059 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the 1060 * doorbells are in the first page. So with paging queue enabled, 1061 * the max num_doorbells should + 1 page (0x400 in dword) 1062 */ 1063 if (adev->asic_type >= CHIP_VEGA10) 1064 adev->doorbell.num_doorbells += 0x400; 1065 } 1066 1067 adev->doorbell.ptr = ioremap(adev->doorbell.base, 1068 adev->doorbell.num_doorbells * 1069 sizeof(u32)); 1070 if (adev->doorbell.ptr == NULL) 1071 return -ENOMEM; 1072 1073 return 0; 1074 } 1075 1076 /** 1077 * amdgpu_device_doorbell_fini - Tear down doorbell driver information. 1078 * 1079 * @adev: amdgpu_device pointer 1080 * 1081 * Tear down doorbell driver information (CIK) 1082 */ 1083 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) 1084 { 1085 iounmap(adev->doorbell.ptr); 1086 adev->doorbell.ptr = NULL; 1087 } 1088 1089 1090 1091 /* 1092 * amdgpu_device_wb_*() 1093 * Writeback is the method by which the GPU updates special pages in memory 1094 * with the status of certain GPU events (fences, ring pointers,etc.). 1095 */ 1096 1097 /** 1098 * amdgpu_device_wb_fini - Disable Writeback and free memory 1099 * 1100 * @adev: amdgpu_device pointer 1101 * 1102 * Disables Writeback and frees the Writeback memory (all asics). 1103 * Used at driver shutdown. 
1104 */ 1105 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1106 { 1107 if (adev->wb.wb_obj) { 1108 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1109 &adev->wb.gpu_addr, 1110 (void **)&adev->wb.wb); 1111 adev->wb.wb_obj = NULL; 1112 } 1113 } 1114 1115 /** 1116 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1117 * 1118 * @adev: amdgpu_device pointer 1119 * 1120 * Initializes writeback and allocates writeback memory (all asics). 1121 * Used at driver startup. 1122 * Returns 0 on success or an -error on failure. 1123 */ 1124 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1125 { 1126 int r; 1127 1128 if (adev->wb.wb_obj == NULL) { 1129 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1130 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1131 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1132 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1133 (void **)&adev->wb.wb); 1134 if (r) { 1135 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1136 return r; 1137 } 1138 1139 adev->wb.num_wb = AMDGPU_MAX_WB; 1140 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1141 1142 /* clear wb memory */ 1143 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1144 } 1145 1146 return 0; 1147 } 1148 1149 /** 1150 * amdgpu_device_wb_get - Allocate a wb entry 1151 * 1152 * @adev: amdgpu_device pointer 1153 * @wb: wb index 1154 * 1155 * Allocate a wb slot for use by the driver (all asics). 1156 * Returns 0 on success or -EINVAL on failure. 1157 */ 1158 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1159 { 1160 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1161 1162 if (offset < adev->wb.num_wb) { 1163 __set_bit(offset, adev->wb.used); 1164 *wb = offset << 3; /* convert to dw offset */ 1165 return 0; 1166 } else { 1167 return -EINVAL; 1168 } 1169 } 1170 1171 /** 1172 * amdgpu_device_wb_free - Free a wb entry 1173 * 1174 * @adev: amdgpu_device pointer 1175 * @wb: wb index 1176 * 1177 * Free a wb slot allocated for use by the driver (all asics) 1178 */ 1179 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1180 { 1181 wb >>= 3; 1182 if (wb < adev->wb.num_wb) 1183 __clear_bit(wb, adev->wb.used); 1184 } 1185 1186 /** 1187 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1188 * 1189 * @adev: amdgpu_device pointer 1190 * 1191 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1192 * to fail, but if any of the BARs is not accessible after the size we abort 1193 * driver loading by returning -ENODEV. 
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned i;
	u16 cmd;
	int r;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_device_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_device_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw needs to be posted or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if the asic needs to be posted, false if not.
1275 */ 1276 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1277 { 1278 uint32_t reg; 1279 1280 if (amdgpu_sriov_vf(adev)) 1281 return false; 1282 1283 if (amdgpu_passthrough(adev)) { 1284 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1285 * some old smc fw still need driver do vPost otherwise gpu hang, while 1286 * those smc fw version above 22.15 doesn't have this flaw, so we force 1287 * vpost executed for smc version below 22.15 1288 */ 1289 if (adev->asic_type == CHIP_FIJI) { 1290 int err; 1291 uint32_t fw_ver; 1292 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1293 /* force vPost if error occured */ 1294 if (err) 1295 return true; 1296 1297 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1298 if (fw_ver < 0x00160e00) 1299 return true; 1300 } 1301 } 1302 1303 /* Don't post if we need to reset whole hive on init */ 1304 if (adev->gmc.xgmi.pending_reset) 1305 return false; 1306 1307 if (adev->has_hw_reset) { 1308 adev->has_hw_reset = false; 1309 return true; 1310 } 1311 1312 /* bios scratch used on CIK+ */ 1313 if (adev->asic_type >= CHIP_BONAIRE) 1314 return amdgpu_atombios_scratch_need_asic_init(adev); 1315 1316 /* check MEM_SIZE for older asics */ 1317 reg = amdgpu_asic_get_config_memsize(adev); 1318 1319 if ((reg != 0) && (reg != 0xffffffff)) 1320 return false; 1321 1322 return true; 1323 } 1324 1325 /** 1326 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1327 * 1328 * @adev: amdgpu_device pointer 1329 * 1330 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1331 * be set for this device. 1332 * 1333 * Returns true if it should be used or false if not. 1334 */ 1335 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1336 { 1337 switch (amdgpu_aspm) { 1338 case -1: 1339 break; 1340 case 0: 1341 return false; 1342 case 1: 1343 return true; 1344 default: 1345 return false; 1346 } 1347 return pcie_aspm_enabled(adev->pdev); 1348 } 1349 1350 /* if we get transitioned to only one device, take VGA back */ 1351 /** 1352 * amdgpu_device_vga_set_decode - enable/disable vga decode 1353 * 1354 * @pdev: PCI device pointer 1355 * @state: enable/disable vga decode 1356 * 1357 * Enable/disable vga decode (all asics). 1358 * Returns VGA resource flags. 1359 */ 1360 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1361 bool state) 1362 { 1363 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1364 amdgpu_asic_set_vga_state(adev, state); 1365 if (state) 1366 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1367 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1368 else 1369 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1370 } 1371 1372 /** 1373 * amdgpu_device_check_block_size - validate the vm block size 1374 * 1375 * @adev: amdgpu_device pointer 1376 * 1377 * Validates the vm block size specified via module parameter. 1378 * The vm block size defines number of bits in page table versus page directory, 1379 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1380 * page table and the remaining bits are in the page directory. 
1381 */ 1382 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1383 { 1384 /* defines number of bits in page table versus page directory, 1385 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1386 * page table and the remaining bits are in the page directory */ 1387 if (amdgpu_vm_block_size == -1) 1388 return; 1389 1390 if (amdgpu_vm_block_size < 9) { 1391 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1392 amdgpu_vm_block_size); 1393 amdgpu_vm_block_size = -1; 1394 } 1395 } 1396 1397 /** 1398 * amdgpu_device_check_vm_size - validate the vm size 1399 * 1400 * @adev: amdgpu_device pointer 1401 * 1402 * Validates the vm size in GB specified via module parameter. 1403 * The VM size is the size of the GPU virtual memory space in GB. 1404 */ 1405 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1406 { 1407 /* no need to check the default value */ 1408 if (amdgpu_vm_size == -1) 1409 return; 1410 1411 if (amdgpu_vm_size < 1) { 1412 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1413 amdgpu_vm_size); 1414 amdgpu_vm_size = -1; 1415 } 1416 } 1417 1418 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1419 { 1420 struct sysinfo si; 1421 bool is_os_64 = (sizeof(void *) == 8); 1422 uint64_t total_memory; 1423 uint64_t dram_size_seven_GB = 0x1B8000000; 1424 uint64_t dram_size_three_GB = 0xB8000000; 1425 1426 if (amdgpu_smu_memory_pool_size == 0) 1427 return; 1428 1429 if (!is_os_64) { 1430 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1431 goto def_value; 1432 } 1433 si_meminfo(&si); 1434 total_memory = (uint64_t)si.totalram * si.mem_unit; 1435 1436 if ((amdgpu_smu_memory_pool_size == 1) || 1437 (amdgpu_smu_memory_pool_size == 2)) { 1438 if (total_memory < dram_size_three_GB) 1439 goto def_value1; 1440 } else if ((amdgpu_smu_memory_pool_size == 4) || 1441 (amdgpu_smu_memory_pool_size == 8)) { 1442 if (total_memory < dram_size_seven_GB) 1443 goto def_value1; 1444 } else { 1445 DRM_WARN("Smu memory pool size not supported\n"); 1446 goto def_value; 1447 } 1448 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1449 1450 return; 1451 1452 def_value1: 1453 DRM_WARN("No enough system memory\n"); 1454 def_value: 1455 adev->pm.smu_prv_buffer_size = 0; 1456 } 1457 1458 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1459 { 1460 if (!(adev->flags & AMD_IS_APU) || 1461 adev->asic_type < CHIP_RAVEN) 1462 return 0; 1463 1464 switch (adev->asic_type) { 1465 case CHIP_RAVEN: 1466 if (adev->pdev->device == 0x15dd) 1467 adev->apu_flags |= AMD_APU_IS_RAVEN; 1468 if (adev->pdev->device == 0x15d8) 1469 adev->apu_flags |= AMD_APU_IS_PICASSO; 1470 break; 1471 case CHIP_RENOIR: 1472 if ((adev->pdev->device == 0x1636) || 1473 (adev->pdev->device == 0x164c)) 1474 adev->apu_flags |= AMD_APU_IS_RENOIR; 1475 else 1476 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1477 break; 1478 case CHIP_VANGOGH: 1479 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1480 break; 1481 case CHIP_YELLOW_CARP: 1482 break; 1483 case CHIP_CYAN_SKILLFISH: 1484 if ((adev->pdev->device == 0x13FE) || 1485 (adev->pdev->device == 0x143F)) 1486 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1487 break; 1488 default: 1489 break; 1490 } 1491 1492 return 0; 1493 } 1494 1495 /** 1496 * amdgpu_device_check_arguments - validate module params 1497 * 1498 * @adev: amdgpu_device pointer 1499 * 1500 * Validates certain module parameters and updates 1501 * the associated values used by the driver (all asics). 
1502 */ 1503 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1504 { 1505 if (amdgpu_sched_jobs < 4) { 1506 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1507 amdgpu_sched_jobs); 1508 amdgpu_sched_jobs = 4; 1509 } else if (!is_power_of_2(amdgpu_sched_jobs)){ 1510 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1511 amdgpu_sched_jobs); 1512 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1513 } 1514 1515 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1516 /* gart size must be greater or equal to 32M */ 1517 dev_warn(adev->dev, "gart size (%d) too small\n", 1518 amdgpu_gart_size); 1519 amdgpu_gart_size = -1; 1520 } 1521 1522 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1523 /* gtt size must be greater or equal to 32M */ 1524 dev_warn(adev->dev, "gtt size (%d) too small\n", 1525 amdgpu_gtt_size); 1526 amdgpu_gtt_size = -1; 1527 } 1528 1529 /* valid range is between 4 and 9 inclusive */ 1530 if (amdgpu_vm_fragment_size != -1 && 1531 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1532 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1533 amdgpu_vm_fragment_size = -1; 1534 } 1535 1536 if (amdgpu_sched_hw_submission < 2) { 1537 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1538 amdgpu_sched_hw_submission); 1539 amdgpu_sched_hw_submission = 2; 1540 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1541 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1542 amdgpu_sched_hw_submission); 1543 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1544 } 1545 1546 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 1547 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 1548 amdgpu_reset_method = -1; 1549 } 1550 1551 amdgpu_device_check_smu_prv_buffer_size(adev); 1552 1553 amdgpu_device_check_vm_size(adev); 1554 1555 amdgpu_device_check_block_size(adev); 1556 1557 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1558 1559 amdgpu_gmc_tmz_set(adev); 1560 1561 1562 return 0; 1563 } 1564 1565 /** 1566 * amdgpu_switcheroo_set_state - set switcheroo state 1567 * 1568 * @pdev: pci dev pointer 1569 * @state: vga_switcheroo state 1570 * 1571 * Callback for the switcheroo driver. Suspends or resumes the 1572 * the asics before or after it is powered up using ACPI methods. 
1573 */ 1574 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1575 enum vga_switcheroo_state state) 1576 { 1577 struct drm_device *dev = pci_get_drvdata(pdev); 1578 int r; 1579 1580 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1581 return; 1582 1583 if (state == VGA_SWITCHEROO_ON) { 1584 pr_info("switched on\n"); 1585 /* don't suspend or resume card normally */ 1586 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1587 1588 pci_set_power_state(pdev, PCI_D0); 1589 amdgpu_device_load_pci_state(pdev); 1590 r = pci_enable_device(pdev); 1591 if (r) 1592 DRM_WARN("pci_enable_device failed (%d)\n", r); 1593 amdgpu_device_resume(dev, true); 1594 1595 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1596 } else { 1597 pr_info("switched off\n"); 1598 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1599 amdgpu_device_suspend(dev, true); 1600 amdgpu_device_cache_pci_state(pdev); 1601 /* Shut down the device */ 1602 pci_disable_device(pdev); 1603 pci_set_power_state(pdev, PCI_D3cold); 1604 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1605 } 1606 } 1607 1608 /** 1609 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1610 * 1611 * @pdev: pci dev pointer 1612 * 1613 * Callback for the switcheroo driver. Check of the switcheroo 1614 * state can be changed. 1615 * Returns true if the state can be changed, false if not. 1616 */ 1617 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1618 { 1619 struct drm_device *dev = pci_get_drvdata(pdev); 1620 1621 /* 1622 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1623 * locking inversion with the driver load path. And the access here is 1624 * completely racy anyway. So don't bother with locking for now. 1625 */ 1626 return atomic_read(&dev->open_count) == 0; 1627 } 1628 1629 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1630 .set_gpu_state = amdgpu_switcheroo_set_state, 1631 .reprobe = NULL, 1632 .can_switch = amdgpu_switcheroo_can_switch, 1633 }; 1634 1635 /** 1636 * amdgpu_device_ip_set_clockgating_state - set the CG state 1637 * 1638 * @dev: amdgpu_device pointer 1639 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1640 * @state: clockgating state (gate or ungate) 1641 * 1642 * Sets the requested clockgating state for all instances of 1643 * the hardware IP specified. 1644 * Returns the error code from the last instance. 1645 */ 1646 int amdgpu_device_ip_set_clockgating_state(void *dev, 1647 enum amd_ip_block_type block_type, 1648 enum amd_clockgating_state state) 1649 { 1650 struct amdgpu_device *adev = dev; 1651 int i, r = 0; 1652 1653 for (i = 0; i < adev->num_ip_blocks; i++) { 1654 if (!adev->ip_blocks[i].status.valid) 1655 continue; 1656 if (adev->ip_blocks[i].version->type != block_type) 1657 continue; 1658 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1659 continue; 1660 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1661 (void *)adev, state); 1662 if (r) 1663 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1664 adev->ip_blocks[i].version->funcs->name, r); 1665 } 1666 return r; 1667 } 1668 1669 /** 1670 * amdgpu_device_ip_set_powergating_state - set the PG state 1671 * 1672 * @dev: amdgpu_device pointer 1673 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1674 * @state: powergating state (gate or ungate) 1675 * 1676 * Sets the requested powergating state for all instances of 1677 * the hardware IP specified. 1678 * Returns the error code from the last instance. 
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u64 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}
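/*
 * Illustrative sketch (not part of the driver): a caller that wants the GFX
 * block quiesced before touching it could combine the two helpers above. The
 * helper name and the "check, then wait" policy are hypothetical; they only
 * show the intended calling convention for the per-IP idle callbacks.
 */
static inline int amdgpu_device_example_quiesce_gfx(struct amdgpu_device *adev)
{
	/* fast path: nothing to do if the IP already reports idle */
	if (amdgpu_device_ip_is_idle(adev, AMD_IP_BLOCK_TYPE_GFX))
		return 0;

	/* otherwise let the IP's own wait_for_idle callback do the polling */
	return amdgpu_device_ip_wait_for_idle(adev, AMD_IP_BLOCK_TYPE_GFX);
}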
/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * Returns 0 if the IP block version is equal to or greater than the
 * requested version, 1 if it is smaller or the ip_block doesn't exist.
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	switch (ip_block_version->type) {
	case AMD_IP_BLOCK_TYPE_VCN:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
			return 0;
		break;
	case AMD_IP_BLOCK_TYPE_JPEG:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
			return 0;
		break;
	default:
		break;
	}

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
1873 */ 1874 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 1875 { 1876 adev->enable_virtual_display = false; 1877 1878 if (amdgpu_virtual_display) { 1879 const char *pci_address_name = pci_name(adev->pdev); 1880 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 1881 1882 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 1883 pciaddstr_tmp = pciaddstr; 1884 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 1885 pciaddname = strsep(&pciaddname_tmp, ","); 1886 if (!strcmp("all", pciaddname) 1887 || !strcmp(pci_address_name, pciaddname)) { 1888 long num_crtc; 1889 int res = -1; 1890 1891 adev->enable_virtual_display = true; 1892 1893 if (pciaddname_tmp) 1894 res = kstrtol(pciaddname_tmp, 10, 1895 &num_crtc); 1896 1897 if (!res) { 1898 if (num_crtc < 1) 1899 num_crtc = 1; 1900 if (num_crtc > 6) 1901 num_crtc = 6; 1902 adev->mode_info.num_crtc = num_crtc; 1903 } else { 1904 adev->mode_info.num_crtc = 1; 1905 } 1906 break; 1907 } 1908 } 1909 1910 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 1911 amdgpu_virtual_display, pci_address_name, 1912 adev->enable_virtual_display, adev->mode_info.num_crtc); 1913 1914 kfree(pciaddstr); 1915 } 1916 } 1917 1918 /** 1919 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 1920 * 1921 * @adev: amdgpu_device pointer 1922 * 1923 * Parses the asic configuration parameters specified in the gpu info 1924 * firmware and makes them availale to the driver for use in configuring 1925 * the asic. 1926 * Returns 0 on success, -EINVAL on failure. 1927 */ 1928 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 1929 { 1930 const char *chip_name; 1931 char fw_name[40]; 1932 int err; 1933 const struct gpu_info_firmware_header_v1_0 *hdr; 1934 1935 adev->firmware.gpu_info_fw = NULL; 1936 1937 if (adev->mman.discovery_bin) { 1938 /* 1939 * FIXME: The bounding box is still needed by Navi12, so 1940 * temporarily read it from gpu_info firmware. Should be dropped 1941 * when DAL no longer needs it. 
1942 */ 1943 if (adev->asic_type != CHIP_NAVI12) 1944 return 0; 1945 } 1946 1947 switch (adev->asic_type) { 1948 #ifdef CONFIG_DRM_AMDGPU_SI 1949 case CHIP_VERDE: 1950 case CHIP_TAHITI: 1951 case CHIP_PITCAIRN: 1952 case CHIP_OLAND: 1953 case CHIP_HAINAN: 1954 #endif 1955 #ifdef CONFIG_DRM_AMDGPU_CIK 1956 case CHIP_BONAIRE: 1957 case CHIP_HAWAII: 1958 case CHIP_KAVERI: 1959 case CHIP_KABINI: 1960 case CHIP_MULLINS: 1961 #endif 1962 case CHIP_TOPAZ: 1963 case CHIP_TONGA: 1964 case CHIP_FIJI: 1965 case CHIP_POLARIS10: 1966 case CHIP_POLARIS11: 1967 case CHIP_POLARIS12: 1968 case CHIP_VEGAM: 1969 case CHIP_CARRIZO: 1970 case CHIP_STONEY: 1971 case CHIP_VEGA20: 1972 case CHIP_ALDEBARAN: 1973 case CHIP_SIENNA_CICHLID: 1974 case CHIP_NAVY_FLOUNDER: 1975 case CHIP_DIMGREY_CAVEFISH: 1976 case CHIP_BEIGE_GOBY: 1977 default: 1978 return 0; 1979 case CHIP_VEGA10: 1980 chip_name = "vega10"; 1981 break; 1982 case CHIP_VEGA12: 1983 chip_name = "vega12"; 1984 break; 1985 case CHIP_RAVEN: 1986 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1987 chip_name = "raven2"; 1988 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1989 chip_name = "picasso"; 1990 else 1991 chip_name = "raven"; 1992 break; 1993 case CHIP_ARCTURUS: 1994 chip_name = "arcturus"; 1995 break; 1996 case CHIP_NAVI12: 1997 chip_name = "navi12"; 1998 break; 1999 } 2000 2001 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 2002 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); 2003 if (err) { 2004 dev_err(adev->dev, 2005 "Failed to load gpu_info firmware \"%s\"\n", 2006 fw_name); 2007 goto out; 2008 } 2009 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); 2010 if (err) { 2011 dev_err(adev->dev, 2012 "Failed to validate gpu_info firmware \"%s\"\n", 2013 fw_name); 2014 goto out; 2015 } 2016 2017 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2018 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2019 2020 switch (hdr->version_major) { 2021 case 1: 2022 { 2023 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2024 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2025 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2026 2027 /* 2028 * Should be droped when DAL no longer needs it. 
2029 */ 2030 if (adev->asic_type == CHIP_NAVI12) 2031 goto parse_soc_bounding_box; 2032 2033 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2034 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2035 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2036 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2037 adev->gfx.config.max_texture_channel_caches = 2038 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2039 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2040 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2041 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2042 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2043 adev->gfx.config.double_offchip_lds_buf = 2044 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2045 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2046 adev->gfx.cu_info.max_waves_per_simd = 2047 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2048 adev->gfx.cu_info.max_scratch_slots_per_cu = 2049 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2050 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2051 if (hdr->version_minor >= 1) { 2052 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2053 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2054 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2055 adev->gfx.config.num_sc_per_sh = 2056 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2057 adev->gfx.config.num_packer_per_sc = 2058 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2059 } 2060 2061 parse_soc_bounding_box: 2062 /* 2063 * soc bounding box info is not integrated in disocovery table, 2064 * we always need to parse it from gpu info firmware if needed. 2065 */ 2066 if (hdr->version_minor == 2) { 2067 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2068 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2069 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2070 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2071 } 2072 break; 2073 } 2074 default: 2075 dev_err(adev->dev, 2076 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2077 err = -EINVAL; 2078 goto out; 2079 } 2080 out: 2081 return err; 2082 } 2083 2084 /** 2085 * amdgpu_device_ip_early_init - run early init for hardware IPs 2086 * 2087 * @adev: amdgpu_device pointer 2088 * 2089 * Early initialization pass for hardware IPs. The hardware IPs that make 2090 * up each asic are discovered each IP's early_init callback is run. This 2091 * is the first stage in initializing the asic. 2092 * Returns 0 on success, negative error code on failure. 
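 *
 * Roughly, the pass below (summarized from the code, not a contract):
 *  1. select the ASIC family and register its IP blocks (SI/CIK/VI tables or
 *     IP discovery for newer parts);
 *  2. drop any blocks masked off via amdgpu_ip_block_mask;
 *  3. run each remaining block's early_init callback;
 *  4. once the COMMON block is reached, parse the gpu_info firmware, read the
 *     VBIOS and initialize atombios (plus the pf2vf exchange under SR-IOV);
 *  5. finally trim cg_flags/pg_flags by the amdgpu_cg_mask/amdgpu_pg_mask
 *     module parameters.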
2093 */ 2094 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2095 { 2096 struct drm_device *dev = adev_to_drm(adev); 2097 struct pci_dev *parent; 2098 int i, r; 2099 2100 amdgpu_device_enable_virtual_display(adev); 2101 2102 if (amdgpu_sriov_vf(adev)) { 2103 r = amdgpu_virt_request_full_gpu(adev, true); 2104 if (r) 2105 return r; 2106 } 2107 2108 switch (adev->asic_type) { 2109 #ifdef CONFIG_DRM_AMDGPU_SI 2110 case CHIP_VERDE: 2111 case CHIP_TAHITI: 2112 case CHIP_PITCAIRN: 2113 case CHIP_OLAND: 2114 case CHIP_HAINAN: 2115 adev->family = AMDGPU_FAMILY_SI; 2116 r = si_set_ip_blocks(adev); 2117 if (r) 2118 return r; 2119 break; 2120 #endif 2121 #ifdef CONFIG_DRM_AMDGPU_CIK 2122 case CHIP_BONAIRE: 2123 case CHIP_HAWAII: 2124 case CHIP_KAVERI: 2125 case CHIP_KABINI: 2126 case CHIP_MULLINS: 2127 if (adev->flags & AMD_IS_APU) 2128 adev->family = AMDGPU_FAMILY_KV; 2129 else 2130 adev->family = AMDGPU_FAMILY_CI; 2131 2132 r = cik_set_ip_blocks(adev); 2133 if (r) 2134 return r; 2135 break; 2136 #endif 2137 case CHIP_TOPAZ: 2138 case CHIP_TONGA: 2139 case CHIP_FIJI: 2140 case CHIP_POLARIS10: 2141 case CHIP_POLARIS11: 2142 case CHIP_POLARIS12: 2143 case CHIP_VEGAM: 2144 case CHIP_CARRIZO: 2145 case CHIP_STONEY: 2146 if (adev->flags & AMD_IS_APU) 2147 adev->family = AMDGPU_FAMILY_CZ; 2148 else 2149 adev->family = AMDGPU_FAMILY_VI; 2150 2151 r = vi_set_ip_blocks(adev); 2152 if (r) 2153 return r; 2154 break; 2155 default: 2156 r = amdgpu_discovery_set_ip_blocks(adev); 2157 if (r) 2158 return r; 2159 break; 2160 } 2161 2162 if (amdgpu_has_atpx() && 2163 (amdgpu_is_atpx_hybrid() || 2164 amdgpu_has_atpx_dgpu_power_cntl()) && 2165 ((adev->flags & AMD_IS_APU) == 0) && 2166 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev))) 2167 adev->flags |= AMD_IS_PX; 2168 2169 if (!(adev->flags & AMD_IS_APU)) { 2170 parent = pci_upstream_bridge(adev->pdev); 2171 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2172 } 2173 2174 amdgpu_amdkfd_device_probe(adev); 2175 2176 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2177 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2178 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2179 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2180 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2181 2182 for (i = 0; i < adev->num_ip_blocks; i++) { 2183 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2184 DRM_ERROR("disabled ip block: %d <%s>\n", 2185 i, adev->ip_blocks[i].version->funcs->name); 2186 adev->ip_blocks[i].status.valid = false; 2187 } else { 2188 if (adev->ip_blocks[i].version->funcs->early_init) { 2189 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2190 if (r == -ENOENT) { 2191 adev->ip_blocks[i].status.valid = false; 2192 } else if (r) { 2193 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2194 adev->ip_blocks[i].version->funcs->name, r); 2195 return r; 2196 } else { 2197 adev->ip_blocks[i].status.valid = true; 2198 } 2199 } else { 2200 adev->ip_blocks[i].status.valid = true; 2201 } 2202 } 2203 /* get the vbios after the asic_funcs are set up */ 2204 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2205 r = amdgpu_device_parse_gpu_info_fw(adev); 2206 if (r) 2207 return r; 2208 2209 /* Read BIOS */ 2210 if (!amdgpu_get_bios(adev)) 2211 return -EINVAL; 2212 2213 r = amdgpu_atombios_init(adev); 2214 if (r) { 2215 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2216 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2217 return r; 2218 } 2219 2220 /*get pf2vf msg info at it's earliest time*/ 2221 if (amdgpu_sriov_vf(adev)) 2222 amdgpu_virt_init_data_exchange(adev); 2223 2224 } 2225 } 2226 2227 adev->cg_flags &= amdgpu_cg_mask; 2228 adev->pg_flags &= amdgpu_pg_mask; 2229 2230 return 0; 2231 } 2232 2233 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2234 { 2235 int i, r; 2236 2237 for (i = 0; i < adev->num_ip_blocks; i++) { 2238 if (!adev->ip_blocks[i].status.sw) 2239 continue; 2240 if (adev->ip_blocks[i].status.hw) 2241 continue; 2242 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2243 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2244 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2245 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2246 if (r) { 2247 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2248 adev->ip_blocks[i].version->funcs->name, r); 2249 return r; 2250 } 2251 adev->ip_blocks[i].status.hw = true; 2252 } 2253 } 2254 2255 return 0; 2256 } 2257 2258 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2259 { 2260 int i, r; 2261 2262 for (i = 0; i < adev->num_ip_blocks; i++) { 2263 if (!adev->ip_blocks[i].status.sw) 2264 continue; 2265 if (adev->ip_blocks[i].status.hw) 2266 continue; 2267 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2268 if (r) { 2269 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2270 adev->ip_blocks[i].version->funcs->name, r); 2271 return r; 2272 } 2273 adev->ip_blocks[i].status.hw = true; 2274 } 2275 2276 return 0; 2277 } 2278 2279 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2280 { 2281 int r = 0; 2282 int i; 2283 uint32_t smu_version; 2284 2285 if (adev->asic_type >= CHIP_VEGA10) { 2286 for (i = 0; i < adev->num_ip_blocks; i++) { 2287 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2288 continue; 2289 2290 if 
(!adev->ip_blocks[i].status.sw) 2291 continue; 2292 2293 /* no need to do the fw loading again if already done*/ 2294 if (adev->ip_blocks[i].status.hw == true) 2295 break; 2296 2297 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2298 r = adev->ip_blocks[i].version->funcs->resume(adev); 2299 if (r) { 2300 DRM_ERROR("resume of IP block <%s> failed %d\n", 2301 adev->ip_blocks[i].version->funcs->name, r); 2302 return r; 2303 } 2304 } else { 2305 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2306 if (r) { 2307 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2308 adev->ip_blocks[i].version->funcs->name, r); 2309 return r; 2310 } 2311 } 2312 2313 adev->ip_blocks[i].status.hw = true; 2314 break; 2315 } 2316 } 2317 2318 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2319 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2320 2321 return r; 2322 } 2323 2324 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2325 { 2326 long timeout; 2327 int r, i; 2328 2329 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2330 struct amdgpu_ring *ring = adev->rings[i]; 2331 2332 /* No need to setup the GPU scheduler for rings that don't need it */ 2333 if (!ring || ring->no_scheduler) 2334 continue; 2335 2336 switch (ring->funcs->type) { 2337 case AMDGPU_RING_TYPE_GFX: 2338 timeout = adev->gfx_timeout; 2339 break; 2340 case AMDGPU_RING_TYPE_COMPUTE: 2341 timeout = adev->compute_timeout; 2342 break; 2343 case AMDGPU_RING_TYPE_SDMA: 2344 timeout = adev->sdma_timeout; 2345 break; 2346 default: 2347 timeout = adev->video_timeout; 2348 break; 2349 } 2350 2351 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2352 ring->num_hw_submission, amdgpu_job_hang_limit, 2353 timeout, adev->reset_domain->wq, 2354 ring->sched_score, ring->name, 2355 adev->dev); 2356 if (r) { 2357 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2358 ring->name); 2359 return r; 2360 } 2361 } 2362 2363 return 0; 2364 } 2365 2366 2367 /** 2368 * amdgpu_device_ip_init - run init for hardware IPs 2369 * 2370 * @adev: amdgpu_device pointer 2371 * 2372 * Main initialization pass for hardware IPs. The list of all the hardware 2373 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2374 * are run. sw_init initializes the software state associated with each IP 2375 * and hw_init initializes the hardware associated with each IP. 2376 * Returns 0 on success, negative error code on failure. 
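 *
 * A rough sketch of the ordering implemented below (descriptive only):
 * sw_init for every valid block (with GMC hw_init pulled forward so GPU
 * memory can be allocated, plus the static CSA when MCBP/SR-IOV needs it),
 * then IB pool and ucode BO creation, hw_init phase 1 (COMMON/IH, and PSP
 * on SR-IOV), firmware loading, hw_init phase 2 for the remaining blocks,
 * RAS recovery init, XGMI hive/reset-domain hookup, scheduler creation,
 * and finally KFD init unless a whole-hive reset is pending.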
2377 */ 2378 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2379 { 2380 int i, r; 2381 2382 r = amdgpu_ras_init(adev); 2383 if (r) 2384 return r; 2385 2386 for (i = 0; i < adev->num_ip_blocks; i++) { 2387 if (!adev->ip_blocks[i].status.valid) 2388 continue; 2389 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2390 if (r) { 2391 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2392 adev->ip_blocks[i].version->funcs->name, r); 2393 goto init_failed; 2394 } 2395 adev->ip_blocks[i].status.sw = true; 2396 2397 /* need to do gmc hw init early so we can allocate gpu mem */ 2398 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2399 /* Try to reserve bad pages early */ 2400 if (amdgpu_sriov_vf(adev)) 2401 amdgpu_virt_exchange_data(adev); 2402 2403 r = amdgpu_device_vram_scratch_init(adev); 2404 if (r) { 2405 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2406 goto init_failed; 2407 } 2408 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2409 if (r) { 2410 DRM_ERROR("hw_init %d failed %d\n", i, r); 2411 goto init_failed; 2412 } 2413 r = amdgpu_device_wb_init(adev); 2414 if (r) { 2415 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2416 goto init_failed; 2417 } 2418 adev->ip_blocks[i].status.hw = true; 2419 2420 /* right after GMC hw init, we create CSA */ 2421 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2422 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2423 AMDGPU_GEM_DOMAIN_VRAM, 2424 AMDGPU_CSA_SIZE); 2425 if (r) { 2426 DRM_ERROR("allocate CSA failed %d\n", r); 2427 goto init_failed; 2428 } 2429 } 2430 } 2431 } 2432 2433 if (amdgpu_sriov_vf(adev)) 2434 amdgpu_virt_init_data_exchange(adev); 2435 2436 r = amdgpu_ib_pool_init(adev); 2437 if (r) { 2438 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2439 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2440 goto init_failed; 2441 } 2442 2443 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2444 if (r) 2445 goto init_failed; 2446 2447 r = amdgpu_device_ip_hw_init_phase1(adev); 2448 if (r) 2449 goto init_failed; 2450 2451 r = amdgpu_device_fw_loading(adev); 2452 if (r) 2453 goto init_failed; 2454 2455 r = amdgpu_device_ip_hw_init_phase2(adev); 2456 if (r) 2457 goto init_failed; 2458 2459 /* 2460 * retired pages will be loaded from eeprom and reserved here, 2461 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2462 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2463 * for I2C communication which only true at this point. 2464 * 2465 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2466 * failure from bad gpu situation and stop amdgpu init process 2467 * accordingly. For other failed cases, it will still release all 2468 * the resource and print error message, rather than returning one 2469 * negative value to upper level. 
2470 * 2471 * Note: theoretically, this should be called before all vram allocations 2472 * to protect retired page from abusing 2473 */ 2474 r = amdgpu_ras_recovery_init(adev); 2475 if (r) 2476 goto init_failed; 2477 2478 /** 2479 * In case of XGMI grab extra reference for reset domain for this device 2480 */ 2481 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2482 if (amdgpu_xgmi_add_device(adev) == 0) { 2483 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2484 2485 if (!hive->reset_domain || 2486 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2487 r = -ENOENT; 2488 goto init_failed; 2489 } 2490 2491 /* Drop the early temporary reset domain we created for device */ 2492 amdgpu_reset_put_reset_domain(adev->reset_domain); 2493 adev->reset_domain = hive->reset_domain; 2494 } 2495 } 2496 2497 r = amdgpu_device_init_schedulers(adev); 2498 if (r) 2499 goto init_failed; 2500 2501 /* Don't init kfd if whole hive need to be reset during init */ 2502 if (!adev->gmc.xgmi.pending_reset) 2503 amdgpu_amdkfd_device_init(adev); 2504 2505 amdgpu_fru_get_product_info(adev); 2506 2507 init_failed: 2508 if (amdgpu_sriov_vf(adev)) 2509 amdgpu_virt_release_full_gpu(adev, true); 2510 2511 return r; 2512 } 2513 2514 /** 2515 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2516 * 2517 * @adev: amdgpu_device pointer 2518 * 2519 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2520 * this function before a GPU reset. If the value is retained after a 2521 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 2522 */ 2523 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2524 { 2525 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2526 } 2527 2528 /** 2529 * amdgpu_device_check_vram_lost - check if vram is valid 2530 * 2531 * @adev: amdgpu_device pointer 2532 * 2533 * Checks the reset magic value written to the gart pointer in VRAM. 2534 * The driver calls this after a GPU reset to see if the contents of 2535 * VRAM is lost or now. 2536 * returns true if vram is lost, false if not. 2537 */ 2538 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2539 { 2540 if (memcmp(adev->gart.ptr, adev->reset_magic, 2541 AMDGPU_RESET_MAGIC_NUM)) 2542 return true; 2543 2544 if (!amdgpu_in_reset(adev)) 2545 return false; 2546 2547 /* 2548 * For all ASICs with baco/mode1 reset, the VRAM is 2549 * always assumed to be lost. 2550 */ 2551 switch (amdgpu_asic_reset_method(adev)) { 2552 case AMD_RESET_METHOD_BACO: 2553 case AMD_RESET_METHOD_MODE1: 2554 return true; 2555 default: 2556 return false; 2557 } 2558 } 2559 2560 /** 2561 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2562 * 2563 * @adev: amdgpu_device pointer 2564 * @state: clockgating state (gate or ungate) 2565 * 2566 * The list of all the hardware IPs that make up the asic is walked and the 2567 * set_clockgating_state callbacks are run. 2568 * Late initialization pass enabling clockgating for hardware IPs. 2569 * Fini or suspend, pass disabling clockgating for hardware IPs. 2570 * Returns 0 on success, negative error code on failure. 2571 */ 2572 2573 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2574 enum amd_clockgating_state state) 2575 { 2576 int i, j, r; 2577 2578 if (amdgpu_emu_mode == 1) 2579 return 0; 2580 2581 for (j = 0; j < adev->num_ip_blocks; j++) { 2582 i = state == AMD_CG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1; 2583 if (!adev->ip_blocks[i].status.late_initialized) 2584 continue; 2585 /* skip CG for GFX on S0ix */ 2586 if (adev->in_s0ix && 2587 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2588 continue; 2589 /* skip CG for VCE/UVD, it's handled specially */ 2590 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2591 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2592 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2593 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2594 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2595 /* enable clockgating to save power */ 2596 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2597 state); 2598 if (r) { 2599 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2600 adev->ip_blocks[i].version->funcs->name, r); 2601 return r; 2602 } 2603 } 2604 } 2605 2606 return 0; 2607 } 2608 2609 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2610 enum amd_powergating_state state) 2611 { 2612 int i, j, r; 2613 2614 if (amdgpu_emu_mode == 1) 2615 return 0; 2616 2617 for (j = 0; j < adev->num_ip_blocks; j++) { 2618 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2619 if (!adev->ip_blocks[i].status.late_initialized) 2620 continue; 2621 /* skip PG for GFX on S0ix */ 2622 if (adev->in_s0ix && 2623 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2624 continue; 2625 /* skip CG for VCE/UVD, it's handled specially */ 2626 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2627 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2628 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2629 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2630 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2631 /* enable powergating to save power */ 2632 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2633 state); 2634 if (r) { 2635 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2636 adev->ip_blocks[i].version->funcs->name, r); 2637 return r; 2638 } 2639 } 2640 } 2641 return 0; 2642 } 2643 2644 static int amdgpu_device_enable_mgpu_fan_boost(void) 2645 { 2646 struct amdgpu_gpu_instance *gpu_ins; 2647 struct amdgpu_device *adev; 2648 int i, ret = 0; 2649 2650 mutex_lock(&mgpu_info.mutex); 2651 2652 /* 2653 * MGPU fan boost feature should be enabled 2654 * only when there are two or more dGPUs in 2655 * the system 2656 */ 2657 if (mgpu_info.num_dgpu < 2) 2658 goto out; 2659 2660 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2661 gpu_ins = &(mgpu_info.gpu_ins[i]); 2662 adev = gpu_ins->adev; 2663 if (!(adev->flags & AMD_IS_APU) && 2664 !gpu_ins->mgpu_fan_enabled) { 2665 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2666 if (ret) 2667 break; 2668 2669 gpu_ins->mgpu_fan_enabled = 1; 2670 } 2671 } 2672 2673 out: 2674 mutex_unlock(&mgpu_info.mutex); 2675 2676 return ret; 2677 } 2678 2679 /** 2680 * amdgpu_device_ip_late_init - run late init for hardware IPs 2681 * 2682 * @adev: amdgpu_device pointer 2683 * 2684 * Late initialization pass for hardware IPs. The list of all the hardware 2685 * IPs that make up the asic is walked and the late_init callbacks are run. 2686 * late_init covers any special initialization that an IP requires 2687 * after all of the have been initialized or something that needs to happen 2688 * late in the init process. 2689 * Returns 0 on success, negative error code on failure. 
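 *
 * Besides the per-block late_init callbacks, the pass below also performs
 * (see the code that follows): RAS late init, enabling clock- and
 * power-gating, recording the VRAM "reset magic", enabling the multi-GPU
 * fan boost where applicable, and lowering the XGMI p-state once every
 * device in the hive has come up.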
 */
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
{
	struct amdgpu_gpu_instance *gpu_instance;
	int i = 0, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_init) {
			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
			if (r) {
				DRM_ERROR("late_init of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
		adev->ip_blocks[i].status.late_initialized = true;
	}

	r = amdgpu_ras_late_init(adev);
	if (r) {
		DRM_ERROR("amdgpu_ras_late_init failed %d", r);
		return r;
	}

	amdgpu_ras_set_error_query_ready(adev, true);

	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

	amdgpu_device_fill_reset_magic(adev);

	r = amdgpu_device_enable_mgpu_fan_boost();
	if (r)
		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

	/* For passthrough configuration on arcturus and aldebaran, enable special handling of SBR */
	if (amdgpu_passthrough(adev) &&
	    ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
	     adev->asic_type == CHIP_ALDEBARAN))
		amdgpu_dpm_handle_passthrough_sbr(adev, true);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		mutex_lock(&mgpu_info.mutex);

		/*
		 * Reset device p-state to low as this was booted with high.
		 *
		 * This should be performed only after all devices from the same
		 * hive get initialized.
		 *
		 * However, the number of devices in a hive is not known in
		 * advance; it is only counted one by one as the devices
		 * initialize.
		 *
		 * So we wait until all XGMI interlinked devices are initialized.
		 * This may bring some delays as those devices may come from
		 * different hives. But that should be OK.
2747 */ 2748 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2749 for (i = 0; i < mgpu_info.num_gpu; i++) { 2750 gpu_instance = &(mgpu_info.gpu_ins[i]); 2751 if (gpu_instance->adev->flags & AMD_IS_APU) 2752 continue; 2753 2754 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2755 AMDGPU_XGMI_PSTATE_MIN); 2756 if (r) { 2757 DRM_ERROR("pstate setting failed (%d).\n", r); 2758 break; 2759 } 2760 } 2761 } 2762 2763 mutex_unlock(&mgpu_info.mutex); 2764 } 2765 2766 return 0; 2767 } 2768 2769 /** 2770 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2771 * 2772 * @adev: amdgpu_device pointer 2773 * 2774 * For ASICs need to disable SMC first 2775 */ 2776 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2777 { 2778 int i, r; 2779 2780 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) 2781 return; 2782 2783 for (i = 0; i < adev->num_ip_blocks; i++) { 2784 if (!adev->ip_blocks[i].status.hw) 2785 continue; 2786 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2787 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2788 /* XXX handle errors */ 2789 if (r) { 2790 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2791 adev->ip_blocks[i].version->funcs->name, r); 2792 } 2793 adev->ip_blocks[i].status.hw = false; 2794 break; 2795 } 2796 } 2797 } 2798 2799 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2800 { 2801 int i, r; 2802 2803 for (i = 0; i < adev->num_ip_blocks; i++) { 2804 if (!adev->ip_blocks[i].version->funcs->early_fini) 2805 continue; 2806 2807 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2808 if (r) { 2809 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2810 adev->ip_blocks[i].version->funcs->name, r); 2811 } 2812 } 2813 2814 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2815 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2816 2817 amdgpu_amdkfd_suspend(adev, false); 2818 2819 /* Workaroud for ASICs need to disable SMC first */ 2820 amdgpu_device_smu_fini_early(adev); 2821 2822 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2823 if (!adev->ip_blocks[i].status.hw) 2824 continue; 2825 2826 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2827 /* XXX handle errors */ 2828 if (r) { 2829 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2830 adev->ip_blocks[i].version->funcs->name, r); 2831 } 2832 2833 adev->ip_blocks[i].status.hw = false; 2834 } 2835 2836 if (amdgpu_sriov_vf(adev)) { 2837 if (amdgpu_virt_release_full_gpu(adev, false)) 2838 DRM_ERROR("failed to release exclusive mode on fini\n"); 2839 } 2840 2841 return 0; 2842 } 2843 2844 /** 2845 * amdgpu_device_ip_fini - run fini for hardware IPs 2846 * 2847 * @adev: amdgpu_device pointer 2848 * 2849 * Main teardown pass for hardware IPs. The list of all the hardware 2850 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2851 * are run. hw_fini tears down the hardware associated with each IP 2852 * and sw_fini tears down any software state associated with each IP. 2853 * Returns 0 on success, negative error code on failure. 
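 *
 * Teardown runs in reverse IP order. As the code below shows, the GMC
 * block's sw_fini point doubles as the place where the ucode BO, static
 * CSA, writeback slots, VRAM scratch and IB pool are released; late_fini
 * callbacks and RAS teardown follow afterwards.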
2854 */ 2855 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2856 { 2857 int i, r; 2858 2859 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2860 amdgpu_virt_release_ras_err_handler_data(adev); 2861 2862 if (adev->gmc.xgmi.num_physical_nodes > 1) 2863 amdgpu_xgmi_remove_device(adev); 2864 2865 amdgpu_amdkfd_device_fini_sw(adev); 2866 2867 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2868 if (!adev->ip_blocks[i].status.sw) 2869 continue; 2870 2871 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2872 amdgpu_ucode_free_bo(adev); 2873 amdgpu_free_static_csa(&adev->virt.csa_obj); 2874 amdgpu_device_wb_fini(adev); 2875 amdgpu_device_vram_scratch_fini(adev); 2876 amdgpu_ib_pool_fini(adev); 2877 } 2878 2879 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2880 /* XXX handle errors */ 2881 if (r) { 2882 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2883 adev->ip_blocks[i].version->funcs->name, r); 2884 } 2885 adev->ip_blocks[i].status.sw = false; 2886 adev->ip_blocks[i].status.valid = false; 2887 } 2888 2889 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2890 if (!adev->ip_blocks[i].status.late_initialized) 2891 continue; 2892 if (adev->ip_blocks[i].version->funcs->late_fini) 2893 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2894 adev->ip_blocks[i].status.late_initialized = false; 2895 } 2896 2897 amdgpu_ras_fini(adev); 2898 2899 return 0; 2900 } 2901 2902 /** 2903 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2904 * 2905 * @work: work_struct. 2906 */ 2907 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2908 { 2909 struct amdgpu_device *adev = 2910 container_of(work, struct amdgpu_device, delayed_init_work.work); 2911 int r; 2912 2913 r = amdgpu_ib_ring_tests(adev); 2914 if (r) 2915 DRM_ERROR("ib ring test failed (%d).\n", r); 2916 } 2917 2918 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2919 { 2920 struct amdgpu_device *adev = 2921 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2922 2923 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2924 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2925 2926 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2927 adev->gfx.gfx_off_state = true; 2928 } 2929 2930 /** 2931 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2932 * 2933 * @adev: amdgpu_device pointer 2934 * 2935 * Main suspend function for hardware IPs. The list of all the hardware 2936 * IPs that make up the asic is walked, clockgating is disabled and the 2937 * suspend callbacks are run. suspend puts the hardware and software state 2938 * in each IP into a state suitable for suspend. 2939 * Returns 0 on success, negative error code on failure. 
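 *
 * Note that this first phase only ungates clock/power gating and suspends
 * the display (DCE) blocks; every other block is deliberately left to
 * amdgpu_device_ip_suspend_phase2().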
2940 */ 2941 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2942 { 2943 int i, r; 2944 2945 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2946 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2947 2948 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2949 if (!adev->ip_blocks[i].status.valid) 2950 continue; 2951 2952 /* displays are handled separately */ 2953 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2954 continue; 2955 2956 /* XXX handle errors */ 2957 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2958 /* XXX handle errors */ 2959 if (r) { 2960 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2961 adev->ip_blocks[i].version->funcs->name, r); 2962 return r; 2963 } 2964 2965 adev->ip_blocks[i].status.hw = false; 2966 } 2967 2968 return 0; 2969 } 2970 2971 /** 2972 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2973 * 2974 * @adev: amdgpu_device pointer 2975 * 2976 * Main suspend function for hardware IPs. The list of all the hardware 2977 * IPs that make up the asic is walked, clockgating is disabled and the 2978 * suspend callbacks are run. suspend puts the hardware and software state 2979 * in each IP into a state suitable for suspend. 2980 * Returns 0 on success, negative error code on failure. 2981 */ 2982 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2983 { 2984 int i, r; 2985 2986 if (adev->in_s0ix) 2987 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 2988 2989 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2990 if (!adev->ip_blocks[i].status.valid) 2991 continue; 2992 /* displays are handled in phase1 */ 2993 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2994 continue; 2995 /* PSP lost connection when err_event_athub occurs */ 2996 if (amdgpu_ras_intr_triggered() && 2997 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2998 adev->ip_blocks[i].status.hw = false; 2999 continue; 3000 } 3001 3002 /* skip unnecessary suspend if we do not initialize them yet */ 3003 if (adev->gmc.xgmi.pending_reset && 3004 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3005 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3006 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3007 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3008 adev->ip_blocks[i].status.hw = false; 3009 continue; 3010 } 3011 3012 /* skip suspend of gfx and psp for S0ix 3013 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3014 * like at runtime. PSP is also part of the always on hardware 3015 * so no need to suspend it. 
3016 */ 3017 if (adev->in_s0ix && 3018 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3019 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)) 3020 continue; 3021 3022 /* XXX handle errors */ 3023 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3024 /* XXX handle errors */ 3025 if (r) { 3026 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3027 adev->ip_blocks[i].version->funcs->name, r); 3028 } 3029 adev->ip_blocks[i].status.hw = false; 3030 /* handle putting the SMC in the appropriate state */ 3031 if(!amdgpu_sriov_vf(adev)){ 3032 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3033 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3034 if (r) { 3035 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3036 adev->mp1_state, r); 3037 return r; 3038 } 3039 } 3040 } 3041 } 3042 3043 return 0; 3044 } 3045 3046 /** 3047 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3048 * 3049 * @adev: amdgpu_device pointer 3050 * 3051 * Main suspend function for hardware IPs. The list of all the hardware 3052 * IPs that make up the asic is walked, clockgating is disabled and the 3053 * suspend callbacks are run. suspend puts the hardware and software state 3054 * in each IP into a state suitable for suspend. 3055 * Returns 0 on success, negative error code on failure. 3056 */ 3057 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3058 { 3059 int r; 3060 3061 if (amdgpu_sriov_vf(adev)) { 3062 amdgpu_virt_fini_data_exchange(adev); 3063 amdgpu_virt_request_full_gpu(adev, false); 3064 } 3065 3066 r = amdgpu_device_ip_suspend_phase1(adev); 3067 if (r) 3068 return r; 3069 r = amdgpu_device_ip_suspend_phase2(adev); 3070 3071 if (amdgpu_sriov_vf(adev)) 3072 amdgpu_virt_release_full_gpu(adev, false); 3073 3074 return r; 3075 } 3076 3077 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3078 { 3079 int i, r; 3080 3081 static enum amd_ip_block_type ip_order[] = { 3082 AMD_IP_BLOCK_TYPE_GMC, 3083 AMD_IP_BLOCK_TYPE_COMMON, 3084 AMD_IP_BLOCK_TYPE_PSP, 3085 AMD_IP_BLOCK_TYPE_IH, 3086 }; 3087 3088 for (i = 0; i < adev->num_ip_blocks; i++) { 3089 int j; 3090 struct amdgpu_ip_block *block; 3091 3092 block = &adev->ip_blocks[i]; 3093 block->status.hw = false; 3094 3095 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3096 3097 if (block->version->type != ip_order[j] || 3098 !block->status.valid) 3099 continue; 3100 3101 r = block->version->funcs->hw_init(adev); 3102 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3103 if (r) 3104 return r; 3105 block->status.hw = true; 3106 } 3107 } 3108 3109 return 0; 3110 } 3111 3112 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3113 { 3114 int i, r; 3115 3116 static enum amd_ip_block_type ip_order[] = { 3117 AMD_IP_BLOCK_TYPE_SMC, 3118 AMD_IP_BLOCK_TYPE_DCE, 3119 AMD_IP_BLOCK_TYPE_GFX, 3120 AMD_IP_BLOCK_TYPE_SDMA, 3121 AMD_IP_BLOCK_TYPE_UVD, 3122 AMD_IP_BLOCK_TYPE_VCE, 3123 AMD_IP_BLOCK_TYPE_VCN 3124 }; 3125 3126 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3127 int j; 3128 struct amdgpu_ip_block *block; 3129 3130 for (j = 0; j < adev->num_ip_blocks; j++) { 3131 block = &adev->ip_blocks[j]; 3132 3133 if (block->version->type != ip_order[i] || 3134 !block->status.valid || 3135 block->status.hw) 3136 continue; 3137 3138 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3139 r = block->version->funcs->resume(adev); 3140 else 3141 r = block->version->funcs->hw_init(adev); 3142 3143 DRM_INFO("RE-INIT-late: %s %s\n", 
block->version->funcs->name, r?"failed":"succeeded"); 3144 if (r) 3145 return r; 3146 block->status.hw = true; 3147 } 3148 } 3149 3150 return 0; 3151 } 3152 3153 /** 3154 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3155 * 3156 * @adev: amdgpu_device pointer 3157 * 3158 * First resume function for hardware IPs. The list of all the hardware 3159 * IPs that make up the asic is walked and the resume callbacks are run for 3160 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3161 * after a suspend and updates the software state as necessary. This 3162 * function is also used for restoring the GPU after a GPU reset. 3163 * Returns 0 on success, negative error code on failure. 3164 */ 3165 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3166 { 3167 int i, r; 3168 3169 for (i = 0; i < adev->num_ip_blocks; i++) { 3170 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3171 continue; 3172 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3173 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3174 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 3175 3176 r = adev->ip_blocks[i].version->funcs->resume(adev); 3177 if (r) { 3178 DRM_ERROR("resume of IP block <%s> failed %d\n", 3179 adev->ip_blocks[i].version->funcs->name, r); 3180 return r; 3181 } 3182 adev->ip_blocks[i].status.hw = true; 3183 } 3184 } 3185 3186 return 0; 3187 } 3188 3189 /** 3190 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3191 * 3192 * @adev: amdgpu_device pointer 3193 * 3194 * First resume function for hardware IPs. The list of all the hardware 3195 * IPs that make up the asic is walked and the resume callbacks are run for 3196 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3197 * functional state after a suspend and updates the software state as 3198 * necessary. This function is also used for restoring the GPU after a GPU 3199 * reset. 3200 * Returns 0 on success, negative error code on failure. 3201 */ 3202 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3203 { 3204 int i, r; 3205 3206 for (i = 0; i < adev->num_ip_blocks; i++) { 3207 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3208 continue; 3209 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3210 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3211 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3212 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3213 continue; 3214 r = adev->ip_blocks[i].version->funcs->resume(adev); 3215 if (r) { 3216 DRM_ERROR("resume of IP block <%s> failed %d\n", 3217 adev->ip_blocks[i].version->funcs->name, r); 3218 return r; 3219 } 3220 adev->ip_blocks[i].status.hw = true; 3221 } 3222 3223 return 0; 3224 } 3225 3226 /** 3227 * amdgpu_device_ip_resume - run resume for hardware IPs 3228 * 3229 * @adev: amdgpu_device pointer 3230 * 3231 * Main resume function for hardware IPs. The hardware IPs 3232 * are split into two resume functions because they are 3233 * are also used in in recovering from a GPU reset and some additional 3234 * steps need to be take between them. In this case (S3/S4) they are 3235 * run sequentially. 3236 * Returns 0 on success, negative error code on failure. 
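 *
 * Concretely, the helper below resumes the KFD IOMMU state first, then runs
 * phase 1 (COMMON, GMC and IH), loads firmware via PSP/SMU, and finishes
 * with phase 2 for the remaining blocks.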
3237 */ 3238 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3239 { 3240 int r; 3241 3242 r = amdgpu_amdkfd_resume_iommu(adev); 3243 if (r) 3244 return r; 3245 3246 r = amdgpu_device_ip_resume_phase1(adev); 3247 if (r) 3248 return r; 3249 3250 r = amdgpu_device_fw_loading(adev); 3251 if (r) 3252 return r; 3253 3254 r = amdgpu_device_ip_resume_phase2(adev); 3255 3256 return r; 3257 } 3258 3259 /** 3260 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3261 * 3262 * @adev: amdgpu_device pointer 3263 * 3264 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3265 */ 3266 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3267 { 3268 if (amdgpu_sriov_vf(adev)) { 3269 if (adev->is_atom_fw) { 3270 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3271 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3272 } else { 3273 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3274 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3275 } 3276 3277 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3278 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3279 } 3280 } 3281 3282 /** 3283 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3284 * 3285 * @asic_type: AMD asic type 3286 * 3287 * Check if there is DC (new modesetting infrastructre) support for an asic. 3288 * returns true if DC has support, false if not. 3289 */ 3290 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3291 { 3292 switch (asic_type) { 3293 #ifdef CONFIG_DRM_AMDGPU_SI 3294 case CHIP_HAINAN: 3295 #endif 3296 case CHIP_TOPAZ: 3297 /* chips with no display hardware */ 3298 return false; 3299 #if defined(CONFIG_DRM_AMD_DC) 3300 case CHIP_TAHITI: 3301 case CHIP_PITCAIRN: 3302 case CHIP_VERDE: 3303 case CHIP_OLAND: 3304 /* 3305 * We have systems in the wild with these ASICs that require 3306 * LVDS and VGA support which is not supported with DC. 3307 * 3308 * Fallback to the non-DC driver here by default so as not to 3309 * cause regressions. 3310 */ 3311 #if defined(CONFIG_DRM_AMD_DC_SI) 3312 return amdgpu_dc > 0; 3313 #else 3314 return false; 3315 #endif 3316 case CHIP_BONAIRE: 3317 case CHIP_KAVERI: 3318 case CHIP_KABINI: 3319 case CHIP_MULLINS: 3320 /* 3321 * We have systems in the wild with these ASICs that require 3322 * LVDS and VGA support which is not supported with DC. 3323 * 3324 * Fallback to the non-DC driver here by default so as not to 3325 * cause regressions. 
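 *
 * Users who want DC on these boards anyway can opt in explicitly, e.g. by
 * setting the dc module parameter (amdgpu.dc=1 is the usual spelling),
 * which is what the "amdgpu_dc > 0" checks below honour.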
3326 */ 3327 return amdgpu_dc > 0; 3328 case CHIP_HAWAII: 3329 case CHIP_CARRIZO: 3330 case CHIP_STONEY: 3331 case CHIP_POLARIS10: 3332 case CHIP_POLARIS11: 3333 case CHIP_POLARIS12: 3334 case CHIP_VEGAM: 3335 case CHIP_TONGA: 3336 case CHIP_FIJI: 3337 case CHIP_VEGA10: 3338 case CHIP_VEGA12: 3339 case CHIP_VEGA20: 3340 #if defined(CONFIG_DRM_AMD_DC_DCN) 3341 case CHIP_RAVEN: 3342 case CHIP_NAVI10: 3343 case CHIP_NAVI14: 3344 case CHIP_NAVI12: 3345 case CHIP_RENOIR: 3346 case CHIP_CYAN_SKILLFISH: 3347 case CHIP_SIENNA_CICHLID: 3348 case CHIP_NAVY_FLOUNDER: 3349 case CHIP_DIMGREY_CAVEFISH: 3350 case CHIP_BEIGE_GOBY: 3351 case CHIP_VANGOGH: 3352 case CHIP_YELLOW_CARP: 3353 #endif 3354 default: 3355 return amdgpu_dc != 0; 3356 #else 3357 default: 3358 if (amdgpu_dc > 0) 3359 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3360 "but isn't supported by ASIC, ignoring\n"); 3361 return false; 3362 #endif 3363 } 3364 } 3365 3366 /** 3367 * amdgpu_device_has_dc_support - check if dc is supported 3368 * 3369 * @adev: amdgpu_device pointer 3370 * 3371 * Returns true for supported, false for not supported 3372 */ 3373 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3374 { 3375 if (amdgpu_sriov_vf(adev) || 3376 adev->enable_virtual_display || 3377 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3378 return false; 3379 3380 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3381 } 3382 3383 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3384 { 3385 struct amdgpu_device *adev = 3386 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3387 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3388 3389 /* It's a bug to not have a hive within this function */ 3390 if (WARN_ON(!hive)) 3391 return; 3392 3393 /* 3394 * Use task barrier to synchronize all xgmi reset works across the 3395 * hive. task_barrier_enter and task_barrier_exit will block 3396 * until all the threads running the xgmi reset works reach 3397 * those points. task_barrier_full will do both blocks. 3398 */ 3399 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3400 3401 task_barrier_enter(&hive->tb); 3402 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3403 3404 if (adev->asic_reset_res) 3405 goto fail; 3406 3407 task_barrier_exit(&hive->tb); 3408 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3409 3410 if (adev->asic_reset_res) 3411 goto fail; 3412 3413 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3414 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3415 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3416 } else { 3417 3418 task_barrier_full(&hive->tb); 3419 adev->asic_reset_res = amdgpu_asic_reset(adev); 3420 } 3421 3422 fail: 3423 if (adev->asic_reset_res) 3424 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3425 adev->asic_reset_res, adev_to_drm(adev)->unique); 3426 amdgpu_put_xgmi_hive(hive); 3427 } 3428 3429 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3430 { 3431 char *input = amdgpu_lockup_timeout; 3432 char *timeout_setting = NULL; 3433 int index = 0; 3434 long timeout; 3435 int ret = 0; 3436 3437 /* 3438 * By default timeout for non compute jobs is 10000 3439 * and 60000 for compute jobs. 3440 * In SR-IOV or passthrough mode, timeout for compute 3441 * jobs are 60000 by default. 
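 *
 * The parser below accepts up to four comma-separated values in the order
 * gfx, compute, sdma, video; "0" keeps the default for that slot and a
 * negative value disables the timeout entirely. For example (illustrative
 * values only):
 *
 *   amdgpu.lockup_timeout=10000,60000,10000,10000
 *
 * A single value applies to all non-compute queues (and also to compute
 * under SR-IOV or passthrough).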
3442 */ 3443 adev->gfx_timeout = msecs_to_jiffies(10000); 3444 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3445 if (amdgpu_sriov_vf(adev)) 3446 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 3447 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3448 else 3449 adev->compute_timeout = msecs_to_jiffies(60000); 3450 3451 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3452 while ((timeout_setting = strsep(&input, ",")) && 3453 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3454 ret = kstrtol(timeout_setting, 0, &timeout); 3455 if (ret) 3456 return ret; 3457 3458 if (timeout == 0) { 3459 index++; 3460 continue; 3461 } else if (timeout < 0) { 3462 timeout = MAX_SCHEDULE_TIMEOUT; 3463 dev_warn(adev->dev, "lockup timeout disabled"); 3464 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3465 } else { 3466 timeout = msecs_to_jiffies(timeout); 3467 } 3468 3469 switch (index++) { 3470 case 0: 3471 adev->gfx_timeout = timeout; 3472 break; 3473 case 1: 3474 adev->compute_timeout = timeout; 3475 break; 3476 case 2: 3477 adev->sdma_timeout = timeout; 3478 break; 3479 case 3: 3480 adev->video_timeout = timeout; 3481 break; 3482 default: 3483 break; 3484 } 3485 } 3486 /* 3487 * There is only one value specified and 3488 * it should apply to all non-compute jobs. 3489 */ 3490 if (index == 1) { 3491 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3492 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3493 adev->compute_timeout = adev->gfx_timeout; 3494 } 3495 } 3496 3497 return ret; 3498 } 3499 3500 /** 3501 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3502 * 3503 * @adev: amdgpu_device pointer 3504 * 3505 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3506 */ 3507 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3508 { 3509 struct iommu_domain *domain; 3510 3511 domain = iommu_get_domain_for_dev(adev->dev); 3512 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3513 adev->ram_is_direct_mapped = true; 3514 } 3515 3516 static const struct attribute *amdgpu_dev_attributes[] = { 3517 &dev_attr_product_name.attr, 3518 &dev_attr_product_number.attr, 3519 &dev_attr_serial_number.attr, 3520 &dev_attr_pcie_replay_count.attr, 3521 NULL 3522 }; 3523 3524 /** 3525 * amdgpu_device_init - initialize the driver 3526 * 3527 * @adev: amdgpu_device pointer 3528 * @flags: driver flags 3529 * 3530 * Initializes the driver info and hw (all asics). 3531 * Returns 0 for success or an error on failure. 3532 * Called at driver startup. 
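 *
 * Broad-strokes flow of the function below (descriptive, see the code for
 * the authoritative ordering): basic state and register-accessor setup,
 * lock/work-queue initialization, MMIO mapping, reset-domain creation,
 * virtualization detection, IP early init, optional ASIC reset / vBIOS
 * post, clock and fence-driver init, the main IP init pass, sysfs
 * registration, and finally IP late init plus the delayed IB-test work.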
3533 */ 3534 int amdgpu_device_init(struct amdgpu_device *adev, 3535 uint32_t flags) 3536 { 3537 struct drm_device *ddev = adev_to_drm(adev); 3538 struct pci_dev *pdev = adev->pdev; 3539 int r, i; 3540 bool px = false; 3541 u32 max_MBps; 3542 3543 adev->shutdown = false; 3544 adev->flags = flags; 3545 3546 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3547 adev->asic_type = amdgpu_force_asic_type; 3548 else 3549 adev->asic_type = flags & AMD_ASIC_MASK; 3550 3551 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3552 if (amdgpu_emu_mode == 1) 3553 adev->usec_timeout *= 10; 3554 adev->gmc.gart_size = 512 * 1024 * 1024; 3555 adev->accel_working = false; 3556 adev->num_rings = 0; 3557 adev->mman.buffer_funcs = NULL; 3558 adev->mman.buffer_funcs_ring = NULL; 3559 adev->vm_manager.vm_pte_funcs = NULL; 3560 adev->vm_manager.vm_pte_num_scheds = 0; 3561 adev->gmc.gmc_funcs = NULL; 3562 adev->harvest_ip_mask = 0x0; 3563 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3564 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3565 3566 adev->smc_rreg = &amdgpu_invalid_rreg; 3567 adev->smc_wreg = &amdgpu_invalid_wreg; 3568 adev->pcie_rreg = &amdgpu_invalid_rreg; 3569 adev->pcie_wreg = &amdgpu_invalid_wreg; 3570 adev->pciep_rreg = &amdgpu_invalid_rreg; 3571 adev->pciep_wreg = &amdgpu_invalid_wreg; 3572 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3573 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3574 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3575 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3576 adev->didt_rreg = &amdgpu_invalid_rreg; 3577 adev->didt_wreg = &amdgpu_invalid_wreg; 3578 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3579 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3580 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3581 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3582 3583 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3584 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3585 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3586 3587 /* mutex initialization are all done here so we 3588 * can recall function without having locking issues */ 3589 mutex_init(&adev->firmware.mutex); 3590 mutex_init(&adev->pm.mutex); 3591 mutex_init(&adev->gfx.gpu_clock_mutex); 3592 mutex_init(&adev->srbm_mutex); 3593 mutex_init(&adev->gfx.pipe_reserve_mutex); 3594 mutex_init(&adev->gfx.gfx_off_mutex); 3595 mutex_init(&adev->grbm_idx_mutex); 3596 mutex_init(&adev->mn_lock); 3597 mutex_init(&adev->virt.vf_errors.lock); 3598 hash_init(adev->mn_hash); 3599 mutex_init(&adev->psp.mutex); 3600 mutex_init(&adev->notifier_lock); 3601 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3602 mutex_init(&adev->benchmark_mutex); 3603 3604 amdgpu_device_init_apu_flags(adev); 3605 3606 r = amdgpu_device_check_arguments(adev); 3607 if (r) 3608 return r; 3609 3610 spin_lock_init(&adev->mmio_idx_lock); 3611 spin_lock_init(&adev->smc_idx_lock); 3612 spin_lock_init(&adev->pcie_idx_lock); 3613 spin_lock_init(&adev->uvd_ctx_idx_lock); 3614 spin_lock_init(&adev->didt_idx_lock); 3615 spin_lock_init(&adev->gc_cac_idx_lock); 3616 spin_lock_init(&adev->se_cac_idx_lock); 3617 spin_lock_init(&adev->audio_endpt_idx_lock); 3618 spin_lock_init(&adev->mm_stats.lock); 3619 3620 INIT_LIST_HEAD(&adev->shadow_list); 3621 mutex_init(&adev->shadow_list_lock); 3622 3623 INIT_LIST_HEAD(&adev->reset_list); 3624 3625 INIT_LIST_HEAD(&adev->ras_list); 3626 3627 INIT_DELAYED_WORK(&adev->delayed_init_work, 3628 
			  amdgpu_device_delayed_init_work_handler);
	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
			  amdgpu_device_delay_enable_gfx_off);

	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);

	adev->gfx.gfx_off_req_count = 1;
	adev->pm.ac_power = power_supply_is_system_supplied() > 0;

	atomic_set(&adev->throttling_logging_enabled, 1);
	/*
	 * If throttling continues, logging will be performed every minute
	 * to avoid log flooding. "-1" is subtracted since the thermal
	 * throttling interrupt comes every second. Thus, the total logging
	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
	 * for the throttling interrupt) = 60 seconds.
	 */
	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);

	/* Registers mapping */
	/* TODO: block userspace mapping of io register */
	if (adev->asic_type >= CHIP_BONAIRE) {
		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
	} else {
		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
	}

	for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
		atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);

	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
	if (adev->rmmio == NULL)
		return -ENOMEM;

	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
	DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);

	amdgpu_device_get_pcie_info(adev);

	if (amdgpu_mcbp)
		DRM_INFO("MCBP is enabled\n");

	if (adev->asic_type >= CHIP_NAVI10) {
		if (amdgpu_mes || amdgpu_mes_kiq)
			adev->enable_mes = true;

		if (amdgpu_mes_kiq)
			adev->enable_mes_kiq = true;
	}

	/*
	 * The reset domain needs to be present early, before any XGMI hive is
	 * discovered and initialized, so that the reset semaphore and in_gpu
	 * reset flag can be used early during init and before calling RREG32.
3685 */ 3686 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3687 if (!adev->reset_domain) 3688 return -ENOMEM; 3689 3690 /* detect hw virtualization here */ 3691 amdgpu_detect_virtualization(adev); 3692 3693 r = amdgpu_device_get_job_timeout_settings(adev); 3694 if (r) { 3695 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3696 return r; 3697 } 3698 3699 /* early init functions */ 3700 r = amdgpu_device_ip_early_init(adev); 3701 if (r) 3702 return r; 3703 3704 amdgpu_gmc_noretry_set(adev); 3705 /* Need to get xgmi info early to decide the reset behavior*/ 3706 if (adev->gmc.xgmi.supported) { 3707 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3708 if (r) 3709 return r; 3710 } 3711 3712 /* enable PCIE atomic ops */ 3713 if (amdgpu_sriov_vf(adev)) 3714 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3715 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3716 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3717 else 3718 adev->have_atomics_support = 3719 !pci_enable_atomic_ops_to_root(adev->pdev, 3720 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3721 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3722 if (!adev->have_atomics_support) 3723 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3724 3725 /* doorbell bar mapping and doorbell index init*/ 3726 amdgpu_device_doorbell_init(adev); 3727 3728 if (amdgpu_emu_mode == 1) { 3729 /* post the asic on emulation mode */ 3730 emu_soc_asic_init(adev); 3731 goto fence_driver_init; 3732 } 3733 3734 amdgpu_reset_init(adev); 3735 3736 /* detect if we are with an SRIOV vbios */ 3737 amdgpu_device_detect_sriov_bios(adev); 3738 3739 /* check if we need to reset the asic 3740 * E.g., driver was not cleanly unloaded previously, etc. 
3741 */ 3742 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3743 if (adev->gmc.xgmi.num_physical_nodes) { 3744 dev_info(adev->dev, "Pending hive reset.\n"); 3745 adev->gmc.xgmi.pending_reset = true; 3746 /* Only need to init necessary block for SMU to handle the reset */ 3747 for (i = 0; i < adev->num_ip_blocks; i++) { 3748 if (!adev->ip_blocks[i].status.valid) 3749 continue; 3750 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3751 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3752 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3753 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3754 DRM_DEBUG("IP %s disabled for hw_init.\n", 3755 adev->ip_blocks[i].version->funcs->name); 3756 adev->ip_blocks[i].status.hw = true; 3757 } 3758 } 3759 } else { 3760 r = amdgpu_asic_reset(adev); 3761 if (r) { 3762 dev_err(adev->dev, "asic reset on init failed\n"); 3763 goto failed; 3764 } 3765 } 3766 } 3767 3768 pci_enable_pcie_error_reporting(adev->pdev); 3769 3770 /* Post card if necessary */ 3771 if (amdgpu_device_need_post(adev)) { 3772 if (!adev->bios) { 3773 dev_err(adev->dev, "no vBIOS found\n"); 3774 r = -EINVAL; 3775 goto failed; 3776 } 3777 DRM_INFO("GPU posting now...\n"); 3778 r = amdgpu_device_asic_init(adev); 3779 if (r) { 3780 dev_err(adev->dev, "gpu post error!\n"); 3781 goto failed; 3782 } 3783 } 3784 3785 if (adev->is_atom_fw) { 3786 /* Initialize clocks */ 3787 r = amdgpu_atomfirmware_get_clock_info(adev); 3788 if (r) { 3789 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3790 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3791 goto failed; 3792 } 3793 } else { 3794 /* Initialize clocks */ 3795 r = amdgpu_atombios_get_clock_info(adev); 3796 if (r) { 3797 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3798 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3799 goto failed; 3800 } 3801 /* init i2c buses */ 3802 if (!amdgpu_device_has_dc_support(adev)) 3803 amdgpu_atombios_i2c_init(adev); 3804 } 3805 3806 fence_driver_init: 3807 /* Fence driver */ 3808 r = amdgpu_fence_driver_sw_init(adev); 3809 if (r) { 3810 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3811 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3812 goto failed; 3813 } 3814 3815 /* init the mode config */ 3816 drm_mode_config_init(adev_to_drm(adev)); 3817 3818 r = amdgpu_device_ip_init(adev); 3819 if (r) { 3820 /* failed in exclusive mode due to timeout */ 3821 if (amdgpu_sriov_vf(adev) && 3822 !amdgpu_sriov_runtime(adev) && 3823 amdgpu_virt_mmio_blocked(adev) && 3824 !amdgpu_virt_wait_reset(adev)) { 3825 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3826 /* Don't send request since VF is inactive. */ 3827 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3828 adev->virt.ops = NULL; 3829 r = -EAGAIN; 3830 goto release_ras_con; 3831 } 3832 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3833 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3834 goto release_ras_con; 3835 } 3836 3837 amdgpu_fence_driver_hw_init(adev); 3838 3839 dev_info(adev->dev, 3840 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3841 adev->gfx.config.max_shader_engines, 3842 adev->gfx.config.max_sh_per_se, 3843 adev->gfx.config.max_cu_per_sh, 3844 adev->gfx.cu_info.number); 3845 3846 adev->accel_working = true; 3847 3848 amdgpu_vm_check_compute_bug(adev); 3849 3850 /* Initialize the buffer migration limit. 
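 * A quick worked example of the log2 trick used just below: with the
 * default cap of 8 MB/s, ilog2(8) = 3, so byte budgets can be derived with
 * shifts instead of divisions; booting with, say, amdgpu.moverate=64
 * (illustrative value) would store ilog2(64) = 6 instead.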
*/ 3851 if (amdgpu_moverate >= 0) 3852 max_MBps = amdgpu_moverate; 3853 else 3854 max_MBps = 8; /* Allow 8 MB/s. */ 3855 /* Get a log2 for easy divisions. */ 3856 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3857 3858 r = amdgpu_pm_sysfs_init(adev); 3859 if (r) { 3860 adev->pm_sysfs_en = false; 3861 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3862 } else 3863 adev->pm_sysfs_en = true; 3864 3865 r = amdgpu_ucode_sysfs_init(adev); 3866 if (r) { 3867 adev->ucode_sysfs_en = false; 3868 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3869 } else 3870 adev->ucode_sysfs_en = true; 3871 3872 r = amdgpu_psp_sysfs_init(adev); 3873 if (r) { 3874 adev->psp_sysfs_en = false; 3875 if (!amdgpu_sriov_vf(adev)) 3876 DRM_ERROR("Creating psp sysfs failed\n"); 3877 } else 3878 adev->psp_sysfs_en = true; 3879 3880 /* 3881 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3882 * Otherwise the mgpu fan boost feature will be skipped due to the 3883 * gpu instance is counted less. 3884 */ 3885 amdgpu_register_gpu_instance(adev); 3886 3887 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3888 * explicit gating rather than handling it automatically. 3889 */ 3890 if (!adev->gmc.xgmi.pending_reset) { 3891 r = amdgpu_device_ip_late_init(adev); 3892 if (r) { 3893 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3894 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3895 goto release_ras_con; 3896 } 3897 /* must succeed. */ 3898 amdgpu_ras_resume(adev); 3899 queue_delayed_work(system_wq, &adev->delayed_init_work, 3900 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3901 } 3902 3903 if (amdgpu_sriov_vf(adev)) 3904 flush_delayed_work(&adev->delayed_init_work); 3905 3906 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3907 if (r) 3908 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3909 3910 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3911 r = amdgpu_pmu_init(adev); 3912 if (r) 3913 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3914 3915 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3916 if (amdgpu_device_cache_pci_state(adev->pdev)) 3917 pci_restore_state(pdev); 3918 3919 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3920 /* this will fail for cards that aren't VGA class devices, just 3921 * ignore it */ 3922 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3923 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 3924 3925 if (amdgpu_device_supports_px(ddev)) { 3926 px = true; 3927 vga_switcheroo_register_client(adev->pdev, 3928 &amdgpu_switcheroo_ops, px); 3929 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3930 } 3931 3932 if (adev->gmc.xgmi.pending_reset) 3933 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3934 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3935 3936 amdgpu_device_check_iommu_direct_map(adev); 3937 3938 return 0; 3939 3940 release_ras_con: 3941 amdgpu_release_ras_context(adev); 3942 3943 failed: 3944 amdgpu_vf_error_trans_all(adev); 3945 3946 return r; 3947 } 3948 3949 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 3950 { 3951 3952 /* Clear all CPU mappings pointing to this device */ 3953 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 3954 3955 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 3956 amdgpu_device_doorbell_fini(adev); 3957 3958 iounmap(adev->rmmio); 3959 adev->rmmio = NULL; 3960 if (adev->mman.aper_base_kaddr) 3961 
iounmap(adev->mman.aper_base_kaddr); 3962 adev->mman.aper_base_kaddr = NULL; 3963 3964 /* Memory manager related */ 3965 if (!adev->gmc.xgmi.connected_to_cpu) { 3966 arch_phys_wc_del(adev->gmc.vram_mtrr); 3967 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 3968 } 3969 } 3970 3971 /** 3972 * amdgpu_device_fini_hw - tear down the driver 3973 * 3974 * @adev: amdgpu_device pointer 3975 * 3976 * Tear down the driver info (all asics). 3977 * Called at driver shutdown. 3978 */ 3979 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 3980 { 3981 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3982 flush_delayed_work(&adev->delayed_init_work); 3983 adev->shutdown = true; 3984 3985 /* make sure IB test finished before entering exclusive mode 3986 * to avoid preemption on IB test 3987 * */ 3988 if (amdgpu_sriov_vf(adev)) { 3989 amdgpu_virt_request_full_gpu(adev, false); 3990 amdgpu_virt_fini_data_exchange(adev); 3991 } 3992 3993 /* disable all interrupts */ 3994 amdgpu_irq_disable_all(adev); 3995 if (adev->mode_info.mode_config_initialized){ 3996 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 3997 drm_helper_force_disable_all(adev_to_drm(adev)); 3998 else 3999 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4000 } 4001 amdgpu_fence_driver_hw_fini(adev); 4002 4003 if (adev->mman.initialized) { 4004 flush_delayed_work(&adev->mman.bdev.wq); 4005 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); 4006 } 4007 4008 if (adev->pm_sysfs_en) 4009 amdgpu_pm_sysfs_fini(adev); 4010 if (adev->ucode_sysfs_en) 4011 amdgpu_ucode_sysfs_fini(adev); 4012 if (adev->psp_sysfs_en) 4013 amdgpu_psp_sysfs_fini(adev); 4014 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4015 4016 /* disable ras feature must before hw fini */ 4017 amdgpu_ras_pre_fini(adev); 4018 4019 amdgpu_device_ip_fini_early(adev); 4020 4021 amdgpu_irq_fini_hw(adev); 4022 4023 if (adev->mman.initialized) 4024 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4025 4026 amdgpu_gart_dummy_page_fini(adev); 4027 4028 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4029 amdgpu_device_unmap_mmio(adev); 4030 4031 } 4032 4033 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4034 { 4035 int idx; 4036 4037 amdgpu_fence_driver_sw_fini(adev); 4038 amdgpu_device_ip_fini(adev); 4039 release_firmware(adev->firmware.gpu_info_fw); 4040 adev->firmware.gpu_info_fw = NULL; 4041 adev->accel_working = false; 4042 4043 amdgpu_reset_fini(adev); 4044 4045 /* free i2c buses */ 4046 if (!amdgpu_device_has_dc_support(adev)) 4047 amdgpu_i2c_fini(adev); 4048 4049 if (amdgpu_emu_mode != 1) 4050 amdgpu_atombios_fini(adev); 4051 4052 kfree(adev->bios); 4053 adev->bios = NULL; 4054 if (amdgpu_device_supports_px(adev_to_drm(adev))) { 4055 vga_switcheroo_unregister_client(adev->pdev); 4056 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4057 } 4058 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4059 vga_client_unregister(adev->pdev); 4060 4061 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4062 4063 iounmap(adev->rmmio); 4064 adev->rmmio = NULL; 4065 amdgpu_device_doorbell_fini(adev); 4066 drm_dev_exit(idx); 4067 } 4068 4069 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4070 amdgpu_pmu_fini(adev); 4071 if (adev->mman.discovery_bin) 4072 amdgpu_discovery_fini(adev); 4073 4074 amdgpu_reset_put_reset_domain(adev->reset_domain); 4075 adev->reset_domain = NULL; 4076 4077 kfree(adev->pci_state); 4078 4079 } 4080 4081 /** 4082 * amdgpu_device_evict_resources - evict device resources 4083 * @adev: amdgpu device object 4084 * 4085 * Evicts all ttm device 
resources(vram BOs, gart table) from the lru list 4086 * of the vram memory type. Mainly used for evicting device resources 4087 * at suspend time. 4088 * 4089 */ 4090 static void amdgpu_device_evict_resources(struct amdgpu_device *adev) 4091 { 4092 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4093 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4094 return; 4095 4096 if (amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM)) 4097 DRM_WARN("evicting device resources failed\n"); 4098 4099 } 4100 4101 /* 4102 * Suspend & resume. 4103 */ 4104 /** 4105 * amdgpu_device_suspend - initiate device suspend 4106 * 4107 * @dev: drm dev pointer 4108 * @fbcon : notify the fbdev of suspend 4109 * 4110 * Puts the hw in the suspend state (all asics). 4111 * Returns 0 for success or an error on failure. 4112 * Called at driver suspend. 4113 */ 4114 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4115 { 4116 struct amdgpu_device *adev = drm_to_adev(dev); 4117 4118 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4119 return 0; 4120 4121 adev->in_suspend = true; 4122 4123 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4124 DRM_WARN("smart shift update failed\n"); 4125 4126 drm_kms_helper_poll_disable(dev); 4127 4128 if (fbcon) 4129 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4130 4131 cancel_delayed_work_sync(&adev->delayed_init_work); 4132 4133 amdgpu_ras_suspend(adev); 4134 4135 amdgpu_device_ip_suspend_phase1(adev); 4136 4137 if (!adev->in_s0ix) 4138 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4139 4140 amdgpu_device_evict_resources(adev); 4141 4142 amdgpu_fence_driver_hw_fini(adev); 4143 4144 amdgpu_device_ip_suspend_phase2(adev); 4145 4146 return 0; 4147 } 4148 4149 /** 4150 * amdgpu_device_resume - initiate device resume 4151 * 4152 * @dev: drm dev pointer 4153 * @fbcon : notify the fbdev of resume 4154 * 4155 * Bring the hw back to operating state (all asics). 4156 * Returns 0 for success or an error on failure. 4157 * Called at driver resume. 4158 */ 4159 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4160 { 4161 struct amdgpu_device *adev = drm_to_adev(dev); 4162 int r = 0; 4163 4164 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4165 return 0; 4166 4167 if (adev->in_s0ix) 4168 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4169 4170 /* post card */ 4171 if (amdgpu_device_need_post(adev)) { 4172 r = amdgpu_device_asic_init(adev); 4173 if (r) 4174 dev_err(adev->dev, "amdgpu asic init failed\n"); 4175 } 4176 4177 r = amdgpu_device_ip_resume(adev); 4178 if (r) { 4179 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4180 return r; 4181 } 4182 amdgpu_fence_driver_hw_init(adev); 4183 4184 r = amdgpu_device_ip_late_init(adev); 4185 if (r) 4186 return r; 4187 4188 queue_delayed_work(system_wq, &adev->delayed_init_work, 4189 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4190 4191 if (!adev->in_s0ix) { 4192 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4193 if (r) 4194 return r; 4195 } 4196 4197 /* Make sure IB tests flushed */ 4198 flush_delayed_work(&adev->delayed_init_work); 4199 4200 if (fbcon) 4201 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4202 4203 drm_kms_helper_poll_enable(dev); 4204 4205 amdgpu_ras_resume(adev); 4206 4207 /* 4208 * Most of the connector probing functions try to acquire runtime pm 4209 * refs to ensure that the GPU is powered on when connector polling is 4210 * performed. 
Since we're calling this from a runtime PM callback, 4211 * trying to acquire rpm refs will cause us to deadlock. 4212 * 4213 * Since we're guaranteed to be holding the rpm lock, it's safe to 4214 * temporarily disable the rpm helpers so this doesn't deadlock us. 4215 */ 4216 #ifdef CONFIG_PM 4217 dev->dev->power.disable_depth++; 4218 #endif 4219 if (!amdgpu_device_has_dc_support(adev)) 4220 drm_helper_hpd_irq_event(dev); 4221 else 4222 drm_kms_helper_hotplug_event(dev); 4223 #ifdef CONFIG_PM 4224 dev->dev->power.disable_depth--; 4225 #endif 4226 adev->in_suspend = false; 4227 4228 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4229 DRM_WARN("smart shift update failed\n"); 4230 4231 return 0; 4232 } 4233 4234 /** 4235 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4236 * 4237 * @adev: amdgpu_device pointer 4238 * 4239 * The list of all the hardware IPs that make up the asic is walked and 4240 * the check_soft_reset callbacks are run. check_soft_reset determines 4241 * if the asic is still hung or not. 4242 * Returns true if any of the IPs are still in a hung state, false if not. 4243 */ 4244 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4245 { 4246 int i; 4247 bool asic_hang = false; 4248 4249 if (amdgpu_sriov_vf(adev)) 4250 return true; 4251 4252 if (amdgpu_asic_need_full_reset(adev)) 4253 return true; 4254 4255 for (i = 0; i < adev->num_ip_blocks; i++) { 4256 if (!adev->ip_blocks[i].status.valid) 4257 continue; 4258 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4259 adev->ip_blocks[i].status.hang = 4260 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4261 if (adev->ip_blocks[i].status.hang) { 4262 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4263 asic_hang = true; 4264 } 4265 } 4266 return asic_hang; 4267 } 4268 4269 /** 4270 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4271 * 4272 * @adev: amdgpu_device pointer 4273 * 4274 * The list of all the hardware IPs that make up the asic is walked and the 4275 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4276 * handles any IP specific hardware or software state changes that are 4277 * necessary for a soft reset to succeed. 4278 * Returns 0 on success, negative error code on failure. 4279 */ 4280 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4281 { 4282 int i, r = 0; 4283 4284 for (i = 0; i < adev->num_ip_blocks; i++) { 4285 if (!adev->ip_blocks[i].status.valid) 4286 continue; 4287 if (adev->ip_blocks[i].status.hang && 4288 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4289 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4290 if (r) 4291 return r; 4292 } 4293 } 4294 4295 return 0; 4296 } 4297 4298 /** 4299 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4300 * 4301 * @adev: amdgpu_device pointer 4302 * 4303 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4304 * reset is necessary to recover. 4305 * Returns true if a full asic reset is required, false if not. 
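 *
 * This is consulted from amdgpu_device_pre_asic_reset() below: when it
 * returns true the soft-reset attempt is skipped and AMDGPU_NEED_FULL_RESET
 * is set in the reset context instead.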
4306 */ 4307 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4308 { 4309 int i; 4310 4311 if (amdgpu_asic_need_full_reset(adev)) 4312 return true; 4313 4314 for (i = 0; i < adev->num_ip_blocks; i++) { 4315 if (!adev->ip_blocks[i].status.valid) 4316 continue; 4317 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4318 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4319 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4320 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4321 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4322 if (adev->ip_blocks[i].status.hang) { 4323 dev_info(adev->dev, "Some block need full reset!\n"); 4324 return true; 4325 } 4326 } 4327 } 4328 return false; 4329 } 4330 4331 /** 4332 * amdgpu_device_ip_soft_reset - do a soft reset 4333 * 4334 * @adev: amdgpu_device pointer 4335 * 4336 * The list of all the hardware IPs that make up the asic is walked and the 4337 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4338 * IP specific hardware or software state changes that are necessary to soft 4339 * reset the IP. 4340 * Returns 0 on success, negative error code on failure. 4341 */ 4342 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4343 { 4344 int i, r = 0; 4345 4346 for (i = 0; i < adev->num_ip_blocks; i++) { 4347 if (!adev->ip_blocks[i].status.valid) 4348 continue; 4349 if (adev->ip_blocks[i].status.hang && 4350 adev->ip_blocks[i].version->funcs->soft_reset) { 4351 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4352 if (r) 4353 return r; 4354 } 4355 } 4356 4357 return 0; 4358 } 4359 4360 /** 4361 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4362 * 4363 * @adev: amdgpu_device pointer 4364 * 4365 * The list of all the hardware IPs that make up the asic is walked and the 4366 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4367 * handles any IP specific hardware or software state changes that are 4368 * necessary after the IP has been soft reset. 4369 * Returns 0 on success, negative error code on failure. 4370 */ 4371 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4372 { 4373 int i, r = 0; 4374 4375 for (i = 0; i < adev->num_ip_blocks; i++) { 4376 if (!adev->ip_blocks[i].status.valid) 4377 continue; 4378 if (adev->ip_blocks[i].status.hang && 4379 adev->ip_blocks[i].version->funcs->post_soft_reset) 4380 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4381 if (r) 4382 return r; 4383 } 4384 4385 return 0; 4386 } 4387 4388 /** 4389 * amdgpu_device_recover_vram - Recover some VRAM contents 4390 * 4391 * @adev: amdgpu_device pointer 4392 * 4393 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4394 * restore things like GPUVM page tables after a GPU reset where 4395 * the contents of VRAM might be lost. 4396 * 4397 * Returns: 4398 * 0 on success, negative error code on failure. 
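 *
 * In this file it is called at the tail of the reset paths, e.g. from
 * amdgpu_device_reset_sriov() when the hypervisor reports VRAM lost and
 * from amdgpu_do_asic_reset() after the IP blocks have been resumed.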
 */
static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
{
	struct dma_fence *fence = NULL, *next = NULL;
	struct amdgpu_bo *shadow;
	struct amdgpu_bo_vm *vmbo;
	long r = 1, tmo;

	if (amdgpu_sriov_runtime(adev))
		tmo = msecs_to_jiffies(8000);
	else
		tmo = msecs_to_jiffies(100);

	dev_info(adev->dev, "recover vram bo from shadow start\n");
	mutex_lock(&adev->shadow_list_lock);
	list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
		shadow = &vmbo->bo;
		/* No need to recover an evicted BO */
		if (shadow->tbo.resource->mem_type != TTM_PL_TT ||
		    shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
		    shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
			continue;

		r = amdgpu_bo_restore_shadow(shadow, &next);
		if (r)
			break;

		if (fence) {
			tmo = dma_fence_wait_timeout(fence, false, tmo);
			dma_fence_put(fence);
			fence = next;
			if (tmo == 0) {
				r = -ETIMEDOUT;
				break;
			} else if (tmo < 0) {
				r = tmo;
				break;
			}
		} else {
			fence = next;
		}
	}
	mutex_unlock(&adev->shadow_list_lock);

	if (fence)
		tmo = dma_fence_wait_timeout(fence, false, tmo);
	dma_fence_put(fence);

	if (r < 0 || tmo <= 0) {
		dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
		return -EIO;
	}

	dev_info(adev->dev, "recover vram bo from shadow done\n");
	return 0;
}


/**
 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
 *
 * @adev: amdgpu_device pointer
 * @from_hypervisor: request from hypervisor
 *
 * Do a VF FLR (function level reset) and reinitialize the ASIC.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
				     bool from_hypervisor)
{
	int r;
	struct amdgpu_hive_info *hive = NULL;
	int retry_limit = 0;

retry:
	amdgpu_amdkfd_pre_reset(adev);

	if (from_hypervisor)
		r = amdgpu_virt_request_full_gpu(adev, true);
	else
		r = amdgpu_virt_reset_gpu(adev);
	if (r)
		return r;

	/* Resume IP prior to SMC */
	r = amdgpu_device_ip_reinit_early_sriov(adev);
	if (r)
		goto error;

	amdgpu_virt_init_data_exchange(adev);

	r = amdgpu_device_fw_loading(adev);
	if (r)
		return r;

	/* now we are okay to resume SMC/CP/SDMA */
	r = amdgpu_device_ip_reinit_late_sriov(adev);
	if (r)
		goto error;

	hive = amdgpu_get_xgmi_hive(adev);
	/* Update PSP FW topology after reset */
	if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
		r = amdgpu_xgmi_update_topology(hive, adev);

	if (hive)
		amdgpu_put_xgmi_hive(hive);

	if (!r) {
		amdgpu_irq_gpu_reset_resume_helper(adev);
		r = amdgpu_ib_ring_tests(adev);

		amdgpu_amdkfd_post_reset(adev);
	}

error:
	if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
		amdgpu_inc_vram_lost(adev);
		r = amdgpu_device_recover_vram(adev);
	}
	amdgpu_virt_release_full_gpu(adev, true);

	if (AMDGPU_RETRY_SRIOV_RESET(r)) {
		if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) {
			retry_limit++;
			goto retry;
		} else
			DRM_ERROR("GPU reset retry is beyond the retry limit\n");
	}

	return r;
}

/**
 * amdgpu_device_has_job_running - check if
there is any job in mirror list 4536 * 4537 * @adev: amdgpu_device pointer 4538 * 4539 * check if there is any job in mirror list 4540 */ 4541 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4542 { 4543 int i; 4544 struct drm_sched_job *job; 4545 4546 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4547 struct amdgpu_ring *ring = adev->rings[i]; 4548 4549 if (!ring || !ring->sched.thread) 4550 continue; 4551 4552 spin_lock(&ring->sched.job_list_lock); 4553 job = list_first_entry_or_null(&ring->sched.pending_list, 4554 struct drm_sched_job, list); 4555 spin_unlock(&ring->sched.job_list_lock); 4556 if (job) 4557 return true; 4558 } 4559 return false; 4560 } 4561 4562 /** 4563 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4564 * 4565 * @adev: amdgpu_device pointer 4566 * 4567 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4568 * a hung GPU. 4569 */ 4570 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4571 { 4572 if (!amdgpu_device_ip_check_soft_reset(adev)) { 4573 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n"); 4574 return false; 4575 } 4576 4577 if (amdgpu_gpu_recovery == 0) 4578 goto disabled; 4579 4580 if (amdgpu_sriov_vf(adev)) 4581 return true; 4582 4583 if (amdgpu_gpu_recovery == -1) { 4584 switch (adev->asic_type) { 4585 #ifdef CONFIG_DRM_AMDGPU_SI 4586 case CHIP_VERDE: 4587 case CHIP_TAHITI: 4588 case CHIP_PITCAIRN: 4589 case CHIP_OLAND: 4590 case CHIP_HAINAN: 4591 #endif 4592 #ifdef CONFIG_DRM_AMDGPU_CIK 4593 case CHIP_KAVERI: 4594 case CHIP_KABINI: 4595 case CHIP_MULLINS: 4596 #endif 4597 case CHIP_CARRIZO: 4598 case CHIP_STONEY: 4599 case CHIP_CYAN_SKILLFISH: 4600 goto disabled; 4601 default: 4602 break; 4603 } 4604 } 4605 4606 return true; 4607 4608 disabled: 4609 dev_info(adev->dev, "GPU recovery disabled.\n"); 4610 return false; 4611 } 4612 4613 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4614 { 4615 u32 i; 4616 int ret = 0; 4617 4618 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4619 4620 dev_info(adev->dev, "GPU mode1 reset\n"); 4621 4622 /* disable BM */ 4623 pci_clear_master(adev->pdev); 4624 4625 amdgpu_device_cache_pci_state(adev->pdev); 4626 4627 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4628 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4629 ret = amdgpu_dpm_mode1_reset(adev); 4630 } else { 4631 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4632 ret = psp_gpu_reset(adev); 4633 } 4634 4635 if (ret) 4636 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4637 4638 amdgpu_device_load_pci_state(adev->pdev); 4639 4640 /* wait for asic to come out of reset */ 4641 for (i = 0; i < adev->usec_timeout; i++) { 4642 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4643 4644 if (memsize != 0xffffffff) 4645 break; 4646 udelay(1); 4647 } 4648 4649 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4650 return ret; 4651 } 4652 4653 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4654 struct amdgpu_reset_context *reset_context) 4655 { 4656 int i, r = 0; 4657 struct amdgpu_job *job = NULL; 4658 bool need_full_reset = 4659 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4660 4661 if (reset_context->reset_req_dev == adev) 4662 job = reset_context->job; 4663 4664 if (amdgpu_sriov_vf(adev)) { 4665 /* stop the data exchange thread */ 4666 amdgpu_virt_fini_data_exchange(adev); 4667 } 4668 4669 /* block all schedulers and reset given job's ring */ 4670 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4671 struct amdgpu_ring *ring = adev->rings[i]; 
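		/* Rings that were never initialized or that have no scheduler
		 * thread are skipped below; only live rings need their fences
		 * cleared and forced to completion.
		 */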
4672 4673 if (!ring || !ring->sched.thread) 4674 continue; 4675 4676 /*clear job fence from fence drv to avoid force_completion 4677 *leave NULL and vm flush fence in fence drv */ 4678 amdgpu_fence_driver_clear_job_fences(ring); 4679 4680 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4681 amdgpu_fence_driver_force_completion(ring); 4682 } 4683 4684 if (job && job->vm) 4685 drm_sched_increase_karma(&job->base); 4686 4687 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4688 /* If reset handler not implemented, continue; otherwise return */ 4689 if (r == -ENOSYS) 4690 r = 0; 4691 else 4692 return r; 4693 4694 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4695 if (!amdgpu_sriov_vf(adev)) { 4696 4697 if (!need_full_reset) 4698 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4699 4700 if (!need_full_reset) { 4701 amdgpu_device_ip_pre_soft_reset(adev); 4702 r = amdgpu_device_ip_soft_reset(adev); 4703 amdgpu_device_ip_post_soft_reset(adev); 4704 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4705 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4706 need_full_reset = true; 4707 } 4708 } 4709 4710 if (need_full_reset) 4711 r = amdgpu_device_ip_suspend(adev); 4712 if (need_full_reset) 4713 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4714 else 4715 clear_bit(AMDGPU_NEED_FULL_RESET, 4716 &reset_context->flags); 4717 } 4718 4719 return r; 4720 } 4721 4722 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 4723 { 4724 uint32_t reg_value; 4725 int i; 4726 4727 lockdep_assert_held(&adev->reset_domain->sem); 4728 dump_stack(); 4729 4730 for (i = 0; i < adev->num_regs; i++) { 4731 reg_value = RREG32(adev->reset_dump_reg_list[i]); 4732 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], reg_value); 4733 } 4734 4735 return 0; 4736 } 4737 4738 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4739 struct amdgpu_reset_context *reset_context) 4740 { 4741 struct amdgpu_device *tmp_adev = NULL; 4742 bool need_full_reset, skip_hw_reset, vram_lost = false; 4743 int r = 0; 4744 4745 /* Try reset handler method first */ 4746 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4747 reset_list); 4748 amdgpu_reset_reg_dumps(tmp_adev); 4749 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4750 /* If reset handler not implemented, continue; otherwise return */ 4751 if (r == -ENOSYS) 4752 r = 0; 4753 else 4754 return r; 4755 4756 /* Reset handler not implemented, use the default method */ 4757 need_full_reset = 4758 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4759 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4760 4761 /* 4762 * ASIC reset has to be done on all XGMI hive nodes ASAP 4763 * to allow proper links negotiation in FW (within 1 sec) 4764 */ 4765 if (!skip_hw_reset && need_full_reset) { 4766 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4767 /* For XGMI run all resets in parallel to speed up the process */ 4768 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4769 tmp_adev->gmc.xgmi.pending_reset = false; 4770 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4771 r = -EALREADY; 4772 } else 4773 r = amdgpu_asic_reset(tmp_adev); 4774 4775 if (r) { 4776 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4777 r, adev_to_drm(tmp_adev)->unique); 4778 break; 4779 } 4780 } 4781 4782 /* For XGMI wait for all resets to complete before proceed */ 4783 if 
(!r) { 4784 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4785 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4786 flush_work(&tmp_adev->xgmi_reset_work); 4787 r = tmp_adev->asic_reset_res; 4788 if (r) 4789 break; 4790 } 4791 } 4792 } 4793 } 4794 4795 if (!r && amdgpu_ras_intr_triggered()) { 4796 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4797 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && 4798 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 4799 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 4800 } 4801 4802 amdgpu_ras_intr_cleared(); 4803 } 4804 4805 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4806 if (need_full_reset) { 4807 /* post card */ 4808 r = amdgpu_device_asic_init(tmp_adev); 4809 if (r) { 4810 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4811 } else { 4812 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4813 r = amdgpu_amdkfd_resume_iommu(tmp_adev); 4814 if (r) 4815 goto out; 4816 4817 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4818 if (r) 4819 goto out; 4820 4821 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4822 if (vram_lost) { 4823 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4824 amdgpu_inc_vram_lost(tmp_adev); 4825 } 4826 4827 r = amdgpu_device_fw_loading(tmp_adev); 4828 if (r) 4829 return r; 4830 4831 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4832 if (r) 4833 goto out; 4834 4835 if (vram_lost) 4836 amdgpu_device_fill_reset_magic(tmp_adev); 4837 4838 /* 4839 * Add this ASIC as tracked as reset was already 4840 * complete successfully. 4841 */ 4842 amdgpu_register_gpu_instance(tmp_adev); 4843 4844 if (!reset_context->hive && 4845 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4846 amdgpu_xgmi_add_device(tmp_adev); 4847 4848 r = amdgpu_device_ip_late_init(tmp_adev); 4849 if (r) 4850 goto out; 4851 4852 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 4853 4854 /* 4855 * The GPU enters bad state once faulty pages 4856 * by ECC has reached the threshold, and ras 4857 * recovery is scheduled next. So add one check 4858 * here to break recovery if it indeed exceeds 4859 * bad page threshold, and remind user to 4860 * retire this GPU or setting one bigger 4861 * bad_page_threshold value to fix this once 4862 * probing driver again. 4863 */ 4864 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 4865 /* must succeed. 
*/ 4866 amdgpu_ras_resume(tmp_adev); 4867 } else { 4868 r = -EINVAL; 4869 goto out; 4870 } 4871 4872 /* Update PSP FW topology after reset */ 4873 if (reset_context->hive && 4874 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4875 r = amdgpu_xgmi_update_topology( 4876 reset_context->hive, tmp_adev); 4877 } 4878 } 4879 4880 out: 4881 if (!r) { 4882 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 4883 r = amdgpu_ib_ring_tests(tmp_adev); 4884 if (r) { 4885 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 4886 need_full_reset = true; 4887 r = -EAGAIN; 4888 goto end; 4889 } 4890 } 4891 4892 if (!r) 4893 r = amdgpu_device_recover_vram(tmp_adev); 4894 else 4895 tmp_adev->asic_reset_res = r; 4896 } 4897 4898 end: 4899 if (need_full_reset) 4900 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4901 else 4902 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4903 return r; 4904 } 4905 4906 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 4907 { 4908 4909 switch (amdgpu_asic_reset_method(adev)) { 4910 case AMD_RESET_METHOD_MODE1: 4911 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 4912 break; 4913 case AMD_RESET_METHOD_MODE2: 4914 adev->mp1_state = PP_MP1_STATE_RESET; 4915 break; 4916 default: 4917 adev->mp1_state = PP_MP1_STATE_NONE; 4918 break; 4919 } 4920 } 4921 4922 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 4923 { 4924 amdgpu_vf_error_trans_all(adev); 4925 adev->mp1_state = PP_MP1_STATE_NONE; 4926 } 4927 4928 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 4929 { 4930 struct pci_dev *p = NULL; 4931 4932 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4933 adev->pdev->bus->number, 1); 4934 if (p) { 4935 pm_runtime_enable(&(p->dev)); 4936 pm_runtime_resume(&(p->dev)); 4937 } 4938 } 4939 4940 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 4941 { 4942 enum amd_reset_method reset_method; 4943 struct pci_dev *p = NULL; 4944 u64 expires; 4945 4946 /* 4947 * For now, only BACO and mode1 reset are confirmed 4948 * to suffer the audio issue without proper suspended. 4949 */ 4950 reset_method = amdgpu_asic_reset_method(adev); 4951 if ((reset_method != AMD_RESET_METHOD_BACO) && 4952 (reset_method != AMD_RESET_METHOD_MODE1)) 4953 return -EINVAL; 4954 4955 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4956 adev->pdev->bus->number, 1); 4957 if (!p) 4958 return -ENODEV; 4959 4960 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 4961 if (!expires) 4962 /* 4963 * If we cannot get the audio device autosuspend delay, 4964 * a fixed 4S interval will be used. Considering 3S is 4965 * the audio controller default autosuspend delay setting. 4966 * 4S used here is guaranteed to cover that. 4967 */ 4968 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 4969 4970 while (!pm_runtime_status_suspended(&(p->dev))) { 4971 if (!pm_runtime_suspend(&(p->dev))) 4972 break; 4973 4974 if (expires < ktime_get_mono_fast_ns()) { 4975 dev_warn(adev->dev, "failed to suspend display audio\n"); 4976 /* TODO: abort the succeeding gpu reset? 
*/ 4977 return -ETIMEDOUT; 4978 } 4979 } 4980 4981 pm_runtime_disable(&(p->dev)); 4982 4983 return 0; 4984 } 4985 4986 static void amdgpu_device_recheck_guilty_jobs( 4987 struct amdgpu_device *adev, struct list_head *device_list_handle, 4988 struct amdgpu_reset_context *reset_context) 4989 { 4990 int i, r = 0; 4991 4992 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4993 struct amdgpu_ring *ring = adev->rings[i]; 4994 int ret = 0; 4995 struct drm_sched_job *s_job; 4996 4997 if (!ring || !ring->sched.thread) 4998 continue; 4999 5000 s_job = list_first_entry_or_null(&ring->sched.pending_list, 5001 struct drm_sched_job, list); 5002 if (s_job == NULL) 5003 continue; 5004 5005 /* clear job's guilty and depend the folowing step to decide the real one */ 5006 drm_sched_reset_karma(s_job); 5007 /* for the real bad job, it will be resubmitted twice, adding a dma_fence_get 5008 * to make sure fence is balanced */ 5009 dma_fence_get(s_job->s_fence->parent); 5010 drm_sched_resubmit_jobs_ext(&ring->sched, 1); 5011 5012 ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout); 5013 if (ret == 0) { /* timeout */ 5014 DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n", 5015 ring->sched.name, s_job->id); 5016 5017 /* set guilty */ 5018 drm_sched_increase_karma(s_job); 5019 retry: 5020 /* do hw reset */ 5021 if (amdgpu_sriov_vf(adev)) { 5022 amdgpu_virt_fini_data_exchange(adev); 5023 r = amdgpu_device_reset_sriov(adev, false); 5024 if (r) 5025 adev->asic_reset_res = r; 5026 } else { 5027 clear_bit(AMDGPU_SKIP_HW_RESET, 5028 &reset_context->flags); 5029 r = amdgpu_do_asic_reset(device_list_handle, 5030 reset_context); 5031 if (r && r == -EAGAIN) 5032 goto retry; 5033 } 5034 5035 /* 5036 * add reset counter so that the following 5037 * resubmitted job could flush vmid 5038 */ 5039 atomic_inc(&adev->gpu_reset_counter); 5040 continue; 5041 } 5042 5043 /* got the hw fence, signal finished fence */ 5044 atomic_dec(ring->sched.score); 5045 dma_fence_put(s_job->s_fence->parent); 5046 dma_fence_get(&s_job->s_fence->finished); 5047 dma_fence_signal(&s_job->s_fence->finished); 5048 dma_fence_put(&s_job->s_fence->finished); 5049 5050 /* remove node from list and free the job */ 5051 spin_lock(&ring->sched.job_list_lock); 5052 list_del_init(&s_job->list); 5053 spin_unlock(&ring->sched.job_list_lock); 5054 ring->sched.ops->free_job(s_job); 5055 } 5056 } 5057 5058 /** 5059 * amdgpu_device_gpu_recover_imp - reset the asic and recover scheduler 5060 * 5061 * @adev: amdgpu_device pointer 5062 * @job: which job trigger hang 5063 * 5064 * Attempt to reset the GPU if it has hung (all asics). 5065 * Attempt to do soft-reset or full-reset and reinitialize Asic 5066 * Returns 0 for success or an error on failure. 
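 *
 * Note: this is the implementation that runs on the reset domain's
 * single-threaded work queue; external callers are expected to go through
 * amdgpu_device_gpu_recover() below, which schedules this work and waits
 * for it to finish.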
5067 */ 5068 5069 int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev, 5070 struct amdgpu_job *job) 5071 { 5072 struct list_head device_list, *device_list_handle = NULL; 5073 bool job_signaled = false; 5074 struct amdgpu_hive_info *hive = NULL; 5075 struct amdgpu_device *tmp_adev = NULL; 5076 int i, r = 0; 5077 bool need_emergency_restart = false; 5078 bool audio_suspended = false; 5079 int tmp_vram_lost_counter; 5080 struct amdgpu_reset_context reset_context; 5081 5082 memset(&reset_context, 0, sizeof(reset_context)); 5083 5084 /* 5085 * Special case: RAS triggered and full reset isn't supported 5086 */ 5087 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5088 5089 /* 5090 * Flush RAM to disk so that after reboot 5091 * the user can read log and see why the system rebooted. 5092 */ 5093 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5094 DRM_WARN("Emergency reboot."); 5095 5096 ksys_sync_helper(); 5097 emergency_restart(); 5098 } 5099 5100 dev_info(adev->dev, "GPU %s begin!\n", 5101 need_emergency_restart ? "jobs stop":"reset"); 5102 5103 if (!amdgpu_sriov_vf(adev)) 5104 hive = amdgpu_get_xgmi_hive(adev); 5105 if (hive) 5106 mutex_lock(&hive->hive_lock); 5107 5108 reset_context.method = AMD_RESET_METHOD_NONE; 5109 reset_context.reset_req_dev = adev; 5110 reset_context.job = job; 5111 reset_context.hive = hive; 5112 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 5113 5114 /* 5115 * Build list of devices to reset. 5116 * In case we are in XGMI hive mode, resort the device list 5117 * to put adev in the 1st position. 5118 */ 5119 INIT_LIST_HEAD(&device_list); 5120 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5121 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) 5122 list_add_tail(&tmp_adev->reset_list, &device_list); 5123 if (!list_is_first(&adev->reset_list, &device_list)) 5124 list_rotate_to_front(&adev->reset_list, &device_list); 5125 device_list_handle = &device_list; 5126 } else { 5127 list_add_tail(&adev->reset_list, &device_list); 5128 device_list_handle = &device_list; 5129 } 5130 5131 /* We need to lock reset domain only once both for XGMI and single device */ 5132 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5133 reset_list); 5134 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5135 5136 /* block all schedulers and reset given job's ring */ 5137 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5138 5139 amdgpu_device_set_mp1_state(tmp_adev); 5140 5141 /* 5142 * Try to put the audio codec into suspend state 5143 * before gpu reset started. 5144 * 5145 * Due to the power domain of the graphics device 5146 * is shared with AZ power domain. Without this, 5147 * we may change the audio hardware from behind 5148 * the audio driver's back. That will trigger 5149 * some audio codec errors. 
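		 * The actual suspend/resume of the audio function is handled
		 * by amdgpu_device_suspend_display_audio() and
		 * amdgpu_device_resume_display_audio() above.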
5150 */ 5151 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5152 audio_suspended = true; 5153 5154 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5155 5156 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5157 5158 if (!amdgpu_sriov_vf(tmp_adev)) 5159 amdgpu_amdkfd_pre_reset(tmp_adev); 5160 5161 /* 5162 * Mark these ASICs to be reseted as untracked first 5163 * And add them back after reset completed 5164 */ 5165 amdgpu_unregister_gpu_instance(tmp_adev); 5166 5167 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 5168 5169 /* disable ras on ALL IPs */ 5170 if (!need_emergency_restart && 5171 amdgpu_device_ip_need_full_reset(tmp_adev)) 5172 amdgpu_ras_suspend(tmp_adev); 5173 5174 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5175 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5176 5177 if (!ring || !ring->sched.thread) 5178 continue; 5179 5180 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5181 5182 if (need_emergency_restart) 5183 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5184 } 5185 atomic_inc(&tmp_adev->gpu_reset_counter); 5186 } 5187 5188 if (need_emergency_restart) 5189 goto skip_sched_resume; 5190 5191 /* 5192 * Must check guilty signal here since after this point all old 5193 * HW fences are force signaled. 5194 * 5195 * job->base holds a reference to parent fence 5196 */ 5197 if (job && job->base.s_fence->parent && 5198 dma_fence_is_signaled(job->base.s_fence->parent)) { 5199 job_signaled = true; 5200 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5201 goto skip_hw_reset; 5202 } 5203 5204 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5205 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5206 r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context); 5207 /*TODO Should we stop ?*/ 5208 if (r) { 5209 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5210 r, adev_to_drm(tmp_adev)->unique); 5211 tmp_adev->asic_reset_res = r; 5212 } 5213 } 5214 5215 tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter)); 5216 /* Actual ASIC resets if needed.*/ 5217 /* Host driver will handle XGMI hive reset for SRIOV */ 5218 if (amdgpu_sriov_vf(adev)) { 5219 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5220 if (r) 5221 adev->asic_reset_res = r; 5222 } else { 5223 r = amdgpu_do_asic_reset(device_list_handle, &reset_context); 5224 if (r && r == -EAGAIN) 5225 goto retry; 5226 } 5227 5228 skip_hw_reset: 5229 5230 /* Post ASIC reset for all devs .*/ 5231 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5232 5233 /* 5234 * Sometimes a later bad compute job can block a good gfx job as gfx 5235 * and compute ring share internal GC HW mutually. We add an additional 5236 * guilty jobs recheck step to find the real guilty job, it synchronously 5237 * submits and pends for the first job being signaled. If it gets timeout, 5238 * we identify it as a real guilty job. 
5239 */ 5240 if (amdgpu_gpu_recovery == 2 && 5241 !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter))) 5242 amdgpu_device_recheck_guilty_jobs( 5243 tmp_adev, device_list_handle, &reset_context); 5244 5245 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5246 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5247 5248 if (!ring || !ring->sched.thread) 5249 continue; 5250 5251 /* No point to resubmit jobs if we didn't HW reset*/ 5252 if (!tmp_adev->asic_reset_res && !job_signaled) 5253 drm_sched_resubmit_jobs(&ring->sched); 5254 5255 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); 5256 } 5257 5258 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) { 5259 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5260 } 5261 5262 if (tmp_adev->asic_reset_res) 5263 r = tmp_adev->asic_reset_res; 5264 5265 tmp_adev->asic_reset_res = 0; 5266 5267 if (r) { 5268 /* bad news, how to tell it to userspace ? */ 5269 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5270 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5271 } else { 5272 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5273 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5274 DRM_WARN("smart shift update failed\n"); 5275 } 5276 } 5277 5278 skip_sched_resume: 5279 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5280 /* unlock kfd: SRIOV would do it separately */ 5281 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5282 amdgpu_amdkfd_post_reset(tmp_adev); 5283 5284 /* kfd_post_reset will do nothing if kfd device is not initialized, 5285 * need to bring up kfd here if it's not be initialized before 5286 */ 5287 if (!adev->kfd.init_complete) 5288 amdgpu_amdkfd_device_init(adev); 5289 5290 if (audio_suspended) 5291 amdgpu_device_resume_display_audio(tmp_adev); 5292 5293 amdgpu_device_unset_mp1_state(tmp_adev); 5294 } 5295 5296 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5297 reset_list); 5298 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5299 5300 if (hive) { 5301 mutex_unlock(&hive->hive_lock); 5302 amdgpu_put_xgmi_hive(hive); 5303 } 5304 5305 if (r) 5306 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5307 return r; 5308 } 5309 5310 struct amdgpu_recover_work_struct { 5311 struct work_struct base; 5312 struct amdgpu_device *adev; 5313 struct amdgpu_job *job; 5314 int ret; 5315 }; 5316 5317 static void amdgpu_device_queue_gpu_recover_work(struct work_struct *work) 5318 { 5319 struct amdgpu_recover_work_struct *recover_work = container_of(work, struct amdgpu_recover_work_struct, base); 5320 5321 recover_work->ret = amdgpu_device_gpu_recover_imp(recover_work->adev, recover_work->job); 5322 } 5323 /* 5324 * Serialize gpu recover into reset domain single threaded wq 5325 */ 5326 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5327 struct amdgpu_job *job) 5328 { 5329 struct amdgpu_recover_work_struct work = {.adev = adev, .job = job}; 5330 5331 INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work); 5332 5333 if (!amdgpu_reset_domain_schedule(adev->reset_domain, &work.base)) 5334 return -EAGAIN; 5335 5336 flush_work(&work.base); 5337 5338 return work.ret; 5339 } 5340 5341 /** 5342 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 5343 * 5344 * @adev: amdgpu_device pointer 5345 * 5346 * Fetchs and stores in the driver the PCIE capabilities (gen speed 5347 * and lanes) of 
the slot the device is in. Handles APUs and 5348 * virtualized environments where PCIE config space may not be available. 5349 */ 5350 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5351 { 5352 struct pci_dev *pdev; 5353 enum pci_bus_speed speed_cap, platform_speed_cap; 5354 enum pcie_link_width platform_link_width; 5355 5356 if (amdgpu_pcie_gen_cap) 5357 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5358 5359 if (amdgpu_pcie_lane_cap) 5360 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5361 5362 /* covers APUs as well */ 5363 if (pci_is_root_bus(adev->pdev->bus)) { 5364 if (adev->pm.pcie_gen_mask == 0) 5365 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5366 if (adev->pm.pcie_mlw_mask == 0) 5367 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5368 return; 5369 } 5370 5371 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5372 return; 5373 5374 pcie_bandwidth_available(adev->pdev, NULL, 5375 &platform_speed_cap, &platform_link_width); 5376 5377 if (adev->pm.pcie_gen_mask == 0) { 5378 /* asic caps */ 5379 pdev = adev->pdev; 5380 speed_cap = pcie_get_speed_cap(pdev); 5381 if (speed_cap == PCI_SPEED_UNKNOWN) { 5382 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5383 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5384 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5385 } else { 5386 if (speed_cap == PCIE_SPEED_32_0GT) 5387 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5388 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5389 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5390 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5391 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5392 else if (speed_cap == PCIE_SPEED_16_0GT) 5393 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5394 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5395 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5396 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5397 else if (speed_cap == PCIE_SPEED_8_0GT) 5398 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5399 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5400 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5401 else if (speed_cap == PCIE_SPEED_5_0GT) 5402 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5403 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5404 else 5405 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5406 } 5407 /* platform caps */ 5408 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5409 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5410 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5411 } else { 5412 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5413 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5414 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5415 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5416 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5417 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5418 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5419 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5420 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5421 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5422 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5423 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5424 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5425 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5426 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5427 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5428 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5429 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5430 else 5431 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5432 5433 } 5434 } 
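	/* Now derive the link-width mask from the platform capability
	 * reported by pcie_bandwidth_available() above.
	 */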
5435 if (adev->pm.pcie_mlw_mask == 0) { 5436 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 5437 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5438 } else { 5439 switch (platform_link_width) { 5440 case PCIE_LNK_X32: 5441 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5442 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5443 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5444 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5445 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5446 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5447 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5448 break; 5449 case PCIE_LNK_X16: 5450 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5451 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5452 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5453 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5454 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5455 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5456 break; 5457 case PCIE_LNK_X12: 5458 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5459 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5460 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5461 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5462 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5463 break; 5464 case PCIE_LNK_X8: 5465 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5466 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5467 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5468 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5469 break; 5470 case PCIE_LNK_X4: 5471 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5472 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5473 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5474 break; 5475 case PCIE_LNK_X2: 5476 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5477 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5478 break; 5479 case PCIE_LNK_X1: 5480 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5481 break; 5482 default: 5483 break; 5484 } 5485 } 5486 } 5487 } 5488 5489 int amdgpu_device_baco_enter(struct drm_device *dev) 5490 { 5491 struct amdgpu_device *adev = drm_to_adev(dev); 5492 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5493 5494 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5495 return -ENOTSUPP; 5496 5497 if (ras && adev->ras_enabled && 5498 adev->nbio.funcs->enable_doorbell_interrupt) 5499 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5500 5501 return amdgpu_dpm_baco_enter(adev); 5502 } 5503 5504 int amdgpu_device_baco_exit(struct drm_device *dev) 5505 { 5506 struct amdgpu_device *adev = drm_to_adev(dev); 5507 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5508 int ret = 0; 5509 5510 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5511 return -ENOTSUPP; 5512 5513 ret = amdgpu_dpm_baco_exit(adev); 5514 if (ret) 5515 return ret; 5516 5517 if (ras && adev->ras_enabled && 5518 adev->nbio.funcs->enable_doorbell_interrupt) 5519 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5520 5521 if (amdgpu_passthrough(adev) && 5522 adev->nbio.funcs->clear_doorbell_interrupt) 5523 adev->nbio.funcs->clear_doorbell_interrupt(adev); 5524 5525 return 0; 5526 } 5527 5528 /** 5529 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5530 * @pdev: PCI device struct 5531 * @state: PCI channel state 5532 * 5533 * Description: Called when a PCI error is detected. 5534 * 5535 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
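 *
 * This callback, together with amdgpu_pci_mmio_enabled(),
 * amdgpu_pci_slot_reset() and amdgpu_pci_resume() below, is presumably
 * wired into the driver's struct pci_error_handlers in the PCI driver
 * registration (amdgpu_drv.c), along these lines:
 *
 *	static const struct pci_error_handlers amdgpu_pci_err_handler = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};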
5536 */ 5537 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 5538 { 5539 struct drm_device *dev = pci_get_drvdata(pdev); 5540 struct amdgpu_device *adev = drm_to_adev(dev); 5541 int i; 5542 5543 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 5544 5545 if (adev->gmc.xgmi.num_physical_nodes > 1) { 5546 DRM_WARN("No support for XGMI hive yet..."); 5547 return PCI_ERS_RESULT_DISCONNECT; 5548 } 5549 5550 adev->pci_channel_state = state; 5551 5552 switch (state) { 5553 case pci_channel_io_normal: 5554 return PCI_ERS_RESULT_CAN_RECOVER; 5555 /* Fatal error, prepare for slot reset */ 5556 case pci_channel_io_frozen: 5557 /* 5558 * Locking adev->reset_domain->sem will prevent any external access 5559 * to GPU during PCI error recovery 5560 */ 5561 amdgpu_device_lock_reset_domain(adev->reset_domain); 5562 amdgpu_device_set_mp1_state(adev); 5563 5564 /* 5565 * Block any work scheduling as we do for regular GPU reset 5566 * for the duration of the recovery 5567 */ 5568 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5569 struct amdgpu_ring *ring = adev->rings[i]; 5570 5571 if (!ring || !ring->sched.thread) 5572 continue; 5573 5574 drm_sched_stop(&ring->sched, NULL); 5575 } 5576 atomic_inc(&adev->gpu_reset_counter); 5577 return PCI_ERS_RESULT_NEED_RESET; 5578 case pci_channel_io_perm_failure: 5579 /* Permanent error, prepare for device removal */ 5580 return PCI_ERS_RESULT_DISCONNECT; 5581 } 5582 5583 return PCI_ERS_RESULT_NEED_RESET; 5584 } 5585 5586 /** 5587 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 5588 * @pdev: pointer to PCI device 5589 */ 5590 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 5591 { 5592 5593 DRM_INFO("PCI error: mmio enabled callback!!\n"); 5594 5595 /* TODO - dump whatever for debugging purposes */ 5596 5597 /* This called only if amdgpu_pci_error_detected returns 5598 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 5599 * works, no need to reset slot. 5600 */ 5601 5602 return PCI_ERS_RESULT_RECOVERED; 5603 } 5604 5605 /** 5606 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 5607 * @pdev: PCI device struct 5608 * 5609 * Description: This routine is called by the pci error recovery 5610 * code after the PCI slot has been reset, just before we 5611 * should resume normal operations. 
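 *
 * For amdgpu this restores the cached PCI config space, waits for the
 * ASIC to become accessible again and then runs the regular full-reset
 * path with AMDGPU_SKIP_HW_RESET set, since the slot reset already reset
 * the hardware.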
5612 */ 5613 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 5614 { 5615 struct drm_device *dev = pci_get_drvdata(pdev); 5616 struct amdgpu_device *adev = drm_to_adev(dev); 5617 int r, i; 5618 struct amdgpu_reset_context reset_context; 5619 u32 memsize; 5620 struct list_head device_list; 5621 5622 DRM_INFO("PCI error: slot reset callback!!\n"); 5623 5624 memset(&reset_context, 0, sizeof(reset_context)); 5625 5626 INIT_LIST_HEAD(&device_list); 5627 list_add_tail(&adev->reset_list, &device_list); 5628 5629 /* wait for asic to come out of reset */ 5630 msleep(500); 5631 5632 /* Restore PCI confspace */ 5633 amdgpu_device_load_pci_state(pdev); 5634 5635 /* confirm ASIC came out of reset */ 5636 for (i = 0; i < adev->usec_timeout; i++) { 5637 memsize = amdgpu_asic_get_config_memsize(adev); 5638 5639 if (memsize != 0xffffffff) 5640 break; 5641 udelay(1); 5642 } 5643 if (memsize == 0xffffffff) { 5644 r = -ETIME; 5645 goto out; 5646 } 5647 5648 reset_context.method = AMD_RESET_METHOD_NONE; 5649 reset_context.reset_req_dev = adev; 5650 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 5651 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 5652 5653 adev->no_hw_access = true; 5654 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 5655 adev->no_hw_access = false; 5656 if (r) 5657 goto out; 5658 5659 r = amdgpu_do_asic_reset(&device_list, &reset_context); 5660 5661 out: 5662 if (!r) { 5663 if (amdgpu_device_cache_pci_state(adev->pdev)) 5664 pci_restore_state(adev->pdev); 5665 5666 DRM_INFO("PCIe error recovery succeeded\n"); 5667 } else { 5668 DRM_ERROR("PCIe error recovery failed, err:%d", r); 5669 amdgpu_device_unset_mp1_state(adev); 5670 amdgpu_device_unlock_reset_domain(adev->reset_domain); 5671 } 5672 5673 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 5674 } 5675 5676 /** 5677 * amdgpu_pci_resume() - resume normal ops after PCI reset 5678 * @pdev: pointer to PCI device 5679 * 5680 * Called when the error recovery driver tells us that its 5681 * OK to resume normal operation. 
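 *
 * For amdgpu this restarts the schedulers that were stopped in
 * amdgpu_pci_error_detected() and releases the reset domain lock, but
 * only if the channel state was pci_channel_io_frozen.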
5682 */ 5683 void amdgpu_pci_resume(struct pci_dev *pdev) 5684 { 5685 struct drm_device *dev = pci_get_drvdata(pdev); 5686 struct amdgpu_device *adev = drm_to_adev(dev); 5687 int i; 5688 5689 5690 DRM_INFO("PCI error: resume callback!!\n"); 5691 5692 /* Only continue execution for the case of pci_channel_io_frozen */ 5693 if (adev->pci_channel_state != pci_channel_io_frozen) 5694 return; 5695 5696 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5697 struct amdgpu_ring *ring = adev->rings[i]; 5698 5699 if (!ring || !ring->sched.thread) 5700 continue; 5701 5702 5703 drm_sched_resubmit_jobs(&ring->sched); 5704 drm_sched_start(&ring->sched, true); 5705 } 5706 5707 amdgpu_device_unset_mp1_state(adev); 5708 amdgpu_device_unlock_reset_domain(adev->reset_domain); 5709 } 5710 5711 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 5712 { 5713 struct drm_device *dev = pci_get_drvdata(pdev); 5714 struct amdgpu_device *adev = drm_to_adev(dev); 5715 int r; 5716 5717 r = pci_save_state(pdev); 5718 if (!r) { 5719 kfree(adev->pci_state); 5720 5721 adev->pci_state = pci_store_saved_state(pdev); 5722 5723 if (!adev->pci_state) { 5724 DRM_ERROR("Failed to store PCI saved state"); 5725 return false; 5726 } 5727 } else { 5728 DRM_WARN("Failed to save PCI state, err:%d\n", r); 5729 return false; 5730 } 5731 5732 return true; 5733 } 5734 5735 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 5736 { 5737 struct drm_device *dev = pci_get_drvdata(pdev); 5738 struct amdgpu_device *adev = drm_to_adev(dev); 5739 int r; 5740 5741 if (!adev->pci_state) 5742 return false; 5743 5744 r = pci_load_saved_state(pdev, adev->pci_state); 5745 5746 if (!r) { 5747 pci_restore_state(pdev); 5748 } else { 5749 DRM_WARN("Failed to load PCI state, err:%d\n", r); 5750 return false; 5751 } 5752 5753 return true; 5754 } 5755 5756 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 5757 struct amdgpu_ring *ring) 5758 { 5759 #ifdef CONFIG_X86_64 5760 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 5761 return; 5762 #endif 5763 if (adev->gmc.xgmi.connected_to_cpu) 5764 return; 5765 5766 if (ring && ring->funcs->emit_hdp_flush) 5767 amdgpu_ring_emit_hdp_flush(ring); 5768 else 5769 amdgpu_asic_flush_hdp(adev, ring); 5770 } 5771 5772 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 5773 struct amdgpu_ring *ring) 5774 { 5775 #ifdef CONFIG_X86_64 5776 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 5777 return; 5778 #endif 5779 if (adev->gmc.xgmi.connected_to_cpu) 5780 return; 5781 5782 amdgpu_asic_invalidate_hdp(adev, ring); 5783 } 5784 5785 int amdgpu_in_reset(struct amdgpu_device *adev) 5786 { 5787 return atomic_read(&adev->reset_domain->in_gpu_reset); 5788 } 5789 5790 /** 5791 * amdgpu_device_halt() - bring hardware to some kind of halt state 5792 * 5793 * @adev: amdgpu_device pointer 5794 * 5795 * Bring hardware to some kind of halt state so that no one can touch it 5796 * any more. It will help to maintain error context when error occurred. 5797 * Compare to a simple hang, the system will keep stable at least for SSH 5798 * access. Then it should be trivial to inspect the hardware state and 5799 * see what's going on. Implemented as following: 5800 * 5801 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc), 5802 * clears all CPU mappings to device, disallows remappings through page faults 5803 * 2. amdgpu_irq_disable_all() disables all interrupts 5804 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 5805 * 4. 
set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
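/*
 * Usage sketch for the PCIe port accessors above (illustrative only; the
 * register offset and bit layout below are hypothetical): the index/data
 * pair behind amdgpu_device_pcie_port_rreg()/_wreg() allows a simple
 * read-modify-write sequence such as
 *
 *	u32 val = amdgpu_device_pcie_port_rreg(adev, port_reg);
 *
 *	val &= ~some_mask;
 *	val |= some_bits;
 *	amdgpu_device_pcie_port_wreg(adev, port_reg, val);
 *
 * Both accessors take pcie_idx_lock internally, so each call is atomic with
 * respect to other port register accesses, but the sequence as a whole is
 * not.
 */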