1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 #include <linux/iommu.h> 34 #include <linux/pci.h> 35 36 #include <drm/drm_atomic_helper.h> 37 #include <drm/drm_probe_helper.h> 38 #include <drm/amdgpu_drm.h> 39 #include <linux/vgaarb.h> 40 #include <linux/vga_switcheroo.h> 41 #include <linux/efi.h> 42 #include "amdgpu.h" 43 #include "amdgpu_trace.h" 44 #include "amdgpu_i2c.h" 45 #include "atom.h" 46 #include "amdgpu_atombios.h" 47 #include "amdgpu_atomfirmware.h" 48 #include "amd_pcie.h" 49 #ifdef CONFIG_DRM_AMDGPU_SI 50 #include "si.h" 51 #endif 52 #ifdef CONFIG_DRM_AMDGPU_CIK 53 #include "cik.h" 54 #endif 55 #include "vi.h" 56 #include "soc15.h" 57 #include "nv.h" 58 #include "bif/bif_4_1_d.h" 59 #include <linux/firmware.h> 60 #include "amdgpu_vf_error.h" 61 62 #include "amdgpu_amdkfd.h" 63 #include "amdgpu_pm.h" 64 65 #include "amdgpu_xgmi.h" 66 #include "amdgpu_ras.h" 67 #include "amdgpu_pmu.h" 68 #include "amdgpu_fru_eeprom.h" 69 #include "amdgpu_reset.h" 70 71 #include <linux/suspend.h> 72 #include <drm/task_barrier.h> 73 #include <linux/pm_runtime.h> 74 75 #include <drm/drm_drv.h> 76 77 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 78 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 79 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 80 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 81 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 82 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 83 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 84 85 #define AMDGPU_RESUME_MS 2000 86 #define AMDGPU_MAX_RETRY_LIMIT 2 87 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 88 89 const char *amdgpu_asic_name[] = { 90 "TAHITI", 91 "PITCAIRN", 92 "VERDE", 93 "OLAND", 94 "HAINAN", 95 "BONAIRE", 96 "KAVERI", 97 "KABINI", 98 "HAWAII", 99 "MULLINS", 100 "TOPAZ", 101 "TONGA", 102 "FIJI", 103 "CARRIZO", 104 "STONEY", 105 "POLARIS10", 106 "POLARIS11", 107 "POLARIS12", 108 "VEGAM", 109 "VEGA10", 110 "VEGA12", 111 "VEGA20", 112 "RAVEN", 113 "ARCTURUS", 114 "RENOIR", 115 "ALDEBARAN", 116 "NAVI10", 117 "CYAN_SKILLFISH", 118 "NAVI14", 119 "NAVI12", 120 "SIENNA_CICHLID", 121 "NAVY_FLOUNDER", 122 
"VANGOGH", 123 "DIMGREY_CAVEFISH", 124 "BEIGE_GOBY", 125 "YELLOW_CARP", 126 "IP DISCOVERY", 127 "LAST", 128 }; 129 130 /** 131 * DOC: pcie_replay_count 132 * 133 * The amdgpu driver provides a sysfs API for reporting the total number 134 * of PCIe replays (NAKs) 135 * The file pcie_replay_count is used for this and returns the total 136 * number of replays as a sum of the NAKs generated and NAKs received 137 */ 138 139 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 140 struct device_attribute *attr, char *buf) 141 { 142 struct drm_device *ddev = dev_get_drvdata(dev); 143 struct amdgpu_device *adev = drm_to_adev(ddev); 144 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 145 146 return sysfs_emit(buf, "%llu\n", cnt); 147 } 148 149 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 150 amdgpu_device_get_pcie_replay_count, NULL); 151 152 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 153 154 /** 155 * DOC: product_name 156 * 157 * The amdgpu driver provides a sysfs API for reporting the product name 158 * for the device 159 * The file serial_number is used for this and returns the product name 160 * as returned from the FRU. 161 * NOTE: This is only available for certain server cards 162 */ 163 164 static ssize_t amdgpu_device_get_product_name(struct device *dev, 165 struct device_attribute *attr, char *buf) 166 { 167 struct drm_device *ddev = dev_get_drvdata(dev); 168 struct amdgpu_device *adev = drm_to_adev(ddev); 169 170 return sysfs_emit(buf, "%s\n", adev->product_name); 171 } 172 173 static DEVICE_ATTR(product_name, S_IRUGO, 174 amdgpu_device_get_product_name, NULL); 175 176 /** 177 * DOC: product_number 178 * 179 * The amdgpu driver provides a sysfs API for reporting the part number 180 * for the device 181 * The file serial_number is used for this and returns the part number 182 * as returned from the FRU. 183 * NOTE: This is only available for certain server cards 184 */ 185 186 static ssize_t amdgpu_device_get_product_number(struct device *dev, 187 struct device_attribute *attr, char *buf) 188 { 189 struct drm_device *ddev = dev_get_drvdata(dev); 190 struct amdgpu_device *adev = drm_to_adev(ddev); 191 192 return sysfs_emit(buf, "%s\n", adev->product_number); 193 } 194 195 static DEVICE_ATTR(product_number, S_IRUGO, 196 amdgpu_device_get_product_number, NULL); 197 198 /** 199 * DOC: serial_number 200 * 201 * The amdgpu driver provides a sysfs API for reporting the serial number 202 * for the device 203 * The file serial_number is used for this and returns the serial number 204 * as returned from the FRU. 205 * NOTE: This is only available for certain server cards 206 */ 207 208 static ssize_t amdgpu_device_get_serial_number(struct device *dev, 209 struct device_attribute *attr, char *buf) 210 { 211 struct drm_device *ddev = dev_get_drvdata(dev); 212 struct amdgpu_device *adev = drm_to_adev(ddev); 213 214 return sysfs_emit(buf, "%s\n", adev->serial); 215 } 216 217 static DEVICE_ATTR(serial_number, S_IRUGO, 218 amdgpu_device_get_serial_number, NULL); 219 220 /** 221 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 222 * 223 * @dev: drm_device pointer 224 * 225 * Returns true if the device is a dGPU with ATPX power control, 226 * otherwise return false. 
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise returns false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram via the vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
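 *
 * Example (illustrative sketch): reading one dword from the start of
 * visible VRAM through the aperture; a return value smaller than the
 * requested size means the caller still has to fall back to MM access
 * for the remainder:
 *
 *	uint32_t tmp;
 *	size_t copied = amdgpu_device_aper_access(adev, 0, &tmp,
 *						  sizeof(tmp), false);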
341 */ 342 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos, 343 void *buf, size_t size, bool write) 344 { 345 #ifdef CONFIG_64BIT 346 void __iomem *addr; 347 size_t count = 0; 348 uint64_t last; 349 350 if (!adev->mman.aper_base_kaddr) 351 return 0; 352 353 last = min(pos + size, adev->gmc.visible_vram_size); 354 if (last > pos) { 355 addr = adev->mman.aper_base_kaddr + pos; 356 count = last - pos; 357 358 if (write) { 359 memcpy_toio(addr, buf, count); 360 mb(); 361 amdgpu_device_flush_hdp(adev, NULL); 362 } else { 363 amdgpu_device_invalidate_hdp(adev, NULL); 364 mb(); 365 memcpy_fromio(buf, addr, count); 366 } 367 368 } 369 370 return count; 371 #else 372 return 0; 373 #endif 374 } 375 376 /** 377 * amdgpu_device_vram_access - read/write a buffer in vram 378 * 379 * @adev: amdgpu_device pointer 380 * @pos: offset of the buffer in vram 381 * @buf: virtual address of the buffer in system memory 382 * @size: read/write size, sizeof(@buf) must > @size 383 * @write: true - write to vram, otherwise - read from vram 384 */ 385 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, 386 void *buf, size_t size, bool write) 387 { 388 size_t count; 389 390 /* try to using vram apreature to access vram first */ 391 count = amdgpu_device_aper_access(adev, pos, buf, size, write); 392 size -= count; 393 if (size) { 394 /* using MM to access rest vram */ 395 pos += count; 396 buf += count; 397 amdgpu_device_mm_access(adev, pos, buf, size, write); 398 } 399 } 400 401 /* 402 * register access helper functions. 403 */ 404 405 /* Check if hw access should be skipped because of hotplug or device error */ 406 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev) 407 { 408 if (adev->no_hw_access) 409 return true; 410 411 #ifdef CONFIG_LOCKDEP 412 /* 413 * This is a bit complicated to understand, so worth a comment. What we assert 414 * here is that the GPU reset is not running on another thread in parallel. 415 * 416 * For this we trylock the read side of the reset semaphore, if that succeeds 417 * we know that the reset is not running in paralell. 418 * 419 * If the trylock fails we assert that we are either already holding the read 420 * side of the lock or are the reset thread itself and hold the write side of 421 * the lock. 422 */ 423 if (in_task()) { 424 if (down_read_trylock(&adev->reset_domain->sem)) 425 up_read(&adev->reset_domain->sem); 426 else 427 lockdep_assert_held(&adev->reset_domain->sem); 428 } 429 #endif 430 return false; 431 } 432 433 /** 434 * amdgpu_device_rreg - read a memory mapped IO or indirect register 435 * 436 * @adev: amdgpu_device pointer 437 * @reg: dword aligned register offset 438 * @acc_flags: access flags which require special behavior 439 * 440 * Returns the 32 bit value from the offset specified. 
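 *
 * Example (illustrative): most users go through the register access macros,
 * but a direct call looks like
 *
 *	uint32_t val = amdgpu_device_rreg(adev, reg_offset, 0);
 *
 * where reg_offset is a placeholder for a dword aligned register index and
 * an acc_flags of 0 requests the default behavior (AMDGPU_REGS_NO_KIQ would
 * skip the KIQ path under SR-IOV).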
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with byte offset helper function
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with byte offset helper function
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
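 *
 * Example (illustrative sketch): a read-modify-write sequence built on the
 * paired read helper above, with SOME_FEATURE_ENABLE_MASK standing in for
 * a real register field mask:
 *
 *	uint32_t tmp = amdgpu_device_rreg(adev, reg_offset, 0);
 *
 *	tmp |= SOME_FEATURE_ENABLE_MASK;
 *	amdgpu_device_wreg(adev, reg_offset, tmp, 0);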
527 */ 528 void amdgpu_device_wreg(struct amdgpu_device *adev, 529 uint32_t reg, uint32_t v, 530 uint32_t acc_flags) 531 { 532 if (amdgpu_device_skip_hw_access(adev)) 533 return; 534 535 if ((reg * 4) < adev->rmmio_size) { 536 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 537 amdgpu_sriov_runtime(adev) && 538 down_read_trylock(&adev->reset_domain->sem)) { 539 amdgpu_kiq_wreg(adev, reg, v); 540 up_read(&adev->reset_domain->sem); 541 } else { 542 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 543 } 544 } else { 545 adev->pcie_wreg(adev, reg * 4, v); 546 } 547 548 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 549 } 550 551 /** 552 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 553 * 554 * @adev: amdgpu_device pointer 555 * @reg: mmio/rlc register 556 * @v: value to write 557 * 558 * this function is invoked only for the debugfs register access 559 */ 560 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 561 uint32_t reg, uint32_t v) 562 { 563 if (amdgpu_device_skip_hw_access(adev)) 564 return; 565 566 if (amdgpu_sriov_fullaccess(adev) && 567 adev->gfx.rlc.funcs && 568 adev->gfx.rlc.funcs->is_rlcg_access_range) { 569 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 570 return amdgpu_sriov_wreg(adev, reg, v, 0, 0); 571 } else if ((reg * 4) >= adev->rmmio_size) { 572 adev->pcie_wreg(adev, reg * 4, v); 573 } else { 574 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 575 } 576 } 577 578 /** 579 * amdgpu_mm_rdoorbell - read a doorbell dword 580 * 581 * @adev: amdgpu_device pointer 582 * @index: doorbell index 583 * 584 * Returns the value in the doorbell aperture at the 585 * requested doorbell index (CIK). 586 */ 587 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) 588 { 589 if (amdgpu_device_skip_hw_access(adev)) 590 return 0; 591 592 if (index < adev->doorbell.num_doorbells) { 593 return readl(adev->doorbell.ptr + index); 594 } else { 595 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 596 return 0; 597 } 598 } 599 600 /** 601 * amdgpu_mm_wdoorbell - write a doorbell dword 602 * 603 * @adev: amdgpu_device pointer 604 * @index: doorbell index 605 * @v: value to write 606 * 607 * Writes @v to the doorbell aperture at the 608 * requested doorbell index (CIK). 609 */ 610 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) 611 { 612 if (amdgpu_device_skip_hw_access(adev)) 613 return; 614 615 if (index < adev->doorbell.num_doorbells) { 616 writel(v, adev->doorbell.ptr + index); 617 } else { 618 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 619 } 620 } 621 622 /** 623 * amdgpu_mm_rdoorbell64 - read a doorbell Qword 624 * 625 * @adev: amdgpu_device pointer 626 * @index: doorbell index 627 * 628 * Returns the value in the doorbell aperture at the 629 * requested doorbell index (VEGA10+). 630 */ 631 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) 632 { 633 if (amdgpu_device_skip_hw_access(adev)) 634 return 0; 635 636 if (index < adev->doorbell.num_doorbells) { 637 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); 638 } else { 639 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 640 return 0; 641 } 642 } 643 644 /** 645 * amdgpu_mm_wdoorbell64 - write a doorbell Qword 646 * 647 * @adev: amdgpu_device pointer 648 * @index: doorbell index 649 * @v: value to write 650 * 651 * Writes @v to the doorbell aperture at the 652 * requested doorbell index (VEGA10+). 
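 *
 * Example (illustrative): ring code typically rings a doorbell by writing
 * its current write pointer through this helper, roughly
 *
 *	amdgpu_mm_wdoorbell64(adev, ring_doorbell_index, wptr);
 *
 * where ring_doorbell_index and wptr are placeholders for a ring's
 * assigned doorbell slot and its 64 bit write pointer.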
653 */ 654 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) 655 { 656 if (amdgpu_device_skip_hw_access(adev)) 657 return; 658 659 if (index < adev->doorbell.num_doorbells) { 660 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); 661 } else { 662 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 663 } 664 } 665 666 /** 667 * amdgpu_device_indirect_rreg - read an indirect register 668 * 669 * @adev: amdgpu_device pointer 670 * @pcie_index: mmio register offset 671 * @pcie_data: mmio register offset 672 * @reg_addr: indirect register address to read from 673 * 674 * Returns the value of indirect register @reg_addr 675 */ 676 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 677 u32 pcie_index, u32 pcie_data, 678 u32 reg_addr) 679 { 680 unsigned long flags; 681 u32 r; 682 void __iomem *pcie_index_offset; 683 void __iomem *pcie_data_offset; 684 685 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 686 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 687 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 688 689 writel(reg_addr, pcie_index_offset); 690 readl(pcie_index_offset); 691 r = readl(pcie_data_offset); 692 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 693 694 return r; 695 } 696 697 /** 698 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 699 * 700 * @adev: amdgpu_device pointer 701 * @pcie_index: mmio register offset 702 * @pcie_data: mmio register offset 703 * @reg_addr: indirect register address to read from 704 * 705 * Returns the value of indirect register @reg_addr 706 */ 707 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 708 u32 pcie_index, u32 pcie_data, 709 u32 reg_addr) 710 { 711 unsigned long flags; 712 u64 r; 713 void __iomem *pcie_index_offset; 714 void __iomem *pcie_data_offset; 715 716 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 717 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 718 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 719 720 /* read low 32 bits */ 721 writel(reg_addr, pcie_index_offset); 722 readl(pcie_index_offset); 723 r = readl(pcie_data_offset); 724 /* read high 32 bits */ 725 writel(reg_addr + 4, pcie_index_offset); 726 readl(pcie_index_offset); 727 r |= ((u64)readl(pcie_data_offset) << 32); 728 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 729 730 return r; 731 } 732 733 /** 734 * amdgpu_device_indirect_wreg - write an indirect register address 735 * 736 * @adev: amdgpu_device pointer 737 * @pcie_index: mmio register offset 738 * @pcie_data: mmio register offset 739 * @reg_addr: indirect register offset 740 * @reg_data: indirect register data 741 * 742 */ 743 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 744 u32 pcie_index, u32 pcie_data, 745 u32 reg_addr, u32 reg_data) 746 { 747 unsigned long flags; 748 void __iomem *pcie_index_offset; 749 void __iomem *pcie_data_offset; 750 751 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 752 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 753 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 754 755 writel(reg_addr, pcie_index_offset); 756 readl(pcie_index_offset); 757 writel(reg_data, pcie_data_offset); 758 readl(pcie_data_offset); 759 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 760 } 761 762 /** 763 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 764 * 765 * @adev: amdgpu_device pointer 766 * @pcie_index: mmio register offset 767 * @pcie_data: mmio register 
offset 768 * @reg_addr: indirect register offset 769 * @reg_data: indirect register data 770 * 771 */ 772 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 773 u32 pcie_index, u32 pcie_data, 774 u32 reg_addr, u64 reg_data) 775 { 776 unsigned long flags; 777 void __iomem *pcie_index_offset; 778 void __iomem *pcie_data_offset; 779 780 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 781 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 782 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 783 784 /* write low 32 bits */ 785 writel(reg_addr, pcie_index_offset); 786 readl(pcie_index_offset); 787 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 788 readl(pcie_data_offset); 789 /* write high 32 bits */ 790 writel(reg_addr + 4, pcie_index_offset); 791 readl(pcie_index_offset); 792 writel((u32)(reg_data >> 32), pcie_data_offset); 793 readl(pcie_data_offset); 794 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 795 } 796 797 /** 798 * amdgpu_invalid_rreg - dummy reg read function 799 * 800 * @adev: amdgpu_device pointer 801 * @reg: offset of register 802 * 803 * Dummy register read function. Used for register blocks 804 * that certain asics don't have (all asics). 805 * Returns the value in the register. 806 */ 807 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 808 { 809 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 810 BUG(); 811 return 0; 812 } 813 814 /** 815 * amdgpu_invalid_wreg - dummy reg write function 816 * 817 * @adev: amdgpu_device pointer 818 * @reg: offset of register 819 * @v: value to write to the register 820 * 821 * Dummy register read function. Used for register blocks 822 * that certain asics don't have (all asics). 823 */ 824 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 825 { 826 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 827 reg, v); 828 BUG(); 829 } 830 831 /** 832 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 833 * 834 * @adev: amdgpu_device pointer 835 * @reg: offset of register 836 * 837 * Dummy register read function. Used for register blocks 838 * that certain asics don't have (all asics). 839 * Returns the value in the register. 840 */ 841 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 842 { 843 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 844 BUG(); 845 return 0; 846 } 847 848 /** 849 * amdgpu_invalid_wreg64 - dummy reg write function 850 * 851 * @adev: amdgpu_device pointer 852 * @reg: offset of register 853 * @v: value to write to the register 854 * 855 * Dummy register read function. Used for register blocks 856 * that certain asics don't have (all asics). 857 */ 858 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 859 { 860 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 861 reg, v); 862 BUG(); 863 } 864 865 /** 866 * amdgpu_block_invalid_rreg - dummy reg read function 867 * 868 * @adev: amdgpu_device pointer 869 * @block: offset of instance 870 * @reg: offset of register 871 * 872 * Dummy register read function. Used for register blocks 873 * that certain asics don't have (all asics). 874 * Returns the value in the register. 
875 */ 876 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 877 uint32_t block, uint32_t reg) 878 { 879 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 880 reg, block); 881 BUG(); 882 return 0; 883 } 884 885 /** 886 * amdgpu_block_invalid_wreg - dummy reg write function 887 * 888 * @adev: amdgpu_device pointer 889 * @block: offset of instance 890 * @reg: offset of register 891 * @v: value to write to the register 892 * 893 * Dummy register read function. Used for register blocks 894 * that certain asics don't have (all asics). 895 */ 896 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 897 uint32_t block, 898 uint32_t reg, uint32_t v) 899 { 900 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 901 reg, block, v); 902 BUG(); 903 } 904 905 /** 906 * amdgpu_device_asic_init - Wrapper for atom asic_init 907 * 908 * @adev: amdgpu_device pointer 909 * 910 * Does any asic specific work and then calls atom asic init. 911 */ 912 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 913 { 914 amdgpu_asic_pre_asic_init(adev); 915 916 if (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) 917 return amdgpu_atomfirmware_asic_init(adev, true); 918 else 919 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 920 } 921 922 /** 923 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page 924 * 925 * @adev: amdgpu_device pointer 926 * 927 * Allocates a scratch page of VRAM for use by various things in the 928 * driver. 929 */ 930 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev) 931 { 932 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, 933 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM, 934 &adev->vram_scratch.robj, 935 &adev->vram_scratch.gpu_addr, 936 (void **)&adev->vram_scratch.ptr); 937 } 938 939 /** 940 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page 941 * 942 * @adev: amdgpu_device pointer 943 * 944 * Frees the VRAM scratch page. 945 */ 946 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev) 947 { 948 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL); 949 } 950 951 /** 952 * amdgpu_device_program_register_sequence - program an array of registers. 953 * 954 * @adev: amdgpu_device pointer 955 * @registers: pointer to the register array 956 * @array_size: size of the register array 957 * 958 * Programs an array or registers with and and or masks. 959 * This is a helper for setting golden registers. 960 */ 961 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 962 const u32 *registers, 963 const u32 array_size) 964 { 965 u32 tmp, reg, and_mask, or_mask; 966 int i; 967 968 if (array_size % 3) 969 return; 970 971 for (i = 0; i < array_size; i +=3) { 972 reg = registers[i + 0]; 973 and_mask = registers[i + 1]; 974 or_mask = registers[i + 2]; 975 976 if (and_mask == 0xffffffff) { 977 tmp = or_mask; 978 } else { 979 tmp = RREG32(reg); 980 tmp &= ~and_mask; 981 if (adev->family >= AMDGPU_FAMILY_AI) 982 tmp |= (or_mask & and_mask); 983 else 984 tmp |= or_mask; 985 } 986 WREG32(reg, tmp); 987 } 988 } 989 990 /** 991 * amdgpu_device_pci_config_reset - reset the GPU 992 * 993 * @adev: amdgpu_device pointer 994 * 995 * Resets the GPU using the pci config reset sequence. 996 * Only applicable to asics prior to vega10. 
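 *
 * Example (illustrative, for amdgpu_device_program_register_sequence()
 * above): golden settings are laid out as {offset, and_mask, or_mask}
 * triples, e.g. with a hypothetical register mmSOME_REG
 *
 *	static const u32 golden_settings_example[] = {
 *		mmSOME_REG, 0xffffffff, 0x00000001,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, golden_settings_example,
 *						ARRAY_SIZE(golden_settings_example));
 *
 * writes 0x00000001 to mmSOME_REG outright, because an and_mask of
 * 0xffffffff means the whole register value is replaced.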
997 */ 998 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 999 { 1000 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 1001 } 1002 1003 /** 1004 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 1005 * 1006 * @adev: amdgpu_device pointer 1007 * 1008 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 1009 */ 1010 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1011 { 1012 return pci_reset_function(adev->pdev); 1013 } 1014 1015 /* 1016 * GPU doorbell aperture helpers function. 1017 */ 1018 /** 1019 * amdgpu_device_doorbell_init - Init doorbell driver information. 1020 * 1021 * @adev: amdgpu_device pointer 1022 * 1023 * Init doorbell driver information (CIK) 1024 * Returns 0 on success, error on failure. 1025 */ 1026 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) 1027 { 1028 1029 /* No doorbell on SI hardware generation */ 1030 if (adev->asic_type < CHIP_BONAIRE) { 1031 adev->doorbell.base = 0; 1032 adev->doorbell.size = 0; 1033 adev->doorbell.num_doorbells = 0; 1034 adev->doorbell.ptr = NULL; 1035 return 0; 1036 } 1037 1038 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) 1039 return -EINVAL; 1040 1041 amdgpu_asic_init_doorbell_index(adev); 1042 1043 /* doorbell bar mapping */ 1044 adev->doorbell.base = pci_resource_start(adev->pdev, 2); 1045 adev->doorbell.size = pci_resource_len(adev->pdev, 2); 1046 1047 if (adev->enable_mes) { 1048 adev->doorbell.num_doorbells = 1049 adev->doorbell.size / sizeof(u32); 1050 } else { 1051 adev->doorbell.num_doorbells = 1052 min_t(u32, adev->doorbell.size / sizeof(u32), 1053 adev->doorbell_index.max_assignment+1); 1054 if (adev->doorbell.num_doorbells == 0) 1055 return -EINVAL; 1056 1057 /* For Vega, reserve and map two pages on doorbell BAR since SDMA 1058 * paging queue doorbell use the second page. The 1059 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the 1060 * doorbells are in the first page. So with paging queue enabled, 1061 * the max num_doorbells should + 1 page (0x400 in dword) 1062 */ 1063 if (adev->asic_type >= CHIP_VEGA10) 1064 adev->doorbell.num_doorbells += 0x400; 1065 } 1066 1067 adev->doorbell.ptr = ioremap(adev->doorbell.base, 1068 adev->doorbell.num_doorbells * 1069 sizeof(u32)); 1070 if (adev->doorbell.ptr == NULL) 1071 return -ENOMEM; 1072 1073 return 0; 1074 } 1075 1076 /** 1077 * amdgpu_device_doorbell_fini - Tear down doorbell driver information. 1078 * 1079 * @adev: amdgpu_device pointer 1080 * 1081 * Tear down doorbell driver information (CIK) 1082 */ 1083 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) 1084 { 1085 iounmap(adev->doorbell.ptr); 1086 adev->doorbell.ptr = NULL; 1087 } 1088 1089 1090 1091 /* 1092 * amdgpu_device_wb_*() 1093 * Writeback is the method by which the GPU updates special pages in memory 1094 * with the status of certain GPU events (fences, ring pointers,etc.). 1095 */ 1096 1097 /** 1098 * amdgpu_device_wb_fini - Disable Writeback and free memory 1099 * 1100 * @adev: amdgpu_device pointer 1101 * 1102 * Disables Writeback and frees the Writeback memory (all asics). 1103 * Used at driver shutdown. 
1104 */ 1105 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1106 { 1107 if (adev->wb.wb_obj) { 1108 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1109 &adev->wb.gpu_addr, 1110 (void **)&adev->wb.wb); 1111 adev->wb.wb_obj = NULL; 1112 } 1113 } 1114 1115 /** 1116 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1117 * 1118 * @adev: amdgpu_device pointer 1119 * 1120 * Initializes writeback and allocates writeback memory (all asics). 1121 * Used at driver startup. 1122 * Returns 0 on success or an -error on failure. 1123 */ 1124 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1125 { 1126 int r; 1127 1128 if (adev->wb.wb_obj == NULL) { 1129 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1130 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1131 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1132 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1133 (void **)&adev->wb.wb); 1134 if (r) { 1135 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1136 return r; 1137 } 1138 1139 adev->wb.num_wb = AMDGPU_MAX_WB; 1140 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1141 1142 /* clear wb memory */ 1143 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1144 } 1145 1146 return 0; 1147 } 1148 1149 /** 1150 * amdgpu_device_wb_get - Allocate a wb entry 1151 * 1152 * @adev: amdgpu_device pointer 1153 * @wb: wb index 1154 * 1155 * Allocate a wb slot for use by the driver (all asics). 1156 * Returns 0 on success or -EINVAL on failure. 1157 */ 1158 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1159 { 1160 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1161 1162 if (offset < adev->wb.num_wb) { 1163 __set_bit(offset, adev->wb.used); 1164 *wb = offset << 3; /* convert to dw offset */ 1165 return 0; 1166 } else { 1167 return -EINVAL; 1168 } 1169 } 1170 1171 /** 1172 * amdgpu_device_wb_free - Free a wb entry 1173 * 1174 * @adev: amdgpu_device pointer 1175 * @wb: wb index 1176 * 1177 * Free a wb slot allocated for use by the driver (all asics) 1178 */ 1179 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1180 { 1181 wb >>= 3; 1182 if (wb < adev->wb.num_wb) 1183 __clear_bit(wb, adev->wb.used); 1184 } 1185 1186 /** 1187 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1188 * 1189 * @adev: amdgpu_device pointer 1190 * 1191 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1192 * to fail, but if any of the BARs is not accessible after the size we abort 1193 * driver loading by returning -ENODEV. 
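 *
 * Example (illustrative, for the writeback helpers above): a typical user
 * allocates a slot, derives its GPU address, and frees it when done:
 *
 *	u32 wb;
 *
 *	if (!amdgpu_device_wb_get(adev, &wb)) {
 *		u64 wb_gpu_addr = adev->wb.gpu_addr + wb * 4;
 *
 *		use_writeback_slot(wb_gpu_addr);	(hypothetical consumer)
 *		amdgpu_device_wb_free(adev, wb);
 *	}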
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned i;
	u16 cmd;
	int r;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_device_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_device_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw needs to be posted or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup,
 * or if posting is needed because a hw reset was performed.
 * Returns true if posting is needed, false if not.
1275 */ 1276 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1277 { 1278 uint32_t reg; 1279 1280 if (amdgpu_sriov_vf(adev)) 1281 return false; 1282 1283 if (amdgpu_passthrough(adev)) { 1284 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1285 * some old smc fw still need driver do vPost otherwise gpu hang, while 1286 * those smc fw version above 22.15 doesn't have this flaw, so we force 1287 * vpost executed for smc version below 22.15 1288 */ 1289 if (adev->asic_type == CHIP_FIJI) { 1290 int err; 1291 uint32_t fw_ver; 1292 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1293 /* force vPost if error occured */ 1294 if (err) 1295 return true; 1296 1297 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1298 if (fw_ver < 0x00160e00) 1299 return true; 1300 } 1301 } 1302 1303 /* Don't post if we need to reset whole hive on init */ 1304 if (adev->gmc.xgmi.pending_reset) 1305 return false; 1306 1307 if (adev->has_hw_reset) { 1308 adev->has_hw_reset = false; 1309 return true; 1310 } 1311 1312 /* bios scratch used on CIK+ */ 1313 if (adev->asic_type >= CHIP_BONAIRE) 1314 return amdgpu_atombios_scratch_need_asic_init(adev); 1315 1316 /* check MEM_SIZE for older asics */ 1317 reg = amdgpu_asic_get_config_memsize(adev); 1318 1319 if ((reg != 0) && (reg != 0xffffffff)) 1320 return false; 1321 1322 return true; 1323 } 1324 1325 /** 1326 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1327 * 1328 * @adev: amdgpu_device pointer 1329 * 1330 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1331 * be set for this device. 1332 * 1333 * Returns true if it should be used or false if not. 1334 */ 1335 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1336 { 1337 switch (amdgpu_aspm) { 1338 case -1: 1339 break; 1340 case 0: 1341 return false; 1342 case 1: 1343 return true; 1344 default: 1345 return false; 1346 } 1347 return pcie_aspm_enabled(adev->pdev); 1348 } 1349 1350 /* if we get transitioned to only one device, take VGA back */ 1351 /** 1352 * amdgpu_device_vga_set_decode - enable/disable vga decode 1353 * 1354 * @pdev: PCI device pointer 1355 * @state: enable/disable vga decode 1356 * 1357 * Enable/disable vga decode (all asics). 1358 * Returns VGA resource flags. 1359 */ 1360 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1361 bool state) 1362 { 1363 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1364 amdgpu_asic_set_vga_state(adev, state); 1365 if (state) 1366 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1367 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1368 else 1369 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1370 } 1371 1372 /** 1373 * amdgpu_device_check_block_size - validate the vm block size 1374 * 1375 * @adev: amdgpu_device pointer 1376 * 1377 * Validates the vm block size specified via module parameter. 1378 * The vm block size defines number of bits in page table versus page directory, 1379 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1380 * page table and the remaining bits are in the page directory. 
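 *
 * Example: with the minimum amdgpu_vm_block_size of 9 and 4KB pages, one
 * page table spans 2^(12 + 9) bytes = 2MB of address space; larger values
 * simply move more translation bits from the page directory into the page
 * table.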
1381 */ 1382 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1383 { 1384 /* defines number of bits in page table versus page directory, 1385 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1386 * page table and the remaining bits are in the page directory */ 1387 if (amdgpu_vm_block_size == -1) 1388 return; 1389 1390 if (amdgpu_vm_block_size < 9) { 1391 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1392 amdgpu_vm_block_size); 1393 amdgpu_vm_block_size = -1; 1394 } 1395 } 1396 1397 /** 1398 * amdgpu_device_check_vm_size - validate the vm size 1399 * 1400 * @adev: amdgpu_device pointer 1401 * 1402 * Validates the vm size in GB specified via module parameter. 1403 * The VM size is the size of the GPU virtual memory space in GB. 1404 */ 1405 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1406 { 1407 /* no need to check the default value */ 1408 if (amdgpu_vm_size == -1) 1409 return; 1410 1411 if (amdgpu_vm_size < 1) { 1412 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1413 amdgpu_vm_size); 1414 amdgpu_vm_size = -1; 1415 } 1416 } 1417 1418 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1419 { 1420 struct sysinfo si; 1421 bool is_os_64 = (sizeof(void *) == 8); 1422 uint64_t total_memory; 1423 uint64_t dram_size_seven_GB = 0x1B8000000; 1424 uint64_t dram_size_three_GB = 0xB8000000; 1425 1426 if (amdgpu_smu_memory_pool_size == 0) 1427 return; 1428 1429 if (!is_os_64) { 1430 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1431 goto def_value; 1432 } 1433 si_meminfo(&si); 1434 total_memory = (uint64_t)si.totalram * si.mem_unit; 1435 1436 if ((amdgpu_smu_memory_pool_size == 1) || 1437 (amdgpu_smu_memory_pool_size == 2)) { 1438 if (total_memory < dram_size_three_GB) 1439 goto def_value1; 1440 } else if ((amdgpu_smu_memory_pool_size == 4) || 1441 (amdgpu_smu_memory_pool_size == 8)) { 1442 if (total_memory < dram_size_seven_GB) 1443 goto def_value1; 1444 } else { 1445 DRM_WARN("Smu memory pool size not supported\n"); 1446 goto def_value; 1447 } 1448 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1449 1450 return; 1451 1452 def_value1: 1453 DRM_WARN("No enough system memory\n"); 1454 def_value: 1455 adev->pm.smu_prv_buffer_size = 0; 1456 } 1457 1458 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1459 { 1460 if (!(adev->flags & AMD_IS_APU) || 1461 adev->asic_type < CHIP_RAVEN) 1462 return 0; 1463 1464 switch (adev->asic_type) { 1465 case CHIP_RAVEN: 1466 if (adev->pdev->device == 0x15dd) 1467 adev->apu_flags |= AMD_APU_IS_RAVEN; 1468 if (adev->pdev->device == 0x15d8) 1469 adev->apu_flags |= AMD_APU_IS_PICASSO; 1470 break; 1471 case CHIP_RENOIR: 1472 if ((adev->pdev->device == 0x1636) || 1473 (adev->pdev->device == 0x164c)) 1474 adev->apu_flags |= AMD_APU_IS_RENOIR; 1475 else 1476 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1477 break; 1478 case CHIP_VANGOGH: 1479 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1480 break; 1481 case CHIP_YELLOW_CARP: 1482 break; 1483 case CHIP_CYAN_SKILLFISH: 1484 if ((adev->pdev->device == 0x13FE) || 1485 (adev->pdev->device == 0x143F)) 1486 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1487 break; 1488 default: 1489 break; 1490 } 1491 1492 return 0; 1493 } 1494 1495 /** 1496 * amdgpu_device_check_arguments - validate module params 1497 * 1498 * @adev: amdgpu_device pointer 1499 * 1500 * Validates certain module parameters and updates 1501 * the associated values used by the driver (all asics). 
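 *
 * Example (for amdgpu_device_check_smu_prv_buffer_size() above): the
 * amdgpu_smu_memory_pool_size parameter is expressed in units of 256MB,
 * so a value of 2 results in a 2 << 28 byte = 512MB pool, and the larger
 * values of 4 or 8 additionally require roughly 7GB of system memory.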
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
		dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
		amdgpu_reset_method = -1;
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	return 0;
}

/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes the
 * asic before or after it is powered up using ACPI methods.
1570 */ 1571 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1572 enum vga_switcheroo_state state) 1573 { 1574 struct drm_device *dev = pci_get_drvdata(pdev); 1575 int r; 1576 1577 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1578 return; 1579 1580 if (state == VGA_SWITCHEROO_ON) { 1581 pr_info("switched on\n"); 1582 /* don't suspend or resume card normally */ 1583 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1584 1585 pci_set_power_state(pdev, PCI_D0); 1586 amdgpu_device_load_pci_state(pdev); 1587 r = pci_enable_device(pdev); 1588 if (r) 1589 DRM_WARN("pci_enable_device failed (%d)\n", r); 1590 amdgpu_device_resume(dev, true); 1591 1592 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1593 } else { 1594 pr_info("switched off\n"); 1595 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1596 amdgpu_device_suspend(dev, true); 1597 amdgpu_device_cache_pci_state(pdev); 1598 /* Shut down the device */ 1599 pci_disable_device(pdev); 1600 pci_set_power_state(pdev, PCI_D3cold); 1601 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1602 } 1603 } 1604 1605 /** 1606 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1607 * 1608 * @pdev: pci dev pointer 1609 * 1610 * Callback for the switcheroo driver. Check of the switcheroo 1611 * state can be changed. 1612 * Returns true if the state can be changed, false if not. 1613 */ 1614 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1615 { 1616 struct drm_device *dev = pci_get_drvdata(pdev); 1617 1618 /* 1619 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1620 * locking inversion with the driver load path. And the access here is 1621 * completely racy anyway. So don't bother with locking for now. 1622 */ 1623 return atomic_read(&dev->open_count) == 0; 1624 } 1625 1626 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1627 .set_gpu_state = amdgpu_switcheroo_set_state, 1628 .reprobe = NULL, 1629 .can_switch = amdgpu_switcheroo_can_switch, 1630 }; 1631 1632 /** 1633 * amdgpu_device_ip_set_clockgating_state - set the CG state 1634 * 1635 * @dev: amdgpu_device pointer 1636 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1637 * @state: clockgating state (gate or ungate) 1638 * 1639 * Sets the requested clockgating state for all instances of 1640 * the hardware IP specified. 1641 * Returns the error code from the last instance. 1642 */ 1643 int amdgpu_device_ip_set_clockgating_state(void *dev, 1644 enum amd_ip_block_type block_type, 1645 enum amd_clockgating_state state) 1646 { 1647 struct amdgpu_device *adev = dev; 1648 int i, r = 0; 1649 1650 for (i = 0; i < adev->num_ip_blocks; i++) { 1651 if (!adev->ip_blocks[i].status.valid) 1652 continue; 1653 if (adev->ip_blocks[i].version->type != block_type) 1654 continue; 1655 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1656 continue; 1657 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1658 (void *)adev, state); 1659 if (r) 1660 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1661 adev->ip_blocks[i].version->funcs->name, r); 1662 } 1663 return r; 1664 } 1665 1666 /** 1667 * amdgpu_device_ip_set_powergating_state - set the PG state 1668 * 1669 * @dev: amdgpu_device pointer 1670 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1671 * @state: powergating state (gate or ungate) 1672 * 1673 * Sets the requested powergating state for all instances of 1674 * the hardware IP specified. 1675 * Returns the error code from the last instance. 
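 *
 * Example (illustrative): gating both clocks and power for all GFX
 * instances uses this helper together with the clockgating variant above:
 *
 *	amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *					       AMD_CG_STATE_GATE);
 *	amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *					       AMD_PG_STATE_GATE);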
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u64 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;
}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;
}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
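 *
 * Example (illustrative): looking up the GFX block and checking its version
 * with the comparison helper below:
 *
 *	struct amdgpu_ip_block *ip =
 *		amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *
 * A return of 0 from amdgpu_device_ip_block_version_cmp(adev,
 * AMD_IP_BLOCK_TYPE_GFX, 9, 0) would then mean the GFX block is present at
 * version 9.0 or newer.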
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * Returns 0 if the IP block version is equal or greater,
 * returns 1 if it is smaller or the ip_block doesn't exist.
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	switch (ip_block_version->type) {
	case AMD_IP_BLOCK_TYPE_VCN:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
			return 0;
		break;
	case AMD_IP_BLOCK_TYPE_JPEG:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
			return 0;
		break;
	default:
		break;
	}

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
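 *
 * Example: amdgpu.virtual_display=0000:04:00.0,2 (an example PCI address)
 * enables two virtual CRTCs on that device only, while
 * amdgpu.virtual_display=all enables the feature with the default single
 * CRTC on every amdgpu device.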
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		const char *pci_address_name = pci_name(adev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	char fw_name[40];
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	if (adev->mman.discovery_bin) {
		/*
		 * FIXME: The bounding box is still needed by Navi12, so
		 * temporarily read it from gpu_info firmware. Should be dropped
		 * when DAL no longer needs it.
1939 */ 1940 if (adev->asic_type != CHIP_NAVI12) 1941 return 0; 1942 } 1943 1944 switch (adev->asic_type) { 1945 #ifdef CONFIG_DRM_AMDGPU_SI 1946 case CHIP_VERDE: 1947 case CHIP_TAHITI: 1948 case CHIP_PITCAIRN: 1949 case CHIP_OLAND: 1950 case CHIP_HAINAN: 1951 #endif 1952 #ifdef CONFIG_DRM_AMDGPU_CIK 1953 case CHIP_BONAIRE: 1954 case CHIP_HAWAII: 1955 case CHIP_KAVERI: 1956 case CHIP_KABINI: 1957 case CHIP_MULLINS: 1958 #endif 1959 case CHIP_TOPAZ: 1960 case CHIP_TONGA: 1961 case CHIP_FIJI: 1962 case CHIP_POLARIS10: 1963 case CHIP_POLARIS11: 1964 case CHIP_POLARIS12: 1965 case CHIP_VEGAM: 1966 case CHIP_CARRIZO: 1967 case CHIP_STONEY: 1968 case CHIP_VEGA20: 1969 case CHIP_ALDEBARAN: 1970 case CHIP_SIENNA_CICHLID: 1971 case CHIP_NAVY_FLOUNDER: 1972 case CHIP_DIMGREY_CAVEFISH: 1973 case CHIP_BEIGE_GOBY: 1974 default: 1975 return 0; 1976 case CHIP_VEGA10: 1977 chip_name = "vega10"; 1978 break; 1979 case CHIP_VEGA12: 1980 chip_name = "vega12"; 1981 break; 1982 case CHIP_RAVEN: 1983 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1984 chip_name = "raven2"; 1985 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1986 chip_name = "picasso"; 1987 else 1988 chip_name = "raven"; 1989 break; 1990 case CHIP_ARCTURUS: 1991 chip_name = "arcturus"; 1992 break; 1993 case CHIP_NAVI12: 1994 chip_name = "navi12"; 1995 break; 1996 } 1997 1998 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 1999 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); 2000 if (err) { 2001 dev_err(adev->dev, 2002 "Failed to load gpu_info firmware \"%s\"\n", 2003 fw_name); 2004 goto out; 2005 } 2006 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); 2007 if (err) { 2008 dev_err(adev->dev, 2009 "Failed to validate gpu_info firmware \"%s\"\n", 2010 fw_name); 2011 goto out; 2012 } 2013 2014 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2015 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2016 2017 switch (hdr->version_major) { 2018 case 1: 2019 { 2020 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2021 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2022 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2023 2024 /* 2025 * Should be droped when DAL no longer needs it. 
2026 */ 2027 if (adev->asic_type == CHIP_NAVI12) 2028 goto parse_soc_bounding_box; 2029 2030 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2031 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2032 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2033 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2034 adev->gfx.config.max_texture_channel_caches = 2035 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2036 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2037 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2038 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2039 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2040 adev->gfx.config.double_offchip_lds_buf = 2041 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2042 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2043 adev->gfx.cu_info.max_waves_per_simd = 2044 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2045 adev->gfx.cu_info.max_scratch_slots_per_cu = 2046 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2047 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2048 if (hdr->version_minor >= 1) { 2049 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2050 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2051 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2052 adev->gfx.config.num_sc_per_sh = 2053 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2054 adev->gfx.config.num_packer_per_sc = 2055 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2056 } 2057 2058 parse_soc_bounding_box: 2059 /* 2060 * soc bounding box info is not integrated in disocovery table, 2061 * we always need to parse it from gpu info firmware if needed. 2062 */ 2063 if (hdr->version_minor == 2) { 2064 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2065 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2066 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2067 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2068 } 2069 break; 2070 } 2071 default: 2072 dev_err(adev->dev, 2073 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2074 err = -EINVAL; 2075 goto out; 2076 } 2077 out: 2078 return err; 2079 } 2080 2081 /** 2082 * amdgpu_device_ip_early_init - run early init for hardware IPs 2083 * 2084 * @adev: amdgpu_device pointer 2085 * 2086 * Early initialization pass for hardware IPs. The hardware IPs that make 2087 * up each asic are discovered each IP's early_init callback is run. This 2088 * is the first stage in initializing the asic. 2089 * Returns 0 on success, negative error code on failure. 
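 *
 * In the code below this is also where the asic family and IP block list
 * are selected (SI/CIK/VI explicitly, newer parts via
 * amdgpu_discovery_set_ip_blocks()), where blocks masked out by
 * amdgpu_ip_block_mask are marked invalid, and where the vbios and gpu_info
 * firmware are read once the COMMON block has set up the asic callbacks.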
2090 */ 2091 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2092 { 2093 struct drm_device *dev = adev_to_drm(adev); 2094 struct pci_dev *parent; 2095 int i, r; 2096 2097 amdgpu_device_enable_virtual_display(adev); 2098 2099 if (amdgpu_sriov_vf(adev)) { 2100 r = amdgpu_virt_request_full_gpu(adev, true); 2101 if (r) 2102 return r; 2103 } 2104 2105 switch (adev->asic_type) { 2106 #ifdef CONFIG_DRM_AMDGPU_SI 2107 case CHIP_VERDE: 2108 case CHIP_TAHITI: 2109 case CHIP_PITCAIRN: 2110 case CHIP_OLAND: 2111 case CHIP_HAINAN: 2112 adev->family = AMDGPU_FAMILY_SI; 2113 r = si_set_ip_blocks(adev); 2114 if (r) 2115 return r; 2116 break; 2117 #endif 2118 #ifdef CONFIG_DRM_AMDGPU_CIK 2119 case CHIP_BONAIRE: 2120 case CHIP_HAWAII: 2121 case CHIP_KAVERI: 2122 case CHIP_KABINI: 2123 case CHIP_MULLINS: 2124 if (adev->flags & AMD_IS_APU) 2125 adev->family = AMDGPU_FAMILY_KV; 2126 else 2127 adev->family = AMDGPU_FAMILY_CI; 2128 2129 r = cik_set_ip_blocks(adev); 2130 if (r) 2131 return r; 2132 break; 2133 #endif 2134 case CHIP_TOPAZ: 2135 case CHIP_TONGA: 2136 case CHIP_FIJI: 2137 case CHIP_POLARIS10: 2138 case CHIP_POLARIS11: 2139 case CHIP_POLARIS12: 2140 case CHIP_VEGAM: 2141 case CHIP_CARRIZO: 2142 case CHIP_STONEY: 2143 if (adev->flags & AMD_IS_APU) 2144 adev->family = AMDGPU_FAMILY_CZ; 2145 else 2146 adev->family = AMDGPU_FAMILY_VI; 2147 2148 r = vi_set_ip_blocks(adev); 2149 if (r) 2150 return r; 2151 break; 2152 default: 2153 r = amdgpu_discovery_set_ip_blocks(adev); 2154 if (r) 2155 return r; 2156 break; 2157 } 2158 2159 if (amdgpu_has_atpx() && 2160 (amdgpu_is_atpx_hybrid() || 2161 amdgpu_has_atpx_dgpu_power_cntl()) && 2162 ((adev->flags & AMD_IS_APU) == 0) && 2163 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev))) 2164 adev->flags |= AMD_IS_PX; 2165 2166 if (!(adev->flags & AMD_IS_APU)) { 2167 parent = pci_upstream_bridge(adev->pdev); 2168 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2169 } 2170 2171 amdgpu_amdkfd_device_probe(adev); 2172 2173 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2174 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2175 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2176 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2177 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2178 2179 for (i = 0; i < adev->num_ip_blocks; i++) { 2180 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2181 DRM_ERROR("disabled ip block: %d <%s>\n", 2182 i, adev->ip_blocks[i].version->funcs->name); 2183 adev->ip_blocks[i].status.valid = false; 2184 } else { 2185 if (adev->ip_blocks[i].version->funcs->early_init) { 2186 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2187 if (r == -ENOENT) { 2188 adev->ip_blocks[i].status.valid = false; 2189 } else if (r) { 2190 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2191 adev->ip_blocks[i].version->funcs->name, r); 2192 return r; 2193 } else { 2194 adev->ip_blocks[i].status.valid = true; 2195 } 2196 } else { 2197 adev->ip_blocks[i].status.valid = true; 2198 } 2199 } 2200 /* get the vbios after the asic_funcs are set up */ 2201 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2202 r = amdgpu_device_parse_gpu_info_fw(adev); 2203 if (r) 2204 return r; 2205 2206 /* Read BIOS */ 2207 if (!amdgpu_get_bios(adev)) 2208 return -EINVAL; 2209 2210 r = amdgpu_atombios_init(adev); 2211 if (r) { 2212 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2213 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2214 return r; 2215 } 2216 2217 /*get pf2vf msg info at it's earliest time*/ 2218 if (amdgpu_sriov_vf(adev)) 2219 amdgpu_virt_init_data_exchange(adev); 2220 2221 } 2222 } 2223 2224 adev->cg_flags &= amdgpu_cg_mask; 2225 adev->pg_flags &= amdgpu_pg_mask; 2226 2227 return 0; 2228 } 2229 2230 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2231 { 2232 int i, r; 2233 2234 for (i = 0; i < adev->num_ip_blocks; i++) { 2235 if (!adev->ip_blocks[i].status.sw) 2236 continue; 2237 if (adev->ip_blocks[i].status.hw) 2238 continue; 2239 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2240 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2241 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2242 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2243 if (r) { 2244 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2245 adev->ip_blocks[i].version->funcs->name, r); 2246 return r; 2247 } 2248 adev->ip_blocks[i].status.hw = true; 2249 } 2250 } 2251 2252 return 0; 2253 } 2254 2255 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2256 { 2257 int i, r; 2258 2259 for (i = 0; i < adev->num_ip_blocks; i++) { 2260 if (!adev->ip_blocks[i].status.sw) 2261 continue; 2262 if (adev->ip_blocks[i].status.hw) 2263 continue; 2264 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2265 if (r) { 2266 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2267 adev->ip_blocks[i].version->funcs->name, r); 2268 return r; 2269 } 2270 adev->ip_blocks[i].status.hw = true; 2271 } 2272 2273 return 0; 2274 } 2275 2276 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2277 { 2278 int r = 0; 2279 int i; 2280 uint32_t smu_version; 2281 2282 if (adev->asic_type >= CHIP_VEGA10) { 2283 for (i = 0; i < adev->num_ip_blocks; i++) { 2284 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2285 continue; 2286 2287 if 
(!adev->ip_blocks[i].status.sw) 2288 continue; 2289 2290 /* no need to do the fw loading again if already done*/ 2291 if (adev->ip_blocks[i].status.hw == true) 2292 break; 2293 2294 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2295 r = adev->ip_blocks[i].version->funcs->resume(adev); 2296 if (r) { 2297 DRM_ERROR("resume of IP block <%s> failed %d\n", 2298 adev->ip_blocks[i].version->funcs->name, r); 2299 return r; 2300 } 2301 } else { 2302 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2303 if (r) { 2304 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2305 adev->ip_blocks[i].version->funcs->name, r); 2306 return r; 2307 } 2308 } 2309 2310 adev->ip_blocks[i].status.hw = true; 2311 break; 2312 } 2313 } 2314 2315 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2316 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2317 2318 return r; 2319 } 2320 2321 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2322 { 2323 long timeout; 2324 int r, i; 2325 2326 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2327 struct amdgpu_ring *ring = adev->rings[i]; 2328 2329 /* No need to setup the GPU scheduler for rings that don't need it */ 2330 if (!ring || ring->no_scheduler) 2331 continue; 2332 2333 switch (ring->funcs->type) { 2334 case AMDGPU_RING_TYPE_GFX: 2335 timeout = adev->gfx_timeout; 2336 break; 2337 case AMDGPU_RING_TYPE_COMPUTE: 2338 timeout = adev->compute_timeout; 2339 break; 2340 case AMDGPU_RING_TYPE_SDMA: 2341 timeout = adev->sdma_timeout; 2342 break; 2343 default: 2344 timeout = adev->video_timeout; 2345 break; 2346 } 2347 2348 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2349 ring->num_hw_submission, amdgpu_job_hang_limit, 2350 timeout, adev->reset_domain->wq, 2351 ring->sched_score, ring->name, 2352 adev->dev); 2353 if (r) { 2354 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2355 ring->name); 2356 return r; 2357 } 2358 } 2359 2360 return 0; 2361 } 2362 2363 2364 /** 2365 * amdgpu_device_ip_init - run init for hardware IPs 2366 * 2367 * @adev: amdgpu_device pointer 2368 * 2369 * Main initialization pass for hardware IPs. The list of all the hardware 2370 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2371 * are run. sw_init initializes the software state associated with each IP 2372 * and hw_init initializes the hardware associated with each IP. 2373 * Returns 0 on success, negative error code on failure. 
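 *
 * Roughly, the ordering below is: sw_init for every valid block; an early
 * hw_init of the GMC block so VRAM allocations (scratch buffer, write-back
 * pages, CSA) can be made; creation of the IB pool and ucode BO; hw_init
 * phase 1 (COMMON/IH, plus PSP under SR-IOV); firmware loading; hw_init
 * phase 2; then RAS recovery, scheduler and KFD initialization.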
2374 */ 2375 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2376 { 2377 int i, r; 2378 2379 r = amdgpu_ras_init(adev); 2380 if (r) 2381 return r; 2382 2383 for (i = 0; i < adev->num_ip_blocks; i++) { 2384 if (!adev->ip_blocks[i].status.valid) 2385 continue; 2386 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2387 if (r) { 2388 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2389 adev->ip_blocks[i].version->funcs->name, r); 2390 goto init_failed; 2391 } 2392 adev->ip_blocks[i].status.sw = true; 2393 2394 /* need to do gmc hw init early so we can allocate gpu mem */ 2395 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2396 /* Try to reserve bad pages early */ 2397 if (amdgpu_sriov_vf(adev)) 2398 amdgpu_virt_exchange_data(adev); 2399 2400 r = amdgpu_device_vram_scratch_init(adev); 2401 if (r) { 2402 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2403 goto init_failed; 2404 } 2405 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2406 if (r) { 2407 DRM_ERROR("hw_init %d failed %d\n", i, r); 2408 goto init_failed; 2409 } 2410 r = amdgpu_device_wb_init(adev); 2411 if (r) { 2412 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2413 goto init_failed; 2414 } 2415 adev->ip_blocks[i].status.hw = true; 2416 2417 /* right after GMC hw init, we create CSA */ 2418 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2419 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2420 AMDGPU_GEM_DOMAIN_VRAM, 2421 AMDGPU_CSA_SIZE); 2422 if (r) { 2423 DRM_ERROR("allocate CSA failed %d\n", r); 2424 goto init_failed; 2425 } 2426 } 2427 } 2428 } 2429 2430 if (amdgpu_sriov_vf(adev)) 2431 amdgpu_virt_init_data_exchange(adev); 2432 2433 r = amdgpu_ib_pool_init(adev); 2434 if (r) { 2435 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2436 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2437 goto init_failed; 2438 } 2439 2440 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2441 if (r) 2442 goto init_failed; 2443 2444 r = amdgpu_device_ip_hw_init_phase1(adev); 2445 if (r) 2446 goto init_failed; 2447 2448 r = amdgpu_device_fw_loading(adev); 2449 if (r) 2450 goto init_failed; 2451 2452 r = amdgpu_device_ip_hw_init_phase2(adev); 2453 if (r) 2454 goto init_failed; 2455 2456 /* 2457 * retired pages will be loaded from eeprom and reserved here, 2458 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2459 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2460 * for I2C communication which only true at this point. 2461 * 2462 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2463 * failure from bad gpu situation and stop amdgpu init process 2464 * accordingly. For other failed cases, it will still release all 2465 * the resource and print error message, rather than returning one 2466 * negative value to upper level. 
2467 * 2468 * Note: theoretically, this should be called before all vram allocations 2469 * to protect retired page from abusing 2470 */ 2471 r = amdgpu_ras_recovery_init(adev); 2472 if (r) 2473 goto init_failed; 2474 2475 /** 2476 * In case of XGMI grab extra reference for reset domain for this device 2477 */ 2478 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2479 if (amdgpu_xgmi_add_device(adev) == 0) { 2480 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2481 2482 if (!hive->reset_domain || 2483 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2484 r = -ENOENT; 2485 goto init_failed; 2486 } 2487 2488 /* Drop the early temporary reset domain we created for device */ 2489 amdgpu_reset_put_reset_domain(adev->reset_domain); 2490 adev->reset_domain = hive->reset_domain; 2491 } 2492 } 2493 2494 r = amdgpu_device_init_schedulers(adev); 2495 if (r) 2496 goto init_failed; 2497 2498 /* Don't init kfd if whole hive need to be reset during init */ 2499 if (!adev->gmc.xgmi.pending_reset) 2500 amdgpu_amdkfd_device_init(adev); 2501 2502 amdgpu_fru_get_product_info(adev); 2503 2504 init_failed: 2505 if (amdgpu_sriov_vf(adev)) 2506 amdgpu_virt_release_full_gpu(adev, true); 2507 2508 return r; 2509 } 2510 2511 /** 2512 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2513 * 2514 * @adev: amdgpu_device pointer 2515 * 2516 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2517 * this function before a GPU reset. If the value is retained after a 2518 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 2519 */ 2520 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2521 { 2522 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2523 } 2524 2525 /** 2526 * amdgpu_device_check_vram_lost - check if vram is valid 2527 * 2528 * @adev: amdgpu_device pointer 2529 * 2530 * Checks the reset magic value written to the gart pointer in VRAM. 2531 * The driver calls this after a GPU reset to see if the contents of 2532 * VRAM is lost or now. 2533 * returns true if vram is lost, false if not. 2534 */ 2535 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2536 { 2537 if (memcmp(adev->gart.ptr, adev->reset_magic, 2538 AMDGPU_RESET_MAGIC_NUM)) 2539 return true; 2540 2541 if (!amdgpu_in_reset(adev)) 2542 return false; 2543 2544 /* 2545 * For all ASICs with baco/mode1 reset, the VRAM is 2546 * always assumed to be lost. 2547 */ 2548 switch (amdgpu_asic_reset_method(adev)) { 2549 case AMD_RESET_METHOD_BACO: 2550 case AMD_RESET_METHOD_MODE1: 2551 return true; 2552 default: 2553 return false; 2554 } 2555 } 2556 2557 /** 2558 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2559 * 2560 * @adev: amdgpu_device pointer 2561 * @state: clockgating state (gate or ungate) 2562 * 2563 * The list of all the hardware IPs that make up the asic is walked and the 2564 * set_clockgating_state callbacks are run. 2565 * Late initialization pass enabling clockgating for hardware IPs. 2566 * Fini or suspend, pass disabling clockgating for hardware IPs. 2567 * Returns 0 on success, negative error code on failure. 2568 */ 2569 2570 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2571 enum amd_clockgating_state state) 2572 { 2573 int i, j, r; 2574 2575 if (amdgpu_emu_mode == 1) 2576 return 0; 2577 2578 for (j = 0; j < adev->num_ip_blocks; j++) { 2579 i = state == AMD_CG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1; 2580 if (!adev->ip_blocks[i].status.late_initialized) 2581 continue; 2582 /* skip CG for GFX on S0ix */ 2583 if (adev->in_s0ix && 2584 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2585 continue; 2586 /* skip CG for VCE/UVD, it's handled specially */ 2587 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2588 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2589 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2590 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2591 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2592 /* enable clockgating to save power */ 2593 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2594 state); 2595 if (r) { 2596 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2597 adev->ip_blocks[i].version->funcs->name, r); 2598 return r; 2599 } 2600 } 2601 } 2602 2603 return 0; 2604 } 2605 2606 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2607 enum amd_powergating_state state) 2608 { 2609 int i, j, r; 2610 2611 if (amdgpu_emu_mode == 1) 2612 return 0; 2613 2614 for (j = 0; j < adev->num_ip_blocks; j++) { 2615 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2616 if (!adev->ip_blocks[i].status.late_initialized) 2617 continue; 2618 /* skip PG for GFX on S0ix */ 2619 if (adev->in_s0ix && 2620 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2621 continue; 2622 /* skip CG for VCE/UVD, it's handled specially */ 2623 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2624 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2625 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2626 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2627 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2628 /* enable powergating to save power */ 2629 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2630 state); 2631 if (r) { 2632 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2633 adev->ip_blocks[i].version->funcs->name, r); 2634 return r; 2635 } 2636 } 2637 } 2638 return 0; 2639 } 2640 2641 static int amdgpu_device_enable_mgpu_fan_boost(void) 2642 { 2643 struct amdgpu_gpu_instance *gpu_ins; 2644 struct amdgpu_device *adev; 2645 int i, ret = 0; 2646 2647 mutex_lock(&mgpu_info.mutex); 2648 2649 /* 2650 * MGPU fan boost feature should be enabled 2651 * only when there are two or more dGPUs in 2652 * the system 2653 */ 2654 if (mgpu_info.num_dgpu < 2) 2655 goto out; 2656 2657 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2658 gpu_ins = &(mgpu_info.gpu_ins[i]); 2659 adev = gpu_ins->adev; 2660 if (!(adev->flags & AMD_IS_APU) && 2661 !gpu_ins->mgpu_fan_enabled) { 2662 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2663 if (ret) 2664 break; 2665 2666 gpu_ins->mgpu_fan_enabled = 1; 2667 } 2668 } 2669 2670 out: 2671 mutex_unlock(&mgpu_info.mutex); 2672 2673 return ret; 2674 } 2675 2676 /** 2677 * amdgpu_device_ip_late_init - run late init for hardware IPs 2678 * 2679 * @adev: amdgpu_device pointer 2680 * 2681 * Late initialization pass for hardware IPs. The list of all the hardware 2682 * IPs that make up the asic is walked and the late_init callbacks are run. 2683 * late_init covers any special initialization that an IP requires 2684 * after all of the have been initialized or something that needs to happen 2685 * late in the init process. 2686 * Returns 0 on success, negative error code on failure. 
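 *
 * Besides the per-IP late_init callbacks, the code below also enables
 * clockgating and powergating, records the reset magic, enables the mGPU
 * fan boost and, for XGMI configurations, lowers the link p-state once all
 * devices in the hive have been counted.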
2687 */ 2688 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2689 { 2690 struct amdgpu_gpu_instance *gpu_instance; 2691 int i = 0, r; 2692 2693 for (i = 0; i < adev->num_ip_blocks; i++) { 2694 if (!adev->ip_blocks[i].status.hw) 2695 continue; 2696 if (adev->ip_blocks[i].version->funcs->late_init) { 2697 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2698 if (r) { 2699 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2700 adev->ip_blocks[i].version->funcs->name, r); 2701 return r; 2702 } 2703 } 2704 adev->ip_blocks[i].status.late_initialized = true; 2705 } 2706 2707 r = amdgpu_ras_late_init(adev); 2708 if (r) { 2709 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 2710 return r; 2711 } 2712 2713 amdgpu_ras_set_error_query_ready(adev, true); 2714 2715 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2716 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2717 2718 amdgpu_device_fill_reset_magic(adev); 2719 2720 r = amdgpu_device_enable_mgpu_fan_boost(); 2721 if (r) 2722 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2723 2724 /* For passthrough configurations on Arcturus and Aldebaran, enable special SBR handling */ 2725 if (amdgpu_passthrough(adev) && ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 2726 adev->asic_type == CHIP_ALDEBARAN)) 2727 amdgpu_dpm_handle_passthrough_sbr(adev, true); 2728 2729 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2730 mutex_lock(&mgpu_info.mutex); 2731 2732 /* 2733 * Reset the device p-state to low, as it was booted with the p-state high. 2734 * 2735 * This should be performed only after all devices from the same 2736 * hive get initialized. 2737 * 2738 * However, the number of devices in a hive is not known in advance; 2739 * it is counted one by one as the devices initialize. 2740 * 2741 * So we wait until all XGMI-interlinked devices have initialized. 2742 * This may bring some delays as those devices may come from 2743 * different hives. But that should be OK.
2744 */ 2745 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2746 for (i = 0; i < mgpu_info.num_gpu; i++) { 2747 gpu_instance = &(mgpu_info.gpu_ins[i]); 2748 if (gpu_instance->adev->flags & AMD_IS_APU) 2749 continue; 2750 2751 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2752 AMDGPU_XGMI_PSTATE_MIN); 2753 if (r) { 2754 DRM_ERROR("pstate setting failed (%d).\n", r); 2755 break; 2756 } 2757 } 2758 } 2759 2760 mutex_unlock(&mgpu_info.mutex); 2761 } 2762 2763 return 0; 2764 } 2765 2766 /** 2767 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2768 * 2769 * @adev: amdgpu_device pointer 2770 * 2771 * For ASICs need to disable SMC first 2772 */ 2773 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2774 { 2775 int i, r; 2776 2777 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) 2778 return; 2779 2780 for (i = 0; i < adev->num_ip_blocks; i++) { 2781 if (!adev->ip_blocks[i].status.hw) 2782 continue; 2783 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2784 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2785 /* XXX handle errors */ 2786 if (r) { 2787 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2788 adev->ip_blocks[i].version->funcs->name, r); 2789 } 2790 adev->ip_blocks[i].status.hw = false; 2791 break; 2792 } 2793 } 2794 } 2795 2796 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2797 { 2798 int i, r; 2799 2800 for (i = 0; i < adev->num_ip_blocks; i++) { 2801 if (!adev->ip_blocks[i].version->funcs->early_fini) 2802 continue; 2803 2804 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2805 if (r) { 2806 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2807 adev->ip_blocks[i].version->funcs->name, r); 2808 } 2809 } 2810 2811 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2812 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2813 2814 amdgpu_amdkfd_suspend(adev, false); 2815 2816 /* Workaroud for ASICs need to disable SMC first */ 2817 amdgpu_device_smu_fini_early(adev); 2818 2819 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2820 if (!adev->ip_blocks[i].status.hw) 2821 continue; 2822 2823 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2824 /* XXX handle errors */ 2825 if (r) { 2826 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2827 adev->ip_blocks[i].version->funcs->name, r); 2828 } 2829 2830 adev->ip_blocks[i].status.hw = false; 2831 } 2832 2833 if (amdgpu_sriov_vf(adev)) { 2834 if (amdgpu_virt_release_full_gpu(adev, false)) 2835 DRM_ERROR("failed to release exclusive mode on fini\n"); 2836 } 2837 2838 return 0; 2839 } 2840 2841 /** 2842 * amdgpu_device_ip_fini - run fini for hardware IPs 2843 * 2844 * @adev: amdgpu_device pointer 2845 * 2846 * Main teardown pass for hardware IPs. The list of all the hardware 2847 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2848 * are run. hw_fini tears down the hardware associated with each IP 2849 * and sw_fini tears down any software state associated with each IP. 2850 * Returns 0 on success, negative error code on failure. 
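 *
 * Note that the hardware side has normally already been brought down by
 * amdgpu_device_ip_fini_early() (called from amdgpu_device_fini_hw()); this
 * pass releases the remaining software state, including the GMC-owned
 * allocations (ucode bo, CSA, write-back, VRAM scratch and the IB pool).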
2851 */ 2852 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2853 { 2854 int i, r; 2855 2856 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2857 amdgpu_virt_release_ras_err_handler_data(adev); 2858 2859 if (adev->gmc.xgmi.num_physical_nodes > 1) 2860 amdgpu_xgmi_remove_device(adev); 2861 2862 amdgpu_amdkfd_device_fini_sw(adev); 2863 2864 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2865 if (!adev->ip_blocks[i].status.sw) 2866 continue; 2867 2868 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2869 amdgpu_ucode_free_bo(adev); 2870 amdgpu_free_static_csa(&adev->virt.csa_obj); 2871 amdgpu_device_wb_fini(adev); 2872 amdgpu_device_vram_scratch_fini(adev); 2873 amdgpu_ib_pool_fini(adev); 2874 } 2875 2876 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2877 /* XXX handle errors */ 2878 if (r) { 2879 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2880 adev->ip_blocks[i].version->funcs->name, r); 2881 } 2882 adev->ip_blocks[i].status.sw = false; 2883 adev->ip_blocks[i].status.valid = false; 2884 } 2885 2886 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2887 if (!adev->ip_blocks[i].status.late_initialized) 2888 continue; 2889 if (adev->ip_blocks[i].version->funcs->late_fini) 2890 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2891 adev->ip_blocks[i].status.late_initialized = false; 2892 } 2893 2894 amdgpu_ras_fini(adev); 2895 2896 return 0; 2897 } 2898 2899 /** 2900 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2901 * 2902 * @work: work_struct. 2903 */ 2904 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2905 { 2906 struct amdgpu_device *adev = 2907 container_of(work, struct amdgpu_device, delayed_init_work.work); 2908 int r; 2909 2910 r = amdgpu_ib_ring_tests(adev); 2911 if (r) 2912 DRM_ERROR("ib ring test failed (%d).\n", r); 2913 } 2914 2915 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2916 { 2917 struct amdgpu_device *adev = 2918 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2919 2920 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2921 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2922 2923 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2924 adev->gfx.gfx_off_state = true; 2925 } 2926 2927 /** 2928 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2929 * 2930 * @adev: amdgpu_device pointer 2931 * 2932 * Main suspend function for hardware IPs. The list of all the hardware 2933 * IPs that make up the asic is walked, clockgating is disabled and the 2934 * suspend callbacks are run. suspend puts the hardware and software state 2935 * in each IP into a state suitable for suspend. 2936 * Returns 0 on success, negative error code on failure. 
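 *
 * Phase 1 only suspends the display (DCE) blocks; every other IP is handled
 * in amdgpu_device_ip_suspend_phase2().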
2937 */ 2938 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2939 { 2940 int i, r; 2941 2942 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2943 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2944 2945 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2946 if (!adev->ip_blocks[i].status.valid) 2947 continue; 2948 2949 /* displays are handled separately */ 2950 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2951 continue; 2952 2953 /* XXX handle errors */ 2954 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2955 /* XXX handle errors */ 2956 if (r) { 2957 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2958 adev->ip_blocks[i].version->funcs->name, r); 2959 return r; 2960 } 2961 2962 adev->ip_blocks[i].status.hw = false; 2963 } 2964 2965 return 0; 2966 } 2967 2968 /** 2969 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2970 * 2971 * @adev: amdgpu_device pointer 2972 * 2973 * Main suspend function for hardware IPs. The list of all the hardware 2974 * IPs that make up the asic is walked, clockgating is disabled and the 2975 * suspend callbacks are run. suspend puts the hardware and software state 2976 * in each IP into a state suitable for suspend. 2977 * Returns 0 on success, negative error code on failure. 2978 */ 2979 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2980 { 2981 int i, r; 2982 2983 if (adev->in_s0ix) 2984 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 2985 2986 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2987 if (!adev->ip_blocks[i].status.valid) 2988 continue; 2989 /* displays are handled in phase1 */ 2990 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2991 continue; 2992 /* PSP lost connection when err_event_athub occurs */ 2993 if (amdgpu_ras_intr_triggered() && 2994 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2995 adev->ip_blocks[i].status.hw = false; 2996 continue; 2997 } 2998 2999 /* skip unnecessary suspend if we do not initialize them yet */ 3000 if (adev->gmc.xgmi.pending_reset && 3001 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3002 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3003 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3004 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3005 adev->ip_blocks[i].status.hw = false; 3006 continue; 3007 } 3008 3009 /* skip suspend of gfx and psp for S0ix 3010 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3011 * like at runtime. PSP is also part of the always on hardware 3012 * so no need to suspend it. 
3013 */ 3014 if (adev->in_s0ix && 3015 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3016 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)) 3017 continue; 3018 3019 /* XXX handle errors */ 3020 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3021 /* XXX handle errors */ 3022 if (r) { 3023 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3024 adev->ip_blocks[i].version->funcs->name, r); 3025 } 3026 adev->ip_blocks[i].status.hw = false; 3027 /* handle putting the SMC in the appropriate state */ 3028 if(!amdgpu_sriov_vf(adev)){ 3029 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3030 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3031 if (r) { 3032 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3033 adev->mp1_state, r); 3034 return r; 3035 } 3036 } 3037 } 3038 } 3039 3040 return 0; 3041 } 3042 3043 /** 3044 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3045 * 3046 * @adev: amdgpu_device pointer 3047 * 3048 * Main suspend function for hardware IPs. The list of all the hardware 3049 * IPs that make up the asic is walked, clockgating is disabled and the 3050 * suspend callbacks are run. suspend puts the hardware and software state 3051 * in each IP into a state suitable for suspend. 3052 * Returns 0 on success, negative error code on failure. 3053 */ 3054 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3055 { 3056 int r; 3057 3058 if (amdgpu_sriov_vf(adev)) { 3059 amdgpu_virt_fini_data_exchange(adev); 3060 amdgpu_virt_request_full_gpu(adev, false); 3061 } 3062 3063 r = amdgpu_device_ip_suspend_phase1(adev); 3064 if (r) 3065 return r; 3066 r = amdgpu_device_ip_suspend_phase2(adev); 3067 3068 if (amdgpu_sriov_vf(adev)) 3069 amdgpu_virt_release_full_gpu(adev, false); 3070 3071 return r; 3072 } 3073 3074 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3075 { 3076 int i, r; 3077 3078 static enum amd_ip_block_type ip_order[] = { 3079 AMD_IP_BLOCK_TYPE_GMC, 3080 AMD_IP_BLOCK_TYPE_COMMON, 3081 AMD_IP_BLOCK_TYPE_PSP, 3082 AMD_IP_BLOCK_TYPE_IH, 3083 }; 3084 3085 for (i = 0; i < adev->num_ip_blocks; i++) { 3086 int j; 3087 struct amdgpu_ip_block *block; 3088 3089 block = &adev->ip_blocks[i]; 3090 block->status.hw = false; 3091 3092 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3093 3094 if (block->version->type != ip_order[j] || 3095 !block->status.valid) 3096 continue; 3097 3098 r = block->version->funcs->hw_init(adev); 3099 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3100 if (r) 3101 return r; 3102 block->status.hw = true; 3103 } 3104 } 3105 3106 return 0; 3107 } 3108 3109 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3110 { 3111 int i, r; 3112 3113 static enum amd_ip_block_type ip_order[] = { 3114 AMD_IP_BLOCK_TYPE_SMC, 3115 AMD_IP_BLOCK_TYPE_DCE, 3116 AMD_IP_BLOCK_TYPE_GFX, 3117 AMD_IP_BLOCK_TYPE_SDMA, 3118 AMD_IP_BLOCK_TYPE_UVD, 3119 AMD_IP_BLOCK_TYPE_VCE, 3120 AMD_IP_BLOCK_TYPE_VCN 3121 }; 3122 3123 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3124 int j; 3125 struct amdgpu_ip_block *block; 3126 3127 for (j = 0; j < adev->num_ip_blocks; j++) { 3128 block = &adev->ip_blocks[j]; 3129 3130 if (block->version->type != ip_order[i] || 3131 !block->status.valid || 3132 block->status.hw) 3133 continue; 3134 3135 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3136 r = block->version->funcs->resume(adev); 3137 else 3138 r = block->version->funcs->hw_init(adev); 3139 3140 DRM_INFO("RE-INIT-late: %s %s\n", 
block->version->funcs->name, r?"failed":"succeeded"); 3141 if (r) 3142 return r; 3143 block->status.hw = true; 3144 } 3145 } 3146 3147 return 0; 3148 } 3149 3150 /** 3151 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3152 * 3153 * @adev: amdgpu_device pointer 3154 * 3155 * First resume function for hardware IPs. The list of all the hardware 3156 * IPs that make up the asic is walked and the resume callbacks are run for 3157 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3158 * after a suspend and updates the software state as necessary. This 3159 * function is also used for restoring the GPU after a GPU reset. 3160 * Returns 0 on success, negative error code on failure. 3161 */ 3162 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3163 { 3164 int i, r; 3165 3166 for (i = 0; i < adev->num_ip_blocks; i++) { 3167 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3168 continue; 3169 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3170 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3171 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 3172 3173 r = adev->ip_blocks[i].version->funcs->resume(adev); 3174 if (r) { 3175 DRM_ERROR("resume of IP block <%s> failed %d\n", 3176 adev->ip_blocks[i].version->funcs->name, r); 3177 return r; 3178 } 3179 adev->ip_blocks[i].status.hw = true; 3180 } 3181 } 3182 3183 return 0; 3184 } 3185 3186 /** 3187 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3188 * 3189 * @adev: amdgpu_device pointer 3190 * 3191 * Second resume function for hardware IPs. The list of all the hardware 3192 * IPs that make up the asic is walked and the resume callbacks are run for 3193 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3194 * functional state after a suspend and updates the software state as 3195 * necessary. This function is also used for restoring the GPU after a GPU 3196 * reset. 3197 * Returns 0 on success, negative error code on failure. 3198 */ 3199 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3200 { 3201 int i, r; 3202 3203 for (i = 0; i < adev->num_ip_blocks; i++) { 3204 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3205 continue; 3206 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3207 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3208 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3209 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3210 continue; 3211 r = adev->ip_blocks[i].version->funcs->resume(adev); 3212 if (r) { 3213 DRM_ERROR("resume of IP block <%s> failed %d\n", 3214 adev->ip_blocks[i].version->funcs->name, r); 3215 return r; 3216 } 3217 adev->ip_blocks[i].status.hw = true; 3218 } 3219 3220 return 0; 3221 } 3222 3223 /** 3224 * amdgpu_device_ip_resume - run resume for hardware IPs 3225 * 3226 * @adev: amdgpu_device pointer 3227 * 3228 * Main resume function for hardware IPs. The hardware IPs 3229 * are split into two resume functions because they are 3230 * also used in recovering from a GPU reset, where some additional 3231 * steps need to be taken between them. In this case (S3/S4) they are 3232 * run sequentially. 3233 * Returns 0 on success, negative error code on failure.
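 *
 * The sequence below is: resume the KFD IOMMU, run resume phase 1 (COMMON,
 * GMC, IH), reload firmware via amdgpu_device_fw_loading(), then run resume
 * phase 2 for the remaining blocks.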
3234 */ 3235 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3236 { 3237 int r; 3238 3239 r = amdgpu_amdkfd_resume_iommu(adev); 3240 if (r) 3241 return r; 3242 3243 r = amdgpu_device_ip_resume_phase1(adev); 3244 if (r) 3245 return r; 3246 3247 r = amdgpu_device_fw_loading(adev); 3248 if (r) 3249 return r; 3250 3251 r = amdgpu_device_ip_resume_phase2(adev); 3252 3253 return r; 3254 } 3255 3256 /** 3257 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3258 * 3259 * @adev: amdgpu_device pointer 3260 * 3261 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3262 */ 3263 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3264 { 3265 if (amdgpu_sriov_vf(adev)) { 3266 if (adev->is_atom_fw) { 3267 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3268 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3269 } else { 3270 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3271 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3272 } 3273 3274 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3275 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3276 } 3277 } 3278 3279 /** 3280 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3281 * 3282 * @asic_type: AMD asic type 3283 * 3284 * Check if there is DC (new modesetting infrastructre) support for an asic. 3285 * returns true if DC has support, false if not. 3286 */ 3287 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3288 { 3289 switch (asic_type) { 3290 #ifdef CONFIG_DRM_AMDGPU_SI 3291 case CHIP_HAINAN: 3292 #endif 3293 case CHIP_TOPAZ: 3294 /* chips with no display hardware */ 3295 return false; 3296 #if defined(CONFIG_DRM_AMD_DC) 3297 case CHIP_TAHITI: 3298 case CHIP_PITCAIRN: 3299 case CHIP_VERDE: 3300 case CHIP_OLAND: 3301 /* 3302 * We have systems in the wild with these ASICs that require 3303 * LVDS and VGA support which is not supported with DC. 3304 * 3305 * Fallback to the non-DC driver here by default so as not to 3306 * cause regressions. 3307 */ 3308 #if defined(CONFIG_DRM_AMD_DC_SI) 3309 return amdgpu_dc > 0; 3310 #else 3311 return false; 3312 #endif 3313 case CHIP_BONAIRE: 3314 case CHIP_KAVERI: 3315 case CHIP_KABINI: 3316 case CHIP_MULLINS: 3317 /* 3318 * We have systems in the wild with these ASICs that require 3319 * LVDS and VGA support which is not supported with DC. 3320 * 3321 * Fallback to the non-DC driver here by default so as not to 3322 * cause regressions. 
3323 */ 3324 return amdgpu_dc > 0; 3325 case CHIP_HAWAII: 3326 case CHIP_CARRIZO: 3327 case CHIP_STONEY: 3328 case CHIP_POLARIS10: 3329 case CHIP_POLARIS11: 3330 case CHIP_POLARIS12: 3331 case CHIP_VEGAM: 3332 case CHIP_TONGA: 3333 case CHIP_FIJI: 3334 case CHIP_VEGA10: 3335 case CHIP_VEGA12: 3336 case CHIP_VEGA20: 3337 #if defined(CONFIG_DRM_AMD_DC_DCN) 3338 case CHIP_RAVEN: 3339 case CHIP_NAVI10: 3340 case CHIP_NAVI14: 3341 case CHIP_NAVI12: 3342 case CHIP_RENOIR: 3343 case CHIP_CYAN_SKILLFISH: 3344 case CHIP_SIENNA_CICHLID: 3345 case CHIP_NAVY_FLOUNDER: 3346 case CHIP_DIMGREY_CAVEFISH: 3347 case CHIP_BEIGE_GOBY: 3348 case CHIP_VANGOGH: 3349 case CHIP_YELLOW_CARP: 3350 #endif 3351 default: 3352 return amdgpu_dc != 0; 3353 #else 3354 default: 3355 if (amdgpu_dc > 0) 3356 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3357 "but isn't supported by ASIC, ignoring\n"); 3358 return false; 3359 #endif 3360 } 3361 } 3362 3363 /** 3364 * amdgpu_device_has_dc_support - check if dc is supported 3365 * 3366 * @adev: amdgpu_device pointer 3367 * 3368 * Returns true for supported, false for not supported 3369 */ 3370 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3371 { 3372 if (amdgpu_sriov_vf(adev) || 3373 adev->enable_virtual_display || 3374 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3375 return false; 3376 3377 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3378 } 3379 3380 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3381 { 3382 struct amdgpu_device *adev = 3383 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3384 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3385 3386 /* It's a bug to not have a hive within this function */ 3387 if (WARN_ON(!hive)) 3388 return; 3389 3390 /* 3391 * Use task barrier to synchronize all xgmi reset works across the 3392 * hive. task_barrier_enter and task_barrier_exit will block 3393 * until all the threads running the xgmi reset works reach 3394 * those points. task_barrier_full will do both blocks. 3395 */ 3396 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3397 3398 task_barrier_enter(&hive->tb); 3399 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3400 3401 if (adev->asic_reset_res) 3402 goto fail; 3403 3404 task_barrier_exit(&hive->tb); 3405 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3406 3407 if (adev->asic_reset_res) 3408 goto fail; 3409 3410 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3411 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3412 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3413 } else { 3414 3415 task_barrier_full(&hive->tb); 3416 adev->asic_reset_res = amdgpu_asic_reset(adev); 3417 } 3418 3419 fail: 3420 if (adev->asic_reset_res) 3421 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3422 adev->asic_reset_res, adev_to_drm(adev)->unique); 3423 amdgpu_put_xgmi_hive(hive); 3424 } 3425 3426 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3427 { 3428 char *input = amdgpu_lockup_timeout; 3429 char *timeout_setting = NULL; 3430 int index = 0; 3431 long timeout; 3432 int ret = 0; 3433 3434 /* 3435 * By default timeout for non compute jobs is 10000 3436 * and 60000 for compute jobs. 3437 * In SR-IOV or passthrough mode, timeout for compute 3438 * jobs are 60000 by default. 
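 *
 * Roughly, the option parsed below is a comma-separated list applied in
 * the order gfx, compute, sdma, video (the values here are examples only):
 *   amdgpu.lockup_timeout=10000,60000,10000,10000
 * A value of 0 keeps the default for that queue type, a negative value
 * disables the timeout, and a single value is applied to all non-compute
 * queues (and to compute as well under SR-IOV or passthrough).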
3439 */ 3440 adev->gfx_timeout = msecs_to_jiffies(10000); 3441 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3442 if (amdgpu_sriov_vf(adev)) 3443 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 3444 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3445 else 3446 adev->compute_timeout = msecs_to_jiffies(60000); 3447 3448 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3449 while ((timeout_setting = strsep(&input, ",")) && 3450 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3451 ret = kstrtol(timeout_setting, 0, &timeout); 3452 if (ret) 3453 return ret; 3454 3455 if (timeout == 0) { 3456 index++; 3457 continue; 3458 } else if (timeout < 0) { 3459 timeout = MAX_SCHEDULE_TIMEOUT; 3460 dev_warn(adev->dev, "lockup timeout disabled"); 3461 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3462 } else { 3463 timeout = msecs_to_jiffies(timeout); 3464 } 3465 3466 switch (index++) { 3467 case 0: 3468 adev->gfx_timeout = timeout; 3469 break; 3470 case 1: 3471 adev->compute_timeout = timeout; 3472 break; 3473 case 2: 3474 adev->sdma_timeout = timeout; 3475 break; 3476 case 3: 3477 adev->video_timeout = timeout; 3478 break; 3479 default: 3480 break; 3481 } 3482 } 3483 /* 3484 * There is only one value specified and 3485 * it should apply to all non-compute jobs. 3486 */ 3487 if (index == 1) { 3488 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3489 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3490 adev->compute_timeout = adev->gfx_timeout; 3491 } 3492 } 3493 3494 return ret; 3495 } 3496 3497 /** 3498 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3499 * 3500 * @adev: amdgpu_device pointer 3501 * 3502 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3503 */ 3504 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3505 { 3506 struct iommu_domain *domain; 3507 3508 domain = iommu_get_domain_for_dev(adev->dev); 3509 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3510 adev->ram_is_direct_mapped = true; 3511 } 3512 3513 static const struct attribute *amdgpu_dev_attributes[] = { 3514 &dev_attr_product_name.attr, 3515 &dev_attr_product_number.attr, 3516 &dev_attr_serial_number.attr, 3517 &dev_attr_pcie_replay_count.attr, 3518 NULL 3519 }; 3520 3521 /** 3522 * amdgpu_device_init - initialize the driver 3523 * 3524 * @adev: amdgpu_device pointer 3525 * @flags: driver flags 3526 * 3527 * Initializes the driver info and hw (all asics). 3528 * Returns 0 for success or an error on failure. 3529 * Called at driver startup. 
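 *
 * Roughly, the code below sets up locks, work items and default register
 * accessors, maps the MMIO BAR, creates the reset domain, runs the
 * early/sw/hw IP init passes, posts the asic and initializes clocks when
 * needed, and finally registers the sysfs, vga_switcheroo and PMU
 * interfaces.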
3530 */ 3531 int amdgpu_device_init(struct amdgpu_device *adev, 3532 uint32_t flags) 3533 { 3534 struct drm_device *ddev = adev_to_drm(adev); 3535 struct pci_dev *pdev = adev->pdev; 3536 int r, i; 3537 bool px = false; 3538 u32 max_MBps; 3539 3540 adev->shutdown = false; 3541 adev->flags = flags; 3542 3543 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3544 adev->asic_type = amdgpu_force_asic_type; 3545 else 3546 adev->asic_type = flags & AMD_ASIC_MASK; 3547 3548 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3549 if (amdgpu_emu_mode == 1) 3550 adev->usec_timeout *= 10; 3551 adev->gmc.gart_size = 512 * 1024 * 1024; 3552 adev->accel_working = false; 3553 adev->num_rings = 0; 3554 adev->mman.buffer_funcs = NULL; 3555 adev->mman.buffer_funcs_ring = NULL; 3556 adev->vm_manager.vm_pte_funcs = NULL; 3557 adev->vm_manager.vm_pte_num_scheds = 0; 3558 adev->gmc.gmc_funcs = NULL; 3559 adev->harvest_ip_mask = 0x0; 3560 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3561 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3562 3563 adev->smc_rreg = &amdgpu_invalid_rreg; 3564 adev->smc_wreg = &amdgpu_invalid_wreg; 3565 adev->pcie_rreg = &amdgpu_invalid_rreg; 3566 adev->pcie_wreg = &amdgpu_invalid_wreg; 3567 adev->pciep_rreg = &amdgpu_invalid_rreg; 3568 adev->pciep_wreg = &amdgpu_invalid_wreg; 3569 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3570 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3571 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3572 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3573 adev->didt_rreg = &amdgpu_invalid_rreg; 3574 adev->didt_wreg = &amdgpu_invalid_wreg; 3575 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3576 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3577 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3578 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3579 3580 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3581 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3582 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3583 3584 /* mutex initialization are all done here so we 3585 * can recall function without having locking issues */ 3586 mutex_init(&adev->firmware.mutex); 3587 mutex_init(&adev->pm.mutex); 3588 mutex_init(&adev->gfx.gpu_clock_mutex); 3589 mutex_init(&adev->srbm_mutex); 3590 mutex_init(&adev->gfx.pipe_reserve_mutex); 3591 mutex_init(&adev->gfx.gfx_off_mutex); 3592 mutex_init(&adev->grbm_idx_mutex); 3593 mutex_init(&adev->mn_lock); 3594 mutex_init(&adev->virt.vf_errors.lock); 3595 hash_init(adev->mn_hash); 3596 mutex_init(&adev->psp.mutex); 3597 mutex_init(&adev->notifier_lock); 3598 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3599 mutex_init(&adev->benchmark_mutex); 3600 3601 amdgpu_device_init_apu_flags(adev); 3602 3603 r = amdgpu_device_check_arguments(adev); 3604 if (r) 3605 return r; 3606 3607 spin_lock_init(&adev->mmio_idx_lock); 3608 spin_lock_init(&adev->smc_idx_lock); 3609 spin_lock_init(&adev->pcie_idx_lock); 3610 spin_lock_init(&adev->uvd_ctx_idx_lock); 3611 spin_lock_init(&adev->didt_idx_lock); 3612 spin_lock_init(&adev->gc_cac_idx_lock); 3613 spin_lock_init(&adev->se_cac_idx_lock); 3614 spin_lock_init(&adev->audio_endpt_idx_lock); 3615 spin_lock_init(&adev->mm_stats.lock); 3616 3617 INIT_LIST_HEAD(&adev->shadow_list); 3618 mutex_init(&adev->shadow_list_lock); 3619 3620 INIT_LIST_HEAD(&adev->reset_list); 3621 3622 INIT_LIST_HEAD(&adev->ras_list); 3623 3624 INIT_DELAYED_WORK(&adev->delayed_init_work, 3625 
amdgpu_device_delayed_init_work_handler); 3626 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3627 amdgpu_device_delay_enable_gfx_off); 3628 3629 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3630 3631 adev->gfx.gfx_off_req_count = 1; 3632 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3633 3634 atomic_set(&adev->throttling_logging_enabled, 1); 3635 /* 3636 * If throttling continues, logging will be performed every minute 3637 * to avoid log flooding. "-1" is subtracted since the thermal 3638 * throttling interrupt comes every second. Thus, the total logging 3639 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 3640 * for throttling interrupt) = 60 seconds. 3641 */ 3642 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3643 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3644 3645 /* Registers mapping */ 3646 /* TODO: block userspace mapping of io register */ 3647 if (adev->asic_type >= CHIP_BONAIRE) { 3648 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3649 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3650 } else { 3651 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3652 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3653 } 3654 3655 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 3656 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 3657 3658 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3659 if (adev->rmmio == NULL) { 3660 return -ENOMEM; 3661 } 3662 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3663 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3664 3665 amdgpu_device_get_pcie_info(adev); 3666 3667 if (amdgpu_mcbp) 3668 DRM_INFO("MCBP is enabled\n"); 3669 3670 if (adev->asic_type >= CHIP_NAVI10) { 3671 if (amdgpu_mes || amdgpu_mes_kiq) 3672 adev->enable_mes = true; 3673 3674 if (amdgpu_mes_kiq) 3675 adev->enable_mes_kiq = true; 3676 } 3677 3678 /* 3679 * The reset domain needs to be present early, before the XGMI hive is 3680 * discovered (if any) and initialized, so that the reset semaphore and 3681 * in-GPU-reset flag can be used early during init and before calling RREG32.
3682 */ 3683 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3684 if (!adev->reset_domain) 3685 return -ENOMEM; 3686 3687 /* detect hw virtualization here */ 3688 amdgpu_detect_virtualization(adev); 3689 3690 r = amdgpu_device_get_job_timeout_settings(adev); 3691 if (r) { 3692 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3693 return r; 3694 } 3695 3696 /* early init functions */ 3697 r = amdgpu_device_ip_early_init(adev); 3698 if (r) 3699 return r; 3700 3701 /* Enable TMZ based on IP_VERSION */ 3702 amdgpu_gmc_tmz_set(adev); 3703 3704 amdgpu_gmc_noretry_set(adev); 3705 /* Need to get xgmi info early to decide the reset behavior*/ 3706 if (adev->gmc.xgmi.supported) { 3707 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3708 if (r) 3709 return r; 3710 } 3711 3712 /* enable PCIE atomic ops */ 3713 if (amdgpu_sriov_vf(adev)) 3714 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3715 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3716 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3717 else 3718 adev->have_atomics_support = 3719 !pci_enable_atomic_ops_to_root(adev->pdev, 3720 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3721 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3722 if (!adev->have_atomics_support) 3723 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3724 3725 /* doorbell bar mapping and doorbell index init*/ 3726 amdgpu_device_doorbell_init(adev); 3727 3728 if (amdgpu_emu_mode == 1) { 3729 /* post the asic on emulation mode */ 3730 emu_soc_asic_init(adev); 3731 goto fence_driver_init; 3732 } 3733 3734 amdgpu_reset_init(adev); 3735 3736 /* detect if we are with an SRIOV vbios */ 3737 amdgpu_device_detect_sriov_bios(adev); 3738 3739 /* check if we need to reset the asic 3740 * E.g., driver was not cleanly unloaded previously, etc. 
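 *
 * For boards in an XGMI hive the reset is deferred (pending_reset is set)
 * and only the blocks the SMU needs (GMC, COMMON, IH, SMC) go through
 * hw_init below; standalone boards are reset immediately via
 * amdgpu_asic_reset().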
3741 */ 3742 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3743 if (adev->gmc.xgmi.num_physical_nodes) { 3744 dev_info(adev->dev, "Pending hive reset.\n"); 3745 adev->gmc.xgmi.pending_reset = true; 3746 /* Only need to init necessary block for SMU to handle the reset */ 3747 for (i = 0; i < adev->num_ip_blocks; i++) { 3748 if (!adev->ip_blocks[i].status.valid) 3749 continue; 3750 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3751 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3752 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3753 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3754 DRM_DEBUG("IP %s disabled for hw_init.\n", 3755 adev->ip_blocks[i].version->funcs->name); 3756 adev->ip_blocks[i].status.hw = true; 3757 } 3758 } 3759 } else { 3760 r = amdgpu_asic_reset(adev); 3761 if (r) { 3762 dev_err(adev->dev, "asic reset on init failed\n"); 3763 goto failed; 3764 } 3765 } 3766 } 3767 3768 pci_enable_pcie_error_reporting(adev->pdev); 3769 3770 /* Post card if necessary */ 3771 if (amdgpu_device_need_post(adev)) { 3772 if (!adev->bios) { 3773 dev_err(adev->dev, "no vBIOS found\n"); 3774 r = -EINVAL; 3775 goto failed; 3776 } 3777 DRM_INFO("GPU posting now...\n"); 3778 r = amdgpu_device_asic_init(adev); 3779 if (r) { 3780 dev_err(adev->dev, "gpu post error!\n"); 3781 goto failed; 3782 } 3783 } 3784 3785 if (adev->is_atom_fw) { 3786 /* Initialize clocks */ 3787 r = amdgpu_atomfirmware_get_clock_info(adev); 3788 if (r) { 3789 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3790 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3791 goto failed; 3792 } 3793 } else { 3794 /* Initialize clocks */ 3795 r = amdgpu_atombios_get_clock_info(adev); 3796 if (r) { 3797 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3798 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3799 goto failed; 3800 } 3801 /* init i2c buses */ 3802 if (!amdgpu_device_has_dc_support(adev)) 3803 amdgpu_atombios_i2c_init(adev); 3804 } 3805 3806 fence_driver_init: 3807 /* Fence driver */ 3808 r = amdgpu_fence_driver_sw_init(adev); 3809 if (r) { 3810 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3811 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3812 goto failed; 3813 } 3814 3815 /* init the mode config */ 3816 drm_mode_config_init(adev_to_drm(adev)); 3817 3818 r = amdgpu_device_ip_init(adev); 3819 if (r) { 3820 /* failed in exclusive mode due to timeout */ 3821 if (amdgpu_sriov_vf(adev) && 3822 !amdgpu_sriov_runtime(adev) && 3823 amdgpu_virt_mmio_blocked(adev) && 3824 !amdgpu_virt_wait_reset(adev)) { 3825 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3826 /* Don't send request since VF is inactive. */ 3827 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3828 adev->virt.ops = NULL; 3829 r = -EAGAIN; 3830 goto release_ras_con; 3831 } 3832 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3833 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3834 goto release_ras_con; 3835 } 3836 3837 amdgpu_fence_driver_hw_init(adev); 3838 3839 dev_info(adev->dev, 3840 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3841 adev->gfx.config.max_shader_engines, 3842 adev->gfx.config.max_sh_per_se, 3843 adev->gfx.config.max_cu_per_sh, 3844 adev->gfx.cu_info.number); 3845 3846 adev->accel_working = true; 3847 3848 amdgpu_vm_check_compute_bug(adev); 3849 3850 /* Initialize the buffer migration limit. 
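 * A non-negative amdgpu.moverate module parameter is used as the limit in
 * MB/s, otherwise the 8 MB/s default below applies; the value is stored as
 * a log2 so later divisions stay cheap.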
*/ 3851 if (amdgpu_moverate >= 0) 3852 max_MBps = amdgpu_moverate; 3853 else 3854 max_MBps = 8; /* Allow 8 MB/s. */ 3855 /* Get a log2 for easy divisions. */ 3856 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3857 3858 r = amdgpu_pm_sysfs_init(adev); 3859 if (r) { 3860 adev->pm_sysfs_en = false; 3861 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3862 } else 3863 adev->pm_sysfs_en = true; 3864 3865 r = amdgpu_ucode_sysfs_init(adev); 3866 if (r) { 3867 adev->ucode_sysfs_en = false; 3868 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3869 } else 3870 adev->ucode_sysfs_en = true; 3871 3872 r = amdgpu_psp_sysfs_init(adev); 3873 if (r) { 3874 adev->psp_sysfs_en = false; 3875 if (!amdgpu_sriov_vf(adev)) 3876 DRM_ERROR("Creating psp sysfs failed\n"); 3877 } else 3878 adev->psp_sysfs_en = true; 3879 3880 /* 3881 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3882 * Otherwise the mgpu fan boost feature will be skipped due to the 3883 * gpu instance is counted less. 3884 */ 3885 amdgpu_register_gpu_instance(adev); 3886 3887 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3888 * explicit gating rather than handling it automatically. 3889 */ 3890 if (!adev->gmc.xgmi.pending_reset) { 3891 r = amdgpu_device_ip_late_init(adev); 3892 if (r) { 3893 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3894 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3895 goto release_ras_con; 3896 } 3897 /* must succeed. */ 3898 amdgpu_ras_resume(adev); 3899 queue_delayed_work(system_wq, &adev->delayed_init_work, 3900 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3901 } 3902 3903 if (amdgpu_sriov_vf(adev)) 3904 flush_delayed_work(&adev->delayed_init_work); 3905 3906 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3907 if (r) 3908 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3909 3910 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3911 r = amdgpu_pmu_init(adev); 3912 if (r) 3913 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3914 3915 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3916 if (amdgpu_device_cache_pci_state(adev->pdev)) 3917 pci_restore_state(pdev); 3918 3919 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3920 /* this will fail for cards that aren't VGA class devices, just 3921 * ignore it */ 3922 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3923 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 3924 3925 if (amdgpu_device_supports_px(ddev)) { 3926 px = true; 3927 vga_switcheroo_register_client(adev->pdev, 3928 &amdgpu_switcheroo_ops, px); 3929 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3930 } 3931 3932 if (adev->gmc.xgmi.pending_reset) 3933 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3934 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3935 3936 amdgpu_device_check_iommu_direct_map(adev); 3937 3938 return 0; 3939 3940 release_ras_con: 3941 amdgpu_release_ras_context(adev); 3942 3943 failed: 3944 amdgpu_vf_error_trans_all(adev); 3945 3946 return r; 3947 } 3948 3949 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 3950 { 3951 3952 /* Clear all CPU mappings pointing to this device */ 3953 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 3954 3955 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 3956 amdgpu_device_doorbell_fini(adev); 3957 3958 iounmap(adev->rmmio); 3959 adev->rmmio = NULL; 3960 if (adev->mman.aper_base_kaddr) 3961 
iounmap(adev->mman.aper_base_kaddr); 3962 adev->mman.aper_base_kaddr = NULL; 3963 3964 /* Memory manager related */ 3965 if (!adev->gmc.xgmi.connected_to_cpu) { 3966 arch_phys_wc_del(adev->gmc.vram_mtrr); 3967 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 3968 } 3969 } 3970 3971 /** 3972 * amdgpu_device_fini_hw - tear down the driver 3973 * 3974 * @adev: amdgpu_device pointer 3975 * 3976 * Tear down the driver info (all asics). 3977 * Called at driver shutdown. 3978 */ 3979 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 3980 { 3981 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3982 flush_delayed_work(&adev->delayed_init_work); 3983 adev->shutdown = true; 3984 3985 /* make sure IB test finished before entering exclusive mode 3986 * to avoid preemption on IB test 3987 * */ 3988 if (amdgpu_sriov_vf(adev)) { 3989 amdgpu_virt_request_full_gpu(adev, false); 3990 amdgpu_virt_fini_data_exchange(adev); 3991 } 3992 3993 /* disable all interrupts */ 3994 amdgpu_irq_disable_all(adev); 3995 if (adev->mode_info.mode_config_initialized){ 3996 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 3997 drm_helper_force_disable_all(adev_to_drm(adev)); 3998 else 3999 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4000 } 4001 amdgpu_fence_driver_hw_fini(adev); 4002 4003 if (adev->mman.initialized) { 4004 flush_delayed_work(&adev->mman.bdev.wq); 4005 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); 4006 } 4007 4008 if (adev->pm_sysfs_en) 4009 amdgpu_pm_sysfs_fini(adev); 4010 if (adev->ucode_sysfs_en) 4011 amdgpu_ucode_sysfs_fini(adev); 4012 if (adev->psp_sysfs_en) 4013 amdgpu_psp_sysfs_fini(adev); 4014 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4015 4016 /* disable ras feature must before hw fini */ 4017 amdgpu_ras_pre_fini(adev); 4018 4019 amdgpu_device_ip_fini_early(adev); 4020 4021 amdgpu_irq_fini_hw(adev); 4022 4023 if (adev->mman.initialized) 4024 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4025 4026 amdgpu_gart_dummy_page_fini(adev); 4027 4028 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4029 amdgpu_device_unmap_mmio(adev); 4030 4031 } 4032 4033 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4034 { 4035 int idx; 4036 4037 amdgpu_fence_driver_sw_fini(adev); 4038 amdgpu_device_ip_fini(adev); 4039 release_firmware(adev->firmware.gpu_info_fw); 4040 adev->firmware.gpu_info_fw = NULL; 4041 adev->accel_working = false; 4042 4043 amdgpu_reset_fini(adev); 4044 4045 /* free i2c buses */ 4046 if (!amdgpu_device_has_dc_support(adev)) 4047 amdgpu_i2c_fini(adev); 4048 4049 if (amdgpu_emu_mode != 1) 4050 amdgpu_atombios_fini(adev); 4051 4052 kfree(adev->bios); 4053 adev->bios = NULL; 4054 if (amdgpu_device_supports_px(adev_to_drm(adev))) { 4055 vga_switcheroo_unregister_client(adev->pdev); 4056 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4057 } 4058 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4059 vga_client_unregister(adev->pdev); 4060 4061 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4062 4063 iounmap(adev->rmmio); 4064 adev->rmmio = NULL; 4065 amdgpu_device_doorbell_fini(adev); 4066 drm_dev_exit(idx); 4067 } 4068 4069 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4070 amdgpu_pmu_fini(adev); 4071 if (adev->mman.discovery_bin) 4072 amdgpu_discovery_fini(adev); 4073 4074 amdgpu_reset_put_reset_domain(adev->reset_domain); 4075 adev->reset_domain = NULL; 4076 4077 kfree(adev->pci_state); 4078 4079 } 4080 4081 /** 4082 * amdgpu_device_evict_resources - evict device resources 4083 * @adev: amdgpu device object 4084 * 4085 * Evicts all ttm device 
resources(vram BOs, gart table) from the lru list 4086 * of the vram memory type. Mainly used for evicting device resources 4087 * at suspend time. 4088 * 4089 */ 4090 static void amdgpu_device_evict_resources(struct amdgpu_device *adev) 4091 { 4092 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4093 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4094 return; 4095 4096 if (amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM)) 4097 DRM_WARN("evicting device resources failed\n"); 4098 4099 } 4100 4101 /* 4102 * Suspend & resume. 4103 */ 4104 /** 4105 * amdgpu_device_suspend - initiate device suspend 4106 * 4107 * @dev: drm dev pointer 4108 * @fbcon : notify the fbdev of suspend 4109 * 4110 * Puts the hw in the suspend state (all asics). 4111 * Returns 0 for success or an error on failure. 4112 * Called at driver suspend. 4113 */ 4114 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4115 { 4116 struct amdgpu_device *adev = drm_to_adev(dev); 4117 4118 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4119 return 0; 4120 4121 adev->in_suspend = true; 4122 4123 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4124 DRM_WARN("smart shift update failed\n"); 4125 4126 drm_kms_helper_poll_disable(dev); 4127 4128 if (fbcon) 4129 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4130 4131 cancel_delayed_work_sync(&adev->delayed_init_work); 4132 4133 amdgpu_ras_suspend(adev); 4134 4135 amdgpu_device_ip_suspend_phase1(adev); 4136 4137 if (!adev->in_s0ix) 4138 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4139 4140 amdgpu_device_evict_resources(adev); 4141 4142 amdgpu_fence_driver_hw_fini(adev); 4143 4144 amdgpu_device_ip_suspend_phase2(adev); 4145 4146 return 0; 4147 } 4148 4149 /** 4150 * amdgpu_device_resume - initiate device resume 4151 * 4152 * @dev: drm dev pointer 4153 * @fbcon : notify the fbdev of resume 4154 * 4155 * Bring the hw back to operating state (all asics). 4156 * Returns 0 for success or an error on failure. 4157 * Called at driver resume. 4158 */ 4159 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4160 { 4161 struct amdgpu_device *adev = drm_to_adev(dev); 4162 int r = 0; 4163 4164 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4165 return 0; 4166 4167 if (adev->in_s0ix) 4168 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4169 4170 /* post card */ 4171 if (amdgpu_device_need_post(adev)) { 4172 r = amdgpu_device_asic_init(adev); 4173 if (r) 4174 dev_err(adev->dev, "amdgpu asic init failed\n"); 4175 } 4176 4177 r = amdgpu_device_ip_resume(adev); 4178 if (r) { 4179 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4180 return r; 4181 } 4182 amdgpu_fence_driver_hw_init(adev); 4183 4184 r = amdgpu_device_ip_late_init(adev); 4185 if (r) 4186 return r; 4187 4188 queue_delayed_work(system_wq, &adev->delayed_init_work, 4189 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4190 4191 if (!adev->in_s0ix) { 4192 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4193 if (r) 4194 return r; 4195 } 4196 4197 /* Make sure IB tests flushed */ 4198 flush_delayed_work(&adev->delayed_init_work); 4199 4200 if (fbcon) 4201 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4202 4203 drm_kms_helper_poll_enable(dev); 4204 4205 amdgpu_ras_resume(adev); 4206 4207 /* 4208 * Most of the connector probing functions try to acquire runtime pm 4209 * refs to ensure that the GPU is powered on when connector polling is 4210 * performed. 
Since we're calling this from a runtime PM callback, 4211 * trying to acquire rpm refs will cause us to deadlock. 4212 * 4213 * Since we're guaranteed to be holding the rpm lock, it's safe to 4214 * temporarily disable the rpm helpers so this doesn't deadlock us. 4215 */ 4216 #ifdef CONFIG_PM 4217 dev->dev->power.disable_depth++; 4218 #endif 4219 if (!amdgpu_device_has_dc_support(adev)) 4220 drm_helper_hpd_irq_event(dev); 4221 else 4222 drm_kms_helper_hotplug_event(dev); 4223 #ifdef CONFIG_PM 4224 dev->dev->power.disable_depth--; 4225 #endif 4226 adev->in_suspend = false; 4227 4228 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4229 DRM_WARN("smart shift update failed\n"); 4230 4231 return 0; 4232 } 4233 4234 /** 4235 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4236 * 4237 * @adev: amdgpu_device pointer 4238 * 4239 * The list of all the hardware IPs that make up the asic is walked and 4240 * the check_soft_reset callbacks are run. check_soft_reset determines 4241 * if the asic is still hung or not. 4242 * Returns true if any of the IPs are still in a hung state, false if not. 4243 */ 4244 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4245 { 4246 int i; 4247 bool asic_hang = false; 4248 4249 if (amdgpu_sriov_vf(adev)) 4250 return true; 4251 4252 if (amdgpu_asic_need_full_reset(adev)) 4253 return true; 4254 4255 for (i = 0; i < adev->num_ip_blocks; i++) { 4256 if (!adev->ip_blocks[i].status.valid) 4257 continue; 4258 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4259 adev->ip_blocks[i].status.hang = 4260 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4261 if (adev->ip_blocks[i].status.hang) { 4262 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4263 asic_hang = true; 4264 } 4265 } 4266 return asic_hang; 4267 } 4268 4269 /** 4270 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4271 * 4272 * @adev: amdgpu_device pointer 4273 * 4274 * The list of all the hardware IPs that make up the asic is walked and the 4275 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4276 * handles any IP specific hardware or software state changes that are 4277 * necessary for a soft reset to succeed. 4278 * Returns 0 on success, negative error code on failure. 4279 */ 4280 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4281 { 4282 int i, r = 0; 4283 4284 for (i = 0; i < adev->num_ip_blocks; i++) { 4285 if (!adev->ip_blocks[i].status.valid) 4286 continue; 4287 if (adev->ip_blocks[i].status.hang && 4288 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4289 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4290 if (r) 4291 return r; 4292 } 4293 } 4294 4295 return 0; 4296 } 4297 4298 /** 4299 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4300 * 4301 * @adev: amdgpu_device pointer 4302 * 4303 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4304 * reset is necessary to recover. 4305 * Returns true if a full asic reset is required, false if not. 
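 *
 * In this driver a hang in the GMC, SMC, ACP, DCE or PSP block is treated
 * as needing a full reset, since those blocks cannot be soft reset.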
4306 */ 4307 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4308 { 4309 int i; 4310 4311 if (amdgpu_asic_need_full_reset(adev)) 4312 return true; 4313 4314 for (i = 0; i < adev->num_ip_blocks; i++) { 4315 if (!adev->ip_blocks[i].status.valid) 4316 continue; 4317 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4318 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4319 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4320 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4321 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4322 if (adev->ip_blocks[i].status.hang) { 4323 dev_info(adev->dev, "Some block need full reset!\n"); 4324 return true; 4325 } 4326 } 4327 } 4328 return false; 4329 } 4330 4331 /** 4332 * amdgpu_device_ip_soft_reset - do a soft reset 4333 * 4334 * @adev: amdgpu_device pointer 4335 * 4336 * The list of all the hardware IPs that make up the asic is walked and the 4337 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4338 * IP specific hardware or software state changes that are necessary to soft 4339 * reset the IP. 4340 * Returns 0 on success, negative error code on failure. 4341 */ 4342 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4343 { 4344 int i, r = 0; 4345 4346 for (i = 0; i < adev->num_ip_blocks; i++) { 4347 if (!adev->ip_blocks[i].status.valid) 4348 continue; 4349 if (adev->ip_blocks[i].status.hang && 4350 adev->ip_blocks[i].version->funcs->soft_reset) { 4351 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4352 if (r) 4353 return r; 4354 } 4355 } 4356 4357 return 0; 4358 } 4359 4360 /** 4361 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4362 * 4363 * @adev: amdgpu_device pointer 4364 * 4365 * The list of all the hardware IPs that make up the asic is walked and the 4366 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4367 * handles any IP specific hardware or software state changes that are 4368 * necessary after the IP has been soft reset. 4369 * Returns 0 on success, negative error code on failure. 4370 */ 4371 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4372 { 4373 int i, r = 0; 4374 4375 for (i = 0; i < adev->num_ip_blocks; i++) { 4376 if (!adev->ip_blocks[i].status.valid) 4377 continue; 4378 if (adev->ip_blocks[i].status.hang && 4379 adev->ip_blocks[i].version->funcs->post_soft_reset) 4380 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4381 if (r) 4382 return r; 4383 } 4384 4385 return 0; 4386 } 4387 4388 /** 4389 * amdgpu_device_recover_vram - Recover some VRAM contents 4390 * 4391 * @adev: amdgpu_device pointer 4392 * 4393 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4394 * restore things like GPUVM page tables after a GPU reset where 4395 * the contents of VRAM might be lost. 4396 * 4397 * Returns: 4398 * 0 on success, negative error code on failure. 
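 *
 * The shadow list in @adev is walked under shadow_list_lock and each
 * restore fence is waited on, with a longer per-fence timeout under SR-IOV
 * runtime (8 seconds) than otherwise (100 ms).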
4399 */ 4400 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4401 { 4402 struct dma_fence *fence = NULL, *next = NULL; 4403 struct amdgpu_bo *shadow; 4404 struct amdgpu_bo_vm *vmbo; 4405 long r = 1, tmo; 4406 4407 if (amdgpu_sriov_runtime(adev)) 4408 tmo = msecs_to_jiffies(8000); 4409 else 4410 tmo = msecs_to_jiffies(100); 4411 4412 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4413 mutex_lock(&adev->shadow_list_lock); 4414 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4415 shadow = &vmbo->bo; 4416 /* No need to recover an evicted BO */ 4417 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4418 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4419 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4420 continue; 4421 4422 r = amdgpu_bo_restore_shadow(shadow, &next); 4423 if (r) 4424 break; 4425 4426 if (fence) { 4427 tmo = dma_fence_wait_timeout(fence, false, tmo); 4428 dma_fence_put(fence); 4429 fence = next; 4430 if (tmo == 0) { 4431 r = -ETIMEDOUT; 4432 break; 4433 } else if (tmo < 0) { 4434 r = tmo; 4435 break; 4436 } 4437 } else { 4438 fence = next; 4439 } 4440 } 4441 mutex_unlock(&adev->shadow_list_lock); 4442 4443 if (fence) 4444 tmo = dma_fence_wait_timeout(fence, false, tmo); 4445 dma_fence_put(fence); 4446 4447 if (r < 0 || tmo <= 0) { 4448 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4449 return -EIO; 4450 } 4451 4452 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4453 return 0; 4454 } 4455 4456 4457 /** 4458 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4459 * 4460 * @adev: amdgpu_device pointer 4461 * @from_hypervisor: request from hypervisor 4462 * 4463 * do VF FLR and reinitialize Asic 4464 * return 0 means succeeded otherwise failed 4465 */ 4466 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4467 bool from_hypervisor) 4468 { 4469 int r; 4470 struct amdgpu_hive_info *hive = NULL; 4471 int retry_limit = 0; 4472 4473 retry: 4474 amdgpu_amdkfd_pre_reset(adev); 4475 4476 amdgpu_amdkfd_pre_reset(adev); 4477 4478 if (from_hypervisor) 4479 r = amdgpu_virt_request_full_gpu(adev, true); 4480 else 4481 r = amdgpu_virt_reset_gpu(adev); 4482 if (r) 4483 return r; 4484 4485 /* Resume IP prior to SMC */ 4486 r = amdgpu_device_ip_reinit_early_sriov(adev); 4487 if (r) 4488 goto error; 4489 4490 amdgpu_virt_init_data_exchange(adev); 4491 4492 r = amdgpu_device_fw_loading(adev); 4493 if (r) 4494 return r; 4495 4496 /* now we are okay to resume SMC/CP/SDMA */ 4497 r = amdgpu_device_ip_reinit_late_sriov(adev); 4498 if (r) 4499 goto error; 4500 4501 hive = amdgpu_get_xgmi_hive(adev); 4502 /* Update PSP FW topology after reset */ 4503 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4504 r = amdgpu_xgmi_update_topology(hive, adev); 4505 4506 if (hive) 4507 amdgpu_put_xgmi_hive(hive); 4508 4509 if (!r) { 4510 amdgpu_irq_gpu_reset_resume_helper(adev); 4511 r = amdgpu_ib_ring_tests(adev); 4512 4513 amdgpu_amdkfd_post_reset(adev); 4514 } 4515 4516 error: 4517 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4518 amdgpu_inc_vram_lost(adev); 4519 r = amdgpu_device_recover_vram(adev); 4520 } 4521 amdgpu_virt_release_full_gpu(adev, true); 4522 4523 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4524 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4525 retry_limit++; 4526 goto retry; 4527 } else 4528 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4529 } 4530 4531 return r; 4532 } 4533 4534 /** 4535 * amdgpu_device_has_job_running - check if 
there is any job in mirror list 4536 * 4537 * @adev: amdgpu_device pointer 4538 * 4539 * check if there is any job in mirror list 4540 */ 4541 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4542 { 4543 int i; 4544 struct drm_sched_job *job; 4545 4546 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4547 struct amdgpu_ring *ring = adev->rings[i]; 4548 4549 if (!ring || !ring->sched.thread) 4550 continue; 4551 4552 spin_lock(&ring->sched.job_list_lock); 4553 job = list_first_entry_or_null(&ring->sched.pending_list, 4554 struct drm_sched_job, list); 4555 spin_unlock(&ring->sched.job_list_lock); 4556 if (job) 4557 return true; 4558 } 4559 return false; 4560 } 4561 4562 /** 4563 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4564 * 4565 * @adev: amdgpu_device pointer 4566 * 4567 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4568 * a hung GPU. 4569 */ 4570 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4571 { 4572 if (!amdgpu_device_ip_check_soft_reset(adev)) { 4573 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n"); 4574 return false; 4575 } 4576 4577 if (amdgpu_gpu_recovery == 0) 4578 goto disabled; 4579 4580 if (amdgpu_sriov_vf(adev)) 4581 return true; 4582 4583 if (amdgpu_gpu_recovery == -1) { 4584 switch (adev->asic_type) { 4585 #ifdef CONFIG_DRM_AMDGPU_SI 4586 case CHIP_VERDE: 4587 case CHIP_TAHITI: 4588 case CHIP_PITCAIRN: 4589 case CHIP_OLAND: 4590 case CHIP_HAINAN: 4591 #endif 4592 #ifdef CONFIG_DRM_AMDGPU_CIK 4593 case CHIP_KAVERI: 4594 case CHIP_KABINI: 4595 case CHIP_MULLINS: 4596 #endif 4597 case CHIP_CARRIZO: 4598 case CHIP_STONEY: 4599 case CHIP_CYAN_SKILLFISH: 4600 goto disabled; 4601 default: 4602 break; 4603 } 4604 } 4605 4606 return true; 4607 4608 disabled: 4609 dev_info(adev->dev, "GPU recovery disabled.\n"); 4610 return false; 4611 } 4612 4613 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4614 { 4615 u32 i; 4616 int ret = 0; 4617 4618 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4619 4620 dev_info(adev->dev, "GPU mode1 reset\n"); 4621 4622 /* disable BM */ 4623 pci_clear_master(adev->pdev); 4624 4625 amdgpu_device_cache_pci_state(adev->pdev); 4626 4627 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4628 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4629 ret = amdgpu_dpm_mode1_reset(adev); 4630 } else { 4631 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4632 ret = psp_gpu_reset(adev); 4633 } 4634 4635 if (ret) 4636 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4637 4638 amdgpu_device_load_pci_state(adev->pdev); 4639 4640 /* wait for asic to come out of reset */ 4641 for (i = 0; i < adev->usec_timeout; i++) { 4642 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4643 4644 if (memsize != 0xffffffff) 4645 break; 4646 udelay(1); 4647 } 4648 4649 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4650 return ret; 4651 } 4652 4653 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4654 struct amdgpu_reset_context *reset_context) 4655 { 4656 int i, r = 0; 4657 struct amdgpu_job *job = NULL; 4658 bool need_full_reset = 4659 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4660 4661 if (reset_context->reset_req_dev == adev) 4662 job = reset_context->job; 4663 4664 if (amdgpu_sriov_vf(adev)) { 4665 /* stop the data exchange thread */ 4666 amdgpu_virt_fini_data_exchange(adev); 4667 } 4668 4669 /* block all schedulers and reset given job's ring */ 4670 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4671 struct amdgpu_ring *ring = adev->rings[i]; 
4672 4673 if (!ring || !ring->sched.thread) 4674 continue; 4675 4676 /*clear job fence from fence drv to avoid force_completion 4677 *leave NULL and vm flush fence in fence drv */ 4678 amdgpu_fence_driver_clear_job_fences(ring); 4679 4680 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4681 amdgpu_fence_driver_force_completion(ring); 4682 } 4683 4684 if (job && job->vm) 4685 drm_sched_increase_karma(&job->base); 4686 4687 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4688 /* If reset handler not implemented, continue; otherwise return */ 4689 if (r == -ENOSYS) 4690 r = 0; 4691 else 4692 return r; 4693 4694 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4695 if (!amdgpu_sriov_vf(adev)) { 4696 4697 if (!need_full_reset) 4698 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4699 4700 if (!need_full_reset) { 4701 amdgpu_device_ip_pre_soft_reset(adev); 4702 r = amdgpu_device_ip_soft_reset(adev); 4703 amdgpu_device_ip_post_soft_reset(adev); 4704 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4705 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4706 need_full_reset = true; 4707 } 4708 } 4709 4710 if (need_full_reset) 4711 r = amdgpu_device_ip_suspend(adev); 4712 if (need_full_reset) 4713 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4714 else 4715 clear_bit(AMDGPU_NEED_FULL_RESET, 4716 &reset_context->flags); 4717 } 4718 4719 return r; 4720 } 4721 4722 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 4723 { 4724 uint32_t reg_value; 4725 int i; 4726 4727 lockdep_assert_held(&adev->reset_domain->sem); 4728 dump_stack(); 4729 4730 for (i = 0; i < adev->num_regs; i++) { 4731 reg_value = RREG32(adev->reset_dump_reg_list[i]); 4732 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], reg_value); 4733 } 4734 4735 return 0; 4736 } 4737 4738 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4739 struct amdgpu_reset_context *reset_context) 4740 { 4741 struct amdgpu_device *tmp_adev = NULL; 4742 bool need_full_reset, skip_hw_reset, vram_lost = false; 4743 int r = 0; 4744 4745 /* Try reset handler method first */ 4746 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4747 reset_list); 4748 amdgpu_reset_reg_dumps(tmp_adev); 4749 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4750 /* If reset handler not implemented, continue; otherwise return */ 4751 if (r == -ENOSYS) 4752 r = 0; 4753 else 4754 return r; 4755 4756 /* Reset handler not implemented, use the default method */ 4757 need_full_reset = 4758 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4759 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4760 4761 /* 4762 * ASIC reset has to be done on all XGMI hive nodes ASAP 4763 * to allow proper links negotiation in FW (within 1 sec) 4764 */ 4765 if (!skip_hw_reset && need_full_reset) { 4766 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4767 /* For XGMI run all resets in parallel to speed up the process */ 4768 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4769 tmp_adev->gmc.xgmi.pending_reset = false; 4770 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4771 r = -EALREADY; 4772 } else 4773 r = amdgpu_asic_reset(tmp_adev); 4774 4775 if (r) { 4776 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4777 r, adev_to_drm(tmp_adev)->unique); 4778 break; 4779 } 4780 } 4781 4782 /* For XGMI wait for all resets to complete before proceed */ 4783 if 
(!r) { 4784 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4785 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4786 flush_work(&tmp_adev->xgmi_reset_work); 4787 r = tmp_adev->asic_reset_res; 4788 if (r) 4789 break; 4790 } 4791 } 4792 } 4793 } 4794 4795 if (!r && amdgpu_ras_intr_triggered()) { 4796 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4797 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && 4798 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 4799 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 4800 } 4801 4802 amdgpu_ras_intr_cleared(); 4803 } 4804 4805 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4806 if (need_full_reset) { 4807 /* post card */ 4808 r = amdgpu_device_asic_init(tmp_adev); 4809 if (r) { 4810 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4811 } else { 4812 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4813 r = amdgpu_amdkfd_resume_iommu(tmp_adev); 4814 if (r) 4815 goto out; 4816 4817 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4818 if (r) 4819 goto out; 4820 4821 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4822 if (vram_lost) { 4823 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4824 amdgpu_inc_vram_lost(tmp_adev); 4825 } 4826 4827 r = amdgpu_device_fw_loading(tmp_adev); 4828 if (r) 4829 return r; 4830 4831 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4832 if (r) 4833 goto out; 4834 4835 if (vram_lost) 4836 amdgpu_device_fill_reset_magic(tmp_adev); 4837 4838 /* 4839 * Add this ASIC as tracked as reset was already 4840 * complete successfully. 4841 */ 4842 amdgpu_register_gpu_instance(tmp_adev); 4843 4844 if (!reset_context->hive && 4845 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4846 amdgpu_xgmi_add_device(tmp_adev); 4847 4848 r = amdgpu_device_ip_late_init(tmp_adev); 4849 if (r) 4850 goto out; 4851 4852 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 4853 4854 /* 4855 * The GPU enters bad state once faulty pages 4856 * by ECC has reached the threshold, and ras 4857 * recovery is scheduled next. So add one check 4858 * here to break recovery if it indeed exceeds 4859 * bad page threshold, and remind user to 4860 * retire this GPU or setting one bigger 4861 * bad_page_threshold value to fix this once 4862 * probing driver again. 4863 */ 4864 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 4865 /* must succeed. 
*/ 4866 amdgpu_ras_resume(tmp_adev); 4867 } else { 4868 r = -EINVAL; 4869 goto out; 4870 } 4871 4872 /* Update PSP FW topology after reset */ 4873 if (reset_context->hive && 4874 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4875 r = amdgpu_xgmi_update_topology( 4876 reset_context->hive, tmp_adev); 4877 } 4878 } 4879 4880 out: 4881 if (!r) { 4882 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 4883 r = amdgpu_ib_ring_tests(tmp_adev); 4884 if (r) { 4885 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 4886 need_full_reset = true; 4887 r = -EAGAIN; 4888 goto end; 4889 } 4890 } 4891 4892 if (!r) 4893 r = amdgpu_device_recover_vram(tmp_adev); 4894 else 4895 tmp_adev->asic_reset_res = r; 4896 } 4897 4898 end: 4899 if (need_full_reset) 4900 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4901 else 4902 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4903 return r; 4904 } 4905 4906 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 4907 { 4908 4909 switch (amdgpu_asic_reset_method(adev)) { 4910 case AMD_RESET_METHOD_MODE1: 4911 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 4912 break; 4913 case AMD_RESET_METHOD_MODE2: 4914 adev->mp1_state = PP_MP1_STATE_RESET; 4915 break; 4916 default: 4917 adev->mp1_state = PP_MP1_STATE_NONE; 4918 break; 4919 } 4920 } 4921 4922 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 4923 { 4924 amdgpu_vf_error_trans_all(adev); 4925 adev->mp1_state = PP_MP1_STATE_NONE; 4926 } 4927 4928 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 4929 { 4930 struct pci_dev *p = NULL; 4931 4932 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4933 adev->pdev->bus->number, 1); 4934 if (p) { 4935 pm_runtime_enable(&(p->dev)); 4936 pm_runtime_resume(&(p->dev)); 4937 } 4938 } 4939 4940 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 4941 { 4942 enum amd_reset_method reset_method; 4943 struct pci_dev *p = NULL; 4944 u64 expires; 4945 4946 /* 4947 * For now, only BACO and mode1 reset are confirmed 4948 * to suffer the audio issue without proper suspended. 4949 */ 4950 reset_method = amdgpu_asic_reset_method(adev); 4951 if ((reset_method != AMD_RESET_METHOD_BACO) && 4952 (reset_method != AMD_RESET_METHOD_MODE1)) 4953 return -EINVAL; 4954 4955 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4956 adev->pdev->bus->number, 1); 4957 if (!p) 4958 return -ENODEV; 4959 4960 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 4961 if (!expires) 4962 /* 4963 * If we cannot get the audio device autosuspend delay, 4964 * a fixed 4S interval will be used. Considering 3S is 4965 * the audio controller default autosuspend delay setting. 4966 * 4S used here is guaranteed to cover that. 4967 */ 4968 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 4969 4970 while (!pm_runtime_status_suspended(&(p->dev))) { 4971 if (!pm_runtime_suspend(&(p->dev))) 4972 break; 4973 4974 if (expires < ktime_get_mono_fast_ns()) { 4975 dev_warn(adev->dev, "failed to suspend display audio\n"); 4976 /* TODO: abort the succeeding gpu reset? 
*/ 4977 return -ETIMEDOUT; 4978 } 4979 } 4980 4981 pm_runtime_disable(&(p->dev)); 4982 4983 return 0; 4984 } 4985 4986 static void amdgpu_device_recheck_guilty_jobs( 4987 struct amdgpu_device *adev, struct list_head *device_list_handle, 4988 struct amdgpu_reset_context *reset_context) 4989 { 4990 int i, r = 0; 4991 4992 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4993 struct amdgpu_ring *ring = adev->rings[i]; 4994 int ret = 0; 4995 struct drm_sched_job *s_job; 4996 4997 if (!ring || !ring->sched.thread) 4998 continue; 4999 5000 s_job = list_first_entry_or_null(&ring->sched.pending_list, 5001 struct drm_sched_job, list); 5002 if (s_job == NULL) 5003 continue; 5004 5005 /* clear job's guilty and depend the folowing step to decide the real one */ 5006 drm_sched_reset_karma(s_job); 5007 /* for the real bad job, it will be resubmitted twice, adding a dma_fence_get 5008 * to make sure fence is balanced */ 5009 dma_fence_get(s_job->s_fence->parent); 5010 drm_sched_resubmit_jobs_ext(&ring->sched, 1); 5011 5012 ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout); 5013 if (ret == 0) { /* timeout */ 5014 DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n", 5015 ring->sched.name, s_job->id); 5016 5017 /* set guilty */ 5018 drm_sched_increase_karma(s_job); 5019 retry: 5020 /* do hw reset */ 5021 if (amdgpu_sriov_vf(adev)) { 5022 amdgpu_virt_fini_data_exchange(adev); 5023 r = amdgpu_device_reset_sriov(adev, false); 5024 if (r) 5025 adev->asic_reset_res = r; 5026 } else { 5027 clear_bit(AMDGPU_SKIP_HW_RESET, 5028 &reset_context->flags); 5029 r = amdgpu_do_asic_reset(device_list_handle, 5030 reset_context); 5031 if (r && r == -EAGAIN) 5032 goto retry; 5033 } 5034 5035 /* 5036 * add reset counter so that the following 5037 * resubmitted job could flush vmid 5038 */ 5039 atomic_inc(&adev->gpu_reset_counter); 5040 continue; 5041 } 5042 5043 /* got the hw fence, signal finished fence */ 5044 atomic_dec(ring->sched.score); 5045 dma_fence_put(s_job->s_fence->parent); 5046 dma_fence_get(&s_job->s_fence->finished); 5047 dma_fence_signal(&s_job->s_fence->finished); 5048 dma_fence_put(&s_job->s_fence->finished); 5049 5050 /* remove node from list and free the job */ 5051 spin_lock(&ring->sched.job_list_lock); 5052 list_del_init(&s_job->list); 5053 spin_unlock(&ring->sched.job_list_lock); 5054 ring->sched.ops->free_job(s_job); 5055 } 5056 } 5057 5058 /** 5059 * amdgpu_device_gpu_recover_imp - reset the asic and recover scheduler 5060 * 5061 * @adev: amdgpu_device pointer 5062 * @job: which job trigger hang 5063 * 5064 * Attempt to reset the GPU if it has hung (all asics). 5065 * Attempt to do soft-reset or full-reset and reinitialize Asic 5066 * Returns 0 for success or an error on failure. 
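 *
 * Callers normally do not invoke this directly: amdgpu_device_gpu_recover()
 * below wraps it in a work item so that recovery is serialized on the reset
 * domain's single threaded workqueue.  A minimal sketch of the usual call
 * path (illustrative only, e.g. from a job timeout handler where @job may
 * be NULL):
 *
 *	r = amdgpu_device_gpu_recover(adev, job);
 *	if (r)
 *		DRM_ERROR("GPU recovery failed: %d\n", r);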
5067 */ 5068 5069 int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev, 5070 struct amdgpu_job *job) 5071 { 5072 struct list_head device_list, *device_list_handle = NULL; 5073 bool job_signaled = false; 5074 struct amdgpu_hive_info *hive = NULL; 5075 struct amdgpu_device *tmp_adev = NULL; 5076 int i, r = 0; 5077 bool need_emergency_restart = false; 5078 bool audio_suspended = false; 5079 int tmp_vram_lost_counter; 5080 struct amdgpu_reset_context reset_context; 5081 5082 memset(&reset_context, 0, sizeof(reset_context)); 5083 5084 /* 5085 * Special case: RAS triggered and full reset isn't supported 5086 */ 5087 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5088 5089 /* 5090 * Flush RAM to disk so that after reboot 5091 * the user can read log and see why the system rebooted. 5092 */ 5093 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5094 DRM_WARN("Emergency reboot."); 5095 5096 ksys_sync_helper(); 5097 emergency_restart(); 5098 } 5099 5100 dev_info(adev->dev, "GPU %s begin!\n", 5101 need_emergency_restart ? "jobs stop":"reset"); 5102 5103 if (!amdgpu_sriov_vf(adev)) 5104 hive = amdgpu_get_xgmi_hive(adev); 5105 if (hive) 5106 mutex_lock(&hive->hive_lock); 5107 5108 reset_context.method = AMD_RESET_METHOD_NONE; 5109 reset_context.reset_req_dev = adev; 5110 reset_context.job = job; 5111 reset_context.hive = hive; 5112 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 5113 5114 /* 5115 * Build list of devices to reset. 5116 * In case we are in XGMI hive mode, resort the device list 5117 * to put adev in the 1st position. 5118 */ 5119 INIT_LIST_HEAD(&device_list); 5120 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5121 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) 5122 list_add_tail(&tmp_adev->reset_list, &device_list); 5123 if (!list_is_first(&adev->reset_list, &device_list)) 5124 list_rotate_to_front(&adev->reset_list, &device_list); 5125 device_list_handle = &device_list; 5126 } else { 5127 list_add_tail(&adev->reset_list, &device_list); 5128 device_list_handle = &device_list; 5129 } 5130 5131 /* We need to lock reset domain only once both for XGMI and single device */ 5132 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5133 reset_list); 5134 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5135 5136 /* block all schedulers and reset given job's ring */ 5137 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5138 5139 amdgpu_device_set_mp1_state(tmp_adev); 5140 5141 /* 5142 * Try to put the audio codec into suspend state 5143 * before gpu reset started. 5144 * 5145 * Due to the power domain of the graphics device 5146 * is shared with AZ power domain. Without this, 5147 * we may change the audio hardware from behind 5148 * the audio driver's back. That will trigger 5149 * some audio codec errors. 
5150 */ 5151 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5152 audio_suspended = true; 5153 5154 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5155 5156 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5157 5158 if (!amdgpu_sriov_vf(tmp_adev)) 5159 amdgpu_amdkfd_pre_reset(tmp_adev); 5160 5161 /* 5162 * Mark these ASICs to be reseted as untracked first 5163 * And add them back after reset completed 5164 */ 5165 amdgpu_unregister_gpu_instance(tmp_adev); 5166 5167 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5168 5169 /* disable ras on ALL IPs */ 5170 if (!need_emergency_restart && 5171 amdgpu_device_ip_need_full_reset(tmp_adev)) 5172 amdgpu_ras_suspend(tmp_adev); 5173 5174 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5175 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5176 5177 if (!ring || !ring->sched.thread) 5178 continue; 5179 5180 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5181 5182 if (need_emergency_restart) 5183 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5184 } 5185 atomic_inc(&tmp_adev->gpu_reset_counter); 5186 } 5187 5188 if (need_emergency_restart) 5189 goto skip_sched_resume; 5190 5191 /* 5192 * Must check guilty signal here since after this point all old 5193 * HW fences are force signaled. 5194 * 5195 * job->base holds a reference to parent fence 5196 */ 5197 if (job && job->base.s_fence->parent && 5198 dma_fence_is_signaled(job->base.s_fence->parent)) { 5199 job_signaled = true; 5200 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5201 goto skip_hw_reset; 5202 } 5203 5204 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5205 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5206 r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context); 5207 /*TODO Should we stop ?*/ 5208 if (r) { 5209 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5210 r, adev_to_drm(tmp_adev)->unique); 5211 tmp_adev->asic_reset_res = r; 5212 } 5213 } 5214 5215 tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter)); 5216 /* Actual ASIC resets if needed.*/ 5217 /* Host driver will handle XGMI hive reset for SRIOV */ 5218 if (amdgpu_sriov_vf(adev)) { 5219 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5220 if (r) 5221 adev->asic_reset_res = r; 5222 5223 /* Aldebaran supports ras in SRIOV, so need resume ras during reset */ 5224 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2)) 5225 amdgpu_ras_resume(adev); 5226 } else { 5227 r = amdgpu_do_asic_reset(device_list_handle, &reset_context); 5228 if (r && r == -EAGAIN) 5229 goto retry; 5230 } 5231 5232 skip_hw_reset: 5233 5234 /* Post ASIC reset for all devs .*/ 5235 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5236 5237 /* 5238 * Sometimes a later bad compute job can block a good gfx job as gfx 5239 * and compute ring share internal GC HW mutually. We add an additional 5240 * guilty jobs recheck step to find the real guilty job, it synchronously 5241 * submits and pends for the first job being signaled. If it gets timeout, 5242 * we identify it as a real guilty job. 
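 *
 * Note this recheck only runs when amdgpu_gpu_recovery is set to 2 and the
 * VRAM lost counter did not change during the reset (see the condition
 * below).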
5243 */ 5244 if (amdgpu_gpu_recovery == 2 && 5245 !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter))) 5246 amdgpu_device_recheck_guilty_jobs( 5247 tmp_adev, device_list_handle, &reset_context); 5248 5249 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5250 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5251 5252 if (!ring || !ring->sched.thread) 5253 continue; 5254 5255 /* No point to resubmit jobs if we didn't HW reset*/ 5256 if (!tmp_adev->asic_reset_res && !job_signaled) 5257 drm_sched_resubmit_jobs(&ring->sched); 5258 5259 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); 5260 } 5261 5262 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) { 5263 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5264 } 5265 5266 if (tmp_adev->asic_reset_res) 5267 r = tmp_adev->asic_reset_res; 5268 5269 tmp_adev->asic_reset_res = 0; 5270 5271 if (r) { 5272 /* bad news, how to tell it to userspace ? */ 5273 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5274 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5275 } else { 5276 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5277 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5278 DRM_WARN("smart shift update failed\n"); 5279 } 5280 } 5281 5282 skip_sched_resume: 5283 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5284 /* unlock kfd: SRIOV would do it separately */ 5285 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5286 amdgpu_amdkfd_post_reset(tmp_adev); 5287 5288 /* kfd_post_reset will do nothing if kfd device is not initialized, 5289 * need to bring up kfd here if it's not be initialized before 5290 */ 5291 if (!adev->kfd.init_complete) 5292 amdgpu_amdkfd_device_init(adev); 5293 5294 if (audio_suspended) 5295 amdgpu_device_resume_display_audio(tmp_adev); 5296 5297 amdgpu_device_unset_mp1_state(tmp_adev); 5298 } 5299 5300 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5301 reset_list); 5302 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5303 5304 if (hive) { 5305 mutex_unlock(&hive->hive_lock); 5306 amdgpu_put_xgmi_hive(hive); 5307 } 5308 5309 if (r) 5310 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5311 return r; 5312 } 5313 5314 struct amdgpu_recover_work_struct { 5315 struct work_struct base; 5316 struct amdgpu_device *adev; 5317 struct amdgpu_job *job; 5318 int ret; 5319 }; 5320 5321 static void amdgpu_device_queue_gpu_recover_work(struct work_struct *work) 5322 { 5323 struct amdgpu_recover_work_struct *recover_work = container_of(work, struct amdgpu_recover_work_struct, base); 5324 5325 recover_work->ret = amdgpu_device_gpu_recover_imp(recover_work->adev, recover_work->job); 5326 } 5327 /* 5328 * Serialize gpu recover into reset domain single threaded wq 5329 */ 5330 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5331 struct amdgpu_job *job) 5332 { 5333 struct amdgpu_recover_work_struct work = {.adev = adev, .job = job}; 5334 5335 INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work); 5336 5337 if (!amdgpu_reset_domain_schedule(adev->reset_domain, &work.base)) 5338 return -EAGAIN; 5339 5340 flush_work(&work.base); 5341 5342 return work.ret; 5343 } 5344 5345 /** 5346 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 5347 * 5348 * @adev: amdgpu_device pointer 5349 * 5350 * Fetchs and stores in the driver the PCIE capabilities (gen speed 5351 * and lanes) of 
the slot the device is in. Handles APUs and 5352 * virtualized environments where PCIE config space may not be available. 5353 */ 5354 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5355 { 5356 struct pci_dev *pdev; 5357 enum pci_bus_speed speed_cap, platform_speed_cap; 5358 enum pcie_link_width platform_link_width; 5359 5360 if (amdgpu_pcie_gen_cap) 5361 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5362 5363 if (amdgpu_pcie_lane_cap) 5364 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5365 5366 /* covers APUs as well */ 5367 if (pci_is_root_bus(adev->pdev->bus)) { 5368 if (adev->pm.pcie_gen_mask == 0) 5369 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5370 if (adev->pm.pcie_mlw_mask == 0) 5371 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5372 return; 5373 } 5374 5375 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5376 return; 5377 5378 pcie_bandwidth_available(adev->pdev, NULL, 5379 &platform_speed_cap, &platform_link_width); 5380 5381 if (adev->pm.pcie_gen_mask == 0) { 5382 /* asic caps */ 5383 pdev = adev->pdev; 5384 speed_cap = pcie_get_speed_cap(pdev); 5385 if (speed_cap == PCI_SPEED_UNKNOWN) { 5386 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5387 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5388 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5389 } else { 5390 if (speed_cap == PCIE_SPEED_32_0GT) 5391 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5392 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5393 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5394 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5395 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5396 else if (speed_cap == PCIE_SPEED_16_0GT) 5397 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5398 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5399 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5400 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5401 else if (speed_cap == PCIE_SPEED_8_0GT) 5402 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5403 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5404 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5405 else if (speed_cap == PCIE_SPEED_5_0GT) 5406 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5407 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5408 else 5409 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5410 } 5411 /* platform caps */ 5412 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5413 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5414 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5415 } else { 5416 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5417 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5418 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5419 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5420 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5421 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5422 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5423 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5424 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5425 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5426 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5427 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5428 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5429 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5430 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5431 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5432 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5433 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5434 else 5435 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5436 5437 } 5438 } 
5439 if (adev->pm.pcie_mlw_mask == 0) { 5440 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 5441 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5442 } else { 5443 switch (platform_link_width) { 5444 case PCIE_LNK_X32: 5445 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5446 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5447 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5448 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5449 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5450 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5451 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5452 break; 5453 case PCIE_LNK_X16: 5454 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5455 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5456 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5457 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5458 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5459 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5460 break; 5461 case PCIE_LNK_X12: 5462 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5463 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5464 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5465 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5466 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5467 break; 5468 case PCIE_LNK_X8: 5469 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5470 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5471 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5472 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5473 break; 5474 case PCIE_LNK_X4: 5475 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5476 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5477 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5478 break; 5479 case PCIE_LNK_X2: 5480 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5481 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5482 break; 5483 case PCIE_LNK_X1: 5484 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5485 break; 5486 default: 5487 break; 5488 } 5489 } 5490 } 5491 } 5492 5493 int amdgpu_device_baco_enter(struct drm_device *dev) 5494 { 5495 struct amdgpu_device *adev = drm_to_adev(dev); 5496 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5497 5498 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5499 return -ENOTSUPP; 5500 5501 if (ras && adev->ras_enabled && 5502 adev->nbio.funcs->enable_doorbell_interrupt) 5503 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5504 5505 return amdgpu_dpm_baco_enter(adev); 5506 } 5507 5508 int amdgpu_device_baco_exit(struct drm_device *dev) 5509 { 5510 struct amdgpu_device *adev = drm_to_adev(dev); 5511 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5512 int ret = 0; 5513 5514 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5515 return -ENOTSUPP; 5516 5517 ret = amdgpu_dpm_baco_exit(adev); 5518 if (ret) 5519 return ret; 5520 5521 if (ras && adev->ras_enabled && 5522 adev->nbio.funcs->enable_doorbell_interrupt) 5523 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5524 5525 if (amdgpu_passthrough(adev) && 5526 adev->nbio.funcs->clear_doorbell_interrupt) 5527 adev->nbio.funcs->clear_doorbell_interrupt(adev); 5528 5529 return 0; 5530 } 5531 5532 /** 5533 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5534 * @pdev: PCI device struct 5535 * @state: PCI channel state 5536 * 5537 * Description: Called when a PCI error is detected. 5538 * 5539 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
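 * (pci_channel_io_normal additionally yields PCI_ERS_RESULT_CAN_RECOVER.)
 *
 * Together with amdgpu_pci_mmio_enabled(), amdgpu_pci_slot_reset() and
 * amdgpu_pci_resume() below, this callback is plugged into a struct
 * pci_error_handlers in the driver's PCI registration code (not in this
 * file); a rough sketch of that wiring:
 *
 *	static const struct pci_error_handlers amdgpu_pci_err_handler = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};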
5540 */ 5541 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 5542 { 5543 struct drm_device *dev = pci_get_drvdata(pdev); 5544 struct amdgpu_device *adev = drm_to_adev(dev); 5545 int i; 5546 5547 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 5548 5549 if (adev->gmc.xgmi.num_physical_nodes > 1) { 5550 DRM_WARN("No support for XGMI hive yet..."); 5551 return PCI_ERS_RESULT_DISCONNECT; 5552 } 5553 5554 adev->pci_channel_state = state; 5555 5556 switch (state) { 5557 case pci_channel_io_normal: 5558 return PCI_ERS_RESULT_CAN_RECOVER; 5559 /* Fatal error, prepare for slot reset */ 5560 case pci_channel_io_frozen: 5561 /* 5562 * Locking adev->reset_domain->sem will prevent any external access 5563 * to GPU during PCI error recovery 5564 */ 5565 amdgpu_device_lock_reset_domain(adev->reset_domain); 5566 amdgpu_device_set_mp1_state(adev); 5567 5568 /* 5569 * Block any work scheduling as we do for regular GPU reset 5570 * for the duration of the recovery 5571 */ 5572 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5573 struct amdgpu_ring *ring = adev->rings[i]; 5574 5575 if (!ring || !ring->sched.thread) 5576 continue; 5577 5578 drm_sched_stop(&ring->sched, NULL); 5579 } 5580 atomic_inc(&adev->gpu_reset_counter); 5581 return PCI_ERS_RESULT_NEED_RESET; 5582 case pci_channel_io_perm_failure: 5583 /* Permanent error, prepare for device removal */ 5584 return PCI_ERS_RESULT_DISCONNECT; 5585 } 5586 5587 return PCI_ERS_RESULT_NEED_RESET; 5588 } 5589 5590 /** 5591 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 5592 * @pdev: pointer to PCI device 5593 */ 5594 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 5595 { 5596 5597 DRM_INFO("PCI error: mmio enabled callback!!\n"); 5598 5599 /* TODO - dump whatever for debugging purposes */ 5600 5601 /* This called only if amdgpu_pci_error_detected returns 5602 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 5603 * works, no need to reset slot. 5604 */ 5605 5606 return PCI_ERS_RESULT_RECOVERED; 5607 } 5608 5609 /** 5610 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 5611 * @pdev: PCI device struct 5612 * 5613 * Description: This routine is called by the pci error recovery 5614 * code after the PCI slot has been reset, just before we 5615 * should resume normal operations. 
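 *
 * The handler waits for the ASIC to come back (polling the config space
 * memsize after restoring the saved PCI state) and then re-initializes it
 * via amdgpu_device_pre_asic_reset() and amdgpu_do_asic_reset() with
 * AMDGPU_SKIP_HW_RESET set, since the slot reset itself already reset the
 * hardware.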
5616 */ 5617 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 5618 { 5619 struct drm_device *dev = pci_get_drvdata(pdev); 5620 struct amdgpu_device *adev = drm_to_adev(dev); 5621 int r, i; 5622 struct amdgpu_reset_context reset_context; 5623 u32 memsize; 5624 struct list_head device_list; 5625 5626 DRM_INFO("PCI error: slot reset callback!!\n"); 5627 5628 memset(&reset_context, 0, sizeof(reset_context)); 5629 5630 INIT_LIST_HEAD(&device_list); 5631 list_add_tail(&adev->reset_list, &device_list); 5632 5633 /* wait for asic to come out of reset */ 5634 msleep(500); 5635 5636 /* Restore PCI confspace */ 5637 amdgpu_device_load_pci_state(pdev); 5638 5639 /* confirm ASIC came out of reset */ 5640 for (i = 0; i < adev->usec_timeout; i++) { 5641 memsize = amdgpu_asic_get_config_memsize(adev); 5642 5643 if (memsize != 0xffffffff) 5644 break; 5645 udelay(1); 5646 } 5647 if (memsize == 0xffffffff) { 5648 r = -ETIME; 5649 goto out; 5650 } 5651 5652 reset_context.method = AMD_RESET_METHOD_NONE; 5653 reset_context.reset_req_dev = adev; 5654 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 5655 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 5656 5657 adev->no_hw_access = true; 5658 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 5659 adev->no_hw_access = false; 5660 if (r) 5661 goto out; 5662 5663 r = amdgpu_do_asic_reset(&device_list, &reset_context); 5664 5665 out: 5666 if (!r) { 5667 if (amdgpu_device_cache_pci_state(adev->pdev)) 5668 pci_restore_state(adev->pdev); 5669 5670 DRM_INFO("PCIe error recovery succeeded\n"); 5671 } else { 5672 DRM_ERROR("PCIe error recovery failed, err:%d", r); 5673 amdgpu_device_unset_mp1_state(adev); 5674 amdgpu_device_unlock_reset_domain(adev->reset_domain); 5675 } 5676 5677 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 5678 } 5679 5680 /** 5681 * amdgpu_pci_resume() - resume normal ops after PCI reset 5682 * @pdev: pointer to PCI device 5683 * 5684 * Called when the error recovery driver tells us that its 5685 * OK to resume normal operation. 
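 *
 * Rings are restarted and the reset domain unlocked only for the
 * pci_channel_io_frozen case; other channel states return early.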
5686 */ 5687 void amdgpu_pci_resume(struct pci_dev *pdev) 5688 { 5689 struct drm_device *dev = pci_get_drvdata(pdev); 5690 struct amdgpu_device *adev = drm_to_adev(dev); 5691 int i; 5692 5693 5694 DRM_INFO("PCI error: resume callback!!\n"); 5695 5696 /* Only continue execution for the case of pci_channel_io_frozen */ 5697 if (adev->pci_channel_state != pci_channel_io_frozen) 5698 return; 5699 5700 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5701 struct amdgpu_ring *ring = adev->rings[i]; 5702 5703 if (!ring || !ring->sched.thread) 5704 continue; 5705 5706 5707 drm_sched_resubmit_jobs(&ring->sched); 5708 drm_sched_start(&ring->sched, true); 5709 } 5710 5711 amdgpu_device_unset_mp1_state(adev); 5712 amdgpu_device_unlock_reset_domain(adev->reset_domain); 5713 } 5714 5715 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 5716 { 5717 struct drm_device *dev = pci_get_drvdata(pdev); 5718 struct amdgpu_device *adev = drm_to_adev(dev); 5719 int r; 5720 5721 r = pci_save_state(pdev); 5722 if (!r) { 5723 kfree(adev->pci_state); 5724 5725 adev->pci_state = pci_store_saved_state(pdev); 5726 5727 if (!adev->pci_state) { 5728 DRM_ERROR("Failed to store PCI saved state"); 5729 return false; 5730 } 5731 } else { 5732 DRM_WARN("Failed to save PCI state, err:%d\n", r); 5733 return false; 5734 } 5735 5736 return true; 5737 } 5738 5739 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 5740 { 5741 struct drm_device *dev = pci_get_drvdata(pdev); 5742 struct amdgpu_device *adev = drm_to_adev(dev); 5743 int r; 5744 5745 if (!adev->pci_state) 5746 return false; 5747 5748 r = pci_load_saved_state(pdev, adev->pci_state); 5749 5750 if (!r) { 5751 pci_restore_state(pdev); 5752 } else { 5753 DRM_WARN("Failed to load PCI state, err:%d\n", r); 5754 return false; 5755 } 5756 5757 return true; 5758 } 5759 5760 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 5761 struct amdgpu_ring *ring) 5762 { 5763 #ifdef CONFIG_X86_64 5764 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 5765 return; 5766 #endif 5767 if (adev->gmc.xgmi.connected_to_cpu) 5768 return; 5769 5770 if (ring && ring->funcs->emit_hdp_flush) 5771 amdgpu_ring_emit_hdp_flush(ring); 5772 else 5773 amdgpu_asic_flush_hdp(adev, ring); 5774 } 5775 5776 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 5777 struct amdgpu_ring *ring) 5778 { 5779 #ifdef CONFIG_X86_64 5780 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 5781 return; 5782 #endif 5783 if (adev->gmc.xgmi.connected_to_cpu) 5784 return; 5785 5786 amdgpu_asic_invalidate_hdp(adev, ring); 5787 } 5788 5789 int amdgpu_in_reset(struct amdgpu_device *adev) 5790 { 5791 return atomic_read(&adev->reset_domain->in_gpu_reset); 5792 } 5793 5794 /** 5795 * amdgpu_device_halt() - bring hardware to some kind of halt state 5796 * 5797 * @adev: amdgpu_device pointer 5798 * 5799 * Bring hardware to some kind of halt state so that no one can touch it 5800 * any more. It will help to maintain error context when error occurred. 5801 * Compare to a simple hang, the system will keep stable at least for SSH 5802 * access. Then it should be trivial to inspect the hardware state and 5803 * see what's going on. Implemented as following: 5804 * 5805 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc), 5806 * clears all CPU mappings to device, disallows remappings through page faults 5807 * 2. amdgpu_irq_disable_all() disables all interrupts 5808 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 5809 * 4. 
set adev->no_hw_access to avoid potential crashes after setp 5 5810 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings 5811 * 6. pci_disable_device() and pci_wait_for_pending_transaction() 5812 * flush any in flight DMA operations 5813 */ 5814 void amdgpu_device_halt(struct amdgpu_device *adev) 5815 { 5816 struct pci_dev *pdev = adev->pdev; 5817 struct drm_device *ddev = adev_to_drm(adev); 5818 5819 drm_dev_unplug(ddev); 5820 5821 amdgpu_irq_disable_all(adev); 5822 5823 amdgpu_fence_driver_hw_fini(adev); 5824 5825 adev->no_hw_access = true; 5826 5827 amdgpu_device_unmap_mmio(adev); 5828 5829 pci_disable_device(pdev); 5830 pci_wait_for_pending_transaction(pdev); 5831 } 5832 5833 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, 5834 u32 reg) 5835 { 5836 unsigned long flags, address, data; 5837 u32 r; 5838 5839 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 5840 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 5841 5842 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 5843 WREG32(address, reg * 4); 5844 (void)RREG32(address); 5845 r = RREG32(data); 5846 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 5847 return r; 5848 } 5849 5850 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, 5851 u32 reg, u32 v) 5852 { 5853 unsigned long flags, address, data; 5854 5855 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 5856 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 5857 5858 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 5859 WREG32(address, reg * 4); 5860 (void)RREG32(address); 5861 WREG32(data, v); 5862 (void)RREG32(data); 5863 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 5864 } 5865
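/*
 * Usage sketch for the PCIe port register helpers above.  This is
 * illustrative only: example_port_reg and EXAMPLE_BIT are made-up
 * placeholders, not real register definitions.
 *
 *	u32 val;
 *
 *	val = amdgpu_device_pcie_port_rreg(adev, example_port_reg);
 *	val |= EXAMPLE_BIT;
 *	amdgpu_device_pcie_port_wreg(adev, example_port_reg, val);
 *
 * Both helpers serialize on pcie_idx_lock and access the register through
 * the NBIO index/data pair returned by get_pcie_port_index_offset() and
 * get_pcie_port_data_offset().
 */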