/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
		amdgpu_device_get_pcie_replay_count, NULL);

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * DOC: product_name
 *
 * The amdgpu driver provides a sysfs API for reporting the product name
 * for the device.
 * The file product_name is used for this and returns the product name
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_name(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->product_name);
}

static DEVICE_ATTR(product_name, S_IRUGO,
		amdgpu_device_get_product_name, NULL);

/**
 * DOC: product_number
 *
 * The amdgpu driver provides a sysfs API for reporting the part number
 * for the device.
 * The file product_number is used for this and returns the part number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->product_number);
}

static DEVICE_ATTR(product_number, S_IRUGO,
		amdgpu_device_get_product_number, NULL);

/**
 * DOC: serial_number
 *
 * The amdgpu driver provides a sysfs API for reporting the serial number
 * for the device.
 * The file serial_number is used for this and returns the serial number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_serial_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->serial);
}

static DEVICE_ATTR(serial_number, S_IRUGO,
		amdgpu_device_get_serial_number, NULL);

/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise return false.
227 */ 228 bool amdgpu_device_supports_px(struct drm_device *dev) 229 { 230 struct amdgpu_device *adev = drm_to_adev(dev); 231 232 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 233 return true; 234 return false; 235 } 236 237 /** 238 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 239 * 240 * @dev: drm_device pointer 241 * 242 * Returns true if the device is a dGPU with ACPI power control, 243 * otherwise return false. 244 */ 245 bool amdgpu_device_supports_boco(struct drm_device *dev) 246 { 247 struct amdgpu_device *adev = drm_to_adev(dev); 248 249 if (adev->has_pr3 || 250 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 251 return true; 252 return false; 253 } 254 255 /** 256 * amdgpu_device_supports_baco - Does the device support BACO 257 * 258 * @dev: drm_device pointer 259 * 260 * Returns true if the device supporte BACO, 261 * otherwise return false. 262 */ 263 bool amdgpu_device_supports_baco(struct drm_device *dev) 264 { 265 struct amdgpu_device *adev = drm_to_adev(dev); 266 267 return amdgpu_asic_supports_baco(adev); 268 } 269 270 /** 271 * amdgpu_device_supports_smart_shift - Is the device dGPU with 272 * smart shift support 273 * 274 * @dev: drm_device pointer 275 * 276 * Returns true if the device is a dGPU with Smart Shift support, 277 * otherwise returns false. 278 */ 279 bool amdgpu_device_supports_smart_shift(struct drm_device *dev) 280 { 281 return (amdgpu_device_supports_boco(dev) && 282 amdgpu_acpi_is_power_shift_control_supported()); 283 } 284 285 /* 286 * VRAM access helper functions 287 */ 288 289 /** 290 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA 291 * 292 * @adev: amdgpu_device pointer 293 * @pos: offset of the buffer in vram 294 * @buf: virtual address of the buffer in system memory 295 * @size: read/write size, sizeof(@buf) must > @size 296 * @write: true - write to vram, otherwise - read from vram 297 */ 298 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos, 299 void *buf, size_t size, bool write) 300 { 301 unsigned long flags; 302 uint32_t hi = ~0, tmp = 0; 303 uint32_t *data = buf; 304 uint64_t last; 305 int idx; 306 307 if (!drm_dev_enter(adev_to_drm(adev), &idx)) 308 return; 309 310 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4)); 311 312 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 313 for (last = pos + size; pos < last; pos += 4) { 314 tmp = pos >> 31; 315 316 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); 317 if (tmp != hi) { 318 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp); 319 hi = tmp; 320 } 321 if (write) 322 WREG32_NO_KIQ(mmMM_DATA, *data++); 323 else 324 *data++ = RREG32_NO_KIQ(mmMM_DATA); 325 } 326 327 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 328 drm_dev_exit(idx); 329 } 330 331 /** 332 * amdgpu_device_aper_access - access vram by vram aperature 333 * 334 * @adev: amdgpu_device pointer 335 * @pos: offset of the buffer in vram 336 * @buf: virtual address of the buffer in system memory 337 * @size: read/write size, sizeof(@buf) must > @size 338 * @write: true - write to vram, otherwise - read from vram 339 * 340 * The return value means how many bytes have been transferred. 
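 *
 * A minimal caller sketch (this mirrors what amdgpu_device_vram_access()
 * below does; shown only for illustration):
 *
 *	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
 *	if (count < size)
 *		amdgpu_device_mm_access(adev, pos + count, buf + count,
 *					size - count, write);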
341 */ 342 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos, 343 void *buf, size_t size, bool write) 344 { 345 #ifdef CONFIG_64BIT 346 void __iomem *addr; 347 size_t count = 0; 348 uint64_t last; 349 350 if (!adev->mman.aper_base_kaddr) 351 return 0; 352 353 last = min(pos + size, adev->gmc.visible_vram_size); 354 if (last > pos) { 355 addr = adev->mman.aper_base_kaddr + pos; 356 count = last - pos; 357 358 if (write) { 359 memcpy_toio(addr, buf, count); 360 mb(); 361 amdgpu_device_flush_hdp(adev, NULL); 362 } else { 363 amdgpu_device_invalidate_hdp(adev, NULL); 364 mb(); 365 memcpy_fromio(buf, addr, count); 366 } 367 368 } 369 370 return count; 371 #else 372 return 0; 373 #endif 374 } 375 376 /** 377 * amdgpu_device_vram_access - read/write a buffer in vram 378 * 379 * @adev: amdgpu_device pointer 380 * @pos: offset of the buffer in vram 381 * @buf: virtual address of the buffer in system memory 382 * @size: read/write size, sizeof(@buf) must > @size 383 * @write: true - write to vram, otherwise - read from vram 384 */ 385 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, 386 void *buf, size_t size, bool write) 387 { 388 size_t count; 389 390 /* try to using vram apreature to access vram first */ 391 count = amdgpu_device_aper_access(adev, pos, buf, size, write); 392 size -= count; 393 if (size) { 394 /* using MM to access rest vram */ 395 pos += count; 396 buf += count; 397 amdgpu_device_mm_access(adev, pos, buf, size, write); 398 } 399 } 400 401 /* 402 * register access helper functions. 403 */ 404 405 /* Check if hw access should be skipped because of hotplug or device error */ 406 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev) 407 { 408 if (adev->no_hw_access) 409 return true; 410 411 #ifdef CONFIG_LOCKDEP 412 /* 413 * This is a bit complicated to understand, so worth a comment. What we assert 414 * here is that the GPU reset is not running on another thread in parallel. 415 * 416 * For this we trylock the read side of the reset semaphore, if that succeeds 417 * we know that the reset is not running in paralell. 418 * 419 * If the trylock fails we assert that we are either already holding the read 420 * side of the lock or are the reset thread itself and hold the write side of 421 * the lock. 422 */ 423 if (in_task()) { 424 if (down_read_trylock(&adev->reset_domain->sem)) 425 up_read(&adev->reset_domain->sem); 426 else 427 lockdep_assert_held(&adev->reset_domain->sem); 428 } 429 #endif 430 return false; 431 } 432 433 /** 434 * amdgpu_device_rreg - read a memory mapped IO or indirect register 435 * 436 * @adev: amdgpu_device pointer 437 * @reg: dword aligned register offset 438 * @acc_flags: access flags which require special behavior 439 * 440 * Returns the 32 bit value from the offset specified. 
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
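 *
 * Illustrative read-modify-write sketch (REG_FOO is a made-up dword offset,
 * not a register defined by this driver):
 *
 *	u32 tmp = amdgpu_device_rreg(adev, REG_FOO, 0);
 *	amdgpu_device_wreg(adev, REG_FOO, tmp | 0x1, 0);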
527 */ 528 void amdgpu_device_wreg(struct amdgpu_device *adev, 529 uint32_t reg, uint32_t v, 530 uint32_t acc_flags) 531 { 532 if (amdgpu_device_skip_hw_access(adev)) 533 return; 534 535 if ((reg * 4) < adev->rmmio_size) { 536 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 537 amdgpu_sriov_runtime(adev) && 538 down_read_trylock(&adev->reset_domain->sem)) { 539 amdgpu_kiq_wreg(adev, reg, v); 540 up_read(&adev->reset_domain->sem); 541 } else { 542 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 543 } 544 } else { 545 adev->pcie_wreg(adev, reg * 4, v); 546 } 547 548 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 549 } 550 551 /** 552 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 553 * 554 * @adev: amdgpu_device pointer 555 * @reg: mmio/rlc register 556 * @v: value to write 557 * 558 * this function is invoked only for the debugfs register access 559 */ 560 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 561 uint32_t reg, uint32_t v) 562 { 563 if (amdgpu_device_skip_hw_access(adev)) 564 return; 565 566 if (amdgpu_sriov_fullaccess(adev) && 567 adev->gfx.rlc.funcs && 568 adev->gfx.rlc.funcs->is_rlcg_access_range) { 569 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 570 return amdgpu_sriov_wreg(adev, reg, v, 0, 0); 571 } else if ((reg * 4) >= adev->rmmio_size) { 572 adev->pcie_wreg(adev, reg * 4, v); 573 } else { 574 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 575 } 576 } 577 578 /** 579 * amdgpu_mm_rdoorbell - read a doorbell dword 580 * 581 * @adev: amdgpu_device pointer 582 * @index: doorbell index 583 * 584 * Returns the value in the doorbell aperture at the 585 * requested doorbell index (CIK). 586 */ 587 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) 588 { 589 if (amdgpu_device_skip_hw_access(adev)) 590 return 0; 591 592 if (index < adev->doorbell.num_doorbells) { 593 return readl(adev->doorbell.ptr + index); 594 } else { 595 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 596 return 0; 597 } 598 } 599 600 /** 601 * amdgpu_mm_wdoorbell - write a doorbell dword 602 * 603 * @adev: amdgpu_device pointer 604 * @index: doorbell index 605 * @v: value to write 606 * 607 * Writes @v to the doorbell aperture at the 608 * requested doorbell index (CIK). 609 */ 610 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) 611 { 612 if (amdgpu_device_skip_hw_access(adev)) 613 return; 614 615 if (index < adev->doorbell.num_doorbells) { 616 writel(v, adev->doorbell.ptr + index); 617 } else { 618 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 619 } 620 } 621 622 /** 623 * amdgpu_mm_rdoorbell64 - read a doorbell Qword 624 * 625 * @adev: amdgpu_device pointer 626 * @index: doorbell index 627 * 628 * Returns the value in the doorbell aperture at the 629 * requested doorbell index (VEGA10+). 630 */ 631 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) 632 { 633 if (amdgpu_device_skip_hw_access(adev)) 634 return 0; 635 636 if (index < adev->doorbell.num_doorbells) { 637 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); 638 } else { 639 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 640 return 0; 641 } 642 } 643 644 /** 645 * amdgpu_mm_wdoorbell64 - write a doorbell Qword 646 * 647 * @adev: amdgpu_device pointer 648 * @index: doorbell index 649 * @v: value to write 650 * 651 * Writes @v to the doorbell aperture at the 652 * requested doorbell index (VEGA10+). 
653 */ 654 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) 655 { 656 if (amdgpu_device_skip_hw_access(adev)) 657 return; 658 659 if (index < adev->doorbell.num_doorbells) { 660 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); 661 } else { 662 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 663 } 664 } 665 666 /** 667 * amdgpu_device_indirect_rreg - read an indirect register 668 * 669 * @adev: amdgpu_device pointer 670 * @pcie_index: mmio register offset 671 * @pcie_data: mmio register offset 672 * @reg_addr: indirect register address to read from 673 * 674 * Returns the value of indirect register @reg_addr 675 */ 676 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 677 u32 pcie_index, u32 pcie_data, 678 u32 reg_addr) 679 { 680 unsigned long flags; 681 u32 r; 682 void __iomem *pcie_index_offset; 683 void __iomem *pcie_data_offset; 684 685 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 686 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 687 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 688 689 writel(reg_addr, pcie_index_offset); 690 readl(pcie_index_offset); 691 r = readl(pcie_data_offset); 692 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 693 694 return r; 695 } 696 697 /** 698 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 699 * 700 * @adev: amdgpu_device pointer 701 * @pcie_index: mmio register offset 702 * @pcie_data: mmio register offset 703 * @reg_addr: indirect register address to read from 704 * 705 * Returns the value of indirect register @reg_addr 706 */ 707 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 708 u32 pcie_index, u32 pcie_data, 709 u32 reg_addr) 710 { 711 unsigned long flags; 712 u64 r; 713 void __iomem *pcie_index_offset; 714 void __iomem *pcie_data_offset; 715 716 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 717 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 718 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 719 720 /* read low 32 bits */ 721 writel(reg_addr, pcie_index_offset); 722 readl(pcie_index_offset); 723 r = readl(pcie_data_offset); 724 /* read high 32 bits */ 725 writel(reg_addr + 4, pcie_index_offset); 726 readl(pcie_index_offset); 727 r |= ((u64)readl(pcie_data_offset) << 32); 728 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 729 730 return r; 731 } 732 733 /** 734 * amdgpu_device_indirect_wreg - write an indirect register address 735 * 736 * @adev: amdgpu_device pointer 737 * @pcie_index: mmio register offset 738 * @pcie_data: mmio register offset 739 * @reg_addr: indirect register offset 740 * @reg_data: indirect register data 741 * 742 */ 743 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 744 u32 pcie_index, u32 pcie_data, 745 u32 reg_addr, u32 reg_data) 746 { 747 unsigned long flags; 748 void __iomem *pcie_index_offset; 749 void __iomem *pcie_data_offset; 750 751 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 752 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 753 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 754 755 writel(reg_addr, pcie_index_offset); 756 readl(pcie_index_offset); 757 writel(reg_data, pcie_data_offset); 758 readl(pcie_data_offset); 759 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 760 } 761 762 /** 763 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 764 * 765 * @adev: amdgpu_device pointer 766 * @pcie_index: mmio register offset 767 * @pcie_data: mmio register 
offset 768 * @reg_addr: indirect register offset 769 * @reg_data: indirect register data 770 * 771 */ 772 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 773 u32 pcie_index, u32 pcie_data, 774 u32 reg_addr, u64 reg_data) 775 { 776 unsigned long flags; 777 void __iomem *pcie_index_offset; 778 void __iomem *pcie_data_offset; 779 780 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 781 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 782 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 783 784 /* write low 32 bits */ 785 writel(reg_addr, pcie_index_offset); 786 readl(pcie_index_offset); 787 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 788 readl(pcie_data_offset); 789 /* write high 32 bits */ 790 writel(reg_addr + 4, pcie_index_offset); 791 readl(pcie_index_offset); 792 writel((u32)(reg_data >> 32), pcie_data_offset); 793 readl(pcie_data_offset); 794 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 795 } 796 797 /** 798 * amdgpu_invalid_rreg - dummy reg read function 799 * 800 * @adev: amdgpu_device pointer 801 * @reg: offset of register 802 * 803 * Dummy register read function. Used for register blocks 804 * that certain asics don't have (all asics). 805 * Returns the value in the register. 806 */ 807 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 808 { 809 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 810 BUG(); 811 return 0; 812 } 813 814 /** 815 * amdgpu_invalid_wreg - dummy reg write function 816 * 817 * @adev: amdgpu_device pointer 818 * @reg: offset of register 819 * @v: value to write to the register 820 * 821 * Dummy register read function. Used for register blocks 822 * that certain asics don't have (all asics). 823 */ 824 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 825 { 826 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 827 reg, v); 828 BUG(); 829 } 830 831 /** 832 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 833 * 834 * @adev: amdgpu_device pointer 835 * @reg: offset of register 836 * 837 * Dummy register read function. Used for register blocks 838 * that certain asics don't have (all asics). 839 * Returns the value in the register. 840 */ 841 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 842 { 843 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 844 BUG(); 845 return 0; 846 } 847 848 /** 849 * amdgpu_invalid_wreg64 - dummy reg write function 850 * 851 * @adev: amdgpu_device pointer 852 * @reg: offset of register 853 * @v: value to write to the register 854 * 855 * Dummy register read function. Used for register blocks 856 * that certain asics don't have (all asics). 857 */ 858 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 859 { 860 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 861 reg, v); 862 BUG(); 863 } 864 865 /** 866 * amdgpu_block_invalid_rreg - dummy reg read function 867 * 868 * @adev: amdgpu_device pointer 869 * @block: offset of instance 870 * @reg: offset of register 871 * 872 * Dummy register read function. Used for register blocks 873 * that certain asics don't have (all asics). 874 * Returns the value in the register. 
875 */ 876 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 877 uint32_t block, uint32_t reg) 878 { 879 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 880 reg, block); 881 BUG(); 882 return 0; 883 } 884 885 /** 886 * amdgpu_block_invalid_wreg - dummy reg write function 887 * 888 * @adev: amdgpu_device pointer 889 * @block: offset of instance 890 * @reg: offset of register 891 * @v: value to write to the register 892 * 893 * Dummy register read function. Used for register blocks 894 * that certain asics don't have (all asics). 895 */ 896 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 897 uint32_t block, 898 uint32_t reg, uint32_t v) 899 { 900 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 901 reg, block, v); 902 BUG(); 903 } 904 905 /** 906 * amdgpu_device_asic_init - Wrapper for atom asic_init 907 * 908 * @adev: amdgpu_device pointer 909 * 910 * Does any asic specific work and then calls atom asic init. 911 */ 912 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 913 { 914 amdgpu_asic_pre_asic_init(adev); 915 916 if (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) 917 return amdgpu_atomfirmware_asic_init(adev, true); 918 else 919 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 920 } 921 922 /** 923 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page 924 * 925 * @adev: amdgpu_device pointer 926 * 927 * Allocates a scratch page of VRAM for use by various things in the 928 * driver. 929 */ 930 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev) 931 { 932 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, 933 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM, 934 &adev->vram_scratch.robj, 935 &adev->vram_scratch.gpu_addr, 936 (void **)&adev->vram_scratch.ptr); 937 } 938 939 /** 940 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page 941 * 942 * @adev: amdgpu_device pointer 943 * 944 * Frees the VRAM scratch page. 945 */ 946 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev) 947 { 948 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL); 949 } 950 951 /** 952 * amdgpu_device_program_register_sequence - program an array of registers. 953 * 954 * @adev: amdgpu_device pointer 955 * @registers: pointer to the register array 956 * @array_size: size of the register array 957 * 958 * Programs an array or registers with and and or masks. 959 * This is a helper for setting golden registers. 960 */ 961 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 962 const u32 *registers, 963 const u32 array_size) 964 { 965 u32 tmp, reg, and_mask, or_mask; 966 int i; 967 968 if (array_size % 3) 969 return; 970 971 for (i = 0; i < array_size; i +=3) { 972 reg = registers[i + 0]; 973 and_mask = registers[i + 1]; 974 or_mask = registers[i + 2]; 975 976 if (and_mask == 0xffffffff) { 977 tmp = or_mask; 978 } else { 979 tmp = RREG32(reg); 980 tmp &= ~and_mask; 981 if (adev->family >= AMDGPU_FAMILY_AI) 982 tmp |= (or_mask & and_mask); 983 else 984 tmp |= or_mask; 985 } 986 WREG32(reg, tmp); 987 } 988 } 989 990 /** 991 * amdgpu_device_pci_config_reset - reset the GPU 992 * 993 * @adev: amdgpu_device pointer 994 * 995 * Resets the GPU using the pci config reset sequence. 996 * Only applicable to asics prior to vega10. 
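 *
 * (Aside, illustrating amdgpu_device_program_register_sequence() above: each
 * entry is a {reg, and_mask, or_mask} triplet; the register names and values
 * here are made up.)
 *
 *	static const u32 golden_settings[] = {
 *		mmREG_A, 0xffffffff, 0x00000100,	// and_mask of ~0 means plain overwrite
 *		mmREG_B, 0x0000000f, 0x00000002,	// read-modify-write of the low nibble
 *	};
 *	amdgpu_device_program_register_sequence(adev, golden_settings,
 *						ARRAY_SIZE(golden_settings));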
997 */ 998 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 999 { 1000 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 1001 } 1002 1003 /** 1004 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 1005 * 1006 * @adev: amdgpu_device pointer 1007 * 1008 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 1009 */ 1010 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1011 { 1012 return pci_reset_function(adev->pdev); 1013 } 1014 1015 /* 1016 * GPU doorbell aperture helpers function. 1017 */ 1018 /** 1019 * amdgpu_device_doorbell_init - Init doorbell driver information. 1020 * 1021 * @adev: amdgpu_device pointer 1022 * 1023 * Init doorbell driver information (CIK) 1024 * Returns 0 on success, error on failure. 1025 */ 1026 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) 1027 { 1028 1029 /* No doorbell on SI hardware generation */ 1030 if (adev->asic_type < CHIP_BONAIRE) { 1031 adev->doorbell.base = 0; 1032 adev->doorbell.size = 0; 1033 adev->doorbell.num_doorbells = 0; 1034 adev->doorbell.ptr = NULL; 1035 return 0; 1036 } 1037 1038 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) 1039 return -EINVAL; 1040 1041 amdgpu_asic_init_doorbell_index(adev); 1042 1043 /* doorbell bar mapping */ 1044 adev->doorbell.base = pci_resource_start(adev->pdev, 2); 1045 adev->doorbell.size = pci_resource_len(adev->pdev, 2); 1046 1047 if (adev->enable_mes) { 1048 adev->doorbell.num_doorbells = 1049 adev->doorbell.size / sizeof(u32); 1050 } else { 1051 adev->doorbell.num_doorbells = 1052 min_t(u32, adev->doorbell.size / sizeof(u32), 1053 adev->doorbell_index.max_assignment+1); 1054 if (adev->doorbell.num_doorbells == 0) 1055 return -EINVAL; 1056 1057 /* For Vega, reserve and map two pages on doorbell BAR since SDMA 1058 * paging queue doorbell use the second page. The 1059 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the 1060 * doorbells are in the first page. So with paging queue enabled, 1061 * the max num_doorbells should + 1 page (0x400 in dword) 1062 */ 1063 if (adev->asic_type >= CHIP_VEGA10) 1064 adev->doorbell.num_doorbells += 0x400; 1065 } 1066 1067 adev->doorbell.ptr = ioremap(adev->doorbell.base, 1068 adev->doorbell.num_doorbells * 1069 sizeof(u32)); 1070 if (adev->doorbell.ptr == NULL) 1071 return -ENOMEM; 1072 1073 return 0; 1074 } 1075 1076 /** 1077 * amdgpu_device_doorbell_fini - Tear down doorbell driver information. 1078 * 1079 * @adev: amdgpu_device pointer 1080 * 1081 * Tear down doorbell driver information (CIK) 1082 */ 1083 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) 1084 { 1085 iounmap(adev->doorbell.ptr); 1086 adev->doorbell.ptr = NULL; 1087 } 1088 1089 1090 1091 /* 1092 * amdgpu_device_wb_*() 1093 * Writeback is the method by which the GPU updates special pages in memory 1094 * with the status of certain GPU events (fences, ring pointers,etc.). 1095 */ 1096 1097 /** 1098 * amdgpu_device_wb_fini - Disable Writeback and free memory 1099 * 1100 * @adev: amdgpu_device pointer 1101 * 1102 * Disables Writeback and frees the Writeback memory (all asics). 1103 * Used at driver shutdown. 
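 *
 * Typical pairing with the allocation helpers below (illustrative sketch;
 * "offs" is a caller-owned u32):
 *
 *	r = amdgpu_device_wb_get(adev, &offs);
 *	...
 *	amdgpu_device_wb_free(adev, offs);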
1104 */ 1105 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1106 { 1107 if (adev->wb.wb_obj) { 1108 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1109 &adev->wb.gpu_addr, 1110 (void **)&adev->wb.wb); 1111 adev->wb.wb_obj = NULL; 1112 } 1113 } 1114 1115 /** 1116 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1117 * 1118 * @adev: amdgpu_device pointer 1119 * 1120 * Initializes writeback and allocates writeback memory (all asics). 1121 * Used at driver startup. 1122 * Returns 0 on success or an -error on failure. 1123 */ 1124 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1125 { 1126 int r; 1127 1128 if (adev->wb.wb_obj == NULL) { 1129 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1130 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1131 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1132 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1133 (void **)&adev->wb.wb); 1134 if (r) { 1135 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1136 return r; 1137 } 1138 1139 adev->wb.num_wb = AMDGPU_MAX_WB; 1140 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1141 1142 /* clear wb memory */ 1143 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1144 } 1145 1146 return 0; 1147 } 1148 1149 /** 1150 * amdgpu_device_wb_get - Allocate a wb entry 1151 * 1152 * @adev: amdgpu_device pointer 1153 * @wb: wb index 1154 * 1155 * Allocate a wb slot for use by the driver (all asics). 1156 * Returns 0 on success or -EINVAL on failure. 1157 */ 1158 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1159 { 1160 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1161 1162 if (offset < adev->wb.num_wb) { 1163 __set_bit(offset, adev->wb.used); 1164 *wb = offset << 3; /* convert to dw offset */ 1165 return 0; 1166 } else { 1167 return -EINVAL; 1168 } 1169 } 1170 1171 /** 1172 * amdgpu_device_wb_free - Free a wb entry 1173 * 1174 * @adev: amdgpu_device pointer 1175 * @wb: wb index 1176 * 1177 * Free a wb slot allocated for use by the driver (all asics) 1178 */ 1179 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1180 { 1181 wb >>= 3; 1182 if (wb < adev->wb.num_wb) 1183 __clear_bit(wb, adev->wb.used); 1184 } 1185 1186 /** 1187 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1188 * 1189 * @adev: amdgpu_device pointer 1190 * 1191 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1192 * to fail, but if any of the BARs is not accessible after the size we abort 1193 * driver loading by returning -ENODEV. 
1194 */ 1195 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1196 { 1197 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1198 struct pci_bus *root; 1199 struct resource *res; 1200 unsigned i; 1201 u16 cmd; 1202 int r; 1203 1204 /* Bypass for VF */ 1205 if (amdgpu_sriov_vf(adev)) 1206 return 0; 1207 1208 /* skip if the bios has already enabled large BAR */ 1209 if (adev->gmc.real_vram_size && 1210 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1211 return 0; 1212 1213 /* Check if the root BUS has 64bit memory resources */ 1214 root = adev->pdev->bus; 1215 while (root->parent) 1216 root = root->parent; 1217 1218 pci_bus_for_each_resource(root, res, i) { 1219 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1220 res->start > 0x100000000ull) 1221 break; 1222 } 1223 1224 /* Trying to resize is pointless without a root hub window above 4GB */ 1225 if (!res) 1226 return 0; 1227 1228 /* Limit the BAR size to what is available */ 1229 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1230 rbar_size); 1231 1232 /* Disable memory decoding while we change the BAR addresses and size */ 1233 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1234 pci_write_config_word(adev->pdev, PCI_COMMAND, 1235 cmd & ~PCI_COMMAND_MEMORY); 1236 1237 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1238 amdgpu_device_doorbell_fini(adev); 1239 if (adev->asic_type >= CHIP_BONAIRE) 1240 pci_release_resource(adev->pdev, 2); 1241 1242 pci_release_resource(adev->pdev, 0); 1243 1244 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1245 if (r == -ENOSPC) 1246 DRM_INFO("Not enough PCI address space for a large BAR."); 1247 else if (r && r != -ENOTSUPP) 1248 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1249 1250 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1251 1252 /* When the doorbell or fb BAR isn't available we have no chance of 1253 * using the device. 1254 */ 1255 r = amdgpu_device_doorbell_init(adev); 1256 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1257 return -ENODEV; 1258 1259 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1260 1261 return 0; 1262 } 1263 1264 /* 1265 * GPU helpers function. 1266 */ 1267 /** 1268 * amdgpu_device_need_post - check if the hw need post or not 1269 * 1270 * @adev: amdgpu_device pointer 1271 * 1272 * Check if the asic has been initialized (all asics) at driver startup 1273 * or post is needed if hw reset is performed. 1274 * Returns true if need or false if not. 
1275 */ 1276 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1277 { 1278 uint32_t reg; 1279 1280 if (amdgpu_sriov_vf(adev)) 1281 return false; 1282 1283 if (amdgpu_passthrough(adev)) { 1284 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1285 * some old smc fw still need driver do vPost otherwise gpu hang, while 1286 * those smc fw version above 22.15 doesn't have this flaw, so we force 1287 * vpost executed for smc version below 22.15 1288 */ 1289 if (adev->asic_type == CHIP_FIJI) { 1290 int err; 1291 uint32_t fw_ver; 1292 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1293 /* force vPost if error occured */ 1294 if (err) 1295 return true; 1296 1297 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1298 if (fw_ver < 0x00160e00) 1299 return true; 1300 } 1301 } 1302 1303 /* Don't post if we need to reset whole hive on init */ 1304 if (adev->gmc.xgmi.pending_reset) 1305 return false; 1306 1307 if (adev->has_hw_reset) { 1308 adev->has_hw_reset = false; 1309 return true; 1310 } 1311 1312 /* bios scratch used on CIK+ */ 1313 if (adev->asic_type >= CHIP_BONAIRE) 1314 return amdgpu_atombios_scratch_need_asic_init(adev); 1315 1316 /* check MEM_SIZE for older asics */ 1317 reg = amdgpu_asic_get_config_memsize(adev); 1318 1319 if ((reg != 0) && (reg != 0xffffffff)) 1320 return false; 1321 1322 return true; 1323 } 1324 1325 /** 1326 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1327 * 1328 * @adev: amdgpu_device pointer 1329 * 1330 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1331 * be set for this device. 1332 * 1333 * Returns true if it should be used or false if not. 1334 */ 1335 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1336 { 1337 switch (amdgpu_aspm) { 1338 case -1: 1339 break; 1340 case 0: 1341 return false; 1342 case 1: 1343 return true; 1344 default: 1345 return false; 1346 } 1347 return pcie_aspm_enabled(adev->pdev); 1348 } 1349 1350 /* if we get transitioned to only one device, take VGA back */ 1351 /** 1352 * amdgpu_device_vga_set_decode - enable/disable vga decode 1353 * 1354 * @pdev: PCI device pointer 1355 * @state: enable/disable vga decode 1356 * 1357 * Enable/disable vga decode (all asics). 1358 * Returns VGA resource flags. 1359 */ 1360 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1361 bool state) 1362 { 1363 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1364 amdgpu_asic_set_vga_state(adev, state); 1365 if (state) 1366 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1367 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1368 else 1369 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1370 } 1371 1372 /** 1373 * amdgpu_device_check_block_size - validate the vm block size 1374 * 1375 * @adev: amdgpu_device pointer 1376 * 1377 * Validates the vm block size specified via module parameter. 1378 * The vm block size defines number of bits in page table versus page directory, 1379 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1380 * page table and the remaining bits are in the page directory. 
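 *
 * Worked example (illustrative): with 4KB pages there are 12 offset bits, so
 * amdgpu_vm_block_size=9 leaves 9 bits of page-table index, i.e. one page
 * directory entry spans 2^(12+9) = 2MB; anything below 9 is rejected below.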
1381 */ 1382 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1383 { 1384 /* defines number of bits in page table versus page directory, 1385 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1386 * page table and the remaining bits are in the page directory */ 1387 if (amdgpu_vm_block_size == -1) 1388 return; 1389 1390 if (amdgpu_vm_block_size < 9) { 1391 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1392 amdgpu_vm_block_size); 1393 amdgpu_vm_block_size = -1; 1394 } 1395 } 1396 1397 /** 1398 * amdgpu_device_check_vm_size - validate the vm size 1399 * 1400 * @adev: amdgpu_device pointer 1401 * 1402 * Validates the vm size in GB specified via module parameter. 1403 * The VM size is the size of the GPU virtual memory space in GB. 1404 */ 1405 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1406 { 1407 /* no need to check the default value */ 1408 if (amdgpu_vm_size == -1) 1409 return; 1410 1411 if (amdgpu_vm_size < 1) { 1412 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1413 amdgpu_vm_size); 1414 amdgpu_vm_size = -1; 1415 } 1416 } 1417 1418 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1419 { 1420 struct sysinfo si; 1421 bool is_os_64 = (sizeof(void *) == 8); 1422 uint64_t total_memory; 1423 uint64_t dram_size_seven_GB = 0x1B8000000; 1424 uint64_t dram_size_three_GB = 0xB8000000; 1425 1426 if (amdgpu_smu_memory_pool_size == 0) 1427 return; 1428 1429 if (!is_os_64) { 1430 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1431 goto def_value; 1432 } 1433 si_meminfo(&si); 1434 total_memory = (uint64_t)si.totalram * si.mem_unit; 1435 1436 if ((amdgpu_smu_memory_pool_size == 1) || 1437 (amdgpu_smu_memory_pool_size == 2)) { 1438 if (total_memory < dram_size_three_GB) 1439 goto def_value1; 1440 } else if ((amdgpu_smu_memory_pool_size == 4) || 1441 (amdgpu_smu_memory_pool_size == 8)) { 1442 if (total_memory < dram_size_seven_GB) 1443 goto def_value1; 1444 } else { 1445 DRM_WARN("Smu memory pool size not supported\n"); 1446 goto def_value; 1447 } 1448 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1449 1450 return; 1451 1452 def_value1: 1453 DRM_WARN("No enough system memory\n"); 1454 def_value: 1455 adev->pm.smu_prv_buffer_size = 0; 1456 } 1457 1458 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1459 { 1460 if (!(adev->flags & AMD_IS_APU) || 1461 adev->asic_type < CHIP_RAVEN) 1462 return 0; 1463 1464 switch (adev->asic_type) { 1465 case CHIP_RAVEN: 1466 if (adev->pdev->device == 0x15dd) 1467 adev->apu_flags |= AMD_APU_IS_RAVEN; 1468 if (adev->pdev->device == 0x15d8) 1469 adev->apu_flags |= AMD_APU_IS_PICASSO; 1470 break; 1471 case CHIP_RENOIR: 1472 if ((adev->pdev->device == 0x1636) || 1473 (adev->pdev->device == 0x164c)) 1474 adev->apu_flags |= AMD_APU_IS_RENOIR; 1475 else 1476 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1477 break; 1478 case CHIP_VANGOGH: 1479 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1480 break; 1481 case CHIP_YELLOW_CARP: 1482 break; 1483 case CHIP_CYAN_SKILLFISH: 1484 if ((adev->pdev->device == 0x13FE) || 1485 (adev->pdev->device == 0x143F)) 1486 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1487 break; 1488 default: 1489 break; 1490 } 1491 1492 return 0; 1493 } 1494 1495 /** 1496 * amdgpu_device_check_arguments - validate module params 1497 * 1498 * @adev: amdgpu_device pointer 1499 * 1500 * Validates certain module parameters and updates 1501 * the associated values used by the driver (all asics). 
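 *
 * For example (illustrating the checks below): amdgpu.sched_jobs=6 is rounded
 * up to the next power of two (8), and amdgpu.sched_jobs=2 is raised to the
 * minimum of 4.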
1502 */ 1503 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1504 { 1505 if (amdgpu_sched_jobs < 4) { 1506 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1507 amdgpu_sched_jobs); 1508 amdgpu_sched_jobs = 4; 1509 } else if (!is_power_of_2(amdgpu_sched_jobs)){ 1510 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1511 amdgpu_sched_jobs); 1512 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1513 } 1514 1515 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1516 /* gart size must be greater or equal to 32M */ 1517 dev_warn(adev->dev, "gart size (%d) too small\n", 1518 amdgpu_gart_size); 1519 amdgpu_gart_size = -1; 1520 } 1521 1522 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1523 /* gtt size must be greater or equal to 32M */ 1524 dev_warn(adev->dev, "gtt size (%d) too small\n", 1525 amdgpu_gtt_size); 1526 amdgpu_gtt_size = -1; 1527 } 1528 1529 /* valid range is between 4 and 9 inclusive */ 1530 if (amdgpu_vm_fragment_size != -1 && 1531 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1532 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1533 amdgpu_vm_fragment_size = -1; 1534 } 1535 1536 if (amdgpu_sched_hw_submission < 2) { 1537 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1538 amdgpu_sched_hw_submission); 1539 amdgpu_sched_hw_submission = 2; 1540 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1541 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1542 amdgpu_sched_hw_submission); 1543 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1544 } 1545 1546 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 1547 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 1548 amdgpu_reset_method = -1; 1549 } 1550 1551 amdgpu_device_check_smu_prv_buffer_size(adev); 1552 1553 amdgpu_device_check_vm_size(adev); 1554 1555 amdgpu_device_check_block_size(adev); 1556 1557 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1558 1559 return 0; 1560 } 1561 1562 /** 1563 * amdgpu_switcheroo_set_state - set switcheroo state 1564 * 1565 * @pdev: pci dev pointer 1566 * @state: vga_switcheroo state 1567 * 1568 * Callback for the switcheroo driver. Suspends or resumes the 1569 * the asics before or after it is powered up using ACPI methods. 
1570 */ 1571 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1572 enum vga_switcheroo_state state) 1573 { 1574 struct drm_device *dev = pci_get_drvdata(pdev); 1575 int r; 1576 1577 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1578 return; 1579 1580 if (state == VGA_SWITCHEROO_ON) { 1581 pr_info("switched on\n"); 1582 /* don't suspend or resume card normally */ 1583 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1584 1585 pci_set_power_state(pdev, PCI_D0); 1586 amdgpu_device_load_pci_state(pdev); 1587 r = pci_enable_device(pdev); 1588 if (r) 1589 DRM_WARN("pci_enable_device failed (%d)\n", r); 1590 amdgpu_device_resume(dev, true); 1591 1592 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1593 } else { 1594 pr_info("switched off\n"); 1595 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1596 amdgpu_device_suspend(dev, true); 1597 amdgpu_device_cache_pci_state(pdev); 1598 /* Shut down the device */ 1599 pci_disable_device(pdev); 1600 pci_set_power_state(pdev, PCI_D3cold); 1601 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1602 } 1603 } 1604 1605 /** 1606 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1607 * 1608 * @pdev: pci dev pointer 1609 * 1610 * Callback for the switcheroo driver. Check of the switcheroo 1611 * state can be changed. 1612 * Returns true if the state can be changed, false if not. 1613 */ 1614 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1615 { 1616 struct drm_device *dev = pci_get_drvdata(pdev); 1617 1618 /* 1619 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1620 * locking inversion with the driver load path. And the access here is 1621 * completely racy anyway. So don't bother with locking for now. 1622 */ 1623 return atomic_read(&dev->open_count) == 0; 1624 } 1625 1626 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1627 .set_gpu_state = amdgpu_switcheroo_set_state, 1628 .reprobe = NULL, 1629 .can_switch = amdgpu_switcheroo_can_switch, 1630 }; 1631 1632 /** 1633 * amdgpu_device_ip_set_clockgating_state - set the CG state 1634 * 1635 * @dev: amdgpu_device pointer 1636 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1637 * @state: clockgating state (gate or ungate) 1638 * 1639 * Sets the requested clockgating state for all instances of 1640 * the hardware IP specified. 1641 * Returns the error code from the last instance. 1642 */ 1643 int amdgpu_device_ip_set_clockgating_state(void *dev, 1644 enum amd_ip_block_type block_type, 1645 enum amd_clockgating_state state) 1646 { 1647 struct amdgpu_device *adev = dev; 1648 int i, r = 0; 1649 1650 for (i = 0; i < adev->num_ip_blocks; i++) { 1651 if (!adev->ip_blocks[i].status.valid) 1652 continue; 1653 if (adev->ip_blocks[i].version->type != block_type) 1654 continue; 1655 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1656 continue; 1657 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1658 (void *)adev, state); 1659 if (r) 1660 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1661 adev->ip_blocks[i].version->funcs->name, r); 1662 } 1663 return r; 1664 } 1665 1666 /** 1667 * amdgpu_device_ip_set_powergating_state - set the PG state 1668 * 1669 * @dev: amdgpu_device pointer 1670 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1671 * @state: powergating state (gate or ungate) 1672 * 1673 * Sets the requested powergating state for all instances of 1674 * the hardware IP specified. 1675 * Returns the error code from the last instance. 
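 *
 * Illustrative call (a sketch, not taken from this file):
 *
 *	r = amdgpu_device_ip_set_powergating_state(adev,
 *						   AMD_IP_BLOCK_TYPE_UVD,
 *						   AMD_PG_STATE_GATE);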
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u64 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;
}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;
}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
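 *
 * Illustrative use (a sketch; amdgpu_device_ip_block_version_cmp() below does
 * essentially this):
 *
 *	ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *	if (ip_block && ip_block->version->major >= 10)
 *		; // GFX v10 or newer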
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if equal or greater
 * return 1 if smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	switch (ip_block_version->type) {
	case AMD_IP_BLOCK_TYPE_VCN:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
			return 0;
		break;
	case AMD_IP_BLOCK_TYPE_JPEG:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
			return 0;
		break;
	default:
		break;
	}

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		  ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
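 *
 * Example (illustrative, matching the parsing below): the option is a
 * semicolon-separated list of "<PCI address>[,<number of crtcs>]" entries,
 * e.g. amdgpu.virtual_display=0000:04:00.0,2 enables two virtual crtcs on
 * that device, and virtual_display=all,1 enables one on every amdgpu device.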
1870 */ 1871 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 1872 { 1873 adev->enable_virtual_display = false; 1874 1875 if (amdgpu_virtual_display) { 1876 const char *pci_address_name = pci_name(adev->pdev); 1877 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 1878 1879 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 1880 pciaddstr_tmp = pciaddstr; 1881 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 1882 pciaddname = strsep(&pciaddname_tmp, ","); 1883 if (!strcmp("all", pciaddname) 1884 || !strcmp(pci_address_name, pciaddname)) { 1885 long num_crtc; 1886 int res = -1; 1887 1888 adev->enable_virtual_display = true; 1889 1890 if (pciaddname_tmp) 1891 res = kstrtol(pciaddname_tmp, 10, 1892 &num_crtc); 1893 1894 if (!res) { 1895 if (num_crtc < 1) 1896 num_crtc = 1; 1897 if (num_crtc > 6) 1898 num_crtc = 6; 1899 adev->mode_info.num_crtc = num_crtc; 1900 } else { 1901 adev->mode_info.num_crtc = 1; 1902 } 1903 break; 1904 } 1905 } 1906 1907 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 1908 amdgpu_virtual_display, pci_address_name, 1909 adev->enable_virtual_display, adev->mode_info.num_crtc); 1910 1911 kfree(pciaddstr); 1912 } 1913 } 1914 1915 /** 1916 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 1917 * 1918 * @adev: amdgpu_device pointer 1919 * 1920 * Parses the asic configuration parameters specified in the gpu info 1921 * firmware and makes them availale to the driver for use in configuring 1922 * the asic. 1923 * Returns 0 on success, -EINVAL on failure. 1924 */ 1925 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 1926 { 1927 const char *chip_name; 1928 char fw_name[40]; 1929 int err; 1930 const struct gpu_info_firmware_header_v1_0 *hdr; 1931 1932 adev->firmware.gpu_info_fw = NULL; 1933 1934 if (adev->mman.discovery_bin) { 1935 /* 1936 * FIXME: The bounding box is still needed by Navi12, so 1937 * temporarily read it from gpu_info firmware. Should be dropped 1938 * when DAL no longer needs it. 
1939 */ 1940 if (adev->asic_type != CHIP_NAVI12) 1941 return 0; 1942 } 1943 1944 switch (adev->asic_type) { 1945 #ifdef CONFIG_DRM_AMDGPU_SI 1946 case CHIP_VERDE: 1947 case CHIP_TAHITI: 1948 case CHIP_PITCAIRN: 1949 case CHIP_OLAND: 1950 case CHIP_HAINAN: 1951 #endif 1952 #ifdef CONFIG_DRM_AMDGPU_CIK 1953 case CHIP_BONAIRE: 1954 case CHIP_HAWAII: 1955 case CHIP_KAVERI: 1956 case CHIP_KABINI: 1957 case CHIP_MULLINS: 1958 #endif 1959 case CHIP_TOPAZ: 1960 case CHIP_TONGA: 1961 case CHIP_FIJI: 1962 case CHIP_POLARIS10: 1963 case CHIP_POLARIS11: 1964 case CHIP_POLARIS12: 1965 case CHIP_VEGAM: 1966 case CHIP_CARRIZO: 1967 case CHIP_STONEY: 1968 case CHIP_VEGA20: 1969 case CHIP_ALDEBARAN: 1970 case CHIP_SIENNA_CICHLID: 1971 case CHIP_NAVY_FLOUNDER: 1972 case CHIP_DIMGREY_CAVEFISH: 1973 case CHIP_BEIGE_GOBY: 1974 default: 1975 return 0; 1976 case CHIP_VEGA10: 1977 chip_name = "vega10"; 1978 break; 1979 case CHIP_VEGA12: 1980 chip_name = "vega12"; 1981 break; 1982 case CHIP_RAVEN: 1983 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1984 chip_name = "raven2"; 1985 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1986 chip_name = "picasso"; 1987 else 1988 chip_name = "raven"; 1989 break; 1990 case CHIP_ARCTURUS: 1991 chip_name = "arcturus"; 1992 break; 1993 case CHIP_NAVI12: 1994 chip_name = "navi12"; 1995 break; 1996 } 1997 1998 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 1999 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); 2000 if (err) { 2001 dev_err(adev->dev, 2002 "Failed to load gpu_info firmware \"%s\"\n", 2003 fw_name); 2004 goto out; 2005 } 2006 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); 2007 if (err) { 2008 dev_err(adev->dev, 2009 "Failed to validate gpu_info firmware \"%s\"\n", 2010 fw_name); 2011 goto out; 2012 } 2013 2014 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2015 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2016 2017 switch (hdr->version_major) { 2018 case 1: 2019 { 2020 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2021 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2022 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2023 2024 /* 2025 * Should be droped when DAL no longer needs it. 
2026 */ 2027 if (adev->asic_type == CHIP_NAVI12) 2028 goto parse_soc_bounding_box; 2029 2030 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2031 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2032 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2033 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2034 adev->gfx.config.max_texture_channel_caches = 2035 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2036 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2037 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2038 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2039 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2040 adev->gfx.config.double_offchip_lds_buf = 2041 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2042 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2043 adev->gfx.cu_info.max_waves_per_simd = 2044 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2045 adev->gfx.cu_info.max_scratch_slots_per_cu = 2046 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2047 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2048 if (hdr->version_minor >= 1) { 2049 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2050 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2051 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2052 adev->gfx.config.num_sc_per_sh = 2053 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2054 adev->gfx.config.num_packer_per_sc = 2055 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2056 } 2057 2058 parse_soc_bounding_box: 2059 /* 2060 * soc bounding box info is not integrated in the discovery table, so 2061 * we always need to parse it from the gpu info firmware when needed. 2062 */ 2063 if (hdr->version_minor == 2) { 2064 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2065 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2066 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2067 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2068 } 2069 break; 2070 } 2071 default: 2072 dev_err(adev->dev, 2073 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2074 err = -EINVAL; 2075 goto out; 2076 } 2077 out: 2078 return err; 2079 } 2080 2081 /** 2082 * amdgpu_device_ip_early_init - run early init for hardware IPs 2083 * 2084 * @adev: amdgpu_device pointer 2085 * 2086 * Early initialization pass for hardware IPs. The hardware IPs that make 2087 * up each asic are discovered and each IP's early_init callback is run. This 2088 * is the first stage in initializing the asic. 2089 * Returns 0 on success, negative error code on failure.
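 *
 * Besides invoking each IP's early_init callback, this pass selects the IP
 * block list for the detected ASIC family, marks any blocks disabled via
 * amdgpu_ip_block_mask as invalid, and, once the COMMON block's asic
 * functions are available, parses the gpu_info firmware, reads the vbios
 * and initializes atombios.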
2090 */ 2091 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2092 { 2093 struct drm_device *dev = adev_to_drm(adev); 2094 struct pci_dev *parent; 2095 int i, r; 2096 2097 amdgpu_device_enable_virtual_display(adev); 2098 2099 if (amdgpu_sriov_vf(adev)) { 2100 r = amdgpu_virt_request_full_gpu(adev, true); 2101 if (r) 2102 return r; 2103 } 2104 2105 switch (adev->asic_type) { 2106 #ifdef CONFIG_DRM_AMDGPU_SI 2107 case CHIP_VERDE: 2108 case CHIP_TAHITI: 2109 case CHIP_PITCAIRN: 2110 case CHIP_OLAND: 2111 case CHIP_HAINAN: 2112 adev->family = AMDGPU_FAMILY_SI; 2113 r = si_set_ip_blocks(adev); 2114 if (r) 2115 return r; 2116 break; 2117 #endif 2118 #ifdef CONFIG_DRM_AMDGPU_CIK 2119 case CHIP_BONAIRE: 2120 case CHIP_HAWAII: 2121 case CHIP_KAVERI: 2122 case CHIP_KABINI: 2123 case CHIP_MULLINS: 2124 if (adev->flags & AMD_IS_APU) 2125 adev->family = AMDGPU_FAMILY_KV; 2126 else 2127 adev->family = AMDGPU_FAMILY_CI; 2128 2129 r = cik_set_ip_blocks(adev); 2130 if (r) 2131 return r; 2132 break; 2133 #endif 2134 case CHIP_TOPAZ: 2135 case CHIP_TONGA: 2136 case CHIP_FIJI: 2137 case CHIP_POLARIS10: 2138 case CHIP_POLARIS11: 2139 case CHIP_POLARIS12: 2140 case CHIP_VEGAM: 2141 case CHIP_CARRIZO: 2142 case CHIP_STONEY: 2143 if (adev->flags & AMD_IS_APU) 2144 adev->family = AMDGPU_FAMILY_CZ; 2145 else 2146 adev->family = AMDGPU_FAMILY_VI; 2147 2148 r = vi_set_ip_blocks(adev); 2149 if (r) 2150 return r; 2151 break; 2152 default: 2153 r = amdgpu_discovery_set_ip_blocks(adev); 2154 if (r) 2155 return r; 2156 break; 2157 } 2158 2159 if (amdgpu_has_atpx() && 2160 (amdgpu_is_atpx_hybrid() || 2161 amdgpu_has_atpx_dgpu_power_cntl()) && 2162 ((adev->flags & AMD_IS_APU) == 0) && 2163 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev))) 2164 adev->flags |= AMD_IS_PX; 2165 2166 if (!(adev->flags & AMD_IS_APU)) { 2167 parent = pci_upstream_bridge(adev->pdev); 2168 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2169 } 2170 2171 amdgpu_amdkfd_device_probe(adev); 2172 2173 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2174 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2175 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2176 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2177 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2178 2179 for (i = 0; i < adev->num_ip_blocks; i++) { 2180 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2181 DRM_ERROR("disabled ip block: %d <%s>\n", 2182 i, adev->ip_blocks[i].version->funcs->name); 2183 adev->ip_blocks[i].status.valid = false; 2184 } else { 2185 if (adev->ip_blocks[i].version->funcs->early_init) { 2186 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2187 if (r == -ENOENT) { 2188 adev->ip_blocks[i].status.valid = false; 2189 } else if (r) { 2190 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2191 adev->ip_blocks[i].version->funcs->name, r); 2192 return r; 2193 } else { 2194 adev->ip_blocks[i].status.valid = true; 2195 } 2196 } else { 2197 adev->ip_blocks[i].status.valid = true; 2198 } 2199 } 2200 /* get the vbios after the asic_funcs are set up */ 2201 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2202 r = amdgpu_device_parse_gpu_info_fw(adev); 2203 if (r) 2204 return r; 2205 2206 /* Read BIOS */ 2207 if (!amdgpu_get_bios(adev)) 2208 return -EINVAL; 2209 2210 r = amdgpu_atombios_init(adev); 2211 if (r) { 2212 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2213 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2214 return r; 2215 } 2216 2217 /*get pf2vf msg info at it's earliest time*/ 2218 if (amdgpu_sriov_vf(adev)) 2219 amdgpu_virt_init_data_exchange(adev); 2220 2221 } 2222 } 2223 2224 adev->cg_flags &= amdgpu_cg_mask; 2225 adev->pg_flags &= amdgpu_pg_mask; 2226 2227 return 0; 2228 } 2229 2230 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2231 { 2232 int i, r; 2233 2234 for (i = 0; i < adev->num_ip_blocks; i++) { 2235 if (!adev->ip_blocks[i].status.sw) 2236 continue; 2237 if (adev->ip_blocks[i].status.hw) 2238 continue; 2239 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2240 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2241 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2242 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2243 if (r) { 2244 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2245 adev->ip_blocks[i].version->funcs->name, r); 2246 return r; 2247 } 2248 adev->ip_blocks[i].status.hw = true; 2249 } 2250 } 2251 2252 return 0; 2253 } 2254 2255 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2256 { 2257 int i, r; 2258 2259 for (i = 0; i < adev->num_ip_blocks; i++) { 2260 if (!adev->ip_blocks[i].status.sw) 2261 continue; 2262 if (adev->ip_blocks[i].status.hw) 2263 continue; 2264 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2265 if (r) { 2266 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2267 adev->ip_blocks[i].version->funcs->name, r); 2268 return r; 2269 } 2270 adev->ip_blocks[i].status.hw = true; 2271 } 2272 2273 return 0; 2274 } 2275 2276 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2277 { 2278 int r = 0; 2279 int i; 2280 uint32_t smu_version; 2281 2282 if (adev->asic_type >= CHIP_VEGA10) { 2283 for (i = 0; i < adev->num_ip_blocks; i++) { 2284 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2285 continue; 2286 2287 if 
(!adev->ip_blocks[i].status.sw) 2288 continue; 2289 2290 /* no need to do the fw loading again if already done*/ 2291 if (adev->ip_blocks[i].status.hw == true) 2292 break; 2293 2294 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2295 r = adev->ip_blocks[i].version->funcs->resume(adev); 2296 if (r) { 2297 DRM_ERROR("resume of IP block <%s> failed %d\n", 2298 adev->ip_blocks[i].version->funcs->name, r); 2299 return r; 2300 } 2301 } else { 2302 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2303 if (r) { 2304 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2305 adev->ip_blocks[i].version->funcs->name, r); 2306 return r; 2307 } 2308 } 2309 2310 adev->ip_blocks[i].status.hw = true; 2311 break; 2312 } 2313 } 2314 2315 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2316 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2317 2318 return r; 2319 } 2320 2321 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2322 { 2323 long timeout; 2324 int r, i; 2325 2326 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2327 struct amdgpu_ring *ring = adev->rings[i]; 2328 2329 /* No need to setup the GPU scheduler for rings that don't need it */ 2330 if (!ring || ring->no_scheduler) 2331 continue; 2332 2333 switch (ring->funcs->type) { 2334 case AMDGPU_RING_TYPE_GFX: 2335 timeout = adev->gfx_timeout; 2336 break; 2337 case AMDGPU_RING_TYPE_COMPUTE: 2338 timeout = adev->compute_timeout; 2339 break; 2340 case AMDGPU_RING_TYPE_SDMA: 2341 timeout = adev->sdma_timeout; 2342 break; 2343 default: 2344 timeout = adev->video_timeout; 2345 break; 2346 } 2347 2348 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2349 ring->num_hw_submission, amdgpu_job_hang_limit, 2350 timeout, adev->reset_domain->wq, 2351 ring->sched_score, ring->name, 2352 adev->dev); 2353 if (r) { 2354 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2355 ring->name); 2356 return r; 2357 } 2358 } 2359 2360 return 0; 2361 } 2362 2363 2364 /** 2365 * amdgpu_device_ip_init - run init for hardware IPs 2366 * 2367 * @adev: amdgpu_device pointer 2368 * 2369 * Main initialization pass for hardware IPs. The list of all the hardware 2370 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2371 * are run. sw_init initializes the software state associated with each IP 2372 * and hw_init initializes the hardware associated with each IP. 2373 * Returns 0 on success, negative error code on failure. 
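 *
 * The GMC block is hw-initialized early (right after its sw_init) so that
 * VRAM scratch, write-back and CSA buffers can be allocated. The remaining
 * blocks are then brought up in two phases (COMMON/IH, plus PSP for SR-IOV,
 * first; firmware loading in between; everything else afterwards) before
 * RAS recovery data, the GPU schedulers and KFD are initialized.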
2374 */ 2375 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2376 { 2377 int i, r; 2378 2379 r = amdgpu_ras_init(adev); 2380 if (r) 2381 return r; 2382 2383 for (i = 0; i < adev->num_ip_blocks; i++) { 2384 if (!adev->ip_blocks[i].status.valid) 2385 continue; 2386 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2387 if (r) { 2388 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2389 adev->ip_blocks[i].version->funcs->name, r); 2390 goto init_failed; 2391 } 2392 adev->ip_blocks[i].status.sw = true; 2393 2394 /* need to do gmc hw init early so we can allocate gpu mem */ 2395 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2396 /* Try to reserve bad pages early */ 2397 if (amdgpu_sriov_vf(adev)) 2398 amdgpu_virt_exchange_data(adev); 2399 2400 r = amdgpu_device_vram_scratch_init(adev); 2401 if (r) { 2402 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2403 goto init_failed; 2404 } 2405 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2406 if (r) { 2407 DRM_ERROR("hw_init %d failed %d\n", i, r); 2408 goto init_failed; 2409 } 2410 r = amdgpu_device_wb_init(adev); 2411 if (r) { 2412 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2413 goto init_failed; 2414 } 2415 adev->ip_blocks[i].status.hw = true; 2416 2417 /* right after GMC hw init, we create CSA */ 2418 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2419 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2420 AMDGPU_GEM_DOMAIN_VRAM, 2421 AMDGPU_CSA_SIZE); 2422 if (r) { 2423 DRM_ERROR("allocate CSA failed %d\n", r); 2424 goto init_failed; 2425 } 2426 } 2427 } 2428 } 2429 2430 if (amdgpu_sriov_vf(adev)) 2431 amdgpu_virt_init_data_exchange(adev); 2432 2433 r = amdgpu_ib_pool_init(adev); 2434 if (r) { 2435 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2436 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2437 goto init_failed; 2438 } 2439 2440 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2441 if (r) 2442 goto init_failed; 2443 2444 r = amdgpu_device_ip_hw_init_phase1(adev); 2445 if (r) 2446 goto init_failed; 2447 2448 r = amdgpu_device_fw_loading(adev); 2449 if (r) 2450 goto init_failed; 2451 2452 r = amdgpu_device_ip_hw_init_phase2(adev); 2453 if (r) 2454 goto init_failed; 2455 2456 /* 2457 * retired pages will be loaded from eeprom and reserved here, 2458 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2459 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2460 * for I2C communication which only true at this point. 2461 * 2462 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2463 * failure from bad gpu situation and stop amdgpu init process 2464 * accordingly. For other failed cases, it will still release all 2465 * the resource and print error message, rather than returning one 2466 * negative value to upper level. 
2467 * 2468 * Note: theoretically, this should be called before all vram allocations 2469 * to protect retired pages from being allocated again 2470 */ 2471 r = amdgpu_ras_recovery_init(adev); 2472 if (r) 2473 goto init_failed; 2474 2475 /** 2476 * In case of XGMI grab extra reference for reset domain for this device 2477 */ 2478 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2479 if (amdgpu_xgmi_add_device(adev) == 0) { 2480 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2481 2482 if (!hive->reset_domain || 2483 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2484 r = -ENOENT; 2485 goto init_failed; 2486 } 2487 2488 /* Drop the early temporary reset domain we created for device */ 2489 amdgpu_reset_put_reset_domain(adev->reset_domain); 2490 adev->reset_domain = hive->reset_domain; 2491 } 2492 } 2493 2494 r = amdgpu_device_init_schedulers(adev); 2495 if (r) 2496 goto init_failed; 2497 2498 /* Don't init kfd if the whole hive needs to be reset during init */ 2499 if (!adev->gmc.xgmi.pending_reset) 2500 amdgpu_amdkfd_device_init(adev); 2501 2502 amdgpu_fru_get_product_info(adev); 2503 2504 init_failed: 2505 if (amdgpu_sriov_vf(adev)) 2506 amdgpu_virt_release_full_gpu(adev, true); 2507 2508 return r; 2509 } 2510 2511 /** 2512 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2513 * 2514 * @adev: amdgpu_device pointer 2515 * 2516 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2517 * this function before a GPU reset. If the value is retained after a 2518 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 2519 */ 2520 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2521 { 2522 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2523 } 2524 2525 /** 2526 * amdgpu_device_check_vram_lost - check if vram is valid 2527 * 2528 * @adev: amdgpu_device pointer 2529 * 2530 * Checks the reset magic value written to the gart pointer in VRAM. 2531 * The driver calls this after a GPU reset to see if the contents of 2532 * VRAM have been lost or not. 2533 * Returns true if vram is lost, false if not. 2534 */ 2535 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2536 { 2537 if (memcmp(adev->gart.ptr, adev->reset_magic, 2538 AMDGPU_RESET_MAGIC_NUM)) 2539 return true; 2540 2541 if (!amdgpu_in_reset(adev)) 2542 return false; 2543 2544 /* 2545 * For all ASICs with baco/mode1 reset, the VRAM is 2546 * always assumed to be lost. 2547 */ 2548 switch (amdgpu_asic_reset_method(adev)) { 2549 case AMD_RESET_METHOD_BACO: 2550 case AMD_RESET_METHOD_MODE1: 2551 return true; 2552 default: 2553 return false; 2554 } 2555 } 2556 2557 /** 2558 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2559 * 2560 * @adev: amdgpu_device pointer 2561 * @state: clockgating state (gate or ungate) 2562 * 2563 * The list of all the hardware IPs that make up the asic is walked and the 2564 * set_clockgating_state callbacks are run. 2565 * During late init this pass enables clockgating for the hardware IPs; 2566 * during fini or suspend it disables clockgating for the hardware IPs. 2567 * Returns 0 on success, negative error code on failure. 2568 */ 2569 2570 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2571 enum amd_clockgating_state state) 2572 { 2573 int i, j, r; 2574 2575 if (amdgpu_emu_mode == 1) 2576 return 0; 2577 2578 for (j = 0; j < adev->num_ip_blocks; j++) { 2579 i = state == AMD_CG_STATE_GATE ?
j : adev->num_ip_blocks - j - 1; 2580 if (!adev->ip_blocks[i].status.late_initialized) 2581 continue; 2582 /* skip CG for GFX on S0ix */ 2583 if (adev->in_s0ix && 2584 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2585 continue; 2586 /* skip CG for VCE/UVD, it's handled specially */ 2587 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2588 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2589 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2590 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2591 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2592 /* enable clockgating to save power */ 2593 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2594 state); 2595 if (r) { 2596 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2597 adev->ip_blocks[i].version->funcs->name, r); 2598 return r; 2599 } 2600 } 2601 } 2602 2603 return 0; 2604 } 2605 2606 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2607 enum amd_powergating_state state) 2608 { 2609 int i, j, r; 2610 2611 if (amdgpu_emu_mode == 1) 2612 return 0; 2613 2614 for (j = 0; j < adev->num_ip_blocks; j++) { 2615 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2616 if (!adev->ip_blocks[i].status.late_initialized) 2617 continue; 2618 /* skip PG for GFX on S0ix */ 2619 if (adev->in_s0ix && 2620 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2621 continue; 2622 /* skip PG for VCE/UVD, it's handled specially */ 2623 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2624 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2625 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2626 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2627 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2628 /* enable powergating to save power */ 2629 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2630 state); 2631 if (r) { 2632 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2633 adev->ip_blocks[i].version->funcs->name, r); 2634 return r; 2635 } 2636 } 2637 } 2638 return 0; 2639 } 2640 2641 static int amdgpu_device_enable_mgpu_fan_boost(void) 2642 { 2643 struct amdgpu_gpu_instance *gpu_ins; 2644 struct amdgpu_device *adev; 2645 int i, ret = 0; 2646 2647 mutex_lock(&mgpu_info.mutex); 2648 2649 /* 2650 * MGPU fan boost feature should be enabled 2651 * only when there are two or more dGPUs in 2652 * the system 2653 */ 2654 if (mgpu_info.num_dgpu < 2) 2655 goto out; 2656 2657 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2658 gpu_ins = &(mgpu_info.gpu_ins[i]); 2659 adev = gpu_ins->adev; 2660 if (!(adev->flags & AMD_IS_APU) && 2661 !gpu_ins->mgpu_fan_enabled) { 2662 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2663 if (ret) 2664 break; 2665 2666 gpu_ins->mgpu_fan_enabled = 1; 2667 } 2668 } 2669 2670 out: 2671 mutex_unlock(&mgpu_info.mutex); 2672 2673 return ret; 2674 } 2675 2676 /** 2677 * amdgpu_device_ip_late_init - run late init for hardware IPs 2678 * 2679 * @adev: amdgpu_device pointer 2680 * 2681 * Late initialization pass for hardware IPs. The list of all the hardware 2682 * IPs that make up the asic is walked and the late_init callbacks are run. 2683 * late_init covers any special initialization that an IP requires 2684 * after all of the IPs have been initialized or something that needs to happen 2685 * late in the init process. 2686 * Returns 0 on success, negative error code on failure.
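 *
 * Beyond the per-IP late_init callbacks, this pass also enables clockgating
 * and powergating, records the reset magic used for VRAM-lost detection,
 * enables the mGPU fan boost where applicable and lowers the XGMI p-state
 * once all devices of a hive have been initialized.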
2687 */ 2688 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2689 { 2690 struct amdgpu_gpu_instance *gpu_instance; 2691 int i = 0, r; 2692 2693 for (i = 0; i < adev->num_ip_blocks; i++) { 2694 if (!adev->ip_blocks[i].status.hw) 2695 continue; 2696 if (adev->ip_blocks[i].version->funcs->late_init) { 2697 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2698 if (r) { 2699 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2700 adev->ip_blocks[i].version->funcs->name, r); 2701 return r; 2702 } 2703 } 2704 adev->ip_blocks[i].status.late_initialized = true; 2705 } 2706 2707 r = amdgpu_ras_late_init(adev); 2708 if (r) { 2709 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 2710 return r; 2711 } 2712 2713 amdgpu_ras_set_error_query_ready(adev, true); 2714 2715 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2716 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2717 2718 amdgpu_device_fill_reset_magic(adev); 2719 2720 r = amdgpu_device_enable_mgpu_fan_boost(); 2721 if (r) 2722 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2723 2724 /* For passthrough configuration on arcturus and aldebaran, enable special handling for SBR */ 2725 if (amdgpu_passthrough(adev) && ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1)|| 2726 adev->asic_type == CHIP_ALDEBARAN )) 2727 amdgpu_dpm_handle_passthrough_sbr(adev, true); 2728 2729 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2730 mutex_lock(&mgpu_info.mutex); 2731 2732 /* 2733 * Reset device p-state to low as this was booted with high. 2734 * 2735 * This should be performed only after all devices from the same 2736 * hive get initialized. 2737 * 2738 * However, the number of devices in the hive is not known in advance; 2739 * it is counted one by one as each device is initialized. 2740 * 2741 * So we wait until all XGMI interlinked devices have been initialized. 2742 * This may introduce some delay as those devices may come from 2743 * different hives. But that should be OK.
2744 */ 2745 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2746 for (i = 0; i < mgpu_info.num_gpu; i++) { 2747 gpu_instance = &(mgpu_info.gpu_ins[i]); 2748 if (gpu_instance->adev->flags & AMD_IS_APU) 2749 continue; 2750 2751 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2752 AMDGPU_XGMI_PSTATE_MIN); 2753 if (r) { 2754 DRM_ERROR("pstate setting failed (%d).\n", r); 2755 break; 2756 } 2757 } 2758 } 2759 2760 mutex_unlock(&mgpu_info.mutex); 2761 } 2762 2763 return 0; 2764 } 2765 2766 /** 2767 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2768 * 2769 * @adev: amdgpu_device pointer 2770 * 2771 * For ASICs need to disable SMC first 2772 */ 2773 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2774 { 2775 int i, r; 2776 2777 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) 2778 return; 2779 2780 for (i = 0; i < adev->num_ip_blocks; i++) { 2781 if (!adev->ip_blocks[i].status.hw) 2782 continue; 2783 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2784 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2785 /* XXX handle errors */ 2786 if (r) { 2787 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2788 adev->ip_blocks[i].version->funcs->name, r); 2789 } 2790 adev->ip_blocks[i].status.hw = false; 2791 break; 2792 } 2793 } 2794 } 2795 2796 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2797 { 2798 int i, r; 2799 2800 for (i = 0; i < adev->num_ip_blocks; i++) { 2801 if (!adev->ip_blocks[i].version->funcs->early_fini) 2802 continue; 2803 2804 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2805 if (r) { 2806 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2807 adev->ip_blocks[i].version->funcs->name, r); 2808 } 2809 } 2810 2811 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2812 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2813 2814 amdgpu_amdkfd_suspend(adev, false); 2815 2816 /* Workaroud for ASICs need to disable SMC first */ 2817 amdgpu_device_smu_fini_early(adev); 2818 2819 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2820 if (!adev->ip_blocks[i].status.hw) 2821 continue; 2822 2823 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2824 /* XXX handle errors */ 2825 if (r) { 2826 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2827 adev->ip_blocks[i].version->funcs->name, r); 2828 } 2829 2830 adev->ip_blocks[i].status.hw = false; 2831 } 2832 2833 if (amdgpu_sriov_vf(adev)) { 2834 if (amdgpu_virt_release_full_gpu(adev, false)) 2835 DRM_ERROR("failed to release exclusive mode on fini\n"); 2836 } 2837 2838 return 0; 2839 } 2840 2841 /** 2842 * amdgpu_device_ip_fini - run fini for hardware IPs 2843 * 2844 * @adev: amdgpu_device pointer 2845 * 2846 * Main teardown pass for hardware IPs. The list of all the hardware 2847 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2848 * are run. hw_fini tears down the hardware associated with each IP 2849 * and sw_fini tears down any software state associated with each IP. 2850 * Returns 0 on success, negative error code on failure. 
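 *
 * Blocks are torn down in the reverse of their init order; when the GMC
 * block is reached, the ucode BO, static CSA, write-back, VRAM scratch and
 * IB pool allocations are freed before its sw_fini runs.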
2851 */ 2852 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2853 { 2854 int i, r; 2855 2856 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2857 amdgpu_virt_release_ras_err_handler_data(adev); 2858 2859 if (adev->gmc.xgmi.num_physical_nodes > 1) 2860 amdgpu_xgmi_remove_device(adev); 2861 2862 amdgpu_amdkfd_device_fini_sw(adev); 2863 2864 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2865 if (!adev->ip_blocks[i].status.sw) 2866 continue; 2867 2868 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2869 amdgpu_ucode_free_bo(adev); 2870 amdgpu_free_static_csa(&adev->virt.csa_obj); 2871 amdgpu_device_wb_fini(adev); 2872 amdgpu_device_vram_scratch_fini(adev); 2873 amdgpu_ib_pool_fini(adev); 2874 } 2875 2876 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2877 /* XXX handle errors */ 2878 if (r) { 2879 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2880 adev->ip_blocks[i].version->funcs->name, r); 2881 } 2882 adev->ip_blocks[i].status.sw = false; 2883 adev->ip_blocks[i].status.valid = false; 2884 } 2885 2886 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2887 if (!adev->ip_blocks[i].status.late_initialized) 2888 continue; 2889 if (adev->ip_blocks[i].version->funcs->late_fini) 2890 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2891 adev->ip_blocks[i].status.late_initialized = false; 2892 } 2893 2894 amdgpu_ras_fini(adev); 2895 2896 return 0; 2897 } 2898 2899 /** 2900 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2901 * 2902 * @work: work_struct. 2903 */ 2904 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2905 { 2906 struct amdgpu_device *adev = 2907 container_of(work, struct amdgpu_device, delayed_init_work.work); 2908 int r; 2909 2910 r = amdgpu_ib_ring_tests(adev); 2911 if (r) 2912 DRM_ERROR("ib ring test failed (%d).\n", r); 2913 } 2914 2915 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2916 { 2917 struct amdgpu_device *adev = 2918 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2919 2920 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2921 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2922 2923 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2924 adev->gfx.gfx_off_state = true; 2925 } 2926 2927 /** 2928 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2929 * 2930 * @adev: amdgpu_device pointer 2931 * 2932 * Main suspend function for hardware IPs. The list of all the hardware 2933 * IPs that make up the asic is walked, clockgating is disabled and the 2934 * suspend callbacks are run. suspend puts the hardware and software state 2935 * in each IP into a state suitable for suspend. 2936 * Returns 0 on success, negative error code on failure. 
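 *
 * Phase 1 only suspends the display (DCE) hardware; all other IPs are left
 * running and are suspended later in phase 2.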
2937 */ 2938 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2939 { 2940 int i, r; 2941 2942 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2943 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2944 2945 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2946 if (!adev->ip_blocks[i].status.valid) 2947 continue; 2948 2949 /* displays are handled separately */ 2950 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2951 continue; 2952 2953 /* XXX handle errors */ 2954 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2955 /* XXX handle errors */ 2956 if (r) { 2957 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2958 adev->ip_blocks[i].version->funcs->name, r); 2959 return r; 2960 } 2961 2962 adev->ip_blocks[i].status.hw = false; 2963 } 2964 2965 return 0; 2966 } 2967 2968 /** 2969 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2970 * 2971 * @adev: amdgpu_device pointer 2972 * 2973 * Main suspend function for hardware IPs. The list of all the hardware 2974 * IPs that make up the asic is walked, clockgating is disabled and the 2975 * suspend callbacks are run. suspend puts the hardware and software state 2976 * in each IP into a state suitable for suspend. 2977 * Returns 0 on success, negative error code on failure. 2978 */ 2979 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2980 { 2981 int i, r; 2982 2983 if (adev->in_s0ix) 2984 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 2985 2986 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2987 if (!adev->ip_blocks[i].status.valid) 2988 continue; 2989 /* displays are handled in phase1 */ 2990 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2991 continue; 2992 /* PSP lost connection when err_event_athub occurs */ 2993 if (amdgpu_ras_intr_triggered() && 2994 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2995 adev->ip_blocks[i].status.hw = false; 2996 continue; 2997 } 2998 2999 /* skip unnecessary suspend if we do not initialize them yet */ 3000 if (adev->gmc.xgmi.pending_reset && 3001 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3002 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3003 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3004 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3005 adev->ip_blocks[i].status.hw = false; 3006 continue; 3007 } 3008 3009 /* skip suspend of gfx and psp for S0ix 3010 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3011 * like at runtime. PSP is also part of the always on hardware 3012 * so no need to suspend it. 
3013 */ 3014 if (adev->in_s0ix && 3015 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3016 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)) 3017 continue; 3018 3019 /* XXX handle errors */ 3020 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3021 /* XXX handle errors */ 3022 if (r) { 3023 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3024 adev->ip_blocks[i].version->funcs->name, r); 3025 } 3026 adev->ip_blocks[i].status.hw = false; 3027 /* handle putting the SMC in the appropriate state */ 3028 if(!amdgpu_sriov_vf(adev)){ 3029 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3030 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3031 if (r) { 3032 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3033 adev->mp1_state, r); 3034 return r; 3035 } 3036 } 3037 } 3038 } 3039 3040 return 0; 3041 } 3042 3043 /** 3044 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3045 * 3046 * @adev: amdgpu_device pointer 3047 * 3048 * Main suspend function for hardware IPs. The list of all the hardware 3049 * IPs that make up the asic is walked, clockgating is disabled and the 3050 * suspend callbacks are run. suspend puts the hardware and software state 3051 * in each IP into a state suitable for suspend. 3052 * Returns 0 on success, negative error code on failure. 3053 */ 3054 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3055 { 3056 int r; 3057 3058 if (amdgpu_sriov_vf(adev)) { 3059 amdgpu_virt_fini_data_exchange(adev); 3060 amdgpu_virt_request_full_gpu(adev, false); 3061 } 3062 3063 r = amdgpu_device_ip_suspend_phase1(adev); 3064 if (r) 3065 return r; 3066 r = amdgpu_device_ip_suspend_phase2(adev); 3067 3068 if (amdgpu_sriov_vf(adev)) 3069 amdgpu_virt_release_full_gpu(adev, false); 3070 3071 return r; 3072 } 3073 3074 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3075 { 3076 int i, r; 3077 3078 static enum amd_ip_block_type ip_order[] = { 3079 AMD_IP_BLOCK_TYPE_GMC, 3080 AMD_IP_BLOCK_TYPE_COMMON, 3081 AMD_IP_BLOCK_TYPE_PSP, 3082 AMD_IP_BLOCK_TYPE_IH, 3083 }; 3084 3085 for (i = 0; i < adev->num_ip_blocks; i++) { 3086 int j; 3087 struct amdgpu_ip_block *block; 3088 3089 block = &adev->ip_blocks[i]; 3090 block->status.hw = false; 3091 3092 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3093 3094 if (block->version->type != ip_order[j] || 3095 !block->status.valid) 3096 continue; 3097 3098 r = block->version->funcs->hw_init(adev); 3099 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3100 if (r) 3101 return r; 3102 block->status.hw = true; 3103 } 3104 } 3105 3106 return 0; 3107 } 3108 3109 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3110 { 3111 int i, r; 3112 3113 static enum amd_ip_block_type ip_order[] = { 3114 AMD_IP_BLOCK_TYPE_SMC, 3115 AMD_IP_BLOCK_TYPE_DCE, 3116 AMD_IP_BLOCK_TYPE_GFX, 3117 AMD_IP_BLOCK_TYPE_SDMA, 3118 AMD_IP_BLOCK_TYPE_UVD, 3119 AMD_IP_BLOCK_TYPE_VCE, 3120 AMD_IP_BLOCK_TYPE_VCN 3121 }; 3122 3123 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3124 int j; 3125 struct amdgpu_ip_block *block; 3126 3127 for (j = 0; j < adev->num_ip_blocks; j++) { 3128 block = &adev->ip_blocks[j]; 3129 3130 if (block->version->type != ip_order[i] || 3131 !block->status.valid || 3132 block->status.hw) 3133 continue; 3134 3135 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3136 r = block->version->funcs->resume(adev); 3137 else 3138 r = block->version->funcs->hw_init(adev); 3139 3140 DRM_INFO("RE-INIT-late: %s %s\n", 
block->version->funcs->name, r?"failed":"succeeded"); 3141 if (r) 3142 return r; 3143 block->status.hw = true; 3144 } 3145 } 3146 3147 return 0; 3148 } 3149 3150 /** 3151 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3152 * 3153 * @adev: amdgpu_device pointer 3154 * 3155 * First resume function for hardware IPs. The list of all the hardware 3156 * IPs that make up the asic is walked and the resume callbacks are run for 3157 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3158 * after a suspend and updates the software state as necessary. This 3159 * function is also used for restoring the GPU after a GPU reset. 3160 * Returns 0 on success, negative error code on failure. 3161 */ 3162 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3163 { 3164 int i, r; 3165 3166 for (i = 0; i < adev->num_ip_blocks; i++) { 3167 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3168 continue; 3169 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3170 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3171 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 3172 3173 r = adev->ip_blocks[i].version->funcs->resume(adev); 3174 if (r) { 3175 DRM_ERROR("resume of IP block <%s> failed %d\n", 3176 adev->ip_blocks[i].version->funcs->name, r); 3177 return r; 3178 } 3179 adev->ip_blocks[i].status.hw = true; 3180 } 3181 } 3182 3183 return 0; 3184 } 3185 3186 /** 3187 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3188 * 3189 * @adev: amdgpu_device pointer 3190 * 3191 * Second resume function for hardware IPs. The list of all the hardware 3192 * IPs that make up the asic is walked and the resume callbacks are run for 3193 * all blocks except COMMON, GMC, IH, and PSP. resume puts the hardware into a 3194 * functional state after a suspend and updates the software state as 3195 * necessary. This function is also used for restoring the GPU after a GPU 3196 * reset. 3197 * Returns 0 on success, negative error code on failure. 3198 */ 3199 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3200 { 3201 int i, r; 3202 3203 for (i = 0; i < adev->num_ip_blocks; i++) { 3204 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3205 continue; 3206 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3207 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3208 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3209 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3210 continue; 3211 r = adev->ip_blocks[i].version->funcs->resume(adev); 3212 if (r) { 3213 DRM_ERROR("resume of IP block <%s> failed %d\n", 3214 adev->ip_blocks[i].version->funcs->name, r); 3215 return r; 3216 } 3217 adev->ip_blocks[i].status.hw = true; 3218 } 3219 3220 return 0; 3221 } 3222 3223 /** 3224 * amdgpu_device_ip_resume - run resume for hardware IPs 3225 * 3226 * @adev: amdgpu_device pointer 3227 * 3228 * Main resume function for hardware IPs. The hardware IPs 3229 * are split into two resume functions because they are 3230 * also used in recovering from a GPU reset and some additional 3231 * steps need to be taken between them. In this case (S3/S4) they are 3232 * run sequentially. 3233 * Returns 0 on success, negative error code on failure.
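 *
 * Phase 1 resumes COMMON, GMC and IH, firmware (PSP/SMU) loading happens
 * between the two phases, and phase 2 resumes the remaining blocks.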
3234 */ 3235 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3236 { 3237 int r; 3238 3239 r = amdgpu_amdkfd_resume_iommu(adev); 3240 if (r) 3241 return r; 3242 3243 r = amdgpu_device_ip_resume_phase1(adev); 3244 if (r) 3245 return r; 3246 3247 r = amdgpu_device_fw_loading(adev); 3248 if (r) 3249 return r; 3250 3251 r = amdgpu_device_ip_resume_phase2(adev); 3252 3253 return r; 3254 } 3255 3256 /** 3257 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3258 * 3259 * @adev: amdgpu_device pointer 3260 * 3261 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3262 */ 3263 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3264 { 3265 if (amdgpu_sriov_vf(adev)) { 3266 if (adev->is_atom_fw) { 3267 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3268 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3269 } else { 3270 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3271 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3272 } 3273 3274 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3275 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3276 } 3277 } 3278 3279 /** 3280 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3281 * 3282 * @asic_type: AMD asic type 3283 * 3284 * Check if there is DC (new modesetting infrastructre) support for an asic. 3285 * returns true if DC has support, false if not. 3286 */ 3287 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3288 { 3289 switch (asic_type) { 3290 #ifdef CONFIG_DRM_AMDGPU_SI 3291 case CHIP_HAINAN: 3292 #endif 3293 case CHIP_TOPAZ: 3294 /* chips with no display hardware */ 3295 return false; 3296 #if defined(CONFIG_DRM_AMD_DC) 3297 case CHIP_TAHITI: 3298 case CHIP_PITCAIRN: 3299 case CHIP_VERDE: 3300 case CHIP_OLAND: 3301 /* 3302 * We have systems in the wild with these ASICs that require 3303 * LVDS and VGA support which is not supported with DC. 3304 * 3305 * Fallback to the non-DC driver here by default so as not to 3306 * cause regressions. 3307 */ 3308 #if defined(CONFIG_DRM_AMD_DC_SI) 3309 return amdgpu_dc > 0; 3310 #else 3311 return false; 3312 #endif 3313 case CHIP_BONAIRE: 3314 case CHIP_KAVERI: 3315 case CHIP_KABINI: 3316 case CHIP_MULLINS: 3317 /* 3318 * We have systems in the wild with these ASICs that require 3319 * LVDS and VGA support which is not supported with DC. 3320 * 3321 * Fallback to the non-DC driver here by default so as not to 3322 * cause regressions. 
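 * DC can still be opted into on these chips by loading the driver with the
 * dc module parameter set to 1 (amdgpu.dc=1), which is what the
 * "amdgpu_dc > 0" check below honours.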
3323 */ 3324 return amdgpu_dc > 0; 3325 default: 3326 return amdgpu_dc != 0; 3327 #else 3328 default: 3329 if (amdgpu_dc > 0) 3330 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3331 "but isn't supported by ASIC, ignoring\n"); 3332 return false; 3333 #endif 3334 } 3335 } 3336 3337 /** 3338 * amdgpu_device_has_dc_support - check if dc is supported 3339 * 3340 * @adev: amdgpu_device pointer 3341 * 3342 * Returns true for supported, false for not supported 3343 */ 3344 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3345 { 3346 if (amdgpu_sriov_vf(adev) || 3347 adev->enable_virtual_display || 3348 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3349 return false; 3350 3351 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3352 } 3353 3354 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3355 { 3356 struct amdgpu_device *adev = 3357 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3358 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3359 3360 /* It's a bug to not have a hive within this function */ 3361 if (WARN_ON(!hive)) 3362 return; 3363 3364 /* 3365 * Use task barrier to synchronize all xgmi reset works across the 3366 * hive. task_barrier_enter and task_barrier_exit will block 3367 * until all the threads running the xgmi reset works reach 3368 * those points. task_barrier_full will do both blocks. 3369 */ 3370 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3371 3372 task_barrier_enter(&hive->tb); 3373 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3374 3375 if (adev->asic_reset_res) 3376 goto fail; 3377 3378 task_barrier_exit(&hive->tb); 3379 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3380 3381 if (adev->asic_reset_res) 3382 goto fail; 3383 3384 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3385 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3386 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3387 } else { 3388 3389 task_barrier_full(&hive->tb); 3390 adev->asic_reset_res = amdgpu_asic_reset(adev); 3391 } 3392 3393 fail: 3394 if (adev->asic_reset_res) 3395 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3396 adev->asic_reset_res, adev_to_drm(adev)->unique); 3397 amdgpu_put_xgmi_hive(hive); 3398 } 3399 3400 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3401 { 3402 char *input = amdgpu_lockup_timeout; 3403 char *timeout_setting = NULL; 3404 int index = 0; 3405 long timeout; 3406 int ret = 0; 3407 3408 /* 3409 * By default timeout for non compute jobs is 10000 3410 * and 60000 for compute jobs. 3411 * In SR-IOV or passthrough mode, timeout for compute 3412 * jobs are 60000 by default. 3413 */ 3414 adev->gfx_timeout = msecs_to_jiffies(10000); 3415 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3416 if (amdgpu_sriov_vf(adev)) 3417 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3418 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3419 else 3420 adev->compute_timeout = msecs_to_jiffies(60000); 3421 3422 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3423 while ((timeout_setting = strsep(&input, ",")) && 3424 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3425 ret = kstrtol(timeout_setting, 0, &timeout); 3426 if (ret) 3427 return ret; 3428 3429 if (timeout == 0) { 3430 index++; 3431 continue; 3432 } else if (timeout < 0) { 3433 timeout = MAX_SCHEDULE_TIMEOUT; 3434 dev_warn(adev->dev, "lockup timeout disabled"); 3435 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3436 } else { 3437 timeout = msecs_to_jiffies(timeout); 3438 } 3439 3440 switch (index++) { 3441 case 0: 3442 adev->gfx_timeout = timeout; 3443 break; 3444 case 1: 3445 adev->compute_timeout = timeout; 3446 break; 3447 case 2: 3448 adev->sdma_timeout = timeout; 3449 break; 3450 case 3: 3451 adev->video_timeout = timeout; 3452 break; 3453 default: 3454 break; 3455 } 3456 } 3457 /* 3458 * There is only one value specified and 3459 * it should apply to all non-compute jobs. 3460 */ 3461 if (index == 1) { 3462 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3463 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3464 adev->compute_timeout = adev->gfx_timeout; 3465 } 3466 } 3467 3468 return ret; 3469 } 3470 3471 /** 3472 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3473 * 3474 * @adev: amdgpu_device pointer 3475 * 3476 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3477 */ 3478 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3479 { 3480 struct iommu_domain *domain; 3481 3482 domain = iommu_get_domain_for_dev(adev->dev); 3483 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3484 adev->ram_is_direct_mapped = true; 3485 } 3486 3487 static const struct attribute *amdgpu_dev_attributes[] = { 3488 &dev_attr_product_name.attr, 3489 &dev_attr_product_number.attr, 3490 &dev_attr_serial_number.attr, 3491 &dev_attr_pcie_replay_count.attr, 3492 NULL 3493 }; 3494 3495 /** 3496 * amdgpu_device_init - initialize the driver 3497 * 3498 * @adev: amdgpu_device pointer 3499 * @flags: driver flags 3500 * 3501 * Initializes the driver info and hw (all asics). 3502 * Returns 0 for success or an error on failure. 3503 * Called at driver startup. 
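 *
 * The bring-up sequence is roughly: lock/work-queue setup and register
 * (MMIO/doorbell) mapping, IP early init, an optional ASIC reset and vBIOS
 * post, clock and atombios setup, fence driver and mode config init, the
 * full IP init pass, and finally sysfs registration plus IP late init.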
3504 */ 3505 int amdgpu_device_init(struct amdgpu_device *adev, 3506 uint32_t flags) 3507 { 3508 struct drm_device *ddev = adev_to_drm(adev); 3509 struct pci_dev *pdev = adev->pdev; 3510 int r, i; 3511 bool px = false; 3512 u32 max_MBps; 3513 3514 adev->shutdown = false; 3515 adev->flags = flags; 3516 3517 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3518 adev->asic_type = amdgpu_force_asic_type; 3519 else 3520 adev->asic_type = flags & AMD_ASIC_MASK; 3521 3522 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3523 if (amdgpu_emu_mode == 1) 3524 adev->usec_timeout *= 10; 3525 adev->gmc.gart_size = 512 * 1024 * 1024; 3526 adev->accel_working = false; 3527 adev->num_rings = 0; 3528 adev->mman.buffer_funcs = NULL; 3529 adev->mman.buffer_funcs_ring = NULL; 3530 adev->vm_manager.vm_pte_funcs = NULL; 3531 adev->vm_manager.vm_pte_num_scheds = 0; 3532 adev->gmc.gmc_funcs = NULL; 3533 adev->harvest_ip_mask = 0x0; 3534 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3535 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3536 3537 adev->smc_rreg = &amdgpu_invalid_rreg; 3538 adev->smc_wreg = &amdgpu_invalid_wreg; 3539 adev->pcie_rreg = &amdgpu_invalid_rreg; 3540 adev->pcie_wreg = &amdgpu_invalid_wreg; 3541 adev->pciep_rreg = &amdgpu_invalid_rreg; 3542 adev->pciep_wreg = &amdgpu_invalid_wreg; 3543 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3544 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3545 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3546 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3547 adev->didt_rreg = &amdgpu_invalid_rreg; 3548 adev->didt_wreg = &amdgpu_invalid_wreg; 3549 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3550 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3551 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3552 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3553 3554 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3555 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3556 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3557 3558 /* mutex initialization are all done here so we 3559 * can recall function without having locking issues */ 3560 mutex_init(&adev->firmware.mutex); 3561 mutex_init(&adev->pm.mutex); 3562 mutex_init(&adev->gfx.gpu_clock_mutex); 3563 mutex_init(&adev->srbm_mutex); 3564 mutex_init(&adev->gfx.pipe_reserve_mutex); 3565 mutex_init(&adev->gfx.gfx_off_mutex); 3566 mutex_init(&adev->grbm_idx_mutex); 3567 mutex_init(&adev->mn_lock); 3568 mutex_init(&adev->virt.vf_errors.lock); 3569 hash_init(adev->mn_hash); 3570 mutex_init(&adev->psp.mutex); 3571 mutex_init(&adev->notifier_lock); 3572 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3573 mutex_init(&adev->benchmark_mutex); 3574 3575 amdgpu_device_init_apu_flags(adev); 3576 3577 r = amdgpu_device_check_arguments(adev); 3578 if (r) 3579 return r; 3580 3581 spin_lock_init(&adev->mmio_idx_lock); 3582 spin_lock_init(&adev->smc_idx_lock); 3583 spin_lock_init(&adev->pcie_idx_lock); 3584 spin_lock_init(&adev->uvd_ctx_idx_lock); 3585 spin_lock_init(&adev->didt_idx_lock); 3586 spin_lock_init(&adev->gc_cac_idx_lock); 3587 spin_lock_init(&adev->se_cac_idx_lock); 3588 spin_lock_init(&adev->audio_endpt_idx_lock); 3589 spin_lock_init(&adev->mm_stats.lock); 3590 3591 INIT_LIST_HEAD(&adev->shadow_list); 3592 mutex_init(&adev->shadow_list_lock); 3593 3594 INIT_LIST_HEAD(&adev->reset_list); 3595 3596 INIT_LIST_HEAD(&adev->ras_list); 3597 3598 INIT_DELAYED_WORK(&adev->delayed_init_work, 3599 
amdgpu_device_delayed_init_work_handler); 3600 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3601 amdgpu_device_delay_enable_gfx_off); 3602 3603 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3604 3605 adev->gfx.gfx_off_req_count = 1; 3606 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3607 3608 atomic_set(&adev->throttling_logging_enabled, 1); 3609 /* 3610 * If throttling continues, logging will be performed every minute 3611 * to avoid log flooding. "-1" is subtracted since the thermal 3612 * throttling interrupt comes every second. Thus, the total logging 3613 * interval is 59 seconds(retelimited printk interval) + 1(waiting 3614 * for throttling interrupt) = 60 seconds. 3615 */ 3616 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3617 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3618 3619 /* Registers mapping */ 3620 /* TODO: block userspace mapping of io register */ 3621 if (adev->asic_type >= CHIP_BONAIRE) { 3622 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3623 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3624 } else { 3625 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3626 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3627 } 3628 3629 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 3630 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 3631 3632 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3633 if (adev->rmmio == NULL) { 3634 return -ENOMEM; 3635 } 3636 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3637 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3638 3639 amdgpu_device_get_pcie_info(adev); 3640 3641 if (amdgpu_mcbp) 3642 DRM_INFO("MCBP is enabled\n"); 3643 3644 if (adev->asic_type >= CHIP_NAVI10) { 3645 if (amdgpu_mes || amdgpu_mes_kiq) 3646 adev->enable_mes = true; 3647 3648 if (amdgpu_mes_kiq) 3649 adev->enable_mes_kiq = true; 3650 } 3651 3652 /* 3653 * Reset domain needs to be present early, before XGMI hive discovered 3654 * (if any) and intitialized to use reset sem and in_gpu reset flag 3655 * early on during init and before calling to RREG32. 
3656 */ 3657 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3658 if (!adev->reset_domain) 3659 return -ENOMEM; 3660 3661 /* detect hw virtualization here */ 3662 amdgpu_detect_virtualization(adev); 3663 3664 r = amdgpu_device_get_job_timeout_settings(adev); 3665 if (r) { 3666 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3667 return r; 3668 } 3669 3670 /* early init functions */ 3671 r = amdgpu_device_ip_early_init(adev); 3672 if (r) 3673 return r; 3674 3675 /* Enable TMZ based on IP_VERSION */ 3676 amdgpu_gmc_tmz_set(adev); 3677 3678 amdgpu_gmc_noretry_set(adev); 3679 /* Need to get xgmi info early to decide the reset behavior*/ 3680 if (adev->gmc.xgmi.supported) { 3681 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3682 if (r) 3683 return r; 3684 } 3685 3686 /* enable PCIE atomic ops */ 3687 if (amdgpu_sriov_vf(adev)) 3688 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3689 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3690 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3691 else 3692 adev->have_atomics_support = 3693 !pci_enable_atomic_ops_to_root(adev->pdev, 3694 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3695 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3696 if (!adev->have_atomics_support) 3697 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3698 3699 /* doorbell bar mapping and doorbell index init*/ 3700 amdgpu_device_doorbell_init(adev); 3701 3702 if (amdgpu_emu_mode == 1) { 3703 /* post the asic on emulation mode */ 3704 emu_soc_asic_init(adev); 3705 goto fence_driver_init; 3706 } 3707 3708 amdgpu_reset_init(adev); 3709 3710 /* detect if we are with an SRIOV vbios */ 3711 amdgpu_device_detect_sriov_bios(adev); 3712 3713 /* check if we need to reset the asic 3714 * E.g., driver was not cleanly unloaded previously, etc. 
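 * For devices in an XGMI hive the reset is not performed immediately: the
 * device is only marked with xgmi.pending_reset, just the blocks the SMU
 * needs are hw-initialized, and the actual reset is deferred to the delayed
 * mgpu reset work queued later in amdgpu_device_init.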
3715 */ 3716 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3717 if (adev->gmc.xgmi.num_physical_nodes) { 3718 dev_info(adev->dev, "Pending hive reset.\n"); 3719 adev->gmc.xgmi.pending_reset = true; 3720 /* Only need to init necessary block for SMU to handle the reset */ 3721 for (i = 0; i < adev->num_ip_blocks; i++) { 3722 if (!adev->ip_blocks[i].status.valid) 3723 continue; 3724 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3725 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3726 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3727 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3728 DRM_DEBUG("IP %s disabled for hw_init.\n", 3729 adev->ip_blocks[i].version->funcs->name); 3730 adev->ip_blocks[i].status.hw = true; 3731 } 3732 } 3733 } else { 3734 r = amdgpu_asic_reset(adev); 3735 if (r) { 3736 dev_err(adev->dev, "asic reset on init failed\n"); 3737 goto failed; 3738 } 3739 } 3740 } 3741 3742 pci_enable_pcie_error_reporting(adev->pdev); 3743 3744 /* Post card if necessary */ 3745 if (amdgpu_device_need_post(adev)) { 3746 if (!adev->bios) { 3747 dev_err(adev->dev, "no vBIOS found\n"); 3748 r = -EINVAL; 3749 goto failed; 3750 } 3751 DRM_INFO("GPU posting now...\n"); 3752 r = amdgpu_device_asic_init(adev); 3753 if (r) { 3754 dev_err(adev->dev, "gpu post error!\n"); 3755 goto failed; 3756 } 3757 } 3758 3759 if (adev->is_atom_fw) { 3760 /* Initialize clocks */ 3761 r = amdgpu_atomfirmware_get_clock_info(adev); 3762 if (r) { 3763 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3764 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3765 goto failed; 3766 } 3767 } else { 3768 /* Initialize clocks */ 3769 r = amdgpu_atombios_get_clock_info(adev); 3770 if (r) { 3771 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3772 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3773 goto failed; 3774 } 3775 /* init i2c buses */ 3776 if (!amdgpu_device_has_dc_support(adev)) 3777 amdgpu_atombios_i2c_init(adev); 3778 } 3779 3780 fence_driver_init: 3781 /* Fence driver */ 3782 r = amdgpu_fence_driver_sw_init(adev); 3783 if (r) { 3784 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3785 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3786 goto failed; 3787 } 3788 3789 /* init the mode config */ 3790 drm_mode_config_init(adev_to_drm(adev)); 3791 3792 r = amdgpu_device_ip_init(adev); 3793 if (r) { 3794 /* failed in exclusive mode due to timeout */ 3795 if (amdgpu_sriov_vf(adev) && 3796 !amdgpu_sriov_runtime(adev) && 3797 amdgpu_virt_mmio_blocked(adev) && 3798 !amdgpu_virt_wait_reset(adev)) { 3799 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3800 /* Don't send request since VF is inactive. */ 3801 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3802 adev->virt.ops = NULL; 3803 r = -EAGAIN; 3804 goto release_ras_con; 3805 } 3806 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3807 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3808 goto release_ras_con; 3809 } 3810 3811 amdgpu_fence_driver_hw_init(adev); 3812 3813 dev_info(adev->dev, 3814 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3815 adev->gfx.config.max_shader_engines, 3816 adev->gfx.config.max_sh_per_se, 3817 adev->gfx.config.max_cu_per_sh, 3818 adev->gfx.cu_info.number); 3819 3820 adev->accel_working = true; 3821 3822 amdgpu_vm_check_compute_bug(adev); 3823 3824 /* Initialize the buffer migration limit. 
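 * The limit is taken from amdgpu_moverate (in MB/s) when it is set to a
 * non-negative value and falls back to 8 MB/s otherwise; it is stored as a
 * log2 so the throttling math can use cheap shifts instead of divisions.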
*/ 3825 if (amdgpu_moverate >= 0) 3826 max_MBps = amdgpu_moverate; 3827 else 3828 max_MBps = 8; /* Allow 8 MB/s. */ 3829 /* Get a log2 for easy divisions. */ 3830 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3831 3832 r = amdgpu_pm_sysfs_init(adev); 3833 if (r) { 3834 adev->pm_sysfs_en = false; 3835 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3836 } else 3837 adev->pm_sysfs_en = true; 3838 3839 r = amdgpu_ucode_sysfs_init(adev); 3840 if (r) { 3841 adev->ucode_sysfs_en = false; 3842 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3843 } else 3844 adev->ucode_sysfs_en = true; 3845 3846 r = amdgpu_psp_sysfs_init(adev); 3847 if (r) { 3848 adev->psp_sysfs_en = false; 3849 if (!amdgpu_sriov_vf(adev)) 3850 DRM_ERROR("Creating psp sysfs failed\n"); 3851 } else 3852 adev->psp_sysfs_en = true; 3853 3854 /* 3855 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3856 * Otherwise the mgpu fan boost feature will be skipped due to the 3857 * gpu instance is counted less. 3858 */ 3859 amdgpu_register_gpu_instance(adev); 3860 3861 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3862 * explicit gating rather than handling it automatically. 3863 */ 3864 if (!adev->gmc.xgmi.pending_reset) { 3865 r = amdgpu_device_ip_late_init(adev); 3866 if (r) { 3867 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3868 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3869 goto release_ras_con; 3870 } 3871 /* must succeed. */ 3872 amdgpu_ras_resume(adev); 3873 queue_delayed_work(system_wq, &adev->delayed_init_work, 3874 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3875 } 3876 3877 if (amdgpu_sriov_vf(adev)) 3878 flush_delayed_work(&adev->delayed_init_work); 3879 3880 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3881 if (r) 3882 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3883 3884 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3885 r = amdgpu_pmu_init(adev); 3886 if (r) 3887 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3888 3889 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3890 if (amdgpu_device_cache_pci_state(adev->pdev)) 3891 pci_restore_state(pdev); 3892 3893 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3894 /* this will fail for cards that aren't VGA class devices, just 3895 * ignore it */ 3896 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3897 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 3898 3899 if (amdgpu_device_supports_px(ddev)) { 3900 px = true; 3901 vga_switcheroo_register_client(adev->pdev, 3902 &amdgpu_switcheroo_ops, px); 3903 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3904 } 3905 3906 if (adev->gmc.xgmi.pending_reset) 3907 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3908 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3909 3910 amdgpu_device_check_iommu_direct_map(adev); 3911 3912 return 0; 3913 3914 release_ras_con: 3915 amdgpu_release_ras_context(adev); 3916 3917 failed: 3918 amdgpu_vf_error_trans_all(adev); 3919 3920 return r; 3921 } 3922 3923 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 3924 { 3925 3926 /* Clear all CPU mappings pointing to this device */ 3927 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 3928 3929 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 3930 amdgpu_device_doorbell_fini(adev); 3931 3932 iounmap(adev->rmmio); 3933 adev->rmmio = NULL; 3934 if (adev->mman.aper_base_kaddr) 3935 
iounmap(adev->mman.aper_base_kaddr); 3936 adev->mman.aper_base_kaddr = NULL; 3937 3938 /* Memory manager related */ 3939 if (!adev->gmc.xgmi.connected_to_cpu) { 3940 arch_phys_wc_del(adev->gmc.vram_mtrr); 3941 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 3942 } 3943 } 3944 3945 /** 3946 * amdgpu_device_fini_hw - tear down the driver 3947 * 3948 * @adev: amdgpu_device pointer 3949 * 3950 * Tear down the driver info (all asics). 3951 * Called at driver shutdown. 3952 */ 3953 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 3954 { 3955 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3956 flush_delayed_work(&adev->delayed_init_work); 3957 adev->shutdown = true; 3958 3959 /* make sure IB test finished before entering exclusive mode 3960 * to avoid preemption on IB test 3961 * */ 3962 if (amdgpu_sriov_vf(adev)) { 3963 amdgpu_virt_request_full_gpu(adev, false); 3964 amdgpu_virt_fini_data_exchange(adev); 3965 } 3966 3967 /* disable all interrupts */ 3968 amdgpu_irq_disable_all(adev); 3969 if (adev->mode_info.mode_config_initialized){ 3970 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 3971 drm_helper_force_disable_all(adev_to_drm(adev)); 3972 else 3973 drm_atomic_helper_shutdown(adev_to_drm(adev)); 3974 } 3975 amdgpu_fence_driver_hw_fini(adev); 3976 3977 if (adev->mman.initialized) { 3978 flush_delayed_work(&adev->mman.bdev.wq); 3979 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); 3980 } 3981 3982 if (adev->pm_sysfs_en) 3983 amdgpu_pm_sysfs_fini(adev); 3984 if (adev->ucode_sysfs_en) 3985 amdgpu_ucode_sysfs_fini(adev); 3986 if (adev->psp_sysfs_en) 3987 amdgpu_psp_sysfs_fini(adev); 3988 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 3989 3990 /* disable ras feature must before hw fini */ 3991 amdgpu_ras_pre_fini(adev); 3992 3993 amdgpu_device_ip_fini_early(adev); 3994 3995 amdgpu_irq_fini_hw(adev); 3996 3997 if (adev->mman.initialized) 3998 ttm_device_clear_dma_mappings(&adev->mman.bdev); 3999 4000 amdgpu_gart_dummy_page_fini(adev); 4001 4002 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4003 amdgpu_device_unmap_mmio(adev); 4004 4005 } 4006 4007 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4008 { 4009 int idx; 4010 4011 amdgpu_fence_driver_sw_fini(adev); 4012 amdgpu_device_ip_fini(adev); 4013 release_firmware(adev->firmware.gpu_info_fw); 4014 adev->firmware.gpu_info_fw = NULL; 4015 adev->accel_working = false; 4016 4017 amdgpu_reset_fini(adev); 4018 4019 /* free i2c buses */ 4020 if (!amdgpu_device_has_dc_support(adev)) 4021 amdgpu_i2c_fini(adev); 4022 4023 if (amdgpu_emu_mode != 1) 4024 amdgpu_atombios_fini(adev); 4025 4026 kfree(adev->bios); 4027 adev->bios = NULL; 4028 if (amdgpu_device_supports_px(adev_to_drm(adev))) { 4029 vga_switcheroo_unregister_client(adev->pdev); 4030 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4031 } 4032 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4033 vga_client_unregister(adev->pdev); 4034 4035 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4036 4037 iounmap(adev->rmmio); 4038 adev->rmmio = NULL; 4039 amdgpu_device_doorbell_fini(adev); 4040 drm_dev_exit(idx); 4041 } 4042 4043 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4044 amdgpu_pmu_fini(adev); 4045 if (adev->mman.discovery_bin) 4046 amdgpu_discovery_fini(adev); 4047 4048 amdgpu_reset_put_reset_domain(adev->reset_domain); 4049 adev->reset_domain = NULL; 4050 4051 kfree(adev->pci_state); 4052 4053 } 4054 4055 /** 4056 * amdgpu_device_evict_resources - evict device resources 4057 * @adev: amdgpu device object 4058 * 4059 * Evicts all ttm device 
resources(vram BOs, gart table) from the lru list 4060 * of the vram memory type. Mainly used for evicting device resources 4061 * at suspend time. 4062 * 4063 */ 4064 static void amdgpu_device_evict_resources(struct amdgpu_device *adev) 4065 { 4066 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4067 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4068 return; 4069 4070 if (amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM)) 4071 DRM_WARN("evicting device resources failed\n"); 4072 4073 } 4074 4075 /* 4076 * Suspend & resume. 4077 */ 4078 /** 4079 * amdgpu_device_suspend - initiate device suspend 4080 * 4081 * @dev: drm dev pointer 4082 * @fbcon : notify the fbdev of suspend 4083 * 4084 * Puts the hw in the suspend state (all asics). 4085 * Returns 0 for success or an error on failure. 4086 * Called at driver suspend. 4087 */ 4088 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4089 { 4090 struct amdgpu_device *adev = drm_to_adev(dev); 4091 4092 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4093 return 0; 4094 4095 adev->in_suspend = true; 4096 4097 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4098 DRM_WARN("smart shift update failed\n"); 4099 4100 drm_kms_helper_poll_disable(dev); 4101 4102 if (fbcon) 4103 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4104 4105 cancel_delayed_work_sync(&adev->delayed_init_work); 4106 4107 amdgpu_ras_suspend(adev); 4108 4109 amdgpu_device_ip_suspend_phase1(adev); 4110 4111 if (!adev->in_s0ix) 4112 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4113 4114 amdgpu_device_evict_resources(adev); 4115 4116 amdgpu_fence_driver_hw_fini(adev); 4117 4118 amdgpu_device_ip_suspend_phase2(adev); 4119 4120 return 0; 4121 } 4122 4123 /** 4124 * amdgpu_device_resume - initiate device resume 4125 * 4126 * @dev: drm dev pointer 4127 * @fbcon : notify the fbdev of resume 4128 * 4129 * Bring the hw back to operating state (all asics). 4130 * Returns 0 for success or an error on failure. 4131 * Called at driver resume. 4132 */ 4133 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4134 { 4135 struct amdgpu_device *adev = drm_to_adev(dev); 4136 int r = 0; 4137 4138 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4139 return 0; 4140 4141 if (adev->in_s0ix) 4142 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4143 4144 /* post card */ 4145 if (amdgpu_device_need_post(adev)) { 4146 r = amdgpu_device_asic_init(adev); 4147 if (r) 4148 dev_err(adev->dev, "amdgpu asic init failed\n"); 4149 } 4150 4151 r = amdgpu_device_ip_resume(adev); 4152 if (r) { 4153 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4154 return r; 4155 } 4156 amdgpu_fence_driver_hw_init(adev); 4157 4158 r = amdgpu_device_ip_late_init(adev); 4159 if (r) 4160 return r; 4161 4162 queue_delayed_work(system_wq, &adev->delayed_init_work, 4163 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4164 4165 if (!adev->in_s0ix) { 4166 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4167 if (r) 4168 return r; 4169 } 4170 4171 /* Make sure IB tests flushed */ 4172 flush_delayed_work(&adev->delayed_init_work); 4173 4174 if (fbcon) 4175 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4176 4177 drm_kms_helper_poll_enable(dev); 4178 4179 amdgpu_ras_resume(adev); 4180 4181 /* 4182 * Most of the connector probing functions try to acquire runtime pm 4183 * refs to ensure that the GPU is powered on when connector polling is 4184 * performed. 
Since we're calling this from a runtime PM callback, 4185 * trying to acquire rpm refs will cause us to deadlock. 4186 * 4187 * Since we're guaranteed to be holding the rpm lock, it's safe to 4188 * temporarily disable the rpm helpers so this doesn't deadlock us. 4189 */ 4190 #ifdef CONFIG_PM 4191 dev->dev->power.disable_depth++; 4192 #endif 4193 if (!amdgpu_device_has_dc_support(adev)) 4194 drm_helper_hpd_irq_event(dev); 4195 else 4196 drm_kms_helper_hotplug_event(dev); 4197 #ifdef CONFIG_PM 4198 dev->dev->power.disable_depth--; 4199 #endif 4200 adev->in_suspend = false; 4201 4202 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4203 DRM_WARN("smart shift update failed\n"); 4204 4205 return 0; 4206 } 4207 4208 /** 4209 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4210 * 4211 * @adev: amdgpu_device pointer 4212 * 4213 * The list of all the hardware IPs that make up the asic is walked and 4214 * the check_soft_reset callbacks are run. check_soft_reset determines 4215 * if the asic is still hung or not. 4216 * Returns true if any of the IPs are still in a hung state, false if not. 4217 */ 4218 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4219 { 4220 int i; 4221 bool asic_hang = false; 4222 4223 if (amdgpu_sriov_vf(adev)) 4224 return true; 4225 4226 if (amdgpu_asic_need_full_reset(adev)) 4227 return true; 4228 4229 for (i = 0; i < adev->num_ip_blocks; i++) { 4230 if (!adev->ip_blocks[i].status.valid) 4231 continue; 4232 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4233 adev->ip_blocks[i].status.hang = 4234 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4235 if (adev->ip_blocks[i].status.hang) { 4236 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4237 asic_hang = true; 4238 } 4239 } 4240 return asic_hang; 4241 } 4242 4243 /** 4244 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4245 * 4246 * @adev: amdgpu_device pointer 4247 * 4248 * The list of all the hardware IPs that make up the asic is walked and the 4249 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4250 * handles any IP specific hardware or software state changes that are 4251 * necessary for a soft reset to succeed. 4252 * Returns 0 on success, negative error code on failure. 4253 */ 4254 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4255 { 4256 int i, r = 0; 4257 4258 for (i = 0; i < adev->num_ip_blocks; i++) { 4259 if (!adev->ip_blocks[i].status.valid) 4260 continue; 4261 if (adev->ip_blocks[i].status.hang && 4262 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4263 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4264 if (r) 4265 return r; 4266 } 4267 } 4268 4269 return 0; 4270 } 4271 4272 /** 4273 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4274 * 4275 * @adev: amdgpu_device pointer 4276 * 4277 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4278 * reset is necessary to recover. 4279 * Returns true if a full asic reset is required, false if not. 
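* In practice a hang in the GMC, SMC, ACP, DCE or PSP block always forces a full ASIC reset, since those blocks cannot be brought back with a soft reset.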
4280 */ 4281 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4282 { 4283 int i; 4284 4285 if (amdgpu_asic_need_full_reset(adev)) 4286 return true; 4287 4288 for (i = 0; i < adev->num_ip_blocks; i++) { 4289 if (!adev->ip_blocks[i].status.valid) 4290 continue; 4291 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4292 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4293 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4294 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4295 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4296 if (adev->ip_blocks[i].status.hang) { 4297 dev_info(adev->dev, "Some block need full reset!\n"); 4298 return true; 4299 } 4300 } 4301 } 4302 return false; 4303 } 4304 4305 /** 4306 * amdgpu_device_ip_soft_reset - do a soft reset 4307 * 4308 * @adev: amdgpu_device pointer 4309 * 4310 * The list of all the hardware IPs that make up the asic is walked and the 4311 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4312 * IP specific hardware or software state changes that are necessary to soft 4313 * reset the IP. 4314 * Returns 0 on success, negative error code on failure. 4315 */ 4316 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4317 { 4318 int i, r = 0; 4319 4320 for (i = 0; i < adev->num_ip_blocks; i++) { 4321 if (!adev->ip_blocks[i].status.valid) 4322 continue; 4323 if (adev->ip_blocks[i].status.hang && 4324 adev->ip_blocks[i].version->funcs->soft_reset) { 4325 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4326 if (r) 4327 return r; 4328 } 4329 } 4330 4331 return 0; 4332 } 4333 4334 /** 4335 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4336 * 4337 * @adev: amdgpu_device pointer 4338 * 4339 * The list of all the hardware IPs that make up the asic is walked and the 4340 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4341 * handles any IP specific hardware or software state changes that are 4342 * necessary after the IP has been soft reset. 4343 * Returns 0 on success, negative error code on failure. 4344 */ 4345 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4346 { 4347 int i, r = 0; 4348 4349 for (i = 0; i < adev->num_ip_blocks; i++) { 4350 if (!adev->ip_blocks[i].status.valid) 4351 continue; 4352 if (adev->ip_blocks[i].status.hang && 4353 adev->ip_blocks[i].version->funcs->post_soft_reset) 4354 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4355 if (r) 4356 return r; 4357 } 4358 4359 return 0; 4360 } 4361 4362 /** 4363 * amdgpu_device_recover_vram - Recover some VRAM contents 4364 * 4365 * @adev: amdgpu_device pointer 4366 * 4367 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4368 * restore things like GPUVM page tables after a GPU reset where 4369 * the contents of VRAM might be lost. 4370 * 4371 * Returns: 4372 * 0 on success, negative error code on failure. 
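* Only shadow BOs that currently sit in GTT and whose parent BO lives in VRAM are restored; the restore fences share a single timeout budget of roughly 8 seconds under SR-IOV runtime and 100 ms otherwise.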
4373 */ 4374 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4375 { 4376 struct dma_fence *fence = NULL, *next = NULL; 4377 struct amdgpu_bo *shadow; 4378 struct amdgpu_bo_vm *vmbo; 4379 long r = 1, tmo; 4380 4381 if (amdgpu_sriov_runtime(adev)) 4382 tmo = msecs_to_jiffies(8000); 4383 else 4384 tmo = msecs_to_jiffies(100); 4385 4386 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4387 mutex_lock(&adev->shadow_list_lock); 4388 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4389 shadow = &vmbo->bo; 4390 /* No need to recover an evicted BO */ 4391 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4392 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4393 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4394 continue; 4395 4396 r = amdgpu_bo_restore_shadow(shadow, &next); 4397 if (r) 4398 break; 4399 4400 if (fence) { 4401 tmo = dma_fence_wait_timeout(fence, false, tmo); 4402 dma_fence_put(fence); 4403 fence = next; 4404 if (tmo == 0) { 4405 r = -ETIMEDOUT; 4406 break; 4407 } else if (tmo < 0) { 4408 r = tmo; 4409 break; 4410 } 4411 } else { 4412 fence = next; 4413 } 4414 } 4415 mutex_unlock(&adev->shadow_list_lock); 4416 4417 if (fence) 4418 tmo = dma_fence_wait_timeout(fence, false, tmo); 4419 dma_fence_put(fence); 4420 4421 if (r < 0 || tmo <= 0) { 4422 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4423 return -EIO; 4424 } 4425 4426 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4427 return 0; 4428 } 4429 4430 4431 /** 4432 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4433 * 4434 * @adev: amdgpu_device pointer 4435 * @from_hypervisor: request from hypervisor 4436 * 4437 * Do a VF FLR and reinitialize the ASIC. 4438 * Returns 0 on success, negative error code on failure. 4439 */ 4440 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4441 bool from_hypervisor) 4442 { 4443 int r; 4444 struct amdgpu_hive_info *hive = NULL; 4445 int retry_limit = 0; 4446 4447 retry: 4448 amdgpu_amdkfd_pre_reset(adev); 4449 4450 4451 4452 if (from_hypervisor) 4453 r = amdgpu_virt_request_full_gpu(adev, true); 4454 else 4455 r = amdgpu_virt_reset_gpu(adev); 4456 if (r) 4457 return r; 4458 4459 /* Resume IP prior to SMC */ 4460 r = amdgpu_device_ip_reinit_early_sriov(adev); 4461 if (r) 4462 goto error; 4463 4464 amdgpu_virt_init_data_exchange(adev); 4465 4466 r = amdgpu_device_fw_loading(adev); 4467 if (r) 4468 return r; 4469 4470 /* now we are okay to resume SMC/CP/SDMA */ 4471 r = amdgpu_device_ip_reinit_late_sriov(adev); 4472 if (r) 4473 goto error; 4474 4475 hive = amdgpu_get_xgmi_hive(adev); 4476 /* Update PSP FW topology after reset */ 4477 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4478 r = amdgpu_xgmi_update_topology(hive, adev); 4479 4480 if (hive) 4481 amdgpu_put_xgmi_hive(hive); 4482 4483 if (!r) { 4484 amdgpu_irq_gpu_reset_resume_helper(adev); 4485 r = amdgpu_ib_ring_tests(adev); 4486 4487 amdgpu_amdkfd_post_reset(adev); 4488 } 4489 4490 error: 4491 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4492 amdgpu_inc_vram_lost(adev); 4493 r = amdgpu_device_recover_vram(adev); 4494 } 4495 amdgpu_virt_release_full_gpu(adev, true); 4496 4497 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4498 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4499 retry_limit++; 4500 goto retry; 4501 } else 4502 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4503 } 4504 4505 return r; 4506 } 4507 4508 /** 4509 * amdgpu_device_has_job_running - check if
there is any job in mirror list 4510 * 4511 * @adev: amdgpu_device pointer 4512 * 4513 * check if there is any job in mirror list 4514 */ 4515 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4516 { 4517 int i; 4518 struct drm_sched_job *job; 4519 4520 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4521 struct amdgpu_ring *ring = adev->rings[i]; 4522 4523 if (!ring || !ring->sched.thread) 4524 continue; 4525 4526 spin_lock(&ring->sched.job_list_lock); 4527 job = list_first_entry_or_null(&ring->sched.pending_list, 4528 struct drm_sched_job, list); 4529 spin_unlock(&ring->sched.job_list_lock); 4530 if (job) 4531 return true; 4532 } 4533 return false; 4534 } 4535 4536 /** 4537 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4538 * 4539 * @adev: amdgpu_device pointer 4540 * 4541 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4542 * a hung GPU. 4543 */ 4544 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4545 { 4546 if (!amdgpu_device_ip_check_soft_reset(adev)) { 4547 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n"); 4548 return false; 4549 } 4550 4551 if (amdgpu_gpu_recovery == 0) 4552 goto disabled; 4553 4554 if (amdgpu_sriov_vf(adev)) 4555 return true; 4556 4557 if (amdgpu_gpu_recovery == -1) { 4558 switch (adev->asic_type) { 4559 #ifdef CONFIG_DRM_AMDGPU_SI 4560 case CHIP_VERDE: 4561 case CHIP_TAHITI: 4562 case CHIP_PITCAIRN: 4563 case CHIP_OLAND: 4564 case CHIP_HAINAN: 4565 #endif 4566 #ifdef CONFIG_DRM_AMDGPU_CIK 4567 case CHIP_KAVERI: 4568 case CHIP_KABINI: 4569 case CHIP_MULLINS: 4570 #endif 4571 case CHIP_CARRIZO: 4572 case CHIP_STONEY: 4573 case CHIP_CYAN_SKILLFISH: 4574 goto disabled; 4575 default: 4576 break; 4577 } 4578 } 4579 4580 return true; 4581 4582 disabled: 4583 dev_info(adev->dev, "GPU recovery disabled.\n"); 4584 return false; 4585 } 4586 4587 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4588 { 4589 u32 i; 4590 int ret = 0; 4591 4592 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4593 4594 dev_info(adev->dev, "GPU mode1 reset\n"); 4595 4596 /* disable BM */ 4597 pci_clear_master(adev->pdev); 4598 4599 amdgpu_device_cache_pci_state(adev->pdev); 4600 4601 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4602 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4603 ret = amdgpu_dpm_mode1_reset(adev); 4604 } else { 4605 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4606 ret = psp_gpu_reset(adev); 4607 } 4608 4609 if (ret) 4610 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4611 4612 amdgpu_device_load_pci_state(adev->pdev); 4613 4614 /* wait for asic to come out of reset */ 4615 for (i = 0; i < adev->usec_timeout; i++) { 4616 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4617 4618 if (memsize != 0xffffffff) 4619 break; 4620 udelay(1); 4621 } 4622 4623 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4624 return ret; 4625 } 4626 4627 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4628 struct amdgpu_reset_context *reset_context) 4629 { 4630 int i, r = 0; 4631 struct amdgpu_job *job = NULL; 4632 bool need_full_reset = 4633 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4634 4635 if (reset_context->reset_req_dev == adev) 4636 job = reset_context->job; 4637 4638 if (amdgpu_sriov_vf(adev)) { 4639 /* stop the data exchange thread */ 4640 amdgpu_virt_fini_data_exchange(adev); 4641 } 4642 4643 /* block all schedulers and reset given job's ring */ 4644 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4645 struct amdgpu_ring *ring = adev->rings[i]; 
4646 4647 if (!ring || !ring->sched.thread) 4648 continue; 4649 4650 /*clear job fence from fence drv to avoid force_completion 4651 *leave NULL and vm flush fence in fence drv */ 4652 amdgpu_fence_driver_clear_job_fences(ring); 4653 4654 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4655 amdgpu_fence_driver_force_completion(ring); 4656 } 4657 4658 if (job && job->vm) 4659 drm_sched_increase_karma(&job->base); 4660 4661 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4662 /* If reset handler not implemented, continue; otherwise return */ 4663 if (r == -ENOSYS) 4664 r = 0; 4665 else 4666 return r; 4667 4668 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4669 if (!amdgpu_sriov_vf(adev)) { 4670 4671 if (!need_full_reset) 4672 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4673 4674 if (!need_full_reset) { 4675 amdgpu_device_ip_pre_soft_reset(adev); 4676 r = amdgpu_device_ip_soft_reset(adev); 4677 amdgpu_device_ip_post_soft_reset(adev); 4678 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4679 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4680 need_full_reset = true; 4681 } 4682 } 4683 4684 if (need_full_reset) 4685 r = amdgpu_device_ip_suspend(adev); 4686 if (need_full_reset) 4687 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4688 else 4689 clear_bit(AMDGPU_NEED_FULL_RESET, 4690 &reset_context->flags); 4691 } 4692 4693 return r; 4694 } 4695 4696 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 4697 { 4698 uint32_t reg_value; 4699 int i; 4700 4701 lockdep_assert_held(&adev->reset_domain->sem); 4702 dump_stack(); 4703 4704 for (i = 0; i < adev->num_regs; i++) { 4705 reg_value = RREG32(adev->reset_dump_reg_list[i]); 4706 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], reg_value); 4707 } 4708 4709 return 0; 4710 } 4711 4712 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4713 struct amdgpu_reset_context *reset_context) 4714 { 4715 struct amdgpu_device *tmp_adev = NULL; 4716 bool need_full_reset, skip_hw_reset, vram_lost = false; 4717 int r = 0; 4718 4719 /* Try reset handler method first */ 4720 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4721 reset_list); 4722 amdgpu_reset_reg_dumps(tmp_adev); 4723 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4724 /* If reset handler not implemented, continue; otherwise return */ 4725 if (r == -ENOSYS) 4726 r = 0; 4727 else 4728 return r; 4729 4730 /* Reset handler not implemented, use the default method */ 4731 need_full_reset = 4732 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4733 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4734 4735 /* 4736 * ASIC reset has to be done on all XGMI hive nodes ASAP 4737 * to allow proper links negotiation in FW (within 1 sec) 4738 */ 4739 if (!skip_hw_reset && need_full_reset) { 4740 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4741 /* For XGMI run all resets in parallel to speed up the process */ 4742 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4743 tmp_adev->gmc.xgmi.pending_reset = false; 4744 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4745 r = -EALREADY; 4746 } else 4747 r = amdgpu_asic_reset(tmp_adev); 4748 4749 if (r) { 4750 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4751 r, adev_to_drm(tmp_adev)->unique); 4752 break; 4753 } 4754 } 4755 4756 /* For XGMI wait for all resets to complete before proceed */ 4757 if 
(!r) { 4758 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4759 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4760 flush_work(&tmp_adev->xgmi_reset_work); 4761 r = tmp_adev->asic_reset_res; 4762 if (r) 4763 break; 4764 } 4765 } 4766 } 4767 } 4768 4769 if (!r && amdgpu_ras_intr_triggered()) { 4770 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4771 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && 4772 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 4773 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 4774 } 4775 4776 amdgpu_ras_intr_cleared(); 4777 } 4778 4779 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4780 if (need_full_reset) { 4781 /* post card */ 4782 r = amdgpu_device_asic_init(tmp_adev); 4783 if (r) { 4784 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4785 } else { 4786 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4787 r = amdgpu_amdkfd_resume_iommu(tmp_adev); 4788 if (r) 4789 goto out; 4790 4791 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4792 if (r) 4793 goto out; 4794 4795 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4796 if (vram_lost) { 4797 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4798 amdgpu_inc_vram_lost(tmp_adev); 4799 } 4800 4801 r = amdgpu_device_fw_loading(tmp_adev); 4802 if (r) 4803 return r; 4804 4805 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4806 if (r) 4807 goto out; 4808 4809 if (vram_lost) 4810 amdgpu_device_fill_reset_magic(tmp_adev); 4811 4812 /* 4813 * Add this ASIC as tracked as reset was already 4814 * complete successfully. 4815 */ 4816 amdgpu_register_gpu_instance(tmp_adev); 4817 4818 if (!reset_context->hive && 4819 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4820 amdgpu_xgmi_add_device(tmp_adev); 4821 4822 r = amdgpu_device_ip_late_init(tmp_adev); 4823 if (r) 4824 goto out; 4825 4826 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 4827 4828 /* 4829 * The GPU enters bad state once faulty pages 4830 * by ECC has reached the threshold, and ras 4831 * recovery is scheduled next. So add one check 4832 * here to break recovery if it indeed exceeds 4833 * bad page threshold, and remind user to 4834 * retire this GPU or setting one bigger 4835 * bad_page_threshold value to fix this once 4836 * probing driver again. 4837 */ 4838 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 4839 /* must succeed. 
*/ 4840 amdgpu_ras_resume(tmp_adev); 4841 } else { 4842 r = -EINVAL; 4843 goto out; 4844 } 4845 4846 /* Update PSP FW topology after reset */ 4847 if (reset_context->hive && 4848 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4849 r = amdgpu_xgmi_update_topology( 4850 reset_context->hive, tmp_adev); 4851 } 4852 } 4853 4854 out: 4855 if (!r) { 4856 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 4857 r = amdgpu_ib_ring_tests(tmp_adev); 4858 if (r) { 4859 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 4860 need_full_reset = true; 4861 r = -EAGAIN; 4862 goto end; 4863 } 4864 } 4865 4866 if (!r) 4867 r = amdgpu_device_recover_vram(tmp_adev); 4868 else 4869 tmp_adev->asic_reset_res = r; 4870 } 4871 4872 end: 4873 if (need_full_reset) 4874 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4875 else 4876 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4877 return r; 4878 } 4879 4880 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 4881 { 4882 4883 switch (amdgpu_asic_reset_method(adev)) { 4884 case AMD_RESET_METHOD_MODE1: 4885 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 4886 break; 4887 case AMD_RESET_METHOD_MODE2: 4888 adev->mp1_state = PP_MP1_STATE_RESET; 4889 break; 4890 default: 4891 adev->mp1_state = PP_MP1_STATE_NONE; 4892 break; 4893 } 4894 } 4895 4896 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 4897 { 4898 amdgpu_vf_error_trans_all(adev); 4899 adev->mp1_state = PP_MP1_STATE_NONE; 4900 } 4901 4902 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 4903 { 4904 struct pci_dev *p = NULL; 4905 4906 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4907 adev->pdev->bus->number, 1); 4908 if (p) { 4909 pm_runtime_enable(&(p->dev)); 4910 pm_runtime_resume(&(p->dev)); 4911 } 4912 } 4913 4914 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 4915 { 4916 enum amd_reset_method reset_method; 4917 struct pci_dev *p = NULL; 4918 u64 expires; 4919 4920 /* 4921 * For now, only BACO and mode1 reset are confirmed 4922 * to suffer the audio issue without proper suspended. 4923 */ 4924 reset_method = amdgpu_asic_reset_method(adev); 4925 if ((reset_method != AMD_RESET_METHOD_BACO) && 4926 (reset_method != AMD_RESET_METHOD_MODE1)) 4927 return -EINVAL; 4928 4929 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4930 adev->pdev->bus->number, 1); 4931 if (!p) 4932 return -ENODEV; 4933 4934 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 4935 if (!expires) 4936 /* 4937 * If we cannot get the audio device autosuspend delay, 4938 * a fixed 4S interval will be used. Considering 3S is 4939 * the audio controller default autosuspend delay setting. 4940 * 4S used here is guaranteed to cover that. 4941 */ 4942 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 4943 4944 while (!pm_runtime_status_suspended(&(p->dev))) { 4945 if (!pm_runtime_suspend(&(p->dev))) 4946 break; 4947 4948 if (expires < ktime_get_mono_fast_ns()) { 4949 dev_warn(adev->dev, "failed to suspend display audio\n"); 4950 /* TODO: abort the succeeding gpu reset? 
*/ 4951 return -ETIMEDOUT; 4952 } 4953 } 4954 4955 pm_runtime_disable(&(p->dev)); 4956 4957 return 0; 4958 } 4959 4960 static void amdgpu_device_recheck_guilty_jobs( 4961 struct amdgpu_device *adev, struct list_head *device_list_handle, 4962 struct amdgpu_reset_context *reset_context) 4963 { 4964 int i, r = 0; 4965 4966 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4967 struct amdgpu_ring *ring = adev->rings[i]; 4968 int ret = 0; 4969 struct drm_sched_job *s_job; 4970 4971 if (!ring || !ring->sched.thread) 4972 continue; 4973 4974 s_job = list_first_entry_or_null(&ring->sched.pending_list, 4975 struct drm_sched_job, list); 4976 if (s_job == NULL) 4977 continue; 4978 4979 /* clear the job's guilty flag and let the following steps decide the real one */ 4980 drm_sched_reset_karma(s_job); 4981 /* the real bad job will be resubmitted twice, so take a dma_fence_get here 4982 * to keep the fence reference count balanced */ 4983 dma_fence_get(s_job->s_fence->parent); 4984 drm_sched_resubmit_jobs_ext(&ring->sched, 1); 4985 4986 ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout); 4987 if (ret == 0) { /* timeout */ 4988 DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n", 4989 ring->sched.name, s_job->id); 4990 4991 /* set guilty */ 4992 drm_sched_increase_karma(s_job); 4993 retry: 4994 /* do hw reset */ 4995 if (amdgpu_sriov_vf(adev)) { 4996 amdgpu_virt_fini_data_exchange(adev); 4997 r = amdgpu_device_reset_sriov(adev, false); 4998 if (r) 4999 adev->asic_reset_res = r; 5000 } else { 5001 clear_bit(AMDGPU_SKIP_HW_RESET, 5002 &reset_context->flags); 5003 r = amdgpu_do_asic_reset(device_list_handle, 5004 reset_context); 5005 if (r && r == -EAGAIN) 5006 goto retry; 5007 } 5008 5009 /* 5010 * add reset counter so that the following 5011 * resubmitted job could flush vmid 5012 */ 5013 atomic_inc(&adev->gpu_reset_counter); 5014 continue; 5015 } 5016 5017 /* got the hw fence, signal finished fence */ 5018 atomic_dec(ring->sched.score); 5019 dma_fence_put(s_job->s_fence->parent); 5020 dma_fence_get(&s_job->s_fence->finished); 5021 dma_fence_signal(&s_job->s_fence->finished); 5022 dma_fence_put(&s_job->s_fence->finished); 5023 5024 /* remove node from list and free the job */ 5025 spin_lock(&ring->sched.job_list_lock); 5026 list_del_init(&s_job->list); 5027 spin_unlock(&ring->sched.job_list_lock); 5028 ring->sched.ops->free_job(s_job); 5029 } 5030 } 5031 5032 /** 5033 * amdgpu_device_gpu_recover_imp - reset the asic and recover scheduler 5034 * 5035 * @adev: amdgpu_device pointer 5036 * @job: the job that triggered the hang 5037 * 5038 * Attempt to reset the GPU if it has hung (all asics). 5039 * Attempt a soft reset or full reset and reinitialize the ASIC. 5040 * Returns 0 for success or an error on failure.
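* Callers normally go through the amdgpu_device_gpu_recover() wrapper further below, which queues this function on the reset domain's single threaded workqueue so recoveries are serialized; a caller holding a ring and a job would typically do something like (illustrative only): r = amdgpu_device_gpu_recover(ring->adev, job);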
5041 */ 5042 5043 int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev, 5044 struct amdgpu_job *job) 5045 { 5046 struct list_head device_list, *device_list_handle = NULL; 5047 bool job_signaled = false; 5048 struct amdgpu_hive_info *hive = NULL; 5049 struct amdgpu_device *tmp_adev = NULL; 5050 int i, r = 0; 5051 bool need_emergency_restart = false; 5052 bool audio_suspended = false; 5053 int tmp_vram_lost_counter; 5054 struct amdgpu_reset_context reset_context; 5055 5056 memset(&reset_context, 0, sizeof(reset_context)); 5057 5058 /* 5059 * Special case: RAS triggered and full reset isn't supported 5060 */ 5061 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5062 5063 /* 5064 * Flush RAM to disk so that after reboot 5065 * the user can read log and see why the system rebooted. 5066 */ 5067 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5068 DRM_WARN("Emergency reboot."); 5069 5070 ksys_sync_helper(); 5071 emergency_restart(); 5072 } 5073 5074 dev_info(adev->dev, "GPU %s begin!\n", 5075 need_emergency_restart ? "jobs stop":"reset"); 5076 5077 if (!amdgpu_sriov_vf(adev)) 5078 hive = amdgpu_get_xgmi_hive(adev); 5079 if (hive) 5080 mutex_lock(&hive->hive_lock); 5081 5082 reset_context.method = AMD_RESET_METHOD_NONE; 5083 reset_context.reset_req_dev = adev; 5084 reset_context.job = job; 5085 reset_context.hive = hive; 5086 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 5087 5088 /* 5089 * Build list of devices to reset. 5090 * In case we are in XGMI hive mode, resort the device list 5091 * to put adev in the 1st position. 5092 */ 5093 INIT_LIST_HEAD(&device_list); 5094 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5095 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) 5096 list_add_tail(&tmp_adev->reset_list, &device_list); 5097 if (!list_is_first(&adev->reset_list, &device_list)) 5098 list_rotate_to_front(&adev->reset_list, &device_list); 5099 device_list_handle = &device_list; 5100 } else { 5101 list_add_tail(&adev->reset_list, &device_list); 5102 device_list_handle = &device_list; 5103 } 5104 5105 /* We need to lock reset domain only once both for XGMI and single device */ 5106 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5107 reset_list); 5108 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5109 5110 /* block all schedulers and reset given job's ring */ 5111 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5112 5113 amdgpu_device_set_mp1_state(tmp_adev); 5114 5115 /* 5116 * Try to put the audio codec into suspend state 5117 * before gpu reset started. 5118 * 5119 * Due to the power domain of the graphics device 5120 * is shared with AZ power domain. Without this, 5121 * we may change the audio hardware from behind 5122 * the audio driver's back. That will trigger 5123 * some audio codec errors. 
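* amdgpu_device_suspend_display_audio() only attempts this for BACO and mode1 resets, the methods known to disturb the audio function, and, if the audio device has no autosuspend deadline of its own, gives it about 4 seconds to runtime suspend before giving up.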
5124 */ 5125 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5126 audio_suspended = true; 5127 5128 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5129 5130 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5131 5132 if (!amdgpu_sriov_vf(tmp_adev)) 5133 amdgpu_amdkfd_pre_reset(tmp_adev); 5134 5135 /* 5136 * Mark these ASICs to be reseted as untracked first 5137 * And add them back after reset completed 5138 */ 5139 amdgpu_unregister_gpu_instance(tmp_adev); 5140 5141 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 5142 5143 /* disable ras on ALL IPs */ 5144 if (!need_emergency_restart && 5145 amdgpu_device_ip_need_full_reset(tmp_adev)) 5146 amdgpu_ras_suspend(tmp_adev); 5147 5148 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5149 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5150 5151 if (!ring || !ring->sched.thread) 5152 continue; 5153 5154 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5155 5156 if (need_emergency_restart) 5157 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5158 } 5159 atomic_inc(&tmp_adev->gpu_reset_counter); 5160 } 5161 5162 if (need_emergency_restart) 5163 goto skip_sched_resume; 5164 5165 /* 5166 * Must check guilty signal here since after this point all old 5167 * HW fences are force signaled. 5168 * 5169 * job->base holds a reference to parent fence 5170 */ 5171 if (job && job->base.s_fence->parent && 5172 dma_fence_is_signaled(job->base.s_fence->parent)) { 5173 job_signaled = true; 5174 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5175 goto skip_hw_reset; 5176 } 5177 5178 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5179 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5180 r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context); 5181 /*TODO Should we stop ?*/ 5182 if (r) { 5183 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5184 r, adev_to_drm(tmp_adev)->unique); 5185 tmp_adev->asic_reset_res = r; 5186 } 5187 } 5188 5189 tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter)); 5190 /* Actual ASIC resets if needed.*/ 5191 /* Host driver will handle XGMI hive reset for SRIOV */ 5192 if (amdgpu_sriov_vf(adev)) { 5193 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5194 if (r) 5195 adev->asic_reset_res = r; 5196 5197 /* Aldebaran supports ras in SRIOV, so need resume ras during reset */ 5198 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2)) 5199 amdgpu_ras_resume(adev); 5200 } else { 5201 r = amdgpu_do_asic_reset(device_list_handle, &reset_context); 5202 if (r && r == -EAGAIN) 5203 goto retry; 5204 } 5205 5206 skip_hw_reset: 5207 5208 /* Post ASIC reset for all devs .*/ 5209 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5210 5211 /* 5212 * Sometimes a later bad compute job can block a good gfx job as gfx 5213 * and compute ring share internal GC HW mutually. We add an additional 5214 * guilty jobs recheck step to find the real guilty job, it synchronously 5215 * submits and pends for the first job being signaled. If it gets timeout, 5216 * we identify it as a real guilty job. 
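* This recheck only runs when amdgpu_gpu_recovery is set to 2 and no additional VRAM loss was recorded while the reset was performed.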
5217 */ 5218 if (amdgpu_gpu_recovery == 2 && 5219 !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter))) 5220 amdgpu_device_recheck_guilty_jobs( 5221 tmp_adev, device_list_handle, &reset_context); 5222 5223 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5224 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5225 5226 if (!ring || !ring->sched.thread) 5227 continue; 5228 5229 /* No point in resubmitting jobs if we didn't HW reset */ 5230 if (!tmp_adev->asic_reset_res && !job_signaled) 5231 drm_sched_resubmit_jobs(&ring->sched); 5232 5233 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); 5234 } 5235 5236 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) { 5237 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5238 } 5239 5240 if (tmp_adev->asic_reset_res) 5241 r = tmp_adev->asic_reset_res; 5242 5243 tmp_adev->asic_reset_res = 0; 5244 5245 if (r) { 5246 /* bad news, how to tell it to userspace ? */ 5247 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5248 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5249 } else { 5250 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5251 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5252 DRM_WARN("smart shift update failed\n"); 5253 } 5254 } 5255 5256 skip_sched_resume: 5257 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5258 /* unlock kfd: SRIOV would do it separately */ 5259 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5260 amdgpu_amdkfd_post_reset(tmp_adev); 5261 5262 /* kfd_post_reset will do nothing if kfd device is not initialized, 5263 * need to bring up kfd here if it wasn't initialized before 5264 */ 5265 if (!adev->kfd.init_complete) 5266 amdgpu_amdkfd_device_init(adev); 5267 5268 if (audio_suspended) 5269 amdgpu_device_resume_display_audio(tmp_adev); 5270 5271 amdgpu_device_unset_mp1_state(tmp_adev); 5272 } 5273 5274 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5275 reset_list); 5276 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5277 5278 if (hive) { 5279 mutex_unlock(&hive->hive_lock); 5280 amdgpu_put_xgmi_hive(hive); 5281 } 5282 5283 if (r) 5284 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5285 return r; 5286 } 5287 5288 struct amdgpu_recover_work_struct { 5289 struct work_struct base; 5290 struct amdgpu_device *adev; 5291 struct amdgpu_job *job; 5292 int ret; 5293 }; 5294 5295 static void amdgpu_device_queue_gpu_recover_work(struct work_struct *work) { 5296 5297 struct amdgpu_recover_work_struct *recover_work = container_of(work, struct amdgpu_recover_work_struct, base); 5298 5299 recover_work->ret = amdgpu_device_gpu_recover_imp(recover_work->adev, recover_work->job); 5300 } 5301 /* 5302 * Serialize gpu recovery into the reset domain's single threaded wq 5303 */ 5304 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5305 struct amdgpu_job *job) 5306 { 5307 struct amdgpu_recover_work_struct work = {.adev = adev, .job = job}; 5308 5309 INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work); 5310 5311 if (!amdgpu_reset_domain_schedule(adev->reset_domain, &work.base)) 5312 return -EAGAIN; 5313 5314 flush_work(&work.base); 5315 5316 return work.ret; 5317 } 5318 5319 /** 5320 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot 5321 * 5322 * @adev: amdgpu_device pointer 5323 * 5324 * Fetches and stores in the driver the PCIE capabilities (gen speed 5325 * and lanes) of
the slot the device is in. Handles APUs and 5326 * virtualized environments where PCIE config space may not be available. 5327 */ 5328 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5329 { 5330 struct pci_dev *pdev; 5331 enum pci_bus_speed speed_cap, platform_speed_cap; 5332 enum pcie_link_width platform_link_width; 5333 5334 if (amdgpu_pcie_gen_cap) 5335 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5336 5337 if (amdgpu_pcie_lane_cap) 5338 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5339 5340 /* covers APUs as well */ 5341 if (pci_is_root_bus(adev->pdev->bus)) { 5342 if (adev->pm.pcie_gen_mask == 0) 5343 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5344 if (adev->pm.pcie_mlw_mask == 0) 5345 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5346 return; 5347 } 5348 5349 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5350 return; 5351 5352 pcie_bandwidth_available(adev->pdev, NULL, 5353 &platform_speed_cap, &platform_link_width); 5354 5355 if (adev->pm.pcie_gen_mask == 0) { 5356 /* asic caps */ 5357 pdev = adev->pdev; 5358 speed_cap = pcie_get_speed_cap(pdev); 5359 if (speed_cap == PCI_SPEED_UNKNOWN) { 5360 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5361 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5362 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5363 } else { 5364 if (speed_cap == PCIE_SPEED_32_0GT) 5365 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5366 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5367 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5368 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5369 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5370 else if (speed_cap == PCIE_SPEED_16_0GT) 5371 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5372 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5373 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5374 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5375 else if (speed_cap == PCIE_SPEED_8_0GT) 5376 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5377 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5378 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5379 else if (speed_cap == PCIE_SPEED_5_0GT) 5380 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5381 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5382 else 5383 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5384 } 5385 /* platform caps */ 5386 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5387 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5388 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5389 } else { 5390 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5391 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5392 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5393 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5394 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5395 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5396 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5397 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5398 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5399 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5400 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5401 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5402 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5403 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5404 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5405 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5406 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5407 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5408 else 5409 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5410 5411 } 5412 } 
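/* Now derive the link width mask from the platform capability reported by pcie_bandwidth_available() above. */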
5413 if (adev->pm.pcie_mlw_mask == 0) { 5414 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 5415 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5416 } else { 5417 switch (platform_link_width) { 5418 case PCIE_LNK_X32: 5419 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5420 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5421 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5422 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5423 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5424 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5425 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5426 break; 5427 case PCIE_LNK_X16: 5428 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5429 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5430 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5431 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5432 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5433 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5434 break; 5435 case PCIE_LNK_X12: 5436 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5437 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5438 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5439 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5440 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5441 break; 5442 case PCIE_LNK_X8: 5443 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5444 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5445 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5446 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5447 break; 5448 case PCIE_LNK_X4: 5449 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5450 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5451 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5452 break; 5453 case PCIE_LNK_X2: 5454 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5455 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5456 break; 5457 case PCIE_LNK_X1: 5458 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5459 break; 5460 default: 5461 break; 5462 } 5463 } 5464 } 5465 } 5466 5467 int amdgpu_device_baco_enter(struct drm_device *dev) 5468 { 5469 struct amdgpu_device *adev = drm_to_adev(dev); 5470 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5471 5472 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5473 return -ENOTSUPP; 5474 5475 if (ras && adev->ras_enabled && 5476 adev->nbio.funcs->enable_doorbell_interrupt) 5477 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5478 5479 return amdgpu_dpm_baco_enter(adev); 5480 } 5481 5482 int amdgpu_device_baco_exit(struct drm_device *dev) 5483 { 5484 struct amdgpu_device *adev = drm_to_adev(dev); 5485 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5486 int ret = 0; 5487 5488 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5489 return -ENOTSUPP; 5490 5491 ret = amdgpu_dpm_baco_exit(adev); 5492 if (ret) 5493 return ret; 5494 5495 if (ras && adev->ras_enabled && 5496 adev->nbio.funcs->enable_doorbell_interrupt) 5497 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5498 5499 if (amdgpu_passthrough(adev) && 5500 adev->nbio.funcs->clear_doorbell_interrupt) 5501 adev->nbio.funcs->clear_doorbell_interrupt(adev); 5502 5503 return 0; 5504 } 5505 5506 /** 5507 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5508 * @pdev: PCI device struct 5509 * @state: PCI channel state 5510 * 5511 * Description: Called when a PCI error is detected. 5512 * 5513 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
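* PCI_ERS_RESULT_CAN_RECOVER may also be returned while the channel is still in the pci_channel_io_normal state.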
5514 */ 5515 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 5516 { 5517 struct drm_device *dev = pci_get_drvdata(pdev); 5518 struct amdgpu_device *adev = drm_to_adev(dev); 5519 int i; 5520 5521 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 5522 5523 if (adev->gmc.xgmi.num_physical_nodes > 1) { 5524 DRM_WARN("No support for XGMI hive yet..."); 5525 return PCI_ERS_RESULT_DISCONNECT; 5526 } 5527 5528 adev->pci_channel_state = state; 5529 5530 switch (state) { 5531 case pci_channel_io_normal: 5532 return PCI_ERS_RESULT_CAN_RECOVER; 5533 /* Fatal error, prepare for slot reset */ 5534 case pci_channel_io_frozen: 5535 /* 5536 * Locking adev->reset_domain->sem will prevent any external access 5537 * to GPU during PCI error recovery 5538 */ 5539 amdgpu_device_lock_reset_domain(adev->reset_domain); 5540 amdgpu_device_set_mp1_state(adev); 5541 5542 /* 5543 * Block any work scheduling as we do for regular GPU reset 5544 * for the duration of the recovery 5545 */ 5546 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5547 struct amdgpu_ring *ring = adev->rings[i]; 5548 5549 if (!ring || !ring->sched.thread) 5550 continue; 5551 5552 drm_sched_stop(&ring->sched, NULL); 5553 } 5554 atomic_inc(&adev->gpu_reset_counter); 5555 return PCI_ERS_RESULT_NEED_RESET; 5556 case pci_channel_io_perm_failure: 5557 /* Permanent error, prepare for device removal */ 5558 return PCI_ERS_RESULT_DISCONNECT; 5559 } 5560 5561 return PCI_ERS_RESULT_NEED_RESET; 5562 } 5563 5564 /** 5565 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 5566 * @pdev: pointer to PCI device 5567 */ 5568 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 5569 { 5570 5571 DRM_INFO("PCI error: mmio enabled callback!!\n"); 5572 5573 /* TODO - dump whatever for debugging purposes */ 5574 5575 /* This called only if amdgpu_pci_error_detected returns 5576 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 5577 * works, no need to reset slot. 5578 */ 5579 5580 return PCI_ERS_RESULT_RECOVERED; 5581 } 5582 5583 /** 5584 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 5585 * @pdev: PCI device struct 5586 * 5587 * Description: This routine is called by the pci error recovery 5588 * code after the PCI slot has been reset, just before we 5589 * should resume normal operations. 
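* The cached PCI config space is restored, the ASIC is polled through the config space memsize until it responds, and the IP blocks are then re-initialized with AMDGPU_SKIP_HW_RESET set, since the slot reset already reset the hardware.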
5590 */ 5591 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 5592 { 5593 struct drm_device *dev = pci_get_drvdata(pdev); 5594 struct amdgpu_device *adev = drm_to_adev(dev); 5595 int r, i; 5596 struct amdgpu_reset_context reset_context; 5597 u32 memsize; 5598 struct list_head device_list; 5599 5600 DRM_INFO("PCI error: slot reset callback!!\n"); 5601 5602 memset(&reset_context, 0, sizeof(reset_context)); 5603 5604 INIT_LIST_HEAD(&device_list); 5605 list_add_tail(&adev->reset_list, &device_list); 5606 5607 /* wait for asic to come out of reset */ 5608 msleep(500); 5609 5610 /* Restore PCI confspace */ 5611 amdgpu_device_load_pci_state(pdev); 5612 5613 /* confirm ASIC came out of reset */ 5614 for (i = 0; i < adev->usec_timeout; i++) { 5615 memsize = amdgpu_asic_get_config_memsize(adev); 5616 5617 if (memsize != 0xffffffff) 5618 break; 5619 udelay(1); 5620 } 5621 if (memsize == 0xffffffff) { 5622 r = -ETIME; 5623 goto out; 5624 } 5625 5626 reset_context.method = AMD_RESET_METHOD_NONE; 5627 reset_context.reset_req_dev = adev; 5628 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 5629 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 5630 5631 adev->no_hw_access = true; 5632 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 5633 adev->no_hw_access = false; 5634 if (r) 5635 goto out; 5636 5637 r = amdgpu_do_asic_reset(&device_list, &reset_context); 5638 5639 out: 5640 if (!r) { 5641 if (amdgpu_device_cache_pci_state(adev->pdev)) 5642 pci_restore_state(adev->pdev); 5643 5644 DRM_INFO("PCIe error recovery succeeded\n"); 5645 } else { 5646 DRM_ERROR("PCIe error recovery failed, err:%d", r); 5647 amdgpu_device_unset_mp1_state(adev); 5648 amdgpu_device_unlock_reset_domain(adev->reset_domain); 5649 } 5650 5651 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 5652 } 5653 5654 /** 5655 * amdgpu_pci_resume() - resume normal ops after PCI reset 5656 * @pdev: pointer to PCI device 5657 * 5658 * Called when the error recovery driver tells us that its 5659 * OK to resume normal operation. 
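* Only acts if the recorded channel state was pci_channel_io_frozen; the schedulers stopped in amdgpu_pci_error_detected() are restarted and the reset domain is unlocked again.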
5660 */ 5661 void amdgpu_pci_resume(struct pci_dev *pdev) 5662 { 5663 struct drm_device *dev = pci_get_drvdata(pdev); 5664 struct amdgpu_device *adev = drm_to_adev(dev); 5665 int i; 5666 5667 5668 DRM_INFO("PCI error: resume callback!!\n"); 5669 5670 /* Only continue execution for the case of pci_channel_io_frozen */ 5671 if (adev->pci_channel_state != pci_channel_io_frozen) 5672 return; 5673 5674 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5675 struct amdgpu_ring *ring = adev->rings[i]; 5676 5677 if (!ring || !ring->sched.thread) 5678 continue; 5679 5680 5681 drm_sched_resubmit_jobs(&ring->sched); 5682 drm_sched_start(&ring->sched, true); 5683 } 5684 5685 amdgpu_device_unset_mp1_state(adev); 5686 amdgpu_device_unlock_reset_domain(adev->reset_domain); 5687 } 5688 5689 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 5690 { 5691 struct drm_device *dev = pci_get_drvdata(pdev); 5692 struct amdgpu_device *adev = drm_to_adev(dev); 5693 int r; 5694 5695 r = pci_save_state(pdev); 5696 if (!r) { 5697 kfree(adev->pci_state); 5698 5699 adev->pci_state = pci_store_saved_state(pdev); 5700 5701 if (!adev->pci_state) { 5702 DRM_ERROR("Failed to store PCI saved state"); 5703 return false; 5704 } 5705 } else { 5706 DRM_WARN("Failed to save PCI state, err:%d\n", r); 5707 return false; 5708 } 5709 5710 return true; 5711 } 5712 5713 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 5714 { 5715 struct drm_device *dev = pci_get_drvdata(pdev); 5716 struct amdgpu_device *adev = drm_to_adev(dev); 5717 int r; 5718 5719 if (!adev->pci_state) 5720 return false; 5721 5722 r = pci_load_saved_state(pdev, adev->pci_state); 5723 5724 if (!r) { 5725 pci_restore_state(pdev); 5726 } else { 5727 DRM_WARN("Failed to load PCI state, err:%d\n", r); 5728 return false; 5729 } 5730 5731 return true; 5732 } 5733 5734 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 5735 struct amdgpu_ring *ring) 5736 { 5737 #ifdef CONFIG_X86_64 5738 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 5739 return; 5740 #endif 5741 if (adev->gmc.xgmi.connected_to_cpu) 5742 return; 5743 5744 if (ring && ring->funcs->emit_hdp_flush) 5745 amdgpu_ring_emit_hdp_flush(ring); 5746 else 5747 amdgpu_asic_flush_hdp(adev, ring); 5748 } 5749 5750 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 5751 struct amdgpu_ring *ring) 5752 { 5753 #ifdef CONFIG_X86_64 5754 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 5755 return; 5756 #endif 5757 if (adev->gmc.xgmi.connected_to_cpu) 5758 return; 5759 5760 amdgpu_asic_invalidate_hdp(adev, ring); 5761 } 5762 5763 int amdgpu_in_reset(struct amdgpu_device *adev) 5764 { 5765 return atomic_read(&adev->reset_domain->in_gpu_reset); 5766 } 5767 5768 /** 5769 * amdgpu_device_halt() - bring hardware to some kind of halt state 5770 * 5771 * @adev: amdgpu_device pointer 5772 * 5773 * Bring hardware to some kind of halt state so that no one can touch it 5774 * any more. It will help to maintain error context when error occurred. 5775 * Compare to a simple hang, the system will keep stable at least for SSH 5776 * access. Then it should be trivial to inspect the hardware state and 5777 * see what's going on. Implemented as following: 5778 * 5779 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc), 5780 * clears all CPU mappings to device, disallows remappings through page faults 5781 * 2. amdgpu_irq_disable_all() disables all interrupts 5782 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 5783 * 4. 
set adev->no_hw_access to avoid potential crashes after step 5 5784 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings 5785 * 6. pci_disable_device() and pci_wait_for_pending_transaction() 5786 * flush any in flight DMA operations 5787 */ 5788 void amdgpu_device_halt(struct amdgpu_device *adev) 5789 { 5790 struct pci_dev *pdev = adev->pdev; 5791 struct drm_device *ddev = adev_to_drm(adev); 5792 5793 drm_dev_unplug(ddev); 5794 5795 amdgpu_irq_disable_all(adev); 5796 5797 amdgpu_fence_driver_hw_fini(adev); 5798 5799 adev->no_hw_access = true; 5800 5801 amdgpu_device_unmap_mmio(adev); 5802 5803 pci_disable_device(pdev); 5804 pci_wait_for_pending_transaction(pdev); 5805 } 5806 5807 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, 5808 u32 reg) 5809 { 5810 unsigned long flags, address, data; 5811 u32 r; 5812 5813 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 5814 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 5815 5816 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 5817 WREG32(address, reg * 4); 5818 (void)RREG32(address); 5819 r = RREG32(data); 5820 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 5821 return r; 5822 } 5823 5824 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, 5825 u32 reg, u32 v) 5826 { 5827 unsigned long flags, address, data; 5828 5829 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 5830 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 5831 5832 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 5833 WREG32(address, reg * 4); 5834 (void)RREG32(address); 5835 WREG32(data, v); 5836 (void)RREG32(data); 5837 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 5838 } 5839
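/*
 * Illustrative sketch only (not part of the driver): the two helpers above
 * implement an index/data register pair protected by pcie_idx_lock, so a
 * read-modify-write of a PCIe port register - the offset and bit below are
 * hypothetical placeholders - would look like:
 *
 *	u32 v = amdgpu_device_pcie_port_rreg(adev, port_reg_offset);
 *	v |= SOME_PORT_BIT;
 *	amdgpu_device_pcie_port_wreg(adev, port_reg_offset, v);
 */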