1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 
23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 #include <linux/iommu.h> 34 #include <linux/pci.h> 35 36 #include <drm/drm_atomic_helper.h> 37 #include <drm/drm_probe_helper.h> 38 #include <drm/amdgpu_drm.h> 39 #include <linux/vgaarb.h> 40 #include <linux/vga_switcheroo.h> 41 #include <linux/efi.h> 42 #include "amdgpu.h" 43 #include "amdgpu_trace.h" 44 #include "amdgpu_i2c.h" 45 #include "atom.h" 46 #include "amdgpu_atombios.h" 47 #include "amdgpu_atomfirmware.h" 48 #include "amd_pcie.h" 49 #ifdef CONFIG_DRM_AMDGPU_SI 50 #include "si.h" 51 #endif 52 #ifdef CONFIG_DRM_AMDGPU_CIK 53 #include "cik.h" 54 #endif 55 #include "vi.h" 56 #include "soc15.h" 57 #include "nv.h" 58 #include "bif/bif_4_1_d.h" 59 #include <linux/firmware.h> 60 #include "amdgpu_vf_error.h" 61 62 #include "amdgpu_amdkfd.h" 63 #include "amdgpu_pm.h" 64 65 #include "amdgpu_xgmi.h" 66 #include "amdgpu_ras.h" 67 #include "amdgpu_pmu.h" 68 #include "amdgpu_fru_eeprom.h" 69 #include "amdgpu_reset.h" 70 71 #include <linux/suspend.h> 72 #include <drm/task_barrier.h> 73 #include <linux/pm_runtime.h> 74 75 #include <drm/drm_drv.h> 76 77 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 78 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 79 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 80 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 81 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 82 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 83 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 84 85 #define AMDGPU_RESUME_MS 2000 86 #define AMDGPU_MAX_RETRY_LIMIT 2 87 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 88 89 const char *amdgpu_asic_name[] = { 90 "TAHITI", 91 "PITCAIRN", 92 "VERDE", 93 "OLAND", 94 "HAINAN", 95 "BONAIRE", 96 "KAVERI", 97 "KABINI", 98 "HAWAII", 99 "MULLINS", 100 "TOPAZ", 101 
"TONGA", 102 "FIJI", 103 "CARRIZO", 104 "STONEY", 105 "POLARIS10", 106 "POLARIS11", 107 "POLARIS12", 108 "VEGAM", 109 "VEGA10", 110 "VEGA12", 111 "VEGA20", 112 "RAVEN", 113 "ARCTURUS", 114 "RENOIR", 115 "ALDEBARAN", 116 "NAVI10", 117 "CYAN_SKILLFISH", 118 "NAVI14", 119 "NAVI12", 120 "SIENNA_CICHLID", 121 "NAVY_FLOUNDER", 122 "VANGOGH", 123 "DIMGREY_CAVEFISH", 124 "BEIGE_GOBY", 125 "YELLOW_CARP", 126 "IP DISCOVERY", 127 "LAST", 128 }; 129 130 /** 131 * DOC: pcie_replay_count 132 * 133 * The amdgpu driver provides a sysfs API for reporting the total number 134 * of PCIe replays (NAKs) 135 * The file pcie_replay_count is used for this and returns the total 136 * number of replays as a sum of the NAKs generated and NAKs received 137 */ 138 139 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 140 struct device_attribute *attr, char *buf) 141 { 142 struct drm_device *ddev = dev_get_drvdata(dev); 143 struct amdgpu_device *adev = drm_to_adev(ddev); 144 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 145 146 return sysfs_emit(buf, "%llu\n", cnt); 147 } 148 149 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 150 amdgpu_device_get_pcie_replay_count, NULL); 151 152 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 153 154 /** 155 * DOC: product_name 156 * 157 * The amdgpu driver provides a sysfs API for reporting the product name 158 * for the device 159 * The file serial_number is used for this and returns the product name 160 * as returned from the FRU. 
161 * NOTE: This is only available for certain server cards 162 */ 163 164 static ssize_t amdgpu_device_get_product_name(struct device *dev, 165 struct device_attribute *attr, char *buf) 166 { 167 struct drm_device *ddev = dev_get_drvdata(dev); 168 struct amdgpu_device *adev = drm_to_adev(ddev); 169 170 return sysfs_emit(buf, "%s\n", adev->product_name); 171 } 172 173 static DEVICE_ATTR(product_name, S_IRUGO, 174 amdgpu_device_get_product_name, NULL); 175 176 /** 177 * DOC: product_number 178 * 179 * The amdgpu driver provides a sysfs API for reporting the part number 180 * for the device 181 * The file serial_number is used for this and returns the part number 182 * as returned from the FRU. 183 * NOTE: This is only available for certain server cards 184 */ 185 186 static ssize_t amdgpu_device_get_product_number(struct device *dev, 187 struct device_attribute *attr, char *buf) 188 { 189 struct drm_device *ddev = dev_get_drvdata(dev); 190 struct amdgpu_device *adev = drm_to_adev(ddev); 191 192 return sysfs_emit(buf, "%s\n", adev->product_number); 193 } 194 195 static DEVICE_ATTR(product_number, S_IRUGO, 196 amdgpu_device_get_product_number, NULL); 197 198 /** 199 * DOC: serial_number 200 * 201 * The amdgpu driver provides a sysfs API for reporting the serial number 202 * for the device 203 * The file serial_number is used for this and returns the serial number 204 * as returned from the FRU. 
205 * NOTE: This is only available for certain server cards 206 */ 207 208 static ssize_t amdgpu_device_get_serial_number(struct device *dev, 209 struct device_attribute *attr, char *buf) 210 { 211 struct drm_device *ddev = dev_get_drvdata(dev); 212 struct amdgpu_device *adev = drm_to_adev(ddev); 213 214 return sysfs_emit(buf, "%s\n", adev->serial); 215 } 216 217 static DEVICE_ATTR(serial_number, S_IRUGO, 218 amdgpu_device_get_serial_number, NULL); 219 220 /** 221 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 222 * 223 * @dev: drm_device pointer 224 * 225 * Returns true if the device is a dGPU with ATPX power control, 226 * otherwise return false. 227 */ 228 bool amdgpu_device_supports_px(struct drm_device *dev) 229 { 230 struct amdgpu_device *adev = drm_to_adev(dev); 231 232 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 233 return true; 234 return false; 235 } 236 237 /** 238 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 239 * 240 * @dev: drm_device pointer 241 * 242 * Returns true if the device is a dGPU with ACPI power control, 243 * otherwise return false. 244 */ 245 bool amdgpu_device_supports_boco(struct drm_device *dev) 246 { 247 struct amdgpu_device *adev = drm_to_adev(dev); 248 249 if (adev->has_pr3 || 250 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 251 return true; 252 return false; 253 } 254 255 /** 256 * amdgpu_device_supports_baco - Does the device support BACO 257 * 258 * @dev: drm_device pointer 259 * 260 * Returns true if the device supporte BACO, 261 * otherwise return false. 
262 */ 263 bool amdgpu_device_supports_baco(struct drm_device *dev) 264 { 265 struct amdgpu_device *adev = drm_to_adev(dev); 266 267 return amdgpu_asic_supports_baco(adev); 268 } 269 270 /** 271 * amdgpu_device_supports_smart_shift - Is the device dGPU with 272 * smart shift support 273 * 274 * @dev: drm_device pointer 275 * 276 * Returns true if the device is a dGPU with Smart Shift support, 277 * otherwise returns false. 278 */ 279 bool amdgpu_device_supports_smart_shift(struct drm_device *dev) 280 { 281 return (amdgpu_device_supports_boco(dev) && 282 amdgpu_acpi_is_power_shift_control_supported()); 283 } 284 285 /* 286 * VRAM access helper functions 287 */ 288 289 /** 290 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA 291 * 292 * @adev: amdgpu_device pointer 293 * @pos: offset of the buffer in vram 294 * @buf: virtual address of the buffer in system memory 295 * @size: read/write size, sizeof(@buf) must > @size 296 * @write: true - write to vram, otherwise - read from vram 297 */ 298 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos, 299 void *buf, size_t size, bool write) 300 { 301 unsigned long flags; 302 uint32_t hi = ~0, tmp = 0; 303 uint32_t *data = buf; 304 uint64_t last; 305 int idx; 306 307 if (!drm_dev_enter(adev_to_drm(adev), &idx)) 308 return; 309 310 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4)); 311 312 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 313 for (last = pos + size; pos < last; pos += 4) { 314 tmp = pos >> 31; 315 316 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); 317 if (tmp != hi) { 318 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp); 319 hi = tmp; 320 } 321 if (write) 322 WREG32_NO_KIQ(mmMM_DATA, *data++); 323 else 324 *data++ = RREG32_NO_KIQ(mmMM_DATA); 325 } 326 327 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 328 drm_dev_exit(idx); 329 } 330 331 /** 332 * amdgpu_device_aper_access - access vram by vram aperature 333 * 334 * @adev: amdgpu_device pointer 335 * @pos: offset of the buffer in 
vram 336 * @buf: virtual address of the buffer in system memory 337 * @size: read/write size, sizeof(@buf) must > @size 338 * @write: true - write to vram, otherwise - read from vram 339 * 340 * The return value means how many bytes have been transferred. 341 */ 342 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos, 343 void *buf, size_t size, bool write) 344 { 345 #ifdef CONFIG_64BIT 346 void __iomem *addr; 347 size_t count = 0; 348 uint64_t last; 349 350 if (!adev->mman.aper_base_kaddr) 351 return 0; 352 353 last = min(pos + size, adev->gmc.visible_vram_size); 354 if (last > pos) { 355 addr = adev->mman.aper_base_kaddr + pos; 356 count = last - pos; 357 358 if (write) { 359 memcpy_toio(addr, buf, count); 360 mb(); 361 amdgpu_device_flush_hdp(adev, NULL); 362 } else { 363 amdgpu_device_invalidate_hdp(adev, NULL); 364 mb(); 365 memcpy_fromio(buf, addr, count); 366 } 367 368 } 369 370 return count; 371 #else 372 return 0; 373 #endif 374 } 375 376 /** 377 * amdgpu_device_vram_access - read/write a buffer in vram 378 * 379 * @adev: amdgpu_device pointer 380 * @pos: offset of the buffer in vram 381 * @buf: virtual address of the buffer in system memory 382 * @size: read/write size, sizeof(@buf) must > @size 383 * @write: true - write to vram, otherwise - read from vram 384 */ 385 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, 386 void *buf, size_t size, bool write) 387 { 388 size_t count; 389 390 /* try to using vram apreature to access vram first */ 391 count = amdgpu_device_aper_access(adev, pos, buf, size, write); 392 size -= count; 393 if (size) { 394 /* using MM to access rest vram */ 395 pos += count; 396 buf += count; 397 amdgpu_device_mm_access(adev, pos, buf, size, write); 398 } 399 } 400 401 /* 402 * register access helper functions. 
403 */ 404 405 /* Check if hw access should be skipped because of hotplug or device error */ 406 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev) 407 { 408 if (adev->no_hw_access) 409 return true; 410 411 #ifdef CONFIG_LOCKDEP 412 /* 413 * This is a bit complicated to understand, so worth a comment. What we assert 414 * here is that the GPU reset is not running on another thread in parallel. 415 * 416 * For this we trylock the read side of the reset semaphore, if that succeeds 417 * we know that the reset is not running in paralell. 418 * 419 * If the trylock fails we assert that we are either already holding the read 420 * side of the lock or are the reset thread itself and hold the write side of 421 * the lock. 422 */ 423 if (in_task()) { 424 if (down_read_trylock(&adev->reset_domain->sem)) 425 up_read(&adev->reset_domain->sem); 426 else 427 lockdep_assert_held(&adev->reset_domain->sem); 428 } 429 #endif 430 return false; 431 } 432 433 /** 434 * amdgpu_device_rreg - read a memory mapped IO or indirect register 435 * 436 * @adev: amdgpu_device pointer 437 * @reg: dword aligned register offset 438 * @acc_flags: access flags which require special behavior 439 * 440 * Returns the 32 bit value from the offset specified. 
441 */ 442 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, 443 uint32_t reg, uint32_t acc_flags) 444 { 445 uint32_t ret; 446 447 if (amdgpu_device_skip_hw_access(adev)) 448 return 0; 449 450 if ((reg * 4) < adev->rmmio_size) { 451 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 452 amdgpu_sriov_runtime(adev) && 453 down_read_trylock(&adev->reset_domain->sem)) { 454 ret = amdgpu_kiq_rreg(adev, reg); 455 up_read(&adev->reset_domain->sem); 456 } else { 457 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 458 } 459 } else { 460 ret = adev->pcie_rreg(adev, reg * 4); 461 } 462 463 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret); 464 465 return ret; 466 } 467 468 /* 469 * MMIO register read with bytes helper functions 470 * @offset:bytes offset from MMIO start 471 * 472 */ 473 474 /** 475 * amdgpu_mm_rreg8 - read a memory mapped IO register 476 * 477 * @adev: amdgpu_device pointer 478 * @offset: byte aligned register offset 479 * 480 * Returns the 8 bit value from the offset specified. 481 */ 482 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) 483 { 484 if (amdgpu_device_skip_hw_access(adev)) 485 return 0; 486 487 if (offset < adev->rmmio_size) 488 return (readb(adev->rmmio + offset)); 489 BUG(); 490 } 491 492 /* 493 * MMIO register write with bytes helper functions 494 * @offset:bytes offset from MMIO start 495 * @value: the value want to be written to the register 496 * 497 */ 498 /** 499 * amdgpu_mm_wreg8 - read a memory mapped IO register 500 * 501 * @adev: amdgpu_device pointer 502 * @offset: byte aligned register offset 503 * @value: 8 bit value to write 504 * 505 * Writes the value specified to the offset specified. 
506 */ 507 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 508 { 509 if (amdgpu_device_skip_hw_access(adev)) 510 return; 511 512 if (offset < adev->rmmio_size) 513 writeb(value, adev->rmmio + offset); 514 else 515 BUG(); 516 } 517 518 /** 519 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 520 * 521 * @adev: amdgpu_device pointer 522 * @reg: dword aligned register offset 523 * @v: 32 bit value to write to the register 524 * @acc_flags: access flags which require special behavior 525 * 526 * Writes the value specified to the offset specified. 527 */ 528 void amdgpu_device_wreg(struct amdgpu_device *adev, 529 uint32_t reg, uint32_t v, 530 uint32_t acc_flags) 531 { 532 if (amdgpu_device_skip_hw_access(adev)) 533 return; 534 535 if ((reg * 4) < adev->rmmio_size) { 536 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 537 amdgpu_sriov_runtime(adev) && 538 down_read_trylock(&adev->reset_domain->sem)) { 539 amdgpu_kiq_wreg(adev, reg, v); 540 up_read(&adev->reset_domain->sem); 541 } else { 542 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 543 } 544 } else { 545 adev->pcie_wreg(adev, reg * 4, v); 546 } 547 548 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 549 } 550 551 /** 552 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 553 * 554 * @adev: amdgpu_device pointer 555 * @reg: mmio/rlc register 556 * @v: value to write 557 * 558 * this function is invoked only for the debugfs register access 559 */ 560 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 561 uint32_t reg, uint32_t v) 562 { 563 if (amdgpu_device_skip_hw_access(adev)) 564 return; 565 566 if (amdgpu_sriov_fullaccess(adev) && 567 adev->gfx.rlc.funcs && 568 adev->gfx.rlc.funcs->is_rlcg_access_range) { 569 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 570 return amdgpu_sriov_wreg(adev, reg, v, 0, 0); 571 } else if ((reg * 4) >= adev->rmmio_size) { 572 adev->pcie_wreg(adev, reg * 
4, v); 573 } else { 574 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 575 } 576 } 577 578 /** 579 * amdgpu_mm_rdoorbell - read a doorbell dword 580 * 581 * @adev: amdgpu_device pointer 582 * @index: doorbell index 583 * 584 * Returns the value in the doorbell aperture at the 585 * requested doorbell index (CIK). 586 */ 587 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) 588 { 589 if (amdgpu_device_skip_hw_access(adev)) 590 return 0; 591 592 if (index < adev->doorbell.num_doorbells) { 593 return readl(adev->doorbell.ptr + index); 594 } else { 595 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 596 return 0; 597 } 598 } 599 600 /** 601 * amdgpu_mm_wdoorbell - write a doorbell dword 602 * 603 * @adev: amdgpu_device pointer 604 * @index: doorbell index 605 * @v: value to write 606 * 607 * Writes @v to the doorbell aperture at the 608 * requested doorbell index (CIK). 609 */ 610 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) 611 { 612 if (amdgpu_device_skip_hw_access(adev)) 613 return; 614 615 if (index < adev->doorbell.num_doorbells) { 616 writel(v, adev->doorbell.ptr + index); 617 } else { 618 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 619 } 620 } 621 622 /** 623 * amdgpu_mm_rdoorbell64 - read a doorbell Qword 624 * 625 * @adev: amdgpu_device pointer 626 * @index: doorbell index 627 * 628 * Returns the value in the doorbell aperture at the 629 * requested doorbell index (VEGA10+). 
630 */ 631 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) 632 { 633 if (amdgpu_device_skip_hw_access(adev)) 634 return 0; 635 636 if (index < adev->doorbell.num_doorbells) { 637 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); 638 } else { 639 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 640 return 0; 641 } 642 } 643 644 /** 645 * amdgpu_mm_wdoorbell64 - write a doorbell Qword 646 * 647 * @adev: amdgpu_device pointer 648 * @index: doorbell index 649 * @v: value to write 650 * 651 * Writes @v to the doorbell aperture at the 652 * requested doorbell index (VEGA10+). 653 */ 654 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) 655 { 656 if (amdgpu_device_skip_hw_access(adev)) 657 return; 658 659 if (index < adev->doorbell.num_doorbells) { 660 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); 661 } else { 662 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 663 } 664 } 665 666 /** 667 * amdgpu_device_indirect_rreg - read an indirect register 668 * 669 * @adev: amdgpu_device pointer 670 * @pcie_index: mmio register offset 671 * @pcie_data: mmio register offset 672 * @reg_addr: indirect register address to read from 673 * 674 * Returns the value of indirect register @reg_addr 675 */ 676 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 677 u32 pcie_index, u32 pcie_data, 678 u32 reg_addr) 679 { 680 unsigned long flags; 681 u32 r; 682 void __iomem *pcie_index_offset; 683 void __iomem *pcie_data_offset; 684 685 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 686 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 687 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 688 689 writel(reg_addr, pcie_index_offset); 690 readl(pcie_index_offset); 691 r = readl(pcie_data_offset); 692 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 693 694 return r; 695 } 696 697 /** 698 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 
699 * 700 * @adev: amdgpu_device pointer 701 * @pcie_index: mmio register offset 702 * @pcie_data: mmio register offset 703 * @reg_addr: indirect register address to read from 704 * 705 * Returns the value of indirect register @reg_addr 706 */ 707 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 708 u32 pcie_index, u32 pcie_data, 709 u32 reg_addr) 710 { 711 unsigned long flags; 712 u64 r; 713 void __iomem *pcie_index_offset; 714 void __iomem *pcie_data_offset; 715 716 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 717 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 718 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 719 720 /* read low 32 bits */ 721 writel(reg_addr, pcie_index_offset); 722 readl(pcie_index_offset); 723 r = readl(pcie_data_offset); 724 /* read high 32 bits */ 725 writel(reg_addr + 4, pcie_index_offset); 726 readl(pcie_index_offset); 727 r |= ((u64)readl(pcie_data_offset) << 32); 728 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 729 730 return r; 731 } 732 733 /** 734 * amdgpu_device_indirect_wreg - write an indirect register address 735 * 736 * @adev: amdgpu_device pointer 737 * @pcie_index: mmio register offset 738 * @pcie_data: mmio register offset 739 * @reg_addr: indirect register offset 740 * @reg_data: indirect register data 741 * 742 */ 743 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 744 u32 pcie_index, u32 pcie_data, 745 u32 reg_addr, u32 reg_data) 746 { 747 unsigned long flags; 748 void __iomem *pcie_index_offset; 749 void __iomem *pcie_data_offset; 750 751 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 752 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 753 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 754 755 writel(reg_addr, pcie_index_offset); 756 readl(pcie_index_offset); 757 writel(reg_data, pcie_data_offset); 758 readl(pcie_data_offset); 759 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 760 } 761 762 /** 763 * 
amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 764 * 765 * @adev: amdgpu_device pointer 766 * @pcie_index: mmio register offset 767 * @pcie_data: mmio register offset 768 * @reg_addr: indirect register offset 769 * @reg_data: indirect register data 770 * 771 */ 772 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 773 u32 pcie_index, u32 pcie_data, 774 u32 reg_addr, u64 reg_data) 775 { 776 unsigned long flags; 777 void __iomem *pcie_index_offset; 778 void __iomem *pcie_data_offset; 779 780 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 781 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 782 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 783 784 /* write low 32 bits */ 785 writel(reg_addr, pcie_index_offset); 786 readl(pcie_index_offset); 787 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 788 readl(pcie_data_offset); 789 /* write high 32 bits */ 790 writel(reg_addr + 4, pcie_index_offset); 791 readl(pcie_index_offset); 792 writel((u32)(reg_data >> 32), pcie_data_offset); 793 readl(pcie_data_offset); 794 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 795 } 796 797 /** 798 * amdgpu_invalid_rreg - dummy reg read function 799 * 800 * @adev: amdgpu_device pointer 801 * @reg: offset of register 802 * 803 * Dummy register read function. Used for register blocks 804 * that certain asics don't have (all asics). 805 * Returns the value in the register. 806 */ 807 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 808 { 809 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 810 BUG(); 811 return 0; 812 } 813 814 /** 815 * amdgpu_invalid_wreg - dummy reg write function 816 * 817 * @adev: amdgpu_device pointer 818 * @reg: offset of register 819 * @v: value to write to the register 820 * 821 * Dummy register read function. Used for register blocks 822 * that certain asics don't have (all asics). 
823 */ 824 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 825 { 826 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 827 reg, v); 828 BUG(); 829 } 830 831 /** 832 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 833 * 834 * @adev: amdgpu_device pointer 835 * @reg: offset of register 836 * 837 * Dummy register read function. Used for register blocks 838 * that certain asics don't have (all asics). 839 * Returns the value in the register. 840 */ 841 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 842 { 843 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 844 BUG(); 845 return 0; 846 } 847 848 /** 849 * amdgpu_invalid_wreg64 - dummy reg write function 850 * 851 * @adev: amdgpu_device pointer 852 * @reg: offset of register 853 * @v: value to write to the register 854 * 855 * Dummy register read function. Used for register blocks 856 * that certain asics don't have (all asics). 857 */ 858 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 859 { 860 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 861 reg, v); 862 BUG(); 863 } 864 865 /** 866 * amdgpu_block_invalid_rreg - dummy reg read function 867 * 868 * @adev: amdgpu_device pointer 869 * @block: offset of instance 870 * @reg: offset of register 871 * 872 * Dummy register read function. Used for register blocks 873 * that certain asics don't have (all asics). 874 * Returns the value in the register. 
875 */ 876 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 877 uint32_t block, uint32_t reg) 878 { 879 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 880 reg, block); 881 BUG(); 882 return 0; 883 } 884 885 /** 886 * amdgpu_block_invalid_wreg - dummy reg write function 887 * 888 * @adev: amdgpu_device pointer 889 * @block: offset of instance 890 * @reg: offset of register 891 * @v: value to write to the register 892 * 893 * Dummy register read function. Used for register blocks 894 * that certain asics don't have (all asics). 895 */ 896 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 897 uint32_t block, 898 uint32_t reg, uint32_t v) 899 { 900 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 901 reg, block, v); 902 BUG(); 903 } 904 905 /** 906 * amdgpu_device_asic_init - Wrapper for atom asic_init 907 * 908 * @adev: amdgpu_device pointer 909 * 910 * Does any asic specific work and then calls atom asic init. 911 */ 912 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 913 { 914 amdgpu_asic_pre_asic_init(adev); 915 916 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 917 } 918 919 /** 920 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page 921 * 922 * @adev: amdgpu_device pointer 923 * 924 * Allocates a scratch page of VRAM for use by various things in the 925 * driver. 926 */ 927 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev) 928 { 929 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, 930 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM, 931 &adev->vram_scratch.robj, 932 &adev->vram_scratch.gpu_addr, 933 (void **)&adev->vram_scratch.ptr); 934 } 935 936 /** 937 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page 938 * 939 * @adev: amdgpu_device pointer 940 * 941 * Frees the VRAM scratch page. 
942 */ 943 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev) 944 { 945 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL); 946 } 947 948 /** 949 * amdgpu_device_program_register_sequence - program an array of registers. 950 * 951 * @adev: amdgpu_device pointer 952 * @registers: pointer to the register array 953 * @array_size: size of the register array 954 * 955 * Programs an array or registers with and and or masks. 956 * This is a helper for setting golden registers. 957 */ 958 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 959 const u32 *registers, 960 const u32 array_size) 961 { 962 u32 tmp, reg, and_mask, or_mask; 963 int i; 964 965 if (array_size % 3) 966 return; 967 968 for (i = 0; i < array_size; i +=3) { 969 reg = registers[i + 0]; 970 and_mask = registers[i + 1]; 971 or_mask = registers[i + 2]; 972 973 if (and_mask == 0xffffffff) { 974 tmp = or_mask; 975 } else { 976 tmp = RREG32(reg); 977 tmp &= ~and_mask; 978 if (adev->family >= AMDGPU_FAMILY_AI) 979 tmp |= (or_mask & and_mask); 980 else 981 tmp |= or_mask; 982 } 983 WREG32(reg, tmp); 984 } 985 } 986 987 /** 988 * amdgpu_device_pci_config_reset - reset the GPU 989 * 990 * @adev: amdgpu_device pointer 991 * 992 * Resets the GPU using the pci config reset sequence. 993 * Only applicable to asics prior to vega10. 994 */ 995 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 996 { 997 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 998 } 999 1000 /** 1001 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 1002 * 1003 * @adev: amdgpu_device pointer 1004 * 1005 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 1006 */ 1007 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1008 { 1009 return pci_reset_function(adev->pdev); 1010 } 1011 1012 /* 1013 * GPU doorbell aperture helpers function. 1014 */ 1015 /** 1016 * amdgpu_device_doorbell_init - Init doorbell driver information. 
1017 * 1018 * @adev: amdgpu_device pointer 1019 * 1020 * Init doorbell driver information (CIK) 1021 * Returns 0 on success, error on failure. 1022 */ 1023 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) 1024 { 1025 1026 /* No doorbell on SI hardware generation */ 1027 if (adev->asic_type < CHIP_BONAIRE) { 1028 adev->doorbell.base = 0; 1029 adev->doorbell.size = 0; 1030 adev->doorbell.num_doorbells = 0; 1031 adev->doorbell.ptr = NULL; 1032 return 0; 1033 } 1034 1035 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) 1036 return -EINVAL; 1037 1038 amdgpu_asic_init_doorbell_index(adev); 1039 1040 /* doorbell bar mapping */ 1041 adev->doorbell.base = pci_resource_start(adev->pdev, 2); 1042 adev->doorbell.size = pci_resource_len(adev->pdev, 2); 1043 1044 adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32), 1045 adev->doorbell_index.max_assignment+1); 1046 if (adev->doorbell.num_doorbells == 0) 1047 return -EINVAL; 1048 1049 /* For Vega, reserve and map two pages on doorbell BAR since SDMA 1050 * paging queue doorbell use the second page. The 1051 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the 1052 * doorbells are in the first page. So with paging queue enabled, 1053 * the max num_doorbells should + 1 page (0x400 in dword) 1054 */ 1055 if (adev->asic_type >= CHIP_VEGA10) 1056 adev->doorbell.num_doorbells += 0x400; 1057 1058 adev->doorbell.ptr = ioremap(adev->doorbell.base, 1059 adev->doorbell.num_doorbells * 1060 sizeof(u32)); 1061 if (adev->doorbell.ptr == NULL) 1062 return -ENOMEM; 1063 1064 return 0; 1065 } 1066 1067 /** 1068 * amdgpu_device_doorbell_fini - Tear down doorbell driver information. 
1069 * 1070 * @adev: amdgpu_device pointer 1071 * 1072 * Tear down doorbell driver information (CIK) 1073 */ 1074 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) 1075 { 1076 iounmap(adev->doorbell.ptr); 1077 adev->doorbell.ptr = NULL; 1078 } 1079 1080 1081 1082 /* 1083 * amdgpu_device_wb_*() 1084 * Writeback is the method by which the GPU updates special pages in memory 1085 * with the status of certain GPU events (fences, ring pointers,etc.). 1086 */ 1087 1088 /** 1089 * amdgpu_device_wb_fini - Disable Writeback and free memory 1090 * 1091 * @adev: amdgpu_device pointer 1092 * 1093 * Disables Writeback and frees the Writeback memory (all asics). 1094 * Used at driver shutdown. 1095 */ 1096 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1097 { 1098 if (adev->wb.wb_obj) { 1099 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1100 &adev->wb.gpu_addr, 1101 (void **)&adev->wb.wb); 1102 adev->wb.wb_obj = NULL; 1103 } 1104 } 1105 1106 /** 1107 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1108 * 1109 * @adev: amdgpu_device pointer 1110 * 1111 * Initializes writeback and allocates writeback memory (all asics). 1112 * Used at driver startup. 1113 * Returns 0 on success or an -error on failure. 
1114 */ 1115 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1116 { 1117 int r; 1118 1119 if (adev->wb.wb_obj == NULL) { 1120 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1121 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1122 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1123 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1124 (void **)&adev->wb.wb); 1125 if (r) { 1126 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1127 return r; 1128 } 1129 1130 adev->wb.num_wb = AMDGPU_MAX_WB; 1131 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1132 1133 /* clear wb memory */ 1134 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1135 } 1136 1137 return 0; 1138 } 1139 1140 /** 1141 * amdgpu_device_wb_get - Allocate a wb entry 1142 * 1143 * @adev: amdgpu_device pointer 1144 * @wb: wb index 1145 * 1146 * Allocate a wb slot for use by the driver (all asics). 1147 * Returns 0 on success or -EINVAL on failure. 1148 */ 1149 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1150 { 1151 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1152 1153 if (offset < adev->wb.num_wb) { 1154 __set_bit(offset, adev->wb.used); 1155 *wb = offset << 3; /* convert to dw offset */ 1156 return 0; 1157 } else { 1158 return -EINVAL; 1159 } 1160 } 1161 1162 /** 1163 * amdgpu_device_wb_free - Free a wb entry 1164 * 1165 * @adev: amdgpu_device pointer 1166 * @wb: wb index 1167 * 1168 * Free a wb slot allocated for use by the driver (all asics) 1169 */ 1170 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1171 { 1172 wb >>= 3; 1173 if (wb < adev->wb.num_wb) 1174 __clear_bit(wb, adev->wb.used); 1175 } 1176 1177 /** 1178 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1179 * 1180 * @adev: amdgpu_device pointer 1181 * 1182 * Try to resize FB BAR to make all VRAM CPU accessible. 
 * We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 *
 * Returns 0 when nothing had to be done, when resizing is not applicable, or
 * when the BARs are usable afterwards (even if the resize itself failed);
 * returns -ENODEV when the doorbell or FB BAR is unavailable afterwards.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	/* Requested BAR size, encoded the way the resizable BAR API expects */
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned i;
	u16 cmd;
	int r;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	/* Stop at the first memory window that starts above 4GB */
	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_device_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	/* A failed resize is only logged; the BARs are reassigned regardless */
	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 *
	 * Note that on this error path the saved PCI_COMMAND value is not
	 * written back, so memory decoding stays disabled.
	 */
	r = amdgpu_device_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	/* Restore the original command word, re-enabling memory decoding */
	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if need or false if not.
1266 */ 1267 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1268 { 1269 uint32_t reg; 1270 1271 if (amdgpu_sriov_vf(adev)) 1272 return false; 1273 1274 if (amdgpu_passthrough(adev)) { 1275 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1276 * some old smc fw still need driver do vPost otherwise gpu hang, while 1277 * those smc fw version above 22.15 doesn't have this flaw, so we force 1278 * vpost executed for smc version below 22.15 1279 */ 1280 if (adev->asic_type == CHIP_FIJI) { 1281 int err; 1282 uint32_t fw_ver; 1283 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1284 /* force vPost if error occured */ 1285 if (err) 1286 return true; 1287 1288 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1289 if (fw_ver < 0x00160e00) 1290 return true; 1291 } 1292 } 1293 1294 /* Don't post if we need to reset whole hive on init */ 1295 if (adev->gmc.xgmi.pending_reset) 1296 return false; 1297 1298 if (adev->has_hw_reset) { 1299 adev->has_hw_reset = false; 1300 return true; 1301 } 1302 1303 /* bios scratch used on CIK+ */ 1304 if (adev->asic_type >= CHIP_BONAIRE) 1305 return amdgpu_atombios_scratch_need_asic_init(adev); 1306 1307 /* check MEM_SIZE for older asics */ 1308 reg = amdgpu_asic_get_config_memsize(adev); 1309 1310 if ((reg != 0) && (reg != 0xffffffff)) 1311 return false; 1312 1313 return true; 1314 } 1315 1316 /** 1317 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1318 * 1319 * @adev: amdgpu_device pointer 1320 * 1321 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1322 * be set for this device. 1323 * 1324 * Returns true if it should be used or false if not. 
1325 */ 1326 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1327 { 1328 switch (amdgpu_aspm) { 1329 case -1: 1330 break; 1331 case 0: 1332 return false; 1333 case 1: 1334 return true; 1335 default: 1336 return false; 1337 } 1338 return pcie_aspm_enabled(adev->pdev); 1339 } 1340 1341 /* if we get transitioned to only one device, take VGA back */ 1342 /** 1343 * amdgpu_device_vga_set_decode - enable/disable vga decode 1344 * 1345 * @pdev: PCI device pointer 1346 * @state: enable/disable vga decode 1347 * 1348 * Enable/disable vga decode (all asics). 1349 * Returns VGA resource flags. 1350 */ 1351 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1352 bool state) 1353 { 1354 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1355 amdgpu_asic_set_vga_state(adev, state); 1356 if (state) 1357 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1358 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1359 else 1360 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1361 } 1362 1363 /** 1364 * amdgpu_device_check_block_size - validate the vm block size 1365 * 1366 * @adev: amdgpu_device pointer 1367 * 1368 * Validates the vm block size specified via module parameter. 1369 * The vm block size defines number of bits in page table versus page directory, 1370 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1371 * page table and the remaining bits are in the page directory. 
1372 */ 1373 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1374 { 1375 /* defines number of bits in page table versus page directory, 1376 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1377 * page table and the remaining bits are in the page directory */ 1378 if (amdgpu_vm_block_size == -1) 1379 return; 1380 1381 if (amdgpu_vm_block_size < 9) { 1382 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1383 amdgpu_vm_block_size); 1384 amdgpu_vm_block_size = -1; 1385 } 1386 } 1387 1388 /** 1389 * amdgpu_device_check_vm_size - validate the vm size 1390 * 1391 * @adev: amdgpu_device pointer 1392 * 1393 * Validates the vm size in GB specified via module parameter. 1394 * The VM size is the size of the GPU virtual memory space in GB. 1395 */ 1396 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1397 { 1398 /* no need to check the default value */ 1399 if (amdgpu_vm_size == -1) 1400 return; 1401 1402 if (amdgpu_vm_size < 1) { 1403 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1404 amdgpu_vm_size); 1405 amdgpu_vm_size = -1; 1406 } 1407 } 1408 1409 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1410 { 1411 struct sysinfo si; 1412 bool is_os_64 = (sizeof(void *) == 8); 1413 uint64_t total_memory; 1414 uint64_t dram_size_seven_GB = 0x1B8000000; 1415 uint64_t dram_size_three_GB = 0xB8000000; 1416 1417 if (amdgpu_smu_memory_pool_size == 0) 1418 return; 1419 1420 if (!is_os_64) { 1421 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1422 goto def_value; 1423 } 1424 si_meminfo(&si); 1425 total_memory = (uint64_t)si.totalram * si.mem_unit; 1426 1427 if ((amdgpu_smu_memory_pool_size == 1) || 1428 (amdgpu_smu_memory_pool_size == 2)) { 1429 if (total_memory < dram_size_three_GB) 1430 goto def_value1; 1431 } else if ((amdgpu_smu_memory_pool_size == 4) || 1432 (amdgpu_smu_memory_pool_size == 8)) { 1433 if (total_memory < dram_size_seven_GB) 1434 goto def_value1; 1435 } 
else { 1436 DRM_WARN("Smu memory pool size not supported\n"); 1437 goto def_value; 1438 } 1439 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1440 1441 return; 1442 1443 def_value1: 1444 DRM_WARN("No enough system memory\n"); 1445 def_value: 1446 adev->pm.smu_prv_buffer_size = 0; 1447 } 1448 1449 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1450 { 1451 if (!(adev->flags & AMD_IS_APU) || 1452 adev->asic_type < CHIP_RAVEN) 1453 return 0; 1454 1455 switch (adev->asic_type) { 1456 case CHIP_RAVEN: 1457 if (adev->pdev->device == 0x15dd) 1458 adev->apu_flags |= AMD_APU_IS_RAVEN; 1459 if (adev->pdev->device == 0x15d8) 1460 adev->apu_flags |= AMD_APU_IS_PICASSO; 1461 break; 1462 case CHIP_RENOIR: 1463 if ((adev->pdev->device == 0x1636) || 1464 (adev->pdev->device == 0x164c)) 1465 adev->apu_flags |= AMD_APU_IS_RENOIR; 1466 else 1467 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1468 break; 1469 case CHIP_VANGOGH: 1470 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1471 break; 1472 case CHIP_YELLOW_CARP: 1473 break; 1474 case CHIP_CYAN_SKILLFISH: 1475 if ((adev->pdev->device == 0x13FE) || 1476 (adev->pdev->device == 0x143F)) 1477 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1478 break; 1479 default: 1480 break; 1481 } 1482 1483 return 0; 1484 } 1485 1486 /** 1487 * amdgpu_device_check_arguments - validate module params 1488 * 1489 * @adev: amdgpu_device pointer 1490 * 1491 * Validates certain module parameters and updates 1492 * the associated values used by the driver (all asics). 
1493 */ 1494 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1495 { 1496 if (amdgpu_sched_jobs < 4) { 1497 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1498 amdgpu_sched_jobs); 1499 amdgpu_sched_jobs = 4; 1500 } else if (!is_power_of_2(amdgpu_sched_jobs)){ 1501 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1502 amdgpu_sched_jobs); 1503 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1504 } 1505 1506 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1507 /* gart size must be greater or equal to 32M */ 1508 dev_warn(adev->dev, "gart size (%d) too small\n", 1509 amdgpu_gart_size); 1510 amdgpu_gart_size = -1; 1511 } 1512 1513 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1514 /* gtt size must be greater or equal to 32M */ 1515 dev_warn(adev->dev, "gtt size (%d) too small\n", 1516 amdgpu_gtt_size); 1517 amdgpu_gtt_size = -1; 1518 } 1519 1520 /* valid range is between 4 and 9 inclusive */ 1521 if (amdgpu_vm_fragment_size != -1 && 1522 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1523 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1524 amdgpu_vm_fragment_size = -1; 1525 } 1526 1527 if (amdgpu_sched_hw_submission < 2) { 1528 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1529 amdgpu_sched_hw_submission); 1530 amdgpu_sched_hw_submission = 2; 1531 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1532 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1533 amdgpu_sched_hw_submission); 1534 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1535 } 1536 1537 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 1538 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 1539 amdgpu_reset_method = -1; 1540 } 1541 1542 amdgpu_device_check_smu_prv_buffer_size(adev); 1543 1544 amdgpu_device_check_vm_size(adev); 1545 1546 amdgpu_device_check_block_size(adev); 1547 
1548 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1549 1550 amdgpu_gmc_tmz_set(adev); 1551 1552 1553 return 0; 1554 } 1555 1556 /** 1557 * amdgpu_switcheroo_set_state - set switcheroo state 1558 * 1559 * @pdev: pci dev pointer 1560 * @state: vga_switcheroo state 1561 * 1562 * Callback for the switcheroo driver. Suspends or resumes the 1563 * the asics before or after it is powered up using ACPI methods. 1564 */ 1565 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1566 enum vga_switcheroo_state state) 1567 { 1568 struct drm_device *dev = pci_get_drvdata(pdev); 1569 int r; 1570 1571 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1572 return; 1573 1574 if (state == VGA_SWITCHEROO_ON) { 1575 pr_info("switched on\n"); 1576 /* don't suspend or resume card normally */ 1577 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1578 1579 pci_set_power_state(pdev, PCI_D0); 1580 amdgpu_device_load_pci_state(pdev); 1581 r = pci_enable_device(pdev); 1582 if (r) 1583 DRM_WARN("pci_enable_device failed (%d)\n", r); 1584 amdgpu_device_resume(dev, true); 1585 1586 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1587 } else { 1588 pr_info("switched off\n"); 1589 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1590 amdgpu_device_suspend(dev, true); 1591 amdgpu_device_cache_pci_state(pdev); 1592 /* Shut down the device */ 1593 pci_disable_device(pdev); 1594 pci_set_power_state(pdev, PCI_D3cold); 1595 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1596 } 1597 } 1598 1599 /** 1600 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1601 * 1602 * @pdev: pci dev pointer 1603 * 1604 * Callback for the switcheroo driver. Check of the switcheroo 1605 * state can be changed. 1606 * Returns true if the state can be changed, false if not. 
1607 */ 1608 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1609 { 1610 struct drm_device *dev = pci_get_drvdata(pdev); 1611 1612 /* 1613 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1614 * locking inversion with the driver load path. And the access here is 1615 * completely racy anyway. So don't bother with locking for now. 1616 */ 1617 return atomic_read(&dev->open_count) == 0; 1618 } 1619 1620 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1621 .set_gpu_state = amdgpu_switcheroo_set_state, 1622 .reprobe = NULL, 1623 .can_switch = amdgpu_switcheroo_can_switch, 1624 }; 1625 1626 /** 1627 * amdgpu_device_ip_set_clockgating_state - set the CG state 1628 * 1629 * @dev: amdgpu_device pointer 1630 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1631 * @state: clockgating state (gate or ungate) 1632 * 1633 * Sets the requested clockgating state for all instances of 1634 * the hardware IP specified. 1635 * Returns the error code from the last instance. 1636 */ 1637 int amdgpu_device_ip_set_clockgating_state(void *dev, 1638 enum amd_ip_block_type block_type, 1639 enum amd_clockgating_state state) 1640 { 1641 struct amdgpu_device *adev = dev; 1642 int i, r = 0; 1643 1644 for (i = 0; i < adev->num_ip_blocks; i++) { 1645 if (!adev->ip_blocks[i].status.valid) 1646 continue; 1647 if (adev->ip_blocks[i].version->type != block_type) 1648 continue; 1649 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1650 continue; 1651 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1652 (void *)adev, state); 1653 if (r) 1654 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1655 adev->ip_blocks[i].version->funcs->name, r); 1656 } 1657 return r; 1658 } 1659 1660 /** 1661 * amdgpu_device_ip_set_powergating_state - set the PG state 1662 * 1663 * @dev: amdgpu_device pointer 1664 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 
1665 * @state: powergating state (gate or ungate) 1666 * 1667 * Sets the requested powergating state for all instances of 1668 * the hardware IP specified. 1669 * Returns the error code from the last instance. 1670 */ 1671 int amdgpu_device_ip_set_powergating_state(void *dev, 1672 enum amd_ip_block_type block_type, 1673 enum amd_powergating_state state) 1674 { 1675 struct amdgpu_device *adev = dev; 1676 int i, r = 0; 1677 1678 for (i = 0; i < adev->num_ip_blocks; i++) { 1679 if (!adev->ip_blocks[i].status.valid) 1680 continue; 1681 if (adev->ip_blocks[i].version->type != block_type) 1682 continue; 1683 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 1684 continue; 1685 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 1686 (void *)adev, state); 1687 if (r) 1688 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 1689 adev->ip_blocks[i].version->funcs->name, r); 1690 } 1691 return r; 1692 } 1693 1694 /** 1695 * amdgpu_device_ip_get_clockgating_state - get the CG state 1696 * 1697 * @adev: amdgpu_device pointer 1698 * @flags: clockgating feature flags 1699 * 1700 * Walks the list of IPs on the device and updates the clockgating 1701 * flags for each IP. 1702 * Updates @flags with the feature flags for each hardware IP where 1703 * clockgating is enabled. 1704 */ 1705 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 1706 u64 *flags) 1707 { 1708 int i; 1709 1710 for (i = 0; i < adev->num_ip_blocks; i++) { 1711 if (!adev->ip_blocks[i].status.valid) 1712 continue; 1713 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 1714 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 1715 } 1716 } 1717 1718 /** 1719 * amdgpu_device_ip_wait_for_idle - wait for idle 1720 * 1721 * @adev: amdgpu_device pointer 1722 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1723 * 1724 * Waits for the request hardware IP to be idle. 
1725 * Returns 0 for success or a negative error code on failure. 1726 */ 1727 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 1728 enum amd_ip_block_type block_type) 1729 { 1730 int i, r; 1731 1732 for (i = 0; i < adev->num_ip_blocks; i++) { 1733 if (!adev->ip_blocks[i].status.valid) 1734 continue; 1735 if (adev->ip_blocks[i].version->type == block_type) { 1736 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 1737 if (r) 1738 return r; 1739 break; 1740 } 1741 } 1742 return 0; 1743 1744 } 1745 1746 /** 1747 * amdgpu_device_ip_is_idle - is the hardware IP idle 1748 * 1749 * @adev: amdgpu_device pointer 1750 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1751 * 1752 * Check if the hardware IP is idle or not. 1753 * Returns true if it the IP is idle, false if not. 1754 */ 1755 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 1756 enum amd_ip_block_type block_type) 1757 { 1758 int i; 1759 1760 for (i = 0; i < adev->num_ip_blocks; i++) { 1761 if (!adev->ip_blocks[i].status.valid) 1762 continue; 1763 if (adev->ip_blocks[i].version->type == block_type) 1764 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 1765 } 1766 return true; 1767 1768 } 1769 1770 /** 1771 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 1772 * 1773 * @adev: amdgpu_device pointer 1774 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 1775 * 1776 * Returns a pointer to the hardware IP block structure 1777 * if it exists for the asic, otherwise NULL. 
1778 */ 1779 struct amdgpu_ip_block * 1780 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 1781 enum amd_ip_block_type type) 1782 { 1783 int i; 1784 1785 for (i = 0; i < adev->num_ip_blocks; i++) 1786 if (adev->ip_blocks[i].version->type == type) 1787 return &adev->ip_blocks[i]; 1788 1789 return NULL; 1790 } 1791 1792 /** 1793 * amdgpu_device_ip_block_version_cmp 1794 * 1795 * @adev: amdgpu_device pointer 1796 * @type: enum amd_ip_block_type 1797 * @major: major version 1798 * @minor: minor version 1799 * 1800 * return 0 if equal or greater 1801 * return 1 if smaller or the ip_block doesn't exist 1802 */ 1803 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 1804 enum amd_ip_block_type type, 1805 u32 major, u32 minor) 1806 { 1807 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 1808 1809 if (ip_block && ((ip_block->version->major > major) || 1810 ((ip_block->version->major == major) && 1811 (ip_block->version->minor >= minor)))) 1812 return 0; 1813 1814 return 1; 1815 } 1816 1817 /** 1818 * amdgpu_device_ip_block_add 1819 * 1820 * @adev: amdgpu_device pointer 1821 * @ip_block_version: pointer to the IP to add 1822 * 1823 * Adds the IP block driver information to the collection of IPs 1824 * on the asic. 
1825 */ 1826 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 1827 const struct amdgpu_ip_block_version *ip_block_version) 1828 { 1829 if (!ip_block_version) 1830 return -EINVAL; 1831 1832 switch (ip_block_version->type) { 1833 case AMD_IP_BLOCK_TYPE_VCN: 1834 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 1835 return 0; 1836 break; 1837 case AMD_IP_BLOCK_TYPE_JPEG: 1838 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 1839 return 0; 1840 break; 1841 default: 1842 break; 1843 } 1844 1845 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 1846 ip_block_version->funcs->name); 1847 1848 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 1849 1850 return 0; 1851 } 1852 1853 /** 1854 * amdgpu_device_enable_virtual_display - enable virtual display feature 1855 * 1856 * @adev: amdgpu_device pointer 1857 * 1858 * Enabled the virtual display feature if the user has enabled it via 1859 * the module parameter virtual_display. This feature provides a virtual 1860 * display hardware on headless boards or in virtualized environments. 1861 * This function parses and validates the configuration string specified by 1862 * the user and configues the virtual display configuration (number of 1863 * virtual connectors, crtcs, etc.) specified. 
1864 */ 1865 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 1866 { 1867 adev->enable_virtual_display = false; 1868 1869 if (amdgpu_virtual_display) { 1870 const char *pci_address_name = pci_name(adev->pdev); 1871 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 1872 1873 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 1874 pciaddstr_tmp = pciaddstr; 1875 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 1876 pciaddname = strsep(&pciaddname_tmp, ","); 1877 if (!strcmp("all", pciaddname) 1878 || !strcmp(pci_address_name, pciaddname)) { 1879 long num_crtc; 1880 int res = -1; 1881 1882 adev->enable_virtual_display = true; 1883 1884 if (pciaddname_tmp) 1885 res = kstrtol(pciaddname_tmp, 10, 1886 &num_crtc); 1887 1888 if (!res) { 1889 if (num_crtc < 1) 1890 num_crtc = 1; 1891 if (num_crtc > 6) 1892 num_crtc = 6; 1893 adev->mode_info.num_crtc = num_crtc; 1894 } else { 1895 adev->mode_info.num_crtc = 1; 1896 } 1897 break; 1898 } 1899 } 1900 1901 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 1902 amdgpu_virtual_display, pci_address_name, 1903 adev->enable_virtual_display, adev->mode_info.num_crtc); 1904 1905 kfree(pciaddstr); 1906 } 1907 } 1908 1909 /** 1910 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 1911 * 1912 * @adev: amdgpu_device pointer 1913 * 1914 * Parses the asic configuration parameters specified in the gpu info 1915 * firmware and makes them availale to the driver for use in configuring 1916 * the asic. 1917 * Returns 0 on success, -EINVAL on failure. 
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	char fw_name[40];
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	/* With a discovery binary the gfx info comes from there instead */
	if (adev->mman.discovery_bin) {
		amdgpu_discovery_get_gfx_info(adev);

		/*
		 * FIXME: The bounding box is still needed by Navi12, so
		 * temporarily read it from gpu_info firmware. Should be dropped
		 * when DAL no longer needs it.
		 */
		if (adev->asic_type != CHIP_NAVI12)
			return 0;
	}

	/* Only the asics named after the default label use gpu_info firmware */
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_VERDE:
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_OLAND:
	case CHIP_HAINAN:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TOPAZ:
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
	case CHIP_VEGA20:
	case CHIP_ALDEBARAN:
	case CHIP_SIENNA_CICHLID:
	case CHIP_NAVY_FLOUNDER:
	case CHIP_DIMGREY_CAVEFISH:
	case CHIP_BEIGE_GOBY:
	default:
		return 0;
	case CHIP_VEGA10:
		chip_name = "vega10";
		break;
	case CHIP_VEGA12:
		chip_name = "vega12";
		break;
	case CHIP_RAVEN:
		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
			chip_name = "raven2";
		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
			chip_name = "picasso";
		else
			chip_name = "raven";
		break;
	case CHIP_ARCTURUS:
		chip_name = "arcturus";
		break;
	case CHIP_NAVI12:
		chip_name = "navi12";
		break;
	}

	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
	err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
	if (err) {
		dev_err(adev->dev,
			"Failed to load gpu_info firmware \"%s\"\n",
			fw_name);
		goto out;
	}
	err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
	if (err) {
		dev_err(adev->dev,
			"Failed to validate gpu_info firmware \"%s\"\n",
			fw_name);
		goto out;
	}

	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);

	switch (hdr->version_major) {
	case 1:
	{
		/* The payload sits at ucode_array_offset_bytes into the image */
		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
								le32_to_cpu(hdr->header.ucode_array_offset_bytes));

		/*
		 * Navi12 only needs the soc bounding box from this firmware.
		 * Should be dropped when DAL no longer needs it.
		 */
		if (adev->asic_type == CHIP_NAVI12)
			goto parse_soc_bounding_box;

		/* Copy the little-endian gfx configuration into adev */
		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
		adev->gfx.config.max_texture_channel_caches =
			le32_to_cpu(gpu_info_fw->gc_num_tccs);
		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
		adev->gfx.config.double_offchip_lds_buf =
			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
		adev->gfx.cu_info.max_waves_per_simd =
			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
		adev->gfx.cu_info.max_scratch_slots_per_cu =
			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
		/* Minor version 1 and later carry two additional fields */
		if (hdr->version_minor >= 1) {
			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->gfx.config.num_sc_per_sh =
				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
			adev->gfx.config.num_packer_per_sc =
				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
		}

parse_soc_bounding_box:
		/*
		 * soc bounding box info is not integrated in discovery table,
		 * we always need to parse it from gpu info firmware if needed.
		 */
		if (hdr->version_minor == 2) {
			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			/* Points into the firmware image, which is not released here */
			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
		}
		break;
	}
	default:
		dev_err(adev->dev,
			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
		err = -EINVAL;
		goto out;
	}
out:
	return err;
}

/**
 * amdgpu_device_ip_early_init - run early init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Early initialization pass for hardware IPs. The hardware IPs that make
 * up each asic are discovered each IP's early_init callback is run. This
 * is the first stage in initializing the asic.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
{
	struct drm_device *dev = adev_to_drm(adev);
	struct pci_dev *parent;
	int i, r;

	amdgpu_device_enable_virtual_display(adev);

	if (amdgpu_sriov_vf(adev)) {
		r = amdgpu_virt_request_full_gpu(adev, true);
		if (r)
			return r;
	}

	/*
	 * Pick the family and register the IP blocks. SI, CIK and VI parts
	 * use per-generation tables, everything else goes through the
	 * discovery code.
	 */
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_VERDE:
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_OLAND:
	case CHIP_HAINAN:
		adev->family = AMDGPU_FAMILY_SI;
		r = si_set_ip_blocks(adev);
		if (r)
			return r;
		break;
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
		if (adev->flags & AMD_IS_APU)
			adev->family = AMDGPU_FAMILY_KV;
		else
			adev->family = AMDGPU_FAMILY_CI;

		r = cik_set_ip_blocks(adev);
		if (r)
			return r;
		break;
#endif
	case CHIP_TOPAZ:
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		if (adev->flags & AMD_IS_APU)
			adev->family = AMDGPU_FAMILY_CZ;
		else
			adev->family = AMDGPU_FAMILY_VI;

		r = vi_set_ip_blocks(adev);
		if (r)
			return r;
		break;
	default:
		r = amdgpu_discovery_set_ip_blocks(adev);
		if (r)
			return r;
		break;
	}

	/*
	 * Flag the device as PX when ATPX is present (hybrid or with dGPU
	 * power control), the device is not an APU and it is not attached
	 * via Thunderbolt.
	 */
	if (amdgpu_has_atpx() &&
	    (amdgpu_is_atpx_hybrid() ||
	     amdgpu_has_atpx_dgpu_power_cntl()) &&
	    ((adev->flags & AMD_IS_APU) == 0) &&
	    !pci_is_thunderbolt_attached(to_pci_dev(dev->dev)))
		adev->flags |= AMD_IS_PX;

	/* For dGPUs, record whether the upstream bridge reports _PR3 */
	if (!(adev->flags & AMD_IS_APU)) {
		parent = pci_upstream_bridge(adev->pdev);
		adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
	}

	amdgpu_amdkfd_device_probe(adev);

	/* Start from the module parameter, then mask off unsupported features */
	adev->pm.pp_feature = amdgpu_pp_feature_mask;
	if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
		adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
	if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
		adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		/* Bit i of the ip_block_mask parameter enables block i */
		if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
			DRM_ERROR("disabled ip block: %d <%s>\n",
				  i, adev->ip_blocks[i].version->funcs->name);
			adev->ip_blocks[i].status.valid = false;
		} else {
			if (adev->ip_blocks[i].version->funcs->early_init) {
				r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
				/* -ENOENT only invalidates the block, other errors abort */
				if (r == -ENOENT) {
					adev->ip_blocks[i].status.valid = false;
				} else if (r) {
					DRM_ERROR("early_init of IP block <%s> failed %d\n",
						  adev->ip_blocks[i].version->funcs->name, r);
					return r;
				} else {
					adev->ip_blocks[i].status.valid = true;
				}
			} else {
				/* No early_init callback: the block is valid as is */
				adev->ip_blocks[i].status.valid = true;
			}
		}
		/* get the vbios after the asic_funcs are set up */
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
			r = amdgpu_device_parse_gpu_info_fw(adev);
			if (r)
				return r;

			/* Read BIOS */
			if (!amdgpu_get_bios(adev))
				return -EINVAL;

			r = amdgpu_atombios_init(adev);
			if (r) {
				dev_err(adev->dev, "amdgpu_atombios_init failed\n");
				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
				return r;
			}

			/* get pf2vf msg info at its earliest time */
			if (amdgpu_sriov_vf(adev))
				amdgpu_virt_init_data_exchange(adev);

		}
	}

	/* Restrict clock/power gating to what the module parameters allow */
	adev->cg_flags &= amdgpu_cg_mask;
	adev->pg_flags &= amdgpu_pg_mask;

	return 0;
}

static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
{
	int i, r;

	for (i
= 0; i < adev->num_ip_blocks; i++) { 2231 if (!adev->ip_blocks[i].status.sw) 2232 continue; 2233 if (adev->ip_blocks[i].status.hw) 2234 continue; 2235 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2236 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2237 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2238 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2239 if (r) { 2240 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2241 adev->ip_blocks[i].version->funcs->name, r); 2242 return r; 2243 } 2244 adev->ip_blocks[i].status.hw = true; 2245 } 2246 } 2247 2248 return 0; 2249 } 2250 2251 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2252 { 2253 int i, r; 2254 2255 for (i = 0; i < adev->num_ip_blocks; i++) { 2256 if (!adev->ip_blocks[i].status.sw) 2257 continue; 2258 if (adev->ip_blocks[i].status.hw) 2259 continue; 2260 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2261 if (r) { 2262 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2263 adev->ip_blocks[i].version->funcs->name, r); 2264 return r; 2265 } 2266 adev->ip_blocks[i].status.hw = true; 2267 } 2268 2269 return 0; 2270 } 2271 2272 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2273 { 2274 int r = 0; 2275 int i; 2276 uint32_t smu_version; 2277 2278 if (adev->asic_type >= CHIP_VEGA10) { 2279 for (i = 0; i < adev->num_ip_blocks; i++) { 2280 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2281 continue; 2282 2283 if (!adev->ip_blocks[i].status.sw) 2284 continue; 2285 2286 /* no need to do the fw loading again if already done*/ 2287 if (adev->ip_blocks[i].status.hw == true) 2288 break; 2289 2290 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2291 r = adev->ip_blocks[i].version->funcs->resume(adev); 2292 if (r) { 2293 DRM_ERROR("resume of IP block <%s> failed %d\n", 2294 adev->ip_blocks[i].version->funcs->name, r); 2295 return r; 2296 } 2297 } else { 2298 r = 
adev->ip_blocks[i].version->funcs->hw_init(adev); 2299 if (r) { 2300 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2301 adev->ip_blocks[i].version->funcs->name, r); 2302 return r; 2303 } 2304 } 2305 2306 adev->ip_blocks[i].status.hw = true; 2307 break; 2308 } 2309 } 2310 2311 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2312 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2313 2314 return r; 2315 } 2316 2317 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2318 { 2319 long timeout; 2320 int r, i; 2321 2322 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2323 struct amdgpu_ring *ring = adev->rings[i]; 2324 2325 /* No need to setup the GPU scheduler for rings that don't need it */ 2326 if (!ring || ring->no_scheduler) 2327 continue; 2328 2329 switch (ring->funcs->type) { 2330 case AMDGPU_RING_TYPE_GFX: 2331 timeout = adev->gfx_timeout; 2332 break; 2333 case AMDGPU_RING_TYPE_COMPUTE: 2334 timeout = adev->compute_timeout; 2335 break; 2336 case AMDGPU_RING_TYPE_SDMA: 2337 timeout = adev->sdma_timeout; 2338 break; 2339 default: 2340 timeout = adev->video_timeout; 2341 break; 2342 } 2343 2344 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2345 ring->num_hw_submission, amdgpu_job_hang_limit, 2346 timeout, adev->reset_domain->wq, 2347 ring->sched_score, ring->name, 2348 adev->dev); 2349 if (r) { 2350 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2351 ring->name); 2352 return r; 2353 } 2354 } 2355 2356 return 0; 2357 } 2358 2359 2360 /** 2361 * amdgpu_device_ip_init - run init for hardware IPs 2362 * 2363 * @adev: amdgpu_device pointer 2364 * 2365 * Main initialization pass for hardware IPs. The list of all the hardware 2366 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2367 * are run. sw_init initializes the software state associated with each IP 2368 * and hw_init initializes the hardware associated with each IP. 2369 * Returns 0 on success, negative error code on failure. 
2370 */ 2371 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2372 { 2373 int i, r; 2374 2375 r = amdgpu_ras_init(adev); 2376 if (r) 2377 return r; 2378 2379 for (i = 0; i < adev->num_ip_blocks; i++) { 2380 if (!adev->ip_blocks[i].status.valid) 2381 continue; 2382 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2383 if (r) { 2384 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2385 adev->ip_blocks[i].version->funcs->name, r); 2386 goto init_failed; 2387 } 2388 adev->ip_blocks[i].status.sw = true; 2389 2390 /* need to do gmc hw init early so we can allocate gpu mem */ 2391 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2392 /* Try to reserve bad pages early */ 2393 if (amdgpu_sriov_vf(adev)) 2394 amdgpu_virt_exchange_data(adev); 2395 2396 r = amdgpu_device_vram_scratch_init(adev); 2397 if (r) { 2398 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2399 goto init_failed; 2400 } 2401 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2402 if (r) { 2403 DRM_ERROR("hw_init %d failed %d\n", i, r); 2404 goto init_failed; 2405 } 2406 r = amdgpu_device_wb_init(adev); 2407 if (r) { 2408 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2409 goto init_failed; 2410 } 2411 adev->ip_blocks[i].status.hw = true; 2412 2413 /* right after GMC hw init, we create CSA */ 2414 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2415 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2416 AMDGPU_GEM_DOMAIN_VRAM, 2417 AMDGPU_CSA_SIZE); 2418 if (r) { 2419 DRM_ERROR("allocate CSA failed %d\n", r); 2420 goto init_failed; 2421 } 2422 } 2423 } 2424 } 2425 2426 if (amdgpu_sriov_vf(adev)) 2427 amdgpu_virt_init_data_exchange(adev); 2428 2429 r = amdgpu_ib_pool_init(adev); 2430 if (r) { 2431 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2432 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2433 goto init_failed; 2434 } 2435 2436 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2437 
if (r) 2438 goto init_failed; 2439 2440 r = amdgpu_device_ip_hw_init_phase1(adev); 2441 if (r) 2442 goto init_failed; 2443 2444 r = amdgpu_device_fw_loading(adev); 2445 if (r) 2446 goto init_failed; 2447 2448 r = amdgpu_device_ip_hw_init_phase2(adev); 2449 if (r) 2450 goto init_failed; 2451 2452 /* 2453 * retired pages will be loaded from eeprom and reserved here, 2454 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2455 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2456 * for I2C communication which only true at this point. 2457 * 2458 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2459 * failure from bad gpu situation and stop amdgpu init process 2460 * accordingly. For other failed cases, it will still release all 2461 * the resource and print error message, rather than returning one 2462 * negative value to upper level. 2463 * 2464 * Note: theoretically, this should be called before all vram allocations 2465 * to protect retired page from abusing 2466 */ 2467 r = amdgpu_ras_recovery_init(adev); 2468 if (r) 2469 goto init_failed; 2470 2471 /** 2472 * In case of XGMI grab extra reference for reset domain for this device 2473 */ 2474 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2475 if (amdgpu_xgmi_add_device(adev) == 0) { 2476 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2477 2478 if (!hive->reset_domain || 2479 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2480 r = -ENOENT; 2481 goto init_failed; 2482 } 2483 2484 /* Drop the early temporary reset domain we created for device */ 2485 amdgpu_reset_put_reset_domain(adev->reset_domain); 2486 adev->reset_domain = hive->reset_domain; 2487 } 2488 } 2489 2490 r = amdgpu_device_init_schedulers(adev); 2491 if (r) 2492 goto init_failed; 2493 2494 /* Don't init kfd if whole hive need to be reset during init */ 2495 if (!adev->gmc.xgmi.pending_reset) 2496 amdgpu_amdkfd_device_init(adev); 2497 2498 amdgpu_fru_get_product_info(adev); 2499 2500 
init_failed: 2501 if (amdgpu_sriov_vf(adev)) 2502 amdgpu_virt_release_full_gpu(adev, true); 2503 2504 return r; 2505 } 2506 2507 /** 2508 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2509 * 2510 * @adev: amdgpu_device pointer 2511 * 2512 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2513 * this function before a GPU reset. If the value is retained after a 2514 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 2515 */ 2516 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2517 { 2518 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2519 } 2520 2521 /** 2522 * amdgpu_device_check_vram_lost - check if vram is valid 2523 * 2524 * @adev: amdgpu_device pointer 2525 * 2526 * Checks the reset magic value written to the gart pointer in VRAM. 2527 * The driver calls this after a GPU reset to see if the contents of 2528 * VRAM is lost or now. 2529 * returns true if vram is lost, false if not. 2530 */ 2531 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2532 { 2533 if (memcmp(adev->gart.ptr, adev->reset_magic, 2534 AMDGPU_RESET_MAGIC_NUM)) 2535 return true; 2536 2537 if (!amdgpu_in_reset(adev)) 2538 return false; 2539 2540 /* 2541 * For all ASICs with baco/mode1 reset, the VRAM is 2542 * always assumed to be lost. 2543 */ 2544 switch (amdgpu_asic_reset_method(adev)) { 2545 case AMD_RESET_METHOD_BACO: 2546 case AMD_RESET_METHOD_MODE1: 2547 return true; 2548 default: 2549 return false; 2550 } 2551 } 2552 2553 /** 2554 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2555 * 2556 * @adev: amdgpu_device pointer 2557 * @state: clockgating state (gate or ungate) 2558 * 2559 * The list of all the hardware IPs that make up the asic is walked and the 2560 * set_clockgating_state callbacks are run. 2561 * Late initialization pass enabling clockgating for hardware IPs. 
2562 * Fini or suspend, pass disabling clockgating for hardware IPs. 2563 * Returns 0 on success, negative error code on failure. 2564 */ 2565 2566 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2567 enum amd_clockgating_state state) 2568 { 2569 int i, j, r; 2570 2571 if (amdgpu_emu_mode == 1) 2572 return 0; 2573 2574 for (j = 0; j < adev->num_ip_blocks; j++) { 2575 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2576 if (!adev->ip_blocks[i].status.late_initialized) 2577 continue; 2578 /* skip CG for GFX on S0ix */ 2579 if (adev->in_s0ix && 2580 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2581 continue; 2582 /* skip CG for VCE/UVD, it's handled specially */ 2583 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2584 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2585 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2586 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2587 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2588 /* enable clockgating to save power */ 2589 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2590 state); 2591 if (r) { 2592 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2593 adev->ip_blocks[i].version->funcs->name, r); 2594 return r; 2595 } 2596 } 2597 } 2598 2599 return 0; 2600 } 2601 2602 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2603 enum amd_powergating_state state) 2604 { 2605 int i, j, r; 2606 2607 if (amdgpu_emu_mode == 1) 2608 return 0; 2609 2610 for (j = 0; j < adev->num_ip_blocks; j++) { 2611 i = state == AMD_PG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1; 2612 if (!adev->ip_blocks[i].status.late_initialized) 2613 continue; 2614 /* skip PG for GFX on S0ix */ 2615 if (adev->in_s0ix && 2616 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2617 continue; 2618 /* skip CG for VCE/UVD, it's handled specially */ 2619 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2620 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2621 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2622 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2623 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2624 /* enable powergating to save power */ 2625 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2626 state); 2627 if (r) { 2628 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2629 adev->ip_blocks[i].version->funcs->name, r); 2630 return r; 2631 } 2632 } 2633 } 2634 return 0; 2635 } 2636 2637 static int amdgpu_device_enable_mgpu_fan_boost(void) 2638 { 2639 struct amdgpu_gpu_instance *gpu_ins; 2640 struct amdgpu_device *adev; 2641 int i, ret = 0; 2642 2643 mutex_lock(&mgpu_info.mutex); 2644 2645 /* 2646 * MGPU fan boost feature should be enabled 2647 * only when there are two or more dGPUs in 2648 * the system 2649 */ 2650 if (mgpu_info.num_dgpu < 2) 2651 goto out; 2652 2653 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2654 gpu_ins = &(mgpu_info.gpu_ins[i]); 2655 adev = gpu_ins->adev; 2656 if (!(adev->flags & AMD_IS_APU) && 2657 !gpu_ins->mgpu_fan_enabled) { 2658 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2659 if (ret) 2660 break; 2661 2662 gpu_ins->mgpu_fan_enabled = 1; 2663 } 2664 } 2665 2666 out: 2667 mutex_unlock(&mgpu_info.mutex); 2668 2669 return ret; 2670 } 2671 2672 /** 2673 * amdgpu_device_ip_late_init - run late init for hardware IPs 2674 * 2675 * @adev: amdgpu_device pointer 2676 * 2677 * Late initialization pass for hardware IPs. 
The list of all the hardware 2678 * IPs that make up the asic is walked and the late_init callbacks are run. 2679 * late_init covers any special initialization that an IP requires 2680 * after all of the have been initialized or something that needs to happen 2681 * late in the init process. 2682 * Returns 0 on success, negative error code on failure. 2683 */ 2684 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2685 { 2686 struct amdgpu_gpu_instance *gpu_instance; 2687 int i = 0, r; 2688 2689 for (i = 0; i < adev->num_ip_blocks; i++) { 2690 if (!adev->ip_blocks[i].status.hw) 2691 continue; 2692 if (adev->ip_blocks[i].version->funcs->late_init) { 2693 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2694 if (r) { 2695 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2696 adev->ip_blocks[i].version->funcs->name, r); 2697 return r; 2698 } 2699 } 2700 adev->ip_blocks[i].status.late_initialized = true; 2701 } 2702 2703 r = amdgpu_ras_late_init(adev); 2704 if (r) { 2705 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 2706 return r; 2707 } 2708 2709 amdgpu_ras_set_error_query_ready(adev, true); 2710 2711 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2712 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2713 2714 amdgpu_device_fill_reset_magic(adev); 2715 2716 r = amdgpu_device_enable_mgpu_fan_boost(); 2717 if (r) 2718 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2719 2720 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */ 2721 if (amdgpu_passthrough(adev) && ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1)|| 2722 adev->asic_type == CHIP_ALDEBARAN )) 2723 amdgpu_dpm_handle_passthrough_sbr(adev, true); 2724 2725 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2726 mutex_lock(&mgpu_info.mutex); 2727 2728 /* 2729 * Reset device p-state to low as this was booted with high. 
2730 * 2731 * This should be performed only after all devices from the same 2732 * hive get initialized. 2733 * 2734 * However, it's unknown how many device in the hive in advance. 2735 * As this is counted one by one during devices initializations. 2736 * 2737 * So, we wait for all XGMI interlinked devices initialized. 2738 * This may bring some delays as those devices may come from 2739 * different hives. But that should be OK. 2740 */ 2741 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2742 for (i = 0; i < mgpu_info.num_gpu; i++) { 2743 gpu_instance = &(mgpu_info.gpu_ins[i]); 2744 if (gpu_instance->adev->flags & AMD_IS_APU) 2745 continue; 2746 2747 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2748 AMDGPU_XGMI_PSTATE_MIN); 2749 if (r) { 2750 DRM_ERROR("pstate setting failed (%d).\n", r); 2751 break; 2752 } 2753 } 2754 } 2755 2756 mutex_unlock(&mgpu_info.mutex); 2757 } 2758 2759 return 0; 2760 } 2761 2762 /** 2763 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2764 * 2765 * @adev: amdgpu_device pointer 2766 * 2767 * For ASICs need to disable SMC first 2768 */ 2769 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2770 { 2771 int i, r; 2772 2773 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) 2774 return; 2775 2776 for (i = 0; i < adev->num_ip_blocks; i++) { 2777 if (!adev->ip_blocks[i].status.hw) 2778 continue; 2779 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2780 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2781 /* XXX handle errors */ 2782 if (r) { 2783 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2784 adev->ip_blocks[i].version->funcs->name, r); 2785 } 2786 adev->ip_blocks[i].status.hw = false; 2787 break; 2788 } 2789 } 2790 } 2791 2792 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2793 { 2794 int i, r; 2795 2796 for (i = 0; i < adev->num_ip_blocks; i++) { 2797 if (!adev->ip_blocks[i].version->funcs->early_fini) 2798 continue; 2799 2800 r = 
adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2801 if (r) { 2802 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2803 adev->ip_blocks[i].version->funcs->name, r); 2804 } 2805 } 2806 2807 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2808 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2809 2810 amdgpu_amdkfd_suspend(adev, false); 2811 2812 /* Workaroud for ASICs need to disable SMC first */ 2813 amdgpu_device_smu_fini_early(adev); 2814 2815 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2816 if (!adev->ip_blocks[i].status.hw) 2817 continue; 2818 2819 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2820 /* XXX handle errors */ 2821 if (r) { 2822 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2823 adev->ip_blocks[i].version->funcs->name, r); 2824 } 2825 2826 adev->ip_blocks[i].status.hw = false; 2827 } 2828 2829 if (amdgpu_sriov_vf(adev)) { 2830 if (amdgpu_virt_release_full_gpu(adev, false)) 2831 DRM_ERROR("failed to release exclusive mode on fini\n"); 2832 } 2833 2834 return 0; 2835 } 2836 2837 /** 2838 * amdgpu_device_ip_fini - run fini for hardware IPs 2839 * 2840 * @adev: amdgpu_device pointer 2841 * 2842 * Main teardown pass for hardware IPs. The list of all the hardware 2843 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2844 * are run. hw_fini tears down the hardware associated with each IP 2845 * and sw_fini tears down any software state associated with each IP. 2846 * Returns 0 on success, negative error code on failure. 
2847 */ 2848 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2849 { 2850 int i, r; 2851 2852 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2853 amdgpu_virt_release_ras_err_handler_data(adev); 2854 2855 if (adev->gmc.xgmi.num_physical_nodes > 1) 2856 amdgpu_xgmi_remove_device(adev); 2857 2858 amdgpu_amdkfd_device_fini_sw(adev); 2859 2860 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2861 if (!adev->ip_blocks[i].status.sw) 2862 continue; 2863 2864 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2865 amdgpu_ucode_free_bo(adev); 2866 amdgpu_free_static_csa(&adev->virt.csa_obj); 2867 amdgpu_device_wb_fini(adev); 2868 amdgpu_device_vram_scratch_fini(adev); 2869 amdgpu_ib_pool_fini(adev); 2870 } 2871 2872 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2873 /* XXX handle errors */ 2874 if (r) { 2875 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2876 adev->ip_blocks[i].version->funcs->name, r); 2877 } 2878 adev->ip_blocks[i].status.sw = false; 2879 adev->ip_blocks[i].status.valid = false; 2880 } 2881 2882 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2883 if (!adev->ip_blocks[i].status.late_initialized) 2884 continue; 2885 if (adev->ip_blocks[i].version->funcs->late_fini) 2886 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2887 adev->ip_blocks[i].status.late_initialized = false; 2888 } 2889 2890 amdgpu_ras_fini(adev); 2891 2892 return 0; 2893 } 2894 2895 /** 2896 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2897 * 2898 * @work: work_struct. 
2899 */ 2900 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2901 { 2902 struct amdgpu_device *adev = 2903 container_of(work, struct amdgpu_device, delayed_init_work.work); 2904 int r; 2905 2906 r = amdgpu_ib_ring_tests(adev); 2907 if (r) 2908 DRM_ERROR("ib ring test failed (%d).\n", r); 2909 } 2910 2911 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2912 { 2913 struct amdgpu_device *adev = 2914 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2915 2916 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2917 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2918 2919 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2920 adev->gfx.gfx_off_state = true; 2921 } 2922 2923 /** 2924 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2925 * 2926 * @adev: amdgpu_device pointer 2927 * 2928 * Main suspend function for hardware IPs. The list of all the hardware 2929 * IPs that make up the asic is walked, clockgating is disabled and the 2930 * suspend callbacks are run. suspend puts the hardware and software state 2931 * in each IP into a state suitable for suspend. 2932 * Returns 0 on success, negative error code on failure. 
2933 */ 2934 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2935 { 2936 int i, r; 2937 2938 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2939 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2940 2941 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2942 if (!adev->ip_blocks[i].status.valid) 2943 continue; 2944 2945 /* displays are handled separately */ 2946 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2947 continue; 2948 2949 /* XXX handle errors */ 2950 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2951 /* XXX handle errors */ 2952 if (r) { 2953 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2954 adev->ip_blocks[i].version->funcs->name, r); 2955 return r; 2956 } 2957 2958 adev->ip_blocks[i].status.hw = false; 2959 } 2960 2961 return 0; 2962 } 2963 2964 /** 2965 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2966 * 2967 * @adev: amdgpu_device pointer 2968 * 2969 * Main suspend function for hardware IPs. The list of all the hardware 2970 * IPs that make up the asic is walked, clockgating is disabled and the 2971 * suspend callbacks are run. suspend puts the hardware and software state 2972 * in each IP into a state suitable for suspend. 2973 * Returns 0 on success, negative error code on failure. 
2974 */ 2975 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2976 { 2977 int i, r; 2978 2979 if (adev->in_s0ix) 2980 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 2981 2982 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2983 if (!adev->ip_blocks[i].status.valid) 2984 continue; 2985 /* displays are handled in phase1 */ 2986 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2987 continue; 2988 /* PSP lost connection when err_event_athub occurs */ 2989 if (amdgpu_ras_intr_triggered() && 2990 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2991 adev->ip_blocks[i].status.hw = false; 2992 continue; 2993 } 2994 2995 /* skip unnecessary suspend if we do not initialize them yet */ 2996 if (adev->gmc.xgmi.pending_reset && 2997 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2998 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 2999 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3000 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3001 adev->ip_blocks[i].status.hw = false; 3002 continue; 3003 } 3004 3005 /* skip suspend of gfx and psp for S0ix 3006 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3007 * like at runtime. PSP is also part of the always on hardware 3008 * so no need to suspend it. 
3009 */ 3010 if (adev->in_s0ix && 3011 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3012 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)) 3013 continue; 3014 3015 /* XXX handle errors */ 3016 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3017 /* XXX handle errors */ 3018 if (r) { 3019 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3020 adev->ip_blocks[i].version->funcs->name, r); 3021 } 3022 adev->ip_blocks[i].status.hw = false; 3023 /* handle putting the SMC in the appropriate state */ 3024 if(!amdgpu_sriov_vf(adev)){ 3025 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3026 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3027 if (r) { 3028 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3029 adev->mp1_state, r); 3030 return r; 3031 } 3032 } 3033 } 3034 } 3035 3036 return 0; 3037 } 3038 3039 /** 3040 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3041 * 3042 * @adev: amdgpu_device pointer 3043 * 3044 * Main suspend function for hardware IPs. The list of all the hardware 3045 * IPs that make up the asic is walked, clockgating is disabled and the 3046 * suspend callbacks are run. suspend puts the hardware and software state 3047 * in each IP into a state suitable for suspend. 3048 * Returns 0 on success, negative error code on failure. 
3049 */ 3050 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3051 { 3052 int r; 3053 3054 if (amdgpu_sriov_vf(adev)) { 3055 amdgpu_virt_fini_data_exchange(adev); 3056 amdgpu_virt_request_full_gpu(adev, false); 3057 } 3058 3059 r = amdgpu_device_ip_suspend_phase1(adev); 3060 if (r) 3061 return r; 3062 r = amdgpu_device_ip_suspend_phase2(adev); 3063 3064 if (amdgpu_sriov_vf(adev)) 3065 amdgpu_virt_release_full_gpu(adev, false); 3066 3067 return r; 3068 } 3069 3070 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3071 { 3072 int i, r; 3073 3074 static enum amd_ip_block_type ip_order[] = { 3075 AMD_IP_BLOCK_TYPE_GMC, 3076 AMD_IP_BLOCK_TYPE_COMMON, 3077 AMD_IP_BLOCK_TYPE_PSP, 3078 AMD_IP_BLOCK_TYPE_IH, 3079 }; 3080 3081 for (i = 0; i < adev->num_ip_blocks; i++) { 3082 int j; 3083 struct amdgpu_ip_block *block; 3084 3085 block = &adev->ip_blocks[i]; 3086 block->status.hw = false; 3087 3088 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3089 3090 if (block->version->type != ip_order[j] || 3091 !block->status.valid) 3092 continue; 3093 3094 r = block->version->funcs->hw_init(adev); 3095 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3096 if (r) 3097 return r; 3098 block->status.hw = true; 3099 } 3100 } 3101 3102 return 0; 3103 } 3104 3105 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3106 { 3107 int i, r; 3108 3109 static enum amd_ip_block_type ip_order[] = { 3110 AMD_IP_BLOCK_TYPE_SMC, 3111 AMD_IP_BLOCK_TYPE_DCE, 3112 AMD_IP_BLOCK_TYPE_GFX, 3113 AMD_IP_BLOCK_TYPE_SDMA, 3114 AMD_IP_BLOCK_TYPE_UVD, 3115 AMD_IP_BLOCK_TYPE_VCE, 3116 AMD_IP_BLOCK_TYPE_VCN 3117 }; 3118 3119 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3120 int j; 3121 struct amdgpu_ip_block *block; 3122 3123 for (j = 0; j < adev->num_ip_blocks; j++) { 3124 block = &adev->ip_blocks[j]; 3125 3126 if (block->version->type != ip_order[i] || 3127 !block->status.valid || 3128 block->status.hw) 3129 continue; 3130 
3131 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3132 r = block->version->funcs->resume(adev); 3133 else 3134 r = block->version->funcs->hw_init(adev); 3135 3136 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3137 if (r) 3138 return r; 3139 block->status.hw = true; 3140 } 3141 } 3142 3143 return 0; 3144 } 3145 3146 /** 3147 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3148 * 3149 * @adev: amdgpu_device pointer 3150 * 3151 * First resume function for hardware IPs. The list of all the hardware 3152 * IPs that make up the asic is walked and the resume callbacks are run for 3153 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3154 * after a suspend and updates the software state as necessary. This 3155 * function is also used for restoring the GPU after a GPU reset. 3156 * Returns 0 on success, negative error code on failure. 3157 */ 3158 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3159 { 3160 int i, r; 3161 3162 for (i = 0; i < adev->num_ip_blocks; i++) { 3163 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3164 continue; 3165 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3166 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3167 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 3168 3169 r = adev->ip_blocks[i].version->funcs->resume(adev); 3170 if (r) { 3171 DRM_ERROR("resume of IP block <%s> failed %d\n", 3172 adev->ip_blocks[i].version->funcs->name, r); 3173 return r; 3174 } 3175 adev->ip_blocks[i].status.hw = true; 3176 } 3177 } 3178 3179 return 0; 3180 } 3181 3182 /** 3183 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3184 * 3185 * @adev: amdgpu_device pointer 3186 * 3187 * First resume function for hardware IPs. The list of all the hardware 3188 * IPs that make up the asic is walked and the resume callbacks are run for 3189 * all blocks except COMMON, GMC, and IH. 
resume puts the hardware into a 3190 * functional state after a suspend and updates the software state as 3191 * necessary. This function is also used for restoring the GPU after a GPU 3192 * reset. 3193 * Returns 0 on success, negative error code on failure. 3194 */ 3195 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3196 { 3197 int i, r; 3198 3199 for (i = 0; i < adev->num_ip_blocks; i++) { 3200 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3201 continue; 3202 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3203 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3204 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3205 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3206 continue; 3207 r = adev->ip_blocks[i].version->funcs->resume(adev); 3208 if (r) { 3209 DRM_ERROR("resume of IP block <%s> failed %d\n", 3210 adev->ip_blocks[i].version->funcs->name, r); 3211 return r; 3212 } 3213 adev->ip_blocks[i].status.hw = true; 3214 } 3215 3216 return 0; 3217 } 3218 3219 /** 3220 * amdgpu_device_ip_resume - run resume for hardware IPs 3221 * 3222 * @adev: amdgpu_device pointer 3223 * 3224 * Main resume function for hardware IPs. The hardware IPs 3225 * are split into two resume functions because they are 3226 * are also used in in recovering from a GPU reset and some additional 3227 * steps need to be take between them. In this case (S3/S4) they are 3228 * run sequentially. 3229 * Returns 0 on success, negative error code on failure. 
3230 */ 3231 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3232 { 3233 int r; 3234 3235 r = amdgpu_amdkfd_resume_iommu(adev); 3236 if (r) 3237 return r; 3238 3239 r = amdgpu_device_ip_resume_phase1(adev); 3240 if (r) 3241 return r; 3242 3243 r = amdgpu_device_fw_loading(adev); 3244 if (r) 3245 return r; 3246 3247 r = amdgpu_device_ip_resume_phase2(adev); 3248 3249 return r; 3250 } 3251 3252 /** 3253 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3254 * 3255 * @adev: amdgpu_device pointer 3256 * 3257 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3258 */ 3259 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3260 { 3261 if (amdgpu_sriov_vf(adev)) { 3262 if (adev->is_atom_fw) { 3263 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3264 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3265 } else { 3266 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3267 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3268 } 3269 3270 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3271 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3272 } 3273 } 3274 3275 /** 3276 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3277 * 3278 * @asic_type: AMD asic type 3279 * 3280 * Check if there is DC (new modesetting infrastructre) support for an asic. 3281 * returns true if DC has support, false if not. 3282 */ 3283 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3284 { 3285 switch (asic_type) { 3286 #ifdef CONFIG_DRM_AMDGPU_SI 3287 case CHIP_HAINAN: 3288 #endif 3289 case CHIP_TOPAZ: 3290 /* chips with no display hardware */ 3291 return false; 3292 #if defined(CONFIG_DRM_AMD_DC) 3293 case CHIP_TAHITI: 3294 case CHIP_PITCAIRN: 3295 case CHIP_VERDE: 3296 case CHIP_OLAND: 3297 /* 3298 * We have systems in the wild with these ASICs that require 3299 * LVDS and VGA support which is not supported with DC. 
3300 * 3301 * Fallback to the non-DC driver here by default so as not to 3302 * cause regressions. 3303 */ 3304 #if defined(CONFIG_DRM_AMD_DC_SI) 3305 return amdgpu_dc > 0; 3306 #else 3307 return false; 3308 #endif 3309 case CHIP_BONAIRE: 3310 case CHIP_KAVERI: 3311 case CHIP_KABINI: 3312 case CHIP_MULLINS: 3313 /* 3314 * We have systems in the wild with these ASICs that require 3315 * LVDS and VGA support which is not supported with DC. 3316 * 3317 * Fallback to the non-DC driver here by default so as not to 3318 * cause regressions. 3319 */ 3320 return amdgpu_dc > 0; 3321 case CHIP_HAWAII: 3322 case CHIP_CARRIZO: 3323 case CHIP_STONEY: 3324 case CHIP_POLARIS10: 3325 case CHIP_POLARIS11: 3326 case CHIP_POLARIS12: 3327 case CHIP_VEGAM: 3328 case CHIP_TONGA: 3329 case CHIP_FIJI: 3330 case CHIP_VEGA10: 3331 case CHIP_VEGA12: 3332 case CHIP_VEGA20: 3333 #if defined(CONFIG_DRM_AMD_DC_DCN) 3334 case CHIP_RAVEN: 3335 case CHIP_NAVI10: 3336 case CHIP_NAVI14: 3337 case CHIP_NAVI12: 3338 case CHIP_RENOIR: 3339 case CHIP_CYAN_SKILLFISH: 3340 case CHIP_SIENNA_CICHLID: 3341 case CHIP_NAVY_FLOUNDER: 3342 case CHIP_DIMGREY_CAVEFISH: 3343 case CHIP_BEIGE_GOBY: 3344 case CHIP_VANGOGH: 3345 case CHIP_YELLOW_CARP: 3346 #endif 3347 default: 3348 return amdgpu_dc != 0; 3349 #else 3350 default: 3351 if (amdgpu_dc > 0) 3352 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3353 "but isn't supported by ASIC, ignoring\n"); 3354 return false; 3355 #endif 3356 } 3357 } 3358 3359 /** 3360 * amdgpu_device_has_dc_support - check if dc is supported 3361 * 3362 * @adev: amdgpu_device pointer 3363 * 3364 * Returns true for supported, false for not supported 3365 */ 3366 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3367 { 3368 if (amdgpu_sriov_vf(adev) || 3369 adev->enable_virtual_display || 3370 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3371 return false; 3372 3373 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3374 } 3375 3376 static void 
amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3377 { 3378 struct amdgpu_device *adev = 3379 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3380 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3381 3382 /* It's a bug to not have a hive within this function */ 3383 if (WARN_ON(!hive)) 3384 return; 3385 3386 /* 3387 * Use task barrier to synchronize all xgmi reset works across the 3388 * hive. task_barrier_enter and task_barrier_exit will block 3389 * until all the threads running the xgmi reset works reach 3390 * those points. task_barrier_full will do both blocks. 3391 */ 3392 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3393 3394 task_barrier_enter(&hive->tb); 3395 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3396 3397 if (adev->asic_reset_res) 3398 goto fail; 3399 3400 task_barrier_exit(&hive->tb); 3401 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3402 3403 if (adev->asic_reset_res) 3404 goto fail; 3405 3406 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3407 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3408 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3409 } else { 3410 3411 task_barrier_full(&hive->tb); 3412 adev->asic_reset_res = amdgpu_asic_reset(adev); 3413 } 3414 3415 fail: 3416 if (adev->asic_reset_res) 3417 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3418 adev->asic_reset_res, adev_to_drm(adev)->unique); 3419 amdgpu_put_xgmi_hive(hive); 3420 } 3421 3422 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3423 { 3424 char *input = amdgpu_lockup_timeout; 3425 char *timeout_setting = NULL; 3426 int index = 0; 3427 long timeout; 3428 int ret = 0; 3429 3430 /* 3431 * By default timeout for non compute jobs is 10000 3432 * and 60000 for compute jobs. 3433 * In SR-IOV or passthrough mode, timeout for compute 3434 * jobs are 60000 by default. 
3435 */ 3436 adev->gfx_timeout = msecs_to_jiffies(10000); 3437 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3438 if (amdgpu_sriov_vf(adev)) 3439 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 3440 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3441 else 3442 adev->compute_timeout = msecs_to_jiffies(60000); 3443 3444 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3445 while ((timeout_setting = strsep(&input, ",")) && 3446 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3447 ret = kstrtol(timeout_setting, 0, &timeout); 3448 if (ret) 3449 return ret; 3450 3451 if (timeout == 0) { 3452 index++; 3453 continue; 3454 } else if (timeout < 0) { 3455 timeout = MAX_SCHEDULE_TIMEOUT; 3456 dev_warn(adev->dev, "lockup timeout disabled"); 3457 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3458 } else { 3459 timeout = msecs_to_jiffies(timeout); 3460 } 3461 3462 switch (index++) { 3463 case 0: 3464 adev->gfx_timeout = timeout; 3465 break; 3466 case 1: 3467 adev->compute_timeout = timeout; 3468 break; 3469 case 2: 3470 adev->sdma_timeout = timeout; 3471 break; 3472 case 3: 3473 adev->video_timeout = timeout; 3474 break; 3475 default: 3476 break; 3477 } 3478 } 3479 /* 3480 * There is only one value specified and 3481 * it should apply to all non-compute jobs. 
3482 */ 3483 if (index == 1) { 3484 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3485 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3486 adev->compute_timeout = adev->gfx_timeout; 3487 } 3488 } 3489 3490 return ret; 3491 } 3492 3493 /** 3494 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3495 * 3496 * @adev: amdgpu_device pointer 3497 * 3498 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3499 */ 3500 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3501 { 3502 struct iommu_domain *domain; 3503 3504 domain = iommu_get_domain_for_dev(adev->dev); 3505 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3506 adev->ram_is_direct_mapped = true; 3507 } 3508 3509 static const struct attribute *amdgpu_dev_attributes[] = { 3510 &dev_attr_product_name.attr, 3511 &dev_attr_product_number.attr, 3512 &dev_attr_serial_number.attr, 3513 &dev_attr_pcie_replay_count.attr, 3514 NULL 3515 }; 3516 3517 /** 3518 * amdgpu_device_init - initialize the driver 3519 * 3520 * @adev: amdgpu_device pointer 3521 * @flags: driver flags 3522 * 3523 * Initializes the driver info and hw (all asics). 3524 * Returns 0 for success or an error on failure. 3525 * Called at driver startup. 
3526 */ 3527 int amdgpu_device_init(struct amdgpu_device *adev, 3528 uint32_t flags) 3529 { 3530 struct drm_device *ddev = adev_to_drm(adev); 3531 struct pci_dev *pdev = adev->pdev; 3532 int r, i; 3533 bool px = false; 3534 u32 max_MBps; 3535 3536 adev->shutdown = false; 3537 adev->flags = flags; 3538 3539 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3540 adev->asic_type = amdgpu_force_asic_type; 3541 else 3542 adev->asic_type = flags & AMD_ASIC_MASK; 3543 3544 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3545 if (amdgpu_emu_mode == 1) 3546 adev->usec_timeout *= 10; 3547 adev->gmc.gart_size = 512 * 1024 * 1024; 3548 adev->accel_working = false; 3549 adev->num_rings = 0; 3550 adev->mman.buffer_funcs = NULL; 3551 adev->mman.buffer_funcs_ring = NULL; 3552 adev->vm_manager.vm_pte_funcs = NULL; 3553 adev->vm_manager.vm_pte_num_scheds = 0; 3554 adev->gmc.gmc_funcs = NULL; 3555 adev->harvest_ip_mask = 0x0; 3556 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3557 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3558 3559 adev->smc_rreg = &amdgpu_invalid_rreg; 3560 adev->smc_wreg = &amdgpu_invalid_wreg; 3561 adev->pcie_rreg = &amdgpu_invalid_rreg; 3562 adev->pcie_wreg = &amdgpu_invalid_wreg; 3563 adev->pciep_rreg = &amdgpu_invalid_rreg; 3564 adev->pciep_wreg = &amdgpu_invalid_wreg; 3565 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3566 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3567 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3568 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3569 adev->didt_rreg = &amdgpu_invalid_rreg; 3570 adev->didt_wreg = &amdgpu_invalid_wreg; 3571 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3572 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3573 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3574 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3575 3576 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3577 amdgpu_asic_name[adev->asic_type], 
pdev->vendor, pdev->device, 3578 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3579 3580 /* mutex initialization are all done here so we 3581 * can recall function without having locking issues */ 3582 mutex_init(&adev->firmware.mutex); 3583 mutex_init(&adev->pm.mutex); 3584 mutex_init(&adev->gfx.gpu_clock_mutex); 3585 mutex_init(&adev->srbm_mutex); 3586 mutex_init(&adev->gfx.pipe_reserve_mutex); 3587 mutex_init(&adev->gfx.gfx_off_mutex); 3588 mutex_init(&adev->grbm_idx_mutex); 3589 mutex_init(&adev->mn_lock); 3590 mutex_init(&adev->virt.vf_errors.lock); 3591 hash_init(adev->mn_hash); 3592 mutex_init(&adev->psp.mutex); 3593 mutex_init(&adev->notifier_lock); 3594 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3595 mutex_init(&adev->benchmark_mutex); 3596 3597 amdgpu_device_init_apu_flags(adev); 3598 3599 r = amdgpu_device_check_arguments(adev); 3600 if (r) 3601 return r; 3602 3603 spin_lock_init(&adev->mmio_idx_lock); 3604 spin_lock_init(&adev->smc_idx_lock); 3605 spin_lock_init(&adev->pcie_idx_lock); 3606 spin_lock_init(&adev->uvd_ctx_idx_lock); 3607 spin_lock_init(&adev->didt_idx_lock); 3608 spin_lock_init(&adev->gc_cac_idx_lock); 3609 spin_lock_init(&adev->se_cac_idx_lock); 3610 spin_lock_init(&adev->audio_endpt_idx_lock); 3611 spin_lock_init(&adev->mm_stats.lock); 3612 3613 INIT_LIST_HEAD(&adev->shadow_list); 3614 mutex_init(&adev->shadow_list_lock); 3615 3616 INIT_LIST_HEAD(&adev->reset_list); 3617 3618 INIT_LIST_HEAD(&adev->ras_list); 3619 3620 INIT_DELAYED_WORK(&adev->delayed_init_work, 3621 amdgpu_device_delayed_init_work_handler); 3622 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3623 amdgpu_device_delay_enable_gfx_off); 3624 3625 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3626 3627 adev->gfx.gfx_off_req_count = 1; 3628 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3629 3630 atomic_set(&adev->throttling_logging_enabled, 1); 3631 /* 3632 * If throttling continues, logging will be performed every 
minute 3633 * to avoid log flooding. "-1" is subtracted since the thermal 3634 * throttling interrupt comes every second. Thus, the total logging 3635 * interval is 59 seconds(retelimited printk interval) + 1(waiting 3636 * for throttling interrupt) = 60 seconds. 3637 */ 3638 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3639 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3640 3641 /* Registers mapping */ 3642 /* TODO: block userspace mapping of io register */ 3643 if (adev->asic_type >= CHIP_BONAIRE) { 3644 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3645 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3646 } else { 3647 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3648 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3649 } 3650 3651 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 3652 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 3653 3654 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3655 if (adev->rmmio == NULL) { 3656 return -ENOMEM; 3657 } 3658 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3659 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3660 3661 amdgpu_device_get_pcie_info(adev); 3662 3663 if (amdgpu_mcbp) 3664 DRM_INFO("MCBP is enabled\n"); 3665 3666 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) 3667 adev->enable_mes = true; 3668 3669 /* 3670 * Reset domain needs to be present early, before XGMI hive discovered 3671 * (if any) and intitialized to use reset sem and in_gpu reset flag 3672 * early on during init and before calling to RREG32. 
3673 */ 3674 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3675 if (!adev->reset_domain) 3676 return -ENOMEM; 3677 3678 /* detect hw virtualization here */ 3679 amdgpu_detect_virtualization(adev); 3680 3681 r = amdgpu_device_get_job_timeout_settings(adev); 3682 if (r) { 3683 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3684 return r; 3685 } 3686 3687 /* early init functions */ 3688 r = amdgpu_device_ip_early_init(adev); 3689 if (r) 3690 return r; 3691 3692 amdgpu_gmc_noretry_set(adev); 3693 /* Need to get xgmi info early to decide the reset behavior*/ 3694 if (adev->gmc.xgmi.supported) { 3695 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3696 if (r) 3697 return r; 3698 } 3699 3700 /* enable PCIE atomic ops */ 3701 if (amdgpu_sriov_vf(adev)) 3702 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3703 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_enabled_flags == 3704 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3705 else 3706 adev->have_atomics_support = 3707 !pci_enable_atomic_ops_to_root(adev->pdev, 3708 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3709 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3710 if (!adev->have_atomics_support) 3711 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3712 3713 /* doorbell bar mapping and doorbell index init*/ 3714 amdgpu_device_doorbell_init(adev); 3715 3716 if (amdgpu_emu_mode == 1) { 3717 /* post the asic on emulation mode */ 3718 emu_soc_asic_init(adev); 3719 goto fence_driver_init; 3720 } 3721 3722 amdgpu_reset_init(adev); 3723 3724 /* detect if we are with an SRIOV vbios */ 3725 amdgpu_device_detect_sriov_bios(adev); 3726 3727 /* check if we need to reset the asic 3728 * E.g., driver was not cleanly unloaded previously, etc. 
3729 */ 3730 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3731 if (adev->gmc.xgmi.num_physical_nodes) { 3732 dev_info(adev->dev, "Pending hive reset.\n"); 3733 adev->gmc.xgmi.pending_reset = true; 3734 /* Only need to init necessary block for SMU to handle the reset */ 3735 for (i = 0; i < adev->num_ip_blocks; i++) { 3736 if (!adev->ip_blocks[i].status.valid) 3737 continue; 3738 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3739 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3740 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3741 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3742 DRM_DEBUG("IP %s disabled for hw_init.\n", 3743 adev->ip_blocks[i].version->funcs->name); 3744 adev->ip_blocks[i].status.hw = true; 3745 } 3746 } 3747 } else { 3748 r = amdgpu_asic_reset(adev); 3749 if (r) { 3750 dev_err(adev->dev, "asic reset on init failed\n"); 3751 goto failed; 3752 } 3753 } 3754 } 3755 3756 pci_enable_pcie_error_reporting(adev->pdev); 3757 3758 /* Post card if necessary */ 3759 if (amdgpu_device_need_post(adev)) { 3760 if (!adev->bios) { 3761 dev_err(adev->dev, "no vBIOS found\n"); 3762 r = -EINVAL; 3763 goto failed; 3764 } 3765 DRM_INFO("GPU posting now...\n"); 3766 r = amdgpu_device_asic_init(adev); 3767 if (r) { 3768 dev_err(adev->dev, "gpu post error!\n"); 3769 goto failed; 3770 } 3771 } 3772 3773 if (adev->is_atom_fw) { 3774 /* Initialize clocks */ 3775 r = amdgpu_atomfirmware_get_clock_info(adev); 3776 if (r) { 3777 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3778 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3779 goto failed; 3780 } 3781 } else { 3782 /* Initialize clocks */ 3783 r = amdgpu_atombios_get_clock_info(adev); 3784 if (r) { 3785 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3786 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3787 goto failed; 3788 } 3789 /* init i2c 
buses */ 3790 if (!amdgpu_device_has_dc_support(adev)) 3791 amdgpu_atombios_i2c_init(adev); 3792 } 3793 3794 fence_driver_init: 3795 /* Fence driver */ 3796 r = amdgpu_fence_driver_sw_init(adev); 3797 if (r) { 3798 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3799 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3800 goto failed; 3801 } 3802 3803 /* init the mode config */ 3804 drm_mode_config_init(adev_to_drm(adev)); 3805 3806 r = amdgpu_device_ip_init(adev); 3807 if (r) { 3808 /* failed in exclusive mode due to timeout */ 3809 if (amdgpu_sriov_vf(adev) && 3810 !amdgpu_sriov_runtime(adev) && 3811 amdgpu_virt_mmio_blocked(adev) && 3812 !amdgpu_virt_wait_reset(adev)) { 3813 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3814 /* Don't send request since VF is inactive. */ 3815 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3816 adev->virt.ops = NULL; 3817 r = -EAGAIN; 3818 goto release_ras_con; 3819 } 3820 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3821 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3822 goto release_ras_con; 3823 } 3824 3825 amdgpu_fence_driver_hw_init(adev); 3826 3827 dev_info(adev->dev, 3828 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3829 adev->gfx.config.max_shader_engines, 3830 adev->gfx.config.max_sh_per_se, 3831 adev->gfx.config.max_cu_per_sh, 3832 adev->gfx.cu_info.number); 3833 3834 adev->accel_working = true; 3835 3836 amdgpu_vm_check_compute_bug(adev); 3837 3838 /* Initialize the buffer migration limit. */ 3839 if (amdgpu_moverate >= 0) 3840 max_MBps = amdgpu_moverate; 3841 else 3842 max_MBps = 8; /* Allow 8 MB/s. */ 3843 /* Get a log2 for easy divisions. 
*/ 3844 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3845 3846 r = amdgpu_pm_sysfs_init(adev); 3847 if (r) { 3848 adev->pm_sysfs_en = false; 3849 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3850 } else 3851 adev->pm_sysfs_en = true; 3852 3853 r = amdgpu_ucode_sysfs_init(adev); 3854 if (r) { 3855 adev->ucode_sysfs_en = false; 3856 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3857 } else 3858 adev->ucode_sysfs_en = true; 3859 3860 /* 3861 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3862 * Otherwise the mgpu fan boost feature will be skipped due to the 3863 * gpu instance is counted less. 3864 */ 3865 amdgpu_register_gpu_instance(adev); 3866 3867 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3868 * explicit gating rather than handling it automatically. 3869 */ 3870 if (!adev->gmc.xgmi.pending_reset) { 3871 r = amdgpu_device_ip_late_init(adev); 3872 if (r) { 3873 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3874 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3875 goto release_ras_con; 3876 } 3877 /* must succeed. 
*/ 3878 amdgpu_ras_resume(adev); 3879 queue_delayed_work(system_wq, &adev->delayed_init_work, 3880 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3881 } 3882 3883 if (amdgpu_sriov_vf(adev)) 3884 flush_delayed_work(&adev->delayed_init_work); 3885 3886 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3887 if (r) 3888 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3889 3890 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3891 r = amdgpu_pmu_init(adev); 3892 if (r) 3893 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3894 3895 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3896 if (amdgpu_device_cache_pci_state(adev->pdev)) 3897 pci_restore_state(pdev); 3898 3899 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3900 /* this will fail for cards that aren't VGA class devices, just 3901 * ignore it */ 3902 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3903 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 3904 3905 if (amdgpu_device_supports_px(ddev)) { 3906 px = true; 3907 vga_switcheroo_register_client(adev->pdev, 3908 &amdgpu_switcheroo_ops, px); 3909 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3910 } 3911 3912 if (adev->gmc.xgmi.pending_reset) 3913 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3914 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3915 3916 amdgpu_device_check_iommu_direct_map(adev); 3917 3918 return 0; 3919 3920 release_ras_con: 3921 amdgpu_release_ras_context(adev); 3922 3923 failed: 3924 amdgpu_vf_error_trans_all(adev); 3925 3926 return r; 3927 } 3928 3929 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 3930 { 3931 3932 /* Clear all CPU mappings pointing to this device */ 3933 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 3934 3935 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 3936 amdgpu_device_doorbell_fini(adev); 3937 3938 iounmap(adev->rmmio); 3939 adev->rmmio = NULL; 3940 if 
(adev->mman.aper_base_kaddr) 3941 iounmap(adev->mman.aper_base_kaddr); 3942 adev->mman.aper_base_kaddr = NULL; 3943 3944 /* Memory manager related */ 3945 if (!adev->gmc.xgmi.connected_to_cpu) { 3946 arch_phys_wc_del(adev->gmc.vram_mtrr); 3947 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 3948 } 3949 } 3950 3951 /** 3952 * amdgpu_device_fini_hw - tear down the driver 3953 * 3954 * @adev: amdgpu_device pointer 3955 * 3956 * Tear down the driver info (all asics). 3957 * Called at driver shutdown. 3958 */ 3959 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 3960 { 3961 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3962 flush_delayed_work(&adev->delayed_init_work); 3963 if (adev->mman.initialized) { 3964 flush_delayed_work(&adev->mman.bdev.wq); 3965 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); 3966 } 3967 adev->shutdown = true; 3968 3969 /* make sure IB test finished before entering exclusive mode 3970 * to avoid preemption on IB test 3971 * */ 3972 if (amdgpu_sriov_vf(adev)) { 3973 amdgpu_virt_request_full_gpu(adev, false); 3974 amdgpu_virt_fini_data_exchange(adev); 3975 } 3976 3977 /* disable all interrupts */ 3978 amdgpu_irq_disable_all(adev); 3979 if (adev->mode_info.mode_config_initialized){ 3980 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 3981 drm_helper_force_disable_all(adev_to_drm(adev)); 3982 else 3983 drm_atomic_helper_shutdown(adev_to_drm(adev)); 3984 } 3985 amdgpu_fence_driver_hw_fini(adev); 3986 3987 if (adev->pm_sysfs_en) 3988 amdgpu_pm_sysfs_fini(adev); 3989 if (adev->ucode_sysfs_en) 3990 amdgpu_ucode_sysfs_fini(adev); 3991 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 3992 3993 /* disable ras feature must before hw fini */ 3994 amdgpu_ras_pre_fini(adev); 3995 3996 amdgpu_device_ip_fini_early(adev); 3997 3998 amdgpu_irq_fini_hw(adev); 3999 4000 if (adev->mman.initialized) 4001 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4002 4003 amdgpu_gart_dummy_page_fini(adev); 4004 4005 if 
(drm_dev_is_unplugged(adev_to_drm(adev))) 4006 amdgpu_device_unmap_mmio(adev); 4007 4008 } 4009 4010 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4011 { 4012 int idx; 4013 4014 amdgpu_fence_driver_sw_fini(adev); 4015 amdgpu_device_ip_fini(adev); 4016 release_firmware(adev->firmware.gpu_info_fw); 4017 adev->firmware.gpu_info_fw = NULL; 4018 adev->accel_working = false; 4019 4020 amdgpu_reset_fini(adev); 4021 4022 /* free i2c buses */ 4023 if (!amdgpu_device_has_dc_support(adev)) 4024 amdgpu_i2c_fini(adev); 4025 4026 if (amdgpu_emu_mode != 1) 4027 amdgpu_atombios_fini(adev); 4028 4029 kfree(adev->bios); 4030 adev->bios = NULL; 4031 if (amdgpu_device_supports_px(adev_to_drm(adev))) { 4032 vga_switcheroo_unregister_client(adev->pdev); 4033 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4034 } 4035 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4036 vga_client_unregister(adev->pdev); 4037 4038 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4039 4040 iounmap(adev->rmmio); 4041 adev->rmmio = NULL; 4042 amdgpu_device_doorbell_fini(adev); 4043 drm_dev_exit(idx); 4044 } 4045 4046 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4047 amdgpu_pmu_fini(adev); 4048 if (adev->mman.discovery_bin) 4049 amdgpu_discovery_fini(adev); 4050 4051 amdgpu_reset_put_reset_domain(adev->reset_domain); 4052 adev->reset_domain = NULL; 4053 4054 kfree(adev->pci_state); 4055 4056 } 4057 4058 /** 4059 * amdgpu_device_evict_resources - evict device resources 4060 * @adev: amdgpu device object 4061 * 4062 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4063 * of the vram memory type. Mainly used for evicting device resources 4064 * at suspend time. 
4065 * 4066 */ 4067 static void amdgpu_device_evict_resources(struct amdgpu_device *adev) 4068 { 4069 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4070 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4071 return; 4072 4073 if (amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM)) 4074 DRM_WARN("evicting device resources failed\n"); 4075 4076 } 4077 4078 /* 4079 * Suspend & resume. 4080 */ 4081 /** 4082 * amdgpu_device_suspend - initiate device suspend 4083 * 4084 * @dev: drm dev pointer 4085 * @fbcon : notify the fbdev of suspend 4086 * 4087 * Puts the hw in the suspend state (all asics). 4088 * Returns 0 for success or an error on failure. 4089 * Called at driver suspend. 4090 */ 4091 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4092 { 4093 struct amdgpu_device *adev = drm_to_adev(dev); 4094 4095 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4096 return 0; 4097 4098 adev->in_suspend = true; 4099 4100 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4101 DRM_WARN("smart shift update failed\n"); 4102 4103 drm_kms_helper_poll_disable(dev); 4104 4105 if (fbcon) 4106 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4107 4108 cancel_delayed_work_sync(&adev->delayed_init_work); 4109 4110 amdgpu_ras_suspend(adev); 4111 4112 amdgpu_device_ip_suspend_phase1(adev); 4113 4114 if (!adev->in_s0ix) 4115 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4116 4117 amdgpu_device_evict_resources(adev); 4118 4119 amdgpu_fence_driver_hw_fini(adev); 4120 4121 amdgpu_device_ip_suspend_phase2(adev); 4122 4123 return 0; 4124 } 4125 4126 /** 4127 * amdgpu_device_resume - initiate device resume 4128 * 4129 * @dev: drm dev pointer 4130 * @fbcon : notify the fbdev of resume 4131 * 4132 * Bring the hw back to operating state (all asics). 4133 * Returns 0 for success or an error on failure. 4134 * Called at driver resume. 
4135 */ 4136 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4137 { 4138 struct amdgpu_device *adev = drm_to_adev(dev); 4139 int r = 0; 4140 4141 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4142 return 0; 4143 4144 if (adev->in_s0ix) 4145 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4146 4147 /* post card */ 4148 if (amdgpu_device_need_post(adev)) { 4149 r = amdgpu_device_asic_init(adev); 4150 if (r) 4151 dev_err(adev->dev, "amdgpu asic init failed\n"); 4152 } 4153 4154 r = amdgpu_device_ip_resume(adev); 4155 if (r) { 4156 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4157 return r; 4158 } 4159 amdgpu_fence_driver_hw_init(adev); 4160 4161 r = amdgpu_device_ip_late_init(adev); 4162 if (r) 4163 return r; 4164 4165 queue_delayed_work(system_wq, &adev->delayed_init_work, 4166 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4167 4168 if (!adev->in_s0ix) { 4169 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4170 if (r) 4171 return r; 4172 } 4173 4174 /* Make sure IB tests flushed */ 4175 flush_delayed_work(&adev->delayed_init_work); 4176 4177 if (fbcon) 4178 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4179 4180 drm_kms_helper_poll_enable(dev); 4181 4182 amdgpu_ras_resume(adev); 4183 4184 /* 4185 * Most of the connector probing functions try to acquire runtime pm 4186 * refs to ensure that the GPU is powered on when connector polling is 4187 * performed. Since we're calling this from a runtime PM callback, 4188 * trying to acquire rpm refs will cause us to deadlock. 4189 * 4190 * Since we're guaranteed to be holding the rpm lock, it's safe to 4191 * temporarily disable the rpm helpers so this doesn't deadlock us. 
4192 */ 4193 #ifdef CONFIG_PM 4194 dev->dev->power.disable_depth++; 4195 #endif 4196 if (!amdgpu_device_has_dc_support(adev)) 4197 drm_helper_hpd_irq_event(dev); 4198 else 4199 drm_kms_helper_hotplug_event(dev); 4200 #ifdef CONFIG_PM 4201 dev->dev->power.disable_depth--; 4202 #endif 4203 adev->in_suspend = false; 4204 4205 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4206 DRM_WARN("smart shift update failed\n"); 4207 4208 return 0; 4209 } 4210 4211 /** 4212 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4213 * 4214 * @adev: amdgpu_device pointer 4215 * 4216 * The list of all the hardware IPs that make up the asic is walked and 4217 * the check_soft_reset callbacks are run. check_soft_reset determines 4218 * if the asic is still hung or not. 4219 * Returns true if any of the IPs are still in a hung state, false if not. 4220 */ 4221 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4222 { 4223 int i; 4224 bool asic_hang = false; 4225 4226 if (amdgpu_sriov_vf(adev)) 4227 return true; 4228 4229 if (amdgpu_asic_need_full_reset(adev)) 4230 return true; 4231 4232 for (i = 0; i < adev->num_ip_blocks; i++) { 4233 if (!adev->ip_blocks[i].status.valid) 4234 continue; 4235 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4236 adev->ip_blocks[i].status.hang = 4237 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4238 if (adev->ip_blocks[i].status.hang) { 4239 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4240 asic_hang = true; 4241 } 4242 } 4243 return asic_hang; 4244 } 4245 4246 /** 4247 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4248 * 4249 * @adev: amdgpu_device pointer 4250 * 4251 * The list of all the hardware IPs that make up the asic is walked and the 4252 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4253 * handles any IP specific hardware or software state changes that are 4254 * necessary for a soft reset to succeed. 
4255 * Returns 0 on success, negative error code on failure. 4256 */ 4257 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4258 { 4259 int i, r = 0; 4260 4261 for (i = 0; i < adev->num_ip_blocks; i++) { 4262 if (!adev->ip_blocks[i].status.valid) 4263 continue; 4264 if (adev->ip_blocks[i].status.hang && 4265 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4266 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4267 if (r) 4268 return r; 4269 } 4270 } 4271 4272 return 0; 4273 } 4274 4275 /** 4276 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4277 * 4278 * @adev: amdgpu_device pointer 4279 * 4280 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4281 * reset is necessary to recover. 4282 * Returns true if a full asic reset is required, false if not. 4283 */ 4284 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4285 { 4286 int i; 4287 4288 if (amdgpu_asic_need_full_reset(adev)) 4289 return true; 4290 4291 for (i = 0; i < adev->num_ip_blocks; i++) { 4292 if (!adev->ip_blocks[i].status.valid) 4293 continue; 4294 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4295 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4296 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4297 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4298 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4299 if (adev->ip_blocks[i].status.hang) { 4300 dev_info(adev->dev, "Some block need full reset!\n"); 4301 return true; 4302 } 4303 } 4304 } 4305 return false; 4306 } 4307 4308 /** 4309 * amdgpu_device_ip_soft_reset - do a soft reset 4310 * 4311 * @adev: amdgpu_device pointer 4312 * 4313 * The list of all the hardware IPs that make up the asic is walked and the 4314 * soft_reset callbacks are run if the block is hung. 
soft_reset handles any 4315 * IP specific hardware or software state changes that are necessary to soft 4316 * reset the IP. 4317 * Returns 0 on success, negative error code on failure. 4318 */ 4319 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4320 { 4321 int i, r = 0; 4322 4323 for (i = 0; i < adev->num_ip_blocks; i++) { 4324 if (!adev->ip_blocks[i].status.valid) 4325 continue; 4326 if (adev->ip_blocks[i].status.hang && 4327 adev->ip_blocks[i].version->funcs->soft_reset) { 4328 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4329 if (r) 4330 return r; 4331 } 4332 } 4333 4334 return 0; 4335 } 4336 4337 /** 4338 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4339 * 4340 * @adev: amdgpu_device pointer 4341 * 4342 * The list of all the hardware IPs that make up the asic is walked and the 4343 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4344 * handles any IP specific hardware or software state changes that are 4345 * necessary after the IP has been soft reset. 4346 * Returns 0 on success, negative error code on failure. 4347 */ 4348 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4349 { 4350 int i, r = 0; 4351 4352 for (i = 0; i < adev->num_ip_blocks; i++) { 4353 if (!adev->ip_blocks[i].status.valid) 4354 continue; 4355 if (adev->ip_blocks[i].status.hang && 4356 adev->ip_blocks[i].version->funcs->post_soft_reset) 4357 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4358 if (r) 4359 return r; 4360 } 4361 4362 return 0; 4363 } 4364 4365 /** 4366 * amdgpu_device_recover_vram - Recover some VRAM contents 4367 * 4368 * @adev: amdgpu_device pointer 4369 * 4370 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4371 * restore things like GPUVM page tables after a GPU reset where 4372 * the contents of VRAM might be lost. 4373 * 4374 * Returns: 4375 * 0 on success, negative error code on failure. 
4376 */ 4377 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4378 { 4379 struct dma_fence *fence = NULL, *next = NULL; 4380 struct amdgpu_bo *shadow; 4381 struct amdgpu_bo_vm *vmbo; 4382 long r = 1, tmo; 4383 4384 if (amdgpu_sriov_runtime(adev)) 4385 tmo = msecs_to_jiffies(8000); 4386 else 4387 tmo = msecs_to_jiffies(100); 4388 4389 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4390 mutex_lock(&adev->shadow_list_lock); 4391 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4392 shadow = &vmbo->bo; 4393 /* No need to recover an evicted BO */ 4394 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4395 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4396 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4397 continue; 4398 4399 r = amdgpu_bo_restore_shadow(shadow, &next); 4400 if (r) 4401 break; 4402 4403 if (fence) { 4404 tmo = dma_fence_wait_timeout(fence, false, tmo); 4405 dma_fence_put(fence); 4406 fence = next; 4407 if (tmo == 0) { 4408 r = -ETIMEDOUT; 4409 break; 4410 } else if (tmo < 0) { 4411 r = tmo; 4412 break; 4413 } 4414 } else { 4415 fence = next; 4416 } 4417 } 4418 mutex_unlock(&adev->shadow_list_lock); 4419 4420 if (fence) 4421 tmo = dma_fence_wait_timeout(fence, false, tmo); 4422 dma_fence_put(fence); 4423 4424 if (r < 0 || tmo <= 0) { 4425 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4426 return -EIO; 4427 } 4428 4429 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4430 return 0; 4431 } 4432 4433 4434 /** 4435 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4436 * 4437 * @adev: amdgpu_device pointer 4438 * @from_hypervisor: request from hypervisor 4439 * 4440 * do VF FLR and reinitialize Asic 4441 * return 0 means succeeded otherwise failed 4442 */ 4443 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4444 bool from_hypervisor) 4445 { 4446 int r; 4447 struct amdgpu_hive_info *hive = NULL; 4448 int retry_limit = 0; 
4449 4450 retry: 4451 amdgpu_amdkfd_pre_reset(adev); 4452 4453 amdgpu_amdkfd_pre_reset(adev); 4454 4455 if (from_hypervisor) 4456 r = amdgpu_virt_request_full_gpu(adev, true); 4457 else 4458 r = amdgpu_virt_reset_gpu(adev); 4459 if (r) 4460 return r; 4461 4462 /* Resume IP prior to SMC */ 4463 r = amdgpu_device_ip_reinit_early_sriov(adev); 4464 if (r) 4465 goto error; 4466 4467 amdgpu_virt_init_data_exchange(adev); 4468 4469 r = amdgpu_device_fw_loading(adev); 4470 if (r) 4471 return r; 4472 4473 /* now we are okay to resume SMC/CP/SDMA */ 4474 r = amdgpu_device_ip_reinit_late_sriov(adev); 4475 if (r) 4476 goto error; 4477 4478 hive = amdgpu_get_xgmi_hive(adev); 4479 /* Update PSP FW topology after reset */ 4480 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4481 r = amdgpu_xgmi_update_topology(hive, adev); 4482 4483 if (hive) 4484 amdgpu_put_xgmi_hive(hive); 4485 4486 if (!r) { 4487 amdgpu_irq_gpu_reset_resume_helper(adev); 4488 r = amdgpu_ib_ring_tests(adev); 4489 amdgpu_amdkfd_post_reset(adev); 4490 } 4491 4492 error: 4493 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4494 amdgpu_inc_vram_lost(adev); 4495 r = amdgpu_device_recover_vram(adev); 4496 } 4497 amdgpu_virt_release_full_gpu(adev, true); 4498 4499 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4500 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4501 retry_limit++; 4502 goto retry; 4503 } else 4504 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4505 } 4506 4507 return r; 4508 } 4509 4510 /** 4511 * amdgpu_device_has_job_running - check if there is any job in mirror list 4512 * 4513 * @adev: amdgpu_device pointer 4514 * 4515 * check if there is any job in mirror list 4516 */ 4517 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4518 { 4519 int i; 4520 struct drm_sched_job *job; 4521 4522 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4523 struct amdgpu_ring *ring = adev->rings[i]; 4524 4525 if (!ring || !ring->sched.thread) 4526 continue; 4527 4528 
spin_lock(&ring->sched.job_list_lock); 4529 job = list_first_entry_or_null(&ring->sched.pending_list, 4530 struct drm_sched_job, list); 4531 spin_unlock(&ring->sched.job_list_lock); 4532 if (job) 4533 return true; 4534 } 4535 return false; 4536 } 4537 4538 /** 4539 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4540 * 4541 * @adev: amdgpu_device pointer 4542 * 4543 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4544 * a hung GPU. 4545 */ 4546 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4547 { 4548 if (!amdgpu_device_ip_check_soft_reset(adev)) { 4549 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n"); 4550 return false; 4551 } 4552 4553 if (amdgpu_gpu_recovery == 0) 4554 goto disabled; 4555 4556 if (amdgpu_sriov_vf(adev)) 4557 return true; 4558 4559 if (amdgpu_gpu_recovery == -1) { 4560 switch (adev->asic_type) { 4561 #ifdef CONFIG_DRM_AMDGPU_SI 4562 case CHIP_VERDE: 4563 case CHIP_TAHITI: 4564 case CHIP_PITCAIRN: 4565 case CHIP_OLAND: 4566 case CHIP_HAINAN: 4567 #endif 4568 #ifdef CONFIG_DRM_AMDGPU_CIK 4569 case CHIP_KAVERI: 4570 case CHIP_KABINI: 4571 case CHIP_MULLINS: 4572 #endif 4573 case CHIP_CARRIZO: 4574 case CHIP_STONEY: 4575 case CHIP_CYAN_SKILLFISH: 4576 goto disabled; 4577 default: 4578 break; 4579 } 4580 } 4581 4582 return true; 4583 4584 disabled: 4585 dev_info(adev->dev, "GPU recovery disabled.\n"); 4586 return false; 4587 } 4588 4589 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4590 { 4591 u32 i; 4592 int ret = 0; 4593 4594 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4595 4596 dev_info(adev->dev, "GPU mode1 reset\n"); 4597 4598 /* disable BM */ 4599 pci_clear_master(adev->pdev); 4600 4601 amdgpu_device_cache_pci_state(adev->pdev); 4602 4603 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4604 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4605 ret = amdgpu_dpm_mode1_reset(adev); 4606 } else { 4607 dev_info(adev->dev, "GPU psp mode1 
reset\n"); 4608 ret = psp_gpu_reset(adev); 4609 } 4610 4611 if (ret) 4612 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4613 4614 amdgpu_device_load_pci_state(adev->pdev); 4615 4616 /* wait for asic to come out of reset */ 4617 for (i = 0; i < adev->usec_timeout; i++) { 4618 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4619 4620 if (memsize != 0xffffffff) 4621 break; 4622 udelay(1); 4623 } 4624 4625 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4626 return ret; 4627 } 4628 4629 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4630 struct amdgpu_reset_context *reset_context) 4631 { 4632 int i, r = 0; 4633 struct amdgpu_job *job = NULL; 4634 bool need_full_reset = 4635 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4636 4637 if (reset_context->reset_req_dev == adev) 4638 job = reset_context->job; 4639 4640 if (amdgpu_sriov_vf(adev)) { 4641 /* stop the data exchange thread */ 4642 amdgpu_virt_fini_data_exchange(adev); 4643 } 4644 4645 /* block all schedulers and reset given job's ring */ 4646 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4647 struct amdgpu_ring *ring = adev->rings[i]; 4648 4649 if (!ring || !ring->sched.thread) 4650 continue; 4651 4652 /*clear job fence from fence drv to avoid force_completion 4653 *leave NULL and vm flush fence in fence drv */ 4654 amdgpu_fence_driver_clear_job_fences(ring); 4655 4656 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4657 amdgpu_fence_driver_force_completion(ring); 4658 } 4659 4660 if (job && job->vm) 4661 drm_sched_increase_karma(&job->base); 4662 4663 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4664 /* If reset handler not implemented, continue; otherwise return */ 4665 if (r == -ENOSYS) 4666 r = 0; 4667 else 4668 return r; 4669 4670 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4671 if (!amdgpu_sriov_vf(adev)) { 4672 4673 if (!need_full_reset) 4674 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4675 
4676 if (!need_full_reset) { 4677 amdgpu_device_ip_pre_soft_reset(adev); 4678 r = amdgpu_device_ip_soft_reset(adev); 4679 amdgpu_device_ip_post_soft_reset(adev); 4680 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4681 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4682 need_full_reset = true; 4683 } 4684 } 4685 4686 if (need_full_reset) 4687 r = amdgpu_device_ip_suspend(adev); 4688 if (need_full_reset) 4689 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4690 else 4691 clear_bit(AMDGPU_NEED_FULL_RESET, 4692 &reset_context->flags); 4693 } 4694 4695 return r; 4696 } 4697 4698 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 4699 { 4700 uint32_t reg_value; 4701 int i; 4702 4703 lockdep_assert_held(&adev->reset_domain->sem); 4704 dump_stack(); 4705 4706 for (i = 0; i < adev->num_regs; i++) { 4707 reg_value = RREG32(adev->reset_dump_reg_list[i]); 4708 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], reg_value); 4709 } 4710 4711 return 0; 4712 } 4713 4714 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4715 struct amdgpu_reset_context *reset_context) 4716 { 4717 struct amdgpu_device *tmp_adev = NULL; 4718 bool need_full_reset, skip_hw_reset, vram_lost = false; 4719 int r = 0; 4720 4721 /* Try reset handler method first */ 4722 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4723 reset_list); 4724 amdgpu_reset_reg_dumps(tmp_adev); 4725 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4726 /* If reset handler not implemented, continue; otherwise return */ 4727 if (r == -ENOSYS) 4728 r = 0; 4729 else 4730 return r; 4731 4732 /* Reset handler not implemented, use the default method */ 4733 need_full_reset = 4734 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4735 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4736 4737 /* 4738 * ASIC reset has to be done on all XGMI hive nodes ASAP 4739 * to allow proper links negotiation in FW 
(within 1 sec) 4740 */ 4741 if (!skip_hw_reset && need_full_reset) { 4742 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4743 /* For XGMI run all resets in parallel to speed up the process */ 4744 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4745 tmp_adev->gmc.xgmi.pending_reset = false; 4746 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4747 r = -EALREADY; 4748 } else 4749 r = amdgpu_asic_reset(tmp_adev); 4750 4751 if (r) { 4752 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4753 r, adev_to_drm(tmp_adev)->unique); 4754 break; 4755 } 4756 } 4757 4758 /* For XGMI wait for all resets to complete before proceed */ 4759 if (!r) { 4760 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4761 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4762 flush_work(&tmp_adev->xgmi_reset_work); 4763 r = tmp_adev->asic_reset_res; 4764 if (r) 4765 break; 4766 } 4767 } 4768 } 4769 } 4770 4771 if (!r && amdgpu_ras_intr_triggered()) { 4772 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4773 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && 4774 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 4775 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 4776 } 4777 4778 amdgpu_ras_intr_cleared(); 4779 } 4780 4781 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4782 if (need_full_reset) { 4783 /* post card */ 4784 r = amdgpu_device_asic_init(tmp_adev); 4785 if (r) { 4786 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4787 } else { 4788 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4789 r = amdgpu_amdkfd_resume_iommu(tmp_adev); 4790 if (r) 4791 goto out; 4792 4793 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4794 if (r) 4795 goto out; 4796 4797 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4798 if (vram_lost) { 4799 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4800 amdgpu_inc_vram_lost(tmp_adev); 
4801 } 4802 4803 r = amdgpu_device_fw_loading(tmp_adev); 4804 if (r) 4805 return r; 4806 4807 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4808 if (r) 4809 goto out; 4810 4811 if (vram_lost) 4812 amdgpu_device_fill_reset_magic(tmp_adev); 4813 4814 /* 4815 * Add this ASIC as tracked as reset was already 4816 * complete successfully. 4817 */ 4818 amdgpu_register_gpu_instance(tmp_adev); 4819 4820 if (!reset_context->hive && 4821 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4822 amdgpu_xgmi_add_device(tmp_adev); 4823 4824 r = amdgpu_device_ip_late_init(tmp_adev); 4825 if (r) 4826 goto out; 4827 4828 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 4829 4830 /* 4831 * The GPU enters bad state once faulty pages 4832 * by ECC has reached the threshold, and ras 4833 * recovery is scheduled next. So add one check 4834 * here to break recovery if it indeed exceeds 4835 * bad page threshold, and remind user to 4836 * retire this GPU or setting one bigger 4837 * bad_page_threshold value to fix this once 4838 * probing driver again. 4839 */ 4840 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 4841 /* must succeed. 
*/ 4842 amdgpu_ras_resume(tmp_adev); 4843 } else { 4844 r = -EINVAL; 4845 goto out; 4846 } 4847 4848 /* Update PSP FW topology after reset */ 4849 if (reset_context->hive && 4850 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4851 r = amdgpu_xgmi_update_topology( 4852 reset_context->hive, tmp_adev); 4853 } 4854 } 4855 4856 out: 4857 if (!r) { 4858 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 4859 r = amdgpu_ib_ring_tests(tmp_adev); 4860 if (r) { 4861 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 4862 need_full_reset = true; 4863 r = -EAGAIN; 4864 goto end; 4865 } 4866 } 4867 4868 if (!r) 4869 r = amdgpu_device_recover_vram(tmp_adev); 4870 else 4871 tmp_adev->asic_reset_res = r; 4872 } 4873 4874 end: 4875 if (need_full_reset) 4876 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4877 else 4878 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4879 return r; 4880 } 4881 4882 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 4883 { 4884 4885 switch (amdgpu_asic_reset_method(adev)) { 4886 case AMD_RESET_METHOD_MODE1: 4887 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 4888 break; 4889 case AMD_RESET_METHOD_MODE2: 4890 adev->mp1_state = PP_MP1_STATE_RESET; 4891 break; 4892 default: 4893 adev->mp1_state = PP_MP1_STATE_NONE; 4894 break; 4895 } 4896 } 4897 4898 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 4899 { 4900 amdgpu_vf_error_trans_all(adev); 4901 adev->mp1_state = PP_MP1_STATE_NONE; 4902 } 4903 4904 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 4905 { 4906 struct pci_dev *p = NULL; 4907 4908 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4909 adev->pdev->bus->number, 1); 4910 if (p) { 4911 pm_runtime_enable(&(p->dev)); 4912 pm_runtime_resume(&(p->dev)); 4913 } 4914 } 4915 4916 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 4917 { 4918 enum amd_reset_method reset_method; 4919 struct pci_dev *p = NULL; 4920 u64 expires; 4921 4922 /* 
4923 * For now, only BACO and mode1 reset are confirmed 4924 * to suffer the audio issue without proper suspended. 4925 */ 4926 reset_method = amdgpu_asic_reset_method(adev); 4927 if ((reset_method != AMD_RESET_METHOD_BACO) && 4928 (reset_method != AMD_RESET_METHOD_MODE1)) 4929 return -EINVAL; 4930 4931 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4932 adev->pdev->bus->number, 1); 4933 if (!p) 4934 return -ENODEV; 4935 4936 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 4937 if (!expires) 4938 /* 4939 * If we cannot get the audio device autosuspend delay, 4940 * a fixed 4S interval will be used. Considering 3S is 4941 * the audio controller default autosuspend delay setting. 4942 * 4S used here is guaranteed to cover that. 4943 */ 4944 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 4945 4946 while (!pm_runtime_status_suspended(&(p->dev))) { 4947 if (!pm_runtime_suspend(&(p->dev))) 4948 break; 4949 4950 if (expires < ktime_get_mono_fast_ns()) { 4951 dev_warn(adev->dev, "failed to suspend display audio\n"); 4952 /* TODO: abort the succeeding gpu reset? 
*/ 4953 return -ETIMEDOUT; 4954 } 4955 } 4956 4957 pm_runtime_disable(&(p->dev)); 4958 4959 return 0; 4960 } 4961 4962 static void amdgpu_device_recheck_guilty_jobs( 4963 struct amdgpu_device *adev, struct list_head *device_list_handle, 4964 struct amdgpu_reset_context *reset_context) 4965 { 4966 int i, r = 0; 4967 4968 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4969 struct amdgpu_ring *ring = adev->rings[i]; 4970 int ret = 0; 4971 struct drm_sched_job *s_job; 4972 4973 if (!ring || !ring->sched.thread) 4974 continue; 4975 4976 s_job = list_first_entry_or_null(&ring->sched.pending_list, 4977 struct drm_sched_job, list); 4978 if (s_job == NULL) 4979 continue; 4980 4981 /* clear job's guilty and depend the folowing step to decide the real one */ 4982 drm_sched_reset_karma(s_job); 4983 /* for the real bad job, it will be resubmitted twice, adding a dma_fence_get 4984 * to make sure fence is balanced */ 4985 dma_fence_get(s_job->s_fence->parent); 4986 drm_sched_resubmit_jobs_ext(&ring->sched, 1); 4987 4988 ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout); 4989 if (ret == 0) { /* timeout */ 4990 DRM_ERROR("Found the real bad job! 
ring:%s, job_id:%llx\n", 4991 ring->sched.name, s_job->id); 4992 4993 /* set guilty */ 4994 drm_sched_increase_karma(s_job); 4995 retry: 4996 /* do hw reset */ 4997 if (amdgpu_sriov_vf(adev)) { 4998 amdgpu_virt_fini_data_exchange(adev); 4999 r = amdgpu_device_reset_sriov(adev, false); 5000 if (r) 5001 adev->asic_reset_res = r; 5002 } else { 5003 clear_bit(AMDGPU_SKIP_HW_RESET, 5004 &reset_context->flags); 5005 r = amdgpu_do_asic_reset(device_list_handle, 5006 reset_context); 5007 if (r && r == -EAGAIN) 5008 goto retry; 5009 } 5010 5011 /* 5012 * add reset counter so that the following 5013 * resubmitted job could flush vmid 5014 */ 5015 atomic_inc(&adev->gpu_reset_counter); 5016 continue; 5017 } 5018 5019 /* got the hw fence, signal finished fence */ 5020 atomic_dec(ring->sched.score); 5021 dma_fence_put(s_job->s_fence->parent); 5022 dma_fence_get(&s_job->s_fence->finished); 5023 dma_fence_signal(&s_job->s_fence->finished); 5024 dma_fence_put(&s_job->s_fence->finished); 5025 5026 /* remove node from list and free the job */ 5027 spin_lock(&ring->sched.job_list_lock); 5028 list_del_init(&s_job->list); 5029 spin_unlock(&ring->sched.job_list_lock); 5030 ring->sched.ops->free_job(s_job); 5031 } 5032 } 5033 5034 /** 5035 * amdgpu_device_gpu_recover_imp - reset the asic and recover scheduler 5036 * 5037 * @adev: amdgpu_device pointer 5038 * @job: which job trigger hang 5039 * 5040 * Attempt to reset the GPU if it has hung (all asics). 5041 * Attempt to do soft-reset or full-reset and reinitialize Asic 5042 * Returns 0 for success or an error on failure. 
5043 */ 5044 5045 int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev, 5046 struct amdgpu_job *job) 5047 { 5048 struct list_head device_list, *device_list_handle = NULL; 5049 bool job_signaled = false; 5050 struct amdgpu_hive_info *hive = NULL; 5051 struct amdgpu_device *tmp_adev = NULL; 5052 int i, r = 0; 5053 bool need_emergency_restart = false; 5054 bool audio_suspended = false; 5055 int tmp_vram_lost_counter; 5056 struct amdgpu_reset_context reset_context; 5057 5058 memset(&reset_context, 0, sizeof(reset_context)); 5059 5060 /* 5061 * Special case: RAS triggered and full reset isn't supported 5062 */ 5063 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5064 5065 /* 5066 * Flush RAM to disk so that after reboot 5067 * the user can read log and see why the system rebooted. 5068 */ 5069 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5070 DRM_WARN("Emergency reboot."); 5071 5072 ksys_sync_helper(); 5073 emergency_restart(); 5074 } 5075 5076 dev_info(adev->dev, "GPU %s begin!\n", 5077 need_emergency_restart ? "jobs stop":"reset"); 5078 5079 if (!amdgpu_sriov_vf(adev)) 5080 hive = amdgpu_get_xgmi_hive(adev); 5081 if (hive) 5082 mutex_lock(&hive->hive_lock); 5083 5084 reset_context.method = AMD_RESET_METHOD_NONE; 5085 reset_context.reset_req_dev = adev; 5086 reset_context.job = job; 5087 reset_context.hive = hive; 5088 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 5089 5090 /* 5091 * Build list of devices to reset. 5092 * In case we are in XGMI hive mode, resort the device list 5093 * to put adev in the 1st position. 
5094 */ 5095 INIT_LIST_HEAD(&device_list); 5096 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5097 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) 5098 list_add_tail(&tmp_adev->reset_list, &device_list); 5099 if (!list_is_first(&adev->reset_list, &device_list)) 5100 list_rotate_to_front(&adev->reset_list, &device_list); 5101 device_list_handle = &device_list; 5102 } else { 5103 list_add_tail(&adev->reset_list, &device_list); 5104 device_list_handle = &device_list; 5105 } 5106 5107 /* We need to lock reset domain only once both for XGMI and single device */ 5108 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5109 reset_list); 5110 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5111 5112 /* block all schedulers and reset given job's ring */ 5113 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5114 5115 amdgpu_device_set_mp1_state(tmp_adev); 5116 5117 /* 5118 * Try to put the audio codec into suspend state 5119 * before gpu reset started. 5120 * 5121 * Due to the power domain of the graphics device 5122 * is shared with AZ power domain. Without this, 5123 * we may change the audio hardware from behind 5124 * the audio driver's back. That will trigger 5125 * some audio codec errors. 
5126 */ 5127 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5128 audio_suspended = true; 5129 5130 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5131 5132 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5133 5134 if (!amdgpu_sriov_vf(tmp_adev)) 5135 amdgpu_amdkfd_pre_reset(tmp_adev); 5136 5137 /* 5138 * Mark these ASICs to be reseted as untracked first 5139 * And add them back after reset completed 5140 */ 5141 amdgpu_unregister_gpu_instance(tmp_adev); 5142 5143 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 5144 5145 /* disable ras on ALL IPs */ 5146 if (!need_emergency_restart && 5147 amdgpu_device_ip_need_full_reset(tmp_adev)) 5148 amdgpu_ras_suspend(tmp_adev); 5149 5150 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5151 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5152 5153 if (!ring || !ring->sched.thread) 5154 continue; 5155 5156 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5157 5158 if (need_emergency_restart) 5159 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5160 } 5161 atomic_inc(&tmp_adev->gpu_reset_counter); 5162 } 5163 5164 if (need_emergency_restart) 5165 goto skip_sched_resume; 5166 5167 /* 5168 * Must check guilty signal here since after this point all old 5169 * HW fences are force signaled. 5170 * 5171 * job->base holds a reference to parent fence 5172 */ 5173 if (job && job->base.s_fence->parent && 5174 dma_fence_is_signaled(job->base.s_fence->parent)) { 5175 job_signaled = true; 5176 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5177 goto skip_hw_reset; 5178 } 5179 5180 retry: /* Rest of adevs pre asic reset from XGMI hive. 
*/ 5181 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5182 r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context); 5183 /*TODO Should we stop ?*/ 5184 if (r) { 5185 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5186 r, adev_to_drm(tmp_adev)->unique); 5187 tmp_adev->asic_reset_res = r; 5188 } 5189 } 5190 5191 tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter)); 5192 /* Actual ASIC resets if needed.*/ 5193 /* Host driver will handle XGMI hive reset for SRIOV */ 5194 if (amdgpu_sriov_vf(adev)) { 5195 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5196 if (r) 5197 adev->asic_reset_res = r; 5198 } else { 5199 r = amdgpu_do_asic_reset(device_list_handle, &reset_context); 5200 if (r && r == -EAGAIN) 5201 goto retry; 5202 } 5203 5204 skip_hw_reset: 5205 5206 /* Post ASIC reset for all devs .*/ 5207 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5208 5209 /* 5210 * Sometimes a later bad compute job can block a good gfx job as gfx 5211 * and compute ring share internal GC HW mutually. We add an additional 5212 * guilty jobs recheck step to find the real guilty job, it synchronously 5213 * submits and pends for the first job being signaled. If it gets timeout, 5214 * we identify it as a real guilty job. 
5215 */ 5216 if (amdgpu_gpu_recovery == 2 && 5217 !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter))) 5218 amdgpu_device_recheck_guilty_jobs( 5219 tmp_adev, device_list_handle, &reset_context); 5220 5221 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5222 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5223 5224 if (!ring || !ring->sched.thread) 5225 continue; 5226 5227 /* No point to resubmit jobs if we didn't HW reset*/ 5228 if (!tmp_adev->asic_reset_res && !job_signaled) 5229 drm_sched_resubmit_jobs(&ring->sched); 5230 5231 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); 5232 } 5233 5234 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) { 5235 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5236 } 5237 5238 if (tmp_adev->asic_reset_res) 5239 r = tmp_adev->asic_reset_res; 5240 5241 tmp_adev->asic_reset_res = 0; 5242 5243 if (r) { 5244 /* bad news, how to tell it to userspace ? */ 5245 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5246 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5247 } else { 5248 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5249 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5250 DRM_WARN("smart shift update failed\n"); 5251 } 5252 } 5253 5254 skip_sched_resume: 5255 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5256 /* unlock kfd: SRIOV would do it separately */ 5257 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5258 amdgpu_amdkfd_post_reset(tmp_adev); 5259 5260 /* kfd_post_reset will do nothing if kfd device is not initialized, 5261 * need to bring up kfd here if it's not be initialized before 5262 */ 5263 if (!adev->kfd.init_complete) 5264 amdgpu_amdkfd_device_init(adev); 5265 5266 if (audio_suspended) 5267 amdgpu_device_resume_display_audio(tmp_adev); 5268 5269 amdgpu_device_unset_mp1_state(tmp_adev); 5270 } 5271 
5272 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5273 reset_list); 5274 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5275 5276 if (hive) { 5277 mutex_unlock(&hive->hive_lock); 5278 amdgpu_put_xgmi_hive(hive); 5279 } 5280 5281 if (r) 5282 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5283 return r; 5284 } 5285 5286 struct amdgpu_recover_work_struct { 5287 struct work_struct base; 5288 struct amdgpu_device *adev; 5289 struct amdgpu_job *job; 5290 int ret; 5291 }; 5292 5293 static void amdgpu_device_queue_gpu_recover_work(struct work_struct *work) 5294 { 5295 struct amdgpu_recover_work_struct *recover_work = container_of(work, struct amdgpu_recover_work_struct, base); 5296 5297 recover_work->ret = amdgpu_device_gpu_recover_imp(recover_work->adev, recover_work->job); 5298 } 5299 /* 5300 * Serialize gpu recover into reset domain single threaded wq 5301 */ 5302 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5303 struct amdgpu_job *job) 5304 { 5305 struct amdgpu_recover_work_struct work = {.adev = adev, .job = job}; 5306 5307 INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work); 5308 5309 if (!amdgpu_reset_domain_schedule(adev->reset_domain, &work.base)) 5310 return -EAGAIN; 5311 5312 flush_work(&work.base); 5313 5314 return work.ret; 5315 } 5316 5317 /** 5318 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 5319 * 5320 * @adev: amdgpu_device pointer 5321 * 5322 * Fetchs and stores in the driver the PCIE capabilities (gen speed 5323 * and lanes) of the slot the device is in. Handles APUs and 5324 * virtualized environments where PCIE config space may not be available. 
5325 */ 5326 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5327 { 5328 struct pci_dev *pdev; 5329 enum pci_bus_speed speed_cap, platform_speed_cap; 5330 enum pcie_link_width platform_link_width; 5331 5332 if (amdgpu_pcie_gen_cap) 5333 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5334 5335 if (amdgpu_pcie_lane_cap) 5336 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5337 5338 /* covers APUs as well */ 5339 if (pci_is_root_bus(adev->pdev->bus)) { 5340 if (adev->pm.pcie_gen_mask == 0) 5341 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5342 if (adev->pm.pcie_mlw_mask == 0) 5343 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5344 return; 5345 } 5346 5347 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5348 return; 5349 5350 pcie_bandwidth_available(adev->pdev, NULL, 5351 &platform_speed_cap, &platform_link_width); 5352 5353 if (adev->pm.pcie_gen_mask == 0) { 5354 /* asic caps */ 5355 pdev = adev->pdev; 5356 speed_cap = pcie_get_speed_cap(pdev); 5357 if (speed_cap == PCI_SPEED_UNKNOWN) { 5358 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5359 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5360 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5361 } else { 5362 if (speed_cap == PCIE_SPEED_32_0GT) 5363 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5364 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5365 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5366 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5367 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5368 else if (speed_cap == PCIE_SPEED_16_0GT) 5369 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5370 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5371 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5372 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5373 else if (speed_cap == PCIE_SPEED_8_0GT) 5374 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5375 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5376 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5377 else if 
(speed_cap == PCIE_SPEED_5_0GT) 5378 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5379 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5380 else 5381 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5382 } 5383 /* platform caps */ 5384 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5385 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5386 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5387 } else { 5388 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5389 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5390 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5391 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5392 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5393 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5394 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5395 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5396 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5397 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5398 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5399 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5400 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5401 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5402 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5403 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5404 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5405 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5406 else 5407 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5408 5409 } 5410 } 5411 if (adev->pm.pcie_mlw_mask == 0) { 5412 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 5413 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5414 } else { 5415 switch (platform_link_width) { 5416 case PCIE_LNK_X32: 5417 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5418 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5419 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5420 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5421 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5422 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5423 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5424 break; 5425 case PCIE_LNK_X16: 
5426 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5427 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5428 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5429 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5430 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5431 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5432 break; 5433 case PCIE_LNK_X12: 5434 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5435 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5436 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5437 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5438 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5439 break; 5440 case PCIE_LNK_X8: 5441 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5442 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5443 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5444 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5445 break; 5446 case PCIE_LNK_X4: 5447 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5448 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5449 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5450 break; 5451 case PCIE_LNK_X2: 5452 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5453 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5454 break; 5455 case PCIE_LNK_X1: 5456 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5457 break; 5458 default: 5459 break; 5460 } 5461 } 5462 } 5463 } 5464 5465 int amdgpu_device_baco_enter(struct drm_device *dev) 5466 { 5467 struct amdgpu_device *adev = drm_to_adev(dev); 5468 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5469 5470 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5471 return -ENOTSUPP; 5472 5473 if (ras && adev->ras_enabled && 5474 adev->nbio.funcs->enable_doorbell_interrupt) 5475 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5476 5477 return amdgpu_dpm_baco_enter(adev); 5478 } 5479 5480 int amdgpu_device_baco_exit(struct drm_device *dev) 5481 { 5482 struct amdgpu_device *adev = drm_to_adev(dev); 5483 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5484 int ret = 0; 5485 5486 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5487 return -ENOTSUPP; 5488 5489 ret = 
amdgpu_dpm_baco_exit(adev); 5490 if (ret) 5491 return ret; 5492 5493 if (ras && adev->ras_enabled && 5494 adev->nbio.funcs->enable_doorbell_interrupt) 5495 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5496 5497 if (amdgpu_passthrough(adev) && 5498 adev->nbio.funcs->clear_doorbell_interrupt) 5499 adev->nbio.funcs->clear_doorbell_interrupt(adev); 5500 5501 return 0; 5502 } 5503 5504 /** 5505 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5506 * @pdev: PCI device struct 5507 * @state: PCI channel state 5508 * 5509 * Description: Called when a PCI error is detected. 5510 * 5511 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 5512 */ 5513 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 5514 { 5515 struct drm_device *dev = pci_get_drvdata(pdev); 5516 struct amdgpu_device *adev = drm_to_adev(dev); 5517 int i; 5518 5519 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 5520 5521 if (adev->gmc.xgmi.num_physical_nodes > 1) { 5522 DRM_WARN("No support for XGMI hive yet..."); 5523 return PCI_ERS_RESULT_DISCONNECT; 5524 } 5525 5526 adev->pci_channel_state = state; 5527 5528 switch (state) { 5529 case pci_channel_io_normal: 5530 return PCI_ERS_RESULT_CAN_RECOVER; 5531 /* Fatal error, prepare for slot reset */ 5532 case pci_channel_io_frozen: 5533 /* 5534 * Locking adev->reset_domain->sem will prevent any external access 5535 * to GPU during PCI error recovery 5536 */ 5537 amdgpu_device_lock_reset_domain(adev->reset_domain); 5538 amdgpu_device_set_mp1_state(adev); 5539 5540 /* 5541 * Block any work scheduling as we do for regular GPU reset 5542 * for the duration of the recovery 5543 */ 5544 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5545 struct amdgpu_ring *ring = adev->rings[i]; 5546 5547 if (!ring || !ring->sched.thread) 5548 continue; 5549 5550 drm_sched_stop(&ring->sched, NULL); 5551 } 5552 atomic_inc(&adev->gpu_reset_counter); 5553 return PCI_ERS_RESULT_NEED_RESET; 5554 
case pci_channel_io_perm_failure: 5555 /* Permanent error, prepare for device removal */ 5556 return PCI_ERS_RESULT_DISCONNECT; 5557 } 5558 5559 return PCI_ERS_RESULT_NEED_RESET; 5560 } 5561 5562 /** 5563 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 5564 * @pdev: pointer to PCI device 5565 */ 5566 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 5567 { 5568 5569 DRM_INFO("PCI error: mmio enabled callback!!\n"); 5570 5571 /* TODO - dump whatever for debugging purposes */ 5572 5573 /* This called only if amdgpu_pci_error_detected returns 5574 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 5575 * works, no need to reset slot. 5576 */ 5577 5578 return PCI_ERS_RESULT_RECOVERED; 5579 } 5580 5581 /** 5582 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 5583 * @pdev: PCI device struct 5584 * 5585 * Description: This routine is called by the pci error recovery 5586 * code after the PCI slot has been reset, just before we 5587 * should resume normal operations. 
5588 */ 5589 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 5590 { 5591 struct drm_device *dev = pci_get_drvdata(pdev); 5592 struct amdgpu_device *adev = drm_to_adev(dev); 5593 int r, i; 5594 struct amdgpu_reset_context reset_context; 5595 u32 memsize; 5596 struct list_head device_list; 5597 5598 DRM_INFO("PCI error: slot reset callback!!\n"); 5599 5600 memset(&reset_context, 0, sizeof(reset_context)); 5601 5602 INIT_LIST_HEAD(&device_list); 5603 list_add_tail(&adev->reset_list, &device_list); 5604 5605 /* wait for asic to come out of reset */ 5606 msleep(500); 5607 5608 /* Restore PCI confspace */ 5609 amdgpu_device_load_pci_state(pdev); 5610 5611 /* confirm ASIC came out of reset */ 5612 for (i = 0; i < adev->usec_timeout; i++) { 5613 memsize = amdgpu_asic_get_config_memsize(adev); 5614 5615 if (memsize != 0xffffffff) 5616 break; 5617 udelay(1); 5618 } 5619 if (memsize == 0xffffffff) { 5620 r = -ETIME; 5621 goto out; 5622 } 5623 5624 reset_context.method = AMD_RESET_METHOD_NONE; 5625 reset_context.reset_req_dev = adev; 5626 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 5627 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 5628 5629 adev->no_hw_access = true; 5630 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 5631 adev->no_hw_access = false; 5632 if (r) 5633 goto out; 5634 5635 r = amdgpu_do_asic_reset(&device_list, &reset_context); 5636 5637 out: 5638 if (!r) { 5639 if (amdgpu_device_cache_pci_state(adev->pdev)) 5640 pci_restore_state(adev->pdev); 5641 5642 DRM_INFO("PCIe error recovery succeeded\n"); 5643 } else { 5644 DRM_ERROR("PCIe error recovery failed, err:%d", r); 5645 amdgpu_device_unset_mp1_state(adev); 5646 amdgpu_device_unlock_reset_domain(adev->reset_domain); 5647 } 5648 5649 return r ? 
PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 5650 } 5651 5652 /** 5653 * amdgpu_pci_resume() - resume normal ops after PCI reset 5654 * @pdev: pointer to PCI device 5655 * 5656 * Called when the error recovery driver tells us that its 5657 * OK to resume normal operation. 5658 */ 5659 void amdgpu_pci_resume(struct pci_dev *pdev) 5660 { 5661 struct drm_device *dev = pci_get_drvdata(pdev); 5662 struct amdgpu_device *adev = drm_to_adev(dev); 5663 int i; 5664 5665 5666 DRM_INFO("PCI error: resume callback!!\n"); 5667 5668 /* Only continue execution for the case of pci_channel_io_frozen */ 5669 if (adev->pci_channel_state != pci_channel_io_frozen) 5670 return; 5671 5672 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5673 struct amdgpu_ring *ring = adev->rings[i]; 5674 5675 if (!ring || !ring->sched.thread) 5676 continue; 5677 5678 5679 drm_sched_resubmit_jobs(&ring->sched); 5680 drm_sched_start(&ring->sched, true); 5681 } 5682 5683 amdgpu_device_unset_mp1_state(adev); 5684 amdgpu_device_unlock_reset_domain(adev->reset_domain); 5685 } 5686 5687 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 5688 { 5689 struct drm_device *dev = pci_get_drvdata(pdev); 5690 struct amdgpu_device *adev = drm_to_adev(dev); 5691 int r; 5692 5693 r = pci_save_state(pdev); 5694 if (!r) { 5695 kfree(adev->pci_state); 5696 5697 adev->pci_state = pci_store_saved_state(pdev); 5698 5699 if (!adev->pci_state) { 5700 DRM_ERROR("Failed to store PCI saved state"); 5701 return false; 5702 } 5703 } else { 5704 DRM_WARN("Failed to save PCI state, err:%d\n", r); 5705 return false; 5706 } 5707 5708 return true; 5709 } 5710 5711 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 5712 { 5713 struct drm_device *dev = pci_get_drvdata(pdev); 5714 struct amdgpu_device *adev = drm_to_adev(dev); 5715 int r; 5716 5717 if (!adev->pci_state) 5718 return false; 5719 5720 r = pci_load_saved_state(pdev, adev->pci_state); 5721 5722 if (!r) { 5723 pci_restore_state(pdev); 5724 } else { 5725 DRM_WARN("Failed 
to load PCI state, err:%d\n", r); 5726 return false; 5727 } 5728 5729 return true; 5730 } 5731 5732 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 5733 struct amdgpu_ring *ring) 5734 { 5735 #ifdef CONFIG_X86_64 5736 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 5737 return; 5738 #endif 5739 if (adev->gmc.xgmi.connected_to_cpu) 5740 return; 5741 5742 if (ring && ring->funcs->emit_hdp_flush) 5743 amdgpu_ring_emit_hdp_flush(ring); 5744 else 5745 amdgpu_asic_flush_hdp(adev, ring); 5746 } 5747 5748 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 5749 struct amdgpu_ring *ring) 5750 { 5751 #ifdef CONFIG_X86_64 5752 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 5753 return; 5754 #endif 5755 if (adev->gmc.xgmi.connected_to_cpu) 5756 return; 5757 5758 amdgpu_asic_invalidate_hdp(adev, ring); 5759 } 5760 5761 int amdgpu_in_reset(struct amdgpu_device *adev) 5762 { 5763 return atomic_read(&adev->reset_domain->in_gpu_reset); 5764 } 5765 5766 /** 5767 * amdgpu_device_halt() - bring hardware to some kind of halt state 5768 * 5769 * @adev: amdgpu_device pointer 5770 * 5771 * Bring hardware to some kind of halt state so that no one can touch it 5772 * any more. It will help to maintain error context when error occurred. 5773 * Compare to a simple hang, the system will keep stable at least for SSH 5774 * access. Then it should be trivial to inspect the hardware state and 5775 * see what's going on. Implemented as following: 5776 * 5777 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc), 5778 * clears all CPU mappings to device, disallows remappings through page faults 5779 * 2. amdgpu_irq_disable_all() disables all interrupts 5780 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 5781 * 4. set adev->no_hw_access to avoid potential crashes after setp 5 5782 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings 5783 * 6. 
pci_disable_device() and pci_wait_for_pending_transaction() 5784 * flush any in flight DMA operations 5785 */ 5786 void amdgpu_device_halt(struct amdgpu_device *adev) 5787 { 5788 struct pci_dev *pdev = adev->pdev; 5789 struct drm_device *ddev = adev_to_drm(adev); 5790 5791 drm_dev_unplug(ddev); 5792 5793 amdgpu_irq_disable_all(adev); 5794 5795 amdgpu_fence_driver_hw_fini(adev); 5796 5797 adev->no_hw_access = true; 5798 5799 amdgpu_device_unmap_mmio(adev); 5800 5801 pci_disable_device(pdev); 5802 pci_wait_for_pending_transaction(pdev); 5803 } 5804 5805 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, 5806 u32 reg) 5807 { 5808 unsigned long flags, address, data; 5809 u32 r; 5810 5811 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 5812 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 5813 5814 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 5815 WREG32(address, reg * 4); 5816 (void)RREG32(address); 5817 r = RREG32(data); 5818 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 5819 return r; 5820 } 5821 5822 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, 5823 u32 reg, u32 v) 5824 { 5825 unsigned long flags, address, data; 5826 5827 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 5828 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 5829 5830 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 5831 WREG32(address, reg * 4); 5832 (void)RREG32(address); 5833 WREG32(data, v); 5834 (void)RREG32(data); 5835 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 5836 } 5837