/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/devcoredump.h>
#include <generated/utsrelease.h>
#include <linux/pci-p2pdma.h>

#include <drm/drm_aperture.h>
#include <drm/drm_atomic_helper.h>
#include <drm/drm_fb_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
		amdgpu_device_get_pcie_replay_count, NULL);

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * DOC: product_name
 *
 * The amdgpu driver provides a sysfs API for reporting the product name
 * for the device.
 * The file product_name is used for this and returns the product name
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_name(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->product_name);
}

static DEVICE_ATTR(product_name, S_IRUGO,
		amdgpu_device_get_product_name, NULL);

/**
 * DOC: product_number
 *
 * The amdgpu driver provides a sysfs API for reporting the part number
 * for the device.
 * The file product_number is used for this and returns the part number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->product_number);
}

static DEVICE_ATTR(product_number, S_IRUGO,
		amdgpu_device_get_product_number, NULL);

/**
 * DOC: serial_number
 *
 * The amdgpu driver provides a sysfs API for reporting the serial number
 * for the device.
 * The file serial_number is used for this and returns the serial number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_serial_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->serial);
}

static DEVICE_ATTR(serial_number, S_IRUGO,
		amdgpu_device_get_serial_number, NULL);

/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise return false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}
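
/*
 * Illustrative sketch (not part of the driver): how a hypothetical in-kernel
 * caller would read a few dwords of VRAM with the helper above. The offset
 * and buffer are made up for the example; real callers include the debugfs
 * VRAM interface and RAS paths.
 *
 *	uint32_t data[4];
 *
 *	// read 16 bytes starting at VRAM offset 0x1000
 *	amdgpu_device_vram_access(adev, 0x1000, data, sizeof(data), false);
 *
 * If the CPU-visible VRAM aperture covers the range, amdgpu_device_aper_access()
 * services it with memcpy_fromio(); anything beyond the aperture falls back to
 * the MM_INDEX/MM_DATA window in amdgpu_device_mm_access().
 */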

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: bytes offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: bytes offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 *
 * This function is invoked only for the debugfs register access
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_mm_rdoorbell - read a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (CIK).
 */
u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return readl(adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell - write a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (CIK).
 */
void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (index < adev->doorbell.num_doorbells) {
		writel(v, adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (index < adev->doorbell.num_doorbells) {
		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 pcie_index, u32 pcie_data,
				u32 reg_addr)
{
	unsigned long flags;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 pcie_index, u32 pcie_data,
				  u32 reg_addr)
{
	unsigned long flags;
	u64 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 pcie_index, u32 pcie_data,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
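
/*
 * Illustrative sketch (not part of the driver): soc-level code typically wires
 * these helpers into the adev->pcie_rreg/pcie_wreg callbacks, supplying the
 * asic's index/data register pair. The offsets 0x38/0x3c below are placeholders
 * invented for the example, not real hardware values.
 *
 *	static u64 example_pcie_rreg64(struct amdgpu_device *adev, u32 reg)
 *	{
 *		return amdgpu_device_indirect_rreg64(adev, 0x38, 0x3c, reg);
 *	}
 *
 * The pcie_idx_lock spinlock in the helpers serializes the index write and the
 * data access so two CPUs cannot interleave their use of the shared pair.
 */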

/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 pcie_index, u32 pcie_data,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	amdgpu_asic_pre_asic_init(adev);

	if (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0))
		return amdgpu_atomfirmware_asic_init(adev, true);
	else
		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}
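
/*
 * Illustrative sketch (not part of the driver): the array consumed by
 * amdgpu_device_program_register_sequence() is a flat list of
 * {register, and_mask, or_mask} triplets. The register names and values below
 * are invented for the example; real golden settings live in the per-asic
 * soc files.
 *
 *	static const u32 example_golden_settings[] = {
 *		// and_mask == 0xffffffff: or_mask is written directly
 *		mmEXAMPLE_REG_A, 0xffffffff, 0x00000100,
 *		// otherwise: read-modify-write of the masked bits
 *		mmEXAMPLE_REG_B, 0x0000000f, 0x00000002,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *						ARRAY_SIZE(example_golden_settings));
 */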

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * GPU doorbell aperture helper functions.
 */
/**
 * amdgpu_device_doorbell_init - Init doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Init doorbell driver information (CIK)
 * Returns 0 on success, error on failure.
 */
static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
{

	/* No doorbell on SI hardware generation */
	if (adev->asic_type < CHIP_BONAIRE) {
		adev->doorbell.base = 0;
		adev->doorbell.size = 0;
		adev->doorbell.num_doorbells = 0;
		adev->doorbell.ptr = NULL;
		return 0;
	}

	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
		return -EINVAL;

	amdgpu_asic_init_doorbell_index(adev);

	/* doorbell bar mapping */
	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
	adev->doorbell.size = pci_resource_len(adev->pdev, 2);

	if (adev->enable_mes) {
		adev->doorbell.num_doorbells =
			adev->doorbell.size / sizeof(u32);
	} else {
		adev->doorbell.num_doorbells =
			min_t(u32, adev->doorbell.size / sizeof(u32),
			      adev->doorbell_index.max_assignment + 1);
		if (adev->doorbell.num_doorbells == 0)
			return -EINVAL;

		/* For Vega, reserve and map two pages on doorbell BAR since SDMA
		 * paging queue doorbell use the second page. The
		 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
		 * doorbells are in the first page. So with paging queue enabled,
		 * the max num_doorbells should + 1 page (0x400 in dword)
		 */
		if (adev->asic_type >= CHIP_VEGA10)
			adev->doorbell.num_doorbells += 0x400;
	}

	adev->doorbell.ptr = ioremap(adev->doorbell.base,
				     adev->doorbell.num_doorbells *
				     sizeof(u32));
	if (adev->doorbell.ptr == NULL)
		return -ENOMEM;

	return 0;
}

/**
 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down doorbell driver information (CIK)
 */
static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
{
	iounmap(adev->doorbell.ptr);
	adev->doorbell.ptr = NULL;
}



/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}
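
/*
 * Illustrative sketch (not part of the driver): typical lifecycle of a
 * writeback slot as used by ring and fence code. The returned index is a
 * dword offset (each allocated slot is 256 bits, hence the << 3 above).
 * The variable names below are made up for the example.
 *
 *	u32 wb;
 *	int r = amdgpu_device_wb_get(adev, &wb);
 *
 *	if (r)
 *		return r;
 *	// CPU view and GPU address of the slot
 *	volatile u32 *cpu_ptr = &adev->wb.wb[wb];
 *	u64 gpu_addr = adev->wb.gpu_addr + (wb * 4);
 *	...
 *	amdgpu_device_wb_free(adev, wb);
 */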

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned i;
	u16 cmd;
	int r;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_device_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_device_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if post is needed or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old smc fw still need the driver to do vPost, otherwise the gpu hangs,
		 * while smc fw versions above 22.15 don't have this flaw, so we force
		 * vpost to be executed for smc versions below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if an error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->gmc.xgmi.pending_reset)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	return pcie_aspm_enabled(adev->pdev);
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
		bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines number of bits in page table versus page directory,
 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
 * page table and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if ((adev->pdev->device == 0x13FE) ||
		    (adev->pdev->device == 0x143F))
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		break;
	}

	return 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater than or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater than or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
		dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
		amdgpu_reset_method = -1;
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	return 0;
}

/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(pdev, PCI_D0);
		amdgpu_device_load_pci_state(pdev);
		r = pci_enable_device(pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
	} else {
		pr_info("switched off\n");
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(pdev);
		/* Shut down the device */
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u64 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}
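
/*
 * Illustrative sketch (not part of the driver): callers identify an IP by
 * its amd_ip_block_type and let the helpers above walk adev->ip_blocks.
 * The GFX block is just an example choice here.
 *
 *	if (!amdgpu_device_ip_is_idle(adev, AMD_IP_BLOCK_TYPE_GFX)) {
 *		int r = amdgpu_device_ip_wait_for_idle(adev, AMD_IP_BLOCK_TYPE_GFX);
 *
 *		if (r)
 *			dev_err(adev->dev, "GFX did not go idle (%d)\n", r);
 *	}
 */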

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if equal or greater
 * return 1 if smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	switch (ip_block_version->type) {
	case AMD_IP_BLOCK_TYPE_VCN:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
			return 0;
		break;
	case AMD_IP_BLOCK_TYPE_JPEG:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
			return 0;
		break;
	default:
		break;
	}

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}
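
/*
 * Illustrative sketch (not part of the driver): because
 * amdgpu_device_ip_block_version_cmp() returns 0 for "equal or newer" and 1
 * otherwise, version gates read as below. The IP type and version numbers
 * are made up for the example.
 *
 *	if (amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_SMC,
 *					       11, 0) == 0) {
 *		// the SMC IP block is present at version 11.0 or newer
 *	}
 */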

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		const char *pci_address_name = pci_name(adev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}

void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
		adev->mode_info.num_crtc = 1;
		adev->enable_virtual_display = true;
		DRM_INFO("virtual_display:%d, num_crtc:%d\n",
			 adev->enable_virtual_display, adev->mode_info.num_crtc);
	}
}

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	char fw_name[40];
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;
1957 */ 1958 if (adev->asic_type != CHIP_NAVI12) 1959 return 0; 1960 } 1961 1962 switch (adev->asic_type) { 1963 default: 1964 return 0; 1965 case CHIP_VEGA10: 1966 chip_name = "vega10"; 1967 break; 1968 case CHIP_VEGA12: 1969 chip_name = "vega12"; 1970 break; 1971 case CHIP_RAVEN: 1972 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1973 chip_name = "raven2"; 1974 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1975 chip_name = "picasso"; 1976 else 1977 chip_name = "raven"; 1978 break; 1979 case CHIP_ARCTURUS: 1980 chip_name = "arcturus"; 1981 break; 1982 case CHIP_NAVI12: 1983 chip_name = "navi12"; 1984 break; 1985 } 1986 1987 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 1988 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name); 1989 if (err) { 1990 dev_err(adev->dev, 1991 "Failed to get gpu_info firmware \"%s\"\n", 1992 fw_name); 1993 goto out; 1994 } 1995 1996 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 1997 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 1998 1999 switch (hdr->version_major) { 2000 case 1: 2001 { 2002 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2003 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2004 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2005 2006 /* 2007 * Should be droped when DAL no longer needs it. 2008 */ 2009 if (adev->asic_type == CHIP_NAVI12) 2010 goto parse_soc_bounding_box; 2011 2012 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2013 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2014 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2015 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2016 adev->gfx.config.max_texture_channel_caches = 2017 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2018 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2019 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2020 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2021 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2022 adev->gfx.config.double_offchip_lds_buf = 2023 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2024 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2025 adev->gfx.cu_info.max_waves_per_simd = 2026 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2027 adev->gfx.cu_info.max_scratch_slots_per_cu = 2028 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2029 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2030 if (hdr->version_minor >= 1) { 2031 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2032 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2033 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2034 adev->gfx.config.num_sc_per_sh = 2035 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2036 adev->gfx.config.num_packer_per_sc = 2037 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2038 } 2039 2040 parse_soc_bounding_box: 2041 /* 2042 * soc bounding box info is not integrated in disocovery table, 2043 * we always need to parse it from gpu info firmware if needed. 
2044 */ 2045 if (hdr->version_minor == 2) { 2046 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2047 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2048 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2049 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2050 } 2051 break; 2052 } 2053 default: 2054 dev_err(adev->dev, 2055 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2056 err = -EINVAL; 2057 goto out; 2058 } 2059 out: 2060 return err; 2061 } 2062 2063 /** 2064 * amdgpu_device_ip_early_init - run early init for hardware IPs 2065 * 2066 * @adev: amdgpu_device pointer 2067 * 2068 * Early initialization pass for hardware IPs. The hardware IPs that make 2069 * up each asic are discovered each IP's early_init callback is run. This 2070 * is the first stage in initializing the asic. 2071 * Returns 0 on success, negative error code on failure. 2072 */ 2073 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2074 { 2075 struct drm_device *dev = adev_to_drm(adev); 2076 struct pci_dev *parent; 2077 int i, r; 2078 2079 amdgpu_device_enable_virtual_display(adev); 2080 2081 if (amdgpu_sriov_vf(adev)) { 2082 r = amdgpu_virt_request_full_gpu(adev, true); 2083 if (r) 2084 return r; 2085 } 2086 2087 switch (adev->asic_type) { 2088 #ifdef CONFIG_DRM_AMDGPU_SI 2089 case CHIP_VERDE: 2090 case CHIP_TAHITI: 2091 case CHIP_PITCAIRN: 2092 case CHIP_OLAND: 2093 case CHIP_HAINAN: 2094 adev->family = AMDGPU_FAMILY_SI; 2095 r = si_set_ip_blocks(adev); 2096 if (r) 2097 return r; 2098 break; 2099 #endif 2100 #ifdef CONFIG_DRM_AMDGPU_CIK 2101 case CHIP_BONAIRE: 2102 case CHIP_HAWAII: 2103 case CHIP_KAVERI: 2104 case CHIP_KABINI: 2105 case CHIP_MULLINS: 2106 if (adev->flags & AMD_IS_APU) 2107 adev->family = AMDGPU_FAMILY_KV; 2108 else 2109 adev->family = AMDGPU_FAMILY_CI; 2110 2111 r = cik_set_ip_blocks(adev); 2112 if (r) 2113 return r; 2114 break; 2115 #endif 2116 case CHIP_TOPAZ: 2117 case CHIP_TONGA: 2118 case CHIP_FIJI: 2119 case CHIP_POLARIS10: 2120 case CHIP_POLARIS11: 2121 case CHIP_POLARIS12: 2122 case CHIP_VEGAM: 2123 case CHIP_CARRIZO: 2124 case CHIP_STONEY: 2125 if (adev->flags & AMD_IS_APU) 2126 adev->family = AMDGPU_FAMILY_CZ; 2127 else 2128 adev->family = AMDGPU_FAMILY_VI; 2129 2130 r = vi_set_ip_blocks(adev); 2131 if (r) 2132 return r; 2133 break; 2134 default: 2135 r = amdgpu_discovery_set_ip_blocks(adev); 2136 if (r) 2137 return r; 2138 break; 2139 } 2140 2141 if (amdgpu_has_atpx() && 2142 (amdgpu_is_atpx_hybrid() || 2143 amdgpu_has_atpx_dgpu_power_cntl()) && 2144 ((adev->flags & AMD_IS_APU) == 0) && 2145 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev))) 2146 adev->flags |= AMD_IS_PX; 2147 2148 if (!(adev->flags & AMD_IS_APU)) { 2149 parent = pci_upstream_bridge(adev->pdev); 2150 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2151 } 2152 2153 amdgpu_amdkfd_device_probe(adev); 2154 2155 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2156 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2157 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2158 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2159 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2160 2161 for (i = 0; i < adev->num_ip_blocks; i++) { 2162 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2163 DRM_ERROR("disabled ip block: %d <%s>\n", 2164 i, adev->ip_blocks[i].version->funcs->name); 2165 adev->ip_blocks[i].status.valid = false; 2166 } else { 2167 if (adev->ip_blocks[i].version->funcs->early_init) { 2168 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2169 if (r == -ENOENT) { 2170 adev->ip_blocks[i].status.valid = false; 2171 } else if (r) { 2172 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2173 adev->ip_blocks[i].version->funcs->name, r); 2174 return r; 2175 } else { 2176 adev->ip_blocks[i].status.valid = true; 2177 } 2178 } else { 2179 adev->ip_blocks[i].status.valid = true; 2180 } 2181 } 2182 /* get the vbios after the asic_funcs are set up */ 2183 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2184 r = amdgpu_device_parse_gpu_info_fw(adev); 2185 if (r) 2186 return r; 2187 2188 /* Read BIOS */ 2189 if (!amdgpu_get_bios(adev)) 2190 return -EINVAL; 2191 2192 r = amdgpu_atombios_init(adev); 2193 if (r) { 2194 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2195 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2196 return r; 2197 } 2198 2199 /*get pf2vf msg info at it's earliest time*/ 2200 if (amdgpu_sriov_vf(adev)) 2201 amdgpu_virt_init_data_exchange(adev); 2202 2203 } 2204 } 2205 2206 adev->cg_flags &= amdgpu_cg_mask; 2207 adev->pg_flags &= amdgpu_pg_mask; 2208 2209 return 0; 2210 } 2211 2212 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2213 { 2214 int i, r; 2215 2216 for (i = 0; i < adev->num_ip_blocks; i++) { 2217 if (!adev->ip_blocks[i].status.sw) 2218 continue; 2219 if (adev->ip_blocks[i].status.hw) 2220 continue; 2221 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2222 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2223 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2224 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2225 if (r) { 2226 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2227 adev->ip_blocks[i].version->funcs->name, r); 2228 return r; 2229 } 2230 adev->ip_blocks[i].status.hw = true; 2231 } 2232 } 2233 2234 return 0; 2235 } 2236 2237 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2238 { 2239 int i, r; 2240 2241 for (i = 0; i < adev->num_ip_blocks; i++) { 2242 if (!adev->ip_blocks[i].status.sw) 2243 continue; 2244 if (adev->ip_blocks[i].status.hw) 2245 continue; 2246 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2247 if (r) { 2248 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2249 adev->ip_blocks[i].version->funcs->name, r); 2250 return r; 2251 } 2252 adev->ip_blocks[i].status.hw = true; 2253 } 2254 2255 return 0; 2256 } 2257 2258 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2259 { 2260 int r = 0; 2261 int i; 2262 uint32_t smu_version; 2263 2264 if (adev->asic_type >= CHIP_VEGA10) { 2265 for (i = 0; i < adev->num_ip_blocks; i++) { 2266 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2267 continue; 2268 2269 if 
(!adev->ip_blocks[i].status.sw) 2270 continue; 2271 2272 /* no need to do the fw loading again if already done*/ 2273 if (adev->ip_blocks[i].status.hw == true) 2274 break; 2275 2276 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2277 r = adev->ip_blocks[i].version->funcs->resume(adev); 2278 if (r) { 2279 DRM_ERROR("resume of IP block <%s> failed %d\n", 2280 adev->ip_blocks[i].version->funcs->name, r); 2281 return r; 2282 } 2283 } else { 2284 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2285 if (r) { 2286 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2287 adev->ip_blocks[i].version->funcs->name, r); 2288 return r; 2289 } 2290 } 2291 2292 adev->ip_blocks[i].status.hw = true; 2293 break; 2294 } 2295 } 2296 2297 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2298 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2299 2300 return r; 2301 } 2302 2303 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2304 { 2305 long timeout; 2306 int r, i; 2307 2308 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2309 struct amdgpu_ring *ring = adev->rings[i]; 2310 2311 /* No need to setup the GPU scheduler for rings that don't need it */ 2312 if (!ring || ring->no_scheduler) 2313 continue; 2314 2315 switch (ring->funcs->type) { 2316 case AMDGPU_RING_TYPE_GFX: 2317 timeout = adev->gfx_timeout; 2318 break; 2319 case AMDGPU_RING_TYPE_COMPUTE: 2320 timeout = adev->compute_timeout; 2321 break; 2322 case AMDGPU_RING_TYPE_SDMA: 2323 timeout = adev->sdma_timeout; 2324 break; 2325 default: 2326 timeout = adev->video_timeout; 2327 break; 2328 } 2329 2330 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2331 ring->num_hw_submission, amdgpu_job_hang_limit, 2332 timeout, adev->reset_domain->wq, 2333 ring->sched_score, ring->name, 2334 adev->dev); 2335 if (r) { 2336 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2337 ring->name); 2338 return r; 2339 } 2340 } 2341 2342 return 0; 2343 } 2344 2345 2346 /** 2347 * amdgpu_device_ip_init - run init for hardware IPs 2348 * 2349 * @adev: amdgpu_device pointer 2350 * 2351 * Main initialization pass for hardware IPs. The list of all the hardware 2352 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2353 * are run. sw_init initializes the software state associated with each IP 2354 * and hw_init initializes the hardware associated with each IP. 2355 * Returns 0 on success, negative error code on failure. 
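 *
 * In rough order: sw_init runs for every valid block first, the COMMON and
 * GMC blocks are hw-initialized early so that GPU memory can be set up, and
 * the remaining blocks are brought up by the phase1/phase2 hw_init helpers
 * around firmware loading.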
2356 */ 2357 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2358 { 2359 int i, r; 2360 2361 r = amdgpu_ras_init(adev); 2362 if (r) 2363 return r; 2364 2365 for (i = 0; i < adev->num_ip_blocks; i++) { 2366 if (!adev->ip_blocks[i].status.valid) 2367 continue; 2368 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2369 if (r) { 2370 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2371 adev->ip_blocks[i].version->funcs->name, r); 2372 goto init_failed; 2373 } 2374 adev->ip_blocks[i].status.sw = true; 2375 2376 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2377 /* need to do common hw init early so everything is set up for gmc */ 2378 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2379 if (r) { 2380 DRM_ERROR("hw_init %d failed %d\n", i, r); 2381 goto init_failed; 2382 } 2383 adev->ip_blocks[i].status.hw = true; 2384 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2385 /* need to do gmc hw init early so we can allocate gpu mem */ 2386 /* Try to reserve bad pages early */ 2387 if (amdgpu_sriov_vf(adev)) 2388 amdgpu_virt_exchange_data(adev); 2389 2390 r = amdgpu_device_mem_scratch_init(adev); 2391 if (r) { 2392 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2393 goto init_failed; 2394 } 2395 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2396 if (r) { 2397 DRM_ERROR("hw_init %d failed %d\n", i, r); 2398 goto init_failed; 2399 } 2400 r = amdgpu_device_wb_init(adev); 2401 if (r) { 2402 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2403 goto init_failed; 2404 } 2405 adev->ip_blocks[i].status.hw = true; 2406 2407 /* right after GMC hw init, we create CSA */ 2408 if (amdgpu_mcbp) { 2409 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2410 AMDGPU_GEM_DOMAIN_VRAM | 2411 AMDGPU_GEM_DOMAIN_GTT, 2412 AMDGPU_CSA_SIZE); 2413 if (r) { 2414 DRM_ERROR("allocate CSA failed %d\n", r); 2415 goto init_failed; 2416 } 2417 } 2418 } 2419 } 2420 2421 if (amdgpu_sriov_vf(adev)) 2422 amdgpu_virt_init_data_exchange(adev); 2423 2424 r = amdgpu_ib_pool_init(adev); 2425 if (r) { 2426 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2427 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2428 goto init_failed; 2429 } 2430 2431 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2432 if (r) 2433 goto init_failed; 2434 2435 r = amdgpu_device_ip_hw_init_phase1(adev); 2436 if (r) 2437 goto init_failed; 2438 2439 r = amdgpu_device_fw_loading(adev); 2440 if (r) 2441 goto init_failed; 2442 2443 r = amdgpu_device_ip_hw_init_phase2(adev); 2444 if (r) 2445 goto init_failed; 2446 2447 /* 2448 * retired pages will be loaded from eeprom and reserved here, 2449 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2450 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2451 * for I2C communication which only true at this point. 2452 * 2453 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2454 * failure from bad gpu situation and stop amdgpu init process 2455 * accordingly. For other failed cases, it will still release all 2456 * the resource and print error message, rather than returning one 2457 * negative value to upper level. 
2458 * 2459 * Note: theoretically, this should be called before all vram allocations 2460 * to protect retired pages from being abused 2461 */ 2462 r = amdgpu_ras_recovery_init(adev); 2463 if (r) 2464 goto init_failed; 2465 2466 /* 2467 * In case of XGMI, grab an extra reference on the reset domain for this device 2468 */ 2469 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2470 if (amdgpu_xgmi_add_device(adev) == 0) { 2471 if (!amdgpu_sriov_vf(adev)) { 2472 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2473 2474 if (WARN_ON(!hive)) { 2475 r = -ENOENT; 2476 goto init_failed; 2477 } 2478 2479 if (!hive->reset_domain || 2480 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2481 r = -ENOENT; 2482 amdgpu_put_xgmi_hive(hive); 2483 goto init_failed; 2484 } 2485 2486 /* Drop the early temporary reset domain we created for this device */ 2487 amdgpu_reset_put_reset_domain(adev->reset_domain); 2488 adev->reset_domain = hive->reset_domain; 2489 amdgpu_put_xgmi_hive(hive); 2490 } 2491 } 2492 } 2493 2494 r = amdgpu_device_init_schedulers(adev); 2495 if (r) 2496 goto init_failed; 2497 2498 /* Don't init kfd if the whole hive needs to be reset during init */ 2499 if (!adev->gmc.xgmi.pending_reset) 2500 amdgpu_amdkfd_device_init(adev); 2501 2502 amdgpu_fru_get_product_info(adev); 2503 2504 init_failed: 2505 if (amdgpu_sriov_vf(adev)) 2506 amdgpu_virt_release_full_gpu(adev, true); 2507 2508 return r; 2509 } 2510 2511 /** 2512 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2513 * 2514 * @adev: amdgpu_device pointer 2515 * 2516 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2517 * this function before a GPU reset. If the value is retained after a 2518 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 2519 */ 2520 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2521 { 2522 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2523 } 2524 2525 /** 2526 * amdgpu_device_check_vram_lost - check if vram is valid 2527 * 2528 * @adev: amdgpu_device pointer 2529 * 2530 * Checks the reset magic value written to the gart pointer in VRAM. 2531 * The driver calls this after a GPU reset to see if the contents of 2532 * VRAM were lost or not. 2533 * Returns true if vram is lost, false if not. 2534 */ 2535 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2536 { 2537 if (memcmp(adev->gart.ptr, adev->reset_magic, 2538 AMDGPU_RESET_MAGIC_NUM)) 2539 return true; 2540 2541 if (!amdgpu_in_reset(adev)) 2542 return false; 2543 2544 /* 2545 * For all ASICs with baco/mode1 reset, the VRAM is 2546 * always assumed to be lost. 2547 */ 2548 switch (amdgpu_asic_reset_method(adev)) { 2549 case AMD_RESET_METHOD_BACO: 2550 case AMD_RESET_METHOD_MODE1: 2551 return true; 2552 default: 2553 return false; 2554 } 2555 } 2556 2557 /** 2558 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2559 * 2560 * @adev: amdgpu_device pointer 2561 * @state: clockgating state (gate or ungate) 2562 * 2563 * The list of all the hardware IPs that make up the asic is walked and the 2564 * set_clockgating_state callbacks are run. 2565 * During the late init pass this enables clockgating for hardware IPs; 2566 * during fini or suspend it disables clockgating again. 2567 * Returns 0 on success, negative error code on failure.
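 *
 * The walk direction depends on @state: gating walks the IP list front to
 * back, while ungating walks it back to front (for example, with four IP
 * blocks gating visits 0, 1, 2, 3 and ungating visits 3, 2, 1, 0).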
2568 */ 2569 2570 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2571 enum amd_clockgating_state state) 2572 { 2573 int i, j, r; 2574 2575 if (amdgpu_emu_mode == 1) 2576 return 0; 2577 2578 for (j = 0; j < adev->num_ip_blocks; j++) { 2579 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2580 if (!adev->ip_blocks[i].status.late_initialized) 2581 continue; 2582 /* skip CG for GFX, SDMA on S0ix */ 2583 if (adev->in_s0ix && 2584 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2585 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2586 continue; 2587 /* skip CG for VCE/UVD, it's handled specially */ 2588 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2589 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2590 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2591 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2592 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2593 /* enable clockgating to save power */ 2594 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2595 state); 2596 if (r) { 2597 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2598 adev->ip_blocks[i].version->funcs->name, r); 2599 return r; 2600 } 2601 } 2602 } 2603 2604 return 0; 2605 } 2606 2607 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2608 enum amd_powergating_state state) 2609 { 2610 int i, j, r; 2611 2612 if (amdgpu_emu_mode == 1) 2613 return 0; 2614 2615 for (j = 0; j < adev->num_ip_blocks; j++) { 2616 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2617 if (!adev->ip_blocks[i].status.late_initialized) 2618 continue; 2619 /* skip PG for GFX, SDMA on S0ix */ 2620 if (adev->in_s0ix && 2621 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2622 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2623 continue; 2624 /* skip CG for VCE/UVD, it's handled specially */ 2625 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2626 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2627 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2628 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2629 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2630 /* enable powergating to save power */ 2631 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2632 state); 2633 if (r) { 2634 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2635 adev->ip_blocks[i].version->funcs->name, r); 2636 return r; 2637 } 2638 } 2639 } 2640 return 0; 2641 } 2642 2643 static int amdgpu_device_enable_mgpu_fan_boost(void) 2644 { 2645 struct amdgpu_gpu_instance *gpu_ins; 2646 struct amdgpu_device *adev; 2647 int i, ret = 0; 2648 2649 mutex_lock(&mgpu_info.mutex); 2650 2651 /* 2652 * MGPU fan boost feature should be enabled 2653 * only when there are two or more dGPUs in 2654 * the system 2655 */ 2656 if (mgpu_info.num_dgpu < 2) 2657 goto out; 2658 2659 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2660 gpu_ins = &(mgpu_info.gpu_ins[i]); 2661 adev = gpu_ins->adev; 2662 if (!(adev->flags & AMD_IS_APU) && 2663 !gpu_ins->mgpu_fan_enabled) { 2664 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2665 if (ret) 2666 break; 2667 2668 gpu_ins->mgpu_fan_enabled = 1; 2669 } 2670 } 2671 2672 out: 2673 mutex_unlock(&mgpu_info.mutex); 2674 2675 return ret; 2676 } 2677 2678 /** 2679 * amdgpu_device_ip_late_init - run late init for hardware IPs 2680 * 2681 * @adev: 
amdgpu_device pointer 2682 * 2683 * Late initialization pass for hardware IPs. The list of all the hardware 2684 * IPs that make up the asic is walked and the late_init callbacks are run. 2685 * late_init covers any special initialization that an IP requires 2686 * after all of the IPs have been initialized or something that needs to happen 2687 * late in the init process. 2688 * Returns 0 on success, negative error code on failure. 2689 */ 2690 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2691 { 2692 struct amdgpu_gpu_instance *gpu_instance; 2693 int i = 0, r; 2694 2695 for (i = 0; i < adev->num_ip_blocks; i++) { 2696 if (!adev->ip_blocks[i].status.hw) 2697 continue; 2698 if (adev->ip_blocks[i].version->funcs->late_init) { 2699 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2700 if (r) { 2701 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2702 adev->ip_blocks[i].version->funcs->name, r); 2703 return r; 2704 } 2705 } 2706 adev->ip_blocks[i].status.late_initialized = true; 2707 } 2708 2709 r = amdgpu_ras_late_init(adev); 2710 if (r) { 2711 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 2712 return r; 2713 } 2714 2715 amdgpu_ras_set_error_query_ready(adev, true); 2716 2717 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2718 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2719 2720 amdgpu_device_fill_reset_magic(adev); 2721 2722 r = amdgpu_device_enable_mgpu_fan_boost(); 2723 if (r) 2724 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2725 2726 /* For passthrough configurations on arcturus and aldebaran, enable special handling of SBR */ 2727 if (amdgpu_passthrough(adev) && ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 2728 adev->asic_type == CHIP_ALDEBARAN)) 2729 amdgpu_dpm_handle_passthrough_sbr(adev, true); 2730 2731 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2732 mutex_lock(&mgpu_info.mutex); 2733 2734 /* 2735 * Reset the device p-state to low, as this was booted with high. 2736 * 2737 * This should be performed only after all devices from the same 2738 * hive get initialized. 2739 * 2740 * However, the number of devices in the hive is not known in 2741 * advance; it is counted one by one as the devices initialize. 2742 * 2743 * So we wait until all XGMI-interlinked devices are initialized. 2744 * This may introduce some delay as those devices may come from 2745 * different hives, but that should be OK.
2746 */ 2747 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2748 for (i = 0; i < mgpu_info.num_gpu; i++) { 2749 gpu_instance = &(mgpu_info.gpu_ins[i]); 2750 if (gpu_instance->adev->flags & AMD_IS_APU) 2751 continue; 2752 2753 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2754 AMDGPU_XGMI_PSTATE_MIN); 2755 if (r) { 2756 DRM_ERROR("pstate setting failed (%d).\n", r); 2757 break; 2758 } 2759 } 2760 } 2761 2762 mutex_unlock(&mgpu_info.mutex); 2763 } 2764 2765 return 0; 2766 } 2767 2768 /** 2769 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2770 * 2771 * @adev: amdgpu_device pointer 2772 * 2773 * For ASICs need to disable SMC first 2774 */ 2775 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2776 { 2777 int i, r; 2778 2779 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) 2780 return; 2781 2782 for (i = 0; i < adev->num_ip_blocks; i++) { 2783 if (!adev->ip_blocks[i].status.hw) 2784 continue; 2785 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2786 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2787 /* XXX handle errors */ 2788 if (r) { 2789 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2790 adev->ip_blocks[i].version->funcs->name, r); 2791 } 2792 adev->ip_blocks[i].status.hw = false; 2793 break; 2794 } 2795 } 2796 } 2797 2798 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2799 { 2800 int i, r; 2801 2802 for (i = 0; i < adev->num_ip_blocks; i++) { 2803 if (!adev->ip_blocks[i].version->funcs->early_fini) 2804 continue; 2805 2806 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2807 if (r) { 2808 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2809 adev->ip_blocks[i].version->funcs->name, r); 2810 } 2811 } 2812 2813 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2814 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2815 2816 amdgpu_amdkfd_suspend(adev, false); 2817 2818 /* Workaroud for ASICs need to disable SMC first */ 2819 amdgpu_device_smu_fini_early(adev); 2820 2821 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2822 if (!adev->ip_blocks[i].status.hw) 2823 continue; 2824 2825 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2826 /* XXX handle errors */ 2827 if (r) { 2828 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2829 adev->ip_blocks[i].version->funcs->name, r); 2830 } 2831 2832 adev->ip_blocks[i].status.hw = false; 2833 } 2834 2835 if (amdgpu_sriov_vf(adev)) { 2836 if (amdgpu_virt_release_full_gpu(adev, false)) 2837 DRM_ERROR("failed to release exclusive mode on fini\n"); 2838 } 2839 2840 return 0; 2841 } 2842 2843 /** 2844 * amdgpu_device_ip_fini - run fini for hardware IPs 2845 * 2846 * @adev: amdgpu_device pointer 2847 * 2848 * Main teardown pass for hardware IPs. The list of all the hardware 2849 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2850 * are run. hw_fini tears down the hardware associated with each IP 2851 * and sw_fini tears down any software state associated with each IP. 2852 * Returns 0 on success, negative error code on failure. 
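 *
 * Teardown runs in the reverse of the init order: sw_fini walks the IP list
 * from the last block back to the first, followed by a second reverse walk
 * that calls late_fini on the late-initialized blocks.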
2853 */ 2854 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2855 { 2856 int i, r; 2857 2858 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2859 amdgpu_virt_release_ras_err_handler_data(adev); 2860 2861 if (adev->gmc.xgmi.num_physical_nodes > 1) 2862 amdgpu_xgmi_remove_device(adev); 2863 2864 amdgpu_amdkfd_device_fini_sw(adev); 2865 2866 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2867 if (!adev->ip_blocks[i].status.sw) 2868 continue; 2869 2870 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2871 amdgpu_ucode_free_bo(adev); 2872 amdgpu_free_static_csa(&adev->virt.csa_obj); 2873 amdgpu_device_wb_fini(adev); 2874 amdgpu_device_mem_scratch_fini(adev); 2875 amdgpu_ib_pool_fini(adev); 2876 } 2877 2878 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2879 /* XXX handle errors */ 2880 if (r) { 2881 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2882 adev->ip_blocks[i].version->funcs->name, r); 2883 } 2884 adev->ip_blocks[i].status.sw = false; 2885 adev->ip_blocks[i].status.valid = false; 2886 } 2887 2888 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2889 if (!adev->ip_blocks[i].status.late_initialized) 2890 continue; 2891 if (adev->ip_blocks[i].version->funcs->late_fini) 2892 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2893 adev->ip_blocks[i].status.late_initialized = false; 2894 } 2895 2896 amdgpu_ras_fini(adev); 2897 2898 return 0; 2899 } 2900 2901 /** 2902 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2903 * 2904 * @work: work_struct. 2905 */ 2906 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2907 { 2908 struct amdgpu_device *adev = 2909 container_of(work, struct amdgpu_device, delayed_init_work.work); 2910 int r; 2911 2912 r = amdgpu_ib_ring_tests(adev); 2913 if (r) 2914 DRM_ERROR("ib ring test failed (%d).\n", r); 2915 } 2916 2917 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2918 { 2919 struct amdgpu_device *adev = 2920 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2921 2922 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2923 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2924 2925 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2926 adev->gfx.gfx_off_state = true; 2927 } 2928 2929 /** 2930 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2931 * 2932 * @adev: amdgpu_device pointer 2933 * 2934 * Main suspend function for hardware IPs. The list of all the hardware 2935 * IPs that make up the asic is walked, clockgating is disabled and the 2936 * suspend callbacks are run. suspend puts the hardware and software state 2937 * in each IP into a state suitable for suspend. 2938 * Returns 0 on success, negative error code on failure. 2939 */ 2940 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2941 { 2942 int i, r; 2943 2944 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2945 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2946 2947 /* 2948 * Per PMFW team's suggestion, driver needs to handle gfxoff 2949 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 2950 * scenario. Add the missing df cstate disablement here. 
2951 */ 2952 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 2953 dev_warn(adev->dev, "Failed to disallow df cstate"); 2954 2955 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2956 if (!adev->ip_blocks[i].status.valid) 2957 continue; 2958 2959 /* displays are handled separately */ 2960 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2961 continue; 2962 2963 /* XXX handle errors */ 2964 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2965 /* XXX handle errors */ 2966 if (r) { 2967 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2968 adev->ip_blocks[i].version->funcs->name, r); 2969 return r; 2970 } 2971 2972 adev->ip_blocks[i].status.hw = false; 2973 } 2974 2975 return 0; 2976 } 2977 2978 /** 2979 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2980 * 2981 * @adev: amdgpu_device pointer 2982 * 2983 * Main suspend function for hardware IPs. The list of all the hardware 2984 * IPs that make up the asic is walked, clockgating is disabled and the 2985 * suspend callbacks are run. suspend puts the hardware and software state 2986 * in each IP into a state suitable for suspend. 2987 * Returns 0 on success, negative error code on failure. 2988 */ 2989 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2990 { 2991 int i, r; 2992 2993 if (adev->in_s0ix) 2994 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 2995 2996 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2997 if (!adev->ip_blocks[i].status.valid) 2998 continue; 2999 /* displays are handled in phase1 */ 3000 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3001 continue; 3002 /* PSP lost connection when err_event_athub occurs */ 3003 if (amdgpu_ras_intr_triggered() && 3004 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3005 adev->ip_blocks[i].status.hw = false; 3006 continue; 3007 } 3008 3009 /* skip unnecessary suspend if we do not initialize them yet */ 3010 if (adev->gmc.xgmi.pending_reset && 3011 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3012 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3013 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3014 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3015 adev->ip_blocks[i].status.hw = false; 3016 continue; 3017 } 3018 3019 /* skip suspend of gfx/mes and psp for S0ix 3020 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3021 * like at runtime. PSP is also part of the always on hardware 3022 * so no need to suspend it. 
3023 */ 3024 if (adev->in_s0ix && 3025 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3026 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3027 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3028 continue; 3029 3030 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3031 if (adev->in_s0ix && 3032 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) && 3033 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3034 continue; 3035 3036 /* XXX handle errors */ 3037 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3038 /* XXX handle errors */ 3039 if (r) { 3040 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3041 adev->ip_blocks[i].version->funcs->name, r); 3042 } 3043 adev->ip_blocks[i].status.hw = false; 3044 /* handle putting the SMC in the appropriate state */ 3045 if(!amdgpu_sriov_vf(adev)){ 3046 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3047 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3048 if (r) { 3049 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3050 adev->mp1_state, r); 3051 return r; 3052 } 3053 } 3054 } 3055 } 3056 3057 return 0; 3058 } 3059 3060 /** 3061 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3062 * 3063 * @adev: amdgpu_device pointer 3064 * 3065 * Main suspend function for hardware IPs. The list of all the hardware 3066 * IPs that make up the asic is walked, clockgating is disabled and the 3067 * suspend callbacks are run. suspend puts the hardware and software state 3068 * in each IP into a state suitable for suspend. 3069 * Returns 0 on success, negative error code on failure. 3070 */ 3071 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3072 { 3073 int r; 3074 3075 if (amdgpu_sriov_vf(adev)) { 3076 amdgpu_virt_fini_data_exchange(adev); 3077 amdgpu_virt_request_full_gpu(adev, false); 3078 } 3079 3080 r = amdgpu_device_ip_suspend_phase1(adev); 3081 if (r) 3082 return r; 3083 r = amdgpu_device_ip_suspend_phase2(adev); 3084 3085 if (amdgpu_sriov_vf(adev)) 3086 amdgpu_virt_release_full_gpu(adev, false); 3087 3088 return r; 3089 } 3090 3091 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3092 { 3093 int i, r; 3094 3095 static enum amd_ip_block_type ip_order[] = { 3096 AMD_IP_BLOCK_TYPE_COMMON, 3097 AMD_IP_BLOCK_TYPE_GMC, 3098 AMD_IP_BLOCK_TYPE_PSP, 3099 AMD_IP_BLOCK_TYPE_IH, 3100 }; 3101 3102 for (i = 0; i < adev->num_ip_blocks; i++) { 3103 int j; 3104 struct amdgpu_ip_block *block; 3105 3106 block = &adev->ip_blocks[i]; 3107 block->status.hw = false; 3108 3109 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3110 3111 if (block->version->type != ip_order[j] || 3112 !block->status.valid) 3113 continue; 3114 3115 r = block->version->funcs->hw_init(adev); 3116 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3117 if (r) 3118 return r; 3119 block->status.hw = true; 3120 } 3121 } 3122 3123 return 0; 3124 } 3125 3126 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3127 { 3128 int i, r; 3129 3130 static enum amd_ip_block_type ip_order[] = { 3131 AMD_IP_BLOCK_TYPE_SMC, 3132 AMD_IP_BLOCK_TYPE_DCE, 3133 AMD_IP_BLOCK_TYPE_GFX, 3134 AMD_IP_BLOCK_TYPE_SDMA, 3135 AMD_IP_BLOCK_TYPE_UVD, 3136 AMD_IP_BLOCK_TYPE_VCE, 3137 AMD_IP_BLOCK_TYPE_VCN 3138 }; 3139 3140 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3141 int j; 3142 struct amdgpu_ip_block *block; 3143 3144 for (j = 0; j < adev->num_ip_blocks; j++) { 3145 block = &adev->ip_blocks[j]; 3146 3147 if 
(block->version->type != ip_order[i] || 3148 !block->status.valid || 3149 block->status.hw) 3150 continue; 3151 3152 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3153 r = block->version->funcs->resume(adev); 3154 else 3155 r = block->version->funcs->hw_init(adev); 3156 3157 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3158 if (r) 3159 return r; 3160 block->status.hw = true; 3161 } 3162 } 3163 3164 return 0; 3165 } 3166 3167 /** 3168 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3169 * 3170 * @adev: amdgpu_device pointer 3171 * 3172 * First resume function for hardware IPs. The list of all the hardware 3173 * IPs that make up the asic is walked and the resume callbacks are run for 3174 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3175 * after a suspend and updates the software state as necessary. This 3176 * function is also used for restoring the GPU after a GPU reset. 3177 * Returns 0 on success, negative error code on failure. 3178 */ 3179 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3180 { 3181 int i, r; 3182 3183 for (i = 0; i < adev->num_ip_blocks; i++) { 3184 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3185 continue; 3186 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3187 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3188 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3189 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3190 3191 r = adev->ip_blocks[i].version->funcs->resume(adev); 3192 if (r) { 3193 DRM_ERROR("resume of IP block <%s> failed %d\n", 3194 adev->ip_blocks[i].version->funcs->name, r); 3195 return r; 3196 } 3197 adev->ip_blocks[i].status.hw = true; 3198 } 3199 } 3200 3201 return 0; 3202 } 3203 3204 /** 3205 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3206 * 3207 * @adev: amdgpu_device pointer 3208 * 3209 * First resume function for hardware IPs. The list of all the hardware 3210 * IPs that make up the asic is walked and the resume callbacks are run for 3211 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3212 * functional state after a suspend and updates the software state as 3213 * necessary. This function is also used for restoring the GPU after a GPU 3214 * reset. 3215 * Returns 0 on success, negative error code on failure. 3216 */ 3217 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3218 { 3219 int i, r; 3220 3221 for (i = 0; i < adev->num_ip_blocks; i++) { 3222 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3223 continue; 3224 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3225 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3226 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3227 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3228 continue; 3229 r = adev->ip_blocks[i].version->funcs->resume(adev); 3230 if (r) { 3231 DRM_ERROR("resume of IP block <%s> failed %d\n", 3232 adev->ip_blocks[i].version->funcs->name, r); 3233 return r; 3234 } 3235 adev->ip_blocks[i].status.hw = true; 3236 } 3237 3238 return 0; 3239 } 3240 3241 /** 3242 * amdgpu_device_ip_resume - run resume for hardware IPs 3243 * 3244 * @adev: amdgpu_device pointer 3245 * 3246 * Main resume function for hardware IPs. 
The hardware IPs 3247 * are split into two resume functions because they are 3248 * also used in recovering from a GPU reset, and some additional 3249 * steps need to be taken between them. In this case (S3/S4) they are 3250 * run sequentially. 3251 * Returns 0 on success, negative error code on failure. 3252 */ 3253 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3254 { 3255 int r; 3256 3257 r = amdgpu_amdkfd_resume_iommu(adev); 3258 if (r) 3259 return r; 3260 3261 r = amdgpu_device_ip_resume_phase1(adev); 3262 if (r) 3263 return r; 3264 3265 r = amdgpu_device_fw_loading(adev); 3266 if (r) 3267 return r; 3268 3269 r = amdgpu_device_ip_resume_phase2(adev); 3270 3271 return r; 3272 } 3273 3274 /** 3275 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3276 * 3277 * @adev: amdgpu_device pointer 3278 * 3279 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3280 */ 3281 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3282 { 3283 if (amdgpu_sriov_vf(adev)) { 3284 if (adev->is_atom_fw) { 3285 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3286 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3287 } else { 3288 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3289 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3290 } 3291 3292 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3293 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3294 } 3295 } 3296 3297 /** 3298 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3299 * 3300 * @asic_type: AMD asic type 3301 * 3302 * Check if there is DC (new modesetting infrastructure) support for an asic. 3303 * Returns true if DC has support, false if not. 3304 */ 3305 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3306 { 3307 switch (asic_type) { 3308 #ifdef CONFIG_DRM_AMDGPU_SI 3309 case CHIP_HAINAN: 3310 #endif 3311 case CHIP_TOPAZ: 3312 /* chips with no display hardware */ 3313 return false; 3314 #if defined(CONFIG_DRM_AMD_DC) 3315 case CHIP_TAHITI: 3316 case CHIP_PITCAIRN: 3317 case CHIP_VERDE: 3318 case CHIP_OLAND: 3319 /* 3320 * We have systems in the wild with these ASICs that require 3321 * LVDS and VGA support which is not supported with DC. 3322 * 3323 * Fall back to the non-DC driver here by default so as not to 3324 * cause regressions. 3325 */ 3326 #if defined(CONFIG_DRM_AMD_DC_SI) 3327 return amdgpu_dc > 0; 3328 #else 3329 return false; 3330 #endif 3331 case CHIP_BONAIRE: 3332 case CHIP_KAVERI: 3333 case CHIP_KABINI: 3334 case CHIP_MULLINS: 3335 /* 3336 * We have systems in the wild with these ASICs that require 3337 * VGA support which is not supported with DC. 3338 * 3339 * Fall back to the non-DC driver here by default so as not to 3340 * cause regressions.
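 *
 * DC can still be opted into explicitly on these parts, e.g. by booting with
 * amdgpu.dc=1, which makes amdgpu_dc positive and takes the branch below.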
3341 */ 3342 return amdgpu_dc > 0; 3343 default: 3344 return amdgpu_dc != 0; 3345 #else 3346 default: 3347 if (amdgpu_dc > 0) 3348 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3349 "but isn't supported by ASIC, ignoring\n"); 3350 return false; 3351 #endif 3352 } 3353 } 3354 3355 /** 3356 * amdgpu_device_has_dc_support - check if dc is supported 3357 * 3358 * @adev: amdgpu_device pointer 3359 * 3360 * Returns true for supported, false for not supported 3361 */ 3362 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3363 { 3364 if (adev->enable_virtual_display || 3365 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3366 return false; 3367 3368 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3369 } 3370 3371 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3372 { 3373 struct amdgpu_device *adev = 3374 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3375 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3376 3377 /* It's a bug to not have a hive within this function */ 3378 if (WARN_ON(!hive)) 3379 return; 3380 3381 /* 3382 * Use task barrier to synchronize all xgmi reset works across the 3383 * hive. task_barrier_enter and task_barrier_exit will block 3384 * until all the threads running the xgmi reset works reach 3385 * those points. task_barrier_full will do both blocks. 3386 */ 3387 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3388 3389 task_barrier_enter(&hive->tb); 3390 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3391 3392 if (adev->asic_reset_res) 3393 goto fail; 3394 3395 task_barrier_exit(&hive->tb); 3396 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3397 3398 if (adev->asic_reset_res) 3399 goto fail; 3400 3401 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3402 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3403 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3404 } else { 3405 3406 task_barrier_full(&hive->tb); 3407 adev->asic_reset_res = amdgpu_asic_reset(adev); 3408 } 3409 3410 fail: 3411 if (adev->asic_reset_res) 3412 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3413 adev->asic_reset_res, adev_to_drm(adev)->unique); 3414 amdgpu_put_xgmi_hive(hive); 3415 } 3416 3417 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3418 { 3419 char *input = amdgpu_lockup_timeout; 3420 char *timeout_setting = NULL; 3421 int index = 0; 3422 long timeout; 3423 int ret = 0; 3424 3425 /* 3426 * By default timeout for non compute jobs is 10000 3427 * and 60000 for compute jobs. 3428 * In SR-IOV or passthrough mode, timeout for compute 3429 * jobs are 60000 by default. 3430 */ 3431 adev->gfx_timeout = msecs_to_jiffies(10000); 3432 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3433 if (amdgpu_sriov_vf(adev)) 3434 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3435 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3436 else 3437 adev->compute_timeout = msecs_to_jiffies(60000); 3438 3439 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3440 while ((timeout_setting = strsep(&input, ",")) && 3441 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3442 ret = kstrtol(timeout_setting, 0, &timeout); 3443 if (ret) 3444 return ret; 3445 3446 if (timeout == 0) { 3447 index++; 3448 continue; 3449 } else if (timeout < 0) { 3450 timeout = MAX_SCHEDULE_TIMEOUT; 3451 dev_warn(adev->dev, "lockup timeout disabled"); 3452 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3453 } else { 3454 timeout = msecs_to_jiffies(timeout); 3455 } 3456 3457 switch (index++) { 3458 case 0: 3459 adev->gfx_timeout = timeout; 3460 break; 3461 case 1: 3462 adev->compute_timeout = timeout; 3463 break; 3464 case 2: 3465 adev->sdma_timeout = timeout; 3466 break; 3467 case 3: 3468 adev->video_timeout = timeout; 3469 break; 3470 default: 3471 break; 3472 } 3473 } 3474 /* 3475 * There is only one value specified and 3476 * it should apply to all non-compute jobs. 3477 */ 3478 if (index == 1) { 3479 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3480 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3481 adev->compute_timeout = adev->gfx_timeout; 3482 } 3483 } 3484 3485 return ret; 3486 } 3487 3488 /** 3489 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3490 * 3491 * @adev: amdgpu_device pointer 3492 * 3493 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3494 */ 3495 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3496 { 3497 struct iommu_domain *domain; 3498 3499 domain = iommu_get_domain_for_dev(adev->dev); 3500 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3501 adev->ram_is_direct_mapped = true; 3502 } 3503 3504 static const struct attribute *amdgpu_dev_attributes[] = { 3505 &dev_attr_product_name.attr, 3506 &dev_attr_product_number.attr, 3507 &dev_attr_serial_number.attr, 3508 &dev_attr_pcie_replay_count.attr, 3509 NULL 3510 }; 3511 3512 /** 3513 * amdgpu_device_init - initialize the driver 3514 * 3515 * @adev: amdgpu_device pointer 3516 * @flags: driver flags 3517 * 3518 * Initializes the driver info and hw (all asics). 3519 * Returns 0 for success or an error on failure. 3520 * Called at driver startup. 
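 *
 * At a high level, the sequence below maps the MMIO registers, runs early IP
 * init, posts the GPU if required, initializes the IP blocks and the fence
 * driver, and finally performs late init and registers the sysfs and
 * vga_switcheroo interfaces.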
3521 */ 3522 int amdgpu_device_init(struct amdgpu_device *adev, 3523 uint32_t flags) 3524 { 3525 struct drm_device *ddev = adev_to_drm(adev); 3526 struct pci_dev *pdev = adev->pdev; 3527 int r, i; 3528 bool px = false; 3529 u32 max_MBps; 3530 3531 adev->shutdown = false; 3532 adev->flags = flags; 3533 3534 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3535 adev->asic_type = amdgpu_force_asic_type; 3536 else 3537 adev->asic_type = flags & AMD_ASIC_MASK; 3538 3539 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3540 if (amdgpu_emu_mode == 1) 3541 adev->usec_timeout *= 10; 3542 adev->gmc.gart_size = 512 * 1024 * 1024; 3543 adev->accel_working = false; 3544 adev->num_rings = 0; 3545 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3546 adev->mman.buffer_funcs = NULL; 3547 adev->mman.buffer_funcs_ring = NULL; 3548 adev->vm_manager.vm_pte_funcs = NULL; 3549 adev->vm_manager.vm_pte_num_scheds = 0; 3550 adev->gmc.gmc_funcs = NULL; 3551 adev->harvest_ip_mask = 0x0; 3552 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3553 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3554 3555 adev->smc_rreg = &amdgpu_invalid_rreg; 3556 adev->smc_wreg = &amdgpu_invalid_wreg; 3557 adev->pcie_rreg = &amdgpu_invalid_rreg; 3558 adev->pcie_wreg = &amdgpu_invalid_wreg; 3559 adev->pciep_rreg = &amdgpu_invalid_rreg; 3560 adev->pciep_wreg = &amdgpu_invalid_wreg; 3561 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3562 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3563 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3564 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3565 adev->didt_rreg = &amdgpu_invalid_rreg; 3566 adev->didt_wreg = &amdgpu_invalid_wreg; 3567 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3568 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3569 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3570 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3571 3572 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3573 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3574 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3575 3576 /* mutex initialization are all done here so we 3577 * can recall function without having locking issues */ 3578 mutex_init(&adev->firmware.mutex); 3579 mutex_init(&adev->pm.mutex); 3580 mutex_init(&adev->gfx.gpu_clock_mutex); 3581 mutex_init(&adev->srbm_mutex); 3582 mutex_init(&adev->gfx.pipe_reserve_mutex); 3583 mutex_init(&adev->gfx.gfx_off_mutex); 3584 mutex_init(&adev->grbm_idx_mutex); 3585 mutex_init(&adev->mn_lock); 3586 mutex_init(&adev->virt.vf_errors.lock); 3587 hash_init(adev->mn_hash); 3588 mutex_init(&adev->psp.mutex); 3589 mutex_init(&adev->notifier_lock); 3590 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3591 mutex_init(&adev->benchmark_mutex); 3592 3593 amdgpu_device_init_apu_flags(adev); 3594 3595 r = amdgpu_device_check_arguments(adev); 3596 if (r) 3597 return r; 3598 3599 spin_lock_init(&adev->mmio_idx_lock); 3600 spin_lock_init(&adev->smc_idx_lock); 3601 spin_lock_init(&adev->pcie_idx_lock); 3602 spin_lock_init(&adev->uvd_ctx_idx_lock); 3603 spin_lock_init(&adev->didt_idx_lock); 3604 spin_lock_init(&adev->gc_cac_idx_lock); 3605 spin_lock_init(&adev->se_cac_idx_lock); 3606 spin_lock_init(&adev->audio_endpt_idx_lock); 3607 spin_lock_init(&adev->mm_stats.lock); 3608 3609 INIT_LIST_HEAD(&adev->shadow_list); 3610 mutex_init(&adev->shadow_list_lock); 3611 3612 INIT_LIST_HEAD(&adev->reset_list); 3613 3614 INIT_LIST_HEAD(&adev->ras_list); 3615 3616 
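/* delayed_init_work runs the deferred IB ring tests once initialization has settled */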
INIT_DELAYED_WORK(&adev->delayed_init_work, 3617 amdgpu_device_delayed_init_work_handler); 3618 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3619 amdgpu_device_delay_enable_gfx_off); 3620 3621 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3622 3623 adev->gfx.gfx_off_req_count = 1; 3624 adev->gfx.gfx_off_residency = 0; 3625 adev->gfx.gfx_off_entrycount = 0; 3626 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3627 3628 atomic_set(&adev->throttling_logging_enabled, 1); 3629 /* 3630 * If throttling continues, logging will be performed every minute 3631 * to avoid log flooding. "-1" is subtracted since the thermal 3632 * throttling interrupt comes every second. Thus, the total logging 3633 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 3634 * for the throttling interrupt) = 60 seconds. 3635 */ 3636 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3637 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3638 3639 /* Register mapping */ 3640 /* TODO: block userspace mapping of io registers */ 3641 if (adev->asic_type >= CHIP_BONAIRE) { 3642 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3643 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3644 } else { 3645 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3646 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3647 } 3648 3649 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 3650 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 3651 3652 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3653 if (adev->rmmio == NULL) { 3654 return -ENOMEM; 3655 } 3656 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3657 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3658 3659 amdgpu_device_get_pcie_info(adev); 3660 3661 if (amdgpu_mcbp) 3662 DRM_INFO("MCBP is enabled\n"); 3663 3664 /* 3665 * The reset domain needs to be present early, before the XGMI hive is 3666 * discovered (if any) and initialized, so the reset sem and in_gpu reset 3667 * flag can be used early on during init and before calling RREG32.
3668 */ 3669 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3670 if (!adev->reset_domain) 3671 return -ENOMEM; 3672 3673 /* detect hw virtualization here */ 3674 amdgpu_detect_virtualization(adev); 3675 3676 r = amdgpu_device_get_job_timeout_settings(adev); 3677 if (r) { 3678 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3679 return r; 3680 } 3681 3682 /* early init functions */ 3683 r = amdgpu_device_ip_early_init(adev); 3684 if (r) 3685 return r; 3686 3687 /* Get rid of things like offb */ 3688 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 3689 if (r) 3690 return r; 3691 3692 /* Enable TMZ based on IP_VERSION */ 3693 amdgpu_gmc_tmz_set(adev); 3694 3695 amdgpu_gmc_noretry_set(adev); 3696 /* Need to get xgmi info early to decide the reset behavior*/ 3697 if (adev->gmc.xgmi.supported) { 3698 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3699 if (r) 3700 return r; 3701 } 3702 3703 /* enable PCIE atomic ops */ 3704 if (amdgpu_sriov_vf(adev)) 3705 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3706 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3707 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3708 else 3709 adev->have_atomics_support = 3710 !pci_enable_atomic_ops_to_root(adev->pdev, 3711 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3712 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3713 if (!adev->have_atomics_support) 3714 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3715 3716 /* doorbell bar mapping and doorbell index init*/ 3717 amdgpu_device_doorbell_init(adev); 3718 3719 if (amdgpu_emu_mode == 1) { 3720 /* post the asic on emulation mode */ 3721 emu_soc_asic_init(adev); 3722 goto fence_driver_init; 3723 } 3724 3725 amdgpu_reset_init(adev); 3726 3727 /* detect if we are with an SRIOV vbios */ 3728 amdgpu_device_detect_sriov_bios(adev); 3729 3730 /* check if we need to reset the asic 3731 * E.g., driver was not cleanly unloaded previously, etc. 
3732 */ 3733 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3734 if (adev->gmc.xgmi.num_physical_nodes) { 3735 dev_info(adev->dev, "Pending hive reset.\n"); 3736 adev->gmc.xgmi.pending_reset = true; 3737 /* Only need to init necessary block for SMU to handle the reset */ 3738 for (i = 0; i < adev->num_ip_blocks; i++) { 3739 if (!adev->ip_blocks[i].status.valid) 3740 continue; 3741 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3742 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3743 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3744 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3745 DRM_DEBUG("IP %s disabled for hw_init.\n", 3746 adev->ip_blocks[i].version->funcs->name); 3747 adev->ip_blocks[i].status.hw = true; 3748 } 3749 } 3750 } else { 3751 r = amdgpu_asic_reset(adev); 3752 if (r) { 3753 dev_err(adev->dev, "asic reset on init failed\n"); 3754 goto failed; 3755 } 3756 } 3757 } 3758 3759 pci_enable_pcie_error_reporting(adev->pdev); 3760 3761 /* Post card if necessary */ 3762 if (amdgpu_device_need_post(adev)) { 3763 if (!adev->bios) { 3764 dev_err(adev->dev, "no vBIOS found\n"); 3765 r = -EINVAL; 3766 goto failed; 3767 } 3768 DRM_INFO("GPU posting now...\n"); 3769 r = amdgpu_device_asic_init(adev); 3770 if (r) { 3771 dev_err(adev->dev, "gpu post error!\n"); 3772 goto failed; 3773 } 3774 } 3775 3776 if (adev->is_atom_fw) { 3777 /* Initialize clocks */ 3778 r = amdgpu_atomfirmware_get_clock_info(adev); 3779 if (r) { 3780 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3781 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3782 goto failed; 3783 } 3784 } else { 3785 /* Initialize clocks */ 3786 r = amdgpu_atombios_get_clock_info(adev); 3787 if (r) { 3788 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3789 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3790 goto failed; 3791 } 3792 /* init i2c buses */ 3793 if (!amdgpu_device_has_dc_support(adev)) 3794 amdgpu_atombios_i2c_init(adev); 3795 } 3796 3797 fence_driver_init: 3798 /* Fence driver */ 3799 r = amdgpu_fence_driver_sw_init(adev); 3800 if (r) { 3801 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3802 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3803 goto failed; 3804 } 3805 3806 /* init the mode config */ 3807 drm_mode_config_init(adev_to_drm(adev)); 3808 3809 r = amdgpu_device_ip_init(adev); 3810 if (r) { 3811 /* failed in exclusive mode due to timeout */ 3812 if (amdgpu_sriov_vf(adev) && 3813 !amdgpu_sriov_runtime(adev) && 3814 amdgpu_virt_mmio_blocked(adev) && 3815 !amdgpu_virt_wait_reset(adev)) { 3816 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3817 /* Don't send request since VF is inactive. */ 3818 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3819 adev->virt.ops = NULL; 3820 r = -EAGAIN; 3821 goto release_ras_con; 3822 } 3823 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3824 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3825 goto release_ras_con; 3826 } 3827 3828 amdgpu_fence_driver_hw_init(adev); 3829 3830 dev_info(adev->dev, 3831 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3832 adev->gfx.config.max_shader_engines, 3833 adev->gfx.config.max_sh_per_se, 3834 adev->gfx.config.max_cu_per_sh, 3835 adev->gfx.cu_info.number); 3836 3837 adev->accel_working = true; 3838 3839 amdgpu_vm_check_compute_bug(adev); 3840 3841 /* Initialize the buffer migration limit. 
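 * The limit comes from the amdgpu_moverate module parameter when it is
 * non-negative, otherwise a default of 8 MB/s is used; with the default,
 * log2_max_MBps below ends up as ilog2(8) = 3.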
*/ 3842 if (amdgpu_moverate >= 0) 3843 max_MBps = amdgpu_moverate; 3844 else 3845 max_MBps = 8; /* Allow 8 MB/s. */ 3846 /* Get a log2 for easy divisions. */ 3847 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3848 3849 r = amdgpu_pm_sysfs_init(adev); 3850 if (r) { 3851 adev->pm_sysfs_en = false; 3852 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3853 } else 3854 adev->pm_sysfs_en = true; 3855 3856 r = amdgpu_ucode_sysfs_init(adev); 3857 if (r) { 3858 adev->ucode_sysfs_en = false; 3859 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3860 } else 3861 adev->ucode_sysfs_en = true; 3862 3863 r = amdgpu_psp_sysfs_init(adev); 3864 if (r) { 3865 adev->psp_sysfs_en = false; 3866 if (!amdgpu_sriov_vf(adev)) 3867 DRM_ERROR("Creating psp sysfs failed\n"); 3868 } else 3869 adev->psp_sysfs_en = true; 3870 3871 /* 3872 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3873 * Otherwise the mgpu fan boost feature will be skipped due to the 3874 * gpu instance is counted less. 3875 */ 3876 amdgpu_register_gpu_instance(adev); 3877 3878 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3879 * explicit gating rather than handling it automatically. 3880 */ 3881 if (!adev->gmc.xgmi.pending_reset) { 3882 r = amdgpu_device_ip_late_init(adev); 3883 if (r) { 3884 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3885 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3886 goto release_ras_con; 3887 } 3888 /* must succeed. */ 3889 amdgpu_ras_resume(adev); 3890 queue_delayed_work(system_wq, &adev->delayed_init_work, 3891 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3892 } 3893 3894 if (amdgpu_sriov_vf(adev)) 3895 flush_delayed_work(&adev->delayed_init_work); 3896 3897 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3898 if (r) 3899 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3900 3901 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3902 r = amdgpu_pmu_init(adev); 3903 if (r) 3904 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3905 3906 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3907 if (amdgpu_device_cache_pci_state(adev->pdev)) 3908 pci_restore_state(pdev); 3909 3910 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3911 /* this will fail for cards that aren't VGA class devices, just 3912 * ignore it */ 3913 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3914 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 3915 3916 if (amdgpu_device_supports_px(ddev)) { 3917 px = true; 3918 vga_switcheroo_register_client(adev->pdev, 3919 &amdgpu_switcheroo_ops, px); 3920 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3921 } 3922 3923 if (adev->gmc.xgmi.pending_reset) 3924 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3925 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3926 3927 amdgpu_device_check_iommu_direct_map(adev); 3928 3929 return 0; 3930 3931 release_ras_con: 3932 amdgpu_release_ras_context(adev); 3933 3934 failed: 3935 amdgpu_vf_error_trans_all(adev); 3936 3937 return r; 3938 } 3939 3940 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 3941 { 3942 3943 /* Clear all CPU mappings pointing to this device */ 3944 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 3945 3946 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 3947 amdgpu_device_doorbell_fini(adev); 3948 3949 iounmap(adev->rmmio); 3950 adev->rmmio = NULL; 3951 if (adev->mman.aper_base_kaddr) 3952 
iounmap(adev->mman.aper_base_kaddr); 3953 adev->mman.aper_base_kaddr = NULL; 3954 3955 /* Memory manager related */ 3956 if (!adev->gmc.xgmi.connected_to_cpu) { 3957 arch_phys_wc_del(adev->gmc.vram_mtrr); 3958 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 3959 } 3960 } 3961 3962 /** 3963 * amdgpu_device_fini_hw - tear down the driver 3964 * 3965 * @adev: amdgpu_device pointer 3966 * 3967 * Tear down the driver info (all asics). 3968 * Called at driver shutdown. 3969 */ 3970 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 3971 { 3972 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3973 flush_delayed_work(&adev->delayed_init_work); 3974 adev->shutdown = true; 3975 3976 /* make sure IB test finished before entering exclusive mode 3977 * to avoid preemption on IB test 3978 * */ 3979 if (amdgpu_sriov_vf(adev)) { 3980 amdgpu_virt_request_full_gpu(adev, false); 3981 amdgpu_virt_fini_data_exchange(adev); 3982 } 3983 3984 /* disable all interrupts */ 3985 amdgpu_irq_disable_all(adev); 3986 if (adev->mode_info.mode_config_initialized){ 3987 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 3988 drm_helper_force_disable_all(adev_to_drm(adev)); 3989 else 3990 drm_atomic_helper_shutdown(adev_to_drm(adev)); 3991 } 3992 amdgpu_fence_driver_hw_fini(adev); 3993 3994 if (adev->mman.initialized) { 3995 flush_delayed_work(&adev->mman.bdev.wq); 3996 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); 3997 } 3998 3999 if (adev->pm_sysfs_en) 4000 amdgpu_pm_sysfs_fini(adev); 4001 if (adev->ucode_sysfs_en) 4002 amdgpu_ucode_sysfs_fini(adev); 4003 if (adev->psp_sysfs_en) 4004 amdgpu_psp_sysfs_fini(adev); 4005 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4006 4007 /* disable ras feature must before hw fini */ 4008 amdgpu_ras_pre_fini(adev); 4009 4010 amdgpu_device_ip_fini_early(adev); 4011 4012 amdgpu_irq_fini_hw(adev); 4013 4014 if (adev->mman.initialized) 4015 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4016 4017 amdgpu_gart_dummy_page_fini(adev); 4018 4019 amdgpu_device_unmap_mmio(adev); 4020 4021 } 4022 4023 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4024 { 4025 int idx; 4026 4027 amdgpu_fence_driver_sw_fini(adev); 4028 amdgpu_device_ip_fini(adev); 4029 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4030 adev->accel_working = false; 4031 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4032 4033 amdgpu_reset_fini(adev); 4034 4035 /* free i2c buses */ 4036 if (!amdgpu_device_has_dc_support(adev)) 4037 amdgpu_i2c_fini(adev); 4038 4039 if (amdgpu_emu_mode != 1) 4040 amdgpu_atombios_fini(adev); 4041 4042 kfree(adev->bios); 4043 adev->bios = NULL; 4044 if (amdgpu_device_supports_px(adev_to_drm(adev))) { 4045 vga_switcheroo_unregister_client(adev->pdev); 4046 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4047 } 4048 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4049 vga_client_unregister(adev->pdev); 4050 4051 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4052 4053 iounmap(adev->rmmio); 4054 adev->rmmio = NULL; 4055 amdgpu_device_doorbell_fini(adev); 4056 drm_dev_exit(idx); 4057 } 4058 4059 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4060 amdgpu_pmu_fini(adev); 4061 if (adev->mman.discovery_bin) 4062 amdgpu_discovery_fini(adev); 4063 4064 amdgpu_reset_put_reset_domain(adev->reset_domain); 4065 adev->reset_domain = NULL; 4066 4067 kfree(adev->pci_state); 4068 4069 } 4070 4071 /** 4072 * amdgpu_device_evict_resources - evict device resources 4073 * @adev: amdgpu device object 4074 * 4075 * Evicts all ttm device resources(vram BOs, 
gart table) from the lru list 4076 * of the vram memory type. Mainly used for evicting device resources 4077 * at suspend time. 4078 * 4079 */ 4080 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4081 { 4082 int ret; 4083 4084 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4085 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4086 return 0; 4087 4088 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4089 if (ret) 4090 DRM_WARN("evicting device resources failed\n"); 4091 return ret; 4092 } 4093 4094 /* 4095 * Suspend & resume. 4096 */ 4097 /** 4098 * amdgpu_device_suspend - initiate device suspend 4099 * 4100 * @dev: drm dev pointer 4101 * @fbcon : notify the fbdev of suspend 4102 * 4103 * Puts the hw in the suspend state (all asics). 4104 * Returns 0 for success or an error on failure. 4105 * Called at driver suspend. 4106 */ 4107 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4108 { 4109 struct amdgpu_device *adev = drm_to_adev(dev); 4110 int r = 0; 4111 4112 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4113 return 0; 4114 4115 adev->in_suspend = true; 4116 4117 /* Evict the majority of BOs before grabbing the full access */ 4118 r = amdgpu_device_evict_resources(adev); 4119 if (r) 4120 return r; 4121 4122 if (amdgpu_sriov_vf(adev)) { 4123 amdgpu_virt_fini_data_exchange(adev); 4124 r = amdgpu_virt_request_full_gpu(adev, false); 4125 if (r) 4126 return r; 4127 } 4128 4129 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4130 DRM_WARN("smart shift update failed\n"); 4131 4132 drm_kms_helper_poll_disable(dev); 4133 4134 if (fbcon) 4135 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4136 4137 cancel_delayed_work_sync(&adev->delayed_init_work); 4138 4139 amdgpu_ras_suspend(adev); 4140 4141 amdgpu_device_ip_suspend_phase1(adev); 4142 4143 if (!adev->in_s0ix) 4144 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4145 4146 r = amdgpu_device_evict_resources(adev); 4147 if (r) 4148 return r; 4149 4150 amdgpu_fence_driver_hw_fini(adev); 4151 4152 amdgpu_device_ip_suspend_phase2(adev); 4153 4154 if (amdgpu_sriov_vf(adev)) 4155 amdgpu_virt_release_full_gpu(adev, false); 4156 4157 return 0; 4158 } 4159 4160 /** 4161 * amdgpu_device_resume - initiate device resume 4162 * 4163 * @dev: drm dev pointer 4164 * @fbcon : notify the fbdev of resume 4165 * 4166 * Bring the hw back to operating state (all asics). 4167 * Returns 0 for success or an error on failure. 4168 * Called at driver resume. 
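 *
 * A minimal sketch of how a suspend/resume pair might be driven from a
 * PM callback (illustrative only: the example_pm_* names are
 * hypothetical and the real dev_pm_ops wiring lives outside this file):
 *
 *   static int example_pm_suspend(struct device *dev)
 *   {
 *           struct drm_device *drm_dev = dev_get_drvdata(dev);
 *
 *           return amdgpu_device_suspend(drm_dev, true);
 *   }
 *
 *   static int example_pm_resume(struct device *dev)
 *   {
 *           struct drm_device *drm_dev = dev_get_drvdata(dev);
 *
 *           return amdgpu_device_resume(drm_dev, true);
 *   }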
4169 */ 4170 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4171 { 4172 struct amdgpu_device *adev = drm_to_adev(dev); 4173 int r = 0; 4174 4175 if (amdgpu_sriov_vf(adev)) { 4176 r = amdgpu_virt_request_full_gpu(adev, true); 4177 if (r) 4178 return r; 4179 } 4180 4181 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4182 return 0; 4183 4184 if (adev->in_s0ix) 4185 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4186 4187 /* post card */ 4188 if (amdgpu_device_need_post(adev)) { 4189 r = amdgpu_device_asic_init(adev); 4190 if (r) 4191 dev_err(adev->dev, "amdgpu asic init failed\n"); 4192 } 4193 4194 r = amdgpu_device_ip_resume(adev); 4195 4196 if (r) { 4197 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4198 goto exit; 4199 } 4200 amdgpu_fence_driver_hw_init(adev); 4201 4202 r = amdgpu_device_ip_late_init(adev); 4203 if (r) 4204 goto exit; 4205 4206 queue_delayed_work(system_wq, &adev->delayed_init_work, 4207 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4208 4209 if (!adev->in_s0ix) { 4210 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4211 if (r) 4212 goto exit; 4213 } 4214 4215 exit: 4216 if (amdgpu_sriov_vf(adev)) { 4217 amdgpu_virt_init_data_exchange(adev); 4218 amdgpu_virt_release_full_gpu(adev, true); 4219 } 4220 4221 if (r) 4222 return r; 4223 4224 /* Make sure IB tests flushed */ 4225 flush_delayed_work(&adev->delayed_init_work); 4226 4227 if (fbcon) 4228 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4229 4230 drm_kms_helper_poll_enable(dev); 4231 4232 amdgpu_ras_resume(adev); 4233 4234 if (adev->mode_info.num_crtc) { 4235 /* 4236 * Most of the connector probing functions try to acquire runtime pm 4237 * refs to ensure that the GPU is powered on when connector polling is 4238 * performed. Since we're calling this from a runtime PM callback, 4239 * trying to acquire rpm refs will cause us to deadlock. 4240 * 4241 * Since we're guaranteed to be holding the rpm lock, it's safe to 4242 * temporarily disable the rpm helpers so this doesn't deadlock us. 4243 */ 4244 #ifdef CONFIG_PM 4245 dev->dev->power.disable_depth++; 4246 #endif 4247 if (!adev->dc_enabled) 4248 drm_helper_hpd_irq_event(dev); 4249 else 4250 drm_kms_helper_hotplug_event(dev); 4251 #ifdef CONFIG_PM 4252 dev->dev->power.disable_depth--; 4253 #endif 4254 } 4255 adev->in_suspend = false; 4256 4257 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4258 DRM_WARN("smart shift update failed\n"); 4259 4260 return 0; 4261 } 4262 4263 /** 4264 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4265 * 4266 * @adev: amdgpu_device pointer 4267 * 4268 * The list of all the hardware IPs that make up the asic is walked and 4269 * the check_soft_reset callbacks are run. check_soft_reset determines 4270 * if the asic is still hung or not. 4271 * Returns true if any of the IPs are still in a hung state, false if not. 
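 *
 * Condensed view of how this gates the soft reset path in
 * amdgpu_device_pre_asic_reset() later in this file (illustration only;
 * r and need_full_reset are locals of that function, error handling
 * trimmed):
 *
 *   if (amdgpu_device_ip_check_soft_reset(adev)) {
 *           amdgpu_device_ip_pre_soft_reset(adev);
 *           r = amdgpu_device_ip_soft_reset(adev);
 *           amdgpu_device_ip_post_soft_reset(adev);
 *           if (r || amdgpu_device_ip_check_soft_reset(adev))
 *                   need_full_reset = true;
 *   }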
4272 */ 4273 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4274 { 4275 int i; 4276 bool asic_hang = false; 4277 4278 if (amdgpu_sriov_vf(adev)) 4279 return true; 4280 4281 if (amdgpu_asic_need_full_reset(adev)) 4282 return true; 4283 4284 for (i = 0; i < adev->num_ip_blocks; i++) { 4285 if (!adev->ip_blocks[i].status.valid) 4286 continue; 4287 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4288 adev->ip_blocks[i].status.hang = 4289 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4290 if (adev->ip_blocks[i].status.hang) { 4291 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4292 asic_hang = true; 4293 } 4294 } 4295 return asic_hang; 4296 } 4297 4298 /** 4299 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4300 * 4301 * @adev: amdgpu_device pointer 4302 * 4303 * The list of all the hardware IPs that make up the asic is walked and the 4304 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4305 * handles any IP specific hardware or software state changes that are 4306 * necessary for a soft reset to succeed. 4307 * Returns 0 on success, negative error code on failure. 4308 */ 4309 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4310 { 4311 int i, r = 0; 4312 4313 for (i = 0; i < adev->num_ip_blocks; i++) { 4314 if (!adev->ip_blocks[i].status.valid) 4315 continue; 4316 if (adev->ip_blocks[i].status.hang && 4317 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4318 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4319 if (r) 4320 return r; 4321 } 4322 } 4323 4324 return 0; 4325 } 4326 4327 /** 4328 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4329 * 4330 * @adev: amdgpu_device pointer 4331 * 4332 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4333 * reset is necessary to recover. 4334 * Returns true if a full asic reset is required, false if not. 4335 */ 4336 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4337 { 4338 int i; 4339 4340 if (amdgpu_asic_need_full_reset(adev)) 4341 return true; 4342 4343 for (i = 0; i < adev->num_ip_blocks; i++) { 4344 if (!adev->ip_blocks[i].status.valid) 4345 continue; 4346 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4347 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4348 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4349 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4350 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4351 if (adev->ip_blocks[i].status.hang) { 4352 dev_info(adev->dev, "Some block need full reset!\n"); 4353 return true; 4354 } 4355 } 4356 } 4357 return false; 4358 } 4359 4360 /** 4361 * amdgpu_device_ip_soft_reset - do a soft reset 4362 * 4363 * @adev: amdgpu_device pointer 4364 * 4365 * The list of all the hardware IPs that make up the asic is walked and the 4366 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4367 * IP specific hardware or software state changes that are necessary to soft 4368 * reset the IP. 4369 * Returns 0 on success, negative error code on failure. 
4370 */ 4371 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4372 { 4373 int i, r = 0; 4374 4375 for (i = 0; i < adev->num_ip_blocks; i++) { 4376 if (!adev->ip_blocks[i].status.valid) 4377 continue; 4378 if (adev->ip_blocks[i].status.hang && 4379 adev->ip_blocks[i].version->funcs->soft_reset) { 4380 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4381 if (r) 4382 return r; 4383 } 4384 } 4385 4386 return 0; 4387 } 4388 4389 /** 4390 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4391 * 4392 * @adev: amdgpu_device pointer 4393 * 4394 * The list of all the hardware IPs that make up the asic is walked and the 4395 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4396 * handles any IP specific hardware or software state changes that are 4397 * necessary after the IP has been soft reset. 4398 * Returns 0 on success, negative error code on failure. 4399 */ 4400 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4401 { 4402 int i, r = 0; 4403 4404 for (i = 0; i < adev->num_ip_blocks; i++) { 4405 if (!adev->ip_blocks[i].status.valid) 4406 continue; 4407 if (adev->ip_blocks[i].status.hang && 4408 adev->ip_blocks[i].version->funcs->post_soft_reset) 4409 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4410 if (r) 4411 return r; 4412 } 4413 4414 return 0; 4415 } 4416 4417 /** 4418 * amdgpu_device_recover_vram - Recover some VRAM contents 4419 * 4420 * @adev: amdgpu_device pointer 4421 * 4422 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4423 * restore things like GPUVM page tables after a GPU reset where 4424 * the contents of VRAM might be lost. 4425 * 4426 * Returns: 4427 * 0 on success, negative error code on failure. 4428 */ 4429 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4430 { 4431 struct dma_fence *fence = NULL, *next = NULL; 4432 struct amdgpu_bo *shadow; 4433 struct amdgpu_bo_vm *vmbo; 4434 long r = 1, tmo; 4435 4436 if (amdgpu_sriov_runtime(adev)) 4437 tmo = msecs_to_jiffies(8000); 4438 else 4439 tmo = msecs_to_jiffies(100); 4440 4441 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4442 mutex_lock(&adev->shadow_list_lock); 4443 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4444 shadow = &vmbo->bo; 4445 /* No need to recover an evicted BO */ 4446 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4447 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4448 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4449 continue; 4450 4451 r = amdgpu_bo_restore_shadow(shadow, &next); 4452 if (r) 4453 break; 4454 4455 if (fence) { 4456 tmo = dma_fence_wait_timeout(fence, false, tmo); 4457 dma_fence_put(fence); 4458 fence = next; 4459 if (tmo == 0) { 4460 r = -ETIMEDOUT; 4461 break; 4462 } else if (tmo < 0) { 4463 r = tmo; 4464 break; 4465 } 4466 } else { 4467 fence = next; 4468 } 4469 } 4470 mutex_unlock(&adev->shadow_list_lock); 4471 4472 if (fence) 4473 tmo = dma_fence_wait_timeout(fence, false, tmo); 4474 dma_fence_put(fence); 4475 4476 if (r < 0 || tmo <= 0) { 4477 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4478 return -EIO; 4479 } 4480 4481 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4482 return 0; 4483 } 4484 4485 4486 /** 4487 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4488 * 4489 * @adev: amdgpu_device pointer 4490 * @from_hypervisor: request from hypervisor 4491 * 4492 * do VF FLR and reinitialize Asic 4493 * return 0 means succeeded 
otherwise failed 4494 */ 4495 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4496 bool from_hypervisor) 4497 { 4498 int r; 4499 struct amdgpu_hive_info *hive = NULL; 4500 int retry_limit = 0; 4501 4502 retry: 4503 amdgpu_amdkfd_pre_reset(adev); 4504 4505 if (from_hypervisor) 4506 r = amdgpu_virt_request_full_gpu(adev, true); 4507 else 4508 r = amdgpu_virt_reset_gpu(adev); 4509 if (r) 4510 return r; 4511 4512 /* Resume IP prior to SMC */ 4513 r = amdgpu_device_ip_reinit_early_sriov(adev); 4514 if (r) 4515 goto error; 4516 4517 amdgpu_virt_init_data_exchange(adev); 4518 4519 r = amdgpu_device_fw_loading(adev); 4520 if (r) 4521 return r; 4522 4523 /* now we are okay to resume SMC/CP/SDMA */ 4524 r = amdgpu_device_ip_reinit_late_sriov(adev); 4525 if (r) 4526 goto error; 4527 4528 hive = amdgpu_get_xgmi_hive(adev); 4529 /* Update PSP FW topology after reset */ 4530 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4531 r = amdgpu_xgmi_update_topology(hive, adev); 4532 4533 if (hive) 4534 amdgpu_put_xgmi_hive(hive); 4535 4536 if (!r) { 4537 amdgpu_irq_gpu_reset_resume_helper(adev); 4538 r = amdgpu_ib_ring_tests(adev); 4539 4540 amdgpu_amdkfd_post_reset(adev); 4541 } 4542 4543 error: 4544 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4545 amdgpu_inc_vram_lost(adev); 4546 r = amdgpu_device_recover_vram(adev); 4547 } 4548 amdgpu_virt_release_full_gpu(adev, true); 4549 4550 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4551 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4552 retry_limit++; 4553 goto retry; 4554 } else 4555 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4556 } 4557 4558 return r; 4559 } 4560 4561 /** 4562 * amdgpu_device_has_job_running - check if there is any job in mirror list 4563 * 4564 * @adev: amdgpu_device pointer 4565 * 4566 * check if there is any job in mirror list 4567 */ 4568 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4569 { 4570 int i; 4571 struct drm_sched_job *job; 4572 4573 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4574 struct amdgpu_ring *ring = adev->rings[i]; 4575 4576 if (!ring || !ring->sched.thread) 4577 continue; 4578 4579 spin_lock(&ring->sched.job_list_lock); 4580 job = list_first_entry_or_null(&ring->sched.pending_list, 4581 struct drm_sched_job, list); 4582 spin_unlock(&ring->sched.job_list_lock); 4583 if (job) 4584 return true; 4585 } 4586 return false; 4587 } 4588 4589 /** 4590 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4591 * 4592 * @adev: amdgpu_device pointer 4593 * 4594 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4595 * a hung GPU. 
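 *
 * A hang handler outside this file would typically gate recovery on
 * this helper before invoking the reset path, roughly (sketch only,
 * reset_context setup omitted):
 *
 *   if (amdgpu_device_should_recover_gpu(adev))
 *           r = amdgpu_device_gpu_recover(adev, job, &reset_context);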
4596 */ 4597 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4598 { 4599 4600 if (amdgpu_gpu_recovery == 0) 4601 goto disabled; 4602 4603 /* Skip soft reset check in fatal error mode */ 4604 if (!amdgpu_ras_is_poison_mode_supported(adev)) 4605 return true; 4606 4607 if (amdgpu_sriov_vf(adev)) 4608 return true; 4609 4610 if (amdgpu_gpu_recovery == -1) { 4611 switch (adev->asic_type) { 4612 #ifdef CONFIG_DRM_AMDGPU_SI 4613 case CHIP_VERDE: 4614 case CHIP_TAHITI: 4615 case CHIP_PITCAIRN: 4616 case CHIP_OLAND: 4617 case CHIP_HAINAN: 4618 #endif 4619 #ifdef CONFIG_DRM_AMDGPU_CIK 4620 case CHIP_KAVERI: 4621 case CHIP_KABINI: 4622 case CHIP_MULLINS: 4623 #endif 4624 case CHIP_CARRIZO: 4625 case CHIP_STONEY: 4626 case CHIP_CYAN_SKILLFISH: 4627 goto disabled; 4628 default: 4629 break; 4630 } 4631 } 4632 4633 return true; 4634 4635 disabled: 4636 dev_info(adev->dev, "GPU recovery disabled.\n"); 4637 return false; 4638 } 4639 4640 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4641 { 4642 u32 i; 4643 int ret = 0; 4644 4645 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4646 4647 dev_info(adev->dev, "GPU mode1 reset\n"); 4648 4649 /* disable BM */ 4650 pci_clear_master(adev->pdev); 4651 4652 amdgpu_device_cache_pci_state(adev->pdev); 4653 4654 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4655 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4656 ret = amdgpu_dpm_mode1_reset(adev); 4657 } else { 4658 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4659 ret = psp_gpu_reset(adev); 4660 } 4661 4662 if (ret) 4663 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4664 4665 amdgpu_device_load_pci_state(adev->pdev); 4666 4667 /* wait for asic to come out of reset */ 4668 for (i = 0; i < adev->usec_timeout; i++) { 4669 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4670 4671 if (memsize != 0xffffffff) 4672 break; 4673 udelay(1); 4674 } 4675 4676 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4677 return ret; 4678 } 4679 4680 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4681 struct amdgpu_reset_context *reset_context) 4682 { 4683 int i, r = 0; 4684 struct amdgpu_job *job = NULL; 4685 bool need_full_reset = 4686 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4687 4688 if (reset_context->reset_req_dev == adev) 4689 job = reset_context->job; 4690 4691 if (amdgpu_sriov_vf(adev)) { 4692 /* stop the data exchange thread */ 4693 amdgpu_virt_fini_data_exchange(adev); 4694 } 4695 4696 amdgpu_fence_driver_isr_toggle(adev, true); 4697 4698 /* block all schedulers and reset given job's ring */ 4699 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4700 struct amdgpu_ring *ring = adev->rings[i]; 4701 4702 if (!ring || !ring->sched.thread) 4703 continue; 4704 4705 /*clear job fence from fence drv to avoid force_completion 4706 *leave NULL and vm flush fence in fence drv */ 4707 amdgpu_fence_driver_clear_job_fences(ring); 4708 4709 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4710 amdgpu_fence_driver_force_completion(ring); 4711 } 4712 4713 amdgpu_fence_driver_isr_toggle(adev, false); 4714 4715 if (job && job->vm) 4716 drm_sched_increase_karma(&job->base); 4717 4718 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4719 /* If reset handler not implemented, continue; otherwise return */ 4720 if (r == -ENOSYS) 4721 r = 0; 4722 else 4723 return r; 4724 4725 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4726 if (!amdgpu_sriov_vf(adev)) { 4727 4728 if (!need_full_reset) 4729 need_full_reset = 
amdgpu_device_ip_need_full_reset(adev); 4730 4731 if (!need_full_reset && amdgpu_gpu_recovery && 4732 amdgpu_device_ip_check_soft_reset(adev)) { 4733 amdgpu_device_ip_pre_soft_reset(adev); 4734 r = amdgpu_device_ip_soft_reset(adev); 4735 amdgpu_device_ip_post_soft_reset(adev); 4736 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4737 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4738 need_full_reset = true; 4739 } 4740 } 4741 4742 if (need_full_reset) 4743 r = amdgpu_device_ip_suspend(adev); 4744 if (need_full_reset) 4745 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4746 else 4747 clear_bit(AMDGPU_NEED_FULL_RESET, 4748 &reset_context->flags); 4749 } 4750 4751 return r; 4752 } 4753 4754 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 4755 { 4756 int i; 4757 4758 lockdep_assert_held(&adev->reset_domain->sem); 4759 4760 for (i = 0; i < adev->num_regs; i++) { 4761 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); 4762 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], 4763 adev->reset_dump_reg_value[i]); 4764 } 4765 4766 return 0; 4767 } 4768 4769 #ifdef CONFIG_DEV_COREDUMP 4770 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, 4771 size_t count, void *data, size_t datalen) 4772 { 4773 struct drm_printer p; 4774 struct amdgpu_device *adev = data; 4775 struct drm_print_iterator iter; 4776 int i; 4777 4778 iter.data = buffer; 4779 iter.offset = 0; 4780 iter.start = offset; 4781 iter.remain = count; 4782 4783 p = drm_coredump_printer(&iter); 4784 4785 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 4786 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 4787 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 4788 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec); 4789 if (adev->reset_task_info.pid) 4790 drm_printf(&p, "process_name: %s PID: %d\n", 4791 adev->reset_task_info.process_name, 4792 adev->reset_task_info.pid); 4793 4794 if (adev->reset_vram_lost) 4795 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 4796 if (adev->num_regs) { 4797 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); 4798 4799 for (i = 0; i < adev->num_regs; i++) 4800 drm_printf(&p, "0x%08x: 0x%08x\n", 4801 adev->reset_dump_reg_list[i], 4802 adev->reset_dump_reg_value[i]); 4803 } 4804 4805 return count - iter.remain; 4806 } 4807 4808 static void amdgpu_devcoredump_free(void *data) 4809 { 4810 } 4811 4812 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) 4813 { 4814 struct drm_device *dev = adev_to_drm(adev); 4815 4816 ktime_get_ts64(&adev->reset_time); 4817 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL, 4818 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 4819 } 4820 #endif 4821 4822 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4823 struct amdgpu_reset_context *reset_context) 4824 { 4825 struct amdgpu_device *tmp_adev = NULL; 4826 bool need_full_reset, skip_hw_reset, vram_lost = false; 4827 int r = 0; 4828 bool gpu_reset_for_dev_remove = 0; 4829 4830 /* Try reset handler method first */ 4831 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4832 reset_list); 4833 amdgpu_reset_reg_dumps(tmp_adev); 4834 4835 reset_context->reset_device_list = device_list_handle; 4836 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4837 /* If reset handler not implemented, continue; otherwise return */ 4838 if (r == -ENOSYS) 4839 r = 0; 4840 else 4841 return r; 4842 4843 /* Reset handler not implemented, use the 
default method */ 4844 need_full_reset = 4845 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4846 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4847 4848 gpu_reset_for_dev_remove = 4849 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 4850 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4851 4852 /* 4853 * ASIC reset has to be done on all XGMI hive nodes ASAP 4854 * to allow proper links negotiation in FW (within 1 sec) 4855 */ 4856 if (!skip_hw_reset && need_full_reset) { 4857 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4858 /* For XGMI run all resets in parallel to speed up the process */ 4859 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4860 tmp_adev->gmc.xgmi.pending_reset = false; 4861 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4862 r = -EALREADY; 4863 } else 4864 r = amdgpu_asic_reset(tmp_adev); 4865 4866 if (r) { 4867 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4868 r, adev_to_drm(tmp_adev)->unique); 4869 break; 4870 } 4871 } 4872 4873 /* For XGMI wait for all resets to complete before proceed */ 4874 if (!r) { 4875 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4876 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4877 flush_work(&tmp_adev->xgmi_reset_work); 4878 r = tmp_adev->asic_reset_res; 4879 if (r) 4880 break; 4881 } 4882 } 4883 } 4884 } 4885 4886 if (!r && amdgpu_ras_intr_triggered()) { 4887 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4888 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && 4889 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 4890 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 4891 } 4892 4893 amdgpu_ras_intr_cleared(); 4894 } 4895 4896 /* Since the mode1 reset affects base ip blocks, the 4897 * phase1 ip blocks need to be resumed. Otherwise there 4898 * will be a BIOS signature error and the psp bootloader 4899 * can't load kdb on the next amdgpu install. 
4900 */ 4901 if (gpu_reset_for_dev_remove) { 4902 list_for_each_entry(tmp_adev, device_list_handle, reset_list) 4903 amdgpu_device_ip_resume_phase1(tmp_adev); 4904 4905 goto end; 4906 } 4907 4908 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4909 if (need_full_reset) { 4910 /* post card */ 4911 r = amdgpu_device_asic_init(tmp_adev); 4912 if (r) { 4913 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4914 } else { 4915 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4916 r = amdgpu_amdkfd_resume_iommu(tmp_adev); 4917 if (r) 4918 goto out; 4919 4920 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4921 if (r) 4922 goto out; 4923 4924 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4925 #ifdef CONFIG_DEV_COREDUMP 4926 tmp_adev->reset_vram_lost = vram_lost; 4927 memset(&tmp_adev->reset_task_info, 0, 4928 sizeof(tmp_adev->reset_task_info)); 4929 if (reset_context->job && reset_context->job->vm) 4930 tmp_adev->reset_task_info = 4931 reset_context->job->vm->task_info; 4932 amdgpu_reset_capture_coredumpm(tmp_adev); 4933 #endif 4934 if (vram_lost) { 4935 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4936 amdgpu_inc_vram_lost(tmp_adev); 4937 } 4938 4939 r = amdgpu_device_fw_loading(tmp_adev); 4940 if (r) 4941 return r; 4942 4943 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4944 if (r) 4945 goto out; 4946 4947 if (vram_lost) 4948 amdgpu_device_fill_reset_magic(tmp_adev); 4949 4950 /* 4951 * Add this ASIC as tracked as reset was already 4952 * complete successfully. 4953 */ 4954 amdgpu_register_gpu_instance(tmp_adev); 4955 4956 if (!reset_context->hive && 4957 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4958 amdgpu_xgmi_add_device(tmp_adev); 4959 4960 r = amdgpu_device_ip_late_init(tmp_adev); 4961 if (r) 4962 goto out; 4963 4964 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 4965 4966 /* 4967 * The GPU enters bad state once faulty pages 4968 * by ECC has reached the threshold, and ras 4969 * recovery is scheduled next. So add one check 4970 * here to break recovery if it indeed exceeds 4971 * bad page threshold, and remind user to 4972 * retire this GPU or setting one bigger 4973 * bad_page_threshold value to fix this once 4974 * probing driver again. 4975 */ 4976 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 4977 /* must succeed. 
*/ 4978 amdgpu_ras_resume(tmp_adev); 4979 } else { 4980 r = -EINVAL; 4981 goto out; 4982 } 4983 4984 /* Update PSP FW topology after reset */ 4985 if (reset_context->hive && 4986 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4987 r = amdgpu_xgmi_update_topology( 4988 reset_context->hive, tmp_adev); 4989 } 4990 } 4991 4992 out: 4993 if (!r) { 4994 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 4995 r = amdgpu_ib_ring_tests(tmp_adev); 4996 if (r) { 4997 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 4998 need_full_reset = true; 4999 r = -EAGAIN; 5000 goto end; 5001 } 5002 } 5003 5004 if (!r) 5005 r = amdgpu_device_recover_vram(tmp_adev); 5006 else 5007 tmp_adev->asic_reset_res = r; 5008 } 5009 5010 end: 5011 if (need_full_reset) 5012 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5013 else 5014 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5015 return r; 5016 } 5017 5018 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5019 { 5020 5021 switch (amdgpu_asic_reset_method(adev)) { 5022 case AMD_RESET_METHOD_MODE1: 5023 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5024 break; 5025 case AMD_RESET_METHOD_MODE2: 5026 adev->mp1_state = PP_MP1_STATE_RESET; 5027 break; 5028 default: 5029 adev->mp1_state = PP_MP1_STATE_NONE; 5030 break; 5031 } 5032 } 5033 5034 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5035 { 5036 amdgpu_vf_error_trans_all(adev); 5037 adev->mp1_state = PP_MP1_STATE_NONE; 5038 } 5039 5040 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5041 { 5042 struct pci_dev *p = NULL; 5043 5044 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5045 adev->pdev->bus->number, 1); 5046 if (p) { 5047 pm_runtime_enable(&(p->dev)); 5048 pm_runtime_resume(&(p->dev)); 5049 } 5050 5051 pci_dev_put(p); 5052 } 5053 5054 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5055 { 5056 enum amd_reset_method reset_method; 5057 struct pci_dev *p = NULL; 5058 u64 expires; 5059 5060 /* 5061 * For now, only BACO and mode1 reset are confirmed 5062 * to suffer the audio issue without proper suspended. 5063 */ 5064 reset_method = amdgpu_asic_reset_method(adev); 5065 if ((reset_method != AMD_RESET_METHOD_BACO) && 5066 (reset_method != AMD_RESET_METHOD_MODE1)) 5067 return -EINVAL; 5068 5069 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5070 adev->pdev->bus->number, 1); 5071 if (!p) 5072 return -ENODEV; 5073 5074 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5075 if (!expires) 5076 /* 5077 * If we cannot get the audio device autosuspend delay, 5078 * a fixed 4S interval will be used. Considering 3S is 5079 * the audio controller default autosuspend delay setting. 5080 * 4S used here is guaranteed to cover that. 5081 */ 5082 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5083 5084 while (!pm_runtime_status_suspended(&(p->dev))) { 5085 if (!pm_runtime_suspend(&(p->dev))) 5086 break; 5087 5088 if (expires < ktime_get_mono_fast_ns()) { 5089 dev_warn(adev->dev, "failed to suspend display audio\n"); 5090 pci_dev_put(p); 5091 /* TODO: abort the succeeding gpu reset? 
*/ 5092 return -ETIMEDOUT; 5093 } 5094 } 5095 5096 pm_runtime_disable(&(p->dev)); 5097 5098 pci_dev_put(p); 5099 return 0; 5100 } 5101 5102 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5103 { 5104 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5105 5106 #if defined(CONFIG_DEBUG_FS) 5107 if (!amdgpu_sriov_vf(adev)) 5108 cancel_work(&adev->reset_work); 5109 #endif 5110 5111 if (adev->kfd.dev) 5112 cancel_work(&adev->kfd.reset_work); 5113 5114 if (amdgpu_sriov_vf(adev)) 5115 cancel_work(&adev->virt.flr_work); 5116 5117 if (con && adev->ras_enabled) 5118 cancel_work(&con->recovery_work); 5119 5120 } 5121 5122 /** 5123 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5124 * 5125 * @adev: amdgpu_device pointer 5126 * @job: which job trigger hang 5127 * 5128 * Attempt to reset the GPU if it has hung (all asics). 5129 * Attempt to do soft-reset or full-reset and reinitialize Asic 5130 * Returns 0 for success or an error on failure. 5131 */ 5132 5133 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5134 struct amdgpu_job *job, 5135 struct amdgpu_reset_context *reset_context) 5136 { 5137 struct list_head device_list, *device_list_handle = NULL; 5138 bool job_signaled = false; 5139 struct amdgpu_hive_info *hive = NULL; 5140 struct amdgpu_device *tmp_adev = NULL; 5141 int i, r = 0; 5142 bool need_emergency_restart = false; 5143 bool audio_suspended = false; 5144 bool gpu_reset_for_dev_remove = false; 5145 5146 gpu_reset_for_dev_remove = 5147 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5148 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5149 5150 /* 5151 * Special case: RAS triggered and full reset isn't supported 5152 */ 5153 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5154 5155 /* 5156 * Flush RAM to disk so that after reboot 5157 * the user can read log and see why the system rebooted. 5158 */ 5159 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5160 DRM_WARN("Emergency reboot."); 5161 5162 ksys_sync_helper(); 5163 emergency_restart(); 5164 } 5165 5166 dev_info(adev->dev, "GPU %s begin!\n", 5167 need_emergency_restart ? "jobs stop":"reset"); 5168 5169 if (!amdgpu_sriov_vf(adev)) 5170 hive = amdgpu_get_xgmi_hive(adev); 5171 if (hive) 5172 mutex_lock(&hive->hive_lock); 5173 5174 reset_context->job = job; 5175 reset_context->hive = hive; 5176 /* 5177 * Build list of devices to reset. 5178 * In case we are in XGMI hive mode, resort the device list 5179 * to put adev in the 1st position. 
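 * The hive members are appended in hive order and the list is then
 * rotated with list_rotate_to_front() so the device that triggered the
 * recovery is handled first.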
5180 */ 5181 INIT_LIST_HEAD(&device_list); 5182 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5183 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5184 list_add_tail(&tmp_adev->reset_list, &device_list); 5185 if (gpu_reset_for_dev_remove && adev->shutdown) 5186 tmp_adev->shutdown = true; 5187 } 5188 if (!list_is_first(&adev->reset_list, &device_list)) 5189 list_rotate_to_front(&adev->reset_list, &device_list); 5190 device_list_handle = &device_list; 5191 } else { 5192 list_add_tail(&adev->reset_list, &device_list); 5193 device_list_handle = &device_list; 5194 } 5195 5196 /* We need to lock reset domain only once both for XGMI and single device */ 5197 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5198 reset_list); 5199 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5200 5201 /* block all schedulers and reset given job's ring */ 5202 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5203 5204 amdgpu_device_set_mp1_state(tmp_adev); 5205 5206 /* 5207 * Try to put the audio codec into suspend state 5208 * before gpu reset started. 5209 * 5210 * Due to the power domain of the graphics device 5211 * is shared with AZ power domain. Without this, 5212 * we may change the audio hardware from behind 5213 * the audio driver's back. That will trigger 5214 * some audio codec errors. 5215 */ 5216 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5217 audio_suspended = true; 5218 5219 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5220 5221 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5222 5223 if (!amdgpu_sriov_vf(tmp_adev)) 5224 amdgpu_amdkfd_pre_reset(tmp_adev); 5225 5226 /* 5227 * Mark these ASICs to be reseted as untracked first 5228 * And add them back after reset completed 5229 */ 5230 amdgpu_unregister_gpu_instance(tmp_adev); 5231 5232 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5233 5234 /* disable ras on ALL IPs */ 5235 if (!need_emergency_restart && 5236 amdgpu_device_ip_need_full_reset(tmp_adev)) 5237 amdgpu_ras_suspend(tmp_adev); 5238 5239 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5240 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5241 5242 if (!ring || !ring->sched.thread) 5243 continue; 5244 5245 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5246 5247 if (need_emergency_restart) 5248 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5249 } 5250 atomic_inc(&tmp_adev->gpu_reset_counter); 5251 } 5252 5253 if (need_emergency_restart) 5254 goto skip_sched_resume; 5255 5256 /* 5257 * Must check guilty signal here since after this point all old 5258 * HW fences are force signaled. 5259 * 5260 * job->base holds a reference to parent fence 5261 */ 5262 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5263 job_signaled = true; 5264 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5265 goto skip_hw_reset; 5266 } 5267 5268 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5269 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5270 if (gpu_reset_for_dev_remove) { 5271 /* Workaroud for ASICs need to disable SMC first */ 5272 amdgpu_device_smu_fini_early(tmp_adev); 5273 } 5274 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5275 /*TODO Should we stop ?*/ 5276 if (r) { 5277 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5278 r, adev_to_drm(tmp_adev)->unique); 5279 tmp_adev->asic_reset_res = r; 5280 } 5281 5282 /* 5283 * Drop all pending non scheduler resets. 
Scheduler resets 5284 * were already dropped during drm_sched_stop 5285 */ 5286 amdgpu_device_stop_pending_resets(tmp_adev); 5287 } 5288 5289 /* Actual ASIC resets if needed.*/ 5290 /* Host driver will handle XGMI hive reset for SRIOV */ 5291 if (amdgpu_sriov_vf(adev)) { 5292 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5293 if (r) 5294 adev->asic_reset_res = r; 5295 5296 /* Aldebaran supports ras in SRIOV, so need resume ras during reset */ 5297 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2)) 5298 amdgpu_ras_resume(adev); 5299 } else { 5300 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5301 if (r && r == -EAGAIN) 5302 goto retry; 5303 5304 if (!r && gpu_reset_for_dev_remove) 5305 goto recover_end; 5306 } 5307 5308 skip_hw_reset: 5309 5310 /* Post ASIC reset for all devs .*/ 5311 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5312 5313 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5314 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5315 5316 if (!ring || !ring->sched.thread) 5317 continue; 5318 5319 drm_sched_start(&ring->sched, true); 5320 } 5321 5322 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3)) 5323 amdgpu_mes_self_test(tmp_adev); 5324 5325 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) { 5326 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5327 } 5328 5329 if (tmp_adev->asic_reset_res) 5330 r = tmp_adev->asic_reset_res; 5331 5332 tmp_adev->asic_reset_res = 0; 5333 5334 if (r) { 5335 /* bad news, how to tell it to userspace ? */ 5336 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5337 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5338 } else { 5339 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5340 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5341 DRM_WARN("smart shift update failed\n"); 5342 } 5343 } 5344 5345 skip_sched_resume: 5346 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5347 /* unlock kfd: SRIOV would do it separately */ 5348 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5349 amdgpu_amdkfd_post_reset(tmp_adev); 5350 5351 /* kfd_post_reset will do nothing if kfd device is not initialized, 5352 * need to bring up kfd here if it's not be initialized before 5353 */ 5354 if (!adev->kfd.init_complete) 5355 amdgpu_amdkfd_device_init(adev); 5356 5357 if (audio_suspended) 5358 amdgpu_device_resume_display_audio(tmp_adev); 5359 5360 amdgpu_device_unset_mp1_state(tmp_adev); 5361 5362 amdgpu_ras_set_error_query_ready(tmp_adev, true); 5363 } 5364 5365 recover_end: 5366 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5367 reset_list); 5368 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5369 5370 if (hive) { 5371 mutex_unlock(&hive->hive_lock); 5372 amdgpu_put_xgmi_hive(hive); 5373 } 5374 5375 if (r) 5376 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5377 5378 atomic_set(&adev->reset_domain->reset_res, r); 5379 return r; 5380 } 5381 5382 /** 5383 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 5384 * 5385 * @adev: amdgpu_device pointer 5386 * 5387 * Fetchs and stores in the driver the PCIE capabilities (gen speed 5388 * and lanes) of the slot the device is in. Handles APUs and 5389 * virtualized environments where PCIE config space may not be available. 
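 *
 * Summarised decision order (a condensed view of the code below):
 *
 * 1. honour the amdgpu_pcie_gen_cap / amdgpu_pcie_lane_cap overrides,
 * 2. fall back to AMDGPU_DEFAULT_PCIE_GEN_MASK / AMDGPU_DEFAULT_PCIE_MLW_MASK
 *    for devices on a root bus (this covers APUs),
 * 3. otherwise combine the ASIC capability from pcie_get_speed_cap()
 *    with the platform capability reported by pcie_bandwidth_available().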
5390 */ 5391 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5392 { 5393 struct pci_dev *pdev; 5394 enum pci_bus_speed speed_cap, platform_speed_cap; 5395 enum pcie_link_width platform_link_width; 5396 5397 if (amdgpu_pcie_gen_cap) 5398 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5399 5400 if (amdgpu_pcie_lane_cap) 5401 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5402 5403 /* covers APUs as well */ 5404 if (pci_is_root_bus(adev->pdev->bus)) { 5405 if (adev->pm.pcie_gen_mask == 0) 5406 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5407 if (adev->pm.pcie_mlw_mask == 0) 5408 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5409 return; 5410 } 5411 5412 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5413 return; 5414 5415 pcie_bandwidth_available(adev->pdev, NULL, 5416 &platform_speed_cap, &platform_link_width); 5417 5418 if (adev->pm.pcie_gen_mask == 0) { 5419 /* asic caps */ 5420 pdev = adev->pdev; 5421 speed_cap = pcie_get_speed_cap(pdev); 5422 if (speed_cap == PCI_SPEED_UNKNOWN) { 5423 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5424 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5425 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5426 } else { 5427 if (speed_cap == PCIE_SPEED_32_0GT) 5428 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5429 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5430 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5431 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5432 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5433 else if (speed_cap == PCIE_SPEED_16_0GT) 5434 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5435 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5436 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5437 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5438 else if (speed_cap == PCIE_SPEED_8_0GT) 5439 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5440 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5441 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5442 else if (speed_cap == PCIE_SPEED_5_0GT) 5443 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5444 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5445 else 5446 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5447 } 5448 /* platform caps */ 5449 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5450 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5451 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5452 } else { 5453 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5454 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5455 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5456 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5457 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5458 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5459 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5460 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5461 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5462 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5463 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5464 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5465 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5466 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5467 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5468 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5469 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5470 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5471 else 5472 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5473 5474 } 5475 } 5476 if (adev->pm.pcie_mlw_mask == 0) { 5477 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 5478 adev->pm.pcie_mlw_mask 
|= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5479 } else { 5480 switch (platform_link_width) { 5481 case PCIE_LNK_X32: 5482 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5483 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5484 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5485 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5486 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5487 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5488 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5489 break; 5490 case PCIE_LNK_X16: 5491 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5492 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5493 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5494 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5495 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5496 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5497 break; 5498 case PCIE_LNK_X12: 5499 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5500 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5501 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5502 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5503 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5504 break; 5505 case PCIE_LNK_X8: 5506 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5507 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5508 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5509 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5510 break; 5511 case PCIE_LNK_X4: 5512 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5513 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5514 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5515 break; 5516 case PCIE_LNK_X2: 5517 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5518 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5519 break; 5520 case PCIE_LNK_X1: 5521 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5522 break; 5523 default: 5524 break; 5525 } 5526 } 5527 } 5528 } 5529 5530 /** 5531 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 5532 * 5533 * @adev: amdgpu_device pointer 5534 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 5535 * 5536 * Return true if @peer_adev can access (DMA) @adev through the PCIe 5537 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 5538 * @peer_adev. 5539 */ 5540 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 5541 struct amdgpu_device *peer_adev) 5542 { 5543 #ifdef CONFIG_HSA_AMD_P2P 5544 uint64_t address_mask = peer_adev->dev->dma_mask ? 
5545 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 5546 resource_size_t aper_limit = 5547 adev->gmc.aper_base + adev->gmc.aper_size - 1; 5548 bool p2p_access = 5549 !adev->gmc.xgmi.connected_to_cpu && 5550 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 5551 5552 return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size && 5553 adev->gmc.real_vram_size == adev->gmc.visible_vram_size && 5554 !(adev->gmc.aper_base & address_mask || 5555 aper_limit & address_mask)); 5556 #else 5557 return false; 5558 #endif 5559 } 5560 5561 int amdgpu_device_baco_enter(struct drm_device *dev) 5562 { 5563 struct amdgpu_device *adev = drm_to_adev(dev); 5564 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5565 5566 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5567 return -ENOTSUPP; 5568 5569 if (ras && adev->ras_enabled && 5570 adev->nbio.funcs->enable_doorbell_interrupt) 5571 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5572 5573 return amdgpu_dpm_baco_enter(adev); 5574 } 5575 5576 int amdgpu_device_baco_exit(struct drm_device *dev) 5577 { 5578 struct amdgpu_device *adev = drm_to_adev(dev); 5579 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5580 int ret = 0; 5581 5582 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5583 return -ENOTSUPP; 5584 5585 ret = amdgpu_dpm_baco_exit(adev); 5586 if (ret) 5587 return ret; 5588 5589 if (ras && adev->ras_enabled && 5590 adev->nbio.funcs->enable_doorbell_interrupt) 5591 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5592 5593 if (amdgpu_passthrough(adev) && 5594 adev->nbio.funcs->clear_doorbell_interrupt) 5595 adev->nbio.funcs->clear_doorbell_interrupt(adev); 5596 5597 return 0; 5598 } 5599 5600 /** 5601 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5602 * @pdev: PCI device struct 5603 * @state: PCI channel state 5604 * 5605 * Description: Called when a PCI error is detected. 5606 * 5607 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
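 *
 * (pci_channel_io_normal is answered with PCI_ERS_RESULT_CAN_RECOVER,
 * see the switch below.)
 *
 * These callbacks reach the PCI core through a struct pci_error_handlers
 * table; a sketch of that wiring (the actual instance lives with the PCI
 * driver definition outside this file):
 *
 *   static const struct pci_error_handlers example_err_handler = {
 *           .error_detected = amdgpu_pci_error_detected,
 *           .mmio_enabled   = amdgpu_pci_mmio_enabled,
 *           .slot_reset     = amdgpu_pci_slot_reset,
 *           .resume         = amdgpu_pci_resume,
 *   };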
5608 */ 5609 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 5610 { 5611 struct drm_device *dev = pci_get_drvdata(pdev); 5612 struct amdgpu_device *adev = drm_to_adev(dev); 5613 int i; 5614 5615 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 5616 5617 if (adev->gmc.xgmi.num_physical_nodes > 1) { 5618 DRM_WARN("No support for XGMI hive yet..."); 5619 return PCI_ERS_RESULT_DISCONNECT; 5620 } 5621 5622 adev->pci_channel_state = state; 5623 5624 switch (state) { 5625 case pci_channel_io_normal: 5626 return PCI_ERS_RESULT_CAN_RECOVER; 5627 /* Fatal error, prepare for slot reset */ 5628 case pci_channel_io_frozen: 5629 /* 5630 * Locking adev->reset_domain->sem will prevent any external access 5631 * to GPU during PCI error recovery 5632 */ 5633 amdgpu_device_lock_reset_domain(adev->reset_domain); 5634 amdgpu_device_set_mp1_state(adev); 5635 5636 /* 5637 * Block any work scheduling as we do for regular GPU reset 5638 * for the duration of the recovery 5639 */ 5640 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5641 struct amdgpu_ring *ring = adev->rings[i]; 5642 5643 if (!ring || !ring->sched.thread) 5644 continue; 5645 5646 drm_sched_stop(&ring->sched, NULL); 5647 } 5648 atomic_inc(&adev->gpu_reset_counter); 5649 return PCI_ERS_RESULT_NEED_RESET; 5650 case pci_channel_io_perm_failure: 5651 /* Permanent error, prepare for device removal */ 5652 return PCI_ERS_RESULT_DISCONNECT; 5653 } 5654 5655 return PCI_ERS_RESULT_NEED_RESET; 5656 } 5657 5658 /** 5659 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 5660 * @pdev: pointer to PCI device 5661 */ 5662 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 5663 { 5664 5665 DRM_INFO("PCI error: mmio enabled callback!!\n"); 5666 5667 /* TODO - dump whatever for debugging purposes */ 5668 5669 /* This called only if amdgpu_pci_error_detected returns 5670 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 5671 * works, no need to reset slot. 5672 */ 5673 5674 return PCI_ERS_RESULT_RECOVERED; 5675 } 5676 5677 /** 5678 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 5679 * @pdev: PCI device struct 5680 * 5681 * Description: This routine is called by the pci error recovery 5682 * code after the PCI slot has been reset, just before we 5683 * should resume normal operations. 
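 *
 * Recovery here reuses the normal reset machinery: PCI config space is
 * restored, the ASIC is polled through amdgpu_asic_get_config_memsize()
 * until it responds, and then amdgpu_device_pre_asic_reset() and
 * amdgpu_do_asic_reset() run with AMDGPU_SKIP_HW_RESET set, since the
 * slot reset has already reset the hardware.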
5684 */ 5685 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 5686 { 5687 struct drm_device *dev = pci_get_drvdata(pdev); 5688 struct amdgpu_device *adev = drm_to_adev(dev); 5689 int r, i; 5690 struct amdgpu_reset_context reset_context; 5691 u32 memsize; 5692 struct list_head device_list; 5693 5694 DRM_INFO("PCI error: slot reset callback!!\n"); 5695 5696 memset(&reset_context, 0, sizeof(reset_context)); 5697 5698 INIT_LIST_HEAD(&device_list); 5699 list_add_tail(&adev->reset_list, &device_list); 5700 5701 /* wait for asic to come out of reset */ 5702 msleep(500); 5703 5704 /* Restore PCI confspace */ 5705 amdgpu_device_load_pci_state(pdev); 5706 5707 /* confirm ASIC came out of reset */ 5708 for (i = 0; i < adev->usec_timeout; i++) { 5709 memsize = amdgpu_asic_get_config_memsize(adev); 5710 5711 if (memsize != 0xffffffff) 5712 break; 5713 udelay(1); 5714 } 5715 if (memsize == 0xffffffff) { 5716 r = -ETIME; 5717 goto out; 5718 } 5719 5720 reset_context.method = AMD_RESET_METHOD_NONE; 5721 reset_context.reset_req_dev = adev; 5722 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 5723 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 5724 5725 adev->no_hw_access = true; 5726 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 5727 adev->no_hw_access = false; 5728 if (r) 5729 goto out; 5730 5731 r = amdgpu_do_asic_reset(&device_list, &reset_context); 5732 5733 out: 5734 if (!r) { 5735 if (amdgpu_device_cache_pci_state(adev->pdev)) 5736 pci_restore_state(adev->pdev); 5737 5738 DRM_INFO("PCIe error recovery succeeded\n"); 5739 } else { 5740 DRM_ERROR("PCIe error recovery failed, err:%d", r); 5741 amdgpu_device_unset_mp1_state(adev); 5742 amdgpu_device_unlock_reset_domain(adev->reset_domain); 5743 } 5744 5745 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 5746 } 5747 5748 /** 5749 * amdgpu_pci_resume() - resume normal ops after PCI reset 5750 * @pdev: pointer to PCI device 5751 * 5752 * Called when the error recovery driver tells us that its 5753 * OK to resume normal operation. 
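 *
 * Only the pci_channel_io_frozen path in amdgpu_pci_error_detected()
 * stopped the schedulers and took the reset domain lock, so this
 * callback restarts the rings and releases the lock only in that case.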
/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's
 * OK to resume normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		drm_sched_start(&ring->sched, true);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
}

bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}
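
/*
 * Hypothetical usage sketch (not wired up anywhere): hardware-touching
 * paths typically consult amdgpu_in_reset() and bail out while a reset
 * owns the device, for example:
 */
static __maybe_unused bool amdgpu_example_hw_access_allowed(struct amdgpu_device *adev)
{
	/* another thread holds the reset domain and may reprogram the ASIC */
	if (amdgpu_in_reset(adev))
		return false;

	/* MMIO has been torn down, see amdgpu_device_halt() below */
	if (adev->no_hw_access)
		return false;

	return true;
}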
/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It helps to maintain the error context when an error occurs.
 * Compared to a simple hang, the system will stay stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc.),
 *    clears all CPU mappings to the device, disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in-flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!adev->ip_versions[DCE_HWIP][0] ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}
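
/*
 * Hypothetical caller sketch for amdgpu_device_switch_gang() above (not
 * part of the driver): real submission paths add the returned fence as a
 * scheduler dependency instead of blocking, but the blocking variant below
 * illustrates the contract: NULL means the switch succeeded, a non-NULL
 * fence is the current gang leader that must finish first.
 */
static __maybe_unused int amdgpu_example_switch_gang_blocking(struct amdgpu_device *adev,
							      struct dma_fence *gang)
{
	struct dma_fence *old;

	while ((old = amdgpu_device_switch_gang(adev, gang))) {
		/* wait for the previous gang leader, then retry the switch */
		long r = dma_fence_wait(old, true);

		dma_fence_put(old);
		if (r)
			return r;
	}

	return 0;
}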