1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 #include <linux/iommu.h> 34 #include <linux/pci.h> 35 36 #include <drm/drm_atomic_helper.h> 37 #include <drm/drm_probe_helper.h> 38 #include <drm/amdgpu_drm.h> 39 #include <linux/vgaarb.h> 40 #include <linux/vga_switcheroo.h> 41 #include <linux/efi.h> 42 #include "amdgpu.h" 43 #include "amdgpu_trace.h" 44 #include "amdgpu_i2c.h" 45 #include "atom.h" 46 #include "amdgpu_atombios.h" 47 #include "amdgpu_atomfirmware.h" 48 #include "amd_pcie.h" 49 #ifdef CONFIG_DRM_AMDGPU_SI 50 #include "si.h" 51 #endif 52 #ifdef CONFIG_DRM_AMDGPU_CIK 53 #include "cik.h" 54 #endif 55 #include "vi.h" 56 #include "soc15.h" 57 #include "nv.h" 58 #include "bif/bif_4_1_d.h" 59 #include <linux/firmware.h> 60 #include "amdgpu_vf_error.h" 61 62 #include "amdgpu_amdkfd.h" 63 #include "amdgpu_pm.h" 64 65 #include "amdgpu_xgmi.h" 66 #include "amdgpu_ras.h" 67 #include "amdgpu_pmu.h" 68 #include "amdgpu_fru_eeprom.h" 69 #include "amdgpu_reset.h" 70 71 #include <linux/suspend.h> 72 #include <drm/task_barrier.h> 73 #include <linux/pm_runtime.h> 74 75 #include <drm/drm_drv.h> 76 77 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 78 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 79 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 80 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 81 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 82 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 83 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin"); 84 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin"); 85 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin"); 86 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 87 MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin"); 88 MODULE_FIRMWARE("amdgpu/yellow_carp_gpu_info.bin"); 89 90 #define AMDGPU_RESUME_MS 2000 91 #define AMDGPU_MAX_RETRY_LIMIT 2 92 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 93 94 const char *amdgpu_asic_name[] = { 95 "TAHITI", 96 "PITCAIRN", 97 "VERDE", 98 "OLAND", 99 "HAINAN", 100 "BONAIRE", 101 "KAVERI", 102 "KABINI", 103 "HAWAII", 104 "MULLINS", 105 "TOPAZ", 106 "TONGA", 107 "FIJI", 108 "CARRIZO", 109 "STONEY", 110 "POLARIS10", 
111 "POLARIS11", 112 "POLARIS12", 113 "VEGAM", 114 "VEGA10", 115 "VEGA12", 116 "VEGA20", 117 "RAVEN", 118 "ARCTURUS", 119 "RENOIR", 120 "ALDEBARAN", 121 "NAVI10", 122 "CYAN_SKILLFISH", 123 "NAVI14", 124 "NAVI12", 125 "SIENNA_CICHLID", 126 "NAVY_FLOUNDER", 127 "VANGOGH", 128 "DIMGREY_CAVEFISH", 129 "BEIGE_GOBY", 130 "YELLOW_CARP", 131 "IP DISCOVERY", 132 "LAST", 133 }; 134 135 /** 136 * DOC: pcie_replay_count 137 * 138 * The amdgpu driver provides a sysfs API for reporting the total number 139 * of PCIe replays (NAKs) 140 * The file pcie_replay_count is used for this and returns the total 141 * number of replays as a sum of the NAKs generated and NAKs received 142 */ 143 144 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 145 struct device_attribute *attr, char *buf) 146 { 147 struct drm_device *ddev = dev_get_drvdata(dev); 148 struct amdgpu_device *adev = drm_to_adev(ddev); 149 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 150 151 return sysfs_emit(buf, "%llu\n", cnt); 152 } 153 154 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 155 amdgpu_device_get_pcie_replay_count, NULL); 156 157 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 158 159 /** 160 * DOC: product_name 161 * 162 * The amdgpu driver provides a sysfs API for reporting the product name 163 * for the device 164 * The file serial_number is used for this and returns the product name 165 * as returned from the FRU. 166 * NOTE: This is only available for certain server cards 167 */ 168 169 static ssize_t amdgpu_device_get_product_name(struct device *dev, 170 struct device_attribute *attr, char *buf) 171 { 172 struct drm_device *ddev = dev_get_drvdata(dev); 173 struct amdgpu_device *adev = drm_to_adev(ddev); 174 175 return sysfs_emit(buf, "%s\n", adev->product_name); 176 } 177 178 static DEVICE_ATTR(product_name, S_IRUGO, 179 amdgpu_device_get_product_name, NULL); 180 181 /** 182 * DOC: product_number 183 * 184 * The amdgpu driver provides a sysfs API for reporting the part number 185 * for the device 186 * The file serial_number is used for this and returns the part number 187 * as returned from the FRU. 188 * NOTE: This is only available for certain server cards 189 */ 190 191 static ssize_t amdgpu_device_get_product_number(struct device *dev, 192 struct device_attribute *attr, char *buf) 193 { 194 struct drm_device *ddev = dev_get_drvdata(dev); 195 struct amdgpu_device *adev = drm_to_adev(ddev); 196 197 return sysfs_emit(buf, "%s\n", adev->product_number); 198 } 199 200 static DEVICE_ATTR(product_number, S_IRUGO, 201 amdgpu_device_get_product_number, NULL); 202 203 /** 204 * DOC: serial_number 205 * 206 * The amdgpu driver provides a sysfs API for reporting the serial number 207 * for the device 208 * The file serial_number is used for this and returns the serial number 209 * as returned from the FRU. 210 * NOTE: This is only available for certain server cards 211 */ 212 213 static ssize_t amdgpu_device_get_serial_number(struct device *dev, 214 struct device_attribute *attr, char *buf) 215 { 216 struct drm_device *ddev = dev_get_drvdata(dev); 217 struct amdgpu_device *adev = drm_to_adev(ddev); 218 219 return sysfs_emit(buf, "%s\n", adev->serial); 220 } 221 222 static DEVICE_ATTR(serial_number, S_IRUGO, 223 amdgpu_device_get_serial_number, NULL); 224 225 /** 226 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 227 * 228 * @dev: drm_device pointer 229 * 230 * Returns true if the device is a dGPU with ATPX power control, 231 * otherwise return false. 
232 */ 233 bool amdgpu_device_supports_px(struct drm_device *dev) 234 { 235 struct amdgpu_device *adev = drm_to_adev(dev); 236 237 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 238 return true; 239 return false; 240 } 241 242 /** 243 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 244 * 245 * @dev: drm_device pointer 246 * 247 * Returns true if the device is a dGPU with ACPI power control, 248 * otherwise return false. 249 */ 250 bool amdgpu_device_supports_boco(struct drm_device *dev) 251 { 252 struct amdgpu_device *adev = drm_to_adev(dev); 253 254 if (adev->has_pr3 || 255 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 256 return true; 257 return false; 258 } 259 260 /** 261 * amdgpu_device_supports_baco - Does the device support BACO 262 * 263 * @dev: drm_device pointer 264 * 265 * Returns true if the device supporte BACO, 266 * otherwise return false. 267 */ 268 bool amdgpu_device_supports_baco(struct drm_device *dev) 269 { 270 struct amdgpu_device *adev = drm_to_adev(dev); 271 272 return amdgpu_asic_supports_baco(adev); 273 } 274 275 /** 276 * amdgpu_device_supports_smart_shift - Is the device dGPU with 277 * smart shift support 278 * 279 * @dev: drm_device pointer 280 * 281 * Returns true if the device is a dGPU with Smart Shift support, 282 * otherwise returns false. 283 */ 284 bool amdgpu_device_supports_smart_shift(struct drm_device *dev) 285 { 286 return (amdgpu_device_supports_boco(dev) && 287 amdgpu_acpi_is_power_shift_control_supported()); 288 } 289 290 /* 291 * VRAM access helper functions 292 */ 293 294 /** 295 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA 296 * 297 * @adev: amdgpu_device pointer 298 * @pos: offset of the buffer in vram 299 * @buf: virtual address of the buffer in system memory 300 * @size: read/write size, sizeof(@buf) must > @size 301 * @write: true - write to vram, otherwise - read from vram 302 */ 303 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos, 304 void *buf, size_t size, bool write) 305 { 306 unsigned long flags; 307 uint32_t hi = ~0, tmp = 0; 308 uint32_t *data = buf; 309 uint64_t last; 310 int idx; 311 312 if (!drm_dev_enter(adev_to_drm(adev), &idx)) 313 return; 314 315 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4)); 316 317 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 318 for (last = pos + size; pos < last; pos += 4) { 319 tmp = pos >> 31; 320 321 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); 322 if (tmp != hi) { 323 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp); 324 hi = tmp; 325 } 326 if (write) 327 WREG32_NO_KIQ(mmMM_DATA, *data++); 328 else 329 *data++ = RREG32_NO_KIQ(mmMM_DATA); 330 } 331 332 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 333 drm_dev_exit(idx); 334 } 335 336 /** 337 * amdgpu_device_aper_access - access vram by vram aperature 338 * 339 * @adev: amdgpu_device pointer 340 * @pos: offset of the buffer in vram 341 * @buf: virtual address of the buffer in system memory 342 * @size: read/write size, sizeof(@buf) must > @size 343 * @write: true - write to vram, otherwise - read from vram 344 * 345 * The return value means how many bytes have been transferred. 
346 */ 347 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos, 348 void *buf, size_t size, bool write) 349 { 350 #ifdef CONFIG_64BIT 351 void __iomem *addr; 352 size_t count = 0; 353 uint64_t last; 354 355 if (!adev->mman.aper_base_kaddr) 356 return 0; 357 358 last = min(pos + size, adev->gmc.visible_vram_size); 359 if (last > pos) { 360 addr = adev->mman.aper_base_kaddr + pos; 361 count = last - pos; 362 363 if (write) { 364 memcpy_toio(addr, buf, count); 365 mb(); 366 amdgpu_device_flush_hdp(adev, NULL); 367 } else { 368 amdgpu_device_invalidate_hdp(adev, NULL); 369 mb(); 370 memcpy_fromio(buf, addr, count); 371 } 372 373 } 374 375 return count; 376 #else 377 return 0; 378 #endif 379 } 380 381 /** 382 * amdgpu_device_vram_access - read/write a buffer in vram 383 * 384 * @adev: amdgpu_device pointer 385 * @pos: offset of the buffer in vram 386 * @buf: virtual address of the buffer in system memory 387 * @size: read/write size, sizeof(@buf) must > @size 388 * @write: true - write to vram, otherwise - read from vram 389 */ 390 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, 391 void *buf, size_t size, bool write) 392 { 393 size_t count; 394 395 /* try to using vram apreature to access vram first */ 396 count = amdgpu_device_aper_access(adev, pos, buf, size, write); 397 size -= count; 398 if (size) { 399 /* using MM to access rest vram */ 400 pos += count; 401 buf += count; 402 amdgpu_device_mm_access(adev, pos, buf, size, write); 403 } 404 } 405 406 /* 407 * register access helper functions. 408 */ 409 410 /* Check if hw access should be skipped because of hotplug or device error */ 411 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev) 412 { 413 if (adev->no_hw_access) 414 return true; 415 416 #ifdef CONFIG_LOCKDEP 417 /* 418 * This is a bit complicated to understand, so worth a comment. What we assert 419 * here is that the GPU reset is not running on another thread in parallel. 420 * 421 * For this we trylock the read side of the reset semaphore, if that succeeds 422 * we know that the reset is not running in paralell. 423 * 424 * If the trylock fails we assert that we are either already holding the read 425 * side of the lock or are the reset thread itself and hold the write side of 426 * the lock. 427 */ 428 if (in_task()) { 429 if (down_read_trylock(&adev->reset_sem)) 430 up_read(&adev->reset_sem); 431 else 432 lockdep_assert_held(&adev->reset_sem); 433 } 434 #endif 435 return false; 436 } 437 438 /** 439 * amdgpu_device_rreg - read a memory mapped IO or indirect register 440 * 441 * @adev: amdgpu_device pointer 442 * @reg: dword aligned register offset 443 * @acc_flags: access flags which require special behavior 444 * 445 * Returns the 32 bit value from the offset specified. 
446 */ 447 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, 448 uint32_t reg, uint32_t acc_flags) 449 { 450 uint32_t ret; 451 452 if (amdgpu_device_skip_hw_access(adev)) 453 return 0; 454 455 if ((reg * 4) < adev->rmmio_size) { 456 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 457 amdgpu_sriov_runtime(adev) && 458 down_read_trylock(&adev->reset_sem)) { 459 ret = amdgpu_kiq_rreg(adev, reg); 460 up_read(&adev->reset_sem); 461 } else { 462 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 463 } 464 } else { 465 ret = adev->pcie_rreg(adev, reg * 4); 466 } 467 468 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret); 469 470 return ret; 471 } 472 473 /* 474 * MMIO register read with bytes helper functions 475 * @offset:bytes offset from MMIO start 476 * 477 */ 478 479 /** 480 * amdgpu_mm_rreg8 - read a memory mapped IO register 481 * 482 * @adev: amdgpu_device pointer 483 * @offset: byte aligned register offset 484 * 485 * Returns the 8 bit value from the offset specified. 486 */ 487 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) 488 { 489 if (amdgpu_device_skip_hw_access(adev)) 490 return 0; 491 492 if (offset < adev->rmmio_size) 493 return (readb(adev->rmmio + offset)); 494 BUG(); 495 } 496 497 /* 498 * MMIO register write with bytes helper functions 499 * @offset:bytes offset from MMIO start 500 * @value: the value want to be written to the register 501 * 502 */ 503 /** 504 * amdgpu_mm_wreg8 - read a memory mapped IO register 505 * 506 * @adev: amdgpu_device pointer 507 * @offset: byte aligned register offset 508 * @value: 8 bit value to write 509 * 510 * Writes the value specified to the offset specified. 511 */ 512 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 513 { 514 if (amdgpu_device_skip_hw_access(adev)) 515 return; 516 517 if (offset < adev->rmmio_size) 518 writeb(value, adev->rmmio + offset); 519 else 520 BUG(); 521 } 522 523 /** 524 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 525 * 526 * @adev: amdgpu_device pointer 527 * @reg: dword aligned register offset 528 * @v: 32 bit value to write to the register 529 * @acc_flags: access flags which require special behavior 530 * 531 * Writes the value specified to the offset specified. 
532 */ 533 void amdgpu_device_wreg(struct amdgpu_device *adev, 534 uint32_t reg, uint32_t v, 535 uint32_t acc_flags) 536 { 537 if (amdgpu_device_skip_hw_access(adev)) 538 return; 539 540 if ((reg * 4) < adev->rmmio_size) { 541 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 542 amdgpu_sriov_runtime(adev) && 543 down_read_trylock(&adev->reset_sem)) { 544 amdgpu_kiq_wreg(adev, reg, v); 545 up_read(&adev->reset_sem); 546 } else { 547 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 548 } 549 } else { 550 adev->pcie_wreg(adev, reg * 4, v); 551 } 552 553 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 554 } 555 556 /** 557 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 558 * 559 * @adev: amdgpu_device pointer 560 * @reg: mmio/rlc register 561 * @v: value to write 562 * 563 * this function is invoked only for the debugfs register access 564 */ 565 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 566 uint32_t reg, uint32_t v) 567 { 568 if (amdgpu_device_skip_hw_access(adev)) 569 return; 570 571 if (amdgpu_sriov_fullaccess(adev) && 572 adev->gfx.rlc.funcs && 573 adev->gfx.rlc.funcs->is_rlcg_access_range) { 574 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 575 return amdgpu_sriov_wreg(adev, reg, v, 0, 0); 576 } else if ((reg * 4) >= adev->rmmio_size) { 577 adev->pcie_wreg(adev, reg * 4, v); 578 } else { 579 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 580 } 581 } 582 583 /** 584 * amdgpu_mm_rdoorbell - read a doorbell dword 585 * 586 * @adev: amdgpu_device pointer 587 * @index: doorbell index 588 * 589 * Returns the value in the doorbell aperture at the 590 * requested doorbell index (CIK). 591 */ 592 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) 593 { 594 if (amdgpu_device_skip_hw_access(adev)) 595 return 0; 596 597 if (index < adev->doorbell.num_doorbells) { 598 return readl(adev->doorbell.ptr + index); 599 } else { 600 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 601 return 0; 602 } 603 } 604 605 /** 606 * amdgpu_mm_wdoorbell - write a doorbell dword 607 * 608 * @adev: amdgpu_device pointer 609 * @index: doorbell index 610 * @v: value to write 611 * 612 * Writes @v to the doorbell aperture at the 613 * requested doorbell index (CIK). 614 */ 615 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) 616 { 617 if (amdgpu_device_skip_hw_access(adev)) 618 return; 619 620 if (index < adev->doorbell.num_doorbells) { 621 writel(v, adev->doorbell.ptr + index); 622 } else { 623 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 624 } 625 } 626 627 /** 628 * amdgpu_mm_rdoorbell64 - read a doorbell Qword 629 * 630 * @adev: amdgpu_device pointer 631 * @index: doorbell index 632 * 633 * Returns the value in the doorbell aperture at the 634 * requested doorbell index (VEGA10+). 635 */ 636 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) 637 { 638 if (amdgpu_device_skip_hw_access(adev)) 639 return 0; 640 641 if (index < adev->doorbell.num_doorbells) { 642 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); 643 } else { 644 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 645 return 0; 646 } 647 } 648 649 /** 650 * amdgpu_mm_wdoorbell64 - write a doorbell Qword 651 * 652 * @adev: amdgpu_device pointer 653 * @index: doorbell index 654 * @v: value to write 655 * 656 * Writes @v to the doorbell aperture at the 657 * requested doorbell index (VEGA10+). 
658 */ 659 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) 660 { 661 if (amdgpu_device_skip_hw_access(adev)) 662 return; 663 664 if (index < adev->doorbell.num_doorbells) { 665 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); 666 } else { 667 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 668 } 669 } 670 671 /** 672 * amdgpu_device_indirect_rreg - read an indirect register 673 * 674 * @adev: amdgpu_device pointer 675 * @pcie_index: mmio register offset 676 * @pcie_data: mmio register offset 677 * @reg_addr: indirect register address to read from 678 * 679 * Returns the value of indirect register @reg_addr 680 */ 681 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 682 u32 pcie_index, u32 pcie_data, 683 u32 reg_addr) 684 { 685 unsigned long flags; 686 u32 r; 687 void __iomem *pcie_index_offset; 688 void __iomem *pcie_data_offset; 689 690 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 691 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 692 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 693 694 writel(reg_addr, pcie_index_offset); 695 readl(pcie_index_offset); 696 r = readl(pcie_data_offset); 697 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 698 699 return r; 700 } 701 702 /** 703 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 704 * 705 * @adev: amdgpu_device pointer 706 * @pcie_index: mmio register offset 707 * @pcie_data: mmio register offset 708 * @reg_addr: indirect register address to read from 709 * 710 * Returns the value of indirect register @reg_addr 711 */ 712 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 713 u32 pcie_index, u32 pcie_data, 714 u32 reg_addr) 715 { 716 unsigned long flags; 717 u64 r; 718 void __iomem *pcie_index_offset; 719 void __iomem *pcie_data_offset; 720 721 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 722 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 723 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 724 725 /* read low 32 bits */ 726 writel(reg_addr, pcie_index_offset); 727 readl(pcie_index_offset); 728 r = readl(pcie_data_offset); 729 /* read high 32 bits */ 730 writel(reg_addr + 4, pcie_index_offset); 731 readl(pcie_index_offset); 732 r |= ((u64)readl(pcie_data_offset) << 32); 733 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 734 735 return r; 736 } 737 738 /** 739 * amdgpu_device_indirect_wreg - write an indirect register address 740 * 741 * @adev: amdgpu_device pointer 742 * @pcie_index: mmio register offset 743 * @pcie_data: mmio register offset 744 * @reg_addr: indirect register offset 745 * @reg_data: indirect register data 746 * 747 */ 748 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 749 u32 pcie_index, u32 pcie_data, 750 u32 reg_addr, u32 reg_data) 751 { 752 unsigned long flags; 753 void __iomem *pcie_index_offset; 754 void __iomem *pcie_data_offset; 755 756 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 757 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 758 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 759 760 writel(reg_addr, pcie_index_offset); 761 readl(pcie_index_offset); 762 writel(reg_data, pcie_data_offset); 763 readl(pcie_data_offset); 764 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 765 } 766 767 /** 768 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 769 * 770 * @adev: amdgpu_device pointer 771 * @pcie_index: mmio register offset 772 * @pcie_data: mmio register 
offset 773 * @reg_addr: indirect register offset 774 * @reg_data: indirect register data 775 * 776 */ 777 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 778 u32 pcie_index, u32 pcie_data, 779 u32 reg_addr, u64 reg_data) 780 { 781 unsigned long flags; 782 void __iomem *pcie_index_offset; 783 void __iomem *pcie_data_offset; 784 785 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 786 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 787 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 788 789 /* write low 32 bits */ 790 writel(reg_addr, pcie_index_offset); 791 readl(pcie_index_offset); 792 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 793 readl(pcie_data_offset); 794 /* write high 32 bits */ 795 writel(reg_addr + 4, pcie_index_offset); 796 readl(pcie_index_offset); 797 writel((u32)(reg_data >> 32), pcie_data_offset); 798 readl(pcie_data_offset); 799 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 800 } 801 802 /** 803 * amdgpu_invalid_rreg - dummy reg read function 804 * 805 * @adev: amdgpu_device pointer 806 * @reg: offset of register 807 * 808 * Dummy register read function. Used for register blocks 809 * that certain asics don't have (all asics). 810 * Returns the value in the register. 811 */ 812 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 813 { 814 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 815 BUG(); 816 return 0; 817 } 818 819 /** 820 * amdgpu_invalid_wreg - dummy reg write function 821 * 822 * @adev: amdgpu_device pointer 823 * @reg: offset of register 824 * @v: value to write to the register 825 * 826 * Dummy register read function. Used for register blocks 827 * that certain asics don't have (all asics). 828 */ 829 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 830 { 831 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 832 reg, v); 833 BUG(); 834 } 835 836 /** 837 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 838 * 839 * @adev: amdgpu_device pointer 840 * @reg: offset of register 841 * 842 * Dummy register read function. Used for register blocks 843 * that certain asics don't have (all asics). 844 * Returns the value in the register. 845 */ 846 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 847 { 848 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 849 BUG(); 850 return 0; 851 } 852 853 /** 854 * amdgpu_invalid_wreg64 - dummy reg write function 855 * 856 * @adev: amdgpu_device pointer 857 * @reg: offset of register 858 * @v: value to write to the register 859 * 860 * Dummy register read function. Used for register blocks 861 * that certain asics don't have (all asics). 862 */ 863 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 864 { 865 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 866 reg, v); 867 BUG(); 868 } 869 870 /** 871 * amdgpu_block_invalid_rreg - dummy reg read function 872 * 873 * @adev: amdgpu_device pointer 874 * @block: offset of instance 875 * @reg: offset of register 876 * 877 * Dummy register read function. Used for register blocks 878 * that certain asics don't have (all asics). 879 * Returns the value in the register. 
880 */ 881 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 882 uint32_t block, uint32_t reg) 883 { 884 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 885 reg, block); 886 BUG(); 887 return 0; 888 } 889 890 /** 891 * amdgpu_block_invalid_wreg - dummy reg write function 892 * 893 * @adev: amdgpu_device pointer 894 * @block: offset of instance 895 * @reg: offset of register 896 * @v: value to write to the register 897 * 898 * Dummy register read function. Used for register blocks 899 * that certain asics don't have (all asics). 900 */ 901 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 902 uint32_t block, 903 uint32_t reg, uint32_t v) 904 { 905 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 906 reg, block, v); 907 BUG(); 908 } 909 910 /** 911 * amdgpu_device_asic_init - Wrapper for atom asic_init 912 * 913 * @adev: amdgpu_device pointer 914 * 915 * Does any asic specific work and then calls atom asic init. 916 */ 917 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 918 { 919 amdgpu_asic_pre_asic_init(adev); 920 921 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 922 } 923 924 /** 925 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page 926 * 927 * @adev: amdgpu_device pointer 928 * 929 * Allocates a scratch page of VRAM for use by various things in the 930 * driver. 931 */ 932 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev) 933 { 934 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, 935 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM, 936 &adev->vram_scratch.robj, 937 &adev->vram_scratch.gpu_addr, 938 (void **)&adev->vram_scratch.ptr); 939 } 940 941 /** 942 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page 943 * 944 * @adev: amdgpu_device pointer 945 * 946 * Frees the VRAM scratch page. 947 */ 948 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev) 949 { 950 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL); 951 } 952 953 /** 954 * amdgpu_device_program_register_sequence - program an array of registers. 955 * 956 * @adev: amdgpu_device pointer 957 * @registers: pointer to the register array 958 * @array_size: size of the register array 959 * 960 * Programs an array or registers with and and or masks. 961 * This is a helper for setting golden registers. 962 */ 963 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 964 const u32 *registers, 965 const u32 array_size) 966 { 967 u32 tmp, reg, and_mask, or_mask; 968 int i; 969 970 if (array_size % 3) 971 return; 972 973 for (i = 0; i < array_size; i +=3) { 974 reg = registers[i + 0]; 975 and_mask = registers[i + 1]; 976 or_mask = registers[i + 2]; 977 978 if (and_mask == 0xffffffff) { 979 tmp = or_mask; 980 } else { 981 tmp = RREG32(reg); 982 tmp &= ~and_mask; 983 if (adev->family >= AMDGPU_FAMILY_AI) 984 tmp |= (or_mask & and_mask); 985 else 986 tmp |= or_mask; 987 } 988 WREG32(reg, tmp); 989 } 990 } 991 992 /** 993 * amdgpu_device_pci_config_reset - reset the GPU 994 * 995 * @adev: amdgpu_device pointer 996 * 997 * Resets the GPU using the pci config reset sequence. 998 * Only applicable to asics prior to vega10. 
999 */ 1000 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 1001 { 1002 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 1003 } 1004 1005 /** 1006 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 1007 * 1008 * @adev: amdgpu_device pointer 1009 * 1010 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 1011 */ 1012 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1013 { 1014 return pci_reset_function(adev->pdev); 1015 } 1016 1017 /* 1018 * GPU doorbell aperture helpers function. 1019 */ 1020 /** 1021 * amdgpu_device_doorbell_init - Init doorbell driver information. 1022 * 1023 * @adev: amdgpu_device pointer 1024 * 1025 * Init doorbell driver information (CIK) 1026 * Returns 0 on success, error on failure. 1027 */ 1028 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) 1029 { 1030 1031 /* No doorbell on SI hardware generation */ 1032 if (adev->asic_type < CHIP_BONAIRE) { 1033 adev->doorbell.base = 0; 1034 adev->doorbell.size = 0; 1035 adev->doorbell.num_doorbells = 0; 1036 adev->doorbell.ptr = NULL; 1037 return 0; 1038 } 1039 1040 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) 1041 return -EINVAL; 1042 1043 amdgpu_asic_init_doorbell_index(adev); 1044 1045 /* doorbell bar mapping */ 1046 adev->doorbell.base = pci_resource_start(adev->pdev, 2); 1047 adev->doorbell.size = pci_resource_len(adev->pdev, 2); 1048 1049 adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32), 1050 adev->doorbell_index.max_assignment+1); 1051 if (adev->doorbell.num_doorbells == 0) 1052 return -EINVAL; 1053 1054 /* For Vega, reserve and map two pages on doorbell BAR since SDMA 1055 * paging queue doorbell use the second page. The 1056 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the 1057 * doorbells are in the first page. So with paging queue enabled, 1058 * the max num_doorbells should + 1 page (0x400 in dword) 1059 */ 1060 if (adev->asic_type >= CHIP_VEGA10) 1061 adev->doorbell.num_doorbells += 0x400; 1062 1063 adev->doorbell.ptr = ioremap(adev->doorbell.base, 1064 adev->doorbell.num_doorbells * 1065 sizeof(u32)); 1066 if (adev->doorbell.ptr == NULL) 1067 return -ENOMEM; 1068 1069 return 0; 1070 } 1071 1072 /** 1073 * amdgpu_device_doorbell_fini - Tear down doorbell driver information. 1074 * 1075 * @adev: amdgpu_device pointer 1076 * 1077 * Tear down doorbell driver information (CIK) 1078 */ 1079 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) 1080 { 1081 iounmap(adev->doorbell.ptr); 1082 adev->doorbell.ptr = NULL; 1083 } 1084 1085 1086 1087 /* 1088 * amdgpu_device_wb_*() 1089 * Writeback is the method by which the GPU updates special pages in memory 1090 * with the status of certain GPU events (fences, ring pointers,etc.). 1091 */ 1092 1093 /** 1094 * amdgpu_device_wb_fini - Disable Writeback and free memory 1095 * 1096 * @adev: amdgpu_device pointer 1097 * 1098 * Disables Writeback and frees the Writeback memory (all asics). 1099 * Used at driver shutdown. 1100 */ 1101 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1102 { 1103 if (adev->wb.wb_obj) { 1104 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1105 &adev->wb.gpu_addr, 1106 (void **)&adev->wb.wb); 1107 adev->wb.wb_obj = NULL; 1108 } 1109 } 1110 1111 /** 1112 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1113 * 1114 * @adev: amdgpu_device pointer 1115 * 1116 * Initializes writeback and allocates writeback memory (all asics). 1117 * Used at driver startup. 
1118 * Returns 0 on success or an -error on failure. 1119 */ 1120 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1121 { 1122 int r; 1123 1124 if (adev->wb.wb_obj == NULL) { 1125 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1126 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1127 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1128 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1129 (void **)&adev->wb.wb); 1130 if (r) { 1131 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1132 return r; 1133 } 1134 1135 adev->wb.num_wb = AMDGPU_MAX_WB; 1136 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1137 1138 /* clear wb memory */ 1139 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1140 } 1141 1142 return 0; 1143 } 1144 1145 /** 1146 * amdgpu_device_wb_get - Allocate a wb entry 1147 * 1148 * @adev: amdgpu_device pointer 1149 * @wb: wb index 1150 * 1151 * Allocate a wb slot for use by the driver (all asics). 1152 * Returns 0 on success or -EINVAL on failure. 1153 */ 1154 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1155 { 1156 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1157 1158 if (offset < adev->wb.num_wb) { 1159 __set_bit(offset, adev->wb.used); 1160 *wb = offset << 3; /* convert to dw offset */ 1161 return 0; 1162 } else { 1163 return -EINVAL; 1164 } 1165 } 1166 1167 /** 1168 * amdgpu_device_wb_free - Free a wb entry 1169 * 1170 * @adev: amdgpu_device pointer 1171 * @wb: wb index 1172 * 1173 * Free a wb slot allocated for use by the driver (all asics) 1174 */ 1175 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1176 { 1177 wb >>= 3; 1178 if (wb < adev->wb.num_wb) 1179 __clear_bit(wb, adev->wb.used); 1180 } 1181 1182 /** 1183 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1184 * 1185 * @adev: amdgpu_device pointer 1186 * 1187 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1188 * to fail, but if any of the BARs is not accessible after the size we abort 1189 * driver loading by returning -ENODEV. 1190 */ 1191 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1192 { 1193 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1194 struct pci_bus *root; 1195 struct resource *res; 1196 unsigned i; 1197 u16 cmd; 1198 int r; 1199 1200 /* Bypass for VF */ 1201 if (amdgpu_sriov_vf(adev)) 1202 return 0; 1203 1204 /* skip if the bios has already enabled large BAR */ 1205 if (adev->gmc.real_vram_size && 1206 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1207 return 0; 1208 1209 /* Check if the root BUS has 64bit memory resources */ 1210 root = adev->pdev->bus; 1211 while (root->parent) 1212 root = root->parent; 1213 1214 pci_bus_for_each_resource(root, res, i) { 1215 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1216 res->start > 0x100000000ull) 1217 break; 1218 } 1219 1220 /* Trying to resize is pointless without a root hub window above 4GB */ 1221 if (!res) 1222 return 0; 1223 1224 /* Limit the BAR size to what is available */ 1225 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1226 rbar_size); 1227 1228 /* Disable memory decoding while we change the BAR addresses and size */ 1229 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1230 pci_write_config_word(adev->pdev, PCI_COMMAND, 1231 cmd & ~PCI_COMMAND_MEMORY); 1232 1233 /* Free the VRAM and doorbell BAR, we most likely need to move both. 
*/ 1234 amdgpu_device_doorbell_fini(adev); 1235 if (adev->asic_type >= CHIP_BONAIRE) 1236 pci_release_resource(adev->pdev, 2); 1237 1238 pci_release_resource(adev->pdev, 0); 1239 1240 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1241 if (r == -ENOSPC) 1242 DRM_INFO("Not enough PCI address space for a large BAR."); 1243 else if (r && r != -ENOTSUPP) 1244 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1245 1246 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1247 1248 /* When the doorbell or fb BAR isn't available we have no chance of 1249 * using the device. 1250 */ 1251 r = amdgpu_device_doorbell_init(adev); 1252 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1253 return -ENODEV; 1254 1255 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1256 1257 return 0; 1258 } 1259 1260 /* 1261 * GPU helpers function. 1262 */ 1263 /** 1264 * amdgpu_device_need_post - check if the hw need post or not 1265 * 1266 * @adev: amdgpu_device pointer 1267 * 1268 * Check if the asic has been initialized (all asics) at driver startup 1269 * or post is needed if hw reset is performed. 1270 * Returns true if need or false if not. 1271 */ 1272 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1273 { 1274 uint32_t reg; 1275 1276 if (amdgpu_sriov_vf(adev)) 1277 return false; 1278 1279 if (amdgpu_passthrough(adev)) { 1280 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1281 * some old smc fw still need driver do vPost otherwise gpu hang, while 1282 * those smc fw version above 22.15 doesn't have this flaw, so we force 1283 * vpost executed for smc version below 22.15 1284 */ 1285 if (adev->asic_type == CHIP_FIJI) { 1286 int err; 1287 uint32_t fw_ver; 1288 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1289 /* force vPost if error occured */ 1290 if (err) 1291 return true; 1292 1293 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1294 if (fw_ver < 0x00160e00) 1295 return true; 1296 } 1297 } 1298 1299 /* Don't post if we need to reset whole hive on init */ 1300 if (adev->gmc.xgmi.pending_reset) 1301 return false; 1302 1303 if (adev->has_hw_reset) { 1304 adev->has_hw_reset = false; 1305 return true; 1306 } 1307 1308 /* bios scratch used on CIK+ */ 1309 if (adev->asic_type >= CHIP_BONAIRE) 1310 return amdgpu_atombios_scratch_need_asic_init(adev); 1311 1312 /* check MEM_SIZE for older asics */ 1313 reg = amdgpu_asic_get_config_memsize(adev); 1314 1315 if ((reg != 0) && (reg != 0xffffffff)) 1316 return false; 1317 1318 return true; 1319 } 1320 1321 /** 1322 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1323 * 1324 * @adev: amdgpu_device pointer 1325 * 1326 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1327 * be set for this device. 1328 * 1329 * Returns true if it should be used or false if not. 1330 */ 1331 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1332 { 1333 switch (amdgpu_aspm) { 1334 case -1: 1335 break; 1336 case 0: 1337 return false; 1338 case 1: 1339 return true; 1340 default: 1341 return false; 1342 } 1343 return pcie_aspm_enabled(adev->pdev); 1344 } 1345 1346 /* if we get transitioned to only one device, take VGA back */ 1347 /** 1348 * amdgpu_device_vga_set_decode - enable/disable vga decode 1349 * 1350 * @pdev: PCI device pointer 1351 * @state: enable/disable vga decode 1352 * 1353 * Enable/disable vga decode (all asics). 1354 * Returns VGA resource flags. 
1355 */ 1356 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1357 bool state) 1358 { 1359 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1360 amdgpu_asic_set_vga_state(adev, state); 1361 if (state) 1362 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1363 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1364 else 1365 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1366 } 1367 1368 /** 1369 * amdgpu_device_check_block_size - validate the vm block size 1370 * 1371 * @adev: amdgpu_device pointer 1372 * 1373 * Validates the vm block size specified via module parameter. 1374 * The vm block size defines number of bits in page table versus page directory, 1375 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1376 * page table and the remaining bits are in the page directory. 1377 */ 1378 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1379 { 1380 /* defines number of bits in page table versus page directory, 1381 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1382 * page table and the remaining bits are in the page directory */ 1383 if (amdgpu_vm_block_size == -1) 1384 return; 1385 1386 if (amdgpu_vm_block_size < 9) { 1387 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1388 amdgpu_vm_block_size); 1389 amdgpu_vm_block_size = -1; 1390 } 1391 } 1392 1393 /** 1394 * amdgpu_device_check_vm_size - validate the vm size 1395 * 1396 * @adev: amdgpu_device pointer 1397 * 1398 * Validates the vm size in GB specified via module parameter. 1399 * The VM size is the size of the GPU virtual memory space in GB. 1400 */ 1401 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1402 { 1403 /* no need to check the default value */ 1404 if (amdgpu_vm_size == -1) 1405 return; 1406 1407 if (amdgpu_vm_size < 1) { 1408 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1409 amdgpu_vm_size); 1410 amdgpu_vm_size = -1; 1411 } 1412 } 1413 1414 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1415 { 1416 struct sysinfo si; 1417 bool is_os_64 = (sizeof(void *) == 8); 1418 uint64_t total_memory; 1419 uint64_t dram_size_seven_GB = 0x1B8000000; 1420 uint64_t dram_size_three_GB = 0xB8000000; 1421 1422 if (amdgpu_smu_memory_pool_size == 0) 1423 return; 1424 1425 if (!is_os_64) { 1426 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1427 goto def_value; 1428 } 1429 si_meminfo(&si); 1430 total_memory = (uint64_t)si.totalram * si.mem_unit; 1431 1432 if ((amdgpu_smu_memory_pool_size == 1) || 1433 (amdgpu_smu_memory_pool_size == 2)) { 1434 if (total_memory < dram_size_three_GB) 1435 goto def_value1; 1436 } else if ((amdgpu_smu_memory_pool_size == 4) || 1437 (amdgpu_smu_memory_pool_size == 8)) { 1438 if (total_memory < dram_size_seven_GB) 1439 goto def_value1; 1440 } else { 1441 DRM_WARN("Smu memory pool size not supported\n"); 1442 goto def_value; 1443 } 1444 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1445 1446 return; 1447 1448 def_value1: 1449 DRM_WARN("No enough system memory\n"); 1450 def_value: 1451 adev->pm.smu_prv_buffer_size = 0; 1452 } 1453 1454 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1455 { 1456 if (!(adev->flags & AMD_IS_APU) || 1457 adev->asic_type < CHIP_RAVEN) 1458 return 0; 1459 1460 switch (adev->asic_type) { 1461 case CHIP_RAVEN: 1462 if (adev->pdev->device == 0x15dd) 1463 adev->apu_flags |= AMD_APU_IS_RAVEN; 1464 if (adev->pdev->device == 0x15d8) 1465 adev->apu_flags |= AMD_APU_IS_PICASSO; 1466 break; 1467 
case CHIP_RENOIR: 1468 if ((adev->pdev->device == 0x1636) || 1469 (adev->pdev->device == 0x164c)) 1470 adev->apu_flags |= AMD_APU_IS_RENOIR; 1471 else 1472 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1473 break; 1474 case CHIP_VANGOGH: 1475 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1476 break; 1477 case CHIP_YELLOW_CARP: 1478 break; 1479 case CHIP_CYAN_SKILLFISH: 1480 if ((adev->pdev->device == 0x13FE) || 1481 (adev->pdev->device == 0x143F)) 1482 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1483 break; 1484 default: 1485 break; 1486 } 1487 1488 return 0; 1489 } 1490 1491 /** 1492 * amdgpu_device_check_arguments - validate module params 1493 * 1494 * @adev: amdgpu_device pointer 1495 * 1496 * Validates certain module parameters and updates 1497 * the associated values used by the driver (all asics). 1498 */ 1499 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1500 { 1501 if (amdgpu_sched_jobs < 4) { 1502 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1503 amdgpu_sched_jobs); 1504 amdgpu_sched_jobs = 4; 1505 } else if (!is_power_of_2(amdgpu_sched_jobs)){ 1506 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1507 amdgpu_sched_jobs); 1508 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1509 } 1510 1511 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1512 /* gart size must be greater or equal to 32M */ 1513 dev_warn(adev->dev, "gart size (%d) too small\n", 1514 amdgpu_gart_size); 1515 amdgpu_gart_size = -1; 1516 } 1517 1518 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1519 /* gtt size must be greater or equal to 32M */ 1520 dev_warn(adev->dev, "gtt size (%d) too small\n", 1521 amdgpu_gtt_size); 1522 amdgpu_gtt_size = -1; 1523 } 1524 1525 /* valid range is between 4 and 9 inclusive */ 1526 if (amdgpu_vm_fragment_size != -1 && 1527 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1528 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1529 amdgpu_vm_fragment_size = -1; 1530 } 1531 1532 if (amdgpu_sched_hw_submission < 2) { 1533 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1534 amdgpu_sched_hw_submission); 1535 amdgpu_sched_hw_submission = 2; 1536 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1537 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1538 amdgpu_sched_hw_submission); 1539 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1540 } 1541 1542 amdgpu_device_check_smu_prv_buffer_size(adev); 1543 1544 amdgpu_device_check_vm_size(adev); 1545 1546 amdgpu_device_check_block_size(adev); 1547 1548 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1549 1550 amdgpu_gmc_tmz_set(adev); 1551 1552 amdgpu_gmc_noretry_set(adev); 1553 1554 return 0; 1555 } 1556 1557 /** 1558 * amdgpu_switcheroo_set_state - set switcheroo state 1559 * 1560 * @pdev: pci dev pointer 1561 * @state: vga_switcheroo state 1562 * 1563 * Callback for the switcheroo driver. Suspends or resumes the 1564 * the asics before or after it is powered up using ACPI methods. 
1565 */ 1566 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1567 enum vga_switcheroo_state state) 1568 { 1569 struct drm_device *dev = pci_get_drvdata(pdev); 1570 int r; 1571 1572 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1573 return; 1574 1575 if (state == VGA_SWITCHEROO_ON) { 1576 pr_info("switched on\n"); 1577 /* don't suspend or resume card normally */ 1578 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1579 1580 pci_set_power_state(pdev, PCI_D0); 1581 amdgpu_device_load_pci_state(pdev); 1582 r = pci_enable_device(pdev); 1583 if (r) 1584 DRM_WARN("pci_enable_device failed (%d)\n", r); 1585 amdgpu_device_resume(dev, true); 1586 1587 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1588 } else { 1589 pr_info("switched off\n"); 1590 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1591 amdgpu_device_suspend(dev, true); 1592 amdgpu_device_cache_pci_state(pdev); 1593 /* Shut down the device */ 1594 pci_disable_device(pdev); 1595 pci_set_power_state(pdev, PCI_D3cold); 1596 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1597 } 1598 } 1599 1600 /** 1601 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1602 * 1603 * @pdev: pci dev pointer 1604 * 1605 * Callback for the switcheroo driver. Check of the switcheroo 1606 * state can be changed. 1607 * Returns true if the state can be changed, false if not. 1608 */ 1609 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1610 { 1611 struct drm_device *dev = pci_get_drvdata(pdev); 1612 1613 /* 1614 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1615 * locking inversion with the driver load path. And the access here is 1616 * completely racy anyway. So don't bother with locking for now. 1617 */ 1618 return atomic_read(&dev->open_count) == 0; 1619 } 1620 1621 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1622 .set_gpu_state = amdgpu_switcheroo_set_state, 1623 .reprobe = NULL, 1624 .can_switch = amdgpu_switcheroo_can_switch, 1625 }; 1626 1627 /** 1628 * amdgpu_device_ip_set_clockgating_state - set the CG state 1629 * 1630 * @dev: amdgpu_device pointer 1631 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1632 * @state: clockgating state (gate or ungate) 1633 * 1634 * Sets the requested clockgating state for all instances of 1635 * the hardware IP specified. 1636 * Returns the error code from the last instance. 1637 */ 1638 int amdgpu_device_ip_set_clockgating_state(void *dev, 1639 enum amd_ip_block_type block_type, 1640 enum amd_clockgating_state state) 1641 { 1642 struct amdgpu_device *adev = dev; 1643 int i, r = 0; 1644 1645 for (i = 0; i < adev->num_ip_blocks; i++) { 1646 if (!adev->ip_blocks[i].status.valid) 1647 continue; 1648 if (adev->ip_blocks[i].version->type != block_type) 1649 continue; 1650 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1651 continue; 1652 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1653 (void *)adev, state); 1654 if (r) 1655 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1656 adev->ip_blocks[i].version->funcs->name, r); 1657 } 1658 return r; 1659 } 1660 1661 /** 1662 * amdgpu_device_ip_set_powergating_state - set the PG state 1663 * 1664 * @dev: amdgpu_device pointer 1665 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1666 * @state: powergating state (gate or ungate) 1667 * 1668 * Sets the requested powergating state for all instances of 1669 * the hardware IP specified. 1670 * Returns the error code from the last instance. 
1671 */ 1672 int amdgpu_device_ip_set_powergating_state(void *dev, 1673 enum amd_ip_block_type block_type, 1674 enum amd_powergating_state state) 1675 { 1676 struct amdgpu_device *adev = dev; 1677 int i, r = 0; 1678 1679 for (i = 0; i < adev->num_ip_blocks; i++) { 1680 if (!adev->ip_blocks[i].status.valid) 1681 continue; 1682 if (adev->ip_blocks[i].version->type != block_type) 1683 continue; 1684 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 1685 continue; 1686 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 1687 (void *)adev, state); 1688 if (r) 1689 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 1690 adev->ip_blocks[i].version->funcs->name, r); 1691 } 1692 return r; 1693 } 1694 1695 /** 1696 * amdgpu_device_ip_get_clockgating_state - get the CG state 1697 * 1698 * @adev: amdgpu_device pointer 1699 * @flags: clockgating feature flags 1700 * 1701 * Walks the list of IPs on the device and updates the clockgating 1702 * flags for each IP. 1703 * Updates @flags with the feature flags for each hardware IP where 1704 * clockgating is enabled. 1705 */ 1706 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 1707 u32 *flags) 1708 { 1709 int i; 1710 1711 for (i = 0; i < adev->num_ip_blocks; i++) { 1712 if (!adev->ip_blocks[i].status.valid) 1713 continue; 1714 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 1715 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 1716 } 1717 } 1718 1719 /** 1720 * amdgpu_device_ip_wait_for_idle - wait for idle 1721 * 1722 * @adev: amdgpu_device pointer 1723 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1724 * 1725 * Waits for the request hardware IP to be idle. 1726 * Returns 0 for success or a negative error code on failure. 1727 */ 1728 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 1729 enum amd_ip_block_type block_type) 1730 { 1731 int i, r; 1732 1733 for (i = 0; i < adev->num_ip_blocks; i++) { 1734 if (!adev->ip_blocks[i].status.valid) 1735 continue; 1736 if (adev->ip_blocks[i].version->type == block_type) { 1737 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 1738 if (r) 1739 return r; 1740 break; 1741 } 1742 } 1743 return 0; 1744 1745 } 1746 1747 /** 1748 * amdgpu_device_ip_is_idle - is the hardware IP idle 1749 * 1750 * @adev: amdgpu_device pointer 1751 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1752 * 1753 * Check if the hardware IP is idle or not. 1754 * Returns true if it the IP is idle, false if not. 1755 */ 1756 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 1757 enum amd_ip_block_type block_type) 1758 { 1759 int i; 1760 1761 for (i = 0; i < adev->num_ip_blocks; i++) { 1762 if (!adev->ip_blocks[i].status.valid) 1763 continue; 1764 if (adev->ip_blocks[i].version->type == block_type) 1765 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 1766 } 1767 return true; 1768 1769 } 1770 1771 /** 1772 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 1773 * 1774 * @adev: amdgpu_device pointer 1775 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 1776 * 1777 * Returns a pointer to the hardware IP block structure 1778 * if it exists for the asic, otherwise NULL. 
1779 */ 1780 struct amdgpu_ip_block * 1781 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 1782 enum amd_ip_block_type type) 1783 { 1784 int i; 1785 1786 for (i = 0; i < adev->num_ip_blocks; i++) 1787 if (adev->ip_blocks[i].version->type == type) 1788 return &adev->ip_blocks[i]; 1789 1790 return NULL; 1791 } 1792 1793 /** 1794 * amdgpu_device_ip_block_version_cmp 1795 * 1796 * @adev: amdgpu_device pointer 1797 * @type: enum amd_ip_block_type 1798 * @major: major version 1799 * @minor: minor version 1800 * 1801 * return 0 if equal or greater 1802 * return 1 if smaller or the ip_block doesn't exist 1803 */ 1804 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 1805 enum amd_ip_block_type type, 1806 u32 major, u32 minor) 1807 { 1808 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 1809 1810 if (ip_block && ((ip_block->version->major > major) || 1811 ((ip_block->version->major == major) && 1812 (ip_block->version->minor >= minor)))) 1813 return 0; 1814 1815 return 1; 1816 } 1817 1818 /** 1819 * amdgpu_device_ip_block_add 1820 * 1821 * @adev: amdgpu_device pointer 1822 * @ip_block_version: pointer to the IP to add 1823 * 1824 * Adds the IP block driver information to the collection of IPs 1825 * on the asic. 1826 */ 1827 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 1828 const struct amdgpu_ip_block_version *ip_block_version) 1829 { 1830 if (!ip_block_version) 1831 return -EINVAL; 1832 1833 switch (ip_block_version->type) { 1834 case AMD_IP_BLOCK_TYPE_VCN: 1835 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 1836 return 0; 1837 break; 1838 case AMD_IP_BLOCK_TYPE_JPEG: 1839 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 1840 return 0; 1841 break; 1842 default: 1843 break; 1844 } 1845 1846 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 1847 ip_block_version->funcs->name); 1848 1849 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 1850 1851 return 0; 1852 } 1853 1854 /** 1855 * amdgpu_device_enable_virtual_display - enable virtual display feature 1856 * 1857 * @adev: amdgpu_device pointer 1858 * 1859 * Enabled the virtual display feature if the user has enabled it via 1860 * the module parameter virtual_display. This feature provides a virtual 1861 * display hardware on headless boards or in virtualized environments. 1862 * This function parses and validates the configuration string specified by 1863 * the user and configues the virtual display configuration (number of 1864 * virtual connectors, crtcs, etc.) specified. 
1865 */ 1866 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 1867 { 1868 adev->enable_virtual_display = false; 1869 1870 if (amdgpu_virtual_display) { 1871 const char *pci_address_name = pci_name(adev->pdev); 1872 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 1873 1874 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 1875 pciaddstr_tmp = pciaddstr; 1876 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 1877 pciaddname = strsep(&pciaddname_tmp, ","); 1878 if (!strcmp("all", pciaddname) 1879 || !strcmp(pci_address_name, pciaddname)) { 1880 long num_crtc; 1881 int res = -1; 1882 1883 adev->enable_virtual_display = true; 1884 1885 if (pciaddname_tmp) 1886 res = kstrtol(pciaddname_tmp, 10, 1887 &num_crtc); 1888 1889 if (!res) { 1890 if (num_crtc < 1) 1891 num_crtc = 1; 1892 if (num_crtc > 6) 1893 num_crtc = 6; 1894 adev->mode_info.num_crtc = num_crtc; 1895 } else { 1896 adev->mode_info.num_crtc = 1; 1897 } 1898 break; 1899 } 1900 } 1901 1902 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 1903 amdgpu_virtual_display, pci_address_name, 1904 adev->enable_virtual_display, adev->mode_info.num_crtc); 1905 1906 kfree(pciaddstr); 1907 } 1908 } 1909 1910 /** 1911 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 1912 * 1913 * @adev: amdgpu_device pointer 1914 * 1915 * Parses the asic configuration parameters specified in the gpu info 1916 * firmware and makes them availale to the driver for use in configuring 1917 * the asic. 1918 * Returns 0 on success, -EINVAL on failure. 1919 */ 1920 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 1921 { 1922 const char *chip_name; 1923 char fw_name[40]; 1924 int err; 1925 const struct gpu_info_firmware_header_v1_0 *hdr; 1926 1927 adev->firmware.gpu_info_fw = NULL; 1928 1929 if (adev->mman.discovery_bin) { 1930 amdgpu_discovery_get_gfx_info(adev); 1931 1932 /* 1933 * FIXME: The bounding box is still needed by Navi12, so 1934 * temporarily read it from gpu_info firmware. Should be droped 1935 * when DAL no longer needs it. 
1936 */ 1937 if (adev->asic_type != CHIP_NAVI12) 1938 return 0; 1939 } 1940 1941 switch (adev->asic_type) { 1942 #ifdef CONFIG_DRM_AMDGPU_SI 1943 case CHIP_VERDE: 1944 case CHIP_TAHITI: 1945 case CHIP_PITCAIRN: 1946 case CHIP_OLAND: 1947 case CHIP_HAINAN: 1948 #endif 1949 #ifdef CONFIG_DRM_AMDGPU_CIK 1950 case CHIP_BONAIRE: 1951 case CHIP_HAWAII: 1952 case CHIP_KAVERI: 1953 case CHIP_KABINI: 1954 case CHIP_MULLINS: 1955 #endif 1956 case CHIP_TOPAZ: 1957 case CHIP_TONGA: 1958 case CHIP_FIJI: 1959 case CHIP_POLARIS10: 1960 case CHIP_POLARIS11: 1961 case CHIP_POLARIS12: 1962 case CHIP_VEGAM: 1963 case CHIP_CARRIZO: 1964 case CHIP_STONEY: 1965 case CHIP_VEGA20: 1966 case CHIP_ALDEBARAN: 1967 case CHIP_SIENNA_CICHLID: 1968 case CHIP_NAVY_FLOUNDER: 1969 case CHIP_DIMGREY_CAVEFISH: 1970 case CHIP_BEIGE_GOBY: 1971 default: 1972 return 0; 1973 case CHIP_VEGA10: 1974 chip_name = "vega10"; 1975 break; 1976 case CHIP_VEGA12: 1977 chip_name = "vega12"; 1978 break; 1979 case CHIP_RAVEN: 1980 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1981 chip_name = "raven2"; 1982 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1983 chip_name = "picasso"; 1984 else 1985 chip_name = "raven"; 1986 break; 1987 case CHIP_ARCTURUS: 1988 chip_name = "arcturus"; 1989 break; 1990 case CHIP_RENOIR: 1991 if (adev->apu_flags & AMD_APU_IS_RENOIR) 1992 chip_name = "renoir"; 1993 else 1994 chip_name = "green_sardine"; 1995 break; 1996 case CHIP_NAVI10: 1997 chip_name = "navi10"; 1998 break; 1999 case CHIP_NAVI14: 2000 chip_name = "navi14"; 2001 break; 2002 case CHIP_NAVI12: 2003 chip_name = "navi12"; 2004 break; 2005 case CHIP_VANGOGH: 2006 chip_name = "vangogh"; 2007 break; 2008 case CHIP_YELLOW_CARP: 2009 chip_name = "yellow_carp"; 2010 break; 2011 } 2012 2013 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 2014 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); 2015 if (err) { 2016 dev_err(adev->dev, 2017 "Failed to load gpu_info firmware \"%s\"\n", 2018 fw_name); 2019 goto out; 2020 } 2021 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); 2022 if (err) { 2023 dev_err(adev->dev, 2024 "Failed to validate gpu_info firmware \"%s\"\n", 2025 fw_name); 2026 goto out; 2027 } 2028 2029 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2030 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2031 2032 switch (hdr->version_major) { 2033 case 1: 2034 { 2035 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2036 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2037 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2038 2039 /* 2040 * Should be droped when DAL no longer needs it. 
2041 */ 2042 if (adev->asic_type == CHIP_NAVI12) 2043 goto parse_soc_bounding_box; 2044 2045 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2046 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2047 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2048 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2049 adev->gfx.config.max_texture_channel_caches = 2050 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2051 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2052 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2053 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2054 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2055 adev->gfx.config.double_offchip_lds_buf = 2056 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2057 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2058 adev->gfx.cu_info.max_waves_per_simd = 2059 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2060 adev->gfx.cu_info.max_scratch_slots_per_cu = 2061 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2062 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2063 if (hdr->version_minor >= 1) { 2064 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2065 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2066 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2067 adev->gfx.config.num_sc_per_sh = 2068 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2069 adev->gfx.config.num_packer_per_sc = 2070 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2071 } 2072 2073 parse_soc_bounding_box: 2074 /* 2075 * soc bounding box info is not integrated in disocovery table, 2076 * we always need to parse it from gpu info firmware if needed. 2077 */ 2078 if (hdr->version_minor == 2) { 2079 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2080 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2081 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2082 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2083 } 2084 break; 2085 } 2086 default: 2087 dev_err(adev->dev, 2088 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2089 err = -EINVAL; 2090 goto out; 2091 } 2092 out: 2093 return err; 2094 } 2095 2096 /** 2097 * amdgpu_device_ip_early_init - run early init for hardware IPs 2098 * 2099 * @adev: amdgpu_device pointer 2100 * 2101 * Early initialization pass for hardware IPs. The hardware IPs that make 2102 * up each asic are discovered each IP's early_init callback is run. This 2103 * is the first stage in initializing the asic. 2104 * Returns 0 on success, negative error code on failure. 
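 *
 * As a rough, illustrative sketch of the ordering driven later in this file
 * by amdgpu_device_init() (not a normative contract):
 *
 *   amdgpu_device_ip_early_init(adev);   select IP blocks, run early_init
 *   amdgpu_device_ip_init(adev);         sw_init plus hw_init phases 1/2
 *   amdgpu_device_ip_late_init(adev);    late_init, then clock/power gating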
2105 */ 2106 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2107 { 2108 struct drm_device *dev = adev_to_drm(adev); 2109 struct pci_dev *parent; 2110 int i, r; 2111 2112 amdgpu_device_enable_virtual_display(adev); 2113 2114 if (amdgpu_sriov_vf(adev)) { 2115 r = amdgpu_virt_request_full_gpu(adev, true); 2116 if (r) 2117 return r; 2118 } 2119 2120 switch (adev->asic_type) { 2121 #ifdef CONFIG_DRM_AMDGPU_SI 2122 case CHIP_VERDE: 2123 case CHIP_TAHITI: 2124 case CHIP_PITCAIRN: 2125 case CHIP_OLAND: 2126 case CHIP_HAINAN: 2127 adev->family = AMDGPU_FAMILY_SI; 2128 r = si_set_ip_blocks(adev); 2129 if (r) 2130 return r; 2131 break; 2132 #endif 2133 #ifdef CONFIG_DRM_AMDGPU_CIK 2134 case CHIP_BONAIRE: 2135 case CHIP_HAWAII: 2136 case CHIP_KAVERI: 2137 case CHIP_KABINI: 2138 case CHIP_MULLINS: 2139 if (adev->flags & AMD_IS_APU) 2140 adev->family = AMDGPU_FAMILY_KV; 2141 else 2142 adev->family = AMDGPU_FAMILY_CI; 2143 2144 r = cik_set_ip_blocks(adev); 2145 if (r) 2146 return r; 2147 break; 2148 #endif 2149 case CHIP_TOPAZ: 2150 case CHIP_TONGA: 2151 case CHIP_FIJI: 2152 case CHIP_POLARIS10: 2153 case CHIP_POLARIS11: 2154 case CHIP_POLARIS12: 2155 case CHIP_VEGAM: 2156 case CHIP_CARRIZO: 2157 case CHIP_STONEY: 2158 if (adev->flags & AMD_IS_APU) 2159 adev->family = AMDGPU_FAMILY_CZ; 2160 else 2161 adev->family = AMDGPU_FAMILY_VI; 2162 2163 r = vi_set_ip_blocks(adev); 2164 if (r) 2165 return r; 2166 break; 2167 default: 2168 r = amdgpu_discovery_set_ip_blocks(adev); 2169 if (r) 2170 return r; 2171 break; 2172 } 2173 2174 if (amdgpu_has_atpx() && 2175 (amdgpu_is_atpx_hybrid() || 2176 amdgpu_has_atpx_dgpu_power_cntl()) && 2177 ((adev->flags & AMD_IS_APU) == 0) && 2178 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev))) 2179 adev->flags |= AMD_IS_PX; 2180 2181 parent = pci_upstream_bridge(adev->pdev); 2182 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2183 2184 amdgpu_amdkfd_device_probe(adev); 2185 2186 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2187 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2188 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2189 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2190 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2191 2192 for (i = 0; i < adev->num_ip_blocks; i++) { 2193 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2194 DRM_ERROR("disabled ip block: %d <%s>\n", 2195 i, adev->ip_blocks[i].version->funcs->name); 2196 adev->ip_blocks[i].status.valid = false; 2197 } else { 2198 if (adev->ip_blocks[i].version->funcs->early_init) { 2199 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2200 if (r == -ENOENT) { 2201 adev->ip_blocks[i].status.valid = false; 2202 } else if (r) { 2203 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2204 adev->ip_blocks[i].version->funcs->name, r); 2205 return r; 2206 } else { 2207 adev->ip_blocks[i].status.valid = true; 2208 } 2209 } else { 2210 adev->ip_blocks[i].status.valid = true; 2211 } 2212 } 2213 /* get the vbios after the asic_funcs are set up */ 2214 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2215 r = amdgpu_device_parse_gpu_info_fw(adev); 2216 if (r) 2217 return r; 2218 2219 /* Read BIOS */ 2220 if (!amdgpu_get_bios(adev)) 2221 return -EINVAL; 2222 2223 r = amdgpu_atombios_init(adev); 2224 if (r) { 2225 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2226 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2227 return r; 2228 } 2229 2230 /*get pf2vf msg info at it's earliest time*/ 2231 if (amdgpu_sriov_vf(adev)) 2232 amdgpu_virt_init_data_exchange(adev); 2233 2234 } 2235 } 2236 2237 adev->cg_flags &= amdgpu_cg_mask; 2238 adev->pg_flags &= amdgpu_pg_mask; 2239 2240 return 0; 2241 } 2242 2243 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2244 { 2245 int i, r; 2246 2247 for (i = 0; i < adev->num_ip_blocks; i++) { 2248 if (!adev->ip_blocks[i].status.sw) 2249 continue; 2250 if (adev->ip_blocks[i].status.hw) 2251 continue; 2252 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2253 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2254 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2255 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2256 if (r) { 2257 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2258 adev->ip_blocks[i].version->funcs->name, r); 2259 return r; 2260 } 2261 adev->ip_blocks[i].status.hw = true; 2262 } 2263 } 2264 2265 return 0; 2266 } 2267 2268 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2269 { 2270 int i, r; 2271 2272 for (i = 0; i < adev->num_ip_blocks; i++) { 2273 if (!adev->ip_blocks[i].status.sw) 2274 continue; 2275 if (adev->ip_blocks[i].status.hw) 2276 continue; 2277 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2278 if (r) { 2279 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2280 adev->ip_blocks[i].version->funcs->name, r); 2281 return r; 2282 } 2283 adev->ip_blocks[i].status.hw = true; 2284 } 2285 2286 return 0; 2287 } 2288 2289 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2290 { 2291 int r = 0; 2292 int i; 2293 uint32_t smu_version; 2294 2295 if (adev->asic_type >= CHIP_VEGA10) { 2296 for (i = 0; i < adev->num_ip_blocks; i++) { 2297 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2298 continue; 2299 2300 if 
(!adev->ip_blocks[i].status.sw) 2301 continue; 2302 2303 /* no need to do the fw loading again if already done*/ 2304 if (adev->ip_blocks[i].status.hw == true) 2305 break; 2306 2307 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2308 r = adev->ip_blocks[i].version->funcs->resume(adev); 2309 if (r) { 2310 DRM_ERROR("resume of IP block <%s> failed %d\n", 2311 adev->ip_blocks[i].version->funcs->name, r); 2312 return r; 2313 } 2314 } else { 2315 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2316 if (r) { 2317 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2318 adev->ip_blocks[i].version->funcs->name, r); 2319 return r; 2320 } 2321 } 2322 2323 adev->ip_blocks[i].status.hw = true; 2324 break; 2325 } 2326 } 2327 2328 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2329 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2330 2331 return r; 2332 } 2333 2334 /** 2335 * amdgpu_device_ip_init - run init for hardware IPs 2336 * 2337 * @adev: amdgpu_device pointer 2338 * 2339 * Main initialization pass for hardware IPs. The list of all the hardware 2340 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2341 * are run. sw_init initializes the software state associated with each IP 2342 * and hw_init initializes the hardware associated with each IP. 2343 * Returns 0 on success, negative error code on failure. 2344 */ 2345 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2346 { 2347 int i, r; 2348 2349 r = amdgpu_ras_init(adev); 2350 if (r) 2351 return r; 2352 2353 for (i = 0; i < adev->num_ip_blocks; i++) { 2354 if (!adev->ip_blocks[i].status.valid) 2355 continue; 2356 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2357 if (r) { 2358 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2359 adev->ip_blocks[i].version->funcs->name, r); 2360 goto init_failed; 2361 } 2362 adev->ip_blocks[i].status.sw = true; 2363 2364 /* need to do gmc hw init early so we can allocate gpu mem */ 2365 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2366 /* Try to reserve bad pages early */ 2367 if (amdgpu_sriov_vf(adev)) 2368 amdgpu_virt_exchange_data(adev); 2369 2370 r = amdgpu_device_vram_scratch_init(adev); 2371 if (r) { 2372 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2373 goto init_failed; 2374 } 2375 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2376 if (r) { 2377 DRM_ERROR("hw_init %d failed %d\n", i, r); 2378 goto init_failed; 2379 } 2380 r = amdgpu_device_wb_init(adev); 2381 if (r) { 2382 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2383 goto init_failed; 2384 } 2385 adev->ip_blocks[i].status.hw = true; 2386 2387 /* right after GMC hw init, we create CSA */ 2388 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2389 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2390 AMDGPU_GEM_DOMAIN_VRAM, 2391 AMDGPU_CSA_SIZE); 2392 if (r) { 2393 DRM_ERROR("allocate CSA failed %d\n", r); 2394 goto init_failed; 2395 } 2396 } 2397 } 2398 } 2399 2400 if (amdgpu_sriov_vf(adev)) 2401 amdgpu_virt_init_data_exchange(adev); 2402 2403 r = amdgpu_ib_pool_init(adev); 2404 if (r) { 2405 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2406 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2407 goto init_failed; 2408 } 2409 2410 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2411 if (r) 2412 goto init_failed; 2413 2414 r = amdgpu_device_ip_hw_init_phase1(adev); 2415 if (r) 2416 goto init_failed; 2417 2418 r = amdgpu_device_fw_loading(adev); 2419 if (r) 2420 goto 
init_failed; 2421 2422 r = amdgpu_device_ip_hw_init_phase2(adev); 2423 if (r) 2424 goto init_failed; 2425 2426 /* 2427 * retired pages will be loaded from eeprom and reserved here, 2428 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2429 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2430 * for I2C communication, which is only true at this point. 2431 * 2432 * amdgpu_ras_recovery_init may fail, but the upper layers only care 2433 * about failures caused by a bad gpu situation and stop the amdgpu init 2434 * process accordingly. For other failures, it will still release all 2435 * the resources and print an error message, rather than returning a 2436 * negative value to the upper level. 2437 * 2438 * Note: theoretically, this should be called before all vram allocations 2439 * to protect retired pages from being reused 2440 */ 2441 r = amdgpu_ras_recovery_init(adev); 2442 if (r) 2443 goto init_failed; 2444 2445 if (adev->gmc.xgmi.num_physical_nodes > 1) 2446 amdgpu_xgmi_add_device(adev); 2447 2448 /* Don't init kfd if the whole hive needs to be reset during init */ 2449 if (!adev->gmc.xgmi.pending_reset) 2450 amdgpu_amdkfd_device_init(adev); 2451 2452 amdgpu_fru_get_product_info(adev); 2453 2454 init_failed: 2455 if (amdgpu_sriov_vf(adev)) 2456 amdgpu_virt_release_full_gpu(adev, true); 2457 2458 return r; 2459 } 2460 2461 /** 2462 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2463 * 2464 * @adev: amdgpu_device pointer 2465 * 2466 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2467 * this function before a GPU reset. If the value is retained after a 2468 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 2469 */ 2470 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2471 { 2472 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2473 } 2474 2475 /** 2476 * amdgpu_device_check_vram_lost - check if vram is valid 2477 * 2478 * @adev: amdgpu_device pointer 2479 * 2480 * Checks the reset magic value written to the gart pointer in VRAM. 2481 * The driver calls this after a GPU reset to see if the contents of 2482 * VRAM are lost or not. 2483 * Returns true if vram is lost, false if not. 2484 */ 2485 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2486 { 2487 if (memcmp(adev->gart.ptr, adev->reset_magic, 2488 AMDGPU_RESET_MAGIC_NUM)) 2489 return true; 2490 2491 if (!amdgpu_in_reset(adev)) 2492 return false; 2493 2494 /* 2495 * For all ASICs with baco/mode1 reset, the VRAM is 2496 * always assumed to be lost. 2497 */ 2498 switch (amdgpu_asic_reset_method(adev)) { 2499 case AMD_RESET_METHOD_BACO: 2500 case AMD_RESET_METHOD_MODE1: 2501 return true; 2502 default: 2503 return false; 2504 } 2505 } 2506 2507 /** 2508 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2509 * 2510 * @adev: amdgpu_device pointer 2511 * @state: clockgating state (gate or ungate) 2512 * 2513 * The list of all the hardware IPs that make up the asic is walked and the 2514 * set_clockgating_state callbacks are run. 2515 * The late initialization pass enables clockgating for hardware IPs; 2516 * the fini or suspend pass disables it. 2517 * Returns 0 on success, negative error code on failure.
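 *
 * For illustration, the call pattern used elsewhere in this file is to gate
 * once all IPs are late-initialized and to ungate on the suspend/fini paths
 * before hardware teardown:
 *
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);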
2518 */ 2519 2520 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2521 enum amd_clockgating_state state) 2522 { 2523 int i, j, r; 2524 2525 if (amdgpu_emu_mode == 1) 2526 return 0; 2527 2528 for (j = 0; j < adev->num_ip_blocks; j++) { 2529 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2530 if (!adev->ip_blocks[i].status.late_initialized) 2531 continue; 2532 /* skip CG for GFX on S0ix */ 2533 if (adev->in_s0ix && 2534 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2535 continue; 2536 /* skip CG for VCE/UVD, it's handled specially */ 2537 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2538 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2539 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2540 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2541 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2542 /* enable clockgating to save power */ 2543 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2544 state); 2545 if (r) { 2546 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2547 adev->ip_blocks[i].version->funcs->name, r); 2548 return r; 2549 } 2550 } 2551 } 2552 2553 return 0; 2554 } 2555 2556 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2557 enum amd_powergating_state state) 2558 { 2559 int i, j, r; 2560 2561 if (amdgpu_emu_mode == 1) 2562 return 0; 2563 2564 for (j = 0; j < adev->num_ip_blocks; j++) { 2565 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2566 if (!adev->ip_blocks[i].status.late_initialized) 2567 continue; 2568 /* skip PG for GFX on S0ix */ 2569 if (adev->in_s0ix && 2570 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2571 continue; 2572 /* skip CG for VCE/UVD, it's handled specially */ 2573 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2574 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2575 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2576 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2577 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2578 /* enable powergating to save power */ 2579 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2580 state); 2581 if (r) { 2582 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2583 adev->ip_blocks[i].version->funcs->name, r); 2584 return r; 2585 } 2586 } 2587 } 2588 return 0; 2589 } 2590 2591 static int amdgpu_device_enable_mgpu_fan_boost(void) 2592 { 2593 struct amdgpu_gpu_instance *gpu_ins; 2594 struct amdgpu_device *adev; 2595 int i, ret = 0; 2596 2597 mutex_lock(&mgpu_info.mutex); 2598 2599 /* 2600 * MGPU fan boost feature should be enabled 2601 * only when there are two or more dGPUs in 2602 * the system 2603 */ 2604 if (mgpu_info.num_dgpu < 2) 2605 goto out; 2606 2607 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2608 gpu_ins = &(mgpu_info.gpu_ins[i]); 2609 adev = gpu_ins->adev; 2610 if (!(adev->flags & AMD_IS_APU) && 2611 !gpu_ins->mgpu_fan_enabled) { 2612 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2613 if (ret) 2614 break; 2615 2616 gpu_ins->mgpu_fan_enabled = 1; 2617 } 2618 } 2619 2620 out: 2621 mutex_unlock(&mgpu_info.mutex); 2622 2623 return ret; 2624 } 2625 2626 /** 2627 * amdgpu_device_ip_late_init - run late init for hardware IPs 2628 * 2629 * @adev: amdgpu_device pointer 2630 * 2631 * Late initialization pass for hardware IPs. 
The list of all the hardware 2632 * IPs that make up the asic is walked and the late_init callbacks are run. 2633 * late_init covers any special initialization that an IP requires 2634 * after all of them have been initialized or something that needs to happen 2635 * late in the init process. 2636 * Returns 0 on success, negative error code on failure. 2637 */ 2638 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2639 { 2640 struct amdgpu_gpu_instance *gpu_instance; 2641 int i = 0, r; 2642 2643 for (i = 0; i < adev->num_ip_blocks; i++) { 2644 if (!adev->ip_blocks[i].status.hw) 2645 continue; 2646 if (adev->ip_blocks[i].version->funcs->late_init) { 2647 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2648 if (r) { 2649 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2650 adev->ip_blocks[i].version->funcs->name, r); 2651 return r; 2652 } 2653 } 2654 adev->ip_blocks[i].status.late_initialized = true; 2655 } 2656 2657 r = amdgpu_ras_late_init(adev); 2658 if (r) { 2659 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 2660 return r; 2661 } 2662 2663 amdgpu_ras_set_error_query_ready(adev, true); 2664 2665 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2666 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2667 2668 amdgpu_device_fill_reset_magic(adev); 2669 2670 r = amdgpu_device_enable_mgpu_fan_boost(); 2671 if (r) 2672 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2673 2674 /* For passthrough configurations on Arcturus and Aldebaran, enable special SBR handling */ 2675 if (amdgpu_passthrough(adev) && ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 2676 adev->asic_type == CHIP_ALDEBARAN)) 2677 amdgpu_dpm_handle_passthrough_sbr(adev, true); 2678 2679 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2680 mutex_lock(&mgpu_info.mutex); 2681 2682 /* 2683 * Reset device p-state to low, as it was booted with high. 2684 * 2685 * This should be performed only after all devices from the same 2686 * hive get initialized. 2687 * 2688 * However, it is not known in advance how many devices are in the 2689 * hive, as they are counted one by one during device initialization. 2690 * 2691 * So, we wait for all XGMI interlinked devices to be initialized. 2692 * This may bring some delays as those devices may come from 2693 * different hives. But that should be OK.
2694 */ 2695 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2696 for (i = 0; i < mgpu_info.num_gpu; i++) { 2697 gpu_instance = &(mgpu_info.gpu_ins[i]); 2698 if (gpu_instance->adev->flags & AMD_IS_APU) 2699 continue; 2700 2701 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2702 AMDGPU_XGMI_PSTATE_MIN); 2703 if (r) { 2704 DRM_ERROR("pstate setting failed (%d).\n", r); 2705 break; 2706 } 2707 } 2708 } 2709 2710 mutex_unlock(&mgpu_info.mutex); 2711 } 2712 2713 return 0; 2714 } 2715 2716 /** 2717 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2718 * 2719 * @adev: amdgpu_device pointer 2720 * 2721 * For ASICs need to disable SMC first 2722 */ 2723 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2724 { 2725 int i, r; 2726 2727 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) 2728 return; 2729 2730 for (i = 0; i < adev->num_ip_blocks; i++) { 2731 if (!adev->ip_blocks[i].status.hw) 2732 continue; 2733 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2734 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2735 /* XXX handle errors */ 2736 if (r) { 2737 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2738 adev->ip_blocks[i].version->funcs->name, r); 2739 } 2740 adev->ip_blocks[i].status.hw = false; 2741 break; 2742 } 2743 } 2744 } 2745 2746 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2747 { 2748 int i, r; 2749 2750 for (i = 0; i < adev->num_ip_blocks; i++) { 2751 if (!adev->ip_blocks[i].version->funcs->early_fini) 2752 continue; 2753 2754 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2755 if (r) { 2756 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2757 adev->ip_blocks[i].version->funcs->name, r); 2758 } 2759 } 2760 2761 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2762 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2763 2764 amdgpu_amdkfd_suspend(adev, false); 2765 2766 /* Workaroud for ASICs need to disable SMC first */ 2767 amdgpu_device_smu_fini_early(adev); 2768 2769 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2770 if (!adev->ip_blocks[i].status.hw) 2771 continue; 2772 2773 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2774 /* XXX handle errors */ 2775 if (r) { 2776 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2777 adev->ip_blocks[i].version->funcs->name, r); 2778 } 2779 2780 adev->ip_blocks[i].status.hw = false; 2781 } 2782 2783 if (amdgpu_sriov_vf(adev)) { 2784 if (amdgpu_virt_release_full_gpu(adev, false)) 2785 DRM_ERROR("failed to release exclusive mode on fini\n"); 2786 } 2787 2788 return 0; 2789 } 2790 2791 /** 2792 * amdgpu_device_ip_fini - run fini for hardware IPs 2793 * 2794 * @adev: amdgpu_device pointer 2795 * 2796 * Main teardown pass for hardware IPs. The list of all the hardware 2797 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2798 * are run. hw_fini tears down the hardware associated with each IP 2799 * and sw_fini tears down any software state associated with each IP. 2800 * Returns 0 on success, negative error code on failure. 
2801 */ 2802 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2803 { 2804 int i, r; 2805 2806 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2807 amdgpu_virt_release_ras_err_handler_data(adev); 2808 2809 if (adev->gmc.xgmi.num_physical_nodes > 1) 2810 amdgpu_xgmi_remove_device(adev); 2811 2812 amdgpu_amdkfd_device_fini_sw(adev); 2813 2814 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2815 if (!adev->ip_blocks[i].status.sw) 2816 continue; 2817 2818 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2819 amdgpu_ucode_free_bo(adev); 2820 amdgpu_free_static_csa(&adev->virt.csa_obj); 2821 amdgpu_device_wb_fini(adev); 2822 amdgpu_device_vram_scratch_fini(adev); 2823 amdgpu_ib_pool_fini(adev); 2824 } 2825 2826 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2827 /* XXX handle errors */ 2828 if (r) { 2829 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2830 adev->ip_blocks[i].version->funcs->name, r); 2831 } 2832 adev->ip_blocks[i].status.sw = false; 2833 adev->ip_blocks[i].status.valid = false; 2834 } 2835 2836 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2837 if (!adev->ip_blocks[i].status.late_initialized) 2838 continue; 2839 if (adev->ip_blocks[i].version->funcs->late_fini) 2840 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2841 adev->ip_blocks[i].status.late_initialized = false; 2842 } 2843 2844 amdgpu_ras_fini(adev); 2845 2846 return 0; 2847 } 2848 2849 /** 2850 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2851 * 2852 * @work: work_struct. 2853 */ 2854 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2855 { 2856 struct amdgpu_device *adev = 2857 container_of(work, struct amdgpu_device, delayed_init_work.work); 2858 int r; 2859 2860 r = amdgpu_ib_ring_tests(adev); 2861 if (r) 2862 DRM_ERROR("ib ring test failed (%d).\n", r); 2863 } 2864 2865 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2866 { 2867 struct amdgpu_device *adev = 2868 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2869 2870 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2871 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2872 2873 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2874 adev->gfx.gfx_off_state = true; 2875 } 2876 2877 /** 2878 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2879 * 2880 * @adev: amdgpu_device pointer 2881 * 2882 * Main suspend function for hardware IPs. The list of all the hardware 2883 * IPs that make up the asic is walked, clockgating is disabled and the 2884 * suspend callbacks are run. suspend puts the hardware and software state 2885 * in each IP into a state suitable for suspend. 2886 * Returns 0 on success, negative error code on failure. 
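 *
 * Note that this phase only suspends the display (DCE) blocks; the remaining
 * IPs are handled by amdgpu_device_ip_suspend_phase2(). A minimal sketch of
 * how the two phases combine (mirroring amdgpu_device_ip_suspend() below):
 *
 *   r = amdgpu_device_ip_suspend_phase1(adev);
 *   if (!r)
 *           r = amdgpu_device_ip_suspend_phase2(adev);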
2887 */ 2888 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2889 { 2890 int i, r; 2891 2892 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2893 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2894 2895 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2896 if (!adev->ip_blocks[i].status.valid) 2897 continue; 2898 2899 /* displays are handled separately */ 2900 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2901 continue; 2902 2903 /* XXX handle errors */ 2904 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2905 /* XXX handle errors */ 2906 if (r) { 2907 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2908 adev->ip_blocks[i].version->funcs->name, r); 2909 return r; 2910 } 2911 2912 adev->ip_blocks[i].status.hw = false; 2913 } 2914 2915 return 0; 2916 } 2917 2918 /** 2919 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2920 * 2921 * @adev: amdgpu_device pointer 2922 * 2923 * Main suspend function for hardware IPs. The list of all the hardware 2924 * IPs that make up the asic is walked, clockgating is disabled and the 2925 * suspend callbacks are run. suspend puts the hardware and software state 2926 * in each IP into a state suitable for suspend. 2927 * Returns 0 on success, negative error code on failure. 2928 */ 2929 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2930 { 2931 int i, r; 2932 2933 if (adev->in_s0ix) 2934 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 2935 2936 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2937 if (!adev->ip_blocks[i].status.valid) 2938 continue; 2939 /* displays are handled in phase1 */ 2940 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2941 continue; 2942 /* PSP lost connection when err_event_athub occurs */ 2943 if (amdgpu_ras_intr_triggered() && 2944 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2945 adev->ip_blocks[i].status.hw = false; 2946 continue; 2947 } 2948 2949 /* skip unnecessary suspend if we do not initialize them yet */ 2950 if (adev->gmc.xgmi.pending_reset && 2951 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2952 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 2953 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2954 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 2955 adev->ip_blocks[i].status.hw = false; 2956 continue; 2957 } 2958 2959 /* skip suspend of gfx and psp for S0ix 2960 * gfx is in gfxoff state, so on resume it will exit gfxoff just 2961 * like at runtime. PSP is also part of the always on hardware 2962 * so no need to suspend it. 
2963 */ 2964 if (adev->in_s0ix && 2965 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 2966 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)) 2967 continue; 2968 2969 /* XXX handle errors */ 2970 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2971 /* XXX handle errors */ 2972 if (r) { 2973 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2974 adev->ip_blocks[i].version->funcs->name, r); 2975 } 2976 adev->ip_blocks[i].status.hw = false; 2977 /* handle putting the SMC in the appropriate state */ 2978 if(!amdgpu_sriov_vf(adev)){ 2979 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2980 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 2981 if (r) { 2982 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 2983 adev->mp1_state, r); 2984 return r; 2985 } 2986 } 2987 } 2988 } 2989 2990 return 0; 2991 } 2992 2993 /** 2994 * amdgpu_device_ip_suspend - run suspend for hardware IPs 2995 * 2996 * @adev: amdgpu_device pointer 2997 * 2998 * Main suspend function for hardware IPs. The list of all the hardware 2999 * IPs that make up the asic is walked, clockgating is disabled and the 3000 * suspend callbacks are run. suspend puts the hardware and software state 3001 * in each IP into a state suitable for suspend. 3002 * Returns 0 on success, negative error code on failure. 3003 */ 3004 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3005 { 3006 int r; 3007 3008 if (amdgpu_sriov_vf(adev)) { 3009 amdgpu_virt_fini_data_exchange(adev); 3010 amdgpu_virt_request_full_gpu(adev, false); 3011 } 3012 3013 r = amdgpu_device_ip_suspend_phase1(adev); 3014 if (r) 3015 return r; 3016 r = amdgpu_device_ip_suspend_phase2(adev); 3017 3018 if (amdgpu_sriov_vf(adev)) 3019 amdgpu_virt_release_full_gpu(adev, false); 3020 3021 return r; 3022 } 3023 3024 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3025 { 3026 int i, r; 3027 3028 static enum amd_ip_block_type ip_order[] = { 3029 AMD_IP_BLOCK_TYPE_GMC, 3030 AMD_IP_BLOCK_TYPE_COMMON, 3031 AMD_IP_BLOCK_TYPE_PSP, 3032 AMD_IP_BLOCK_TYPE_IH, 3033 }; 3034 3035 for (i = 0; i < adev->num_ip_blocks; i++) { 3036 int j; 3037 struct amdgpu_ip_block *block; 3038 3039 block = &adev->ip_blocks[i]; 3040 block->status.hw = false; 3041 3042 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3043 3044 if (block->version->type != ip_order[j] || 3045 !block->status.valid) 3046 continue; 3047 3048 r = block->version->funcs->hw_init(adev); 3049 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3050 if (r) 3051 return r; 3052 block->status.hw = true; 3053 } 3054 } 3055 3056 return 0; 3057 } 3058 3059 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3060 { 3061 int i, r; 3062 3063 static enum amd_ip_block_type ip_order[] = { 3064 AMD_IP_BLOCK_TYPE_SMC, 3065 AMD_IP_BLOCK_TYPE_DCE, 3066 AMD_IP_BLOCK_TYPE_GFX, 3067 AMD_IP_BLOCK_TYPE_SDMA, 3068 AMD_IP_BLOCK_TYPE_UVD, 3069 AMD_IP_BLOCK_TYPE_VCE, 3070 AMD_IP_BLOCK_TYPE_VCN 3071 }; 3072 3073 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3074 int j; 3075 struct amdgpu_ip_block *block; 3076 3077 for (j = 0; j < adev->num_ip_blocks; j++) { 3078 block = &adev->ip_blocks[j]; 3079 3080 if (block->version->type != ip_order[i] || 3081 !block->status.valid || 3082 block->status.hw) 3083 continue; 3084 3085 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3086 r = block->version->funcs->resume(adev); 3087 else 3088 r = block->version->funcs->hw_init(adev); 3089 3090 DRM_INFO("RE-INIT-late: %s %s\n", 
block->version->funcs->name, r?"failed":"succeeded"); 3091 if (r) 3092 return r; 3093 block->status.hw = true; 3094 } 3095 } 3096 3097 return 0; 3098 } 3099 3100 /** 3101 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs (phase 1) 3102 * 3103 * @adev: amdgpu_device pointer 3104 * 3105 * First resume function for hardware IPs. The list of all the hardware 3106 * IPs that make up the asic is walked and the resume callbacks are run for 3107 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3108 * after a suspend and updates the software state as necessary. This 3109 * function is also used for restoring the GPU after a GPU reset. 3110 * Returns 0 on success, negative error code on failure. 3111 */ 3112 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3113 { 3114 int i, r; 3115 3116 for (i = 0; i < adev->num_ip_blocks; i++) { 3117 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3118 continue; 3119 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3120 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3121 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 3122 3123 r = adev->ip_blocks[i].version->funcs->resume(adev); 3124 if (r) { 3125 DRM_ERROR("resume of IP block <%s> failed %d\n", 3126 adev->ip_blocks[i].version->funcs->name, r); 3127 return r; 3128 } 3129 adev->ip_blocks[i].status.hw = true; 3130 } 3131 } 3132 3133 return 0; 3134 } 3135 3136 /** 3137 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs (phase 2) 3138 * 3139 * @adev: amdgpu_device pointer 3140 * 3141 * Second resume function for hardware IPs. The list of all the hardware 3142 * IPs that make up the asic is walked and the resume callbacks are run for 3143 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3144 * functional state after a suspend and updates the software state as 3145 * necessary. This function is also used for restoring the GPU after a GPU 3146 * reset. 3147 * Returns 0 on success, negative error code on failure. 3148 */ 3149 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3150 { 3151 int i, r; 3152 3153 for (i = 0; i < adev->num_ip_blocks; i++) { 3154 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3155 continue; 3156 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3157 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3158 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3159 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3160 continue; 3161 r = adev->ip_blocks[i].version->funcs->resume(adev); 3162 if (r) { 3163 DRM_ERROR("resume of IP block <%s> failed %d\n", 3164 adev->ip_blocks[i].version->funcs->name, r); 3165 return r; 3166 } 3167 adev->ip_blocks[i].status.hw = true; 3168 } 3169 3170 return 0; 3171 } 3172 3173 /** 3174 * amdgpu_device_ip_resume - run resume for hardware IPs 3175 * 3176 * @adev: amdgpu_device pointer 3177 * 3178 * Main resume function for hardware IPs. The hardware IPs 3179 * are split into two resume functions because they are 3180 * also used in recovering from a GPU reset, and some additional 3181 * steps need to be taken between them. In this case (S3/S4) they are 3182 * run sequentially. 3183 * Returns 0 on success, negative error code on failure.
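 *
 * A minimal sketch of the S3/S4 sequence implemented below: phase 1 brings
 * back COMMON, GMC and IH, firmware is (re)loaded, then phase 2 restores the
 * remaining blocks:
 *
 *   r = amdgpu_device_ip_resume_phase1(adev);
 *   if (!r)
 *           r = amdgpu_device_fw_loading(adev);
 *   if (!r)
 *           r = amdgpu_device_ip_resume_phase2(adev);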
3184 */ 3185 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3186 { 3187 int r; 3188 3189 r = amdgpu_amdkfd_resume_iommu(adev); 3190 if (r) 3191 return r; 3192 3193 r = amdgpu_device_ip_resume_phase1(adev); 3194 if (r) 3195 return r; 3196 3197 r = amdgpu_device_fw_loading(adev); 3198 if (r) 3199 return r; 3200 3201 r = amdgpu_device_ip_resume_phase2(adev); 3202 3203 return r; 3204 } 3205 3206 /** 3207 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3208 * 3209 * @adev: amdgpu_device pointer 3210 * 3211 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3212 */ 3213 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3214 { 3215 if (amdgpu_sriov_vf(adev)) { 3216 if (adev->is_atom_fw) { 3217 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3218 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3219 } else { 3220 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3221 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3222 } 3223 3224 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3225 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3226 } 3227 } 3228 3229 /** 3230 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3231 * 3232 * @asic_type: AMD asic type 3233 * 3234 * Check if there is DC (new modesetting infrastructre) support for an asic. 3235 * returns true if DC has support, false if not. 3236 */ 3237 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3238 { 3239 switch (asic_type) { 3240 #ifdef CONFIG_DRM_AMDGPU_SI 3241 case CHIP_HAINAN: 3242 #endif 3243 case CHIP_TOPAZ: 3244 /* chips with no display hardware */ 3245 return false; 3246 #if defined(CONFIG_DRM_AMD_DC) 3247 case CHIP_TAHITI: 3248 case CHIP_PITCAIRN: 3249 case CHIP_VERDE: 3250 case CHIP_OLAND: 3251 /* 3252 * We have systems in the wild with these ASICs that require 3253 * LVDS and VGA support which is not supported with DC. 3254 * 3255 * Fallback to the non-DC driver here by default so as not to 3256 * cause regressions. 3257 */ 3258 #if defined(CONFIG_DRM_AMD_DC_SI) 3259 return amdgpu_dc > 0; 3260 #else 3261 return false; 3262 #endif 3263 case CHIP_BONAIRE: 3264 case CHIP_KAVERI: 3265 case CHIP_KABINI: 3266 case CHIP_MULLINS: 3267 /* 3268 * We have systems in the wild with these ASICs that require 3269 * LVDS and VGA support which is not supported with DC. 3270 * 3271 * Fallback to the non-DC driver here by default so as not to 3272 * cause regressions. 
3273 */ 3274 return amdgpu_dc > 0; 3275 case CHIP_HAWAII: 3276 case CHIP_CARRIZO: 3277 case CHIP_STONEY: 3278 case CHIP_POLARIS10: 3279 case CHIP_POLARIS11: 3280 case CHIP_POLARIS12: 3281 case CHIP_VEGAM: 3282 case CHIP_TONGA: 3283 case CHIP_FIJI: 3284 case CHIP_VEGA10: 3285 case CHIP_VEGA12: 3286 case CHIP_VEGA20: 3287 #if defined(CONFIG_DRM_AMD_DC_DCN) 3288 case CHIP_RAVEN: 3289 case CHIP_NAVI10: 3290 case CHIP_NAVI14: 3291 case CHIP_NAVI12: 3292 case CHIP_RENOIR: 3293 case CHIP_CYAN_SKILLFISH: 3294 case CHIP_SIENNA_CICHLID: 3295 case CHIP_NAVY_FLOUNDER: 3296 case CHIP_DIMGREY_CAVEFISH: 3297 case CHIP_BEIGE_GOBY: 3298 case CHIP_VANGOGH: 3299 case CHIP_YELLOW_CARP: 3300 #endif 3301 default: 3302 return amdgpu_dc != 0; 3303 #else 3304 default: 3305 if (amdgpu_dc > 0) 3306 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3307 "but isn't supported by ASIC, ignoring\n"); 3308 return false; 3309 #endif 3310 } 3311 } 3312 3313 /** 3314 * amdgpu_device_has_dc_support - check if dc is supported 3315 * 3316 * @adev: amdgpu_device pointer 3317 * 3318 * Returns true for supported, false for not supported 3319 */ 3320 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3321 { 3322 if (amdgpu_sriov_vf(adev) || 3323 adev->enable_virtual_display || 3324 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3325 return false; 3326 3327 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3328 } 3329 3330 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3331 { 3332 struct amdgpu_device *adev = 3333 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3334 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3335 3336 /* It's a bug to not have a hive within this function */ 3337 if (WARN_ON(!hive)) 3338 return; 3339 3340 /* 3341 * Use task barrier to synchronize all xgmi reset works across the 3342 * hive. task_barrier_enter and task_barrier_exit will block 3343 * until all the threads running the xgmi reset works reach 3344 * those points. task_barrier_full will do both blocks. 3345 */ 3346 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3347 3348 task_barrier_enter(&hive->tb); 3349 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3350 3351 if (adev->asic_reset_res) 3352 goto fail; 3353 3354 task_barrier_exit(&hive->tb); 3355 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3356 3357 if (adev->asic_reset_res) 3358 goto fail; 3359 3360 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3361 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3362 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3363 } else { 3364 3365 task_barrier_full(&hive->tb); 3366 adev->asic_reset_res = amdgpu_asic_reset(adev); 3367 } 3368 3369 fail: 3370 if (adev->asic_reset_res) 3371 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3372 adev->asic_reset_res, adev_to_drm(adev)->unique); 3373 amdgpu_put_xgmi_hive(hive); 3374 } 3375 3376 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3377 { 3378 char *input = amdgpu_lockup_timeout; 3379 char *timeout_setting = NULL; 3380 int index = 0; 3381 long timeout; 3382 int ret = 0; 3383 3384 /* 3385 * By default timeout for non compute jobs is 10000 3386 * and 60000 for compute jobs. 3387 * In SR-IOV or passthrough mode, timeout for compute 3388 * jobs are 60000 by default. 
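 *
 * For illustration (format inferred from the parsing below), the parameter
 * is an ordered, comma separated list of millisecond values applied as
 * gfx,compute,sdma,video; 0 keeps the default and a negative value disables
 * the timeout:
 *
 *   amdgpu.lockup_timeout=10000,60000,10000,10000
 *   amdgpu.lockup_timeout=5000
 *
 * A single value applies to all non-compute jobs (and also to compute jobs
 * under SR-IOV or passthrough).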
3389 */ 3390 adev->gfx_timeout = msecs_to_jiffies(10000); 3391 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3392 if (amdgpu_sriov_vf(adev)) 3393 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 3394 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3395 else 3396 adev->compute_timeout = msecs_to_jiffies(60000); 3397 3398 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3399 while ((timeout_setting = strsep(&input, ",")) && 3400 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3401 ret = kstrtol(timeout_setting, 0, &timeout); 3402 if (ret) 3403 return ret; 3404 3405 if (timeout == 0) { 3406 index++; 3407 continue; 3408 } else if (timeout < 0) { 3409 timeout = MAX_SCHEDULE_TIMEOUT; 3410 dev_warn(adev->dev, "lockup timeout disabled"); 3411 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3412 } else { 3413 timeout = msecs_to_jiffies(timeout); 3414 } 3415 3416 switch (index++) { 3417 case 0: 3418 adev->gfx_timeout = timeout; 3419 break; 3420 case 1: 3421 adev->compute_timeout = timeout; 3422 break; 3423 case 2: 3424 adev->sdma_timeout = timeout; 3425 break; 3426 case 3: 3427 adev->video_timeout = timeout; 3428 break; 3429 default: 3430 break; 3431 } 3432 } 3433 /* 3434 * There is only one value specified and 3435 * it should apply to all non-compute jobs. 3436 */ 3437 if (index == 1) { 3438 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3439 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3440 adev->compute_timeout = adev->gfx_timeout; 3441 } 3442 } 3443 3444 return ret; 3445 } 3446 3447 /** 3448 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3449 * 3450 * @adev: amdgpu_device pointer 3451 * 3452 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3453 */ 3454 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3455 { 3456 struct iommu_domain *domain; 3457 3458 domain = iommu_get_domain_for_dev(adev->dev); 3459 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3460 adev->ram_is_direct_mapped = true; 3461 } 3462 3463 static const struct attribute *amdgpu_dev_attributes[] = { 3464 &dev_attr_product_name.attr, 3465 &dev_attr_product_number.attr, 3466 &dev_attr_serial_number.attr, 3467 &dev_attr_pcie_replay_count.attr, 3468 NULL 3469 }; 3470 3471 /** 3472 * amdgpu_device_init - initialize the driver 3473 * 3474 * @adev: amdgpu_device pointer 3475 * @flags: driver flags 3476 * 3477 * Initializes the driver info and hw (all asics). 3478 * Returns 0 for success or an error on failure. 3479 * Called at driver startup. 
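 *
 * A minimal usage sketch (illustrative only; the real caller lives in the
 * PCI probe/KMS load glue code): the caller allocates the amdgpu_device,
 * sets up adev->pdev and the associated drm_device, then calls:
 *
 *   r = amdgpu_device_init(adev, flags);
 *   if (r)
 *           return r;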
3480 */ 3481 int amdgpu_device_init(struct amdgpu_device *adev, 3482 uint32_t flags) 3483 { 3484 struct drm_device *ddev = adev_to_drm(adev); 3485 struct pci_dev *pdev = adev->pdev; 3486 int r, i; 3487 bool px = false; 3488 u32 max_MBps; 3489 3490 adev->shutdown = false; 3491 adev->flags = flags; 3492 3493 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3494 adev->asic_type = amdgpu_force_asic_type; 3495 else 3496 adev->asic_type = flags & AMD_ASIC_MASK; 3497 3498 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3499 if (amdgpu_emu_mode == 1) 3500 adev->usec_timeout *= 10; 3501 adev->gmc.gart_size = 512 * 1024 * 1024; 3502 adev->accel_working = false; 3503 adev->num_rings = 0; 3504 adev->mman.buffer_funcs = NULL; 3505 adev->mman.buffer_funcs_ring = NULL; 3506 adev->vm_manager.vm_pte_funcs = NULL; 3507 adev->vm_manager.vm_pte_num_scheds = 0; 3508 adev->gmc.gmc_funcs = NULL; 3509 adev->harvest_ip_mask = 0x0; 3510 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3511 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3512 3513 adev->smc_rreg = &amdgpu_invalid_rreg; 3514 adev->smc_wreg = &amdgpu_invalid_wreg; 3515 adev->pcie_rreg = &amdgpu_invalid_rreg; 3516 adev->pcie_wreg = &amdgpu_invalid_wreg; 3517 adev->pciep_rreg = &amdgpu_invalid_rreg; 3518 adev->pciep_wreg = &amdgpu_invalid_wreg; 3519 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3520 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3521 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3522 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3523 adev->didt_rreg = &amdgpu_invalid_rreg; 3524 adev->didt_wreg = &amdgpu_invalid_wreg; 3525 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3526 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3527 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3528 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3529 3530 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3531 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3532 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3533 3534 /* mutex initialization are all done here so we 3535 * can recall function without having locking issues */ 3536 mutex_init(&adev->firmware.mutex); 3537 mutex_init(&adev->pm.mutex); 3538 mutex_init(&adev->gfx.gpu_clock_mutex); 3539 mutex_init(&adev->srbm_mutex); 3540 mutex_init(&adev->gfx.pipe_reserve_mutex); 3541 mutex_init(&adev->gfx.gfx_off_mutex); 3542 mutex_init(&adev->grbm_idx_mutex); 3543 mutex_init(&adev->mn_lock); 3544 mutex_init(&adev->virt.vf_errors.lock); 3545 hash_init(adev->mn_hash); 3546 atomic_set(&adev->in_gpu_reset, 0); 3547 init_rwsem(&adev->reset_sem); 3548 mutex_init(&adev->psp.mutex); 3549 mutex_init(&adev->notifier_lock); 3550 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3551 mutex_init(&adev->benchmark_mutex); 3552 3553 amdgpu_device_init_apu_flags(adev); 3554 3555 r = amdgpu_device_check_arguments(adev); 3556 if (r) 3557 return r; 3558 3559 spin_lock_init(&adev->mmio_idx_lock); 3560 spin_lock_init(&adev->smc_idx_lock); 3561 spin_lock_init(&adev->pcie_idx_lock); 3562 spin_lock_init(&adev->uvd_ctx_idx_lock); 3563 spin_lock_init(&adev->didt_idx_lock); 3564 spin_lock_init(&adev->gc_cac_idx_lock); 3565 spin_lock_init(&adev->se_cac_idx_lock); 3566 spin_lock_init(&adev->audio_endpt_idx_lock); 3567 spin_lock_init(&adev->mm_stats.lock); 3568 3569 INIT_LIST_HEAD(&adev->shadow_list); 3570 mutex_init(&adev->shadow_list_lock); 3571 3572 INIT_LIST_HEAD(&adev->reset_list); 3573 3574 INIT_LIST_HEAD(&adev->ras_list); 3575 
3576 INIT_DELAYED_WORK(&adev->delayed_init_work, 3577 amdgpu_device_delayed_init_work_handler); 3578 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3579 amdgpu_device_delay_enable_gfx_off); 3580 3581 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3582 3583 adev->gfx.gfx_off_req_count = 1; 3584 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3585 3586 atomic_set(&adev->throttling_logging_enabled, 1); 3587 /* 3588 * If throttling continues, logging will be performed every minute 3589 * to avoid log flooding. "-1" is subtracted since the thermal 3590 * throttling interrupt comes every second. Thus, the total logging 3591 * interval is 59 seconds(retelimited printk interval) + 1(waiting 3592 * for throttling interrupt) = 60 seconds. 3593 */ 3594 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3595 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3596 3597 /* Registers mapping */ 3598 /* TODO: block userspace mapping of io register */ 3599 if (adev->asic_type >= CHIP_BONAIRE) { 3600 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3601 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3602 } else { 3603 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3604 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3605 } 3606 3607 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 3608 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 3609 3610 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3611 if (adev->rmmio == NULL) { 3612 return -ENOMEM; 3613 } 3614 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3615 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3616 3617 amdgpu_device_get_pcie_info(adev); 3618 3619 if (amdgpu_mcbp) 3620 DRM_INFO("MCBP is enabled\n"); 3621 3622 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) 3623 adev->enable_mes = true; 3624 3625 /* detect hw virtualization here */ 3626 amdgpu_detect_virtualization(adev); 3627 3628 r = amdgpu_device_get_job_timeout_settings(adev); 3629 if (r) { 3630 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3631 return r; 3632 } 3633 3634 /* early init functions */ 3635 r = amdgpu_device_ip_early_init(adev); 3636 if (r) 3637 return r; 3638 3639 /* Need to get xgmi info early to decide the reset behavior*/ 3640 if (adev->gmc.xgmi.supported) { 3641 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3642 if (r) 3643 return r; 3644 } 3645 3646 /* enable PCIE atomic ops */ 3647 if (amdgpu_sriov_vf(adev)) 3648 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3649 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_enabled_flags == 3650 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3651 else 3652 adev->have_atomics_support = 3653 !pci_enable_atomic_ops_to_root(adev->pdev, 3654 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3655 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3656 if (!adev->have_atomics_support) 3657 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3658 3659 /* doorbell bar mapping and doorbell index init*/ 3660 amdgpu_device_doorbell_init(adev); 3661 3662 if (amdgpu_emu_mode == 1) { 3663 /* post the asic on emulation mode */ 3664 emu_soc_asic_init(adev); 3665 goto fence_driver_init; 3666 } 3667 3668 amdgpu_reset_init(adev); 3669 3670 /* detect if we are with an SRIOV vbios */ 3671 amdgpu_device_detect_sriov_bios(adev); 3672 3673 /* check if we need to reset the asic 3674 * E.g., driver was not cleanly unloaded previously, etc. 
3675 */ 3676 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3677 if (adev->gmc.xgmi.num_physical_nodes) { 3678 dev_info(adev->dev, "Pending hive reset.\n"); 3679 adev->gmc.xgmi.pending_reset = true; 3680 /* Only need to init necessary block for SMU to handle the reset */ 3681 for (i = 0; i < adev->num_ip_blocks; i++) { 3682 if (!adev->ip_blocks[i].status.valid) 3683 continue; 3684 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3685 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3686 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3687 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3688 DRM_DEBUG("IP %s disabled for hw_init.\n", 3689 adev->ip_blocks[i].version->funcs->name); 3690 adev->ip_blocks[i].status.hw = true; 3691 } 3692 } 3693 } else { 3694 r = amdgpu_asic_reset(adev); 3695 if (r) { 3696 dev_err(adev->dev, "asic reset on init failed\n"); 3697 goto failed; 3698 } 3699 } 3700 } 3701 3702 pci_enable_pcie_error_reporting(adev->pdev); 3703 3704 /* Post card if necessary */ 3705 if (amdgpu_device_need_post(adev)) { 3706 if (!adev->bios) { 3707 dev_err(adev->dev, "no vBIOS found\n"); 3708 r = -EINVAL; 3709 goto failed; 3710 } 3711 DRM_INFO("GPU posting now...\n"); 3712 r = amdgpu_device_asic_init(adev); 3713 if (r) { 3714 dev_err(adev->dev, "gpu post error!\n"); 3715 goto failed; 3716 } 3717 } 3718 3719 if (adev->is_atom_fw) { 3720 /* Initialize clocks */ 3721 r = amdgpu_atomfirmware_get_clock_info(adev); 3722 if (r) { 3723 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3724 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3725 goto failed; 3726 } 3727 } else { 3728 /* Initialize clocks */ 3729 r = amdgpu_atombios_get_clock_info(adev); 3730 if (r) { 3731 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3732 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3733 goto failed; 3734 } 3735 /* init i2c buses */ 3736 if (!amdgpu_device_has_dc_support(adev)) 3737 amdgpu_atombios_i2c_init(adev); 3738 } 3739 3740 fence_driver_init: 3741 /* Fence driver */ 3742 r = amdgpu_fence_driver_sw_init(adev); 3743 if (r) { 3744 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3745 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3746 goto failed; 3747 } 3748 3749 /* init the mode config */ 3750 drm_mode_config_init(adev_to_drm(adev)); 3751 3752 r = amdgpu_device_ip_init(adev); 3753 if (r) { 3754 /* failed in exclusive mode due to timeout */ 3755 if (amdgpu_sriov_vf(adev) && 3756 !amdgpu_sriov_runtime(adev) && 3757 amdgpu_virt_mmio_blocked(adev) && 3758 !amdgpu_virt_wait_reset(adev)) { 3759 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3760 /* Don't send request since VF is inactive. */ 3761 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3762 adev->virt.ops = NULL; 3763 r = -EAGAIN; 3764 goto release_ras_con; 3765 } 3766 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3767 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3768 goto release_ras_con; 3769 } 3770 3771 amdgpu_fence_driver_hw_init(adev); 3772 3773 dev_info(adev->dev, 3774 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3775 adev->gfx.config.max_shader_engines, 3776 adev->gfx.config.max_sh_per_se, 3777 adev->gfx.config.max_cu_per_sh, 3778 adev->gfx.cu_info.number); 3779 3780 adev->accel_working = true; 3781 3782 amdgpu_vm_check_compute_bug(adev); 3783 3784 /* Initialize the buffer migration limit. 
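 *
 * Illustrative note: with the default cap of 8 MB/s chosen below,
 * log2_max_MBps becomes ilog2(8) = 3, so the divisions this value is used
 * for reduce to simple right shifts.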
*/ 3785 if (amdgpu_moverate >= 0) 3786 max_MBps = amdgpu_moverate; 3787 else 3788 max_MBps = 8; /* Allow 8 MB/s. */ 3789 /* Get a log2 for easy divisions. */ 3790 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3791 3792 r = amdgpu_pm_sysfs_init(adev); 3793 if (r) { 3794 adev->pm_sysfs_en = false; 3795 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 3796 } else 3797 adev->pm_sysfs_en = true; 3798 3799 r = amdgpu_ucode_sysfs_init(adev); 3800 if (r) { 3801 adev->ucode_sysfs_en = false; 3802 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3803 } else 3804 adev->ucode_sysfs_en = true; 3805 3806 /* 3807 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3808 * Otherwise the mgpu fan boost feature will be skipped because the 3809 * gpu instance count would be too low. 3810 */ 3811 amdgpu_register_gpu_instance(adev); 3812 3813 /* Enable clockgating, etc., after ib tests, since some blocks require 3814 * explicit gating rather than handling it automatically. 3815 */ 3816 if (!adev->gmc.xgmi.pending_reset) { 3817 r = amdgpu_device_ip_late_init(adev); 3818 if (r) { 3819 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3820 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3821 goto release_ras_con; 3822 } 3823 /* must succeed. */ 3824 amdgpu_ras_resume(adev); 3825 queue_delayed_work(system_wq, &adev->delayed_init_work, 3826 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3827 } 3828 3829 if (amdgpu_sriov_vf(adev)) 3830 flush_delayed_work(&adev->delayed_init_work); 3831 3832 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3833 if (r) 3834 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3835 3836 if (IS_ENABLED(CONFIG_PERF_EVENTS)) { 3837 r = amdgpu_pmu_init(adev); 3838 if (r) 3839 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3840 } 3841 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3842 if (amdgpu_device_cache_pci_state(adev->pdev)) 3843 pci_restore_state(pdev); 3844 3845 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3846 /* this will fail for cards that aren't VGA class devices, just 3847 * ignore it */ 3848 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3849 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 3850 3851 if (amdgpu_device_supports_px(ddev)) { 3852 px = true; 3853 vga_switcheroo_register_client(adev->pdev, 3854 &amdgpu_switcheroo_ops, px); 3855 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3856 } 3857 3858 if (adev->gmc.xgmi.pending_reset) 3859 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3860 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3861 3862 amdgpu_device_check_iommu_direct_map(adev); 3863 3864 return 0; 3865 3866 release_ras_con: 3867 amdgpu_release_ras_context(adev); 3868 3869 failed: 3870 amdgpu_vf_error_trans_all(adev); 3871 3872 return r; 3873 } 3874 3875 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 3876 { 3877 3878 /* Clear all CPU mappings pointing to this device */ 3879 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 3880 3881 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 3882 amdgpu_device_doorbell_fini(adev); 3883 3884 iounmap(adev->rmmio); 3885 adev->rmmio = NULL; 3886 if (adev->mman.aper_base_kaddr) 3887 iounmap(adev->mman.aper_base_kaddr); 3888 adev->mman.aper_base_kaddr = NULL; 3889 3890 /* Memory manager related */ 3891 if (!adev->gmc.xgmi.connected_to_cpu) { 3892 arch_phys_wc_del(adev->gmc.vram_mtrr); 3893
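/* release the write-combine memtype reserved for the VRAM aperture */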
arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 3894 } 3895 } 3896 3897 /** 3898 * amdgpu_device_fini_hw - tear down the driver 3899 * 3900 * @adev: amdgpu_device pointer 3901 * 3902 * Tear down the driver info (all asics). 3903 * Called at driver shutdown. 3904 */ 3905 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 3906 { 3907 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3908 flush_delayed_work(&adev->delayed_init_work); 3909 if (adev->mman.initialized) { 3910 flush_delayed_work(&adev->mman.bdev.wq); 3911 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); 3912 } 3913 adev->shutdown = true; 3914 3915 /* make sure IB test finished before entering exclusive mode 3916 * to avoid preemption on IB test 3917 * */ 3918 if (amdgpu_sriov_vf(adev)) { 3919 amdgpu_virt_request_full_gpu(adev, false); 3920 amdgpu_virt_fini_data_exchange(adev); 3921 } 3922 3923 /* disable all interrupts */ 3924 amdgpu_irq_disable_all(adev); 3925 if (adev->mode_info.mode_config_initialized){ 3926 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 3927 drm_helper_force_disable_all(adev_to_drm(adev)); 3928 else 3929 drm_atomic_helper_shutdown(adev_to_drm(adev)); 3930 } 3931 amdgpu_fence_driver_hw_fini(adev); 3932 3933 if (adev->pm_sysfs_en) 3934 amdgpu_pm_sysfs_fini(adev); 3935 if (adev->ucode_sysfs_en) 3936 amdgpu_ucode_sysfs_fini(adev); 3937 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 3938 3939 /* disable ras feature must before hw fini */ 3940 amdgpu_ras_pre_fini(adev); 3941 3942 amdgpu_device_ip_fini_early(adev); 3943 3944 amdgpu_irq_fini_hw(adev); 3945 3946 if (adev->mman.initialized) 3947 ttm_device_clear_dma_mappings(&adev->mman.bdev); 3948 3949 amdgpu_gart_dummy_page_fini(adev); 3950 3951 if (drm_dev_is_unplugged(adev_to_drm(adev))) 3952 amdgpu_device_unmap_mmio(adev); 3953 3954 } 3955 3956 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 3957 { 3958 int idx; 3959 3960 amdgpu_fence_driver_sw_fini(adev); 3961 amdgpu_device_ip_fini(adev); 3962 release_firmware(adev->firmware.gpu_info_fw); 3963 adev->firmware.gpu_info_fw = NULL; 3964 adev->accel_working = false; 3965 3966 amdgpu_reset_fini(adev); 3967 3968 /* free i2c buses */ 3969 if (!amdgpu_device_has_dc_support(adev)) 3970 amdgpu_i2c_fini(adev); 3971 3972 if (amdgpu_emu_mode != 1) 3973 amdgpu_atombios_fini(adev); 3974 3975 kfree(adev->bios); 3976 adev->bios = NULL; 3977 if (amdgpu_device_supports_px(adev_to_drm(adev))) { 3978 vga_switcheroo_unregister_client(adev->pdev); 3979 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3980 } 3981 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3982 vga_client_unregister(adev->pdev); 3983 3984 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 3985 3986 iounmap(adev->rmmio); 3987 adev->rmmio = NULL; 3988 amdgpu_device_doorbell_fini(adev); 3989 drm_dev_exit(idx); 3990 } 3991 3992 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3993 amdgpu_pmu_fini(adev); 3994 if (adev->mman.discovery_bin) 3995 amdgpu_discovery_fini(adev); 3996 3997 kfree(adev->pci_state); 3998 3999 } 4000 4001 /** 4002 * amdgpu_device_evict_resources - evict device resources 4003 * @adev: amdgpu device object 4004 * 4005 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4006 * of the vram memory type. Mainly used for evicting device resources 4007 * at suspend time. 
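 * APU carve-out memory is ordinary system RAM, so the eviction is skipped
 * for S3/S0ix suspend on APUs.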
4008 * 4009 */ 4010 static void amdgpu_device_evict_resources(struct amdgpu_device *adev) 4011 { 4012 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4013 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4014 return; 4015 4016 if (amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM)) 4017 DRM_WARN("evicting device resources failed\n"); 4018 4019 } 4020 4021 /* 4022 * Suspend & resume. 4023 */ 4024 /** 4025 * amdgpu_device_suspend - initiate device suspend 4026 * 4027 * @dev: drm dev pointer 4028 * @fbcon : notify the fbdev of suspend 4029 * 4030 * Puts the hw in the suspend state (all asics). 4031 * Returns 0 for success or an error on failure. 4032 * Called at driver suspend. 4033 */ 4034 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4035 { 4036 struct amdgpu_device *adev = drm_to_adev(dev); 4037 4038 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4039 return 0; 4040 4041 adev->in_suspend = true; 4042 4043 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4044 DRM_WARN("smart shift update failed\n"); 4045 4046 drm_kms_helper_poll_disable(dev); 4047 4048 if (fbcon) 4049 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4050 4051 cancel_delayed_work_sync(&adev->delayed_init_work); 4052 4053 amdgpu_ras_suspend(adev); 4054 4055 amdgpu_device_ip_suspend_phase1(adev); 4056 4057 if (!adev->in_s0ix) 4058 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4059 4060 amdgpu_device_evict_resources(adev); 4061 4062 amdgpu_fence_driver_hw_fini(adev); 4063 4064 amdgpu_device_ip_suspend_phase2(adev); 4065 4066 return 0; 4067 } 4068 4069 /** 4070 * amdgpu_device_resume - initiate device resume 4071 * 4072 * @dev: drm dev pointer 4073 * @fbcon : notify the fbdev of resume 4074 * 4075 * Bring the hw back to operating state (all asics). 4076 * Returns 0 for success or an error on failure. 4077 * Called at driver resume. 4078 */ 4079 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4080 { 4081 struct amdgpu_device *adev = drm_to_adev(dev); 4082 int r = 0; 4083 4084 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4085 return 0; 4086 4087 if (adev->in_s0ix) 4088 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4089 4090 /* post card */ 4091 if (amdgpu_device_need_post(adev)) { 4092 r = amdgpu_device_asic_init(adev); 4093 if (r) 4094 dev_err(adev->dev, "amdgpu asic init failed\n"); 4095 } 4096 4097 r = amdgpu_device_ip_resume(adev); 4098 if (r) { 4099 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4100 return r; 4101 } 4102 amdgpu_fence_driver_hw_init(adev); 4103 4104 r = amdgpu_device_ip_late_init(adev); 4105 if (r) 4106 return r; 4107 4108 queue_delayed_work(system_wq, &adev->delayed_init_work, 4109 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4110 4111 if (!adev->in_s0ix) { 4112 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4113 if (r) 4114 return r; 4115 } 4116 4117 /* Make sure IB tests flushed */ 4118 flush_delayed_work(&adev->delayed_init_work); 4119 4120 if (fbcon) 4121 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4122 4123 drm_kms_helper_poll_enable(dev); 4124 4125 amdgpu_ras_resume(adev); 4126 4127 /* 4128 * Most of the connector probing functions try to acquire runtime pm 4129 * refs to ensure that the GPU is powered on when connector polling is 4130 * performed. Since we're calling this from a runtime PM callback, 4131 * trying to acquire rpm refs will cause us to deadlock. 
4132 * 4133 * Since we're guaranteed to be holding the rpm lock, it's safe to 4134 * temporarily disable the rpm helpers so this doesn't deadlock us. 4135 */ 4136 #ifdef CONFIG_PM 4137 dev->dev->power.disable_depth++; 4138 #endif 4139 if (!amdgpu_device_has_dc_support(adev)) 4140 drm_helper_hpd_irq_event(dev); 4141 else 4142 drm_kms_helper_hotplug_event(dev); 4143 #ifdef CONFIG_PM 4144 dev->dev->power.disable_depth--; 4145 #endif 4146 adev->in_suspend = false; 4147 4148 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4149 DRM_WARN("smart shift update failed\n"); 4150 4151 return 0; 4152 } 4153 4154 /** 4155 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4156 * 4157 * @adev: amdgpu_device pointer 4158 * 4159 * The list of all the hardware IPs that make up the asic is walked and 4160 * the check_soft_reset callbacks are run. check_soft_reset determines 4161 * if the asic is still hung or not. 4162 * Returns true if any of the IPs are still in a hung state, false if not. 4163 */ 4164 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4165 { 4166 int i; 4167 bool asic_hang = false; 4168 4169 if (amdgpu_sriov_vf(adev)) 4170 return true; 4171 4172 if (amdgpu_asic_need_full_reset(adev)) 4173 return true; 4174 4175 for (i = 0; i < adev->num_ip_blocks; i++) { 4176 if (!adev->ip_blocks[i].status.valid) 4177 continue; 4178 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4179 adev->ip_blocks[i].status.hang = 4180 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4181 if (adev->ip_blocks[i].status.hang) { 4182 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4183 asic_hang = true; 4184 } 4185 } 4186 return asic_hang; 4187 } 4188 4189 /** 4190 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4191 * 4192 * @adev: amdgpu_device pointer 4193 * 4194 * The list of all the hardware IPs that make up the asic is walked and the 4195 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4196 * handles any IP specific hardware or software state changes that are 4197 * necessary for a soft reset to succeed. 4198 * Returns 0 on success, negative error code on failure. 4199 */ 4200 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4201 { 4202 int i, r = 0; 4203 4204 for (i = 0; i < adev->num_ip_blocks; i++) { 4205 if (!adev->ip_blocks[i].status.valid) 4206 continue; 4207 if (adev->ip_blocks[i].status.hang && 4208 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4209 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4210 if (r) 4211 return r; 4212 } 4213 } 4214 4215 return 0; 4216 } 4217 4218 /** 4219 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4220 * 4221 * @adev: amdgpu_device pointer 4222 * 4223 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4224 * reset is necessary to recover. 4225 * Returns true if a full asic reset is required, false if not. 
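 * (a hung GMC, SMC, ACP, DCE or PSP block always requires a full reset).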
4226 */ 4227 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4228 { 4229 int i; 4230 4231 if (amdgpu_asic_need_full_reset(adev)) 4232 return true; 4233 4234 for (i = 0; i < adev->num_ip_blocks; i++) { 4235 if (!adev->ip_blocks[i].status.valid) 4236 continue; 4237 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4238 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4239 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4240 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4241 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4242 if (adev->ip_blocks[i].status.hang) { 4243 dev_info(adev->dev, "Some block need full reset!\n"); 4244 return true; 4245 } 4246 } 4247 } 4248 return false; 4249 } 4250 4251 /** 4252 * amdgpu_device_ip_soft_reset - do a soft reset 4253 * 4254 * @adev: amdgpu_device pointer 4255 * 4256 * The list of all the hardware IPs that make up the asic is walked and the 4257 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4258 * IP specific hardware or software state changes that are necessary to soft 4259 * reset the IP. 4260 * Returns 0 on success, negative error code on failure. 4261 */ 4262 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4263 { 4264 int i, r = 0; 4265 4266 for (i = 0; i < adev->num_ip_blocks; i++) { 4267 if (!adev->ip_blocks[i].status.valid) 4268 continue; 4269 if (adev->ip_blocks[i].status.hang && 4270 adev->ip_blocks[i].version->funcs->soft_reset) { 4271 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4272 if (r) 4273 return r; 4274 } 4275 } 4276 4277 return 0; 4278 } 4279 4280 /** 4281 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4282 * 4283 * @adev: amdgpu_device pointer 4284 * 4285 * The list of all the hardware IPs that make up the asic is walked and the 4286 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4287 * handles any IP specific hardware or software state changes that are 4288 * necessary after the IP has been soft reset. 4289 * Returns 0 on success, negative error code on failure. 4290 */ 4291 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4292 { 4293 int i, r = 0; 4294 4295 for (i = 0; i < adev->num_ip_blocks; i++) { 4296 if (!adev->ip_blocks[i].status.valid) 4297 continue; 4298 if (adev->ip_blocks[i].status.hang && 4299 adev->ip_blocks[i].version->funcs->post_soft_reset) 4300 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4301 if (r) 4302 return r; 4303 } 4304 4305 return 0; 4306 } 4307 4308 /** 4309 * amdgpu_device_recover_vram - Recover some VRAM contents 4310 * 4311 * @adev: amdgpu_device pointer 4312 * 4313 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4314 * restore things like GPUVM page tables after a GPU reset where 4315 * the contents of VRAM might be lost. 4316 * 4317 * Returns: 4318 * 0 on success, negative error code on failure. 
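 * Only shadows that are still resident in GTT and whose parent BO lives in
 * VRAM are restored; evicted buffers are skipped.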
4319 */ 4320 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4321 { 4322 struct dma_fence *fence = NULL, *next = NULL; 4323 struct amdgpu_bo *shadow; 4324 struct amdgpu_bo_vm *vmbo; 4325 long r = 1, tmo; 4326 4327 if (amdgpu_sriov_runtime(adev)) 4328 tmo = msecs_to_jiffies(8000); 4329 else 4330 tmo = msecs_to_jiffies(100); 4331 4332 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4333 mutex_lock(&adev->shadow_list_lock); 4334 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4335 shadow = &vmbo->bo; 4336 /* No need to recover an evicted BO */ 4337 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4338 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4339 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4340 continue; 4341 4342 r = amdgpu_bo_restore_shadow(shadow, &next); 4343 if (r) 4344 break; 4345 4346 if (fence) { 4347 tmo = dma_fence_wait_timeout(fence, false, tmo); 4348 dma_fence_put(fence); 4349 fence = next; 4350 if (tmo == 0) { 4351 r = -ETIMEDOUT; 4352 break; 4353 } else if (tmo < 0) { 4354 r = tmo; 4355 break; 4356 } 4357 } else { 4358 fence = next; 4359 } 4360 } 4361 mutex_unlock(&adev->shadow_list_lock); 4362 4363 if (fence) 4364 tmo = dma_fence_wait_timeout(fence, false, tmo); 4365 dma_fence_put(fence); 4366 4367 if (r < 0 || tmo <= 0) { 4368 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4369 return -EIO; 4370 } 4371 4372 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4373 return 0; 4374 } 4375 4376 4377 /** 4378 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4379 * 4380 * @adev: amdgpu_device pointer 4381 * @from_hypervisor: request from hypervisor 4382 * 4383 * Do a VF FLR and reinitialize the ASIC. 4384 * Returns 0 on success, negative error code on failure. 4385 */ 4386 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4387 bool from_hypervisor) 4388 { 4389 int r; 4390 struct amdgpu_hive_info *hive = NULL; 4391 int retry_limit = 0; 4392 4393 retry: 4394 amdgpu_amdkfd_pre_reset(adev); 4395 4396 4397 4398 if (from_hypervisor) 4399 r = amdgpu_virt_request_full_gpu(adev, true); 4400 else 4401 r = amdgpu_virt_reset_gpu(adev); 4402 if (r) 4403 return r; 4404 4405 /* Resume IP prior to SMC */ 4406 r = amdgpu_device_ip_reinit_early_sriov(adev); 4407 if (r) 4408 goto error; 4409 4410 amdgpu_virt_init_data_exchange(adev); 4411 4412 r = amdgpu_device_fw_loading(adev); 4413 if (r) 4414 return r; 4415 4416 /* now we are okay to resume SMC/CP/SDMA */ 4417 r = amdgpu_device_ip_reinit_late_sriov(adev); 4418 if (r) 4419 goto error; 4420 4421 hive = amdgpu_get_xgmi_hive(adev); 4422 /* Update PSP FW topology after reset */ 4423 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4424 r = amdgpu_xgmi_update_topology(hive, adev); 4425 4426 if (hive) 4427 amdgpu_put_xgmi_hive(hive); 4428 4429 if (!r) { 4430 amdgpu_irq_gpu_reset_resume_helper(adev); 4431 r = amdgpu_ib_ring_tests(adev); 4432 amdgpu_amdkfd_post_reset(adev); 4433 } 4434 4435 error: 4436 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4437 amdgpu_inc_vram_lost(adev); 4438 r = amdgpu_device_recover_vram(adev); 4439 } 4440 amdgpu_virt_release_full_gpu(adev, true); 4441 4442 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4443 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4444 retry_limit++; 4445 goto retry; 4446 } else 4447 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4448 } 4449 4450 return r; 4451 } 4452 4453 /** 4454 * amdgpu_device_has_job_running - check if there
is any job in mirror list 4455 * 4456 * @adev: amdgpu_device pointer 4457 * 4458 * check if there is any job in mirror list 4459 */ 4460 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4461 { 4462 int i; 4463 struct drm_sched_job *job; 4464 4465 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4466 struct amdgpu_ring *ring = adev->rings[i]; 4467 4468 if (!ring || !ring->sched.thread) 4469 continue; 4470 4471 spin_lock(&ring->sched.job_list_lock); 4472 job = list_first_entry_or_null(&ring->sched.pending_list, 4473 struct drm_sched_job, list); 4474 spin_unlock(&ring->sched.job_list_lock); 4475 if (job) 4476 return true; 4477 } 4478 return false; 4479 } 4480 4481 /** 4482 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4483 * 4484 * @adev: amdgpu_device pointer 4485 * 4486 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4487 * a hung GPU. 4488 */ 4489 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4490 { 4491 if (!amdgpu_device_ip_check_soft_reset(adev)) { 4492 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n"); 4493 return false; 4494 } 4495 4496 if (amdgpu_gpu_recovery == 0) 4497 goto disabled; 4498 4499 if (amdgpu_sriov_vf(adev)) 4500 return true; 4501 4502 if (amdgpu_gpu_recovery == -1) { 4503 switch (adev->asic_type) { 4504 #ifdef CONFIG_DRM_AMDGPU_SI 4505 case CHIP_VERDE: 4506 case CHIP_TAHITI: 4507 case CHIP_PITCAIRN: 4508 case CHIP_OLAND: 4509 case CHIP_HAINAN: 4510 #endif 4511 #ifdef CONFIG_DRM_AMDGPU_CIK 4512 case CHIP_KAVERI: 4513 case CHIP_KABINI: 4514 case CHIP_MULLINS: 4515 #endif 4516 case CHIP_CARRIZO: 4517 case CHIP_STONEY: 4518 case CHIP_CYAN_SKILLFISH: 4519 goto disabled; 4520 default: 4521 break; 4522 } 4523 } 4524 4525 return true; 4526 4527 disabled: 4528 dev_info(adev->dev, "GPU recovery disabled.\n"); 4529 return false; 4530 } 4531 4532 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4533 { 4534 u32 i; 4535 int ret = 0; 4536 4537 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4538 4539 dev_info(adev->dev, "GPU mode1 reset\n"); 4540 4541 /* disable BM */ 4542 pci_clear_master(adev->pdev); 4543 4544 amdgpu_device_cache_pci_state(adev->pdev); 4545 4546 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4547 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4548 ret = amdgpu_dpm_mode1_reset(adev); 4549 } else { 4550 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4551 ret = psp_gpu_reset(adev); 4552 } 4553 4554 if (ret) 4555 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4556 4557 amdgpu_device_load_pci_state(adev->pdev); 4558 4559 /* wait for asic to come out of reset */ 4560 for (i = 0; i < adev->usec_timeout; i++) { 4561 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4562 4563 if (memsize != 0xffffffff) 4564 break; 4565 udelay(1); 4566 } 4567 4568 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4569 return ret; 4570 } 4571 4572 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4573 struct amdgpu_reset_context *reset_context) 4574 { 4575 int i, r = 0; 4576 struct amdgpu_job *job = NULL; 4577 bool need_full_reset = 4578 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4579 4580 if (reset_context->reset_req_dev == adev) 4581 job = reset_context->job; 4582 4583 if (amdgpu_sriov_vf(adev)) { 4584 /* stop the data exchange thread */ 4585 amdgpu_virt_fini_data_exchange(adev); 4586 } 4587 4588 /* block all schedulers and reset given job's ring */ 4589 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4590 struct amdgpu_ring *ring = adev->rings[i]; 4591 
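/* skip rings that were never initialized and have no scheduler thread */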
4592 if (!ring || !ring->sched.thread) 4593 continue; 4594 4595 /*clear job fence from fence drv to avoid force_completion 4596 *leave NULL and vm flush fence in fence drv */ 4597 amdgpu_fence_driver_clear_job_fences(ring); 4598 4599 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4600 amdgpu_fence_driver_force_completion(ring); 4601 } 4602 4603 if (job && job->vm) 4604 drm_sched_increase_karma(&job->base); 4605 4606 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4607 /* If reset handler not implemented, continue; otherwise return */ 4608 if (r == -ENOSYS) 4609 r = 0; 4610 else 4611 return r; 4612 4613 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4614 if (!amdgpu_sriov_vf(adev)) { 4615 4616 if (!need_full_reset) 4617 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4618 4619 if (!need_full_reset) { 4620 amdgpu_device_ip_pre_soft_reset(adev); 4621 r = amdgpu_device_ip_soft_reset(adev); 4622 amdgpu_device_ip_post_soft_reset(adev); 4623 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4624 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4625 need_full_reset = true; 4626 } 4627 } 4628 4629 if (need_full_reset) 4630 r = amdgpu_device_ip_suspend(adev); 4631 if (need_full_reset) 4632 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4633 else 4634 clear_bit(AMDGPU_NEED_FULL_RESET, 4635 &reset_context->flags); 4636 } 4637 4638 return r; 4639 } 4640 4641 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4642 struct amdgpu_reset_context *reset_context) 4643 { 4644 struct amdgpu_device *tmp_adev = NULL; 4645 bool need_full_reset, skip_hw_reset, vram_lost = false; 4646 int r = 0; 4647 4648 /* Try reset handler method first */ 4649 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4650 reset_list); 4651 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4652 /* If reset handler not implemented, continue; otherwise return */ 4653 if (r == -ENOSYS) 4654 r = 0; 4655 else 4656 return r; 4657 4658 /* Reset handler not implemented, use the default method */ 4659 need_full_reset = 4660 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4661 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4662 4663 /* 4664 * ASIC reset has to be done on all XGMI hive nodes ASAP 4665 * to allow proper links negotiation in FW (within 1 sec) 4666 */ 4667 if (!skip_hw_reset && need_full_reset) { 4668 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4669 /* For XGMI run all resets in parallel to speed up the process */ 4670 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4671 tmp_adev->gmc.xgmi.pending_reset = false; 4672 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4673 r = -EALREADY; 4674 } else 4675 r = amdgpu_asic_reset(tmp_adev); 4676 4677 if (r) { 4678 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4679 r, adev_to_drm(tmp_adev)->unique); 4680 break; 4681 } 4682 } 4683 4684 /* For XGMI wait for all resets to complete before proceed */ 4685 if (!r) { 4686 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4687 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4688 flush_work(&tmp_adev->xgmi_reset_work); 4689 r = tmp_adev->asic_reset_res; 4690 if (r) 4691 break; 4692 } 4693 } 4694 } 4695 } 4696 4697 if (!r && amdgpu_ras_intr_triggered()) { 4698 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4699 if (tmp_adev->mmhub.ras && 
tmp_adev->mmhub.ras->ras_block.hw_ops && 4700 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 4701 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 4702 } 4703 4704 amdgpu_ras_intr_cleared(); 4705 } 4706 4707 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4708 if (need_full_reset) { 4709 /* post card */ 4710 r = amdgpu_device_asic_init(tmp_adev); 4711 if (r) { 4712 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4713 } else { 4714 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4715 r = amdgpu_amdkfd_resume_iommu(tmp_adev); 4716 if (r) 4717 goto out; 4718 4719 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4720 if (r) 4721 goto out; 4722 4723 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4724 if (vram_lost) { 4725 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4726 amdgpu_inc_vram_lost(tmp_adev); 4727 } 4728 4729 r = amdgpu_device_fw_loading(tmp_adev); 4730 if (r) 4731 return r; 4732 4733 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4734 if (r) 4735 goto out; 4736 4737 if (vram_lost) 4738 amdgpu_device_fill_reset_magic(tmp_adev); 4739 4740 /* 4741 * Add this ASIC as tracked as reset was already 4742 * complete successfully. 4743 */ 4744 amdgpu_register_gpu_instance(tmp_adev); 4745 4746 if (!reset_context->hive && 4747 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4748 amdgpu_xgmi_add_device(tmp_adev); 4749 4750 r = amdgpu_device_ip_late_init(tmp_adev); 4751 if (r) 4752 goto out; 4753 4754 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 4755 4756 /* 4757 * The GPU enters bad state once faulty pages 4758 * by ECC has reached the threshold, and ras 4759 * recovery is scheduled next. So add one check 4760 * here to break recovery if it indeed exceeds 4761 * bad page threshold, and remind user to 4762 * retire this GPU or setting one bigger 4763 * bad_page_threshold value to fix this once 4764 * probing driver again. 4765 */ 4766 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 4767 /* must succeed. 
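Re-enable RAS now that the ASIC is back up.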
*/ 4768 amdgpu_ras_resume(tmp_adev); 4769 } else { 4770 r = -EINVAL; 4771 goto out; 4772 } 4773 4774 /* Update PSP FW topology after reset */ 4775 if (reset_context->hive && 4776 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4777 r = amdgpu_xgmi_update_topology( 4778 reset_context->hive, tmp_adev); 4779 } 4780 } 4781 4782 out: 4783 if (!r) { 4784 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 4785 r = amdgpu_ib_ring_tests(tmp_adev); 4786 if (r) { 4787 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 4788 need_full_reset = true; 4789 r = -EAGAIN; 4790 goto end; 4791 } 4792 } 4793 4794 if (!r) 4795 r = amdgpu_device_recover_vram(tmp_adev); 4796 else 4797 tmp_adev->asic_reset_res = r; 4798 } 4799 4800 end: 4801 if (need_full_reset) 4802 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4803 else 4804 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4805 return r; 4806 } 4807 4808 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, 4809 struct amdgpu_hive_info *hive) 4810 { 4811 if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0) 4812 return false; 4813 4814 if (hive) { 4815 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock); 4816 } else { 4817 down_write(&adev->reset_sem); 4818 } 4819 4820 switch (amdgpu_asic_reset_method(adev)) { 4821 case AMD_RESET_METHOD_MODE1: 4822 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 4823 break; 4824 case AMD_RESET_METHOD_MODE2: 4825 adev->mp1_state = PP_MP1_STATE_RESET; 4826 break; 4827 default: 4828 adev->mp1_state = PP_MP1_STATE_NONE; 4829 break; 4830 } 4831 4832 return true; 4833 } 4834 4835 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) 4836 { 4837 amdgpu_vf_error_trans_all(adev); 4838 adev->mp1_state = PP_MP1_STATE_NONE; 4839 atomic_set(&adev->in_gpu_reset, 0); 4840 up_write(&adev->reset_sem); 4841 } 4842 4843 /* 4844 * Lock a list of amdgpu devices in a hive safely. If the device is not 4845 * part of a hive with multiple nodes, this behaves like 4846 * amdgpu_device_lock_adev. 4847 * Unlocking does not require a roll back. 4848 */ 4849 static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive) 4850 { 4851 struct amdgpu_device *tmp_adev = NULL; 4852 4853 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 4854 if (!hive) { 4855 dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes"); 4856 return -ENODEV; 4857 } 4858 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 4859 if (!amdgpu_device_lock_adev(tmp_adev, hive)) 4860 goto roll_back; 4861 } 4862 } else if (!amdgpu_device_lock_adev(adev, hive)) 4863 return -EAGAIN; 4864 4865 return 0; 4866 roll_back: 4867 if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) { 4868 /* 4869 * If the lock iteration broke in the middle of a hive, there may be 4870 * a race with another reset in progress, or a hive device may have 4871 * locked up independently. We may or may not be in trouble, so try 4872 * to roll back the locks taken so far and give out a warning about 4873 * the inconsistent state. 4874 */ 4875 dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle.
Rolling back to unlock"); 4876 list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) { 4877 amdgpu_device_unlock_adev(tmp_adev); 4878 } 4879 } 4880 return -EAGAIN; 4881 } 4882 4883 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 4884 { 4885 struct pci_dev *p = NULL; 4886 4887 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4888 adev->pdev->bus->number, 1); 4889 if (p) { 4890 pm_runtime_enable(&(p->dev)); 4891 pm_runtime_resume(&(p->dev)); 4892 } 4893 } 4894 4895 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 4896 { 4897 enum amd_reset_method reset_method; 4898 struct pci_dev *p = NULL; 4899 u64 expires; 4900 4901 /* 4902 * For now, only BACO and mode1 reset are confirmed 4903 * to suffer the audio issue without proper suspended. 4904 */ 4905 reset_method = amdgpu_asic_reset_method(adev); 4906 if ((reset_method != AMD_RESET_METHOD_BACO) && 4907 (reset_method != AMD_RESET_METHOD_MODE1)) 4908 return -EINVAL; 4909 4910 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4911 adev->pdev->bus->number, 1); 4912 if (!p) 4913 return -ENODEV; 4914 4915 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 4916 if (!expires) 4917 /* 4918 * If we cannot get the audio device autosuspend delay, 4919 * a fixed 4S interval will be used. Considering 3S is 4920 * the audio controller default autosuspend delay setting. 4921 * 4S used here is guaranteed to cover that. 4922 */ 4923 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 4924 4925 while (!pm_runtime_status_suspended(&(p->dev))) { 4926 if (!pm_runtime_suspend(&(p->dev))) 4927 break; 4928 4929 if (expires < ktime_get_mono_fast_ns()) { 4930 dev_warn(adev->dev, "failed to suspend display audio\n"); 4931 /* TODO: abort the succeeding gpu reset? */ 4932 return -ETIMEDOUT; 4933 } 4934 } 4935 4936 pm_runtime_disable(&(p->dev)); 4937 4938 return 0; 4939 } 4940 4941 static void amdgpu_device_recheck_guilty_jobs( 4942 struct amdgpu_device *adev, struct list_head *device_list_handle, 4943 struct amdgpu_reset_context *reset_context) 4944 { 4945 int i, r = 0; 4946 4947 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4948 struct amdgpu_ring *ring = adev->rings[i]; 4949 int ret = 0; 4950 struct drm_sched_job *s_job; 4951 4952 if (!ring || !ring->sched.thread) 4953 continue; 4954 4955 s_job = list_first_entry_or_null(&ring->sched.pending_list, 4956 struct drm_sched_job, list); 4957 if (s_job == NULL) 4958 continue; 4959 4960 /* clear job's guilty and depend the folowing step to decide the real one */ 4961 drm_sched_reset_karma(s_job); 4962 /* for the real bad job, it will be resubmitted twice, adding a dma_fence_get 4963 * to make sure fence is balanced */ 4964 dma_fence_get(s_job->s_fence->parent); 4965 drm_sched_resubmit_jobs_ext(&ring->sched, 1); 4966 4967 ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout); 4968 if (ret == 0) { /* timeout */ 4969 DRM_ERROR("Found the real bad job! 
ring:%s, job_id:%llx\n", 4970 ring->sched.name, s_job->id); 4971 4972 /* set guilty */ 4973 drm_sched_increase_karma(s_job); 4974 retry: 4975 /* do hw reset */ 4976 if (amdgpu_sriov_vf(adev)) { 4977 amdgpu_virt_fini_data_exchange(adev); 4978 r = amdgpu_device_reset_sriov(adev, false); 4979 if (r) 4980 adev->asic_reset_res = r; 4981 } else { 4982 clear_bit(AMDGPU_SKIP_HW_RESET, 4983 &reset_context->flags); 4984 r = amdgpu_do_asic_reset(device_list_handle, 4985 reset_context); 4986 if (r && r == -EAGAIN) 4987 goto retry; 4988 } 4989 4990 /* 4991 * add reset counter so that the following 4992 * resubmitted job could flush vmid 4993 */ 4994 atomic_inc(&adev->gpu_reset_counter); 4995 continue; 4996 } 4997 4998 /* got the hw fence, signal finished fence */ 4999 atomic_dec(ring->sched.score); 5000 dma_fence_put(s_job->s_fence->parent); 5001 dma_fence_get(&s_job->s_fence->finished); 5002 dma_fence_signal(&s_job->s_fence->finished); 5003 dma_fence_put(&s_job->s_fence->finished); 5004 5005 /* remove node from list and free the job */ 5006 spin_lock(&ring->sched.job_list_lock); 5007 list_del_init(&s_job->list); 5008 spin_unlock(&ring->sched.job_list_lock); 5009 ring->sched.ops->free_job(s_job); 5010 } 5011 } 5012 5013 /** 5014 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5015 * 5016 * @adev: amdgpu_device pointer 5017 * @job: which job trigger hang 5018 * 5019 * Attempt to reset the GPU if it has hung (all asics). 5020 * Attempt to do soft-reset or full-reset and reinitialize Asic 5021 * Returns 0 for success or an error on failure. 5022 */ 5023 5024 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5025 struct amdgpu_job *job) 5026 { 5027 struct list_head device_list, *device_list_handle = NULL; 5028 bool job_signaled = false; 5029 struct amdgpu_hive_info *hive = NULL; 5030 struct amdgpu_device *tmp_adev = NULL; 5031 int i, r = 0; 5032 bool need_emergency_restart = false; 5033 bool audio_suspended = false; 5034 int tmp_vram_lost_counter; 5035 struct amdgpu_reset_context reset_context; 5036 5037 memset(&reset_context, 0, sizeof(reset_context)); 5038 5039 /* 5040 * Special case: RAS triggered and full reset isn't supported 5041 */ 5042 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5043 5044 /* 5045 * Flush RAM to disk so that after reboot 5046 * the user can read log and see why the system rebooted. 5047 */ 5048 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5049 DRM_WARN("Emergency reboot."); 5050 5051 ksys_sync_helper(); 5052 emergency_restart(); 5053 } 5054 5055 dev_info(adev->dev, "GPU %s begin!\n", 5056 need_emergency_restart ? "jobs stop":"reset"); 5057 5058 /* 5059 * Here we trylock to avoid chain of resets executing from 5060 * either trigger by jobs on different adevs in XGMI hive or jobs on 5061 * different schedulers for same device while this TO handler is running. 5062 * We always reset all schedulers for device and all devices for XGMI 5063 * hive so that should take care of them too. 5064 */ 5065 if (!amdgpu_sriov_vf(adev)) 5066 hive = amdgpu_get_xgmi_hive(adev); 5067 if (hive) { 5068 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) { 5069 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", 5070 job ? 
job->base.id : -1, hive->hive_id); 5071 amdgpu_put_xgmi_hive(hive); 5072 if (job && job->vm) 5073 drm_sched_increase_karma(&job->base); 5074 return 0; 5075 } 5076 mutex_lock(&hive->hive_lock); 5077 } 5078 5079 reset_context.method = AMD_RESET_METHOD_NONE; 5080 reset_context.reset_req_dev = adev; 5081 reset_context.job = job; 5082 reset_context.hive = hive; 5083 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 5084 5085 /* 5086 * lock the device before we try to operate the linked list 5087 * if didn't get the device lock, don't touch the linked list since 5088 * others may iterating it. 5089 */ 5090 r = amdgpu_device_lock_hive_adev(adev, hive); 5091 if (r) { 5092 dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress", 5093 job ? job->base.id : -1); 5094 5095 /* even we skipped this reset, still need to set the job to guilty */ 5096 if (job && job->vm) 5097 drm_sched_increase_karma(&job->base); 5098 goto skip_recovery; 5099 } 5100 5101 /* 5102 * Build list of devices to reset. 5103 * In case we are in XGMI hive mode, resort the device list 5104 * to put adev in the 1st position. 5105 */ 5106 INIT_LIST_HEAD(&device_list); 5107 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5108 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) 5109 list_add_tail(&tmp_adev->reset_list, &device_list); 5110 if (!list_is_first(&adev->reset_list, &device_list)) 5111 list_rotate_to_front(&adev->reset_list, &device_list); 5112 device_list_handle = &device_list; 5113 } else { 5114 list_add_tail(&adev->reset_list, &device_list); 5115 device_list_handle = &device_list; 5116 } 5117 5118 /* block all schedulers and reset given job's ring */ 5119 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5120 /* 5121 * Try to put the audio codec into suspend state 5122 * before gpu reset started. 5123 * 5124 * Due to the power domain of the graphics device 5125 * is shared with AZ power domain. Without this, 5126 * we may change the audio hardware from behind 5127 * the audio driver's back. That will trigger 5128 * some audio codec errors. 5129 */ 5130 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5131 audio_suspended = true; 5132 5133 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5134 5135 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5136 5137 if (!amdgpu_sriov_vf(tmp_adev)) 5138 amdgpu_amdkfd_pre_reset(tmp_adev); 5139 5140 /* 5141 * Mark these ASICs to be reseted as untracked first 5142 * And add them back after reset completed 5143 */ 5144 amdgpu_unregister_gpu_instance(tmp_adev); 5145 5146 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 5147 5148 /* disable ras on ALL IPs */ 5149 if (!need_emergency_restart && 5150 amdgpu_device_ip_need_full_reset(tmp_adev)) 5151 amdgpu_ras_suspend(tmp_adev); 5152 5153 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5154 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5155 5156 if (!ring || !ring->sched.thread) 5157 continue; 5158 5159 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5160 5161 if (need_emergency_restart) 5162 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5163 } 5164 atomic_inc(&tmp_adev->gpu_reset_counter); 5165 } 5166 5167 if (need_emergency_restart) 5168 goto skip_sched_resume; 5169 5170 /* 5171 * Must check guilty signal here since after this point all old 5172 * HW fences are force signaled. 
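 * If it already signaled, the hang cleared on its own and the HW reset
 * below can be skipped.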
5173 * 5174 * job->base holds a reference to parent fence 5175 */ 5176 if (job && job->base.s_fence->parent && 5177 dma_fence_is_signaled(job->base.s_fence->parent)) { 5178 job_signaled = true; 5179 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5180 goto skip_hw_reset; 5181 } 5182 5183 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5184 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5185 r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context); 5186 /*TODO Should we stop ?*/ 5187 if (r) { 5188 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5189 r, adev_to_drm(tmp_adev)->unique); 5190 tmp_adev->asic_reset_res = r; 5191 } 5192 } 5193 5194 tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter)); 5195 /* Actual ASIC resets if needed.*/ 5196 /* Host driver will handle XGMI hive reset for SRIOV */ 5197 if (amdgpu_sriov_vf(adev)) { 5198 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5199 if (r) 5200 adev->asic_reset_res = r; 5201 } else { 5202 r = amdgpu_do_asic_reset(device_list_handle, &reset_context); 5203 if (r && r == -EAGAIN) 5204 goto retry; 5205 } 5206 5207 skip_hw_reset: 5208 5209 /* Post ASIC reset for all devs .*/ 5210 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5211 5212 /* 5213 * Sometimes a later bad compute job can block a good gfx job as gfx 5214 * and compute ring share internal GC HW mutually. We add an additional 5215 * guilty jobs recheck step to find the real guilty job, it synchronously 5216 * submits and pends for the first job being signaled. If it gets timeout, 5217 * we identify it as a real guilty job. 5218 */ 5219 if (amdgpu_gpu_recovery == 2 && 5220 !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter))) 5221 amdgpu_device_recheck_guilty_jobs( 5222 tmp_adev, device_list_handle, &reset_context); 5223 5224 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5225 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5226 5227 if (!ring || !ring->sched.thread) 5228 continue; 5229 5230 /* No point to resubmit jobs if we didn't HW reset*/ 5231 if (!tmp_adev->asic_reset_res && !job_signaled) 5232 drm_sched_resubmit_jobs(&ring->sched); 5233 5234 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); 5235 } 5236 5237 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) { 5238 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5239 } 5240 5241 if (tmp_adev->asic_reset_res) 5242 r = tmp_adev->asic_reset_res; 5243 5244 tmp_adev->asic_reset_res = 0; 5245 5246 if (r) { 5247 /* bad news, how to tell it to userspace ? 
*/ 5248 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5249 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5250 } else { 5251 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5252 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5253 DRM_WARN("smart shift update failed\n"); 5254 } 5255 } 5256 5257 skip_sched_resume: 5258 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5259 /* unlock kfd: SRIOV would do it separately */ 5260 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5261 amdgpu_amdkfd_post_reset(tmp_adev); 5262 5263 /* kfd_post_reset will do nothing if the kfd device is not initialized, 5264 * so bring up kfd here if it was not initialized before 5265 */ 5266 if (!adev->kfd.init_complete) 5267 amdgpu_amdkfd_device_init(adev); 5268 5269 if (audio_suspended) 5270 amdgpu_device_resume_display_audio(tmp_adev); 5271 amdgpu_device_unlock_adev(tmp_adev); 5272 } 5273 5274 skip_recovery: 5275 if (hive) { 5276 atomic_set(&hive->in_reset, 0); 5277 mutex_unlock(&hive->hive_lock); 5278 amdgpu_put_xgmi_hive(hive); 5279 } 5280 5281 if (r && r != -EAGAIN) 5282 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5283 return r; 5284 } 5285 5286 /** 5287 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot 5288 * 5289 * @adev: amdgpu_device pointer 5290 * 5291 * Fetches and stores in the driver the PCIe capabilities (gen speed 5292 * and lanes) of the slot the device is in. Handles APUs and 5293 * virtualized environments where PCIe config space may not be available. 5294 */ 5295 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5296 { 5297 struct pci_dev *pdev; 5298 enum pci_bus_speed speed_cap, platform_speed_cap; 5299 enum pcie_link_width platform_link_width; 5300 5301 if (amdgpu_pcie_gen_cap) 5302 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5303 5304 if (amdgpu_pcie_lane_cap) 5305 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5306 5307 /* covers APUs as well */ 5308 if (pci_is_root_bus(adev->pdev->bus)) { 5309 if (adev->pm.pcie_gen_mask == 0) 5310 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5311 if (adev->pm.pcie_mlw_mask == 0) 5312 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5313 return; 5314 } 5315 5316 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5317 return; 5318 5319 pcie_bandwidth_available(adev->pdev, NULL, 5320 &platform_speed_cap, &platform_link_width); 5321 5322 if (adev->pm.pcie_gen_mask == 0) { 5323 /* asic caps */ 5324 pdev = adev->pdev; 5325 speed_cap = pcie_get_speed_cap(pdev); 5326 if (speed_cap == PCI_SPEED_UNKNOWN) { 5327 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5328 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5329 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5330 } else { 5331 if (speed_cap == PCIE_SPEED_32_0GT) 5332 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5333 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5334 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5335 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5336 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5337 else if (speed_cap == PCIE_SPEED_16_0GT) 5338 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5339 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5340 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5341 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5342 else if (speed_cap == PCIE_SPEED_8_0GT) 5343 adev->pm.pcie_gen_mask |=
(CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5344 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5345 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5346 else if (speed_cap == PCIE_SPEED_5_0GT) 5347 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5348 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5349 else 5350 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5351 } 5352 /* platform caps */ 5353 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5354 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5355 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5356 } else { 5357 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5358 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5359 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5360 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5361 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5362 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5363 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5364 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5365 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5366 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5367 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5368 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5369 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5370 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5371 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5372 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5373 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5374 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5375 else 5376 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5377 5378 } 5379 } 5380 if (adev->pm.pcie_mlw_mask == 0) { 5381 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 5382 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5383 } else { 5384 switch (platform_link_width) { 5385 case PCIE_LNK_X32: 5386 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5387 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5388 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5389 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5390 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5391 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5392 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5393 break; 5394 case PCIE_LNK_X16: 5395 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5396 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5397 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5398 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5399 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5400 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5401 break; 5402 case PCIE_LNK_X12: 5403 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5404 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5405 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5406 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5407 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5408 break; 5409 case PCIE_LNK_X8: 5410 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5411 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5412 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5413 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5414 break; 5415 case PCIE_LNK_X4: 5416 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5417 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5418 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5419 break; 5420 case PCIE_LNK_X2: 5421 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5422 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5423 break; 5424 case PCIE_LNK_X1: 5425 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5426 break; 5427 default: 5428 break; 5429 } 5430 } 5431 } 5432 } 5433 5434 int amdgpu_device_baco_enter(struct drm_device *dev) 5435 { 5436 struct amdgpu_device *adev = drm_to_adev(dev); 5437 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5438 
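/* BACO is only attempted on boards that advertise support for it */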
5439 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5440 return -ENOTSUPP; 5441 5442 if (ras && adev->ras_enabled && 5443 adev->nbio.funcs->enable_doorbell_interrupt) 5444 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5445 5446 return amdgpu_dpm_baco_enter(adev); 5447 } 5448 5449 int amdgpu_device_baco_exit(struct drm_device *dev) 5450 { 5451 struct amdgpu_device *adev = drm_to_adev(dev); 5452 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5453 int ret = 0; 5454 5455 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5456 return -ENOTSUPP; 5457 5458 ret = amdgpu_dpm_baco_exit(adev); 5459 if (ret) 5460 return ret; 5461 5462 if (ras && adev->ras_enabled && 5463 adev->nbio.funcs->enable_doorbell_interrupt) 5464 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5465 5466 if (amdgpu_passthrough(adev) && 5467 adev->nbio.funcs->clear_doorbell_interrupt) 5468 adev->nbio.funcs->clear_doorbell_interrupt(adev); 5469 5470 return 0; 5471 } 5472 5473 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev) 5474 { 5475 int i; 5476 5477 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5478 struct amdgpu_ring *ring = adev->rings[i]; 5479 5480 if (!ring || !ring->sched.thread) 5481 continue; 5482 5483 cancel_delayed_work_sync(&ring->sched.work_tdr); 5484 } 5485 } 5486 5487 /** 5488 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5489 * @pdev: PCI device struct 5490 * @state: PCI channel state 5491 * 5492 * Description: Called when a PCI error is detected. 5493 * 5494 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 5495 */ 5496 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 5497 { 5498 struct drm_device *dev = pci_get_drvdata(pdev); 5499 struct amdgpu_device *adev = drm_to_adev(dev); 5500 int i; 5501 5502 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 5503 5504 if (adev->gmc.xgmi.num_physical_nodes > 1) { 5505 DRM_WARN("No support for XGMI hive yet..."); 5506 return PCI_ERS_RESULT_DISCONNECT; 5507 } 5508 5509 adev->pci_channel_state = state; 5510 5511 switch (state) { 5512 case pci_channel_io_normal: 5513 return PCI_ERS_RESULT_CAN_RECOVER; 5514 /* Fatal error, prepare for slot reset */ 5515 case pci_channel_io_frozen: 5516 /* 5517 * Cancel and wait for all TDRs in progress if failing to 5518 * set adev->in_gpu_reset in amdgpu_device_lock_adev 5519 * 5520 * Locking adev->reset_sem will prevent any external access 5521 * to GPU during PCI error recovery 5522 */ 5523 while (!amdgpu_device_lock_adev(adev, NULL)) 5524 amdgpu_cancel_all_tdr(adev); 5525 5526 /* 5527 * Block any work scheduling as we do for regular GPU reset 5528 * for the duration of the recovery 5529 */ 5530 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5531 struct amdgpu_ring *ring = adev->rings[i]; 5532 5533 if (!ring || !ring->sched.thread) 5534 continue; 5535 5536 drm_sched_stop(&ring->sched, NULL); 5537 } 5538 atomic_inc(&adev->gpu_reset_counter); 5539 return PCI_ERS_RESULT_NEED_RESET; 5540 case pci_channel_io_perm_failure: 5541 /* Permanent error, prepare for device removal */ 5542 return PCI_ERS_RESULT_DISCONNECT; 5543 } 5544 5545 return PCI_ERS_RESULT_NEED_RESET; 5546 } 5547 5548 /** 5549 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 5550 * @pdev: pointer to PCI device 5551 */ 5552 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 5553 { 5554 5555 DRM_INFO("PCI error: mmio enabled callback!!\n"); 5556 5557 /* TODO - dump whatever for debugging purposes */ 5558 5559 /* This called only 
if amdgpu_pci_error_detected returns 5560 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 5561 * works, no need to reset slot. 5562 */ 5563 5564 return PCI_ERS_RESULT_RECOVERED; 5565 } 5566 5567 /** 5568 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 5569 * @pdev: PCI device struct 5570 * 5571 * Description: This routine is called by the pci error recovery 5572 * code after the PCI slot has been reset, just before we 5573 * should resume normal operations. 5574 */ 5575 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 5576 { 5577 struct drm_device *dev = pci_get_drvdata(pdev); 5578 struct amdgpu_device *adev = drm_to_adev(dev); 5579 int r, i; 5580 struct amdgpu_reset_context reset_context; 5581 u32 memsize; 5582 struct list_head device_list; 5583 5584 DRM_INFO("PCI error: slot reset callback!!\n"); 5585 5586 memset(&reset_context, 0, sizeof(reset_context)); 5587 5588 INIT_LIST_HEAD(&device_list); 5589 list_add_tail(&adev->reset_list, &device_list); 5590 5591 /* wait for asic to come out of reset */ 5592 msleep(500); 5593 5594 /* Restore PCI confspace */ 5595 amdgpu_device_load_pci_state(pdev); 5596 5597 /* confirm ASIC came out of reset */ 5598 for (i = 0; i < adev->usec_timeout; i++) { 5599 memsize = amdgpu_asic_get_config_memsize(adev); 5600 5601 if (memsize != 0xffffffff) 5602 break; 5603 udelay(1); 5604 } 5605 if (memsize == 0xffffffff) { 5606 r = -ETIME; 5607 goto out; 5608 } 5609 5610 reset_context.method = AMD_RESET_METHOD_NONE; 5611 reset_context.reset_req_dev = adev; 5612 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 5613 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 5614 5615 adev->no_hw_access = true; 5616 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 5617 adev->no_hw_access = false; 5618 if (r) 5619 goto out; 5620 5621 r = amdgpu_do_asic_reset(&device_list, &reset_context); 5622 5623 out: 5624 if (!r) { 5625 if (amdgpu_device_cache_pci_state(adev->pdev)) 5626 pci_restore_state(adev->pdev); 5627 5628 DRM_INFO("PCIe error recovery succeeded\n"); 5629 } else { 5630 DRM_ERROR("PCIe error recovery failed, err:%d", r); 5631 amdgpu_device_unlock_adev(adev); 5632 } 5633 5634 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 5635 } 5636 5637 /** 5638 * amdgpu_pci_resume() - resume normal ops after PCI reset 5639 * @pdev: pointer to PCI device 5640 * 5641 * Called when the error recovery driver tells us that its 5642 * OK to resume normal operation. 
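 * Restarts the schedulers that were stopped in amdgpu_pci_error_detected()
 * and drops the reset lock taken there.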
bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
			     struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if (adev->flags & AMD_IS_APU)
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
				  struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if (adev->flags & AMD_IS_APU)
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}
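/*
 * Illustrative sketch only: the usual pairing of the two HDP helpers
 * above around CPU accesses to VRAM that the GPU also touches. The
 * helper name and the vram_cpu_addr/value parameters are hypothetical;
 * "ring" may be NULL, in which case the flush falls back to the ASIC
 * callback instead of being emitted as a ring packet.
 */
static void amdgpu_example_hdp_usage(struct amdgpu_device *adev,
				     struct amdgpu_ring *ring,
				     void __iomem *vram_cpu_addr, u32 value)
{
	/* CPU write that the GPU must observe: write, then flush HDP */
	writel(value, vram_cpu_addr);
	amdgpu_device_flush_hdp(adev, ring);

	/* before the CPU reads back data produced by the GPU: invalidate HDP */
	amdgpu_device_invalidate_hdp(adev, ring);
	(void)readl(vram_cpu_addr);
}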
/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring the hardware to some kind of halt state so that no one can touch it
 * any more. This helps preserve the error context when an error occurs.
 * Compared to a simple hang, the system stays stable at least for SSH
 * access, so it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs,
 * etc.), clears all CPU mappings to the device, and disallows remappings
 * through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 * flush any in-flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
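/*
 * Illustrative sketch only: a read-modify-write helper built on the two
 * indirect (index/data) PCIe port accessors above, showing how they are
 * meant to be combined. The helper name and its mask/value parameters
 * are hypothetical, not part of the driver.
 */
static void amdgpu_example_pcie_port_rmw(struct amdgpu_device *adev,
					 u32 reg, u32 mask, u32 value)
{
	u32 tmp = amdgpu_device_pcie_port_rreg(adev, reg);

	tmp = (tmp & ~mask) | (value & mask);
	amdgpu_device_pcie_port_wreg(adev, reg, tmp);
}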