1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 34 #include <drm/drm_atomic_helper.h> 35 #include <drm/drm_probe_helper.h> 36 #include <drm/amdgpu_drm.h> 37 #include <linux/vgaarb.h> 38 #include <linux/vga_switcheroo.h> 39 #include <linux/efi.h> 40 #include "amdgpu.h" 41 #include "amdgpu_trace.h" 42 #include "amdgpu_i2c.h" 43 #include "atom.h" 44 #include "amdgpu_atombios.h" 45 #include "amdgpu_atomfirmware.h" 46 #include "amd_pcie.h" 47 #ifdef CONFIG_DRM_AMDGPU_SI 48 #include "si.h" 49 #endif 50 #ifdef CONFIG_DRM_AMDGPU_CIK 51 #include "cik.h" 52 #endif 53 #include "vi.h" 54 #include "soc15.h" 55 #include "nv.h" 56 #include "bif/bif_4_1_d.h" 57 #include <linux/pci.h> 58 #include <linux/firmware.h> 59 #include "amdgpu_vf_error.h" 60 61 #include "amdgpu_amdkfd.h" 62 #include "amdgpu_pm.h" 63 64 #include "amdgpu_xgmi.h" 65 #include "amdgpu_ras.h" 66 #include "amdgpu_pmu.h" 67 #include "amdgpu_fru_eeprom.h" 68 #include "amdgpu_reset.h" 69 70 #include <linux/suspend.h> 71 #include <drm/task_barrier.h> 72 #include <linux/pm_runtime.h> 73 74 #include <drm/drm_drv.h> 75 76 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 77 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 78 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 79 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 80 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 81 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 82 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin"); 83 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin"); 84 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin"); 85 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 86 MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin"); 87 MODULE_FIRMWARE("amdgpu/yellow_carp_gpu_info.bin"); 88 89 #define AMDGPU_RESUME_MS 2000 90 91 const char *amdgpu_asic_name[] = { 92 "TAHITI", 93 "PITCAIRN", 94 "VERDE", 95 "OLAND", 96 "HAINAN", 97 "BONAIRE", 98 "KAVERI", 99 "KABINI", 100 "HAWAII", 101 "MULLINS", 102 "TOPAZ", 103 "TONGA", 104 "FIJI", 105 "CARRIZO", 106 "STONEY", 107 "POLARIS10", 108 "POLARIS11", 109 "POLARIS12", 110 "VEGAM", 111 "VEGA10", 112 "VEGA12", 113 "VEGA20", 114 "RAVEN", 115 "ARCTURUS", 116 "RENOIR", 117 "ALDEBARAN", 118 "NAVI10", 
119 "NAVI14", 120 "NAVI12", 121 "SIENNA_CICHLID", 122 "NAVY_FLOUNDER", 123 "VANGOGH", 124 "DIMGREY_CAVEFISH", 125 "BEIGE_GOBY", 126 "YELLOW_CARP", 127 "LAST", 128 }; 129 130 /** 131 * DOC: pcie_replay_count 132 * 133 * The amdgpu driver provides a sysfs API for reporting the total number 134 * of PCIe replays (NAKs) 135 * The file pcie_replay_count is used for this and returns the total 136 * number of replays as a sum of the NAKs generated and NAKs received 137 */ 138 139 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 140 struct device_attribute *attr, char *buf) 141 { 142 struct drm_device *ddev = dev_get_drvdata(dev); 143 struct amdgpu_device *adev = drm_to_adev(ddev); 144 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 145 146 return sysfs_emit(buf, "%llu\n", cnt); 147 } 148 149 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 150 amdgpu_device_get_pcie_replay_count, NULL); 151 152 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 153 154 /** 155 * DOC: product_name 156 * 157 * The amdgpu driver provides a sysfs API for reporting the product name 158 * for the device 159 * The file serial_number is used for this and returns the product name 160 * as returned from the FRU. 161 * NOTE: This is only available for certain server cards 162 */ 163 164 static ssize_t amdgpu_device_get_product_name(struct device *dev, 165 struct device_attribute *attr, char *buf) 166 { 167 struct drm_device *ddev = dev_get_drvdata(dev); 168 struct amdgpu_device *adev = drm_to_adev(ddev); 169 170 return sysfs_emit(buf, "%s\n", adev->product_name); 171 } 172 173 static DEVICE_ATTR(product_name, S_IRUGO, 174 amdgpu_device_get_product_name, NULL); 175 176 /** 177 * DOC: product_number 178 * 179 * The amdgpu driver provides a sysfs API for reporting the part number 180 * for the device 181 * The file serial_number is used for this and returns the part number 182 * as returned from the FRU. 183 * NOTE: This is only available for certain server cards 184 */ 185 186 static ssize_t amdgpu_device_get_product_number(struct device *dev, 187 struct device_attribute *attr, char *buf) 188 { 189 struct drm_device *ddev = dev_get_drvdata(dev); 190 struct amdgpu_device *adev = drm_to_adev(ddev); 191 192 return sysfs_emit(buf, "%s\n", adev->product_number); 193 } 194 195 static DEVICE_ATTR(product_number, S_IRUGO, 196 amdgpu_device_get_product_number, NULL); 197 198 /** 199 * DOC: serial_number 200 * 201 * The amdgpu driver provides a sysfs API for reporting the serial number 202 * for the device 203 * The file serial_number is used for this and returns the serial number 204 * as returned from the FRU. 205 * NOTE: This is only available for certain server cards 206 */ 207 208 static ssize_t amdgpu_device_get_serial_number(struct device *dev, 209 struct device_attribute *attr, char *buf) 210 { 211 struct drm_device *ddev = dev_get_drvdata(dev); 212 struct amdgpu_device *adev = drm_to_adev(ddev); 213 214 return sysfs_emit(buf, "%s\n", adev->serial); 215 } 216 217 static DEVICE_ATTR(serial_number, S_IRUGO, 218 amdgpu_device_get_serial_number, NULL); 219 220 /** 221 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 222 * 223 * @dev: drm_device pointer 224 * 225 * Returns true if the device is a dGPU with ATPX power control, 226 * otherwise return false. 
227 */ 228 bool amdgpu_device_supports_px(struct drm_device *dev) 229 { 230 struct amdgpu_device *adev = drm_to_adev(dev); 231 232 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 233 return true; 234 return false; 235 } 236 237 /** 238 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 239 * 240 * @dev: drm_device pointer 241 * 242 * Returns true if the device is a dGPU with ACPI power control, 243 * otherwise return false. 244 */ 245 bool amdgpu_device_supports_boco(struct drm_device *dev) 246 { 247 struct amdgpu_device *adev = drm_to_adev(dev); 248 249 if (adev->has_pr3 || 250 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 251 return true; 252 return false; 253 } 254 255 /** 256 * amdgpu_device_supports_baco - Does the device support BACO 257 * 258 * @dev: drm_device pointer 259 * 260 * Returns true if the device supporte BACO, 261 * otherwise return false. 262 */ 263 bool amdgpu_device_supports_baco(struct drm_device *dev) 264 { 265 struct amdgpu_device *adev = drm_to_adev(dev); 266 267 return amdgpu_asic_supports_baco(adev); 268 } 269 270 /** 271 * amdgpu_device_supports_smart_shift - Is the device dGPU with 272 * smart shift support 273 * 274 * @dev: drm_device pointer 275 * 276 * Returns true if the device is a dGPU with Smart Shift support, 277 * otherwise returns false. 278 */ 279 bool amdgpu_device_supports_smart_shift(struct drm_device *dev) 280 { 281 return (amdgpu_device_supports_boco(dev) && 282 amdgpu_acpi_is_power_shift_control_supported()); 283 } 284 285 /* 286 * VRAM access helper functions 287 */ 288 289 /** 290 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA 291 * 292 * @adev: amdgpu_device pointer 293 * @pos: offset of the buffer in vram 294 * @buf: virtual address of the buffer in system memory 295 * @size: read/write size, sizeof(@buf) must > @size 296 * @write: true - write to vram, otherwise - read from vram 297 */ 298 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos, 299 void *buf, size_t size, bool write) 300 { 301 unsigned long flags; 302 uint32_t hi = ~0, tmp = 0; 303 uint32_t *data = buf; 304 uint64_t last; 305 int idx; 306 307 if (!drm_dev_enter(&adev->ddev, &idx)) 308 return; 309 310 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4)); 311 312 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 313 for (last = pos + size; pos < last; pos += 4) { 314 tmp = pos >> 31; 315 316 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); 317 if (tmp != hi) { 318 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp); 319 hi = tmp; 320 } 321 if (write) 322 WREG32_NO_KIQ(mmMM_DATA, *data++); 323 else 324 *data++ = RREG32_NO_KIQ(mmMM_DATA); 325 } 326 327 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 328 drm_dev_exit(idx); 329 } 330 331 /** 332 * amdgpu_device_vram_access - access vram by vram aperature 333 * 334 * @adev: amdgpu_device pointer 335 * @pos: offset of the buffer in vram 336 * @buf: virtual address of the buffer in system memory 337 * @size: read/write size, sizeof(@buf) must > @size 338 * @write: true - write to vram, otherwise - read from vram 339 * 340 * The return value means how many bytes have been transferred. 
341 */ 342 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos, 343 void *buf, size_t size, bool write) 344 { 345 #ifdef CONFIG_64BIT 346 void __iomem *addr; 347 size_t count = 0; 348 uint64_t last; 349 350 if (!adev->mman.aper_base_kaddr) 351 return 0; 352 353 last = min(pos + size, adev->gmc.visible_vram_size); 354 if (last > pos) { 355 addr = adev->mman.aper_base_kaddr + pos; 356 count = last - pos; 357 358 if (write) { 359 memcpy_toio(addr, buf, count); 360 mb(); 361 amdgpu_device_flush_hdp(adev, NULL); 362 } else { 363 amdgpu_device_invalidate_hdp(adev, NULL); 364 mb(); 365 memcpy_fromio(buf, addr, count); 366 } 367 368 } 369 370 return count; 371 #else 372 return 0; 373 #endif 374 } 375 376 /** 377 * amdgpu_device_vram_access - read/write a buffer in vram 378 * 379 * @adev: amdgpu_device pointer 380 * @pos: offset of the buffer in vram 381 * @buf: virtual address of the buffer in system memory 382 * @size: read/write size, sizeof(@buf) must > @size 383 * @write: true - write to vram, otherwise - read from vram 384 */ 385 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, 386 void *buf, size_t size, bool write) 387 { 388 size_t count; 389 390 /* try to using vram apreature to access vram first */ 391 count = amdgpu_device_aper_access(adev, pos, buf, size, write); 392 size -= count; 393 if (size) { 394 /* using MM to access rest vram */ 395 pos += count; 396 buf += count; 397 amdgpu_device_mm_access(adev, pos, buf, size, write); 398 } 399 } 400 401 /* 402 * register access helper functions. 403 */ 404 405 /* Check if hw access should be skipped because of hotplug or device error */ 406 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev) 407 { 408 if (adev->no_hw_access) 409 return true; 410 411 #ifdef CONFIG_LOCKDEP 412 /* 413 * This is a bit complicated to understand, so worth a comment. What we assert 414 * here is that the GPU reset is not running on another thread in parallel. 415 * 416 * For this we trylock the read side of the reset semaphore, if that succeeds 417 * we know that the reset is not running in paralell. 418 * 419 * If the trylock fails we assert that we are either already holding the read 420 * side of the lock or are the reset thread itself and hold the write side of 421 * the lock. 422 */ 423 if (in_task()) { 424 if (down_read_trylock(&adev->reset_sem)) 425 up_read(&adev->reset_sem); 426 else 427 lockdep_assert_held(&adev->reset_sem); 428 } 429 #endif 430 return false; 431 } 432 433 /** 434 * amdgpu_device_rreg - read a memory mapped IO or indirect register 435 * 436 * @adev: amdgpu_device pointer 437 * @reg: dword aligned register offset 438 * @acc_flags: access flags which require special behavior 439 * 440 * Returns the 32 bit value from the offset specified. 
441 */ 442 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, 443 uint32_t reg, uint32_t acc_flags) 444 { 445 uint32_t ret; 446 447 if (amdgpu_device_skip_hw_access(adev)) 448 return 0; 449 450 if ((reg * 4) < adev->rmmio_size) { 451 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 452 amdgpu_sriov_runtime(adev) && 453 down_read_trylock(&adev->reset_sem)) { 454 ret = amdgpu_kiq_rreg(adev, reg); 455 up_read(&adev->reset_sem); 456 } else { 457 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 458 } 459 } else { 460 ret = adev->pcie_rreg(adev, reg * 4); 461 } 462 463 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret); 464 465 return ret; 466 } 467 468 /* 469 * MMIO register read with bytes helper functions 470 * @offset:bytes offset from MMIO start 471 * 472 */ 473 474 /** 475 * amdgpu_mm_rreg8 - read a memory mapped IO register 476 * 477 * @adev: amdgpu_device pointer 478 * @offset: byte aligned register offset 479 * 480 * Returns the 8 bit value from the offset specified. 481 */ 482 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) 483 { 484 if (amdgpu_device_skip_hw_access(adev)) 485 return 0; 486 487 if (offset < adev->rmmio_size) 488 return (readb(adev->rmmio + offset)); 489 BUG(); 490 } 491 492 /* 493 * MMIO register write with bytes helper functions 494 * @offset:bytes offset from MMIO start 495 * @value: the value want to be written to the register 496 * 497 */ 498 /** 499 * amdgpu_mm_wreg8 - read a memory mapped IO register 500 * 501 * @adev: amdgpu_device pointer 502 * @offset: byte aligned register offset 503 * @value: 8 bit value to write 504 * 505 * Writes the value specified to the offset specified. 506 */ 507 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 508 { 509 if (amdgpu_device_skip_hw_access(adev)) 510 return; 511 512 if (offset < adev->rmmio_size) 513 writeb(value, adev->rmmio + offset); 514 else 515 BUG(); 516 } 517 518 /** 519 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 520 * 521 * @adev: amdgpu_device pointer 522 * @reg: dword aligned register offset 523 * @v: 32 bit value to write to the register 524 * @acc_flags: access flags which require special behavior 525 * 526 * Writes the value specified to the offset specified. 
527 */ 528 void amdgpu_device_wreg(struct amdgpu_device *adev, 529 uint32_t reg, uint32_t v, 530 uint32_t acc_flags) 531 { 532 if (amdgpu_device_skip_hw_access(adev)) 533 return; 534 535 if ((reg * 4) < adev->rmmio_size) { 536 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 537 amdgpu_sriov_runtime(adev) && 538 down_read_trylock(&adev->reset_sem)) { 539 amdgpu_kiq_wreg(adev, reg, v); 540 up_read(&adev->reset_sem); 541 } else { 542 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 543 } 544 } else { 545 adev->pcie_wreg(adev, reg * 4, v); 546 } 547 548 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 549 } 550 551 /* 552 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range 553 * 554 * this function is invoked only the debugfs register access 555 * */ 556 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 557 uint32_t reg, uint32_t v) 558 { 559 if (amdgpu_device_skip_hw_access(adev)) 560 return; 561 562 if (amdgpu_sriov_fullaccess(adev) && 563 adev->gfx.rlc.funcs && 564 adev->gfx.rlc.funcs->is_rlcg_access_range) { 565 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 566 return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v, 0, 0); 567 } else { 568 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 569 } 570 } 571 572 /** 573 * amdgpu_mm_rdoorbell - read a doorbell dword 574 * 575 * @adev: amdgpu_device pointer 576 * @index: doorbell index 577 * 578 * Returns the value in the doorbell aperture at the 579 * requested doorbell index (CIK). 580 */ 581 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) 582 { 583 if (amdgpu_device_skip_hw_access(adev)) 584 return 0; 585 586 if (index < adev->doorbell.num_doorbells) { 587 return readl(adev->doorbell.ptr + index); 588 } else { 589 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 590 return 0; 591 } 592 } 593 594 /** 595 * amdgpu_mm_wdoorbell - write a doorbell dword 596 * 597 * @adev: amdgpu_device pointer 598 * @index: doorbell index 599 * @v: value to write 600 * 601 * Writes @v to the doorbell aperture at the 602 * requested doorbell index (CIK). 603 */ 604 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) 605 { 606 if (amdgpu_device_skip_hw_access(adev)) 607 return; 608 609 if (index < adev->doorbell.num_doorbells) { 610 writel(v, adev->doorbell.ptr + index); 611 } else { 612 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 613 } 614 } 615 616 /** 617 * amdgpu_mm_rdoorbell64 - read a doorbell Qword 618 * 619 * @adev: amdgpu_device pointer 620 * @index: doorbell index 621 * 622 * Returns the value in the doorbell aperture at the 623 * requested doorbell index (VEGA10+). 624 */ 625 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) 626 { 627 if (amdgpu_device_skip_hw_access(adev)) 628 return 0; 629 630 if (index < adev->doorbell.num_doorbells) { 631 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); 632 } else { 633 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 634 return 0; 635 } 636 } 637 638 /** 639 * amdgpu_mm_wdoorbell64 - write a doorbell Qword 640 * 641 * @adev: amdgpu_device pointer 642 * @index: doorbell index 643 * @v: value to write 644 * 645 * Writes @v to the doorbell aperture at the 646 * requested doorbell index (VEGA10+). 
647 */ 648 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) 649 { 650 if (amdgpu_device_skip_hw_access(adev)) 651 return; 652 653 if (index < adev->doorbell.num_doorbells) { 654 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); 655 } else { 656 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 657 } 658 } 659 660 /** 661 * amdgpu_device_indirect_rreg - read an indirect register 662 * 663 * @adev: amdgpu_device pointer 664 * @pcie_index: mmio register offset 665 * @pcie_data: mmio register offset 666 * @reg_addr: indirect register address to read from 667 * 668 * Returns the value of indirect register @reg_addr 669 */ 670 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 671 u32 pcie_index, u32 pcie_data, 672 u32 reg_addr) 673 { 674 unsigned long flags; 675 u32 r; 676 void __iomem *pcie_index_offset; 677 void __iomem *pcie_data_offset; 678 679 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 680 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 681 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 682 683 writel(reg_addr, pcie_index_offset); 684 readl(pcie_index_offset); 685 r = readl(pcie_data_offset); 686 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 687 688 return r; 689 } 690 691 /** 692 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 693 * 694 * @adev: amdgpu_device pointer 695 * @pcie_index: mmio register offset 696 * @pcie_data: mmio register offset 697 * @reg_addr: indirect register address to read from 698 * 699 * Returns the value of indirect register @reg_addr 700 */ 701 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 702 u32 pcie_index, u32 pcie_data, 703 u32 reg_addr) 704 { 705 unsigned long flags; 706 u64 r; 707 void __iomem *pcie_index_offset; 708 void __iomem *pcie_data_offset; 709 710 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 711 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 712 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 713 714 /* read low 32 bits */ 715 writel(reg_addr, pcie_index_offset); 716 readl(pcie_index_offset); 717 r = readl(pcie_data_offset); 718 /* read high 32 bits */ 719 writel(reg_addr + 4, pcie_index_offset); 720 readl(pcie_index_offset); 721 r |= ((u64)readl(pcie_data_offset) << 32); 722 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 723 724 return r; 725 } 726 727 /** 728 * amdgpu_device_indirect_wreg - write an indirect register address 729 * 730 * @adev: amdgpu_device pointer 731 * @pcie_index: mmio register offset 732 * @pcie_data: mmio register offset 733 * @reg_addr: indirect register offset 734 * @reg_data: indirect register data 735 * 736 */ 737 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 738 u32 pcie_index, u32 pcie_data, 739 u32 reg_addr, u32 reg_data) 740 { 741 unsigned long flags; 742 void __iomem *pcie_index_offset; 743 void __iomem *pcie_data_offset; 744 745 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 746 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 747 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 748 749 writel(reg_addr, pcie_index_offset); 750 readl(pcie_index_offset); 751 writel(reg_data, pcie_data_offset); 752 readl(pcie_data_offset); 753 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 754 } 755 756 /** 757 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 758 * 759 * @adev: amdgpu_device pointer 760 * @pcie_index: mmio register offset 761 * @pcie_data: mmio register 
offset 762 * @reg_addr: indirect register offset 763 * @reg_data: indirect register data 764 * 765 */ 766 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 767 u32 pcie_index, u32 pcie_data, 768 u32 reg_addr, u64 reg_data) 769 { 770 unsigned long flags; 771 void __iomem *pcie_index_offset; 772 void __iomem *pcie_data_offset; 773 774 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 775 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 776 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 777 778 /* write low 32 bits */ 779 writel(reg_addr, pcie_index_offset); 780 readl(pcie_index_offset); 781 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 782 readl(pcie_data_offset); 783 /* write high 32 bits */ 784 writel(reg_addr + 4, pcie_index_offset); 785 readl(pcie_index_offset); 786 writel((u32)(reg_data >> 32), pcie_data_offset); 787 readl(pcie_data_offset); 788 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 789 } 790 791 /** 792 * amdgpu_invalid_rreg - dummy reg read function 793 * 794 * @adev: amdgpu_device pointer 795 * @reg: offset of register 796 * 797 * Dummy register read function. Used for register blocks 798 * that certain asics don't have (all asics). 799 * Returns the value in the register. 800 */ 801 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 802 { 803 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 804 BUG(); 805 return 0; 806 } 807 808 /** 809 * amdgpu_invalid_wreg - dummy reg write function 810 * 811 * @adev: amdgpu_device pointer 812 * @reg: offset of register 813 * @v: value to write to the register 814 * 815 * Dummy register read function. Used for register blocks 816 * that certain asics don't have (all asics). 817 */ 818 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 819 { 820 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 821 reg, v); 822 BUG(); 823 } 824 825 /** 826 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 827 * 828 * @adev: amdgpu_device pointer 829 * @reg: offset of register 830 * 831 * Dummy register read function. Used for register blocks 832 * that certain asics don't have (all asics). 833 * Returns the value in the register. 834 */ 835 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 836 { 837 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 838 BUG(); 839 return 0; 840 } 841 842 /** 843 * amdgpu_invalid_wreg64 - dummy reg write function 844 * 845 * @adev: amdgpu_device pointer 846 * @reg: offset of register 847 * @v: value to write to the register 848 * 849 * Dummy register read function. Used for register blocks 850 * that certain asics don't have (all asics). 851 */ 852 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 853 { 854 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 855 reg, v); 856 BUG(); 857 } 858 859 /** 860 * amdgpu_block_invalid_rreg - dummy reg read function 861 * 862 * @adev: amdgpu_device pointer 863 * @block: offset of instance 864 * @reg: offset of register 865 * 866 * Dummy register read function. Used for register blocks 867 * that certain asics don't have (all asics). 868 * Returns the value in the register. 
869 */ 870 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 871 uint32_t block, uint32_t reg) 872 { 873 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 874 reg, block); 875 BUG(); 876 return 0; 877 } 878 879 /** 880 * amdgpu_block_invalid_wreg - dummy reg write function 881 * 882 * @adev: amdgpu_device pointer 883 * @block: offset of instance 884 * @reg: offset of register 885 * @v: value to write to the register 886 * 887 * Dummy register read function. Used for register blocks 888 * that certain asics don't have (all asics). 889 */ 890 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 891 uint32_t block, 892 uint32_t reg, uint32_t v) 893 { 894 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 895 reg, block, v); 896 BUG(); 897 } 898 899 /** 900 * amdgpu_device_asic_init - Wrapper for atom asic_init 901 * 902 * @adev: amdgpu_device pointer 903 * 904 * Does any asic specific work and then calls atom asic init. 905 */ 906 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 907 { 908 amdgpu_asic_pre_asic_init(adev); 909 910 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 911 } 912 913 /** 914 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page 915 * 916 * @adev: amdgpu_device pointer 917 * 918 * Allocates a scratch page of VRAM for use by various things in the 919 * driver. 920 */ 921 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev) 922 { 923 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, 924 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM, 925 &adev->vram_scratch.robj, 926 &adev->vram_scratch.gpu_addr, 927 (void **)&adev->vram_scratch.ptr); 928 } 929 930 /** 931 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page 932 * 933 * @adev: amdgpu_device pointer 934 * 935 * Frees the VRAM scratch page. 936 */ 937 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev) 938 { 939 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL); 940 } 941 942 /** 943 * amdgpu_device_program_register_sequence - program an array of registers. 944 * 945 * @adev: amdgpu_device pointer 946 * @registers: pointer to the register array 947 * @array_size: size of the register array 948 * 949 * Programs an array or registers with and and or masks. 950 * This is a helper for setting golden registers. 951 */ 952 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 953 const u32 *registers, 954 const u32 array_size) 955 { 956 u32 tmp, reg, and_mask, or_mask; 957 int i; 958 959 if (array_size % 3) 960 return; 961 962 for (i = 0; i < array_size; i +=3) { 963 reg = registers[i + 0]; 964 and_mask = registers[i + 1]; 965 or_mask = registers[i + 2]; 966 967 if (and_mask == 0xffffffff) { 968 tmp = or_mask; 969 } else { 970 tmp = RREG32(reg); 971 tmp &= ~and_mask; 972 if (adev->family >= AMDGPU_FAMILY_AI) 973 tmp |= (or_mask & and_mask); 974 else 975 tmp |= or_mask; 976 } 977 WREG32(reg, tmp); 978 } 979 } 980 981 /** 982 * amdgpu_device_pci_config_reset - reset the GPU 983 * 984 * @adev: amdgpu_device pointer 985 * 986 * Resets the GPU using the pci config reset sequence. 987 * Only applicable to asics prior to vega10. 
988 */ 989 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 990 { 991 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 992 } 993 994 /** 995 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 996 * 997 * @adev: amdgpu_device pointer 998 * 999 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 1000 */ 1001 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1002 { 1003 return pci_reset_function(adev->pdev); 1004 } 1005 1006 /* 1007 * GPU doorbell aperture helpers function. 1008 */ 1009 /** 1010 * amdgpu_device_doorbell_init - Init doorbell driver information. 1011 * 1012 * @adev: amdgpu_device pointer 1013 * 1014 * Init doorbell driver information (CIK) 1015 * Returns 0 on success, error on failure. 1016 */ 1017 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) 1018 { 1019 1020 /* No doorbell on SI hardware generation */ 1021 if (adev->asic_type < CHIP_BONAIRE) { 1022 adev->doorbell.base = 0; 1023 adev->doorbell.size = 0; 1024 adev->doorbell.num_doorbells = 0; 1025 adev->doorbell.ptr = NULL; 1026 return 0; 1027 } 1028 1029 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) 1030 return -EINVAL; 1031 1032 amdgpu_asic_init_doorbell_index(adev); 1033 1034 /* doorbell bar mapping */ 1035 adev->doorbell.base = pci_resource_start(adev->pdev, 2); 1036 adev->doorbell.size = pci_resource_len(adev->pdev, 2); 1037 1038 adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32), 1039 adev->doorbell_index.max_assignment+1); 1040 if (adev->doorbell.num_doorbells == 0) 1041 return -EINVAL; 1042 1043 /* For Vega, reserve and map two pages on doorbell BAR since SDMA 1044 * paging queue doorbell use the second page. The 1045 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the 1046 * doorbells are in the first page. So with paging queue enabled, 1047 * the max num_doorbells should + 1 page (0x400 in dword) 1048 */ 1049 if (adev->asic_type >= CHIP_VEGA10) 1050 adev->doorbell.num_doorbells += 0x400; 1051 1052 adev->doorbell.ptr = ioremap(adev->doorbell.base, 1053 adev->doorbell.num_doorbells * 1054 sizeof(u32)); 1055 if (adev->doorbell.ptr == NULL) 1056 return -ENOMEM; 1057 1058 return 0; 1059 } 1060 1061 /** 1062 * amdgpu_device_doorbell_fini - Tear down doorbell driver information. 1063 * 1064 * @adev: amdgpu_device pointer 1065 * 1066 * Tear down doorbell driver information (CIK) 1067 */ 1068 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) 1069 { 1070 iounmap(adev->doorbell.ptr); 1071 adev->doorbell.ptr = NULL; 1072 } 1073 1074 1075 1076 /* 1077 * amdgpu_device_wb_*() 1078 * Writeback is the method by which the GPU updates special pages in memory 1079 * with the status of certain GPU events (fences, ring pointers,etc.). 1080 */ 1081 1082 /** 1083 * amdgpu_device_wb_fini - Disable Writeback and free memory 1084 * 1085 * @adev: amdgpu_device pointer 1086 * 1087 * Disables Writeback and frees the Writeback memory (all asics). 1088 * Used at driver shutdown. 1089 */ 1090 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1091 { 1092 if (adev->wb.wb_obj) { 1093 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1094 &adev->wb.gpu_addr, 1095 (void **)&adev->wb.wb); 1096 adev->wb.wb_obj = NULL; 1097 } 1098 } 1099 1100 /** 1101 * amdgpu_device_wb_init- Init Writeback driver info and allocate memory 1102 * 1103 * @adev: amdgpu_device pointer 1104 * 1105 * Initializes writeback and allocates writeback memory (all asics). 1106 * Used at driver startup. 
1107 * Returns 0 on success or an -error on failure. 1108 */ 1109 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1110 { 1111 int r; 1112 1113 if (adev->wb.wb_obj == NULL) { 1114 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1115 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1116 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1117 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1118 (void **)&adev->wb.wb); 1119 if (r) { 1120 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1121 return r; 1122 } 1123 1124 adev->wb.num_wb = AMDGPU_MAX_WB; 1125 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1126 1127 /* clear wb memory */ 1128 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1129 } 1130 1131 return 0; 1132 } 1133 1134 /** 1135 * amdgpu_device_wb_get - Allocate a wb entry 1136 * 1137 * @adev: amdgpu_device pointer 1138 * @wb: wb index 1139 * 1140 * Allocate a wb slot for use by the driver (all asics). 1141 * Returns 0 on success or -EINVAL on failure. 1142 */ 1143 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1144 { 1145 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1146 1147 if (offset < adev->wb.num_wb) { 1148 __set_bit(offset, adev->wb.used); 1149 *wb = offset << 3; /* convert to dw offset */ 1150 return 0; 1151 } else { 1152 return -EINVAL; 1153 } 1154 } 1155 1156 /** 1157 * amdgpu_device_wb_free - Free a wb entry 1158 * 1159 * @adev: amdgpu_device pointer 1160 * @wb: wb index 1161 * 1162 * Free a wb slot allocated for use by the driver (all asics) 1163 */ 1164 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1165 { 1166 wb >>= 3; 1167 if (wb < adev->wb.num_wb) 1168 __clear_bit(wb, adev->wb.used); 1169 } 1170 1171 /** 1172 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1173 * 1174 * @adev: amdgpu_device pointer 1175 * 1176 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1177 * to fail, but if any of the BARs is not accessible after the size we abort 1178 * driver loading by returning -ENODEV. 1179 */ 1180 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1181 { 1182 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1183 struct pci_bus *root; 1184 struct resource *res; 1185 unsigned i; 1186 u16 cmd; 1187 int r; 1188 1189 /* Bypass for VF */ 1190 if (amdgpu_sriov_vf(adev)) 1191 return 0; 1192 1193 /* skip if the bios has already enabled large BAR */ 1194 if (adev->gmc.real_vram_size && 1195 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1196 return 0; 1197 1198 /* Check if the root BUS has 64bit memory resources */ 1199 root = adev->pdev->bus; 1200 while (root->parent) 1201 root = root->parent; 1202 1203 pci_bus_for_each_resource(root, res, i) { 1204 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1205 res->start > 0x100000000ull) 1206 break; 1207 } 1208 1209 /* Trying to resize is pointless without a root hub window above 4GB */ 1210 if (!res) 1211 return 0; 1212 1213 /* Limit the BAR size to what is available */ 1214 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1215 rbar_size); 1216 1217 /* Disable memory decoding while we change the BAR addresses and size */ 1218 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1219 pci_write_config_word(adev->pdev, PCI_COMMAND, 1220 cmd & ~PCI_COMMAND_MEMORY); 1221 1222 /* Free the VRAM and doorbell BAR, we most likely need to move both. 
*/ 1223 amdgpu_device_doorbell_fini(adev); 1224 if (adev->asic_type >= CHIP_BONAIRE) 1225 pci_release_resource(adev->pdev, 2); 1226 1227 pci_release_resource(adev->pdev, 0); 1228 1229 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1230 if (r == -ENOSPC) 1231 DRM_INFO("Not enough PCI address space for a large BAR."); 1232 else if (r && r != -ENOTSUPP) 1233 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1234 1235 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1236 1237 /* When the doorbell or fb BAR isn't available we have no chance of 1238 * using the device. 1239 */ 1240 r = amdgpu_device_doorbell_init(adev); 1241 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1242 return -ENODEV; 1243 1244 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1245 1246 return 0; 1247 } 1248 1249 /* 1250 * GPU helpers function. 1251 */ 1252 /** 1253 * amdgpu_device_need_post - check if the hw need post or not 1254 * 1255 * @adev: amdgpu_device pointer 1256 * 1257 * Check if the asic has been initialized (all asics) at driver startup 1258 * or post is needed if hw reset is performed. 1259 * Returns true if need or false if not. 1260 */ 1261 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1262 { 1263 uint32_t reg; 1264 1265 if (amdgpu_sriov_vf(adev)) 1266 return false; 1267 1268 if (amdgpu_passthrough(adev)) { 1269 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1270 * some old smc fw still need driver do vPost otherwise gpu hang, while 1271 * those smc fw version above 22.15 doesn't have this flaw, so we force 1272 * vpost executed for smc version below 22.15 1273 */ 1274 if (adev->asic_type == CHIP_FIJI) { 1275 int err; 1276 uint32_t fw_ver; 1277 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1278 /* force vPost if error occured */ 1279 if (err) 1280 return true; 1281 1282 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1283 if (fw_ver < 0x00160e00) 1284 return true; 1285 } 1286 } 1287 1288 /* Don't post if we need to reset whole hive on init */ 1289 if (adev->gmc.xgmi.pending_reset) 1290 return false; 1291 1292 if (adev->has_hw_reset) { 1293 adev->has_hw_reset = false; 1294 return true; 1295 } 1296 1297 /* bios scratch used on CIK+ */ 1298 if (adev->asic_type >= CHIP_BONAIRE) 1299 return amdgpu_atombios_scratch_need_asic_init(adev); 1300 1301 /* check MEM_SIZE for older asics */ 1302 reg = amdgpu_asic_get_config_memsize(adev); 1303 1304 if ((reg != 0) && (reg != 0xffffffff)) 1305 return false; 1306 1307 return true; 1308 } 1309 1310 /* if we get transitioned to only one device, take VGA back */ 1311 /** 1312 * amdgpu_device_vga_set_decode - enable/disable vga decode 1313 * 1314 * @cookie: amdgpu_device pointer 1315 * @state: enable/disable vga decode 1316 * 1317 * Enable/disable vga decode (all asics). 1318 * Returns VGA resource flags. 1319 */ 1320 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state) 1321 { 1322 struct amdgpu_device *adev = cookie; 1323 amdgpu_asic_set_vga_state(adev, state); 1324 if (state) 1325 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1326 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1327 else 1328 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1329 } 1330 1331 /** 1332 * amdgpu_device_check_block_size - validate the vm block size 1333 * 1334 * @adev: amdgpu_device pointer 1335 * 1336 * Validates the vm block size specified via module parameter. 
1337 * The vm block size defines number of bits in page table versus page directory, 1338 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1339 * page table and the remaining bits are in the page directory. 1340 */ 1341 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1342 { 1343 /* defines number of bits in page table versus page directory, 1344 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1345 * page table and the remaining bits are in the page directory */ 1346 if (amdgpu_vm_block_size == -1) 1347 return; 1348 1349 if (amdgpu_vm_block_size < 9) { 1350 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1351 amdgpu_vm_block_size); 1352 amdgpu_vm_block_size = -1; 1353 } 1354 } 1355 1356 /** 1357 * amdgpu_device_check_vm_size - validate the vm size 1358 * 1359 * @adev: amdgpu_device pointer 1360 * 1361 * Validates the vm size in GB specified via module parameter. 1362 * The VM size is the size of the GPU virtual memory space in GB. 1363 */ 1364 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1365 { 1366 /* no need to check the default value */ 1367 if (amdgpu_vm_size == -1) 1368 return; 1369 1370 if (amdgpu_vm_size < 1) { 1371 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1372 amdgpu_vm_size); 1373 amdgpu_vm_size = -1; 1374 } 1375 } 1376 1377 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1378 { 1379 struct sysinfo si; 1380 bool is_os_64 = (sizeof(void *) == 8); 1381 uint64_t total_memory; 1382 uint64_t dram_size_seven_GB = 0x1B8000000; 1383 uint64_t dram_size_three_GB = 0xB8000000; 1384 1385 if (amdgpu_smu_memory_pool_size == 0) 1386 return; 1387 1388 if (!is_os_64) { 1389 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1390 goto def_value; 1391 } 1392 si_meminfo(&si); 1393 total_memory = (uint64_t)si.totalram * si.mem_unit; 1394 1395 if ((amdgpu_smu_memory_pool_size == 1) || 1396 (amdgpu_smu_memory_pool_size == 2)) { 1397 if (total_memory < dram_size_three_GB) 1398 goto def_value1; 1399 } else if ((amdgpu_smu_memory_pool_size == 4) || 1400 (amdgpu_smu_memory_pool_size == 8)) { 1401 if (total_memory < dram_size_seven_GB) 1402 goto def_value1; 1403 } else { 1404 DRM_WARN("Smu memory pool size not supported\n"); 1405 goto def_value; 1406 } 1407 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1408 1409 return; 1410 1411 def_value1: 1412 DRM_WARN("No enough system memory\n"); 1413 def_value: 1414 adev->pm.smu_prv_buffer_size = 0; 1415 } 1416 1417 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1418 { 1419 if (!(adev->flags & AMD_IS_APU) || 1420 adev->asic_type < CHIP_RAVEN) 1421 return 0; 1422 1423 switch (adev->asic_type) { 1424 case CHIP_RAVEN: 1425 if (adev->pdev->device == 0x15dd) 1426 adev->apu_flags |= AMD_APU_IS_RAVEN; 1427 if (adev->pdev->device == 0x15d8) 1428 adev->apu_flags |= AMD_APU_IS_PICASSO; 1429 break; 1430 case CHIP_RENOIR: 1431 if ((adev->pdev->device == 0x1636) || 1432 (adev->pdev->device == 0x164c)) 1433 adev->apu_flags |= AMD_APU_IS_RENOIR; 1434 else 1435 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1436 break; 1437 case CHIP_VANGOGH: 1438 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1439 break; 1440 case CHIP_YELLOW_CARP: 1441 break; 1442 default: 1443 return -EINVAL; 1444 } 1445 1446 return 0; 1447 } 1448 1449 /** 1450 * amdgpu_device_check_arguments - validate module params 1451 * 1452 * @adev: amdgpu_device pointer 1453 * 1454 * Validates certain module parameters and updates 1455 * the associated 
values used by the driver (all asics). 1456 */ 1457 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1458 { 1459 if (amdgpu_sched_jobs < 4) { 1460 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1461 amdgpu_sched_jobs); 1462 amdgpu_sched_jobs = 4; 1463 } else if (!is_power_of_2(amdgpu_sched_jobs)){ 1464 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1465 amdgpu_sched_jobs); 1466 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1467 } 1468 1469 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1470 /* gart size must be greater or equal to 32M */ 1471 dev_warn(adev->dev, "gart size (%d) too small\n", 1472 amdgpu_gart_size); 1473 amdgpu_gart_size = -1; 1474 } 1475 1476 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1477 /* gtt size must be greater or equal to 32M */ 1478 dev_warn(adev->dev, "gtt size (%d) too small\n", 1479 amdgpu_gtt_size); 1480 amdgpu_gtt_size = -1; 1481 } 1482 1483 /* valid range is between 4 and 9 inclusive */ 1484 if (amdgpu_vm_fragment_size != -1 && 1485 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1486 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1487 amdgpu_vm_fragment_size = -1; 1488 } 1489 1490 if (amdgpu_sched_hw_submission < 2) { 1491 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1492 amdgpu_sched_hw_submission); 1493 amdgpu_sched_hw_submission = 2; 1494 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1495 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1496 amdgpu_sched_hw_submission); 1497 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1498 } 1499 1500 amdgpu_device_check_smu_prv_buffer_size(adev); 1501 1502 amdgpu_device_check_vm_size(adev); 1503 1504 amdgpu_device_check_block_size(adev); 1505 1506 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1507 1508 amdgpu_gmc_tmz_set(adev); 1509 1510 amdgpu_gmc_noretry_set(adev); 1511 1512 return 0; 1513 } 1514 1515 /** 1516 * amdgpu_switcheroo_set_state - set switcheroo state 1517 * 1518 * @pdev: pci dev pointer 1519 * @state: vga_switcheroo state 1520 * 1521 * Callback for the switcheroo driver. Suspends or resumes the 1522 * the asics before or after it is powered up using ACPI methods. 
1523 */ 1524 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1525 enum vga_switcheroo_state state) 1526 { 1527 struct drm_device *dev = pci_get_drvdata(pdev); 1528 int r; 1529 1530 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1531 return; 1532 1533 if (state == VGA_SWITCHEROO_ON) { 1534 pr_info("switched on\n"); 1535 /* don't suspend or resume card normally */ 1536 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1537 1538 pci_set_power_state(pdev, PCI_D0); 1539 amdgpu_device_load_pci_state(pdev); 1540 r = pci_enable_device(pdev); 1541 if (r) 1542 DRM_WARN("pci_enable_device failed (%d)\n", r); 1543 amdgpu_device_resume(dev, true); 1544 1545 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1546 } else { 1547 pr_info("switched off\n"); 1548 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1549 amdgpu_device_suspend(dev, true); 1550 amdgpu_device_cache_pci_state(pdev); 1551 /* Shut down the device */ 1552 pci_disable_device(pdev); 1553 pci_set_power_state(pdev, PCI_D3cold); 1554 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1555 } 1556 } 1557 1558 /** 1559 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1560 * 1561 * @pdev: pci dev pointer 1562 * 1563 * Callback for the switcheroo driver. Check of the switcheroo 1564 * state can be changed. 1565 * Returns true if the state can be changed, false if not. 1566 */ 1567 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1568 { 1569 struct drm_device *dev = pci_get_drvdata(pdev); 1570 1571 /* 1572 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1573 * locking inversion with the driver load path. And the access here is 1574 * completely racy anyway. So don't bother with locking for now. 1575 */ 1576 return atomic_read(&dev->open_count) == 0; 1577 } 1578 1579 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1580 .set_gpu_state = amdgpu_switcheroo_set_state, 1581 .reprobe = NULL, 1582 .can_switch = amdgpu_switcheroo_can_switch, 1583 }; 1584 1585 /** 1586 * amdgpu_device_ip_set_clockgating_state - set the CG state 1587 * 1588 * @dev: amdgpu_device pointer 1589 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1590 * @state: clockgating state (gate or ungate) 1591 * 1592 * Sets the requested clockgating state for all instances of 1593 * the hardware IP specified. 1594 * Returns the error code from the last instance. 1595 */ 1596 int amdgpu_device_ip_set_clockgating_state(void *dev, 1597 enum amd_ip_block_type block_type, 1598 enum amd_clockgating_state state) 1599 { 1600 struct amdgpu_device *adev = dev; 1601 int i, r = 0; 1602 1603 for (i = 0; i < adev->num_ip_blocks; i++) { 1604 if (!adev->ip_blocks[i].status.valid) 1605 continue; 1606 if (adev->ip_blocks[i].version->type != block_type) 1607 continue; 1608 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1609 continue; 1610 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1611 (void *)adev, state); 1612 if (r) 1613 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1614 adev->ip_blocks[i].version->funcs->name, r); 1615 } 1616 return r; 1617 } 1618 1619 /** 1620 * amdgpu_device_ip_set_powergating_state - set the PG state 1621 * 1622 * @dev: amdgpu_device pointer 1623 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1624 * @state: powergating state (gate or ungate) 1625 * 1626 * Sets the requested powergating state for all instances of 1627 * the hardware IP specified. 1628 * Returns the error code from the last instance. 
1629 */ 1630 int amdgpu_device_ip_set_powergating_state(void *dev, 1631 enum amd_ip_block_type block_type, 1632 enum amd_powergating_state state) 1633 { 1634 struct amdgpu_device *adev = dev; 1635 int i, r = 0; 1636 1637 for (i = 0; i < adev->num_ip_blocks; i++) { 1638 if (!adev->ip_blocks[i].status.valid) 1639 continue; 1640 if (adev->ip_blocks[i].version->type != block_type) 1641 continue; 1642 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 1643 continue; 1644 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 1645 (void *)adev, state); 1646 if (r) 1647 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 1648 adev->ip_blocks[i].version->funcs->name, r); 1649 } 1650 return r; 1651 } 1652 1653 /** 1654 * amdgpu_device_ip_get_clockgating_state - get the CG state 1655 * 1656 * @adev: amdgpu_device pointer 1657 * @flags: clockgating feature flags 1658 * 1659 * Walks the list of IPs on the device and updates the clockgating 1660 * flags for each IP. 1661 * Updates @flags with the feature flags for each hardware IP where 1662 * clockgating is enabled. 1663 */ 1664 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 1665 u32 *flags) 1666 { 1667 int i; 1668 1669 for (i = 0; i < adev->num_ip_blocks; i++) { 1670 if (!adev->ip_blocks[i].status.valid) 1671 continue; 1672 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 1673 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 1674 } 1675 } 1676 1677 /** 1678 * amdgpu_device_ip_wait_for_idle - wait for idle 1679 * 1680 * @adev: amdgpu_device pointer 1681 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1682 * 1683 * Waits for the request hardware IP to be idle. 1684 * Returns 0 for success or a negative error code on failure. 1685 */ 1686 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 1687 enum amd_ip_block_type block_type) 1688 { 1689 int i, r; 1690 1691 for (i = 0; i < adev->num_ip_blocks; i++) { 1692 if (!adev->ip_blocks[i].status.valid) 1693 continue; 1694 if (adev->ip_blocks[i].version->type == block_type) { 1695 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 1696 if (r) 1697 return r; 1698 break; 1699 } 1700 } 1701 return 0; 1702 1703 } 1704 1705 /** 1706 * amdgpu_device_ip_is_idle - is the hardware IP idle 1707 * 1708 * @adev: amdgpu_device pointer 1709 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1710 * 1711 * Check if the hardware IP is idle or not. 1712 * Returns true if it the IP is idle, false if not. 1713 */ 1714 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 1715 enum amd_ip_block_type block_type) 1716 { 1717 int i; 1718 1719 for (i = 0; i < adev->num_ip_blocks; i++) { 1720 if (!adev->ip_blocks[i].status.valid) 1721 continue; 1722 if (adev->ip_blocks[i].version->type == block_type) 1723 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 1724 } 1725 return true; 1726 1727 } 1728 1729 /** 1730 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 1731 * 1732 * @adev: amdgpu_device pointer 1733 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 1734 * 1735 * Returns a pointer to the hardware IP block structure 1736 * if it exists for the asic, otherwise NULL. 
1737 */ 1738 struct amdgpu_ip_block * 1739 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 1740 enum amd_ip_block_type type) 1741 { 1742 int i; 1743 1744 for (i = 0; i < adev->num_ip_blocks; i++) 1745 if (adev->ip_blocks[i].version->type == type) 1746 return &adev->ip_blocks[i]; 1747 1748 return NULL; 1749 } 1750 1751 /** 1752 * amdgpu_device_ip_block_version_cmp 1753 * 1754 * @adev: amdgpu_device pointer 1755 * @type: enum amd_ip_block_type 1756 * @major: major version 1757 * @minor: minor version 1758 * 1759 * return 0 if equal or greater 1760 * return 1 if smaller or the ip_block doesn't exist 1761 */ 1762 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 1763 enum amd_ip_block_type type, 1764 u32 major, u32 minor) 1765 { 1766 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 1767 1768 if (ip_block && ((ip_block->version->major > major) || 1769 ((ip_block->version->major == major) && 1770 (ip_block->version->minor >= minor)))) 1771 return 0; 1772 1773 return 1; 1774 } 1775 1776 /** 1777 * amdgpu_device_ip_block_add 1778 * 1779 * @adev: amdgpu_device pointer 1780 * @ip_block_version: pointer to the IP to add 1781 * 1782 * Adds the IP block driver information to the collection of IPs 1783 * on the asic. 1784 */ 1785 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 1786 const struct amdgpu_ip_block_version *ip_block_version) 1787 { 1788 if (!ip_block_version) 1789 return -EINVAL; 1790 1791 switch (ip_block_version->type) { 1792 case AMD_IP_BLOCK_TYPE_VCN: 1793 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 1794 return 0; 1795 break; 1796 case AMD_IP_BLOCK_TYPE_JPEG: 1797 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 1798 return 0; 1799 break; 1800 default: 1801 break; 1802 } 1803 1804 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 1805 ip_block_version->funcs->name); 1806 1807 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 1808 1809 return 0; 1810 } 1811 1812 /** 1813 * amdgpu_device_enable_virtual_display - enable virtual display feature 1814 * 1815 * @adev: amdgpu_device pointer 1816 * 1817 * Enabled the virtual display feature if the user has enabled it via 1818 * the module parameter virtual_display. This feature provides a virtual 1819 * display hardware on headless boards or in virtualized environments. 1820 * This function parses and validates the configuration string specified by 1821 * the user and configues the virtual display configuration (number of 1822 * virtual connectors, crtcs, etc.) specified. 
1823 */ 1824 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 1825 { 1826 adev->enable_virtual_display = false; 1827 1828 if (amdgpu_virtual_display) { 1829 const char *pci_address_name = pci_name(adev->pdev); 1830 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 1831 1832 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 1833 pciaddstr_tmp = pciaddstr; 1834 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 1835 pciaddname = strsep(&pciaddname_tmp, ","); 1836 if (!strcmp("all", pciaddname) 1837 || !strcmp(pci_address_name, pciaddname)) { 1838 long num_crtc; 1839 int res = -1; 1840 1841 adev->enable_virtual_display = true; 1842 1843 if (pciaddname_tmp) 1844 res = kstrtol(pciaddname_tmp, 10, 1845 &num_crtc); 1846 1847 if (!res) { 1848 if (num_crtc < 1) 1849 num_crtc = 1; 1850 if (num_crtc > 6) 1851 num_crtc = 6; 1852 adev->mode_info.num_crtc = num_crtc; 1853 } else { 1854 adev->mode_info.num_crtc = 1; 1855 } 1856 break; 1857 } 1858 } 1859 1860 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 1861 amdgpu_virtual_display, pci_address_name, 1862 adev->enable_virtual_display, adev->mode_info.num_crtc); 1863 1864 kfree(pciaddstr); 1865 } 1866 } 1867 1868 /** 1869 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 1870 * 1871 * @adev: amdgpu_device pointer 1872 * 1873 * Parses the asic configuration parameters specified in the gpu info 1874 * firmware and makes them availale to the driver for use in configuring 1875 * the asic. 1876 * Returns 0 on success, -EINVAL on failure. 1877 */ 1878 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 1879 { 1880 const char *chip_name; 1881 char fw_name[40]; 1882 int err; 1883 const struct gpu_info_firmware_header_v1_0 *hdr; 1884 1885 adev->firmware.gpu_info_fw = NULL; 1886 1887 if (adev->mman.discovery_bin) { 1888 amdgpu_discovery_get_gfx_info(adev); 1889 1890 /* 1891 * FIXME: The bounding box is still needed by Navi12, so 1892 * temporarily read it from gpu_info firmware. Should be droped 1893 * when DAL no longer needs it. 
1894 */ 1895 if (adev->asic_type != CHIP_NAVI12) 1896 return 0; 1897 } 1898 1899 switch (adev->asic_type) { 1900 #ifdef CONFIG_DRM_AMDGPU_SI 1901 case CHIP_VERDE: 1902 case CHIP_TAHITI: 1903 case CHIP_PITCAIRN: 1904 case CHIP_OLAND: 1905 case CHIP_HAINAN: 1906 #endif 1907 #ifdef CONFIG_DRM_AMDGPU_CIK 1908 case CHIP_BONAIRE: 1909 case CHIP_HAWAII: 1910 case CHIP_KAVERI: 1911 case CHIP_KABINI: 1912 case CHIP_MULLINS: 1913 #endif 1914 case CHIP_TOPAZ: 1915 case CHIP_TONGA: 1916 case CHIP_FIJI: 1917 case CHIP_POLARIS10: 1918 case CHIP_POLARIS11: 1919 case CHIP_POLARIS12: 1920 case CHIP_VEGAM: 1921 case CHIP_CARRIZO: 1922 case CHIP_STONEY: 1923 case CHIP_VEGA20: 1924 case CHIP_ALDEBARAN: 1925 case CHIP_SIENNA_CICHLID: 1926 case CHIP_NAVY_FLOUNDER: 1927 case CHIP_DIMGREY_CAVEFISH: 1928 case CHIP_BEIGE_GOBY: 1929 default: 1930 return 0; 1931 case CHIP_VEGA10: 1932 chip_name = "vega10"; 1933 break; 1934 case CHIP_VEGA12: 1935 chip_name = "vega12"; 1936 break; 1937 case CHIP_RAVEN: 1938 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1939 chip_name = "raven2"; 1940 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1941 chip_name = "picasso"; 1942 else 1943 chip_name = "raven"; 1944 break; 1945 case CHIP_ARCTURUS: 1946 chip_name = "arcturus"; 1947 break; 1948 case CHIP_RENOIR: 1949 if (adev->apu_flags & AMD_APU_IS_RENOIR) 1950 chip_name = "renoir"; 1951 else 1952 chip_name = "green_sardine"; 1953 break; 1954 case CHIP_NAVI10: 1955 chip_name = "navi10"; 1956 break; 1957 case CHIP_NAVI14: 1958 chip_name = "navi14"; 1959 break; 1960 case CHIP_NAVI12: 1961 chip_name = "navi12"; 1962 break; 1963 case CHIP_VANGOGH: 1964 chip_name = "vangogh"; 1965 break; 1966 case CHIP_YELLOW_CARP: 1967 chip_name = "yellow_carp"; 1968 break; 1969 } 1970 1971 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 1972 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); 1973 if (err) { 1974 dev_err(adev->dev, 1975 "Failed to load gpu_info firmware \"%s\"\n", 1976 fw_name); 1977 goto out; 1978 } 1979 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); 1980 if (err) { 1981 dev_err(adev->dev, 1982 "Failed to validate gpu_info firmware \"%s\"\n", 1983 fw_name); 1984 goto out; 1985 } 1986 1987 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 1988 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 1989 1990 switch (hdr->version_major) { 1991 case 1: 1992 { 1993 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 1994 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 1995 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1996 1997 /* 1998 * Should be droped when DAL no longer needs it. 
1999 */ 2000 if (adev->asic_type == CHIP_NAVI12) 2001 goto parse_soc_bounding_box; 2002 2003 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2004 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2005 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2006 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2007 adev->gfx.config.max_texture_channel_caches = 2008 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2009 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2010 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2011 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2012 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2013 adev->gfx.config.double_offchip_lds_buf = 2014 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2015 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2016 adev->gfx.cu_info.max_waves_per_simd = 2017 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2018 adev->gfx.cu_info.max_scratch_slots_per_cu = 2019 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2020 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2021 if (hdr->version_minor >= 1) { 2022 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2023 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2024 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2025 adev->gfx.config.num_sc_per_sh = 2026 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2027 adev->gfx.config.num_packer_per_sc = 2028 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2029 } 2030 2031 parse_soc_bounding_box: 2032 /* 2033 * soc bounding box info is not integrated in the discovery table, 2034 * we always need to parse it from gpu info firmware if needed. 2035 */ 2036 if (hdr->version_minor == 2) { 2037 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2038 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2039 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2040 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2041 } 2042 break; 2043 } 2044 default: 2045 dev_err(adev->dev, 2046 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2047 err = -EINVAL; 2048 goto out; 2049 } 2050 out: 2051 return err; 2052 } 2053 2054 /** 2055 * amdgpu_device_ip_early_init - run early init for hardware IPs 2056 * 2057 * @adev: amdgpu_device pointer 2058 * 2059 * Early initialization pass for hardware IPs. The hardware IPs that make 2060 * up each asic are discovered and each IP's early_init callback is run. This 2061 * is the first stage in initializing the asic. 2062 * Returns 0 on success, negative error code on failure.
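 * IP blocks disabled via amdgpu_ip_block_mask are marked invalid here and
 * are skipped by all of the later init stages.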
2063 */ 2064 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2065 { 2066 int i, r; 2067 2068 amdgpu_device_enable_virtual_display(adev); 2069 2070 if (amdgpu_sriov_vf(adev)) { 2071 r = amdgpu_virt_request_full_gpu(adev, true); 2072 if (r) 2073 return r; 2074 } 2075 2076 switch (adev->asic_type) { 2077 #ifdef CONFIG_DRM_AMDGPU_SI 2078 case CHIP_VERDE: 2079 case CHIP_TAHITI: 2080 case CHIP_PITCAIRN: 2081 case CHIP_OLAND: 2082 case CHIP_HAINAN: 2083 adev->family = AMDGPU_FAMILY_SI; 2084 r = si_set_ip_blocks(adev); 2085 if (r) 2086 return r; 2087 break; 2088 #endif 2089 #ifdef CONFIG_DRM_AMDGPU_CIK 2090 case CHIP_BONAIRE: 2091 case CHIP_HAWAII: 2092 case CHIP_KAVERI: 2093 case CHIP_KABINI: 2094 case CHIP_MULLINS: 2095 if (adev->flags & AMD_IS_APU) 2096 adev->family = AMDGPU_FAMILY_KV; 2097 else 2098 adev->family = AMDGPU_FAMILY_CI; 2099 2100 r = cik_set_ip_blocks(adev); 2101 if (r) 2102 return r; 2103 break; 2104 #endif 2105 case CHIP_TOPAZ: 2106 case CHIP_TONGA: 2107 case CHIP_FIJI: 2108 case CHIP_POLARIS10: 2109 case CHIP_POLARIS11: 2110 case CHIP_POLARIS12: 2111 case CHIP_VEGAM: 2112 case CHIP_CARRIZO: 2113 case CHIP_STONEY: 2114 if (adev->flags & AMD_IS_APU) 2115 adev->family = AMDGPU_FAMILY_CZ; 2116 else 2117 adev->family = AMDGPU_FAMILY_VI; 2118 2119 r = vi_set_ip_blocks(adev); 2120 if (r) 2121 return r; 2122 break; 2123 case CHIP_VEGA10: 2124 case CHIP_VEGA12: 2125 case CHIP_VEGA20: 2126 case CHIP_RAVEN: 2127 case CHIP_ARCTURUS: 2128 case CHIP_RENOIR: 2129 case CHIP_ALDEBARAN: 2130 if (adev->flags & AMD_IS_APU) 2131 adev->family = AMDGPU_FAMILY_RV; 2132 else 2133 adev->family = AMDGPU_FAMILY_AI; 2134 2135 r = soc15_set_ip_blocks(adev); 2136 if (r) 2137 return r; 2138 break; 2139 case CHIP_NAVI10: 2140 case CHIP_NAVI14: 2141 case CHIP_NAVI12: 2142 case CHIP_SIENNA_CICHLID: 2143 case CHIP_NAVY_FLOUNDER: 2144 case CHIP_DIMGREY_CAVEFISH: 2145 case CHIP_BEIGE_GOBY: 2146 case CHIP_VANGOGH: 2147 case CHIP_YELLOW_CARP: 2148 if (adev->asic_type == CHIP_VANGOGH) 2149 adev->family = AMDGPU_FAMILY_VGH; 2150 else if (adev->asic_type == CHIP_YELLOW_CARP) 2151 adev->family = AMDGPU_FAMILY_YC; 2152 else 2153 adev->family = AMDGPU_FAMILY_NV; 2154 2155 r = nv_set_ip_blocks(adev); 2156 if (r) 2157 return r; 2158 break; 2159 default: 2160 /* FIXME: not supported yet */ 2161 return -EINVAL; 2162 } 2163 2164 amdgpu_amdkfd_device_probe(adev); 2165 2166 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2167 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2168 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2169 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2170 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2171 2172 for (i = 0; i < adev->num_ip_blocks; i++) { 2173 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2174 DRM_ERROR("disabled ip block: %d <%s>\n", 2175 i, adev->ip_blocks[i].version->funcs->name); 2176 adev->ip_blocks[i].status.valid = false; 2177 } else { 2178 if (adev->ip_blocks[i].version->funcs->early_init) { 2179 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2180 if (r == -ENOENT) { 2181 adev->ip_blocks[i].status.valid = false; 2182 } else if (r) { 2183 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2184 adev->ip_blocks[i].version->funcs->name, r); 2185 return r; 2186 } else { 2187 adev->ip_blocks[i].status.valid = true; 2188 } 2189 } else { 2190 adev->ip_blocks[i].status.valid = true; 2191 } 2192 } 2193 /* get the vbios after the asic_funcs are set up */ 2194 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) 
{ 2195 r = amdgpu_device_parse_gpu_info_fw(adev); 2196 if (r) 2197 return r; 2198 2199 /* Read BIOS */ 2200 if (!amdgpu_get_bios(adev)) 2201 return -EINVAL; 2202 2203 r = amdgpu_atombios_init(adev); 2204 if (r) { 2205 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2206 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2207 return r; 2208 } 2209 2210 /*get pf2vf msg info at it's earliest time*/ 2211 if (amdgpu_sriov_vf(adev)) 2212 amdgpu_virt_init_data_exchange(adev); 2213 2214 } 2215 } 2216 2217 adev->cg_flags &= amdgpu_cg_mask; 2218 adev->pg_flags &= amdgpu_pg_mask; 2219 2220 return 0; 2221 } 2222 2223 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2224 { 2225 int i, r; 2226 2227 for (i = 0; i < adev->num_ip_blocks; i++) { 2228 if (!adev->ip_blocks[i].status.sw) 2229 continue; 2230 if (adev->ip_blocks[i].status.hw) 2231 continue; 2232 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2233 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2234 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2235 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2236 if (r) { 2237 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2238 adev->ip_blocks[i].version->funcs->name, r); 2239 return r; 2240 } 2241 adev->ip_blocks[i].status.hw = true; 2242 } 2243 } 2244 2245 return 0; 2246 } 2247 2248 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2249 { 2250 int i, r; 2251 2252 for (i = 0; i < adev->num_ip_blocks; i++) { 2253 if (!adev->ip_blocks[i].status.sw) 2254 continue; 2255 if (adev->ip_blocks[i].status.hw) 2256 continue; 2257 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2258 if (r) { 2259 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2260 adev->ip_blocks[i].version->funcs->name, r); 2261 return r; 2262 } 2263 adev->ip_blocks[i].status.hw = true; 2264 } 2265 2266 return 0; 2267 } 2268 2269 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2270 { 2271 int r = 0; 2272 int i; 2273 uint32_t smu_version; 2274 2275 if (adev->asic_type >= CHIP_VEGA10) { 2276 for (i = 0; i < adev->num_ip_blocks; i++) { 2277 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2278 continue; 2279 2280 if (!adev->ip_blocks[i].status.sw) 2281 continue; 2282 2283 /* no need to do the fw loading again if already done*/ 2284 if (adev->ip_blocks[i].status.hw == true) 2285 break; 2286 2287 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2288 r = adev->ip_blocks[i].version->funcs->resume(adev); 2289 if (r) { 2290 DRM_ERROR("resume of IP block <%s> failed %d\n", 2291 adev->ip_blocks[i].version->funcs->name, r); 2292 return r; 2293 } 2294 } else { 2295 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2296 if (r) { 2297 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2298 adev->ip_blocks[i].version->funcs->name, r); 2299 return r; 2300 } 2301 } 2302 2303 adev->ip_blocks[i].status.hw = true; 2304 break; 2305 } 2306 } 2307 2308 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2309 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2310 2311 return r; 2312 } 2313 2314 /** 2315 * amdgpu_device_ip_init - run init for hardware IPs 2316 * 2317 * @adev: amdgpu_device pointer 2318 * 2319 * Main initialization pass for hardware IPs. The list of all the hardware 2320 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2321 * are run. 
sw_init initializes the software state associated with each IP 2322 * and hw_init initializes the hardware associated with each IP. 2323 * Returns 0 on success, negative error code on failure. 2324 */ 2325 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2326 { 2327 int i, r; 2328 2329 r = amdgpu_ras_init(adev); 2330 if (r) 2331 return r; 2332 2333 for (i = 0; i < adev->num_ip_blocks; i++) { 2334 if (!adev->ip_blocks[i].status.valid) 2335 continue; 2336 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2337 if (r) { 2338 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2339 adev->ip_blocks[i].version->funcs->name, r); 2340 goto init_failed; 2341 } 2342 adev->ip_blocks[i].status.sw = true; 2343 2344 /* need to do gmc hw init early so we can allocate gpu mem */ 2345 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2346 r = amdgpu_device_vram_scratch_init(adev); 2347 if (r) { 2348 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2349 goto init_failed; 2350 } 2351 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2352 if (r) { 2353 DRM_ERROR("hw_init %d failed %d\n", i, r); 2354 goto init_failed; 2355 } 2356 r = amdgpu_device_wb_init(adev); 2357 if (r) { 2358 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2359 goto init_failed; 2360 } 2361 adev->ip_blocks[i].status.hw = true; 2362 2363 /* right after GMC hw init, we create CSA */ 2364 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2365 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2366 AMDGPU_GEM_DOMAIN_VRAM, 2367 AMDGPU_CSA_SIZE); 2368 if (r) { 2369 DRM_ERROR("allocate CSA failed %d\n", r); 2370 goto init_failed; 2371 } 2372 } 2373 } 2374 } 2375 2376 if (amdgpu_sriov_vf(adev)) 2377 amdgpu_virt_init_data_exchange(adev); 2378 2379 r = amdgpu_ib_pool_init(adev); 2380 if (r) { 2381 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2382 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2383 goto init_failed; 2384 } 2385 2386 r = amdgpu_ucode_create_bo(adev); /* create ucode BO when sw_init completes */ 2387 if (r) 2388 goto init_failed; 2389 2390 r = amdgpu_device_ip_hw_init_phase1(adev); 2391 if (r) 2392 goto init_failed; 2393 2394 r = amdgpu_device_fw_loading(adev); 2395 if (r) 2396 goto init_failed; 2397 2398 r = amdgpu_device_ip_hw_init_phase2(adev); 2399 if (r) 2400 goto init_failed; 2401 2402 /* 2403 * retired pages will be loaded from eeprom and reserved here; 2404 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2405 * for some ASICs the RAS EEPROM code relies on the SMU being fully 2406 * functional for I2C communication, which is only true at this point. 2407 * 2408 * amdgpu_ras_recovery_init may fail, but the caller only cares about 2409 * failures caused by a bad gpu state and stops the amdgpu init process 2410 * accordingly. For other failures, it will still release all 2411 * the resources and print an error message, rather than returning a 2412 * negative value to the upper level.
2413 * 2414 * Note: theoretically, this should be called before all vram allocations 2415 * to protect retired pages from being abused 2416 */ 2417 r = amdgpu_ras_recovery_init(adev); 2418 if (r) 2419 goto init_failed; 2420 2421 if (adev->gmc.xgmi.num_physical_nodes > 1) 2422 amdgpu_xgmi_add_device(adev); 2423 2424 /* Don't init kfd if whole hive need to be reset during init */ 2425 if (!adev->gmc.xgmi.pending_reset) 2426 amdgpu_amdkfd_device_init(adev); 2427 2428 amdgpu_fru_get_product_info(adev); 2429 2430 init_failed: 2431 if (amdgpu_sriov_vf(adev)) 2432 amdgpu_virt_release_full_gpu(adev, true); 2433 2434 return r; 2435 } 2436 2437 /** 2438 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2439 * 2440 * @adev: amdgpu_device pointer 2441 * 2442 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2443 * this function before a GPU reset. If the value is retained after a 2444 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 2445 */ 2446 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2447 { 2448 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2449 } 2450 2451 /** 2452 * amdgpu_device_check_vram_lost - check if vram is valid 2453 * 2454 * @adev: amdgpu_device pointer 2455 * 2456 * Checks the reset magic value written to the gart pointer in VRAM. 2457 * The driver calls this after a GPU reset to see if the contents of 2458 * VRAM is lost or not. 2459 * Returns true if vram is lost, false if not. 2460 */ 2461 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2462 { 2463 if (memcmp(adev->gart.ptr, adev->reset_magic, 2464 AMDGPU_RESET_MAGIC_NUM)) 2465 return true; 2466 2467 if (!amdgpu_in_reset(adev)) 2468 return false; 2469 2470 /* 2471 * For all ASICs with baco/mode1 reset, the VRAM is 2472 * always assumed to be lost. 2473 */ 2474 switch (amdgpu_asic_reset_method(adev)) { 2475 case AMD_RESET_METHOD_BACO: 2476 case AMD_RESET_METHOD_MODE1: 2477 return true; 2478 default: 2479 return false; 2480 } 2481 } 2482 2483 /** 2484 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2485 * 2486 * @adev: amdgpu_device pointer 2487 * @state: clockgating state (gate or ungate) 2488 * 2489 * The list of all the hardware IPs that make up the asic is walked and the 2490 * set_clockgating_state callbacks are run. 2491 * During late init this pass enables clockgating for hardware IPs; during 2492 * fini or suspend it disables clockgating. 2493 * Returns 0 on success, negative error code on failure. 2494 */ 2495 2496 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2497 enum amd_clockgating_state state) 2498 { 2499 int i, j, r; 2500 2501 if (amdgpu_emu_mode == 1) 2502 return 0; 2503 2504 for (j = 0; j < adev->num_ip_blocks; j++) { 2505 i = state == AMD_CG_STATE_GATE ?
j : adev->num_ip_blocks - j - 1; 2506 if (!adev->ip_blocks[i].status.late_initialized) 2507 continue; 2508 /* skip CG for GFX on S0ix */ 2509 if (adev->in_s0ix && 2510 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2511 continue; 2512 /* skip CG for VCE/UVD, it's handled specially */ 2513 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2514 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2515 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2516 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2517 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2518 /* enable clockgating to save power */ 2519 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2520 state); 2521 if (r) { 2522 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2523 adev->ip_blocks[i].version->funcs->name, r); 2524 return r; 2525 } 2526 } 2527 } 2528 2529 return 0; 2530 } 2531 2532 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2533 enum amd_powergating_state state) 2534 { 2535 int i, j, r; 2536 2537 if (amdgpu_emu_mode == 1) 2538 return 0; 2539 2540 for (j = 0; j < adev->num_ip_blocks; j++) { 2541 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2542 if (!adev->ip_blocks[i].status.late_initialized) 2543 continue; 2544 /* skip PG for GFX on S0ix */ 2545 if (adev->in_s0ix && 2546 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2547 continue; 2548 /* skip PG for VCE/UVD, it's handled specially */ 2549 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2550 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2551 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2552 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2553 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2554 /* enable powergating to save power */ 2555 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2556 state); 2557 if (r) { 2558 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2559 adev->ip_blocks[i].version->funcs->name, r); 2560 return r; 2561 } 2562 } 2563 } 2564 return 0; 2565 } 2566 2567 static int amdgpu_device_enable_mgpu_fan_boost(void) 2568 { 2569 struct amdgpu_gpu_instance *gpu_ins; 2570 struct amdgpu_device *adev; 2571 int i, ret = 0; 2572 2573 mutex_lock(&mgpu_info.mutex); 2574 2575 /* 2576 * MGPU fan boost feature should be enabled 2577 * only when there are two or more dGPUs in 2578 * the system 2579 */ 2580 if (mgpu_info.num_dgpu < 2) 2581 goto out; 2582 2583 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2584 gpu_ins = &(mgpu_info.gpu_ins[i]); 2585 adev = gpu_ins->adev; 2586 if (!(adev->flags & AMD_IS_APU) && 2587 !gpu_ins->mgpu_fan_enabled) { 2588 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2589 if (ret) 2590 break; 2591 2592 gpu_ins->mgpu_fan_enabled = 1; 2593 } 2594 } 2595 2596 out: 2597 mutex_unlock(&mgpu_info.mutex); 2598 2599 return ret; 2600 } 2601 2602 /** 2603 * amdgpu_device_ip_late_init - run late init for hardware IPs 2604 * 2605 * @adev: amdgpu_device pointer 2606 * 2607 * Late initialization pass for hardware IPs. The list of all the hardware 2608 * IPs that make up the asic is walked and the late_init callbacks are run. 2609 * late_init covers any special initialization that an IP requires 2610 * after all of them have been initialized or something that needs to happen 2611 * late in the init process. 2612 * Returns 0 on success, negative error code on failure.
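 * Clockgating and powergating are also enabled here, and the reset magic is
 * written for later VRAM-lost detection.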
2613 */ 2614 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2615 { 2616 struct amdgpu_gpu_instance *gpu_instance; 2617 int i = 0, r; 2618 2619 for (i = 0; i < adev->num_ip_blocks; i++) { 2620 if (!adev->ip_blocks[i].status.hw) 2621 continue; 2622 if (adev->ip_blocks[i].version->funcs->late_init) { 2623 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2624 if (r) { 2625 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2626 adev->ip_blocks[i].version->funcs->name, r); 2627 return r; 2628 } 2629 } 2630 adev->ip_blocks[i].status.late_initialized = true; 2631 } 2632 2633 amdgpu_ras_set_error_query_ready(adev, true); 2634 2635 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2636 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2637 2638 amdgpu_device_fill_reset_magic(adev); 2639 2640 r = amdgpu_device_enable_mgpu_fan_boost(); 2641 if (r) 2642 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2643 2644 /* For XGMI + passthrough configuration on arcturus, enable light SBR */ 2645 if (adev->asic_type == CHIP_ARCTURUS && 2646 amdgpu_passthrough(adev) && 2647 adev->gmc.xgmi.num_physical_nodes > 1) 2648 smu_set_light_sbr(&adev->smu, true); 2649 2650 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2651 mutex_lock(&mgpu_info.mutex); 2652 2653 /* 2654 * Reset the device p-state to low, as it was booted at high. 2655 * 2656 * This should be performed only after all devices from the same 2657 * hive have been initialized. 2658 * 2659 * However, the number of devices in the hive is not known in advance; 2660 * it is counted one by one as the devices are initialized. 2661 * 2662 * So we wait until all XGMI interlinked devices are initialized. 2663 * This may introduce some delay, as those devices may come from 2664 * different hives. But that should be OK.
2665 */ 2666 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2667 for (i = 0; i < mgpu_info.num_gpu; i++) { 2668 gpu_instance = &(mgpu_info.gpu_ins[i]); 2669 if (gpu_instance->adev->flags & AMD_IS_APU) 2670 continue; 2671 2672 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2673 AMDGPU_XGMI_PSTATE_MIN); 2674 if (r) { 2675 DRM_ERROR("pstate setting failed (%d).\n", r); 2676 break; 2677 } 2678 } 2679 } 2680 2681 mutex_unlock(&mgpu_info.mutex); 2682 } 2683 2684 return 0; 2685 } 2686 2687 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2688 { 2689 int i, r; 2690 2691 for (i = 0; i < adev->num_ip_blocks; i++) { 2692 if (!adev->ip_blocks[i].version->funcs->early_fini) 2693 continue; 2694 2695 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2696 if (r) { 2697 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2698 adev->ip_blocks[i].version->funcs->name, r); 2699 } 2700 } 2701 2702 amdgpu_amdkfd_suspend(adev, false); 2703 2704 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2705 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2706 2707 /* need to disable SMC first */ 2708 for (i = 0; i < adev->num_ip_blocks; i++) { 2709 if (!adev->ip_blocks[i].status.hw) 2710 continue; 2711 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2712 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2713 /* XXX handle errors */ 2714 if (r) { 2715 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2716 adev->ip_blocks[i].version->funcs->name, r); 2717 } 2718 adev->ip_blocks[i].status.hw = false; 2719 break; 2720 } 2721 } 2722 2723 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2724 if (!adev->ip_blocks[i].status.hw) 2725 continue; 2726 2727 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2728 /* XXX handle errors */ 2729 if (r) { 2730 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2731 adev->ip_blocks[i].version->funcs->name, r); 2732 } 2733 2734 adev->ip_blocks[i].status.hw = false; 2735 } 2736 2737 return 0; 2738 } 2739 2740 /** 2741 * amdgpu_device_ip_fini - run fini for hardware IPs 2742 * 2743 * @adev: amdgpu_device pointer 2744 * 2745 * Main teardown pass for hardware IPs. The list of all the hardware 2746 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2747 * are run. hw_fini tears down the hardware associated with each IP 2748 * and sw_fini tears down any software state associated with each IP. 2749 * Returns 0 on success, negative error code on failure. 
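 * Teardown walks the IP block list in reverse order; when the GMC block is
 * reached, the ucode BO, static CSA, writeback and VRAM scratch resources
 * are released as well.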
2750 */ 2751 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2752 { 2753 int i, r; 2754 2755 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2756 amdgpu_virt_release_ras_err_handler_data(adev); 2757 2758 amdgpu_ras_pre_fini(adev); 2759 2760 if (adev->gmc.xgmi.num_physical_nodes > 1) 2761 amdgpu_xgmi_remove_device(adev); 2762 2763 amdgpu_amdkfd_device_fini_sw(adev); 2764 2765 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2766 if (!adev->ip_blocks[i].status.sw) 2767 continue; 2768 2769 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2770 amdgpu_ucode_free_bo(adev); 2771 amdgpu_free_static_csa(&adev->virt.csa_obj); 2772 amdgpu_device_wb_fini(adev); 2773 amdgpu_device_vram_scratch_fini(adev); 2774 amdgpu_ib_pool_fini(adev); 2775 } 2776 2777 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2778 /* XXX handle errors */ 2779 if (r) { 2780 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2781 adev->ip_blocks[i].version->funcs->name, r); 2782 } 2783 adev->ip_blocks[i].status.sw = false; 2784 adev->ip_blocks[i].status.valid = false; 2785 } 2786 2787 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2788 if (!adev->ip_blocks[i].status.late_initialized) 2789 continue; 2790 if (adev->ip_blocks[i].version->funcs->late_fini) 2791 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2792 adev->ip_blocks[i].status.late_initialized = false; 2793 } 2794 2795 amdgpu_ras_fini(adev); 2796 2797 if (amdgpu_sriov_vf(adev)) 2798 if (amdgpu_virt_release_full_gpu(adev, false)) 2799 DRM_ERROR("failed to release exclusive mode on fini\n"); 2800 2801 return 0; 2802 } 2803 2804 /** 2805 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2806 * 2807 * @work: work_struct. 2808 */ 2809 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2810 { 2811 struct amdgpu_device *adev = 2812 container_of(work, struct amdgpu_device, delayed_init_work.work); 2813 int r; 2814 2815 r = amdgpu_ib_ring_tests(adev); 2816 if (r) 2817 DRM_ERROR("ib ring test failed (%d).\n", r); 2818 } 2819 2820 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2821 { 2822 struct amdgpu_device *adev = 2823 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2824 2825 mutex_lock(&adev->gfx.gfx_off_mutex); 2826 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { 2827 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2828 adev->gfx.gfx_off_state = true; 2829 } 2830 mutex_unlock(&adev->gfx.gfx_off_mutex); 2831 } 2832 2833 /** 2834 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2835 * 2836 * @adev: amdgpu_device pointer 2837 * 2838 * Main suspend function for hardware IPs. The list of all the hardware 2839 * IPs that make up the asic is walked, clockgating is disabled and the 2840 * suspend callbacks are run. suspend puts the hardware and software state 2841 * in each IP into a state suitable for suspend. 2842 * Returns 0 on success, negative error code on failure. 
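 * Phase 1 only suspends the display (DCE) blocks; all other blocks are
 * handled by amdgpu_device_ip_suspend_phase2().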
2843 */ 2844 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2845 { 2846 int i, r; 2847 2848 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2849 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2850 2851 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2852 if (!adev->ip_blocks[i].status.valid) 2853 continue; 2854 2855 /* displays are handled separately */ 2856 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2857 continue; 2858 2859 /* XXX handle errors */ 2860 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2861 /* XXX handle errors */ 2862 if (r) { 2863 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2864 adev->ip_blocks[i].version->funcs->name, r); 2865 return r; 2866 } 2867 2868 adev->ip_blocks[i].status.hw = false; 2869 } 2870 2871 return 0; 2872 } 2873 2874 /** 2875 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2876 * 2877 * @adev: amdgpu_device pointer 2878 * 2879 * Main suspend function for hardware IPs. The list of all the hardware 2880 * IPs that make up the asic is walked, clockgating is disabled and the 2881 * suspend callbacks are run. suspend puts the hardware and software state 2882 * in each IP into a state suitable for suspend. 2883 * Returns 0 on success, negative error code on failure. 2884 */ 2885 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2886 { 2887 int i, r; 2888 2889 if (adev->in_s0ix) 2890 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry); 2891 2892 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2893 if (!adev->ip_blocks[i].status.valid) 2894 continue; 2895 /* displays are handled in phase1 */ 2896 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2897 continue; 2898 /* PSP lost connection when err_event_athub occurs */ 2899 if (amdgpu_ras_intr_triggered() && 2900 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2901 adev->ip_blocks[i].status.hw = false; 2902 continue; 2903 } 2904 2905 /* skip unnecessary suspend if we do not initialize them yet */ 2906 if (adev->gmc.xgmi.pending_reset && 2907 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2908 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 2909 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2910 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 2911 adev->ip_blocks[i].status.hw = false; 2912 continue; 2913 } 2914 2915 /* skip suspend of gfx and psp for S0ix 2916 * gfx is in gfxoff state, so on resume it will exit gfxoff just 2917 * like at runtime. PSP is also part of the always on hardware 2918 * so no need to suspend it. 
2919 */ 2920 if (adev->in_s0ix && 2921 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 2922 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)) 2923 continue; 2924 2925 /* XXX handle errors */ 2926 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2927 /* XXX handle errors */ 2928 if (r) { 2929 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2930 adev->ip_blocks[i].version->funcs->name, r); 2931 } 2932 adev->ip_blocks[i].status.hw = false; 2933 /* handle putting the SMC in the appropriate state */ 2934 if(!amdgpu_sriov_vf(adev)){ 2935 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2936 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 2937 if (r) { 2938 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 2939 adev->mp1_state, r); 2940 return r; 2941 } 2942 } 2943 } 2944 } 2945 2946 return 0; 2947 } 2948 2949 /** 2950 * amdgpu_device_ip_suspend - run suspend for hardware IPs 2951 * 2952 * @adev: amdgpu_device pointer 2953 * 2954 * Main suspend function for hardware IPs. The list of all the hardware 2955 * IPs that make up the asic is walked, clockgating is disabled and the 2956 * suspend callbacks are run. suspend puts the hardware and software state 2957 * in each IP into a state suitable for suspend. 2958 * Returns 0 on success, negative error code on failure. 2959 */ 2960 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 2961 { 2962 int r; 2963 2964 if (amdgpu_sriov_vf(adev)) { 2965 amdgpu_virt_fini_data_exchange(adev); 2966 amdgpu_virt_request_full_gpu(adev, false); 2967 } 2968 2969 r = amdgpu_device_ip_suspend_phase1(adev); 2970 if (r) 2971 return r; 2972 r = amdgpu_device_ip_suspend_phase2(adev); 2973 2974 if (amdgpu_sriov_vf(adev)) 2975 amdgpu_virt_release_full_gpu(adev, false); 2976 2977 return r; 2978 } 2979 2980 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 2981 { 2982 int i, r; 2983 2984 static enum amd_ip_block_type ip_order[] = { 2985 AMD_IP_BLOCK_TYPE_GMC, 2986 AMD_IP_BLOCK_TYPE_COMMON, 2987 AMD_IP_BLOCK_TYPE_PSP, 2988 AMD_IP_BLOCK_TYPE_IH, 2989 }; 2990 2991 for (i = 0; i < adev->num_ip_blocks; i++) { 2992 int j; 2993 struct amdgpu_ip_block *block; 2994 2995 block = &adev->ip_blocks[i]; 2996 block->status.hw = false; 2997 2998 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 2999 3000 if (block->version->type != ip_order[j] || 3001 !block->status.valid) 3002 continue; 3003 3004 r = block->version->funcs->hw_init(adev); 3005 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3006 if (r) 3007 return r; 3008 block->status.hw = true; 3009 } 3010 } 3011 3012 return 0; 3013 } 3014 3015 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3016 { 3017 int i, r; 3018 3019 static enum amd_ip_block_type ip_order[] = { 3020 AMD_IP_BLOCK_TYPE_SMC, 3021 AMD_IP_BLOCK_TYPE_DCE, 3022 AMD_IP_BLOCK_TYPE_GFX, 3023 AMD_IP_BLOCK_TYPE_SDMA, 3024 AMD_IP_BLOCK_TYPE_UVD, 3025 AMD_IP_BLOCK_TYPE_VCE, 3026 AMD_IP_BLOCK_TYPE_VCN 3027 }; 3028 3029 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3030 int j; 3031 struct amdgpu_ip_block *block; 3032 3033 for (j = 0; j < adev->num_ip_blocks; j++) { 3034 block = &adev->ip_blocks[j]; 3035 3036 if (block->version->type != ip_order[i] || 3037 !block->status.valid || 3038 block->status.hw) 3039 continue; 3040 3041 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3042 r = block->version->funcs->resume(adev); 3043 else 3044 r = block->version->funcs->hw_init(adev); 3045 3046 DRM_INFO("RE-INIT-late: %s %s\n", 
block->version->funcs->name, r?"failed":"succeeded"); 3047 if (r) 3048 return r; 3049 block->status.hw = true; 3050 } 3051 } 3052 3053 return 0; 3054 } 3055 3056 /** 3057 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3058 * 3059 * @adev: amdgpu_device pointer 3060 * 3061 * First resume function for hardware IPs. The list of all the hardware 3062 * IPs that make up the asic is walked and the resume callbacks are run for 3063 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3064 * after a suspend and updates the software state as necessary. This 3065 * function is also used for restoring the GPU after a GPU reset. 3066 * Returns 0 on success, negative error code on failure. 3067 */ 3068 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3069 { 3070 int i, r; 3071 3072 for (i = 0; i < adev->num_ip_blocks; i++) { 3073 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3074 continue; 3075 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3076 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3077 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 3078 3079 r = adev->ip_blocks[i].version->funcs->resume(adev); 3080 if (r) { 3081 DRM_ERROR("resume of IP block <%s> failed %d\n", 3082 adev->ip_blocks[i].version->funcs->name, r); 3083 return r; 3084 } 3085 adev->ip_blocks[i].status.hw = true; 3086 } 3087 } 3088 3089 return 0; 3090 } 3091 3092 /** 3093 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3094 * 3095 * @adev: amdgpu_device pointer 3096 * 3097 * Second resume function for hardware IPs. The list of all the hardware 3098 * IPs that make up the asic is walked and the resume callbacks are run for 3099 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3100 * functional state after a suspend and updates the software state as 3101 * necessary. This function is also used for restoring the GPU after a GPU 3102 * reset. 3103 * Returns 0 on success, negative error code on failure. 3104 */ 3105 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3106 { 3107 int i, r; 3108 3109 for (i = 0; i < adev->num_ip_blocks; i++) { 3110 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3111 continue; 3112 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3113 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3114 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3115 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3116 continue; 3117 r = adev->ip_blocks[i].version->funcs->resume(adev); 3118 if (r) { 3119 DRM_ERROR("resume of IP block <%s> failed %d\n", 3120 adev->ip_blocks[i].version->funcs->name, r); 3121 return r; 3122 } 3123 adev->ip_blocks[i].status.hw = true; 3124 } 3125 3126 return 0; 3127 } 3128 3129 /** 3130 * amdgpu_device_ip_resume - run resume for hardware IPs 3131 * 3132 * @adev: amdgpu_device pointer 3133 * 3134 * Main resume function for hardware IPs. The hardware IPs 3135 * are split into two resume functions because they are 3136 * also used in recovering from a GPU reset and some additional 3137 * steps need to be taken between them. In this case (S3/S4) they are 3138 * run sequentially. 3139 * Returns 0 on success, negative error code on failure.
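 * Phase 1 resumes COMMON, GMC and IH, firmware is then reloaded via
 * amdgpu_device_fw_loading(), and phase 2 resumes the remaining blocks.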
3140 */ 3141 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3142 { 3143 int r; 3144 3145 r = amdgpu_device_ip_resume_phase1(adev); 3146 if (r) 3147 return r; 3148 3149 r = amdgpu_device_fw_loading(adev); 3150 if (r) 3151 return r; 3152 3153 r = amdgpu_device_ip_resume_phase2(adev); 3154 3155 return r; 3156 } 3157 3158 /** 3159 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3160 * 3161 * @adev: amdgpu_device pointer 3162 * 3163 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3164 */ 3165 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3166 { 3167 if (amdgpu_sriov_vf(adev)) { 3168 if (adev->is_atom_fw) { 3169 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3170 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3171 } else { 3172 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3173 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3174 } 3175 3176 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3177 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3178 } 3179 } 3180 3181 /** 3182 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3183 * 3184 * @asic_type: AMD asic type 3185 * 3186 * Check if there is DC (new modesetting infrastructure) support for an asic. 3187 * Returns true if DC has support, false if not. 3188 */ 3189 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3190 { 3191 switch (asic_type) { 3192 #if defined(CONFIG_DRM_AMD_DC) 3193 #if defined(CONFIG_DRM_AMD_DC_SI) 3194 case CHIP_TAHITI: 3195 case CHIP_PITCAIRN: 3196 case CHIP_VERDE: 3197 case CHIP_OLAND: 3198 #endif 3199 case CHIP_BONAIRE: 3200 case CHIP_KAVERI: 3201 case CHIP_KABINI: 3202 case CHIP_MULLINS: 3203 /* 3204 * We have systems in the wild with these ASICs that require 3205 * LVDS and VGA support which is not supported with DC. 3206 * 3207 * Fallback to the non-DC driver here by default so as not to 3208 * cause regressions.
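 * DC can still be forced on for these parts by explicitly setting
 * amdgpu_dc to a positive value.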
3209 */ 3210 return amdgpu_dc > 0; 3211 case CHIP_HAWAII: 3212 case CHIP_CARRIZO: 3213 case CHIP_STONEY: 3214 case CHIP_POLARIS10: 3215 case CHIP_POLARIS11: 3216 case CHIP_POLARIS12: 3217 case CHIP_VEGAM: 3218 case CHIP_TONGA: 3219 case CHIP_FIJI: 3220 case CHIP_VEGA10: 3221 case CHIP_VEGA12: 3222 case CHIP_VEGA20: 3223 #if defined(CONFIG_DRM_AMD_DC_DCN) 3224 case CHIP_RAVEN: 3225 case CHIP_NAVI10: 3226 case CHIP_NAVI14: 3227 case CHIP_NAVI12: 3228 case CHIP_RENOIR: 3229 case CHIP_SIENNA_CICHLID: 3230 case CHIP_NAVY_FLOUNDER: 3231 case CHIP_DIMGREY_CAVEFISH: 3232 case CHIP_BEIGE_GOBY: 3233 case CHIP_VANGOGH: 3234 case CHIP_YELLOW_CARP: 3235 #endif 3236 return amdgpu_dc != 0; 3237 #endif 3238 default: 3239 if (amdgpu_dc > 0) 3240 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3241 "but isn't supported by ASIC, ignoring\n"); 3242 return false; 3243 } 3244 } 3245 3246 /** 3247 * amdgpu_device_has_dc_support - check if dc is supported 3248 * 3249 * @adev: amdgpu_device pointer 3250 * 3251 * Returns true for supported, false for not supported 3252 */ 3253 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3254 { 3255 if (amdgpu_sriov_vf(adev) || 3256 adev->enable_virtual_display || 3257 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3258 return false; 3259 3260 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3261 } 3262 3263 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3264 { 3265 struct amdgpu_device *adev = 3266 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3267 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3268 3269 /* It's a bug to not have a hive within this function */ 3270 if (WARN_ON(!hive)) 3271 return; 3272 3273 /* 3274 * Use task barrier to synchronize all xgmi reset works across the 3275 * hive. task_barrier_enter and task_barrier_exit will block 3276 * until all the threads running the xgmi reset works reach 3277 * those points. task_barrier_full will do both blocks. 3278 */ 3279 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3280 3281 task_barrier_enter(&hive->tb); 3282 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3283 3284 if (adev->asic_reset_res) 3285 goto fail; 3286 3287 task_barrier_exit(&hive->tb); 3288 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3289 3290 if (adev->asic_reset_res) 3291 goto fail; 3292 3293 if (adev->mmhub.ras_funcs && 3294 adev->mmhub.ras_funcs->reset_ras_error_count) 3295 adev->mmhub.ras_funcs->reset_ras_error_count(adev); 3296 } else { 3297 3298 task_barrier_full(&hive->tb); 3299 adev->asic_reset_res = amdgpu_asic_reset(adev); 3300 } 3301 3302 fail: 3303 if (adev->asic_reset_res) 3304 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3305 adev->asic_reset_res, adev_to_drm(adev)->unique); 3306 amdgpu_put_xgmi_hive(hive); 3307 } 3308 3309 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3310 { 3311 char *input = amdgpu_lockup_timeout; 3312 char *timeout_setting = NULL; 3313 int index = 0; 3314 long timeout; 3315 int ret = 0; 3316 3317 /* 3318 * By default timeout for non compute jobs is 10000 3319 * and 60000 for compute jobs. 3320 * In SR-IOV or passthrough mode, timeout for compute 3321 * jobs are 60000 by default. 3322 */ 3323 adev->gfx_timeout = msecs_to_jiffies(10000); 3324 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3325 if (amdgpu_sriov_vf(adev)) 3326 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3327 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3328 else 3329 adev->compute_timeout = msecs_to_jiffies(60000); 3330 3331 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3332 while ((timeout_setting = strsep(&input, ",")) && 3333 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3334 ret = kstrtol(timeout_setting, 0, &timeout); 3335 if (ret) 3336 return ret; 3337 3338 if (timeout == 0) { 3339 index++; 3340 continue; 3341 } else if (timeout < 0) { 3342 timeout = MAX_SCHEDULE_TIMEOUT; 3343 } else { 3344 timeout = msecs_to_jiffies(timeout); 3345 } 3346 3347 switch (index++) { 3348 case 0: 3349 adev->gfx_timeout = timeout; 3350 break; 3351 case 1: 3352 adev->compute_timeout = timeout; 3353 break; 3354 case 2: 3355 adev->sdma_timeout = timeout; 3356 break; 3357 case 3: 3358 adev->video_timeout = timeout; 3359 break; 3360 default: 3361 break; 3362 } 3363 } 3364 /* 3365 * There is only one value specified and 3366 * it should apply to all non-compute jobs. 3367 */ 3368 if (index == 1) { 3369 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3370 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3371 adev->compute_timeout = adev->gfx_timeout; 3372 } 3373 } 3374 3375 return ret; 3376 } 3377 3378 static const struct attribute *amdgpu_dev_attributes[] = { 3379 &dev_attr_product_name.attr, 3380 &dev_attr_product_number.attr, 3381 &dev_attr_serial_number.attr, 3382 &dev_attr_pcie_replay_count.attr, 3383 NULL 3384 }; 3385 3386 /** 3387 * amdgpu_device_init - initialize the driver 3388 * 3389 * @adev: amdgpu_device pointer 3390 * @flags: driver flags 3391 * 3392 * Initializes the driver info and hw (all asics). 3393 * Returns 0 for success or an error on failure. 3394 * Called at driver startup. 3395 */ 3396 int amdgpu_device_init(struct amdgpu_device *adev, 3397 uint32_t flags) 3398 { 3399 struct drm_device *ddev = adev_to_drm(adev); 3400 struct pci_dev *pdev = adev->pdev; 3401 int r, i; 3402 bool px = false; 3403 u32 max_MBps; 3404 3405 adev->shutdown = false; 3406 adev->flags = flags; 3407 3408 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3409 adev->asic_type = amdgpu_force_asic_type; 3410 else 3411 adev->asic_type = flags & AMD_ASIC_MASK; 3412 3413 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3414 if (amdgpu_emu_mode == 1) 3415 adev->usec_timeout *= 10; 3416 adev->gmc.gart_size = 512 * 1024 * 1024; 3417 adev->accel_working = false; 3418 adev->num_rings = 0; 3419 adev->mman.buffer_funcs = NULL; 3420 adev->mman.buffer_funcs_ring = NULL; 3421 adev->vm_manager.vm_pte_funcs = NULL; 3422 adev->vm_manager.vm_pte_num_scheds = 0; 3423 adev->gmc.gmc_funcs = NULL; 3424 adev->harvest_ip_mask = 0x0; 3425 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3426 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3427 3428 adev->smc_rreg = &amdgpu_invalid_rreg; 3429 adev->smc_wreg = &amdgpu_invalid_wreg; 3430 adev->pcie_rreg = &amdgpu_invalid_rreg; 3431 adev->pcie_wreg = &amdgpu_invalid_wreg; 3432 adev->pciep_rreg = &amdgpu_invalid_rreg; 3433 adev->pciep_wreg = &amdgpu_invalid_wreg; 3434 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3435 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3436 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3437 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3438 adev->didt_rreg = &amdgpu_invalid_rreg; 3439 adev->didt_wreg = &amdgpu_invalid_wreg; 3440 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3441 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3442 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3443 
adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3444 3445 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3446 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3447 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3448 3449 /* mutex initialization is all done here so we 3450 * can safely call these functions without locking issues */ 3451 mutex_init(&adev->firmware.mutex); 3452 mutex_init(&adev->pm.mutex); 3453 mutex_init(&adev->gfx.gpu_clock_mutex); 3454 mutex_init(&adev->srbm_mutex); 3455 mutex_init(&adev->gfx.pipe_reserve_mutex); 3456 mutex_init(&adev->gfx.gfx_off_mutex); 3457 mutex_init(&adev->grbm_idx_mutex); 3458 mutex_init(&adev->mn_lock); 3459 mutex_init(&adev->virt.vf_errors.lock); 3460 hash_init(adev->mn_hash); 3461 atomic_set(&adev->in_gpu_reset, 0); 3462 init_rwsem(&adev->reset_sem); 3463 mutex_init(&adev->psp.mutex); 3464 mutex_init(&adev->notifier_lock); 3465 3466 r = amdgpu_device_init_apu_flags(adev); 3467 if (r) 3468 return r; 3469 3470 r = amdgpu_device_check_arguments(adev); 3471 if (r) 3472 return r; 3473 3474 spin_lock_init(&adev->mmio_idx_lock); 3475 spin_lock_init(&adev->smc_idx_lock); 3476 spin_lock_init(&adev->pcie_idx_lock); 3477 spin_lock_init(&adev->uvd_ctx_idx_lock); 3478 spin_lock_init(&adev->didt_idx_lock); 3479 spin_lock_init(&adev->gc_cac_idx_lock); 3480 spin_lock_init(&adev->se_cac_idx_lock); 3481 spin_lock_init(&adev->audio_endpt_idx_lock); 3482 spin_lock_init(&adev->mm_stats.lock); 3483 3484 INIT_LIST_HEAD(&adev->shadow_list); 3485 mutex_init(&adev->shadow_list_lock); 3486 3487 INIT_LIST_HEAD(&adev->reset_list); 3488 3489 INIT_DELAYED_WORK(&adev->delayed_init_work, 3490 amdgpu_device_delayed_init_work_handler); 3491 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3492 amdgpu_device_delay_enable_gfx_off); 3493 3494 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3495 3496 adev->gfx.gfx_off_req_count = 1; 3497 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3498 3499 atomic_set(&adev->throttling_logging_enabled, 1); 3500 /* 3501 * If throttling continues, logging will be performed every minute 3502 * to avoid log flooding. "-1" is subtracted since the thermal 3503 * throttling interrupt comes every second. Thus, the total logging 3504 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 3505 * for throttling interrupt) = 60 seconds.
3506 */ 3507 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3508 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3509 3510 /* Registers mapping */ 3511 /* TODO: block userspace mapping of io register */ 3512 if (adev->asic_type >= CHIP_BONAIRE) { 3513 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3514 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3515 } else { 3516 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3517 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3518 } 3519 3520 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3521 if (adev->rmmio == NULL) { 3522 return -ENOMEM; 3523 } 3524 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3525 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3526 3527 /* enable PCIE atomic ops */ 3528 r = pci_enable_atomic_ops_to_root(adev->pdev, 3529 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3530 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3531 if (r) { 3532 adev->have_atomics_support = false; 3533 DRM_INFO("PCIE atomic ops is not supported\n"); 3534 } else { 3535 adev->have_atomics_support = true; 3536 } 3537 3538 amdgpu_device_get_pcie_info(adev); 3539 3540 if (amdgpu_mcbp) 3541 DRM_INFO("MCBP is enabled\n"); 3542 3543 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) 3544 adev->enable_mes = true; 3545 3546 /* detect hw virtualization here */ 3547 amdgpu_detect_virtualization(adev); 3548 3549 r = amdgpu_device_get_job_timeout_settings(adev); 3550 if (r) { 3551 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3552 return r; 3553 } 3554 3555 /* early init functions */ 3556 r = amdgpu_device_ip_early_init(adev); 3557 if (r) 3558 return r; 3559 3560 /* doorbell bar mapping and doorbell index init*/ 3561 amdgpu_device_doorbell_init(adev); 3562 3563 if (amdgpu_emu_mode == 1) { 3564 /* post the asic on emulation mode */ 3565 emu_soc_asic_init(adev); 3566 goto fence_driver_init; 3567 } 3568 3569 amdgpu_reset_init(adev); 3570 3571 /* detect if we are with an SRIOV vbios */ 3572 amdgpu_device_detect_sriov_bios(adev); 3573 3574 /* check if we need to reset the asic 3575 * E.g., driver was not cleanly unloaded previously, etc. 
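 * For devices in an XGMI hive the reset is deferred (pending_reset) until
 * the whole hive has been probed; see the delayed_reset_work queued below.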
3576 */ 3577 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3578 if (adev->gmc.xgmi.num_physical_nodes) { 3579 dev_info(adev->dev, "Pending hive reset.\n"); 3580 adev->gmc.xgmi.pending_reset = true; 3581 /* Only need to init necessary block for SMU to handle the reset */ 3582 for (i = 0; i < adev->num_ip_blocks; i++) { 3583 if (!adev->ip_blocks[i].status.valid) 3584 continue; 3585 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3586 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3587 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3588 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3589 DRM_DEBUG("IP %s disabled for hw_init.\n", 3590 adev->ip_blocks[i].version->funcs->name); 3591 adev->ip_blocks[i].status.hw = true; 3592 } 3593 } 3594 } else { 3595 r = amdgpu_asic_reset(adev); 3596 if (r) { 3597 dev_err(adev->dev, "asic reset on init failed\n"); 3598 goto failed; 3599 } 3600 } 3601 } 3602 3603 pci_enable_pcie_error_reporting(adev->pdev); 3604 3605 /* Post card if necessary */ 3606 if (amdgpu_device_need_post(adev)) { 3607 if (!adev->bios) { 3608 dev_err(adev->dev, "no vBIOS found\n"); 3609 r = -EINVAL; 3610 goto failed; 3611 } 3612 DRM_INFO("GPU posting now...\n"); 3613 r = amdgpu_device_asic_init(adev); 3614 if (r) { 3615 dev_err(adev->dev, "gpu post error!\n"); 3616 goto failed; 3617 } 3618 } 3619 3620 if (adev->is_atom_fw) { 3621 /* Initialize clocks */ 3622 r = amdgpu_atomfirmware_get_clock_info(adev); 3623 if (r) { 3624 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3625 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3626 goto failed; 3627 } 3628 } else { 3629 /* Initialize clocks */ 3630 r = amdgpu_atombios_get_clock_info(adev); 3631 if (r) { 3632 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3633 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3634 goto failed; 3635 } 3636 /* init i2c buses */ 3637 if (!amdgpu_device_has_dc_support(adev)) 3638 amdgpu_atombios_i2c_init(adev); 3639 } 3640 3641 fence_driver_init: 3642 /* Fence driver */ 3643 r = amdgpu_fence_driver_init(adev); 3644 if (r) { 3645 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n"); 3646 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3647 goto failed; 3648 } 3649 3650 /* init the mode config */ 3651 drm_mode_config_init(adev_to_drm(adev)); 3652 3653 r = amdgpu_device_ip_init(adev); 3654 if (r) { 3655 /* failed in exclusive mode due to timeout */ 3656 if (amdgpu_sriov_vf(adev) && 3657 !amdgpu_sriov_runtime(adev) && 3658 amdgpu_virt_mmio_blocked(adev) && 3659 !amdgpu_virt_wait_reset(adev)) { 3660 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3661 /* Don't send request since VF is inactive. */ 3662 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3663 adev->virt.ops = NULL; 3664 r = -EAGAIN; 3665 goto release_ras_con; 3666 } 3667 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3668 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3669 goto release_ras_con; 3670 } 3671 3672 dev_info(adev->dev, 3673 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3674 adev->gfx.config.max_shader_engines, 3675 adev->gfx.config.max_sh_per_se, 3676 adev->gfx.config.max_cu_per_sh, 3677 adev->gfx.cu_info.number); 3678 3679 adev->accel_working = true; 3680 3681 amdgpu_vm_check_compute_bug(adev); 3682 3683 /* Initialize the buffer migration limit. 
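 * amdgpu_moverate is specified in MB/s; a negative value selects the
 * 8 MB/s default.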
*/ 3684 if (amdgpu_moverate >= 0) 3685 max_MBps = amdgpu_moverate; 3686 else 3687 max_MBps = 8; /* Allow 8 MB/s. */ 3688 /* Get a log2 for easy divisions. */ 3689 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3690 3691 amdgpu_fbdev_init(adev); 3692 3693 r = amdgpu_pm_sysfs_init(adev); 3694 if (r) { 3695 adev->pm_sysfs_en = false; 3696 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3697 } else 3698 adev->pm_sysfs_en = true; 3699 3700 r = amdgpu_ucode_sysfs_init(adev); 3701 if (r) { 3702 adev->ucode_sysfs_en = false; 3703 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3704 } else 3705 adev->ucode_sysfs_en = true; 3706 3707 if ((amdgpu_testing & 1)) { 3708 if (adev->accel_working) 3709 amdgpu_test_moves(adev); 3710 else 3711 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n"); 3712 } 3713 if (amdgpu_benchmarking) { 3714 if (adev->accel_working) 3715 amdgpu_benchmark(adev, amdgpu_benchmarking); 3716 else 3717 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); 3718 } 3719 3720 /* 3721 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3722 * Otherwise the mgpu fan boost feature will be skipped because the 3723 * gpu instance would not be counted yet. 3724 */ 3725 amdgpu_register_gpu_instance(adev); 3726 3727 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3728 * explicit gating rather than handling it automatically. 3729 */ 3730 if (!adev->gmc.xgmi.pending_reset) { 3731 r = amdgpu_device_ip_late_init(adev); 3732 if (r) { 3733 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3734 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3735 goto release_ras_con; 3736 } 3737 /* must succeed. */ 3738 amdgpu_ras_resume(adev); 3739 queue_delayed_work(system_wq, &adev->delayed_init_work, 3740 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3741 } 3742 3743 if (amdgpu_sriov_vf(adev)) 3744 flush_delayed_work(&adev->delayed_init_work); 3745 3746 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3747 if (r) 3748 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3749 3750 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3751 r = amdgpu_pmu_init(adev); 3752 if (r) 3753 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3754 3755 /* Keep the stored PCI config space at hand for restore after a sudden PCI error */ 3756 if (amdgpu_device_cache_pci_state(adev->pdev)) 3757 pci_restore_state(pdev); 3758 3759 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3760 /* this will fail for cards that aren't VGA class devices, just 3761 * ignore it */ 3762 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3763 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode); 3764 3765 if (amdgpu_device_supports_px(ddev)) { 3766 px = true; 3767 vga_switcheroo_register_client(adev->pdev, 3768 &amdgpu_switcheroo_ops, px); 3769 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3770 } 3771 3772 if (adev->gmc.xgmi.pending_reset) 3773 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3774 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3775 3776 return 0; 3777 3778 release_ras_con: 3779 amdgpu_release_ras_context(adev); 3780 3781 failed: 3782 amdgpu_vf_error_trans_all(adev); 3783 3784 return r; 3785 } 3786 3787 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 3788 { 3789 /* Clear all CPU mappings pointing to this device */ 3790 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 3791 3792 /* Unmap all mapped bars - Doorbell, registers and VRAM */
3793 amdgpu_device_doorbell_fini(adev); 3794 3795 iounmap(adev->rmmio); 3796 adev->rmmio = NULL; 3797 if (adev->mman.aper_base_kaddr) 3798 iounmap(adev->mman.aper_base_kaddr); 3799 adev->mman.aper_base_kaddr = NULL; 3800 3801 /* Memory manager related */ 3802 if (!adev->gmc.xgmi.connected_to_cpu) { 3803 arch_phys_wc_del(adev->gmc.vram_mtrr); 3804 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 3805 } 3806 } 3807 3808 /** 3809 * amdgpu_device_fini - tear down the driver 3810 * 3811 * @adev: amdgpu_device pointer 3812 * 3813 * Tear down the driver info (all asics). 3814 * Called at driver shutdown. 3815 */ 3816 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 3817 { 3818 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3819 flush_delayed_work(&adev->delayed_init_work); 3820 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); 3821 adev->shutdown = true; 3822 3823 /* make sure IB test finished before entering exclusive mode 3824 * to avoid preemption on IB test 3825 * */ 3826 if (amdgpu_sriov_vf(adev)) { 3827 amdgpu_virt_request_full_gpu(adev, false); 3828 amdgpu_virt_fini_data_exchange(adev); 3829 } 3830 3831 /* disable all interrupts */ 3832 amdgpu_irq_disable_all(adev); 3833 if (adev->mode_info.mode_config_initialized){ 3834 if (!amdgpu_device_has_dc_support(adev)) 3835 drm_helper_force_disable_all(adev_to_drm(adev)); 3836 else 3837 drm_atomic_helper_shutdown(adev_to_drm(adev)); 3838 } 3839 amdgpu_fence_driver_fini_hw(adev); 3840 3841 if (adev->pm_sysfs_en) 3842 amdgpu_pm_sysfs_fini(adev); 3843 if (adev->ucode_sysfs_en) 3844 amdgpu_ucode_sysfs_fini(adev); 3845 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 3846 3847 amdgpu_fbdev_fini(adev); 3848 3849 amdgpu_irq_fini_hw(adev); 3850 3851 amdgpu_device_ip_fini_early(adev); 3852 3853 amdgpu_gart_dummy_page_fini(adev); 3854 3855 amdgpu_device_unmap_mmio(adev); 3856 } 3857 3858 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 3859 { 3860 amdgpu_device_ip_fini(adev); 3861 amdgpu_fence_driver_fini_sw(adev); 3862 release_firmware(adev->firmware.gpu_info_fw); 3863 adev->firmware.gpu_info_fw = NULL; 3864 adev->accel_working = false; 3865 3866 amdgpu_reset_fini(adev); 3867 3868 /* free i2c buses */ 3869 if (!amdgpu_device_has_dc_support(adev)) 3870 amdgpu_i2c_fini(adev); 3871 3872 if (amdgpu_emu_mode != 1) 3873 amdgpu_atombios_fini(adev); 3874 3875 kfree(adev->bios); 3876 adev->bios = NULL; 3877 if (amdgpu_device_supports_px(adev_to_drm(adev))) { 3878 vga_switcheroo_unregister_client(adev->pdev); 3879 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3880 } 3881 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3882 vga_client_register(adev->pdev, NULL, NULL, NULL); 3883 3884 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3885 amdgpu_pmu_fini(adev); 3886 if (adev->mman.discovery_bin) 3887 amdgpu_discovery_fini(adev); 3888 3889 kfree(adev->pci_state); 3890 3891 } 3892 3893 3894 /* 3895 * Suspend & resume. 3896 */ 3897 /** 3898 * amdgpu_device_suspend - initiate device suspend 3899 * 3900 * @dev: drm dev pointer 3901 * @fbcon : notify the fbdev of suspend 3902 * 3903 * Puts the hw in the suspend state (all asics). 3904 * Returns 0 for success or an error on failure. 3905 * Called at driver suspend. 
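 * VRAM is evicted twice: once before the phase 2 IP suspend and once more
 * afterwards so the GART page table can be evicted using the CPU.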
3906 */ 3907 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 3908 { 3909 struct amdgpu_device *adev = drm_to_adev(dev); 3910 3911 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3912 return 0; 3913 3914 adev->in_suspend = true; 3915 3916 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 3917 DRM_WARN("smart shift update failed\n"); 3918 3919 drm_kms_helper_poll_disable(dev); 3920 3921 if (fbcon) 3922 amdgpu_fbdev_set_suspend(adev, 1); 3923 3924 cancel_delayed_work_sync(&adev->delayed_init_work); 3925 3926 amdgpu_ras_suspend(adev); 3927 3928 amdgpu_device_ip_suspend_phase1(adev); 3929 3930 if (!adev->in_s0ix) 3931 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 3932 3933 /* evict vram memory */ 3934 amdgpu_bo_evict_vram(adev); 3935 3936 amdgpu_fence_driver_suspend(adev); 3937 3938 amdgpu_device_ip_suspend_phase2(adev); 3939 /* evict remaining vram memory 3940 * This second call to evict vram is to evict the gart page table 3941 * using the CPU. 3942 */ 3943 amdgpu_bo_evict_vram(adev); 3944 3945 return 0; 3946 } 3947 3948 /** 3949 * amdgpu_device_resume - initiate device resume 3950 * 3951 * @dev: drm dev pointer 3952 * @fbcon : notify the fbdev of resume 3953 * 3954 * Bring the hw back to operating state (all asics). 3955 * Returns 0 for success or an error on failure. 3956 * Called at driver resume. 3957 */ 3958 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 3959 { 3960 struct amdgpu_device *adev = drm_to_adev(dev); 3961 int r = 0; 3962 3963 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3964 return 0; 3965 3966 if (adev->in_s0ix) 3967 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D0Entry); 3968 3969 /* post card */ 3970 if (amdgpu_device_need_post(adev)) { 3971 r = amdgpu_device_asic_init(adev); 3972 if (r) 3973 dev_err(adev->dev, "amdgpu asic init failed\n"); 3974 } 3975 3976 r = amdgpu_device_ip_resume(adev); 3977 if (r) { 3978 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 3979 return r; 3980 } 3981 amdgpu_fence_driver_resume(adev); 3982 3983 3984 r = amdgpu_device_ip_late_init(adev); 3985 if (r) 3986 return r; 3987 3988 queue_delayed_work(system_wq, &adev->delayed_init_work, 3989 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3990 3991 if (!adev->in_s0ix) { 3992 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 3993 if (r) 3994 return r; 3995 } 3996 3997 /* Make sure IB tests flushed */ 3998 flush_delayed_work(&adev->delayed_init_work); 3999 4000 if (fbcon) 4001 amdgpu_fbdev_set_suspend(adev, 0); 4002 4003 drm_kms_helper_poll_enable(dev); 4004 4005 amdgpu_ras_resume(adev); 4006 4007 /* 4008 * Most of the connector probing functions try to acquire runtime pm 4009 * refs to ensure that the GPU is powered on when connector polling is 4010 * performed. Since we're calling this from a runtime PM callback, 4011 * trying to acquire rpm refs will cause us to deadlock. 4012 * 4013 * Since we're guaranteed to be holding the rpm lock, it's safe to 4014 * temporarily disable the rpm helpers so this doesn't deadlock us. 
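 *
 * (Descriptive note, added for clarity: incrementing power.disable_depth
 * below roughly mimics what pm_runtime_disable()/pm_runtime_enable() do with
 * their counter, minus the synchronization those helpers perform.)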
4015 */ 4016 #ifdef CONFIG_PM 4017 dev->dev->power.disable_depth++; 4018 #endif 4019 if (!amdgpu_device_has_dc_support(adev)) 4020 drm_helper_hpd_irq_event(dev); 4021 else 4022 drm_kms_helper_hotplug_event(dev); 4023 #ifdef CONFIG_PM 4024 dev->dev->power.disable_depth--; 4025 #endif 4026 adev->in_suspend = false; 4027 4028 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4029 DRM_WARN("smart shift update failed\n"); 4030 4031 return 0; 4032 } 4033 4034 /** 4035 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4036 * 4037 * @adev: amdgpu_device pointer 4038 * 4039 * The list of all the hardware IPs that make up the asic is walked and 4040 * the check_soft_reset callbacks are run. check_soft_reset determines 4041 * if the asic is still hung or not. 4042 * Returns true if any of the IPs are still in a hung state, false if not. 4043 */ 4044 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4045 { 4046 int i; 4047 bool asic_hang = false; 4048 4049 if (amdgpu_sriov_vf(adev)) 4050 return true; 4051 4052 if (amdgpu_asic_need_full_reset(adev)) 4053 return true; 4054 4055 for (i = 0; i < adev->num_ip_blocks; i++) { 4056 if (!adev->ip_blocks[i].status.valid) 4057 continue; 4058 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4059 adev->ip_blocks[i].status.hang = 4060 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4061 if (adev->ip_blocks[i].status.hang) { 4062 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4063 asic_hang = true; 4064 } 4065 } 4066 return asic_hang; 4067 } 4068 4069 /** 4070 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4071 * 4072 * @adev: amdgpu_device pointer 4073 * 4074 * The list of all the hardware IPs that make up the asic is walked and the 4075 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4076 * handles any IP specific hardware or software state changes that are 4077 * necessary for a soft reset to succeed. 4078 * Returns 0 on success, negative error code on failure. 4079 */ 4080 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4081 { 4082 int i, r = 0; 4083 4084 for (i = 0; i < adev->num_ip_blocks; i++) { 4085 if (!adev->ip_blocks[i].status.valid) 4086 continue; 4087 if (adev->ip_blocks[i].status.hang && 4088 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4089 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4090 if (r) 4091 return r; 4092 } 4093 } 4094 4095 return 0; 4096 } 4097 4098 /** 4099 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4100 * 4101 * @adev: amdgpu_device pointer 4102 * 4103 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4104 * reset is necessary to recover. 4105 * Returns true if a full asic reset is required, false if not. 
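 *
 * (The checks below treat a hang in the GMC, SMC, ACP, DCE or PSP blocks as
 * requiring a full reset.)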
4106 */ 4107 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4108 { 4109 int i; 4110 4111 if (amdgpu_asic_need_full_reset(adev)) 4112 return true; 4113 4114 for (i = 0; i < adev->num_ip_blocks; i++) { 4115 if (!adev->ip_blocks[i].status.valid) 4116 continue; 4117 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4118 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4119 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4120 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4121 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4122 if (adev->ip_blocks[i].status.hang) { 4123 dev_info(adev->dev, "Some block need full reset!\n"); 4124 return true; 4125 } 4126 } 4127 } 4128 return false; 4129 } 4130 4131 /** 4132 * amdgpu_device_ip_soft_reset - do a soft reset 4133 * 4134 * @adev: amdgpu_device pointer 4135 * 4136 * The list of all the hardware IPs that make up the asic is walked and the 4137 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4138 * IP specific hardware or software state changes that are necessary to soft 4139 * reset the IP. 4140 * Returns 0 on success, negative error code on failure. 4141 */ 4142 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4143 { 4144 int i, r = 0; 4145 4146 for (i = 0; i < adev->num_ip_blocks; i++) { 4147 if (!adev->ip_blocks[i].status.valid) 4148 continue; 4149 if (adev->ip_blocks[i].status.hang && 4150 adev->ip_blocks[i].version->funcs->soft_reset) { 4151 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4152 if (r) 4153 return r; 4154 } 4155 } 4156 4157 return 0; 4158 } 4159 4160 /** 4161 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4162 * 4163 * @adev: amdgpu_device pointer 4164 * 4165 * The list of all the hardware IPs that make up the asic is walked and the 4166 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4167 * handles any IP specific hardware or software state changes that are 4168 * necessary after the IP has been soft reset. 4169 * Returns 0 on success, negative error code on failure. 4170 */ 4171 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4172 { 4173 int i, r = 0; 4174 4175 for (i = 0; i < adev->num_ip_blocks; i++) { 4176 if (!adev->ip_blocks[i].status.valid) 4177 continue; 4178 if (adev->ip_blocks[i].status.hang && 4179 adev->ip_blocks[i].version->funcs->post_soft_reset) 4180 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4181 if (r) 4182 return r; 4183 } 4184 4185 return 0; 4186 } 4187 4188 /** 4189 * amdgpu_device_recover_vram - Recover some VRAM contents 4190 * 4191 * @adev: amdgpu_device pointer 4192 * 4193 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4194 * restore things like GPUVM page tables after a GPU reset where 4195 * the contents of VRAM might be lost. 4196 * 4197 * Returns: 4198 * 0 on success, negative error code on failure. 
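 *
 * Descriptive summary of the loop below: walk adev->shadow_list under
 * shadow_list_lock, call amdgpu_bo_restore_shadow() for each shadow that is
 * resident in GTT and whose parent BO still sits in VRAM, and wait on the
 * returned fences with a timeout so a stuck copy cannot stall recovery
 * forever.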
4199 */ 4200 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4201 { 4202 struct dma_fence *fence = NULL, *next = NULL; 4203 struct amdgpu_bo *shadow; 4204 struct amdgpu_bo_vm *vmbo; 4205 long r = 1, tmo; 4206 4207 if (amdgpu_sriov_runtime(adev)) 4208 tmo = msecs_to_jiffies(8000); 4209 else 4210 tmo = msecs_to_jiffies(100); 4211 4212 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4213 mutex_lock(&adev->shadow_list_lock); 4214 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4215 shadow = &vmbo->bo; 4216 /* No need to recover an evicted BO */ 4217 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4218 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4219 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4220 continue; 4221 4222 r = amdgpu_bo_restore_shadow(shadow, &next); 4223 if (r) 4224 break; 4225 4226 if (fence) { 4227 tmo = dma_fence_wait_timeout(fence, false, tmo); 4228 dma_fence_put(fence); 4229 fence = next; 4230 if (tmo == 0) { 4231 r = -ETIMEDOUT; 4232 break; 4233 } else if (tmo < 0) { 4234 r = tmo; 4235 break; 4236 } 4237 } else { 4238 fence = next; 4239 } 4240 } 4241 mutex_unlock(&adev->shadow_list_lock); 4242 4243 if (fence) 4244 tmo = dma_fence_wait_timeout(fence, false, tmo); 4245 dma_fence_put(fence); 4246 4247 if (r < 0 || tmo <= 0) { 4248 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4249 return -EIO; 4250 } 4251 4252 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4253 return 0; 4254 } 4255 4256 4257 /** 4258 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4259 * 4260 * @adev: amdgpu_device pointer 4261 * @from_hypervisor: request from hypervisor 4262 * 4263 * do VF FLR and reinitialize Asic 4264 * return 0 means succeeded otherwise failed 4265 */ 4266 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4267 bool from_hypervisor) 4268 { 4269 int r; 4270 4271 if (from_hypervisor) 4272 r = amdgpu_virt_request_full_gpu(adev, true); 4273 else 4274 r = amdgpu_virt_reset_gpu(adev); 4275 if (r) 4276 return r; 4277 4278 amdgpu_amdkfd_pre_reset(adev); 4279 4280 /* Resume IP prior to SMC */ 4281 r = amdgpu_device_ip_reinit_early_sriov(adev); 4282 if (r) 4283 goto error; 4284 4285 amdgpu_virt_init_data_exchange(adev); 4286 /* we need recover gart prior to run SMC/CP/SDMA resume */ 4287 amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT)); 4288 4289 r = amdgpu_device_fw_loading(adev); 4290 if (r) 4291 return r; 4292 4293 /* now we are okay to resume SMC/CP/SDMA */ 4294 r = amdgpu_device_ip_reinit_late_sriov(adev); 4295 if (r) 4296 goto error; 4297 4298 amdgpu_irq_gpu_reset_resume_helper(adev); 4299 r = amdgpu_ib_ring_tests(adev); 4300 amdgpu_amdkfd_post_reset(adev); 4301 4302 error: 4303 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4304 amdgpu_inc_vram_lost(adev); 4305 r = amdgpu_device_recover_vram(adev); 4306 } 4307 amdgpu_virt_release_full_gpu(adev, true); 4308 4309 return r; 4310 } 4311 4312 /** 4313 * amdgpu_device_has_job_running - check if there is any job in mirror list 4314 * 4315 * @adev: amdgpu_device pointer 4316 * 4317 * check if there is any job in mirror list 4318 */ 4319 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4320 { 4321 int i; 4322 struct drm_sched_job *job; 4323 4324 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4325 struct amdgpu_ring *ring = adev->rings[i]; 4326 4327 if (!ring || !ring->sched.thread) 4328 continue; 4329 4330 spin_lock(&ring->sched.job_list_lock); 4331 job = 
list_first_entry_or_null(&ring->sched.pending_list, 4332 struct drm_sched_job, list); 4333 spin_unlock(&ring->sched.job_list_lock); 4334 if (job) 4335 return true; 4336 } 4337 return false; 4338 } 4339 4340 /** 4341 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4342 * 4343 * @adev: amdgpu_device pointer 4344 * 4345 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4346 * a hung GPU. 4347 */ 4348 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4349 { 4350 if (!amdgpu_device_ip_check_soft_reset(adev)) { 4351 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n"); 4352 return false; 4353 } 4354 4355 if (amdgpu_gpu_recovery == 0) 4356 goto disabled; 4357 4358 if (amdgpu_sriov_vf(adev)) 4359 return true; 4360 4361 if (amdgpu_gpu_recovery == -1) { 4362 switch (adev->asic_type) { 4363 case CHIP_BONAIRE: 4364 case CHIP_HAWAII: 4365 case CHIP_TOPAZ: 4366 case CHIP_TONGA: 4367 case CHIP_FIJI: 4368 case CHIP_POLARIS10: 4369 case CHIP_POLARIS11: 4370 case CHIP_POLARIS12: 4371 case CHIP_VEGAM: 4372 case CHIP_VEGA20: 4373 case CHIP_VEGA10: 4374 case CHIP_VEGA12: 4375 case CHIP_RAVEN: 4376 case CHIP_ARCTURUS: 4377 case CHIP_RENOIR: 4378 case CHIP_NAVI10: 4379 case CHIP_NAVI14: 4380 case CHIP_NAVI12: 4381 case CHIP_SIENNA_CICHLID: 4382 case CHIP_NAVY_FLOUNDER: 4383 case CHIP_DIMGREY_CAVEFISH: 4384 case CHIP_BEIGE_GOBY: 4385 case CHIP_VANGOGH: 4386 case CHIP_ALDEBARAN: 4387 break; 4388 default: 4389 goto disabled; 4390 } 4391 } 4392 4393 return true; 4394 4395 disabled: 4396 dev_info(adev->dev, "GPU recovery disabled.\n"); 4397 return false; 4398 } 4399 4400 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4401 { 4402 u32 i; 4403 int ret = 0; 4404 4405 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4406 4407 dev_info(adev->dev, "GPU mode1 reset\n"); 4408 4409 /* disable BM */ 4410 pci_clear_master(adev->pdev); 4411 4412 amdgpu_device_cache_pci_state(adev->pdev); 4413 4414 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4415 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4416 ret = amdgpu_dpm_mode1_reset(adev); 4417 } else { 4418 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4419 ret = psp_gpu_reset(adev); 4420 } 4421 4422 if (ret) 4423 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4424 4425 amdgpu_device_load_pci_state(adev->pdev); 4426 4427 /* wait for asic to come out of reset */ 4428 for (i = 0; i < adev->usec_timeout; i++) { 4429 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4430 4431 if (memsize != 0xffffffff) 4432 break; 4433 udelay(1); 4434 } 4435 4436 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4437 return ret; 4438 } 4439 4440 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4441 struct amdgpu_reset_context *reset_context) 4442 { 4443 int i, r = 0; 4444 struct amdgpu_job *job = NULL; 4445 bool need_full_reset = 4446 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4447 4448 if (reset_context->reset_req_dev == adev) 4449 job = reset_context->job; 4450 4451 /* no need to dump if device is not in good state during probe period */ 4452 if (!adev->gmc.xgmi.pending_reset) 4453 amdgpu_debugfs_wait_dump(adev); 4454 4455 if (amdgpu_sriov_vf(adev)) { 4456 /* stop the data exchange thread */ 4457 amdgpu_virt_fini_data_exchange(adev); 4458 } 4459 4460 /* block all schedulers and reset given job's ring */ 4461 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4462 struct amdgpu_ring *ring = adev->rings[i]; 4463 4464 if (!ring || !ring->sched.thread) 4465 continue; 4466 4467 /* after 
all hw jobs are reset, hw fence is meaningless, so force_completion */ 4468 amdgpu_fence_driver_force_completion(ring); 4469 } 4470 4471 if(job) 4472 drm_sched_increase_karma(&job->base); 4473 4474 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4475 /* If reset handler not implemented, continue; otherwise return */ 4476 if (r == -ENOSYS) 4477 r = 0; 4478 else 4479 return r; 4480 4481 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4482 if (!amdgpu_sriov_vf(adev)) { 4483 4484 if (!need_full_reset) 4485 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4486 4487 if (!need_full_reset) { 4488 amdgpu_device_ip_pre_soft_reset(adev); 4489 r = amdgpu_device_ip_soft_reset(adev); 4490 amdgpu_device_ip_post_soft_reset(adev); 4491 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4492 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4493 need_full_reset = true; 4494 } 4495 } 4496 4497 if (need_full_reset) 4498 r = amdgpu_device_ip_suspend(adev); 4499 if (need_full_reset) 4500 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4501 else 4502 clear_bit(AMDGPU_NEED_FULL_RESET, 4503 &reset_context->flags); 4504 } 4505 4506 return r; 4507 } 4508 4509 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4510 struct amdgpu_reset_context *reset_context) 4511 { 4512 struct amdgpu_device *tmp_adev = NULL; 4513 bool need_full_reset, skip_hw_reset, vram_lost = false; 4514 int r = 0; 4515 4516 /* Try reset handler method first */ 4517 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4518 reset_list); 4519 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4520 /* If reset handler not implemented, continue; otherwise return */ 4521 if (r == -ENOSYS) 4522 r = 0; 4523 else 4524 return r; 4525 4526 /* Reset handler not implemented, use the default method */ 4527 need_full_reset = 4528 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4529 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4530 4531 /* 4532 * ASIC reset has to be done on all XGMI hive nodes ASAP 4533 * to allow proper links negotiation in FW (within 1 sec) 4534 */ 4535 if (!skip_hw_reset && need_full_reset) { 4536 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4537 /* For XGMI run all resets in parallel to speed up the process */ 4538 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4539 tmp_adev->gmc.xgmi.pending_reset = false; 4540 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4541 r = -EALREADY; 4542 } else 4543 r = amdgpu_asic_reset(tmp_adev); 4544 4545 if (r) { 4546 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4547 r, adev_to_drm(tmp_adev)->unique); 4548 break; 4549 } 4550 } 4551 4552 /* For XGMI wait for all resets to complete before proceed */ 4553 if (!r) { 4554 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4555 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4556 flush_work(&tmp_adev->xgmi_reset_work); 4557 r = tmp_adev->asic_reset_res; 4558 if (r) 4559 break; 4560 } 4561 } 4562 } 4563 } 4564 4565 if (!r && amdgpu_ras_intr_triggered()) { 4566 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4567 if (tmp_adev->mmhub.ras_funcs && 4568 tmp_adev->mmhub.ras_funcs->reset_ras_error_count) 4569 tmp_adev->mmhub.ras_funcs->reset_ras_error_count(tmp_adev); 4570 } 4571 4572 amdgpu_ras_intr_cleared(); 4573 } 4574 4575 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4576 if (need_full_reset) { 4577 /* post 
card */ 4578 r = amdgpu_device_asic_init(tmp_adev); 4579 if (r) { 4580 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4581 } else { 4582 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4583 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4584 if (r) 4585 goto out; 4586 4587 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4588 if (vram_lost) { 4589 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4590 amdgpu_inc_vram_lost(tmp_adev); 4591 } 4592 4593 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT)); 4594 if (r) 4595 goto out; 4596 4597 r = amdgpu_device_fw_loading(tmp_adev); 4598 if (r) 4599 return r; 4600 4601 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4602 if (r) 4603 goto out; 4604 4605 if (vram_lost) 4606 amdgpu_device_fill_reset_magic(tmp_adev); 4607 4608 /* 4609 * Add this ASIC as tracked as reset was already 4610 * complete successfully. 4611 */ 4612 amdgpu_register_gpu_instance(tmp_adev); 4613 4614 if (!reset_context->hive && 4615 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4616 amdgpu_xgmi_add_device(tmp_adev); 4617 4618 r = amdgpu_device_ip_late_init(tmp_adev); 4619 if (r) 4620 goto out; 4621 4622 amdgpu_fbdev_set_suspend(tmp_adev, 0); 4623 4624 /* 4625 * The GPU enters bad state once faulty pages 4626 * by ECC has reached the threshold, and ras 4627 * recovery is scheduled next. So add one check 4628 * here to break recovery if it indeed exceeds 4629 * bad page threshold, and remind user to 4630 * retire this GPU or setting one bigger 4631 * bad_page_threshold value to fix this once 4632 * probing driver again. 4633 */ 4634 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 4635 /* must succeed. */ 4636 amdgpu_ras_resume(tmp_adev); 4637 } else { 4638 r = -EINVAL; 4639 goto out; 4640 } 4641 4642 /* Update PSP FW topology after reset */ 4643 if (reset_context->hive && 4644 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4645 r = amdgpu_xgmi_update_topology( 4646 reset_context->hive, tmp_adev); 4647 } 4648 } 4649 4650 out: 4651 if (!r) { 4652 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 4653 r = amdgpu_ib_ring_tests(tmp_adev); 4654 if (r) { 4655 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 4656 need_full_reset = true; 4657 r = -EAGAIN; 4658 goto end; 4659 } 4660 } 4661 4662 if (!r) 4663 r = amdgpu_device_recover_vram(tmp_adev); 4664 else 4665 tmp_adev->asic_reset_res = r; 4666 } 4667 4668 end: 4669 if (need_full_reset) 4670 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4671 else 4672 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4673 return r; 4674 } 4675 4676 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, 4677 struct amdgpu_hive_info *hive) 4678 { 4679 if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0) 4680 return false; 4681 4682 if (hive) { 4683 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock); 4684 } else { 4685 down_write(&adev->reset_sem); 4686 } 4687 4688 switch (amdgpu_asic_reset_method(adev)) { 4689 case AMD_RESET_METHOD_MODE1: 4690 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 4691 break; 4692 case AMD_RESET_METHOD_MODE2: 4693 adev->mp1_state = PP_MP1_STATE_RESET; 4694 break; 4695 default: 4696 adev->mp1_state = PP_MP1_STATE_NONE; 4697 break; 4698 } 4699 4700 return true; 4701 } 4702 4703 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) 4704 { 4705 amdgpu_vf_error_trans_all(adev); 4706 adev->mp1_state = PP_MP1_STATE_NONE; 4707 atomic_set(&adev->in_gpu_reset, 0); 4708 up_write(&adev->reset_sem); 4709 } 4710 4711 /* 4712 * to lockup a list of amdgpu 
devices in a hive safely. If the device is not part of a hive
 * with multiple nodes, this behaves like amdgpu_device_lock_adev.
 *
 * unlock won't require a roll back.
 */
static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
{
	struct amdgpu_device *tmp_adev = NULL;

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (!hive) {
			dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
			return -ENODEV;
		}
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			if (!amdgpu_device_lock_adev(tmp_adev, hive))
				goto roll_back;
		}
	} else if (!amdgpu_device_lock_adev(adev, hive))
		return -EAGAIN;

	return 0;
roll_back:
	if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
		/*
		 * If the lock iteration breaks in the middle of a hive,
		 * it may mean there is a race, or that a hive device
		 * locked up independently. We may or may not be in
		 * trouble, so roll back the locks already taken and
		 * emit a warning.
		 */
		dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
		list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			amdgpu_device_unlock_adev(tmp_adev);
		}
	}
	return -EAGAIN;
}

static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
{
	struct pci_dev *p = NULL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
			adev->pdev->bus->number, 1);
	if (p) {
		pm_runtime_enable(&(p->dev));
		pm_runtime_resume(&(p->dev));
	}
}

static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
{
	enum amd_reset_method reset_method;
	struct pci_dev *p = NULL;
	u64 expires;

	/*
	 * For now, only BACO and mode1 reset are confirmed
	 * to suffer the audio issue when not properly suspended.
	 */
	reset_method = amdgpu_asic_reset_method(adev);
	if ((reset_method != AMD_RESET_METHOD_BACO) &&
	    (reset_method != AMD_RESET_METHOD_MODE1))
		return -EINVAL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
			adev->pdev->bus->number, 1);
	if (!p)
		return -ENODEV;

	expires = pm_runtime_autosuspend_expiration(&(p->dev));
	if (!expires)
		/*
		 * If we cannot get the audio device autosuspend delay,
		 * a fixed 4s interval is used. Since 3s is the audio
		 * controller's default autosuspend delay, the 4s used
		 * here is guaranteed to cover it.
		 */
		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;

	while (!pm_runtime_status_suspended(&(p->dev))) {
		if (!pm_runtime_suspend(&(p->dev)))
			break;

		if (expires < ktime_get_mono_fast_ns()) {
			dev_warn(adev->dev, "failed to suspend display audio\n");
			/* TODO: abort the succeeding gpu reset?
*/ 4800 return -ETIMEDOUT; 4801 } 4802 } 4803 4804 pm_runtime_disable(&(p->dev)); 4805 4806 return 0; 4807 } 4808 4809 static void amdgpu_device_recheck_guilty_jobs( 4810 struct amdgpu_device *adev, struct list_head *device_list_handle, 4811 struct amdgpu_reset_context *reset_context) 4812 { 4813 int i, r = 0; 4814 4815 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4816 struct amdgpu_ring *ring = adev->rings[i]; 4817 int ret = 0; 4818 struct drm_sched_job *s_job; 4819 4820 if (!ring || !ring->sched.thread) 4821 continue; 4822 4823 s_job = list_first_entry_or_null(&ring->sched.pending_list, 4824 struct drm_sched_job, list); 4825 if (s_job == NULL) 4826 continue; 4827 4828 /* clear job's guilty and depend the folowing step to decide the real one */ 4829 drm_sched_reset_karma(s_job); 4830 drm_sched_resubmit_jobs_ext(&ring->sched, 1); 4831 4832 ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout); 4833 if (ret == 0) { /* timeout */ 4834 DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n", 4835 ring->sched.name, s_job->id); 4836 4837 /* set guilty */ 4838 drm_sched_increase_karma(s_job); 4839 retry: 4840 /* do hw reset */ 4841 if (amdgpu_sriov_vf(adev)) { 4842 amdgpu_virt_fini_data_exchange(adev); 4843 r = amdgpu_device_reset_sriov(adev, false); 4844 if (r) 4845 adev->asic_reset_res = r; 4846 } else { 4847 clear_bit(AMDGPU_SKIP_HW_RESET, 4848 &reset_context->flags); 4849 r = amdgpu_do_asic_reset(device_list_handle, 4850 reset_context); 4851 if (r && r == -EAGAIN) 4852 goto retry; 4853 } 4854 4855 /* 4856 * add reset counter so that the following 4857 * resubmitted job could flush vmid 4858 */ 4859 atomic_inc(&adev->gpu_reset_counter); 4860 continue; 4861 } 4862 4863 /* got the hw fence, signal finished fence */ 4864 atomic_dec(ring->sched.score); 4865 dma_fence_get(&s_job->s_fence->finished); 4866 dma_fence_signal(&s_job->s_fence->finished); 4867 dma_fence_put(&s_job->s_fence->finished); 4868 4869 /* remove node from list and free the job */ 4870 spin_lock(&ring->sched.job_list_lock); 4871 list_del_init(&s_job->list); 4872 spin_unlock(&ring->sched.job_list_lock); 4873 ring->sched.ops->free_job(s_job); 4874 } 4875 } 4876 4877 /** 4878 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 4879 * 4880 * @adev: amdgpu_device pointer 4881 * @job: which job trigger hang 4882 * 4883 * Attempt to reset the GPU if it has hung (all asics). 4884 * Attempt to do soft-reset or full-reset and reinitialize Asic 4885 * Returns 0 for success or an error on failure. 4886 */ 4887 4888 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 4889 struct amdgpu_job *job) 4890 { 4891 struct list_head device_list, *device_list_handle = NULL; 4892 bool job_signaled = false; 4893 struct amdgpu_hive_info *hive = NULL; 4894 struct amdgpu_device *tmp_adev = NULL; 4895 int i, r = 0; 4896 bool need_emergency_restart = false; 4897 bool audio_suspended = false; 4898 int tmp_vram_lost_counter; 4899 struct amdgpu_reset_context reset_context; 4900 4901 memset(&reset_context, 0, sizeof(reset_context)); 4902 4903 /* 4904 * Special case: RAS triggered and full reset isn't supported 4905 */ 4906 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 4907 4908 /* 4909 * Flush RAM to disk so that after reboot 4910 * the user can read log and see why the system rebooted. 
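	 * (ksys_sync_helper() below performs that flush before
	 * emergency_restart() is invoked.)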
	 */
	if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
		DRM_WARN("Emergency reboot.");

		ksys_sync_helper();
		emergency_restart();
	}

	dev_info(adev->dev, "GPU %s begin!\n",
		need_emergency_restart ? "jobs stop":"reset");

	/*
	 * Here we trylock to avoid a chain of resets executing, triggered
	 * either by jobs on different adevs in an XGMI hive or by jobs on
	 * different schedulers for the same device, while this TO handler
	 * is running. We always reset all schedulers for a device and all
	 * devices in an XGMI hive, so that should take care of them too.
	 */
	hive = amdgpu_get_xgmi_hive(adev);
	if (hive) {
		if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
			DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
				job ? job->base.id : -1, hive->hive_id);
			amdgpu_put_xgmi_hive(hive);
			if (job)
				drm_sched_increase_karma(&job->base);
			return 0;
		}
		mutex_lock(&hive->hive_lock);
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	reset_context.job = job;
	reset_context.hive = hive;
	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

	/*
	 * Lock the device before we try to operate on the linked list.
	 * If we didn't get the device lock, don't touch the linked list
	 * since others may be iterating over it.
	 */
	r = amdgpu_device_lock_hive_adev(adev, hive);
	if (r) {
		dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
					job ? job->base.id : -1);

		/* even though we skipped this reset, we still need to mark the job as guilty */
		if (job)
			drm_sched_increase_karma(&job->base);
		goto skip_recovery;
	}

	/*
	 * Build the list of devices to reset.
	 * In case we are in XGMI hive mode, re-sort the device list
	 * to put adev in the first position.
	 */
	INIT_LIST_HEAD(&device_list);
	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
			list_add_tail(&tmp_adev->reset_list, &device_list);
		if (!list_is_first(&adev->reset_list, &device_list))
			list_rotate_to_front(&adev->reset_list, &device_list);
		device_list_handle = &device_list;
	} else {
		list_add_tail(&adev->reset_list, &device_list);
		device_list_handle = &device_list;
	}

	/* block all schedulers and reset given job's ring */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		/*
		 * Try to put the audio codec into the suspend state
		 * before the gpu reset starts.
		 *
		 * The graphics device shares its power domain with the
		 * AZ (audio) power domain. Without this, we may change
		 * the audio hardware behind the audio driver's back,
		 * which would trigger audio codec errors.
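		 *
		 * (Descriptive note: amdgpu_device_suspend_display_audio()
		 * looks up PCI function 1 on the GPU's bus, which is typically
		 * the HDMI/DP audio function, and runtime-suspends it.)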
4992 */ 4993 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 4994 audio_suspended = true; 4995 4996 amdgpu_ras_set_error_query_ready(tmp_adev, false); 4997 4998 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 4999 5000 if (!amdgpu_sriov_vf(tmp_adev)) 5001 amdgpu_amdkfd_pre_reset(tmp_adev); 5002 5003 /* 5004 * Mark these ASICs to be reseted as untracked first 5005 * And add them back after reset completed 5006 */ 5007 amdgpu_unregister_gpu_instance(tmp_adev); 5008 5009 amdgpu_fbdev_set_suspend(tmp_adev, 1); 5010 5011 /* disable ras on ALL IPs */ 5012 if (!need_emergency_restart && 5013 amdgpu_device_ip_need_full_reset(tmp_adev)) 5014 amdgpu_ras_suspend(tmp_adev); 5015 5016 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5017 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5018 5019 if (!ring || !ring->sched.thread) 5020 continue; 5021 5022 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5023 5024 if (need_emergency_restart) 5025 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5026 } 5027 atomic_inc(&tmp_adev->gpu_reset_counter); 5028 } 5029 5030 if (need_emergency_restart) 5031 goto skip_sched_resume; 5032 5033 /* 5034 * Must check guilty signal here since after this point all old 5035 * HW fences are force signaled. 5036 * 5037 * job->base holds a reference to parent fence 5038 */ 5039 if (job && job->base.s_fence->parent && 5040 dma_fence_is_signaled(job->base.s_fence->parent)) { 5041 job_signaled = true; 5042 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5043 goto skip_hw_reset; 5044 } 5045 5046 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5047 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5048 r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context); 5049 /*TODO Should we stop ?*/ 5050 if (r) { 5051 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5052 r, adev_to_drm(tmp_adev)->unique); 5053 tmp_adev->asic_reset_res = r; 5054 } 5055 } 5056 5057 tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter)); 5058 /* Actual ASIC resets if needed.*/ 5059 /* TODO Implement XGMI hive reset logic for SRIOV */ 5060 if (amdgpu_sriov_vf(adev)) { 5061 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5062 if (r) 5063 adev->asic_reset_res = r; 5064 } else { 5065 r = amdgpu_do_asic_reset(device_list_handle, &reset_context); 5066 if (r && r == -EAGAIN) 5067 goto retry; 5068 } 5069 5070 skip_hw_reset: 5071 5072 /* Post ASIC reset for all devs .*/ 5073 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5074 5075 /* 5076 * Sometimes a later bad compute job can block a good gfx job as gfx 5077 * and compute ring share internal GC HW mutually. We add an additional 5078 * guilty jobs recheck step to find the real guilty job, it synchronously 5079 * submits and pends for the first job being signaled. If it gets timeout, 5080 * we identify it as a real guilty job. 
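		 *
		 * Rough outline of amdgpu_device_recheck_guilty_jobs() above:
		 * clear the karma of the first pending job on each ring,
		 * resubmit just that job and wait on its hardware fence; a
		 * timeout marks it as the real guilty job and triggers another
		 * hardware reset, otherwise the job's finished fence is
		 * signaled and the job is freed.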
		 */
		if (amdgpu_gpu_recovery == 2 &&
			!(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
			amdgpu_device_recheck_guilty_jobs(
				tmp_adev, device_list_handle, &reset_context);

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			/* No point in resubmitting jobs if we didn't HW reset */
			if (!tmp_adev->asic_reset_res && !job_signaled)
				drm_sched_resubmit_jobs(&ring->sched);

			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
		}

		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
		}

		tmp_adev->asic_reset_res = 0;

		if (r) {
			/* bad news, how do we tell userspace? */
			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
			if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
				DRM_WARN("smart shift update failed\n");
		}
	}

skip_sched_resume:
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		/* unlock kfd: SRIOV would do it separately */
		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);

		/* kfd_post_reset will do nothing if the kfd device is not initialized;
		 * bring up kfd here if it wasn't initialized before
		 */
		if (!adev->kfd.init_complete)
			amdgpu_amdkfd_device_init(adev);

		if (audio_suspended)
			amdgpu_device_resume_display_audio(tmp_adev);
		amdgpu_device_unlock_adev(tmp_adev);
	}

skip_recovery:
	if (hive) {
		atomic_set(&hive->in_reset, 0);
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}

	if (r && r != -EAGAIN)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
	return r;
}

/**
 * amdgpu_device_get_pcie_info - fetch PCIE info about the PCIE slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
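 *
 * (Both masks can be forced via the amdgpu_pcie_gen_cap and
 * amdgpu_pcie_lane_cap overrides checked at the top of this function; when
 * both are set, the probing below is skipped.)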
5154 */ 5155 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5156 { 5157 struct pci_dev *pdev; 5158 enum pci_bus_speed speed_cap, platform_speed_cap; 5159 enum pcie_link_width platform_link_width; 5160 5161 if (amdgpu_pcie_gen_cap) 5162 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5163 5164 if (amdgpu_pcie_lane_cap) 5165 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5166 5167 /* covers APUs as well */ 5168 if (pci_is_root_bus(adev->pdev->bus)) { 5169 if (adev->pm.pcie_gen_mask == 0) 5170 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5171 if (adev->pm.pcie_mlw_mask == 0) 5172 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5173 return; 5174 } 5175 5176 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5177 return; 5178 5179 pcie_bandwidth_available(adev->pdev, NULL, 5180 &platform_speed_cap, &platform_link_width); 5181 5182 if (adev->pm.pcie_gen_mask == 0) { 5183 /* asic caps */ 5184 pdev = adev->pdev; 5185 speed_cap = pcie_get_speed_cap(pdev); 5186 if (speed_cap == PCI_SPEED_UNKNOWN) { 5187 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5188 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5189 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5190 } else { 5191 if (speed_cap == PCIE_SPEED_32_0GT) 5192 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5193 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5194 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5195 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5196 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5197 else if (speed_cap == PCIE_SPEED_16_0GT) 5198 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5199 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5200 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5201 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5202 else if (speed_cap == PCIE_SPEED_8_0GT) 5203 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5204 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5205 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5206 else if (speed_cap == PCIE_SPEED_5_0GT) 5207 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5208 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5209 else 5210 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5211 } 5212 /* platform caps */ 5213 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5214 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5215 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5216 } else { 5217 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5218 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5219 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5220 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5221 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5222 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5223 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5224 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5225 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5226 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5227 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5228 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5229 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5230 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5231 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5232 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5233 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5234 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5235 else 5236 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5237 5238 } 5239 } 5240 if (adev->pm.pcie_mlw_mask == 0) { 5241 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 5242 adev->pm.pcie_mlw_mask 
|= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5243 } else { 5244 switch (platform_link_width) { 5245 case PCIE_LNK_X32: 5246 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5247 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5248 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5249 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5250 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5251 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5252 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5253 break; 5254 case PCIE_LNK_X16: 5255 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5256 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5257 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5258 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5259 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5260 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5261 break; 5262 case PCIE_LNK_X12: 5263 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5264 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5265 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5266 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5267 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5268 break; 5269 case PCIE_LNK_X8: 5270 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5271 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5272 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5273 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5274 break; 5275 case PCIE_LNK_X4: 5276 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5277 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5278 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5279 break; 5280 case PCIE_LNK_X2: 5281 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5282 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5283 break; 5284 case PCIE_LNK_X1: 5285 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5286 break; 5287 default: 5288 break; 5289 } 5290 } 5291 } 5292 } 5293 5294 int amdgpu_device_baco_enter(struct drm_device *dev) 5295 { 5296 struct amdgpu_device *adev = drm_to_adev(dev); 5297 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5298 5299 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5300 return -ENOTSUPP; 5301 5302 if (ras && adev->ras_enabled && 5303 adev->nbio.funcs->enable_doorbell_interrupt) 5304 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5305 5306 return amdgpu_dpm_baco_enter(adev); 5307 } 5308 5309 int amdgpu_device_baco_exit(struct drm_device *dev) 5310 { 5311 struct amdgpu_device *adev = drm_to_adev(dev); 5312 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5313 int ret = 0; 5314 5315 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5316 return -ENOTSUPP; 5317 5318 ret = amdgpu_dpm_baco_exit(adev); 5319 if (ret) 5320 return ret; 5321 5322 if (ras && adev->ras_enabled && 5323 adev->nbio.funcs->enable_doorbell_interrupt) 5324 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5325 5326 return 0; 5327 } 5328 5329 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev) 5330 { 5331 int i; 5332 5333 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5334 struct amdgpu_ring *ring = adev->rings[i]; 5335 5336 if (!ring || !ring->sched.thread) 5337 continue; 5338 5339 cancel_delayed_work_sync(&ring->sched.work_tdr); 5340 } 5341 } 5342 5343 /** 5344 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5345 * @pdev: PCI device struct 5346 * @state: PCI channel state 5347 * 5348 * Description: Called when a PCI error is detected. 5349 * 5350 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
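 *
 * (Hedged note: this and the other amdgpu_pci_* callbacks below are meant to
 * be wired into a struct pci_error_handlers table in the driver's pci_driver;
 * see the illustrative sketch at the end of this file.)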
5351 */ 5352 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 5353 { 5354 struct drm_device *dev = pci_get_drvdata(pdev); 5355 struct amdgpu_device *adev = drm_to_adev(dev); 5356 int i; 5357 5358 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 5359 5360 if (adev->gmc.xgmi.num_physical_nodes > 1) { 5361 DRM_WARN("No support for XGMI hive yet..."); 5362 return PCI_ERS_RESULT_DISCONNECT; 5363 } 5364 5365 switch (state) { 5366 case pci_channel_io_normal: 5367 return PCI_ERS_RESULT_CAN_RECOVER; 5368 /* Fatal error, prepare for slot reset */ 5369 case pci_channel_io_frozen: 5370 /* 5371 * Cancel and wait for all TDRs in progress if failing to 5372 * set adev->in_gpu_reset in amdgpu_device_lock_adev 5373 * 5374 * Locking adev->reset_sem will prevent any external access 5375 * to GPU during PCI error recovery 5376 */ 5377 while (!amdgpu_device_lock_adev(adev, NULL)) 5378 amdgpu_cancel_all_tdr(adev); 5379 5380 /* 5381 * Block any work scheduling as we do for regular GPU reset 5382 * for the duration of the recovery 5383 */ 5384 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5385 struct amdgpu_ring *ring = adev->rings[i]; 5386 5387 if (!ring || !ring->sched.thread) 5388 continue; 5389 5390 drm_sched_stop(&ring->sched, NULL); 5391 } 5392 atomic_inc(&adev->gpu_reset_counter); 5393 return PCI_ERS_RESULT_NEED_RESET; 5394 case pci_channel_io_perm_failure: 5395 /* Permanent error, prepare for device removal */ 5396 return PCI_ERS_RESULT_DISCONNECT; 5397 } 5398 5399 return PCI_ERS_RESULT_NEED_RESET; 5400 } 5401 5402 /** 5403 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 5404 * @pdev: pointer to PCI device 5405 */ 5406 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 5407 { 5408 5409 DRM_INFO("PCI error: mmio enabled callback!!\n"); 5410 5411 /* TODO - dump whatever for debugging purposes */ 5412 5413 /* This called only if amdgpu_pci_error_detected returns 5414 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 5415 * works, no need to reset slot. 5416 */ 5417 5418 return PCI_ERS_RESULT_RECOVERED; 5419 } 5420 5421 /** 5422 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 5423 * @pdev: PCI device struct 5424 * 5425 * Description: This routine is called by the pci error recovery 5426 * code after the PCI slot has been reset, just before we 5427 * should resume normal operations. 
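 *
 * Summary of the recovery below: restore the cached PCI config space, poll
 * the config-space memsize until the ASIC responds, then run the full
 * re-init path with AMDGPU_SKIP_HW_RESET set, since the slot reset already
 * reset the hardware.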
5428 */ 5429 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 5430 { 5431 struct drm_device *dev = pci_get_drvdata(pdev); 5432 struct amdgpu_device *adev = drm_to_adev(dev); 5433 int r, i; 5434 struct amdgpu_reset_context reset_context; 5435 u32 memsize; 5436 struct list_head device_list; 5437 5438 DRM_INFO("PCI error: slot reset callback!!\n"); 5439 5440 memset(&reset_context, 0, sizeof(reset_context)); 5441 5442 INIT_LIST_HEAD(&device_list); 5443 list_add_tail(&adev->reset_list, &device_list); 5444 5445 /* wait for asic to come out of reset */ 5446 msleep(500); 5447 5448 /* Restore PCI confspace */ 5449 amdgpu_device_load_pci_state(pdev); 5450 5451 /* confirm ASIC came out of reset */ 5452 for (i = 0; i < adev->usec_timeout; i++) { 5453 memsize = amdgpu_asic_get_config_memsize(adev); 5454 5455 if (memsize != 0xffffffff) 5456 break; 5457 udelay(1); 5458 } 5459 if (memsize == 0xffffffff) { 5460 r = -ETIME; 5461 goto out; 5462 } 5463 5464 reset_context.method = AMD_RESET_METHOD_NONE; 5465 reset_context.reset_req_dev = adev; 5466 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 5467 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 5468 5469 adev->no_hw_access = true; 5470 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 5471 adev->no_hw_access = false; 5472 if (r) 5473 goto out; 5474 5475 r = amdgpu_do_asic_reset(&device_list, &reset_context); 5476 5477 out: 5478 if (!r) { 5479 if (amdgpu_device_cache_pci_state(adev->pdev)) 5480 pci_restore_state(adev->pdev); 5481 5482 DRM_INFO("PCIe error recovery succeeded\n"); 5483 } else { 5484 DRM_ERROR("PCIe error recovery failed, err:%d", r); 5485 amdgpu_device_unlock_adev(adev); 5486 } 5487 5488 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 5489 } 5490 5491 /** 5492 * amdgpu_pci_resume() - resume normal ops after PCI reset 5493 * @pdev: pointer to PCI device 5494 * 5495 * Called when the error recovery driver tells us that its 5496 * OK to resume normal operation. 
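 *
 * This restarts the schedulers that were stopped in
 * amdgpu_pci_error_detected() and drops the reset lock taken there.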
5497 */ 5498 void amdgpu_pci_resume(struct pci_dev *pdev) 5499 { 5500 struct drm_device *dev = pci_get_drvdata(pdev); 5501 struct amdgpu_device *adev = drm_to_adev(dev); 5502 int i; 5503 5504 5505 DRM_INFO("PCI error: resume callback!!\n"); 5506 5507 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5508 struct amdgpu_ring *ring = adev->rings[i]; 5509 5510 if (!ring || !ring->sched.thread) 5511 continue; 5512 5513 5514 drm_sched_resubmit_jobs(&ring->sched); 5515 drm_sched_start(&ring->sched, true); 5516 } 5517 5518 amdgpu_device_unlock_adev(adev); 5519 } 5520 5521 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 5522 { 5523 struct drm_device *dev = pci_get_drvdata(pdev); 5524 struct amdgpu_device *adev = drm_to_adev(dev); 5525 int r; 5526 5527 r = pci_save_state(pdev); 5528 if (!r) { 5529 kfree(adev->pci_state); 5530 5531 adev->pci_state = pci_store_saved_state(pdev); 5532 5533 if (!adev->pci_state) { 5534 DRM_ERROR("Failed to store PCI saved state"); 5535 return false; 5536 } 5537 } else { 5538 DRM_WARN("Failed to save PCI state, err:%d\n", r); 5539 return false; 5540 } 5541 5542 return true; 5543 } 5544 5545 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 5546 { 5547 struct drm_device *dev = pci_get_drvdata(pdev); 5548 struct amdgpu_device *adev = drm_to_adev(dev); 5549 int r; 5550 5551 if (!adev->pci_state) 5552 return false; 5553 5554 r = pci_load_saved_state(pdev, adev->pci_state); 5555 5556 if (!r) { 5557 pci_restore_state(pdev); 5558 } else { 5559 DRM_WARN("Failed to load PCI state, err:%d\n", r); 5560 return false; 5561 } 5562 5563 return true; 5564 } 5565 5566 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 5567 struct amdgpu_ring *ring) 5568 { 5569 #ifdef CONFIG_X86_64 5570 if (adev->flags & AMD_IS_APU) 5571 return; 5572 #endif 5573 if (adev->gmc.xgmi.connected_to_cpu) 5574 return; 5575 5576 if (ring && ring->funcs->emit_hdp_flush) 5577 amdgpu_ring_emit_hdp_flush(ring); 5578 else 5579 amdgpu_asic_flush_hdp(adev, ring); 5580 } 5581 5582 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 5583 struct amdgpu_ring *ring) 5584 { 5585 #ifdef CONFIG_X86_64 5586 if (adev->flags & AMD_IS_APU) 5587 return; 5588 #endif 5589 if (adev->gmc.xgmi.connected_to_cpu) 5590 return; 5591 5592 amdgpu_asic_invalidate_hdp(adev, ring); 5593 } 5594
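
/*
 * Hedged illustration (not compiled, not part of the original file): how the
 * PCI error callbacks above are typically tied together. The table and driver
 * names used here (example_pci_err_handler, example_pci_driver) are
 * placeholders for illustration only; the driver's real table lives in
 * amdgpu_drv.c.
 */
#if 0
static const struct pci_error_handlers example_pci_err_handler = {
	.error_detected	= amdgpu_pci_error_detected,
	.mmio_enabled	= amdgpu_pci_mmio_enabled,
	.slot_reset	= amdgpu_pci_slot_reset,
	.resume		= amdgpu_pci_resume,
};

static struct pci_driver example_pci_driver = {
	/* .name, .id_table, .probe, .remove, ... */
	.err_handler	= &example_pci_err_handler,
};
#endif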