/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/devcoredump.h>
#include <generated/utsrelease.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_aperture.h>
#include <drm/drm_atomic_helper.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_fb_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86)
#include <asm/intel-family.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
		amdgpu_device_get_pcie_replay_count, NULL);

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * DOC: product_name
 *
 * The amdgpu driver provides a sysfs API for reporting the product name
 * for the device.
 * The file product_name is used for this and returns the product name
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_name(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->product_name);
}

static DEVICE_ATTR(product_name, S_IRUGO,
		amdgpu_device_get_product_name, NULL);

/**
 * DOC: product_number
 *
 * The amdgpu driver provides a sysfs API for reporting the part number
 * for the device.
 * The file product_number is used for this and returns the part number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->product_number);
}

static DEVICE_ATTR(product_number, S_IRUGO,
		amdgpu_device_get_product_number, NULL);

/**
 * DOC: serial_number
 *
 * The amdgpu driver provides a sysfs API for reporting the serial number
 * for the device.
 * The file serial_number is used for this and returns the serial number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_serial_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->serial);
}

static DEVICE_ATTR(serial_number, S_IRUGO,
		amdgpu_device_get_serial_number, NULL);
/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise return false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer pointed to by @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer pointed to by @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer pointed to by @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}
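/*
 * Illustrative only: a caller reading the first dword of VRAM into a local
 * variable with the helper above (for the MM_INDEX/MM_DATA fallback path,
 * @pos and @size must be dword aligned):
 *
 *	uint32_t val;
 *
 *	amdgpu_device_vram_access(adev, 0, &val, sizeof(val), false);
 */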
/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: bytes offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: bytes offset from MMIO start
 * @value: the value want to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}
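/*
 * Illustrative only: most code in the driver reaches the two helpers above
 * through the RREG32()/WREG32() style macros rather than calling them
 * directly, but a direct read-modify-write of a dword register would look
 * like this (reg_offset and SOME_BIT are made-up names):
 *
 *	uint32_t v = amdgpu_device_rreg(adev, reg_offset, 0);
 *
 *	amdgpu_device_wreg(adev, reg_offset, v | SOME_BIT, 0);
 */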
/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * This function is invoked only for the debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if (adev->nbio.funcs->get_pcie_index_hi_offset)
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64 bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if (adev->nbio.funcs->get_pcie_index_hi_offset)
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64 bit indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
		  reg, v);
	BUG();
}
/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	amdgpu_asic_pre_asic_init(adev);

	if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) ||
	    adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0))
		return amdgpu_atomfirmware_asic_init(adev, true);
	else
		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}
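/*
 * Illustrative only: golden register arrays are laid out as
 * { register offset, AND mask, OR mask } triplets; with an AND mask of
 * 0xffffffff the register is simply overwritten with the OR mask.
 * A made-up single-entry sequence would look like:
 *
 *	static const u32 example_golden_settings[] = {
 *		0x1234, 0xffffffff, 0x00000001,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *						ARRAY_SIZE(example_golden_settings));
 */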
/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}
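/*
 * Illustrative only: a ring or IP block that needs a writeback slot
 * typically pairs the two helpers above like so (error handling elided):
 *
 *	u32 wb;
 *
 *	if (!amdgpu_device_wb_get(adev, &wb)) {
 *		cpu_ptr = &adev->wb.wb[wb];
 *		gpu_addr = adev->wb.gpu_addr + wb * 4;
 *		...
 *		amdgpu_device_wb_free(adev, wb);
 *	}
 */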
/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) {
		return false;
	}

	return true;
}

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw needs post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or if post is needed because a hw reset was performed.
 * Returns true if post is needed or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (!amdgpu_device_read_bios(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old smc fw still need the driver to do a vPost, otherwise the gpu hangs.
		 * smc fw versions above 22.15 don't have this flaw, so we force
		 * vpost executed for smc version below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->gmc.xgmi.pending_reset)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic
 * speed switching. Until we have confirmation from Intel that a specific host
 * supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
bool amdgpu_device_pcie_dynamic_switching_supported(void)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}
/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	return pcie_aspm_enabled(adev->pdev);
}

bool amdgpu_device_aspm_support_quirk(void)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE);
#else
	return true;
#endif
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
		bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in the page table versus the
 * page directory; a page is 4KB so we have 12 bits of offset, a minimum of
 * 9 bits in the page table and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines the number of bits in the page table versus the page directory,
	 * a page is 4KB so we have 12 bits of offset, a minimum of 9 bits in the
	 * page table and the remaining bits are in the page directory
	 */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}
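/*
 * Illustrative only: with the minimum block size of 9 bits and 4KB pages,
 * a single page table covers 2^9 * 4KB = 2MB of address space; larger
 * values move more of the translation into the page table level.
 */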
/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if ((adev->pdev->device == 0x13FE) ||
		    (adev->pdev->device == 0x143F))
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		break;
	}

	return 0;
}
/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
		dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
		amdgpu_reset_method = -1;
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	return 0;
}
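/*
 * Illustrative only: the clamping above means that loading the module with,
 * for example, amdgpu.sched_jobs=5 silently rounds the value up to 8 (the
 * next power of two), while a gart size below 32MB falls back to the
 * default of -1 (auto).
 */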
/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(pdev, PCI_D0);
		amdgpu_device_load_pci_state(pdev);
		r = pci_enable_device(pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
	} else {
		pr_info("switched off\n");
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(pdev);
		/* Shut down the device */
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}
/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u64 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}
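/*
 * Illustrative only: a caller that needs a block quiescent before touching
 * it could combine the two helpers above, e.g. for GFX:
 *
 *	if (!amdgpu_device_ip_is_idle(adev, AMD_IP_BLOCK_TYPE_GFX))
 *		amdgpu_device_ip_wait_for_idle(adev, AMD_IP_BLOCK_TYPE_GFX);
 */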
/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if the IP block version is equal or greater than the requested one
 * return 1 if it is smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	switch (ip_block_version->type) {
	case AMD_IP_BLOCK_TYPE_VCN:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
			return 0;
		break;
	case AMD_IP_BLOCK_TYPE_JPEG:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
			return 0;
		break;
	default:
		break;
	}

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}
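/*
 * Illustrative only: since amdgpu_device_ip_block_version_cmp() returns 0
 * when the asic's IP block is at least the requested version, a check for
 * "GMC 8.1 or newer" reads as:
 *
 *	if (!amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GMC, 8, 1))
 *		... take the >= 8.1 path ...
 */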
/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		const char *pci_address_name = pci_name(adev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}

void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
		adev->mode_info.num_crtc = 1;
		adev->enable_virtual_display = true;
		DRM_INFO("virtual_display:%d, num_crtc:%d\n",
			 adev->enable_virtual_display, adev->mode_info.num_crtc);
	}
}
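/*
 * Illustrative only: the virtual_display module parameter is a semicolon
 * separated list of "<pci address>,<number of crtcs>" entries, or "all" to
 * match every device. For example (the address is made up):
 *
 *	modprobe amdgpu virtual_display=0000:03:00.0,2
 */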
		 */
		if (adev->asic_type != CHIP_NAVI12)
			return 0;
	}

	switch (adev->asic_type) {
	default:
		return 0;
	case CHIP_VEGA10:
		chip_name = "vega10";
		break;
	case CHIP_VEGA12:
		chip_name = "vega12";
		break;
	case CHIP_RAVEN:
		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
			chip_name = "raven2";
		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
			chip_name = "picasso";
		else
			chip_name = "raven";
		break;
	case CHIP_ARCTURUS:
		chip_name = "arcturus";
		break;
	case CHIP_NAVI12:
		chip_name = "navi12";
		break;
	}

	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
	err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name);
	if (err) {
		dev_err(adev->dev,
			"Failed to get gpu_info firmware \"%s\"\n",
			fw_name);
		goto out;
	}

	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);

	switch (hdr->version_major) {
	case 1:
	{
		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
								le32_to_cpu(hdr->header.ucode_array_offset_bytes));

		/*
		 * Should be dropped when DAL no longer needs it.
		 */
		if (adev->asic_type == CHIP_NAVI12)
			goto parse_soc_bounding_box;

		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
		adev->gfx.config.max_texture_channel_caches =
			le32_to_cpu(gpu_info_fw->gc_num_tccs);
		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
		adev->gfx.config.double_offchip_lds_buf =
			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
		adev->gfx.cu_info.max_waves_per_simd =
			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
		adev->gfx.cu_info.max_scratch_slots_per_cu =
			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
		if (hdr->version_minor >= 1) {
			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->gfx.config.num_sc_per_sh =
				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
			adev->gfx.config.num_packer_per_sc =
				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
		}

parse_soc_bounding_box:
		/*
		 * The soc bounding box info is not integrated in the discovery
		 * table, so we always need to parse it from the gpu info
		 * firmware when needed.
2041 */ 2042 if (hdr->version_minor == 2) { 2043 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2044 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2045 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2046 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2047 } 2048 break; 2049 } 2050 default: 2051 dev_err(adev->dev, 2052 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2053 err = -EINVAL; 2054 goto out; 2055 } 2056 out: 2057 return err; 2058 } 2059 2060 /** 2061 * amdgpu_device_ip_early_init - run early init for hardware IPs 2062 * 2063 * @adev: amdgpu_device pointer 2064 * 2065 * Early initialization pass for hardware IPs. The hardware IPs that make 2066 * up each asic are discovered each IP's early_init callback is run. This 2067 * is the first stage in initializing the asic. 2068 * Returns 0 on success, negative error code on failure. 2069 */ 2070 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2071 { 2072 struct drm_device *dev = adev_to_drm(adev); 2073 struct pci_dev *parent; 2074 int i, r; 2075 bool total; 2076 2077 amdgpu_device_enable_virtual_display(adev); 2078 2079 if (amdgpu_sriov_vf(adev)) { 2080 r = amdgpu_virt_request_full_gpu(adev, true); 2081 if (r) 2082 return r; 2083 } 2084 2085 switch (adev->asic_type) { 2086 #ifdef CONFIG_DRM_AMDGPU_SI 2087 case CHIP_VERDE: 2088 case CHIP_TAHITI: 2089 case CHIP_PITCAIRN: 2090 case CHIP_OLAND: 2091 case CHIP_HAINAN: 2092 adev->family = AMDGPU_FAMILY_SI; 2093 r = si_set_ip_blocks(adev); 2094 if (r) 2095 return r; 2096 break; 2097 #endif 2098 #ifdef CONFIG_DRM_AMDGPU_CIK 2099 case CHIP_BONAIRE: 2100 case CHIP_HAWAII: 2101 case CHIP_KAVERI: 2102 case CHIP_KABINI: 2103 case CHIP_MULLINS: 2104 if (adev->flags & AMD_IS_APU) 2105 adev->family = AMDGPU_FAMILY_KV; 2106 else 2107 adev->family = AMDGPU_FAMILY_CI; 2108 2109 r = cik_set_ip_blocks(adev); 2110 if (r) 2111 return r; 2112 break; 2113 #endif 2114 case CHIP_TOPAZ: 2115 case CHIP_TONGA: 2116 case CHIP_FIJI: 2117 case CHIP_POLARIS10: 2118 case CHIP_POLARIS11: 2119 case CHIP_POLARIS12: 2120 case CHIP_VEGAM: 2121 case CHIP_CARRIZO: 2122 case CHIP_STONEY: 2123 if (adev->flags & AMD_IS_APU) 2124 adev->family = AMDGPU_FAMILY_CZ; 2125 else 2126 adev->family = AMDGPU_FAMILY_VI; 2127 2128 r = vi_set_ip_blocks(adev); 2129 if (r) 2130 return r; 2131 break; 2132 default: 2133 r = amdgpu_discovery_set_ip_blocks(adev); 2134 if (r) 2135 return r; 2136 break; 2137 } 2138 2139 if (amdgpu_has_atpx() && 2140 (amdgpu_is_atpx_hybrid() || 2141 amdgpu_has_atpx_dgpu_power_cntl()) && 2142 ((adev->flags & AMD_IS_APU) == 0) && 2143 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev))) 2144 adev->flags |= AMD_IS_PX; 2145 2146 if (!(adev->flags & AMD_IS_APU)) { 2147 parent = pci_upstream_bridge(adev->pdev); 2148 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2149 } 2150 2151 2152 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2153 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2154 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2155 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2156 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2157 2158 total = true; 2159 for (i = 0; i < adev->num_ip_blocks; i++) { 2160 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2161 DRM_WARN("disabled ip block: %d <%s>\n", 2162 i, adev->ip_blocks[i].version->funcs->name); 2163 adev->ip_blocks[i].status.valid = false; 2164 } else { 2165 if (adev->ip_blocks[i].version->funcs->early_init) { 2166 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2167 if (r == -ENOENT) { 2168 adev->ip_blocks[i].status.valid = false; 2169 } else if (r) { 2170 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2171 adev->ip_blocks[i].version->funcs->name, r); 2172 total = false; 2173 } else { 2174 adev->ip_blocks[i].status.valid = true; 2175 } 2176 } else { 2177 adev->ip_blocks[i].status.valid = true; 2178 } 2179 } 2180 /* get the vbios after the asic_funcs are set up */ 2181 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2182 r = amdgpu_device_parse_gpu_info_fw(adev); 2183 if (r) 2184 return r; 2185 2186 /* Read BIOS */ 2187 if (amdgpu_device_read_bios(adev)) { 2188 if (!amdgpu_get_bios(adev)) 2189 return -EINVAL; 2190 2191 r = amdgpu_atombios_init(adev); 2192 if (r) { 2193 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2194 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2195 return r; 2196 } 2197 } 2198 2199 /*get pf2vf msg info at it's earliest time*/ 2200 if (amdgpu_sriov_vf(adev)) 2201 amdgpu_virt_init_data_exchange(adev); 2202 2203 } 2204 } 2205 if (!total) 2206 return -ENODEV; 2207 2208 amdgpu_amdkfd_device_probe(adev); 2209 adev->cg_flags &= amdgpu_cg_mask; 2210 adev->pg_flags &= amdgpu_pg_mask; 2211 2212 return 0; 2213 } 2214 2215 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2216 { 2217 int i, r; 2218 2219 for (i = 0; i < adev->num_ip_blocks; i++) { 2220 if (!adev->ip_blocks[i].status.sw) 2221 continue; 2222 if (adev->ip_blocks[i].status.hw) 2223 continue; 2224 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2225 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2226 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2227 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2228 if (r) { 2229 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2230 adev->ip_blocks[i].version->funcs->name, r); 2231 return r; 2232 } 2233 adev->ip_blocks[i].status.hw = true; 2234 } 2235 } 2236 2237 return 0; 2238 } 2239 2240 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2241 { 2242 int i, r; 2243 2244 for (i = 0; i < adev->num_ip_blocks; i++) { 2245 if (!adev->ip_blocks[i].status.sw) 2246 continue; 2247 if (adev->ip_blocks[i].status.hw) 2248 continue; 2249 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2250 if (r) { 2251 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2252 adev->ip_blocks[i].version->funcs->name, r); 2253 return r; 2254 } 2255 adev->ip_blocks[i].status.hw = true; 2256 } 2257 2258 return 0; 2259 } 2260 2261 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2262 { 2263 int r = 0; 2264 int i; 2265 uint32_t smu_version; 2266 2267 if (adev->asic_type >= CHIP_VEGA10) { 2268 for (i = 0; i < adev->num_ip_blocks; i++) { 2269 if 
(adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2270 continue; 2271 2272 if (!adev->ip_blocks[i].status.sw) 2273 continue; 2274 2275 /* no need to do the fw loading again if already done*/ 2276 if (adev->ip_blocks[i].status.hw == true) 2277 break; 2278 2279 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2280 r = adev->ip_blocks[i].version->funcs->resume(adev); 2281 if (r) { 2282 DRM_ERROR("resume of IP block <%s> failed %d\n", 2283 adev->ip_blocks[i].version->funcs->name, r); 2284 return r; 2285 } 2286 } else { 2287 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2288 if (r) { 2289 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2290 adev->ip_blocks[i].version->funcs->name, r); 2291 return r; 2292 } 2293 } 2294 2295 adev->ip_blocks[i].status.hw = true; 2296 break; 2297 } 2298 } 2299 2300 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2301 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2302 2303 return r; 2304 } 2305 2306 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2307 { 2308 long timeout; 2309 int r, i; 2310 2311 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2312 struct amdgpu_ring *ring = adev->rings[i]; 2313 2314 /* No need to setup the GPU scheduler for rings that don't need it */ 2315 if (!ring || ring->no_scheduler) 2316 continue; 2317 2318 switch (ring->funcs->type) { 2319 case AMDGPU_RING_TYPE_GFX: 2320 timeout = adev->gfx_timeout; 2321 break; 2322 case AMDGPU_RING_TYPE_COMPUTE: 2323 timeout = adev->compute_timeout; 2324 break; 2325 case AMDGPU_RING_TYPE_SDMA: 2326 timeout = adev->sdma_timeout; 2327 break; 2328 default: 2329 timeout = adev->video_timeout; 2330 break; 2331 } 2332 2333 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2334 ring->num_hw_submission, 0, 2335 timeout, adev->reset_domain->wq, 2336 ring->sched_score, ring->name, 2337 adev->dev); 2338 if (r) { 2339 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2340 ring->name); 2341 return r; 2342 } 2343 } 2344 2345 amdgpu_xcp_update_partition_sched_list(adev); 2346 2347 return 0; 2348 } 2349 2350 2351 /** 2352 * amdgpu_device_ip_init - run init for hardware IPs 2353 * 2354 * @adev: amdgpu_device pointer 2355 * 2356 * Main initialization pass for hardware IPs. The list of all the hardware 2357 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2358 * are run. sw_init initializes the software state associated with each IP 2359 * and hw_init initializes the hardware associated with each IP. 2360 * Returns 0 on success, negative error code on failure. 
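 *
 * A rough sketch of the ordering implemented below: sw_init for every valid
 * block (with COMMON and GMC hw_init pulled forward so GPU memory can be
 * allocated), IB pool and ucode BO creation, hw_init phase 1 (COMMON/IH and
 * PSP under SR-IOV), firmware loading, and finally hw_init phase 2 for the
 * remaining blocks.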
2361 */ 2362 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2363 { 2364 int i, r; 2365 2366 r = amdgpu_ras_init(adev); 2367 if (r) 2368 return r; 2369 2370 for (i = 0; i < adev->num_ip_blocks; i++) { 2371 if (!adev->ip_blocks[i].status.valid) 2372 continue; 2373 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2374 if (r) { 2375 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2376 adev->ip_blocks[i].version->funcs->name, r); 2377 goto init_failed; 2378 } 2379 adev->ip_blocks[i].status.sw = true; 2380 2381 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2382 /* need to do common hw init early so everything is set up for gmc */ 2383 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2384 if (r) { 2385 DRM_ERROR("hw_init %d failed %d\n", i, r); 2386 goto init_failed; 2387 } 2388 adev->ip_blocks[i].status.hw = true; 2389 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2390 /* need to do gmc hw init early so we can allocate gpu mem */ 2391 /* Try to reserve bad pages early */ 2392 if (amdgpu_sriov_vf(adev)) 2393 amdgpu_virt_exchange_data(adev); 2394 2395 r = amdgpu_device_mem_scratch_init(adev); 2396 if (r) { 2397 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2398 goto init_failed; 2399 } 2400 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2401 if (r) { 2402 DRM_ERROR("hw_init %d failed %d\n", i, r); 2403 goto init_failed; 2404 } 2405 r = amdgpu_device_wb_init(adev); 2406 if (r) { 2407 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2408 goto init_failed; 2409 } 2410 adev->ip_blocks[i].status.hw = true; 2411 2412 /* right after GMC hw init, we create CSA */ 2413 if (adev->gfx.mcbp) { 2414 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2415 AMDGPU_GEM_DOMAIN_VRAM | 2416 AMDGPU_GEM_DOMAIN_GTT, 2417 AMDGPU_CSA_SIZE); 2418 if (r) { 2419 DRM_ERROR("allocate CSA failed %d\n", r); 2420 goto init_failed; 2421 } 2422 } 2423 } 2424 } 2425 2426 if (amdgpu_sriov_vf(adev)) 2427 amdgpu_virt_init_data_exchange(adev); 2428 2429 r = amdgpu_ib_pool_init(adev); 2430 if (r) { 2431 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2432 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2433 goto init_failed; 2434 } 2435 2436 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2437 if (r) 2438 goto init_failed; 2439 2440 r = amdgpu_device_ip_hw_init_phase1(adev); 2441 if (r) 2442 goto init_failed; 2443 2444 r = amdgpu_device_fw_loading(adev); 2445 if (r) 2446 goto init_failed; 2447 2448 r = amdgpu_device_ip_hw_init_phase2(adev); 2449 if (r) 2450 goto init_failed; 2451 2452 /* 2453 * retired pages will be loaded from eeprom and reserved here, 2454 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2455 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2456 * for I2C communication which only true at this point. 2457 * 2458 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2459 * failure from bad gpu situation and stop amdgpu init process 2460 * accordingly. For other failed cases, it will still release all 2461 * the resource and print error message, rather than returning one 2462 * negative value to upper level. 
2463 * 2464 * Note: theoretically, this should be called before all vram allocations 2465 * to protect retired page from abusing 2466 */ 2467 r = amdgpu_ras_recovery_init(adev); 2468 if (r) 2469 goto init_failed; 2470 2471 /** 2472 * In case of XGMI grab extra reference for reset domain for this device 2473 */ 2474 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2475 if (amdgpu_xgmi_add_device(adev) == 0) { 2476 if (!amdgpu_sriov_vf(adev)) { 2477 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2478 2479 if (WARN_ON(!hive)) { 2480 r = -ENOENT; 2481 goto init_failed; 2482 } 2483 2484 if (!hive->reset_domain || 2485 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2486 r = -ENOENT; 2487 amdgpu_put_xgmi_hive(hive); 2488 goto init_failed; 2489 } 2490 2491 /* Drop the early temporary reset domain we created for device */ 2492 amdgpu_reset_put_reset_domain(adev->reset_domain); 2493 adev->reset_domain = hive->reset_domain; 2494 amdgpu_put_xgmi_hive(hive); 2495 } 2496 } 2497 } 2498 2499 r = amdgpu_device_init_schedulers(adev); 2500 if (r) 2501 goto init_failed; 2502 2503 /* Don't init kfd if whole hive need to be reset during init */ 2504 if (!adev->gmc.xgmi.pending_reset) { 2505 kgd2kfd_init_zone_device(adev); 2506 amdgpu_amdkfd_device_init(adev); 2507 } 2508 2509 amdgpu_fru_get_product_info(adev); 2510 2511 init_failed: 2512 2513 return r; 2514 } 2515 2516 /** 2517 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2518 * 2519 * @adev: amdgpu_device pointer 2520 * 2521 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2522 * this function before a GPU reset. If the value is retained after a 2523 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 2524 */ 2525 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2526 { 2527 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2528 } 2529 2530 /** 2531 * amdgpu_device_check_vram_lost - check if vram is valid 2532 * 2533 * @adev: amdgpu_device pointer 2534 * 2535 * Checks the reset magic value written to the gart pointer in VRAM. 2536 * The driver calls this after a GPU reset to see if the contents of 2537 * VRAM is lost or now. 2538 * returns true if vram is lost, false if not. 2539 */ 2540 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2541 { 2542 if (memcmp(adev->gart.ptr, adev->reset_magic, 2543 AMDGPU_RESET_MAGIC_NUM)) 2544 return true; 2545 2546 if (!amdgpu_in_reset(adev)) 2547 return false; 2548 2549 /* 2550 * For all ASICs with baco/mode1 reset, the VRAM is 2551 * always assumed to be lost. 2552 */ 2553 switch (amdgpu_asic_reset_method(adev)) { 2554 case AMD_RESET_METHOD_BACO: 2555 case AMD_RESET_METHOD_MODE1: 2556 return true; 2557 default: 2558 return false; 2559 } 2560 } 2561 2562 /** 2563 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2564 * 2565 * @adev: amdgpu_device pointer 2566 * @state: clockgating state (gate or ungate) 2567 * 2568 * The list of all the hardware IPs that make up the asic is walked and the 2569 * set_clockgating_state callbacks are run. 2570 * Late initialization pass enabling clockgating for hardware IPs. 2571 * Fini or suspend, pass disabling clockgating for hardware IPs. 2572 * Returns 0 on success, negative error code on failure. 
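 *
 * Note that the walk direction depends on @state: blocks are gated in IP
 * order and ungated in reverse IP order.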
2573 */ 2574 2575 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2576 enum amd_clockgating_state state) 2577 { 2578 int i, j, r; 2579 2580 if (amdgpu_emu_mode == 1) 2581 return 0; 2582 2583 for (j = 0; j < adev->num_ip_blocks; j++) { 2584 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2585 if (!adev->ip_blocks[i].status.late_initialized) 2586 continue; 2587 /* skip CG for GFX, SDMA on S0ix */ 2588 if (adev->in_s0ix && 2589 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2590 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2591 continue; 2592 /* skip CG for VCE/UVD, it's handled specially */ 2593 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2594 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2595 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2596 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2597 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2598 /* enable clockgating to save power */ 2599 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2600 state); 2601 if (r) { 2602 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2603 adev->ip_blocks[i].version->funcs->name, r); 2604 return r; 2605 } 2606 } 2607 } 2608 2609 return 0; 2610 } 2611 2612 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2613 enum amd_powergating_state state) 2614 { 2615 int i, j, r; 2616 2617 if (amdgpu_emu_mode == 1) 2618 return 0; 2619 2620 for (j = 0; j < adev->num_ip_blocks; j++) { 2621 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2622 if (!adev->ip_blocks[i].status.late_initialized) 2623 continue; 2624 /* skip PG for GFX, SDMA on S0ix */ 2625 if (adev->in_s0ix && 2626 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2627 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2628 continue; 2629 /* skip CG for VCE/UVD, it's handled specially */ 2630 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2631 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2632 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2633 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2634 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2635 /* enable powergating to save power */ 2636 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2637 state); 2638 if (r) { 2639 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2640 adev->ip_blocks[i].version->funcs->name, r); 2641 return r; 2642 } 2643 } 2644 } 2645 return 0; 2646 } 2647 2648 static int amdgpu_device_enable_mgpu_fan_boost(void) 2649 { 2650 struct amdgpu_gpu_instance *gpu_ins; 2651 struct amdgpu_device *adev; 2652 int i, ret = 0; 2653 2654 mutex_lock(&mgpu_info.mutex); 2655 2656 /* 2657 * MGPU fan boost feature should be enabled 2658 * only when there are two or more dGPUs in 2659 * the system 2660 */ 2661 if (mgpu_info.num_dgpu < 2) 2662 goto out; 2663 2664 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2665 gpu_ins = &(mgpu_info.gpu_ins[i]); 2666 adev = gpu_ins->adev; 2667 if (!(adev->flags & AMD_IS_APU) && 2668 !gpu_ins->mgpu_fan_enabled) { 2669 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2670 if (ret) 2671 break; 2672 2673 gpu_ins->mgpu_fan_enabled = 1; 2674 } 2675 } 2676 2677 out: 2678 mutex_unlock(&mgpu_info.mutex); 2679 2680 return ret; 2681 } 2682 2683 /** 2684 * amdgpu_device_ip_late_init - run late init for hardware IPs 2685 * 2686 * @adev: 
amdgpu_device pointer
 *
 * Late initialization pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the late_init callbacks are run.
 * late_init covers any special initialization that an IP requires
 * after all of them have been initialized or something that needs to happen
 * late in the init process.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
{
	struct amdgpu_gpu_instance *gpu_instance;
	int i = 0, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_init) {
			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
			if (r) {
				DRM_ERROR("late_init of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
		adev->ip_blocks[i].status.late_initialized = true;
	}

	r = amdgpu_ras_late_init(adev);
	if (r) {
		DRM_ERROR("amdgpu_ras_late_init failed %d", r);
		return r;
	}

	amdgpu_ras_set_error_query_ready(adev, true);

	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

	amdgpu_device_fill_reset_magic(adev);

	r = amdgpu_device_enable_mgpu_fan_boost();
	if (r)
		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

	/* For passthrough configurations on arcturus and aldebaran, enable special handling of SBR */
	if (amdgpu_passthrough(adev) &&
	    ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
	     adev->asic_type == CHIP_ALDEBARAN))
		amdgpu_dpm_handle_passthrough_sbr(adev, true);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		mutex_lock(&mgpu_info.mutex);

		/*
		 * Reset the device p-state to low, as this was booted with high.
		 *
		 * This should be performed only after all devices from the same
		 * hive get initialized.
		 *
		 * However, the number of devices in the hive is not known in
		 * advance; it is counted one by one as each device initializes.
		 *
		 * So, we wait until all XGMI interlinked devices are initialized.
		 * This may bring some delays as those devices may come from
		 * different hives. But that should be OK.
2752 */ 2753 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2754 for (i = 0; i < mgpu_info.num_gpu; i++) { 2755 gpu_instance = &(mgpu_info.gpu_ins[i]); 2756 if (gpu_instance->adev->flags & AMD_IS_APU) 2757 continue; 2758 2759 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2760 AMDGPU_XGMI_PSTATE_MIN); 2761 if (r) { 2762 DRM_ERROR("pstate setting failed (%d).\n", r); 2763 break; 2764 } 2765 } 2766 } 2767 2768 mutex_unlock(&mgpu_info.mutex); 2769 } 2770 2771 return 0; 2772 } 2773 2774 /** 2775 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2776 * 2777 * @adev: amdgpu_device pointer 2778 * 2779 * For ASICs need to disable SMC first 2780 */ 2781 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2782 { 2783 int i, r; 2784 2785 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) 2786 return; 2787 2788 for (i = 0; i < adev->num_ip_blocks; i++) { 2789 if (!adev->ip_blocks[i].status.hw) 2790 continue; 2791 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2792 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2793 /* XXX handle errors */ 2794 if (r) { 2795 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2796 adev->ip_blocks[i].version->funcs->name, r); 2797 } 2798 adev->ip_blocks[i].status.hw = false; 2799 break; 2800 } 2801 } 2802 } 2803 2804 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2805 { 2806 int i, r; 2807 2808 for (i = 0; i < adev->num_ip_blocks; i++) { 2809 if (!adev->ip_blocks[i].version->funcs->early_fini) 2810 continue; 2811 2812 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2813 if (r) { 2814 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2815 adev->ip_blocks[i].version->funcs->name, r); 2816 } 2817 } 2818 2819 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2820 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2821 2822 amdgpu_amdkfd_suspend(adev, false); 2823 2824 /* Workaroud for ASICs need to disable SMC first */ 2825 amdgpu_device_smu_fini_early(adev); 2826 2827 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2828 if (!adev->ip_blocks[i].status.hw) 2829 continue; 2830 2831 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2832 /* XXX handle errors */ 2833 if (r) { 2834 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2835 adev->ip_blocks[i].version->funcs->name, r); 2836 } 2837 2838 adev->ip_blocks[i].status.hw = false; 2839 } 2840 2841 if (amdgpu_sriov_vf(adev)) { 2842 if (amdgpu_virt_release_full_gpu(adev, false)) 2843 DRM_ERROR("failed to release exclusive mode on fini\n"); 2844 } 2845 2846 return 0; 2847 } 2848 2849 /** 2850 * amdgpu_device_ip_fini - run fini for hardware IPs 2851 * 2852 * @adev: amdgpu_device pointer 2853 * 2854 * Main teardown pass for hardware IPs. The list of all the hardware 2855 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2856 * are run. hw_fini tears down the hardware associated with each IP 2857 * and sw_fini tears down any software state associated with each IP. 2858 * Returns 0 on success, negative error code on failure. 
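 *
 * Note that hw_fini itself is performed earlier, in
 * amdgpu_device_ip_fini_early(); this function walks the blocks in reverse
 * order to run sw_fini and late_fini.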
2859 */ 2860 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2861 { 2862 int i, r; 2863 2864 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2865 amdgpu_virt_release_ras_err_handler_data(adev); 2866 2867 if (adev->gmc.xgmi.num_physical_nodes > 1) 2868 amdgpu_xgmi_remove_device(adev); 2869 2870 amdgpu_amdkfd_device_fini_sw(adev); 2871 2872 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2873 if (!adev->ip_blocks[i].status.sw) 2874 continue; 2875 2876 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2877 amdgpu_ucode_free_bo(adev); 2878 amdgpu_free_static_csa(&adev->virt.csa_obj); 2879 amdgpu_device_wb_fini(adev); 2880 amdgpu_device_mem_scratch_fini(adev); 2881 amdgpu_ib_pool_fini(adev); 2882 } 2883 2884 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2885 /* XXX handle errors */ 2886 if (r) { 2887 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2888 adev->ip_blocks[i].version->funcs->name, r); 2889 } 2890 adev->ip_blocks[i].status.sw = false; 2891 adev->ip_blocks[i].status.valid = false; 2892 } 2893 2894 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2895 if (!adev->ip_blocks[i].status.late_initialized) 2896 continue; 2897 if (adev->ip_blocks[i].version->funcs->late_fini) 2898 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2899 adev->ip_blocks[i].status.late_initialized = false; 2900 } 2901 2902 amdgpu_ras_fini(adev); 2903 2904 return 0; 2905 } 2906 2907 /** 2908 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2909 * 2910 * @work: work_struct. 2911 */ 2912 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2913 { 2914 struct amdgpu_device *adev = 2915 container_of(work, struct amdgpu_device, delayed_init_work.work); 2916 int r; 2917 2918 r = amdgpu_ib_ring_tests(adev); 2919 if (r) 2920 DRM_ERROR("ib ring test failed (%d).\n", r); 2921 } 2922 2923 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2924 { 2925 struct amdgpu_device *adev = 2926 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2927 2928 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2929 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2930 2931 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2932 adev->gfx.gfx_off_state = true; 2933 } 2934 2935 /** 2936 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2937 * 2938 * @adev: amdgpu_device pointer 2939 * 2940 * Main suspend function for hardware IPs. The list of all the hardware 2941 * IPs that make up the asic is walked, clockgating is disabled and the 2942 * suspend callbacks are run. suspend puts the hardware and software state 2943 * in each IP into a state suitable for suspend. 2944 * Returns 0 on success, negative error code on failure. 2945 */ 2946 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2947 { 2948 int i, r; 2949 2950 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2951 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2952 2953 /* 2954 * Per PMFW team's suggestion, driver needs to handle gfxoff 2955 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 2956 * scenario. Add the missing df cstate disablement here. 
2957 */ 2958 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 2959 dev_warn(adev->dev, "Failed to disallow df cstate"); 2960 2961 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2962 if (!adev->ip_blocks[i].status.valid) 2963 continue; 2964 2965 /* displays are handled separately */ 2966 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2967 continue; 2968 2969 /* XXX handle errors */ 2970 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2971 /* XXX handle errors */ 2972 if (r) { 2973 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2974 adev->ip_blocks[i].version->funcs->name, r); 2975 return r; 2976 } 2977 2978 adev->ip_blocks[i].status.hw = false; 2979 } 2980 2981 return 0; 2982 } 2983 2984 /** 2985 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2986 * 2987 * @adev: amdgpu_device pointer 2988 * 2989 * Main suspend function for hardware IPs. The list of all the hardware 2990 * IPs that make up the asic is walked, clockgating is disabled and the 2991 * suspend callbacks are run. suspend puts the hardware and software state 2992 * in each IP into a state suitable for suspend. 2993 * Returns 0 on success, negative error code on failure. 2994 */ 2995 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2996 { 2997 int i, r; 2998 2999 if (adev->in_s0ix) 3000 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3001 3002 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3003 if (!adev->ip_blocks[i].status.valid) 3004 continue; 3005 /* displays are handled in phase1 */ 3006 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3007 continue; 3008 /* PSP lost connection when err_event_athub occurs */ 3009 if (amdgpu_ras_intr_triggered() && 3010 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3011 adev->ip_blocks[i].status.hw = false; 3012 continue; 3013 } 3014 3015 /* skip unnecessary suspend if we do not initialize them yet */ 3016 if (adev->gmc.xgmi.pending_reset && 3017 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3018 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3019 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3020 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3021 adev->ip_blocks[i].status.hw = false; 3022 continue; 3023 } 3024 3025 /* skip suspend of gfx/mes and psp for S0ix 3026 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3027 * like at runtime. PSP is also part of the always on hardware 3028 * so no need to suspend it. 3029 */ 3030 if (adev->in_s0ix && 3031 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3032 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3033 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3034 continue; 3035 3036 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3037 if (adev->in_s0ix && 3038 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) && 3039 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3040 continue; 3041 3042 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3043 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3044 * from this location and RLC Autoload automatically also gets loaded 3045 * from here based on PMFW -> PSP message during re-init sequence. 3046 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3047 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3048 */ 3049 if (amdgpu_in_reset(adev) && 3050 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3051 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3052 continue; 3053 3054 /* XXX handle errors */ 3055 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3056 /* XXX handle errors */ 3057 if (r) { 3058 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3059 adev->ip_blocks[i].version->funcs->name, r); 3060 } 3061 adev->ip_blocks[i].status.hw = false; 3062 /* handle putting the SMC in the appropriate state */ 3063 if (!amdgpu_sriov_vf(adev)) { 3064 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3065 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3066 if (r) { 3067 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3068 adev->mp1_state, r); 3069 return r; 3070 } 3071 } 3072 } 3073 } 3074 3075 return 0; 3076 } 3077 3078 /** 3079 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3080 * 3081 * @adev: amdgpu_device pointer 3082 * 3083 * Main suspend function for hardware IPs. The list of all the hardware 3084 * IPs that make up the asic is walked, clockgating is disabled and the 3085 * suspend callbacks are run. suspend puts the hardware and software state 3086 * in each IP into a state suitable for suspend. 3087 * Returns 0 on success, negative error code on failure. 3088 */ 3089 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3090 { 3091 int r; 3092 3093 if (amdgpu_sriov_vf(adev)) { 3094 amdgpu_virt_fini_data_exchange(adev); 3095 amdgpu_virt_request_full_gpu(adev, false); 3096 } 3097 3098 r = amdgpu_device_ip_suspend_phase1(adev); 3099 if (r) 3100 return r; 3101 r = amdgpu_device_ip_suspend_phase2(adev); 3102 3103 if (amdgpu_sriov_vf(adev)) 3104 amdgpu_virt_release_full_gpu(adev, false); 3105 3106 return r; 3107 } 3108 3109 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3110 { 3111 int i, r; 3112 3113 static enum amd_ip_block_type ip_order[] = { 3114 AMD_IP_BLOCK_TYPE_COMMON, 3115 AMD_IP_BLOCK_TYPE_GMC, 3116 AMD_IP_BLOCK_TYPE_PSP, 3117 AMD_IP_BLOCK_TYPE_IH, 3118 }; 3119 3120 for (i = 0; i < adev->num_ip_blocks; i++) { 3121 int j; 3122 struct amdgpu_ip_block *block; 3123 3124 block = &adev->ip_blocks[i]; 3125 block->status.hw = false; 3126 3127 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3128 3129 if (block->version->type != ip_order[j] || 3130 !block->status.valid) 3131 continue; 3132 3133 r = block->version->funcs->hw_init(adev); 3134 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3135 if (r) 3136 return r; 3137 block->status.hw = true; 3138 } 3139 } 3140 3141 return 0; 3142 } 3143 3144 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3145 { 3146 int i, r; 3147 3148 static enum amd_ip_block_type ip_order[] = { 3149 AMD_IP_BLOCK_TYPE_SMC, 3150 AMD_IP_BLOCK_TYPE_DCE, 3151 AMD_IP_BLOCK_TYPE_GFX, 3152 AMD_IP_BLOCK_TYPE_SDMA, 3153 AMD_IP_BLOCK_TYPE_MES, 3154 AMD_IP_BLOCK_TYPE_UVD, 3155 AMD_IP_BLOCK_TYPE_VCE, 3156 AMD_IP_BLOCK_TYPE_VCN, 3157 AMD_IP_BLOCK_TYPE_JPEG 3158 }; 3159 3160 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3161 int j; 3162 struct amdgpu_ip_block *block; 3163 3164 for (j = 0; j < adev->num_ip_blocks; j++) { 3165 block = &adev->ip_blocks[j]; 3166 3167 if (block->version->type != ip_order[i] || 3168 !block->status.valid || 3169 block->status.hw) 3170 continue; 3171 3172 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3173 r = block->version->funcs->resume(adev); 3174 else 3175 r = block->version->funcs->hw_init(adev); 3176 3177 
			DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
			if (r)
				return r;
			block->status.hw = true;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * First resume function for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the resume callbacks are run for
 * COMMON, GMC, and IH (plus PSP when running under SR-IOV). resume puts
 * the hardware into a functional state after a suspend and updates the
 * software state as necessary. This function is also used for restoring
 * the GPU after a GPU reset.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {

			r = adev->ip_blocks[i].version->funcs->resume(adev);
			if (r) {
				DRM_ERROR("resume of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
			adev->ip_blocks[i].status.hw = true;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Second resume function for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the resume callbacks are run for
 * all blocks except COMMON, GMC, IH, and PSP. resume puts the hardware
 * into a functional state after a suspend and updates the software state
 * as necessary. This function is also used for restoring the GPU after a
 * GPU reset.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
			continue;
		r = adev->ip_blocks[i].version->funcs->resume(adev);
		if (r) {
			DRM_ERROR("resume of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
			return r;
		}
		adev->ip_blocks[i].status.hw = true;
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main resume function for hardware IPs. The hardware IPs are split into
 * two resume functions because they are also used in recovering from a GPU
 * reset, and some additional steps need to be taken between them. In this
 * case (S3/S4) they are run sequentially.
 * Returns 0 on success, negative error code on failure.
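 *
 * Between the two phases, amdgpu_device_fw_loading() brings up the PSP
 * block and loads the SMU firmware, so the phase 2 blocks resume against
 * fully loaded firmware.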
3272 */ 3273 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3274 { 3275 int r; 3276 3277 if (!adev->in_s0ix) { 3278 r = amdgpu_amdkfd_resume_iommu(adev); 3279 if (r) 3280 return r; 3281 } 3282 3283 r = amdgpu_device_ip_resume_phase1(adev); 3284 if (r) 3285 return r; 3286 3287 r = amdgpu_device_fw_loading(adev); 3288 if (r) 3289 return r; 3290 3291 r = amdgpu_device_ip_resume_phase2(adev); 3292 3293 return r; 3294 } 3295 3296 /** 3297 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3298 * 3299 * @adev: amdgpu_device pointer 3300 * 3301 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3302 */ 3303 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3304 { 3305 if (amdgpu_sriov_vf(adev)) { 3306 if (adev->is_atom_fw) { 3307 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3308 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3309 } else { 3310 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3311 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3312 } 3313 3314 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3315 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3316 } 3317 } 3318 3319 /** 3320 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3321 * 3322 * @asic_type: AMD asic type 3323 * 3324 * Check if there is DC (new modesetting infrastructre) support for an asic. 3325 * returns true if DC has support, false if not. 3326 */ 3327 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3328 { 3329 switch (asic_type) { 3330 #ifdef CONFIG_DRM_AMDGPU_SI 3331 case CHIP_HAINAN: 3332 #endif 3333 case CHIP_TOPAZ: 3334 /* chips with no display hardware */ 3335 return false; 3336 #if defined(CONFIG_DRM_AMD_DC) 3337 case CHIP_TAHITI: 3338 case CHIP_PITCAIRN: 3339 case CHIP_VERDE: 3340 case CHIP_OLAND: 3341 /* 3342 * We have systems in the wild with these ASICs that require 3343 * LVDS and VGA support which is not supported with DC. 3344 * 3345 * Fallback to the non-DC driver here by default so as not to 3346 * cause regressions. 3347 */ 3348 #if defined(CONFIG_DRM_AMD_DC_SI) 3349 return amdgpu_dc > 0; 3350 #else 3351 return false; 3352 #endif 3353 case CHIP_BONAIRE: 3354 case CHIP_KAVERI: 3355 case CHIP_KABINI: 3356 case CHIP_MULLINS: 3357 /* 3358 * We have systems in the wild with these ASICs that require 3359 * VGA support which is not supported with DC. 3360 * 3361 * Fallback to the non-DC driver here by default so as not to 3362 * cause regressions. 
3363 */ 3364 return amdgpu_dc > 0; 3365 default: 3366 return amdgpu_dc != 0; 3367 #else 3368 default: 3369 if (amdgpu_dc > 0) 3370 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3371 "but isn't supported by ASIC, ignoring\n"); 3372 return false; 3373 #endif 3374 } 3375 } 3376 3377 /** 3378 * amdgpu_device_has_dc_support - check if dc is supported 3379 * 3380 * @adev: amdgpu_device pointer 3381 * 3382 * Returns true for supported, false for not supported 3383 */ 3384 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3385 { 3386 if (adev->enable_virtual_display || 3387 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3388 return false; 3389 3390 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3391 } 3392 3393 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3394 { 3395 struct amdgpu_device *adev = 3396 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3397 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3398 3399 /* It's a bug to not have a hive within this function */ 3400 if (WARN_ON(!hive)) 3401 return; 3402 3403 /* 3404 * Use task barrier to synchronize all xgmi reset works across the 3405 * hive. task_barrier_enter and task_barrier_exit will block 3406 * until all the threads running the xgmi reset works reach 3407 * those points. task_barrier_full will do both blocks. 3408 */ 3409 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3410 3411 task_barrier_enter(&hive->tb); 3412 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3413 3414 if (adev->asic_reset_res) 3415 goto fail; 3416 3417 task_barrier_exit(&hive->tb); 3418 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3419 3420 if (adev->asic_reset_res) 3421 goto fail; 3422 3423 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3424 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3425 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3426 } else { 3427 3428 task_barrier_full(&hive->tb); 3429 adev->asic_reset_res = amdgpu_asic_reset(adev); 3430 } 3431 3432 fail: 3433 if (adev->asic_reset_res) 3434 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3435 adev->asic_reset_res, adev_to_drm(adev)->unique); 3436 amdgpu_put_xgmi_hive(hive); 3437 } 3438 3439 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3440 { 3441 char *input = amdgpu_lockup_timeout; 3442 char *timeout_setting = NULL; 3443 int index = 0; 3444 long timeout; 3445 int ret = 0; 3446 3447 /* 3448 * By default timeout for non compute jobs is 10000 3449 * and 60000 for compute jobs. 3450 * In SR-IOV or passthrough mode, timeout for compute 3451 * jobs are 60000 by default. 3452 */ 3453 adev->gfx_timeout = msecs_to_jiffies(10000); 3454 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3455 if (amdgpu_sriov_vf(adev)) 3456 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3457 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3458 else 3459 adev->compute_timeout = msecs_to_jiffies(60000); 3460 3461 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3462 while ((timeout_setting = strsep(&input, ",")) && 3463 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3464 ret = kstrtol(timeout_setting, 0, &timeout); 3465 if (ret) 3466 return ret; 3467 3468 if (timeout == 0) { 3469 index++; 3470 continue; 3471 } else if (timeout < 0) { 3472 timeout = MAX_SCHEDULE_TIMEOUT; 3473 dev_warn(adev->dev, "lockup timeout disabled"); 3474 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3475 } else { 3476 timeout = msecs_to_jiffies(timeout); 3477 } 3478 3479 switch (index++) { 3480 case 0: 3481 adev->gfx_timeout = timeout; 3482 break; 3483 case 1: 3484 adev->compute_timeout = timeout; 3485 break; 3486 case 2: 3487 adev->sdma_timeout = timeout; 3488 break; 3489 case 3: 3490 adev->video_timeout = timeout; 3491 break; 3492 default: 3493 break; 3494 } 3495 } 3496 /* 3497 * There is only one value specified and 3498 * it should apply to all non-compute jobs. 3499 */ 3500 if (index == 1) { 3501 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3502 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3503 adev->compute_timeout = adev->gfx_timeout; 3504 } 3505 } 3506 3507 return ret; 3508 } 3509 3510 /** 3511 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3512 * 3513 * @adev: amdgpu_device pointer 3514 * 3515 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3516 */ 3517 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3518 { 3519 struct iommu_domain *domain; 3520 3521 domain = iommu_get_domain_for_dev(adev->dev); 3522 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3523 adev->ram_is_direct_mapped = true; 3524 } 3525 3526 static const struct attribute *amdgpu_dev_attributes[] = { 3527 &dev_attr_product_name.attr, 3528 &dev_attr_product_number.attr, 3529 &dev_attr_serial_number.attr, 3530 &dev_attr_pcie_replay_count.attr, 3531 NULL 3532 }; 3533 3534 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 3535 { 3536 if (amdgpu_mcbp == 1) 3537 adev->gfx.mcbp = true; 3538 3539 if ((adev->ip_versions[GC_HWIP][0] >= IP_VERSION(9, 0, 0)) && 3540 (adev->ip_versions[GC_HWIP][0] < IP_VERSION(10, 0, 0)) && 3541 adev->gfx.num_gfx_rings) 3542 adev->gfx.mcbp = true; 3543 3544 if (amdgpu_sriov_vf(adev)) 3545 adev->gfx.mcbp = true; 3546 3547 if (adev->gfx.mcbp) 3548 DRM_INFO("MCBP is enabled\n"); 3549 } 3550 3551 /** 3552 * amdgpu_device_init - initialize the driver 3553 * 3554 * @adev: amdgpu_device pointer 3555 * @flags: driver flags 3556 * 3557 * Initializes the driver info and hw (all asics). 3558 * Returns 0 for success or an error on failure. 3559 * Called at driver startup. 
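 *
 * Broadly, the sequence below is: virtualization detection and early IP
 * init, an optional ASIC reset and vBIOS post, ATOM BIOS clock setup, fence
 * driver and full IP init, then late init plus sysfs/PMU registration and
 * vga_switcheroo/runtime-PM hookup.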
3560 */ 3561 int amdgpu_device_init(struct amdgpu_device *adev, 3562 uint32_t flags) 3563 { 3564 struct drm_device *ddev = adev_to_drm(adev); 3565 struct pci_dev *pdev = adev->pdev; 3566 int r, i; 3567 bool px = false; 3568 u32 max_MBps; 3569 int tmp; 3570 3571 adev->shutdown = false; 3572 adev->flags = flags; 3573 3574 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3575 adev->asic_type = amdgpu_force_asic_type; 3576 else 3577 adev->asic_type = flags & AMD_ASIC_MASK; 3578 3579 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3580 if (amdgpu_emu_mode == 1) 3581 adev->usec_timeout *= 10; 3582 adev->gmc.gart_size = 512 * 1024 * 1024; 3583 adev->accel_working = false; 3584 adev->num_rings = 0; 3585 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3586 adev->mman.buffer_funcs = NULL; 3587 adev->mman.buffer_funcs_ring = NULL; 3588 adev->vm_manager.vm_pte_funcs = NULL; 3589 adev->vm_manager.vm_pte_num_scheds = 0; 3590 adev->gmc.gmc_funcs = NULL; 3591 adev->harvest_ip_mask = 0x0; 3592 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3593 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3594 3595 adev->smc_rreg = &amdgpu_invalid_rreg; 3596 adev->smc_wreg = &amdgpu_invalid_wreg; 3597 adev->pcie_rreg = &amdgpu_invalid_rreg; 3598 adev->pcie_wreg = &amdgpu_invalid_wreg; 3599 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 3600 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 3601 adev->pciep_rreg = &amdgpu_invalid_rreg; 3602 adev->pciep_wreg = &amdgpu_invalid_wreg; 3603 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3604 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3605 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3606 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3607 adev->didt_rreg = &amdgpu_invalid_rreg; 3608 adev->didt_wreg = &amdgpu_invalid_wreg; 3609 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3610 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3611 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3612 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3613 3614 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3615 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3616 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3617 3618 /* mutex initialization are all done here so we 3619 * can recall function without having locking issues */ 3620 mutex_init(&adev->firmware.mutex); 3621 mutex_init(&adev->pm.mutex); 3622 mutex_init(&adev->gfx.gpu_clock_mutex); 3623 mutex_init(&adev->srbm_mutex); 3624 mutex_init(&adev->gfx.pipe_reserve_mutex); 3625 mutex_init(&adev->gfx.gfx_off_mutex); 3626 mutex_init(&adev->gfx.partition_mutex); 3627 mutex_init(&adev->grbm_idx_mutex); 3628 mutex_init(&adev->mn_lock); 3629 mutex_init(&adev->virt.vf_errors.lock); 3630 hash_init(adev->mn_hash); 3631 mutex_init(&adev->psp.mutex); 3632 mutex_init(&adev->notifier_lock); 3633 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3634 mutex_init(&adev->benchmark_mutex); 3635 3636 amdgpu_device_init_apu_flags(adev); 3637 3638 r = amdgpu_device_check_arguments(adev); 3639 if (r) 3640 return r; 3641 3642 spin_lock_init(&adev->mmio_idx_lock); 3643 spin_lock_init(&adev->smc_idx_lock); 3644 spin_lock_init(&adev->pcie_idx_lock); 3645 spin_lock_init(&adev->uvd_ctx_idx_lock); 3646 spin_lock_init(&adev->didt_idx_lock); 3647 spin_lock_init(&adev->gc_cac_idx_lock); 3648 spin_lock_init(&adev->se_cac_idx_lock); 3649 spin_lock_init(&adev->audio_endpt_idx_lock); 3650 spin_lock_init(&adev->mm_stats.lock); 3651 3652 
	INIT_LIST_HEAD(&adev->shadow_list);
	mutex_init(&adev->shadow_list_lock);

	INIT_LIST_HEAD(&adev->reset_list);

	INIT_LIST_HEAD(&adev->ras_list);

	INIT_DELAYED_WORK(&adev->delayed_init_work,
			  amdgpu_device_delayed_init_work_handler);
	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
			  amdgpu_device_delay_enable_gfx_off);

	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);

	adev->gfx.gfx_off_req_count = 1;
	adev->gfx.gfx_off_residency = 0;
	adev->gfx.gfx_off_entrycount = 0;
	adev->pm.ac_power = power_supply_is_system_supplied() > 0;

	atomic_set(&adev->throttling_logging_enabled, 1);
	/*
	 * If throttling continues, logging will be performed every minute
	 * to avoid log flooding. "-1" is subtracted since the thermal
	 * throttling interrupt comes every second. Thus, the total logging
	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
	 * for the throttling interrupt) = 60 seconds.
	 */
	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);

	/* Registers mapping */
	/* TODO: block userspace mapping of io register */
	if (adev->asic_type >= CHIP_BONAIRE) {
		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
	} else {
		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
	}

	for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
		atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);

	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
	if (adev->rmmio == NULL) {
		return -ENOMEM;
	}
	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
	DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);

	/*
	 * The reset domain needs to be present early, before the XGMI hive is
	 * discovered (if any) and initialized, so that the reset semaphore and
	 * in-GPU-reset flag can be used early on during init and before
	 * calling RREG32.
3706 */ 3707 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3708 if (!adev->reset_domain) 3709 return -ENOMEM; 3710 3711 /* detect hw virtualization here */ 3712 amdgpu_detect_virtualization(adev); 3713 3714 amdgpu_device_get_pcie_info(adev); 3715 3716 r = amdgpu_device_get_job_timeout_settings(adev); 3717 if (r) { 3718 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3719 return r; 3720 } 3721 3722 /* early init functions */ 3723 r = amdgpu_device_ip_early_init(adev); 3724 if (r) 3725 return r; 3726 3727 amdgpu_device_set_mcbp(adev); 3728 3729 /* Get rid of things like offb */ 3730 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 3731 if (r) 3732 return r; 3733 3734 /* Enable TMZ based on IP_VERSION */ 3735 amdgpu_gmc_tmz_set(adev); 3736 3737 amdgpu_gmc_noretry_set(adev); 3738 /* Need to get xgmi info early to decide the reset behavior*/ 3739 if (adev->gmc.xgmi.supported) { 3740 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3741 if (r) 3742 return r; 3743 } 3744 3745 /* enable PCIE atomic ops */ 3746 if (amdgpu_sriov_vf(adev)) { 3747 if (adev->virt.fw_reserve.p_pf2vf) 3748 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3749 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3750 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3751 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a 3752 * internal path natively support atomics, set have_atomics_support to true. 3753 */ 3754 } else if ((adev->flags & AMD_IS_APU) && 3755 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) { 3756 adev->have_atomics_support = true; 3757 } else { 3758 adev->have_atomics_support = 3759 !pci_enable_atomic_ops_to_root(adev->pdev, 3760 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3761 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3762 } 3763 3764 if (!adev->have_atomics_support) 3765 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3766 3767 /* doorbell bar mapping and doorbell index init*/ 3768 amdgpu_doorbell_init(adev); 3769 3770 if (amdgpu_emu_mode == 1) { 3771 /* post the asic on emulation mode */ 3772 emu_soc_asic_init(adev); 3773 goto fence_driver_init; 3774 } 3775 3776 amdgpu_reset_init(adev); 3777 3778 /* detect if we are with an SRIOV vbios */ 3779 if (adev->bios) 3780 amdgpu_device_detect_sriov_bios(adev); 3781 3782 /* check if we need to reset the asic 3783 * E.g., driver was not cleanly unloaded previously, etc. 3784 */ 3785 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3786 if (adev->gmc.xgmi.num_physical_nodes) { 3787 dev_info(adev->dev, "Pending hive reset.\n"); 3788 adev->gmc.xgmi.pending_reset = true; 3789 /* Only need to init necessary block for SMU to handle the reset */ 3790 for (i = 0; i < adev->num_ip_blocks; i++) { 3791 if (!adev->ip_blocks[i].status.valid) 3792 continue; 3793 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3794 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3795 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3796 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3797 DRM_DEBUG("IP %s disabled for hw_init.\n", 3798 adev->ip_blocks[i].version->funcs->name); 3799 adev->ip_blocks[i].status.hw = true; 3800 } 3801 } 3802 } else { 3803 tmp = amdgpu_reset_method; 3804 /* It should do a default reset when loading or reloading the driver, 3805 * regardless of the module parameter reset_method. 
3806 */ 3807 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 3808 r = amdgpu_asic_reset(adev); 3809 amdgpu_reset_method = tmp; 3810 if (r) { 3811 dev_err(adev->dev, "asic reset on init failed\n"); 3812 goto failed; 3813 } 3814 } 3815 } 3816 3817 /* Post card if necessary */ 3818 if (amdgpu_device_need_post(adev)) { 3819 if (!adev->bios) { 3820 dev_err(adev->dev, "no vBIOS found\n"); 3821 r = -EINVAL; 3822 goto failed; 3823 } 3824 DRM_INFO("GPU posting now...\n"); 3825 r = amdgpu_device_asic_init(adev); 3826 if (r) { 3827 dev_err(adev->dev, "gpu post error!\n"); 3828 goto failed; 3829 } 3830 } 3831 3832 if (adev->bios) { 3833 if (adev->is_atom_fw) { 3834 /* Initialize clocks */ 3835 r = amdgpu_atomfirmware_get_clock_info(adev); 3836 if (r) { 3837 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3838 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3839 goto failed; 3840 } 3841 } else { 3842 /* Initialize clocks */ 3843 r = amdgpu_atombios_get_clock_info(adev); 3844 if (r) { 3845 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3846 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3847 goto failed; 3848 } 3849 /* init i2c buses */ 3850 if (!amdgpu_device_has_dc_support(adev)) 3851 amdgpu_atombios_i2c_init(adev); 3852 } 3853 } 3854 3855 fence_driver_init: 3856 /* Fence driver */ 3857 r = amdgpu_fence_driver_sw_init(adev); 3858 if (r) { 3859 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3860 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3861 goto failed; 3862 } 3863 3864 /* init the mode config */ 3865 drm_mode_config_init(adev_to_drm(adev)); 3866 3867 r = amdgpu_device_ip_init(adev); 3868 if (r) { 3869 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3870 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3871 goto release_ras_con; 3872 } 3873 3874 amdgpu_fence_driver_hw_init(adev); 3875 3876 dev_info(adev->dev, 3877 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3878 adev->gfx.config.max_shader_engines, 3879 adev->gfx.config.max_sh_per_se, 3880 adev->gfx.config.max_cu_per_sh, 3881 adev->gfx.cu_info.number); 3882 3883 adev->accel_working = true; 3884 3885 amdgpu_vm_check_compute_bug(adev); 3886 3887 /* Initialize the buffer migration limit. */ 3888 if (amdgpu_moverate >= 0) 3889 max_MBps = amdgpu_moverate; 3890 else 3891 max_MBps = 8; /* Allow 8 MB/s. */ 3892 /* Get a log2 for easy divisions. */ 3893 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3894 3895 r = amdgpu_atombios_sysfs_init(adev); 3896 if (r) 3897 drm_err(&adev->ddev, 3898 "registering atombios sysfs failed (%d).\n", r); 3899 3900 r = amdgpu_pm_sysfs_init(adev); 3901 if (r) 3902 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 3903 3904 r = amdgpu_ucode_sysfs_init(adev); 3905 if (r) { 3906 adev->ucode_sysfs_en = false; 3907 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3908 } else 3909 adev->ucode_sysfs_en = true; 3910 3911 /* 3912 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3913 * Otherwise the mgpu fan boost feature will be skipped due to the 3914 * gpu instance is counted less. 3915 */ 3916 amdgpu_register_gpu_instance(adev); 3917 3918 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3919 * explicit gating rather than handling it automatically. 
	 */
	if (!adev->gmc.xgmi.pending_reset) {
		r = amdgpu_device_ip_late_init(adev);
		if (r) {
			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
			goto release_ras_con;
		}
		/* must succeed. */
		amdgpu_ras_resume(adev);
		queue_delayed_work(system_wq, &adev->delayed_init_work,
				   msecs_to_jiffies(AMDGPU_RESUME_MS));
	}

	if (amdgpu_sriov_vf(adev)) {
		amdgpu_virt_release_full_gpu(adev, true);
		flush_delayed_work(&adev->delayed_init_work);
	}

	r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
	if (r)
		dev_err(adev->dev, "Could not create amdgpu device attr\n");

	if (IS_ENABLED(CONFIG_PERF_EVENTS)) {
		r = amdgpu_pmu_init(adev);
		if (r)
			dev_err(adev->dev, "amdgpu_pmu_init failed\n");
	}

	/* Have the stored PCI config space at hand to restore after a sudden PCI error */
	if (amdgpu_device_cache_pci_state(adev->pdev))
		pci_restore_state(pdev);

	/* if we have more than one VGA card, disable the amdgpu VGA resources */
	/* this will fail for cards that aren't VGA class devices, just
	 * ignore it
	 */
	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
		vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);

	px = amdgpu_device_supports_px(ddev);

	if (px || (!pci_is_thunderbolt_attached(adev->pdev) &&
		   apple_gmux_detect(NULL, NULL)))
		vga_switcheroo_register_client(adev->pdev,
					       &amdgpu_switcheroo_ops, px);

	if (px)
		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);

	if (adev->gmc.xgmi.pending_reset)
		queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
				   msecs_to_jiffies(AMDGPU_RESUME_MS));

	amdgpu_device_check_iommu_direct_map(adev);

	return 0;

release_ras_con:
	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_release_full_gpu(adev, true);

	/* failed in exclusive mode due to timeout */
	if (amdgpu_sriov_vf(adev) &&
	    !amdgpu_sriov_runtime(adev) &&
	    amdgpu_virt_mmio_blocked(adev) &&
	    !amdgpu_virt_wait_reset(adev)) {
		dev_err(adev->dev, "VF exclusive mode timeout\n");
		/* Don't send request since VF is inactive. */
		adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
		adev->virt.ops = NULL;
		r = -EAGAIN;
	}
	amdgpu_release_ras_context(adev);

failed:
	amdgpu_vf_error_trans_all(adev);

	return r;
}

static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
{
	/* Clear all CPU mappings pointing to this device */
	unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);

	/* Unmap all mapped BARs - doorbell, registers and VRAM */
	amdgpu_doorbell_fini(adev);

	iounmap(adev->rmmio);
	adev->rmmio = NULL;
	if (adev->mman.aper_base_kaddr)
		iounmap(adev->mman.aper_base_kaddr);
	adev->mman.aper_base_kaddr = NULL;

	/* Memory manager related */
	if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
		arch_phys_wc_del(adev->gmc.vram_mtrr);
		arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
	}
}

/**
 * amdgpu_device_fini_hw - tear down the driver
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down the driver info (all asics).
 * Called at driver shutdown.
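 * Only the hardware-facing teardown is done here; the remaining software
 * state is released later in amdgpu_device_fini_sw().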
4028 */ 4029 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4030 { 4031 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4032 flush_delayed_work(&adev->delayed_init_work); 4033 adev->shutdown = true; 4034 4035 /* make sure IB test finished before entering exclusive mode 4036 * to avoid preemption on IB test 4037 * */ 4038 if (amdgpu_sriov_vf(adev)) { 4039 amdgpu_virt_request_full_gpu(adev, false); 4040 amdgpu_virt_fini_data_exchange(adev); 4041 } 4042 4043 /* disable all interrupts */ 4044 amdgpu_irq_disable_all(adev); 4045 if (adev->mode_info.mode_config_initialized) { 4046 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4047 drm_helper_force_disable_all(adev_to_drm(adev)); 4048 else 4049 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4050 } 4051 amdgpu_fence_driver_hw_fini(adev); 4052 4053 if (adev->mman.initialized) 4054 drain_workqueue(adev->mman.bdev.wq); 4055 4056 if (adev->pm.sysfs_initialized) 4057 amdgpu_pm_sysfs_fini(adev); 4058 if (adev->ucode_sysfs_en) 4059 amdgpu_ucode_sysfs_fini(adev); 4060 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4061 4062 /* disable ras feature must before hw fini */ 4063 amdgpu_ras_pre_fini(adev); 4064 4065 amdgpu_device_ip_fini_early(adev); 4066 4067 amdgpu_irq_fini_hw(adev); 4068 4069 if (adev->mman.initialized) 4070 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4071 4072 amdgpu_gart_dummy_page_fini(adev); 4073 4074 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4075 amdgpu_device_unmap_mmio(adev); 4076 4077 } 4078 4079 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4080 { 4081 int idx; 4082 bool px; 4083 4084 amdgpu_fence_driver_sw_fini(adev); 4085 amdgpu_device_ip_fini(adev); 4086 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4087 adev->accel_working = false; 4088 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4089 4090 amdgpu_reset_fini(adev); 4091 4092 /* free i2c buses */ 4093 if (!amdgpu_device_has_dc_support(adev)) 4094 amdgpu_i2c_fini(adev); 4095 4096 if (amdgpu_emu_mode != 1) 4097 amdgpu_atombios_fini(adev); 4098 4099 kfree(adev->bios); 4100 adev->bios = NULL; 4101 4102 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4103 4104 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 4105 apple_gmux_detect(NULL, NULL))) 4106 vga_switcheroo_unregister_client(adev->pdev); 4107 4108 if (px) 4109 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4110 4111 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4112 vga_client_unregister(adev->pdev); 4113 4114 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4115 4116 iounmap(adev->rmmio); 4117 adev->rmmio = NULL; 4118 amdgpu_doorbell_fini(adev); 4119 drm_dev_exit(idx); 4120 } 4121 4122 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4123 amdgpu_pmu_fini(adev); 4124 if (adev->mman.discovery_bin) 4125 amdgpu_discovery_fini(adev); 4126 4127 amdgpu_reset_put_reset_domain(adev->reset_domain); 4128 adev->reset_domain = NULL; 4129 4130 kfree(adev->pci_state); 4131 4132 } 4133 4134 /** 4135 * amdgpu_device_evict_resources - evict device resources 4136 * @adev: amdgpu device object 4137 * 4138 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4139 * of the vram memory type. Mainly used for evicting device resources 4140 * at suspend time. 
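 * For APUs entering S3 or S0ix the eviction is skipped, see the check at the
 * top of the function.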
4141 * 4142 */ 4143 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4144 { 4145 int ret; 4146 4147 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4148 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4149 return 0; 4150 4151 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4152 if (ret) 4153 DRM_WARN("evicting device resources failed\n"); 4154 return ret; 4155 } 4156 4157 /* 4158 * Suspend & resume. 4159 */ 4160 /** 4161 * amdgpu_device_suspend - initiate device suspend 4162 * 4163 * @dev: drm dev pointer 4164 * @fbcon : notify the fbdev of suspend 4165 * 4166 * Puts the hw in the suspend state (all asics). 4167 * Returns 0 for success or an error on failure. 4168 * Called at driver suspend. 4169 */ 4170 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4171 { 4172 struct amdgpu_device *adev = drm_to_adev(dev); 4173 int r = 0; 4174 4175 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4176 return 0; 4177 4178 adev->in_suspend = true; 4179 4180 /* Evict the majority of BOs before grabbing the full access */ 4181 r = amdgpu_device_evict_resources(adev); 4182 if (r) 4183 return r; 4184 4185 if (amdgpu_sriov_vf(adev)) { 4186 amdgpu_virt_fini_data_exchange(adev); 4187 r = amdgpu_virt_request_full_gpu(adev, false); 4188 if (r) 4189 return r; 4190 } 4191 4192 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4193 DRM_WARN("smart shift update failed\n"); 4194 4195 if (fbcon) 4196 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4197 4198 cancel_delayed_work_sync(&adev->delayed_init_work); 4199 4200 amdgpu_ras_suspend(adev); 4201 4202 amdgpu_device_ip_suspend_phase1(adev); 4203 4204 if (!adev->in_s0ix) 4205 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4206 4207 r = amdgpu_device_evict_resources(adev); 4208 if (r) 4209 return r; 4210 4211 amdgpu_fence_driver_hw_fini(adev); 4212 4213 amdgpu_device_ip_suspend_phase2(adev); 4214 4215 if (amdgpu_sriov_vf(adev)) 4216 amdgpu_virt_release_full_gpu(adev, false); 4217 4218 return 0; 4219 } 4220 4221 /** 4222 * amdgpu_device_resume - initiate device resume 4223 * 4224 * @dev: drm dev pointer 4225 * @fbcon : notify the fbdev of resume 4226 * 4227 * Bring the hw back to operating state (all asics). 4228 * Returns 0 for success or an error on failure. 4229 * Called at driver resume. 
4230 */ 4231 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4232 { 4233 struct amdgpu_device *adev = drm_to_adev(dev); 4234 int r = 0; 4235 4236 if (amdgpu_sriov_vf(adev)) { 4237 r = amdgpu_virt_request_full_gpu(adev, true); 4238 if (r) 4239 return r; 4240 } 4241 4242 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4243 return 0; 4244 4245 if (adev->in_s0ix) 4246 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4247 4248 /* post card */ 4249 if (amdgpu_device_need_post(adev)) { 4250 r = amdgpu_device_asic_init(adev); 4251 if (r) 4252 dev_err(adev->dev, "amdgpu asic init failed\n"); 4253 } 4254 4255 r = amdgpu_device_ip_resume(adev); 4256 4257 if (r) { 4258 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4259 goto exit; 4260 } 4261 amdgpu_fence_driver_hw_init(adev); 4262 4263 r = amdgpu_device_ip_late_init(adev); 4264 if (r) 4265 goto exit; 4266 4267 queue_delayed_work(system_wq, &adev->delayed_init_work, 4268 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4269 4270 if (!adev->in_s0ix) { 4271 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4272 if (r) 4273 goto exit; 4274 } 4275 4276 exit: 4277 if (amdgpu_sriov_vf(adev)) { 4278 amdgpu_virt_init_data_exchange(adev); 4279 amdgpu_virt_release_full_gpu(adev, true); 4280 } 4281 4282 if (r) 4283 return r; 4284 4285 /* Make sure IB tests flushed */ 4286 flush_delayed_work(&adev->delayed_init_work); 4287 4288 if (fbcon) 4289 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4290 4291 amdgpu_ras_resume(adev); 4292 4293 if (adev->mode_info.num_crtc) { 4294 /* 4295 * Most of the connector probing functions try to acquire runtime pm 4296 * refs to ensure that the GPU is powered on when connector polling is 4297 * performed. Since we're calling this from a runtime PM callback, 4298 * trying to acquire rpm refs will cause us to deadlock. 4299 * 4300 * Since we're guaranteed to be holding the rpm lock, it's safe to 4301 * temporarily disable the rpm helpers so this doesn't deadlock us. 4302 */ 4303 #ifdef CONFIG_PM 4304 dev->dev->power.disable_depth++; 4305 #endif 4306 if (!adev->dc_enabled) 4307 drm_helper_hpd_irq_event(dev); 4308 else 4309 drm_kms_helper_hotplug_event(dev); 4310 #ifdef CONFIG_PM 4311 dev->dev->power.disable_depth--; 4312 #endif 4313 } 4314 adev->in_suspend = false; 4315 4316 if (adev->enable_mes) 4317 amdgpu_mes_self_test(adev); 4318 4319 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4320 DRM_WARN("smart shift update failed\n"); 4321 4322 return 0; 4323 } 4324 4325 /** 4326 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4327 * 4328 * @adev: amdgpu_device pointer 4329 * 4330 * The list of all the hardware IPs that make up the asic is walked and 4331 * the check_soft_reset callbacks are run. check_soft_reset determines 4332 * if the asic is still hung or not. 4333 * Returns true if any of the IPs are still in a hung state, false if not. 
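 * Under SR-IOV, or when the ASIC reports that it needs a full reset, this
 * returns true without consulting the per-IP callbacks.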
4334 */ 4335 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4336 { 4337 int i; 4338 bool asic_hang = false; 4339 4340 if (amdgpu_sriov_vf(adev)) 4341 return true; 4342 4343 if (amdgpu_asic_need_full_reset(adev)) 4344 return true; 4345 4346 for (i = 0; i < adev->num_ip_blocks; i++) { 4347 if (!adev->ip_blocks[i].status.valid) 4348 continue; 4349 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4350 adev->ip_blocks[i].status.hang = 4351 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4352 if (adev->ip_blocks[i].status.hang) { 4353 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4354 asic_hang = true; 4355 } 4356 } 4357 return asic_hang; 4358 } 4359 4360 /** 4361 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4362 * 4363 * @adev: amdgpu_device pointer 4364 * 4365 * The list of all the hardware IPs that make up the asic is walked and the 4366 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4367 * handles any IP specific hardware or software state changes that are 4368 * necessary for a soft reset to succeed. 4369 * Returns 0 on success, negative error code on failure. 4370 */ 4371 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4372 { 4373 int i, r = 0; 4374 4375 for (i = 0; i < adev->num_ip_blocks; i++) { 4376 if (!adev->ip_blocks[i].status.valid) 4377 continue; 4378 if (adev->ip_blocks[i].status.hang && 4379 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4380 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4381 if (r) 4382 return r; 4383 } 4384 } 4385 4386 return 0; 4387 } 4388 4389 /** 4390 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4391 * 4392 * @adev: amdgpu_device pointer 4393 * 4394 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4395 * reset is necessary to recover. 4396 * Returns true if a full asic reset is required, false if not. 4397 */ 4398 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4399 { 4400 int i; 4401 4402 if (amdgpu_asic_need_full_reset(adev)) 4403 return true; 4404 4405 for (i = 0; i < adev->num_ip_blocks; i++) { 4406 if (!adev->ip_blocks[i].status.valid) 4407 continue; 4408 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4409 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4410 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4411 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4412 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4413 if (adev->ip_blocks[i].status.hang) { 4414 dev_info(adev->dev, "Some block need full reset!\n"); 4415 return true; 4416 } 4417 } 4418 } 4419 return false; 4420 } 4421 4422 /** 4423 * amdgpu_device_ip_soft_reset - do a soft reset 4424 * 4425 * @adev: amdgpu_device pointer 4426 * 4427 * The list of all the hardware IPs that make up the asic is walked and the 4428 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4429 * IP specific hardware or software state changes that are necessary to soft 4430 * reset the IP. 4431 * Returns 0 on success, negative error code on failure. 
4432 */ 4433 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4434 { 4435 int i, r = 0; 4436 4437 for (i = 0; i < adev->num_ip_blocks; i++) { 4438 if (!adev->ip_blocks[i].status.valid) 4439 continue; 4440 if (adev->ip_blocks[i].status.hang && 4441 adev->ip_blocks[i].version->funcs->soft_reset) { 4442 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4443 if (r) 4444 return r; 4445 } 4446 } 4447 4448 return 0; 4449 } 4450 4451 /** 4452 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4453 * 4454 * @adev: amdgpu_device pointer 4455 * 4456 * The list of all the hardware IPs that make up the asic is walked and the 4457 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4458 * handles any IP specific hardware or software state changes that are 4459 * necessary after the IP has been soft reset. 4460 * Returns 0 on success, negative error code on failure. 4461 */ 4462 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4463 { 4464 int i, r = 0; 4465 4466 for (i = 0; i < adev->num_ip_blocks; i++) { 4467 if (!adev->ip_blocks[i].status.valid) 4468 continue; 4469 if (adev->ip_blocks[i].status.hang && 4470 adev->ip_blocks[i].version->funcs->post_soft_reset) 4471 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4472 if (r) 4473 return r; 4474 } 4475 4476 return 0; 4477 } 4478 4479 /** 4480 * amdgpu_device_recover_vram - Recover some VRAM contents 4481 * 4482 * @adev: amdgpu_device pointer 4483 * 4484 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4485 * restore things like GPUVM page tables after a GPU reset where 4486 * the contents of VRAM might be lost. 4487 * 4488 * Returns: 4489 * 0 on success, negative error code on failure. 4490 */ 4491 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4492 { 4493 struct dma_fence *fence = NULL, *next = NULL; 4494 struct amdgpu_bo *shadow; 4495 struct amdgpu_bo_vm *vmbo; 4496 long r = 1, tmo; 4497 4498 if (amdgpu_sriov_runtime(adev)) 4499 tmo = msecs_to_jiffies(8000); 4500 else 4501 tmo = msecs_to_jiffies(100); 4502 4503 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4504 mutex_lock(&adev->shadow_list_lock); 4505 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4506 /* If vm is compute context or adev is APU, shadow will be NULL */ 4507 if (!vmbo->shadow) 4508 continue; 4509 shadow = vmbo->shadow; 4510 4511 /* No need to recover an evicted BO */ 4512 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4513 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4514 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4515 continue; 4516 4517 r = amdgpu_bo_restore_shadow(shadow, &next); 4518 if (r) 4519 break; 4520 4521 if (fence) { 4522 tmo = dma_fence_wait_timeout(fence, false, tmo); 4523 dma_fence_put(fence); 4524 fence = next; 4525 if (tmo == 0) { 4526 r = -ETIMEDOUT; 4527 break; 4528 } else if (tmo < 0) { 4529 r = tmo; 4530 break; 4531 } 4532 } else { 4533 fence = next; 4534 } 4535 } 4536 mutex_unlock(&adev->shadow_list_lock); 4537 4538 if (fence) 4539 tmo = dma_fence_wait_timeout(fence, false, tmo); 4540 dma_fence_put(fence); 4541 4542 if (r < 0 || tmo <= 0) { 4543 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4544 return -EIO; 4545 } 4546 4547 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4548 return 0; 4549 } 4550 4551 4552 /** 4553 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4554 * 4555 * @adev: amdgpu_device pointer 4556 * 
@from_hypervisor: request from hypervisor 4557 * 4558 * do VF FLR and reinitialize Asic 4559 * return 0 means succeeded otherwise failed 4560 */ 4561 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4562 bool from_hypervisor) 4563 { 4564 int r; 4565 struct amdgpu_hive_info *hive = NULL; 4566 int retry_limit = 0; 4567 4568 retry: 4569 amdgpu_amdkfd_pre_reset(adev); 4570 4571 if (from_hypervisor) 4572 r = amdgpu_virt_request_full_gpu(adev, true); 4573 else 4574 r = amdgpu_virt_reset_gpu(adev); 4575 if (r) 4576 return r; 4577 4578 /* some sw clean up VF needs to do before recover */ 4579 amdgpu_virt_post_reset(adev); 4580 4581 /* Resume IP prior to SMC */ 4582 r = amdgpu_device_ip_reinit_early_sriov(adev); 4583 if (r) 4584 goto error; 4585 4586 amdgpu_virt_init_data_exchange(adev); 4587 4588 r = amdgpu_device_fw_loading(adev); 4589 if (r) 4590 return r; 4591 4592 /* now we are okay to resume SMC/CP/SDMA */ 4593 r = amdgpu_device_ip_reinit_late_sriov(adev); 4594 if (r) 4595 goto error; 4596 4597 hive = amdgpu_get_xgmi_hive(adev); 4598 /* Update PSP FW topology after reset */ 4599 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4600 r = amdgpu_xgmi_update_topology(hive, adev); 4601 4602 if (hive) 4603 amdgpu_put_xgmi_hive(hive); 4604 4605 if (!r) { 4606 amdgpu_irq_gpu_reset_resume_helper(adev); 4607 r = amdgpu_ib_ring_tests(adev); 4608 4609 amdgpu_amdkfd_post_reset(adev); 4610 } 4611 4612 error: 4613 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4614 amdgpu_inc_vram_lost(adev); 4615 r = amdgpu_device_recover_vram(adev); 4616 } 4617 amdgpu_virt_release_full_gpu(adev, true); 4618 4619 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4620 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4621 retry_limit++; 4622 goto retry; 4623 } else 4624 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4625 } 4626 4627 return r; 4628 } 4629 4630 /** 4631 * amdgpu_device_has_job_running - check if there is any job in mirror list 4632 * 4633 * @adev: amdgpu_device pointer 4634 * 4635 * check if there is any job in mirror list 4636 */ 4637 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4638 { 4639 int i; 4640 struct drm_sched_job *job; 4641 4642 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4643 struct amdgpu_ring *ring = adev->rings[i]; 4644 4645 if (!ring || !ring->sched.thread) 4646 continue; 4647 4648 spin_lock(&ring->sched.job_list_lock); 4649 job = list_first_entry_or_null(&ring->sched.pending_list, 4650 struct drm_sched_job, list); 4651 spin_unlock(&ring->sched.job_list_lock); 4652 if (job) 4653 return true; 4654 } 4655 return false; 4656 } 4657 4658 /** 4659 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4660 * 4661 * @adev: amdgpu_device pointer 4662 * 4663 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4664 * a hung GPU. 
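 * (amdgpu_gpu_recovery module parameter: 0 disables recovery; -1, the default,
 * leaves the decision to the driver, which only opts out a few legacy ASICs;
 * any other value enables it.)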
4665 */ 4666 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4667 { 4668 4669 if (amdgpu_gpu_recovery == 0) 4670 goto disabled; 4671 4672 /* Skip soft reset check in fatal error mode */ 4673 if (!amdgpu_ras_is_poison_mode_supported(adev)) 4674 return true; 4675 4676 if (amdgpu_sriov_vf(adev)) 4677 return true; 4678 4679 if (amdgpu_gpu_recovery == -1) { 4680 switch (adev->asic_type) { 4681 #ifdef CONFIG_DRM_AMDGPU_SI 4682 case CHIP_VERDE: 4683 case CHIP_TAHITI: 4684 case CHIP_PITCAIRN: 4685 case CHIP_OLAND: 4686 case CHIP_HAINAN: 4687 #endif 4688 #ifdef CONFIG_DRM_AMDGPU_CIK 4689 case CHIP_KAVERI: 4690 case CHIP_KABINI: 4691 case CHIP_MULLINS: 4692 #endif 4693 case CHIP_CARRIZO: 4694 case CHIP_STONEY: 4695 case CHIP_CYAN_SKILLFISH: 4696 goto disabled; 4697 default: 4698 break; 4699 } 4700 } 4701 4702 return true; 4703 4704 disabled: 4705 dev_info(adev->dev, "GPU recovery disabled.\n"); 4706 return false; 4707 } 4708 4709 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4710 { 4711 u32 i; 4712 int ret = 0; 4713 4714 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4715 4716 dev_info(adev->dev, "GPU mode1 reset\n"); 4717 4718 /* disable BM */ 4719 pci_clear_master(adev->pdev); 4720 4721 amdgpu_device_cache_pci_state(adev->pdev); 4722 4723 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4724 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4725 ret = amdgpu_dpm_mode1_reset(adev); 4726 } else { 4727 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4728 ret = psp_gpu_reset(adev); 4729 } 4730 4731 if (ret) 4732 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4733 4734 amdgpu_device_load_pci_state(adev->pdev); 4735 4736 /* wait for asic to come out of reset */ 4737 for (i = 0; i < adev->usec_timeout; i++) { 4738 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4739 4740 if (memsize != 0xffffffff) 4741 break; 4742 udelay(1); 4743 } 4744 4745 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4746 return ret; 4747 } 4748 4749 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4750 struct amdgpu_reset_context *reset_context) 4751 { 4752 int i, r = 0; 4753 struct amdgpu_job *job = NULL; 4754 bool need_full_reset = 4755 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4756 4757 if (reset_context->reset_req_dev == adev) 4758 job = reset_context->job; 4759 4760 if (amdgpu_sriov_vf(adev)) { 4761 /* stop the data exchange thread */ 4762 amdgpu_virt_fini_data_exchange(adev); 4763 } 4764 4765 amdgpu_fence_driver_isr_toggle(adev, true); 4766 4767 /* block all schedulers and reset given job's ring */ 4768 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4769 struct amdgpu_ring *ring = adev->rings[i]; 4770 4771 if (!ring || !ring->sched.thread) 4772 continue; 4773 4774 /*clear job fence from fence drv to avoid force_completion 4775 *leave NULL and vm flush fence in fence drv */ 4776 amdgpu_fence_driver_clear_job_fences(ring); 4777 4778 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4779 amdgpu_fence_driver_force_completion(ring); 4780 } 4781 4782 amdgpu_fence_driver_isr_toggle(adev, false); 4783 4784 if (job && job->vm) 4785 drm_sched_increase_karma(&job->base); 4786 4787 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4788 /* If reset handler not implemented, continue; otherwise return */ 4789 if (r == -ENOSYS) 4790 r = 0; 4791 else 4792 return r; 4793 4794 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4795 if (!amdgpu_sriov_vf(adev)) { 4796 4797 if (!need_full_reset) 4798 need_full_reset = 
amdgpu_device_ip_need_full_reset(adev); 4799 4800 if (!need_full_reset && amdgpu_gpu_recovery && 4801 amdgpu_device_ip_check_soft_reset(adev)) { 4802 amdgpu_device_ip_pre_soft_reset(adev); 4803 r = amdgpu_device_ip_soft_reset(adev); 4804 amdgpu_device_ip_post_soft_reset(adev); 4805 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4806 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4807 need_full_reset = true; 4808 } 4809 } 4810 4811 if (need_full_reset) 4812 r = amdgpu_device_ip_suspend(adev); 4813 if (need_full_reset) 4814 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4815 else 4816 clear_bit(AMDGPU_NEED_FULL_RESET, 4817 &reset_context->flags); 4818 } 4819 4820 return r; 4821 } 4822 4823 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 4824 { 4825 int i; 4826 4827 lockdep_assert_held(&adev->reset_domain->sem); 4828 4829 for (i = 0; i < adev->num_regs; i++) { 4830 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); 4831 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], 4832 adev->reset_dump_reg_value[i]); 4833 } 4834 4835 return 0; 4836 } 4837 4838 #ifdef CONFIG_DEV_COREDUMP 4839 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, 4840 size_t count, void *data, size_t datalen) 4841 { 4842 struct drm_printer p; 4843 struct amdgpu_device *adev = data; 4844 struct drm_print_iterator iter; 4845 int i; 4846 4847 iter.data = buffer; 4848 iter.offset = 0; 4849 iter.start = offset; 4850 iter.remain = count; 4851 4852 p = drm_coredump_printer(&iter); 4853 4854 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 4855 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 4856 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 4857 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec); 4858 if (adev->reset_task_info.pid) 4859 drm_printf(&p, "process_name: %s PID: %d\n", 4860 adev->reset_task_info.process_name, 4861 adev->reset_task_info.pid); 4862 4863 if (adev->reset_vram_lost) 4864 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 4865 if (adev->num_regs) { 4866 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); 4867 4868 for (i = 0; i < adev->num_regs; i++) 4869 drm_printf(&p, "0x%08x: 0x%08x\n", 4870 adev->reset_dump_reg_list[i], 4871 adev->reset_dump_reg_value[i]); 4872 } 4873 4874 return count - iter.remain; 4875 } 4876 4877 static void amdgpu_devcoredump_free(void *data) 4878 { 4879 } 4880 4881 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) 4882 { 4883 struct drm_device *dev = adev_to_drm(adev); 4884 4885 ktime_get_ts64(&adev->reset_time); 4886 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL, 4887 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 4888 } 4889 #endif 4890 4891 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4892 struct amdgpu_reset_context *reset_context) 4893 { 4894 struct amdgpu_device *tmp_adev = NULL; 4895 bool need_full_reset, skip_hw_reset, vram_lost = false; 4896 int r = 0; 4897 bool gpu_reset_for_dev_remove = 0; 4898 4899 /* Try reset handler method first */ 4900 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4901 reset_list); 4902 amdgpu_reset_reg_dumps(tmp_adev); 4903 4904 reset_context->reset_device_list = device_list_handle; 4905 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4906 /* If reset handler not implemented, continue; otherwise return */ 4907 if (r == -ENOSYS) 4908 r = 0; 4909 else 4910 return r; 4911 4912 /* Reset handler not implemented, use the 
default method */ 4913 need_full_reset = 4914 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4915 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4916 4917 gpu_reset_for_dev_remove = 4918 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 4919 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4920 4921 /* 4922 * ASIC reset has to be done on all XGMI hive nodes ASAP 4923 * to allow proper links negotiation in FW (within 1 sec) 4924 */ 4925 if (!skip_hw_reset && need_full_reset) { 4926 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4927 /* For XGMI run all resets in parallel to speed up the process */ 4928 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4929 tmp_adev->gmc.xgmi.pending_reset = false; 4930 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4931 r = -EALREADY; 4932 } else 4933 r = amdgpu_asic_reset(tmp_adev); 4934 4935 if (r) { 4936 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4937 r, adev_to_drm(tmp_adev)->unique); 4938 break; 4939 } 4940 } 4941 4942 /* For XGMI wait for all resets to complete before proceed */ 4943 if (!r) { 4944 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4945 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4946 flush_work(&tmp_adev->xgmi_reset_work); 4947 r = tmp_adev->asic_reset_res; 4948 if (r) 4949 break; 4950 } 4951 } 4952 } 4953 } 4954 4955 if (!r && amdgpu_ras_intr_triggered()) { 4956 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4957 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && 4958 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 4959 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 4960 } 4961 4962 amdgpu_ras_intr_cleared(); 4963 } 4964 4965 /* Since the mode1 reset affects base ip blocks, the 4966 * phase1 ip blocks need to be resumed. Otherwise there 4967 * will be a BIOS signature error and the psp bootloader 4968 * can't load kdb on the next amdgpu install. 
	 */
	if (gpu_reset_for_dev_remove) {
		list_for_each_entry(tmp_adev, device_list_handle, reset_list)
			amdgpu_device_ip_resume_phase1(tmp_adev);

		goto end;
	}

	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		if (need_full_reset) {
			/* post card */
			r = amdgpu_device_asic_init(tmp_adev);
			if (r) {
				dev_warn(tmp_adev->dev, "asic atom init failed!");
			} else {
				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
				r = amdgpu_amdkfd_resume_iommu(tmp_adev);
				if (r)
					goto out;

				r = amdgpu_device_ip_resume_phase1(tmp_adev);
				if (r)
					goto out;

				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
#ifdef CONFIG_DEV_COREDUMP
				tmp_adev->reset_vram_lost = vram_lost;
				memset(&tmp_adev->reset_task_info, 0,
				       sizeof(tmp_adev->reset_task_info));
				if (reset_context->job && reset_context->job->vm)
					tmp_adev->reset_task_info =
						reset_context->job->vm->task_info;
				amdgpu_reset_capture_coredumpm(tmp_adev);
#endif
				if (vram_lost) {
					DRM_INFO("VRAM is lost due to GPU reset!\n");
					amdgpu_inc_vram_lost(tmp_adev);
				}

				r = amdgpu_device_fw_loading(tmp_adev);
				if (r)
					return r;

				r = amdgpu_device_ip_resume_phase2(tmp_adev);
				if (r)
					goto out;

				if (vram_lost)
					amdgpu_device_fill_reset_magic(tmp_adev);

				/*
				 * Add this ASIC back as tracked, since the reset
				 * has already completed successfully.
				 */
				amdgpu_register_gpu_instance(tmp_adev);

				if (!reset_context->hive &&
				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
					amdgpu_xgmi_add_device(tmp_adev);

				r = amdgpu_device_ip_late_init(tmp_adev);
				if (r)
					goto out;

				drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);

				/*
				 * The GPU enters a bad state once the number of
				 * faulty pages retired by ECC reaches the
				 * threshold, and RAS recovery is scheduled next.
				 * Check the bad page threshold here and abort
				 * recovery if it has indeed been exceeded,
				 * reminding the user to either retire this GPU
				 * or set a bigger bad_page_threshold value when
				 * probing the driver again.
				 */
				if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
					/* must succeed.
*/ 5047 amdgpu_ras_resume(tmp_adev); 5048 } else { 5049 r = -EINVAL; 5050 goto out; 5051 } 5052 5053 /* Update PSP FW topology after reset */ 5054 if (reset_context->hive && 5055 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5056 r = amdgpu_xgmi_update_topology( 5057 reset_context->hive, tmp_adev); 5058 } 5059 } 5060 5061 out: 5062 if (!r) { 5063 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5064 r = amdgpu_ib_ring_tests(tmp_adev); 5065 if (r) { 5066 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5067 need_full_reset = true; 5068 r = -EAGAIN; 5069 goto end; 5070 } 5071 } 5072 5073 if (!r) 5074 r = amdgpu_device_recover_vram(tmp_adev); 5075 else 5076 tmp_adev->asic_reset_res = r; 5077 } 5078 5079 end: 5080 if (need_full_reset) 5081 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5082 else 5083 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5084 return r; 5085 } 5086 5087 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5088 { 5089 5090 switch (amdgpu_asic_reset_method(adev)) { 5091 case AMD_RESET_METHOD_MODE1: 5092 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5093 break; 5094 case AMD_RESET_METHOD_MODE2: 5095 adev->mp1_state = PP_MP1_STATE_RESET; 5096 break; 5097 default: 5098 adev->mp1_state = PP_MP1_STATE_NONE; 5099 break; 5100 } 5101 } 5102 5103 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5104 { 5105 amdgpu_vf_error_trans_all(adev); 5106 adev->mp1_state = PP_MP1_STATE_NONE; 5107 } 5108 5109 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5110 { 5111 struct pci_dev *p = NULL; 5112 5113 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5114 adev->pdev->bus->number, 1); 5115 if (p) { 5116 pm_runtime_enable(&(p->dev)); 5117 pm_runtime_resume(&(p->dev)); 5118 } 5119 5120 pci_dev_put(p); 5121 } 5122 5123 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5124 { 5125 enum amd_reset_method reset_method; 5126 struct pci_dev *p = NULL; 5127 u64 expires; 5128 5129 /* 5130 * For now, only BACO and mode1 reset are confirmed 5131 * to suffer the audio issue without proper suspended. 5132 */ 5133 reset_method = amdgpu_asic_reset_method(adev); 5134 if ((reset_method != AMD_RESET_METHOD_BACO) && 5135 (reset_method != AMD_RESET_METHOD_MODE1)) 5136 return -EINVAL; 5137 5138 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5139 adev->pdev->bus->number, 1); 5140 if (!p) 5141 return -ENODEV; 5142 5143 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5144 if (!expires) 5145 /* 5146 * If we cannot get the audio device autosuspend delay, 5147 * a fixed 4S interval will be used. Considering 3S is 5148 * the audio controller default autosuspend delay setting. 5149 * 4S used here is guaranteed to cover that. 5150 */ 5151 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5152 5153 while (!pm_runtime_status_suspended(&(p->dev))) { 5154 if (!pm_runtime_suspend(&(p->dev))) 5155 break; 5156 5157 if (expires < ktime_get_mono_fast_ns()) { 5158 dev_warn(adev->dev, "failed to suspend display audio\n"); 5159 pci_dev_put(p); 5160 /* TODO: abort the succeeding gpu reset? 
*/ 5161 return -ETIMEDOUT; 5162 } 5163 } 5164 5165 pm_runtime_disable(&(p->dev)); 5166 5167 pci_dev_put(p); 5168 return 0; 5169 } 5170 5171 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5172 { 5173 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5174 5175 #if defined(CONFIG_DEBUG_FS) 5176 if (!amdgpu_sriov_vf(adev)) 5177 cancel_work(&adev->reset_work); 5178 #endif 5179 5180 if (adev->kfd.dev) 5181 cancel_work(&adev->kfd.reset_work); 5182 5183 if (amdgpu_sriov_vf(adev)) 5184 cancel_work(&adev->virt.flr_work); 5185 5186 if (con && adev->ras_enabled) 5187 cancel_work(&con->recovery_work); 5188 5189 } 5190 5191 /** 5192 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5193 * 5194 * @adev: amdgpu_device pointer 5195 * @job: which job trigger hang 5196 * @reset_context: amdgpu reset context pointer 5197 * 5198 * Attempt to reset the GPU if it has hung (all asics). 5199 * Attempt to do soft-reset or full-reset and reinitialize Asic 5200 * Returns 0 for success or an error on failure. 5201 */ 5202 5203 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5204 struct amdgpu_job *job, 5205 struct amdgpu_reset_context *reset_context) 5206 { 5207 struct list_head device_list, *device_list_handle = NULL; 5208 bool job_signaled = false; 5209 struct amdgpu_hive_info *hive = NULL; 5210 struct amdgpu_device *tmp_adev = NULL; 5211 int i, r = 0; 5212 bool need_emergency_restart = false; 5213 bool audio_suspended = false; 5214 bool gpu_reset_for_dev_remove = false; 5215 5216 gpu_reset_for_dev_remove = 5217 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5218 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5219 5220 /* 5221 * Special case: RAS triggered and full reset isn't supported 5222 */ 5223 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5224 5225 /* 5226 * Flush RAM to disk so that after reboot 5227 * the user can read log and see why the system rebooted. 5228 */ 5229 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5230 DRM_WARN("Emergency reboot."); 5231 5232 ksys_sync_helper(); 5233 emergency_restart(); 5234 } 5235 5236 dev_info(adev->dev, "GPU %s begin!\n", 5237 need_emergency_restart ? "jobs stop":"reset"); 5238 5239 if (!amdgpu_sriov_vf(adev)) 5240 hive = amdgpu_get_xgmi_hive(adev); 5241 if (hive) 5242 mutex_lock(&hive->hive_lock); 5243 5244 reset_context->job = job; 5245 reset_context->hive = hive; 5246 /* 5247 * Build list of devices to reset. 5248 * In case we are in XGMI hive mode, resort the device list 5249 * to put adev in the 1st position. 
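 * The first entry of the resulting list is also used below to lock the shared
 * reset domain once for all devices.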
5250 */ 5251 INIT_LIST_HEAD(&device_list); 5252 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5253 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5254 list_add_tail(&tmp_adev->reset_list, &device_list); 5255 if (gpu_reset_for_dev_remove && adev->shutdown) 5256 tmp_adev->shutdown = true; 5257 } 5258 if (!list_is_first(&adev->reset_list, &device_list)) 5259 list_rotate_to_front(&adev->reset_list, &device_list); 5260 device_list_handle = &device_list; 5261 } else { 5262 list_add_tail(&adev->reset_list, &device_list); 5263 device_list_handle = &device_list; 5264 } 5265 5266 /* We need to lock reset domain only once both for XGMI and single device */ 5267 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5268 reset_list); 5269 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5270 5271 /* block all schedulers and reset given job's ring */ 5272 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5273 5274 amdgpu_device_set_mp1_state(tmp_adev); 5275 5276 /* 5277 * Try to put the audio codec into suspend state 5278 * before gpu reset started. 5279 * 5280 * Due to the power domain of the graphics device 5281 * is shared with AZ power domain. Without this, 5282 * we may change the audio hardware from behind 5283 * the audio driver's back. That will trigger 5284 * some audio codec errors. 5285 */ 5286 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5287 audio_suspended = true; 5288 5289 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5290 5291 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5292 5293 if (!amdgpu_sriov_vf(tmp_adev)) 5294 amdgpu_amdkfd_pre_reset(tmp_adev); 5295 5296 /* 5297 * Mark these ASICs to be reseted as untracked first 5298 * And add them back after reset completed 5299 */ 5300 amdgpu_unregister_gpu_instance(tmp_adev); 5301 5302 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5303 5304 /* disable ras on ALL IPs */ 5305 if (!need_emergency_restart && 5306 amdgpu_device_ip_need_full_reset(tmp_adev)) 5307 amdgpu_ras_suspend(tmp_adev); 5308 5309 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5310 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5311 5312 if (!ring || !ring->sched.thread) 5313 continue; 5314 5315 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5316 5317 if (need_emergency_restart) 5318 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5319 } 5320 atomic_inc(&tmp_adev->gpu_reset_counter); 5321 } 5322 5323 if (need_emergency_restart) 5324 goto skip_sched_resume; 5325 5326 /* 5327 * Must check guilty signal here since after this point all old 5328 * HW fences are force signaled. 5329 * 5330 * job->base holds a reference to parent fence 5331 */ 5332 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5333 job_signaled = true; 5334 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5335 goto skip_hw_reset; 5336 } 5337 5338 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5339 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5340 if (gpu_reset_for_dev_remove) { 5341 /* Workaroud for ASICs need to disable SMC first */ 5342 amdgpu_device_smu_fini_early(tmp_adev); 5343 } 5344 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5345 /*TODO Should we stop ?*/ 5346 if (r) { 5347 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5348 r, adev_to_drm(tmp_adev)->unique); 5349 tmp_adev->asic_reset_res = r; 5350 } 5351 5352 /* 5353 * Drop all pending non scheduler resets. 
Scheduler resets
		 * were already dropped during drm_sched_stop.
		 */
		amdgpu_device_stop_pending_resets(tmp_adev);
	}

	/* Actual ASIC resets if needed. */
	/* Host driver will handle XGMI hive reset for SRIOV */
	if (amdgpu_sriov_vf(adev)) {
		r = amdgpu_device_reset_sriov(adev, job ? false : true);
		if (r)
			adev->asic_reset_res = r;

		/* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */
		if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) ||
		    adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3))
			amdgpu_ras_resume(adev);
	} else {
		r = amdgpu_do_asic_reset(device_list_handle, reset_context);
		if (r && r == -EAGAIN)
			goto retry;

		if (!r && gpu_reset_for_dev_remove)
			goto recover_end;
	}

skip_hw_reset:

	/* Post ASIC reset for all devs. */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_start(&ring->sched, true);
		}

		if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3))
			amdgpu_mes_self_test(tmp_adev);

		if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));

		if (tmp_adev->asic_reset_res)
			r = tmp_adev->asic_reset_res;

		tmp_adev->asic_reset_res = 0;

		if (r) {
			/* bad news, how to tell it to userspace ? */
			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
			if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
				DRM_WARN("smart shift update failed\n");
		}
	}

skip_sched_resume:
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		/* unlock kfd: SRIOV would do it separately */
		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);

		/* kfd_post_reset will do nothing if the kfd device is not initialized,
		 * so bring up kfd here if it wasn't initialized before.
		 */
		if (!adev->kfd.init_complete)
			amdgpu_amdkfd_device_init(adev);

		if (audio_suspended)
			amdgpu_device_resume_display_audio(tmp_adev);

		amdgpu_device_unset_mp1_state(tmp_adev);

		amdgpu_ras_set_error_query_ready(tmp_adev, true);
	}

recover_end:
	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
				    reset_list);
	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);

	if (hive) {
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}

	if (r)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);

	atomic_set(&adev->reset_domain->reset_res, r);
	return r;
}

/**
 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIe capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIe config space may not be available.
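 * The probed capabilities are stored in adev->pm.pcie_gen_mask and
 * adev->pm.pcie_mlw_mask; the amdgpu_pcie_gen_cap and amdgpu_pcie_lane_cap
 * module parameters, when set, override the probed values.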
5461 */ 5462 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5463 { 5464 struct pci_dev *pdev; 5465 enum pci_bus_speed speed_cap, platform_speed_cap; 5466 enum pcie_link_width platform_link_width; 5467 5468 if (amdgpu_pcie_gen_cap) 5469 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5470 5471 if (amdgpu_pcie_lane_cap) 5472 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5473 5474 /* covers APUs as well */ 5475 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 5476 if (adev->pm.pcie_gen_mask == 0) 5477 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5478 if (adev->pm.pcie_mlw_mask == 0) 5479 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5480 return; 5481 } 5482 5483 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5484 return; 5485 5486 pcie_bandwidth_available(adev->pdev, NULL, 5487 &platform_speed_cap, &platform_link_width); 5488 5489 if (adev->pm.pcie_gen_mask == 0) { 5490 /* asic caps */ 5491 pdev = adev->pdev; 5492 speed_cap = pcie_get_speed_cap(pdev); 5493 if (speed_cap == PCI_SPEED_UNKNOWN) { 5494 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5495 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5496 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5497 } else { 5498 if (speed_cap == PCIE_SPEED_32_0GT) 5499 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5500 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5501 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5502 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5503 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5504 else if (speed_cap == PCIE_SPEED_16_0GT) 5505 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5506 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5507 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5508 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5509 else if (speed_cap == PCIE_SPEED_8_0GT) 5510 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5511 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5512 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5513 else if (speed_cap == PCIE_SPEED_5_0GT) 5514 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5515 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5516 else 5517 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5518 } 5519 /* platform caps */ 5520 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5521 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5522 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5523 } else { 5524 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5525 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5526 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5527 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5528 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5529 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5530 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5531 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5532 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5533 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5534 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5535 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5536 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5537 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5538 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5539 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5540 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5541 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5542 else 5543 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5544 5545 } 5546 } 5547 if (adev->pm.pcie_mlw_mask == 0) { 5548 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) 
{ 5549 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5550 } else { 5551 switch (platform_link_width) { 5552 case PCIE_LNK_X32: 5553 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5554 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5555 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5556 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5557 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5558 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5559 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5560 break; 5561 case PCIE_LNK_X16: 5562 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5563 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5564 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5565 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5566 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5567 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5568 break; 5569 case PCIE_LNK_X12: 5570 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5571 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5572 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5573 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5574 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5575 break; 5576 case PCIE_LNK_X8: 5577 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5578 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5579 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5580 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5581 break; 5582 case PCIE_LNK_X4: 5583 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5584 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5585 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5586 break; 5587 case PCIE_LNK_X2: 5588 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5589 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5590 break; 5591 case PCIE_LNK_X1: 5592 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5593 break; 5594 default: 5595 break; 5596 } 5597 } 5598 } 5599 } 5600 5601 /** 5602 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 5603 * 5604 * @adev: amdgpu_device pointer 5605 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 5606 * 5607 * Return true if @peer_adev can access (DMA) @adev through the PCIe 5608 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 5609 * @peer_adev. 5610 */ 5611 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 5612 struct amdgpu_device *peer_adev) 5613 { 5614 #ifdef CONFIG_HSA_AMD_P2P 5615 uint64_t address_mask = peer_adev->dev->dma_mask ? 
5616 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 5617 resource_size_t aper_limit = 5618 adev->gmc.aper_base + adev->gmc.aper_size - 1; 5619 bool p2p_access = 5620 !adev->gmc.xgmi.connected_to_cpu && 5621 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 5622 5623 return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size && 5624 adev->gmc.real_vram_size == adev->gmc.visible_vram_size && 5625 !(adev->gmc.aper_base & address_mask || 5626 aper_limit & address_mask)); 5627 #else 5628 return false; 5629 #endif 5630 } 5631 5632 int amdgpu_device_baco_enter(struct drm_device *dev) 5633 { 5634 struct amdgpu_device *adev = drm_to_adev(dev); 5635 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5636 5637 if (!amdgpu_device_supports_baco(dev)) 5638 return -ENOTSUPP; 5639 5640 if (ras && adev->ras_enabled && 5641 adev->nbio.funcs->enable_doorbell_interrupt) 5642 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5643 5644 return amdgpu_dpm_baco_enter(adev); 5645 } 5646 5647 int amdgpu_device_baco_exit(struct drm_device *dev) 5648 { 5649 struct amdgpu_device *adev = drm_to_adev(dev); 5650 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5651 int ret = 0; 5652 5653 if (!amdgpu_device_supports_baco(dev)) 5654 return -ENOTSUPP; 5655 5656 ret = amdgpu_dpm_baco_exit(adev); 5657 if (ret) 5658 return ret; 5659 5660 if (ras && adev->ras_enabled && 5661 adev->nbio.funcs->enable_doorbell_interrupt) 5662 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5663 5664 if (amdgpu_passthrough(adev) && 5665 adev->nbio.funcs->clear_doorbell_interrupt) 5666 adev->nbio.funcs->clear_doorbell_interrupt(adev); 5667 5668 return 0; 5669 } 5670 5671 /** 5672 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5673 * @pdev: PCI device struct 5674 * @state: PCI channel state 5675 * 5676 * Description: Called when a PCI error is detected. 5677 * 5678 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
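 * For a frozen channel (fatal error) the reset domain is locked and all ring
 * schedulers are stopped before a slot reset is requested.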
5679 */ 5680 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 5681 { 5682 struct drm_device *dev = pci_get_drvdata(pdev); 5683 struct amdgpu_device *adev = drm_to_adev(dev); 5684 int i; 5685 5686 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 5687 5688 if (adev->gmc.xgmi.num_physical_nodes > 1) { 5689 DRM_WARN("No support for XGMI hive yet..."); 5690 return PCI_ERS_RESULT_DISCONNECT; 5691 } 5692 5693 adev->pci_channel_state = state; 5694 5695 switch (state) { 5696 case pci_channel_io_normal: 5697 return PCI_ERS_RESULT_CAN_RECOVER; 5698 /* Fatal error, prepare for slot reset */ 5699 case pci_channel_io_frozen: 5700 /* 5701 * Locking adev->reset_domain->sem will prevent any external access 5702 * to GPU during PCI error recovery 5703 */ 5704 amdgpu_device_lock_reset_domain(adev->reset_domain); 5705 amdgpu_device_set_mp1_state(adev); 5706 5707 /* 5708 * Block any work scheduling as we do for regular GPU reset 5709 * for the duration of the recovery 5710 */ 5711 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5712 struct amdgpu_ring *ring = adev->rings[i]; 5713 5714 if (!ring || !ring->sched.thread) 5715 continue; 5716 5717 drm_sched_stop(&ring->sched, NULL); 5718 } 5719 atomic_inc(&adev->gpu_reset_counter); 5720 return PCI_ERS_RESULT_NEED_RESET; 5721 case pci_channel_io_perm_failure: 5722 /* Permanent error, prepare for device removal */ 5723 return PCI_ERS_RESULT_DISCONNECT; 5724 } 5725 5726 return PCI_ERS_RESULT_NEED_RESET; 5727 } 5728 5729 /** 5730 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 5731 * @pdev: pointer to PCI device 5732 */ 5733 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 5734 { 5735 5736 DRM_INFO("PCI error: mmio enabled callback!!\n"); 5737 5738 /* TODO - dump whatever for debugging purposes */ 5739 5740 /* This is called only if amdgpu_pci_error_detected returns 5741 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 5742 * works, no need to reset slot. 5743 */ 5744 5745 return PCI_ERS_RESULT_RECOVERED; 5746 } 5747 5748 /** 5749 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 5750 * @pdev: PCI device struct 5751 * 5752 * Description: This routine is called by the PCI error recovery 5753 * code after the PCI slot has been reset, just before we 5754 * should resume normal operations.
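 *
 * Return: PCI_ERS_RESULT_RECOVERED if the ASIC came back and the reset
 * succeeded, PCI_ERS_RESULT_DISCONNECT otherwise.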
5755 */ 5756 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 5757 { 5758 struct drm_device *dev = pci_get_drvdata(pdev); 5759 struct amdgpu_device *adev = drm_to_adev(dev); 5760 int r, i; 5761 struct amdgpu_reset_context reset_context; 5762 u32 memsize; 5763 struct list_head device_list; 5764 5765 DRM_INFO("PCI error: slot reset callback!!\n"); 5766 5767 memset(&reset_context, 0, sizeof(reset_context)); 5768 5769 INIT_LIST_HEAD(&device_list); 5770 list_add_tail(&adev->reset_list, &device_list); 5771 5772 /* wait for ASIC to come out of reset */ 5773 msleep(500); 5774 5775 /* Restore PCI config space */ 5776 amdgpu_device_load_pci_state(pdev); 5777 5778 /* confirm ASIC came out of reset */ 5779 for (i = 0; i < adev->usec_timeout; i++) { 5780 memsize = amdgpu_asic_get_config_memsize(adev); 5781 5782 if (memsize != 0xffffffff) 5783 break; 5784 udelay(1); 5785 } 5786 if (memsize == 0xffffffff) { 5787 r = -ETIME; 5788 goto out; 5789 } 5790 5791 reset_context.method = AMD_RESET_METHOD_NONE; 5792 reset_context.reset_req_dev = adev; 5793 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 5794 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 5795 5796 adev->no_hw_access = true; 5797 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 5798 adev->no_hw_access = false; 5799 if (r) 5800 goto out; 5801 5802 r = amdgpu_do_asic_reset(&device_list, &reset_context); 5803 5804 out: 5805 if (!r) { 5806 if (amdgpu_device_cache_pci_state(adev->pdev)) 5807 pci_restore_state(adev->pdev); 5808 5809 DRM_INFO("PCIe error recovery succeeded\n"); 5810 } else { 5811 DRM_ERROR("PCIe error recovery failed, err:%d\n", r); 5812 amdgpu_device_unset_mp1_state(adev); 5813 amdgpu_device_unlock_reset_domain(adev->reset_domain); 5814 } 5815 5816 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 5817 } 5818 5819 /** 5820 * amdgpu_pci_resume() - resume normal ops after PCI reset 5821 * @pdev: pointer to PCI device 5822 * 5823 * Called when the error recovery driver tells us that it's 5824 * OK to resume normal operation.
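 *
 * Restarts the ring schedulers stopped in amdgpu_pci_error_detected() and
 * releases the reset domain lock; this is only done when the channel state
 * was pci_channel_io_frozen.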
5825 */ 5826 void amdgpu_pci_resume(struct pci_dev *pdev) 5827 { 5828 struct drm_device *dev = pci_get_drvdata(pdev); 5829 struct amdgpu_device *adev = drm_to_adev(dev); 5830 int i; 5831 5832 5833 DRM_INFO("PCI error: resume callback!!\n"); 5834 5835 /* Only continue execution for the case of pci_channel_io_frozen */ 5836 if (adev->pci_channel_state != pci_channel_io_frozen) 5837 return; 5838 5839 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5840 struct amdgpu_ring *ring = adev->rings[i]; 5841 5842 if (!ring || !ring->sched.thread) 5843 continue; 5844 5845 drm_sched_start(&ring->sched, true); 5846 } 5847 5848 amdgpu_device_unset_mp1_state(adev); 5849 amdgpu_device_unlock_reset_domain(adev->reset_domain); 5850 } 5851 5852 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 5853 { 5854 struct drm_device *dev = pci_get_drvdata(pdev); 5855 struct amdgpu_device *adev = drm_to_adev(dev); 5856 int r; 5857 5858 r = pci_save_state(pdev); 5859 if (!r) { 5860 kfree(adev->pci_state); 5861 5862 adev->pci_state = pci_store_saved_state(pdev); 5863 5864 if (!adev->pci_state) { 5865 DRM_ERROR("Failed to store PCI saved state\n"); 5866 return false; 5867 } 5868 } else { 5869 DRM_WARN("Failed to save PCI state, err:%d\n", r); 5870 return false; 5871 } 5872 5873 return true; 5874 } 5875 5876 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 5877 { 5878 struct drm_device *dev = pci_get_drvdata(pdev); 5879 struct amdgpu_device *adev = drm_to_adev(dev); 5880 int r; 5881 5882 if (!adev->pci_state) 5883 return false; 5884 5885 r = pci_load_saved_state(pdev, adev->pci_state); 5886 5887 if (!r) { 5888 pci_restore_state(pdev); 5889 } else { 5890 DRM_WARN("Failed to load PCI state, err:%d\n", r); 5891 return false; 5892 } 5893 5894 return true; 5895 } 5896 5897 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 5898 struct amdgpu_ring *ring) 5899 { 5900 #ifdef CONFIG_X86_64 5901 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 5902 return; 5903 #endif 5904 if (adev->gmc.xgmi.connected_to_cpu) 5905 return; 5906 5907 if (ring && ring->funcs->emit_hdp_flush) 5908 amdgpu_ring_emit_hdp_flush(ring); 5909 else 5910 amdgpu_asic_flush_hdp(adev, ring); 5911 } 5912 5913 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 5914 struct amdgpu_ring *ring) 5915 { 5916 #ifdef CONFIG_X86_64 5917 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 5918 return; 5919 #endif 5920 if (adev->gmc.xgmi.connected_to_cpu) 5921 return; 5922 5923 amdgpu_asic_invalidate_hdp(adev, ring); 5924 } 5925 5926 int amdgpu_in_reset(struct amdgpu_device *adev) 5927 { 5928 return atomic_read(&adev->reset_domain->in_gpu_reset); 5929 } 5930 5931 /** 5932 * amdgpu_device_halt() - bring hardware to some kind of halt state 5933 * 5934 * @adev: amdgpu_device pointer 5935 * 5936 * Bring hardware to some kind of halt state so that no one can touch it 5937 * any more. It helps to maintain the error context when an error occurs. 5938 * Compared to a simple hang, the system will stay stable at least for SSH 5939 * access. Then it should be trivial to inspect the hardware state and 5940 * see what's going on. Implemented as follows: 5941 * 5942 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc.), 5943 * clears all CPU mappings to the device, disallows remappings through page faults 5944 * 2. amdgpu_irq_disable_all() disables all interrupts 5945 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 5946 * 4. set adev->no_hw_access to avoid potential crashes after step 5 5947 * 5.
amdgpu_device_unmap_mmio() clears all MMIO mappings 5948 * 6. pci_disable_device() and pci_wait_for_pending_transaction() 5949 * flush any in flight DMA operations 5950 */ 5951 void amdgpu_device_halt(struct amdgpu_device *adev) 5952 { 5953 struct pci_dev *pdev = adev->pdev; 5954 struct drm_device *ddev = adev_to_drm(adev); 5955 5956 amdgpu_xcp_dev_unplug(adev); 5957 drm_dev_unplug(ddev); 5958 5959 amdgpu_irq_disable_all(adev); 5960 5961 amdgpu_fence_driver_hw_fini(adev); 5962 5963 adev->no_hw_access = true; 5964 5965 amdgpu_device_unmap_mmio(adev); 5966 5967 pci_disable_device(pdev); 5968 pci_wait_for_pending_transaction(pdev); 5969 } 5970 5971 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, 5972 u32 reg) 5973 { 5974 unsigned long flags, address, data; 5975 u32 r; 5976 5977 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 5978 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 5979 5980 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 5981 WREG32(address, reg * 4); 5982 (void)RREG32(address); 5983 r = RREG32(data); 5984 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 5985 return r; 5986 } 5987 5988 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, 5989 u32 reg, u32 v) 5990 { 5991 unsigned long flags, address, data; 5992 5993 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 5994 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 5995 5996 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 5997 WREG32(address, reg * 4); 5998 (void)RREG32(address); 5999 WREG32(data, v); 6000 (void)RREG32(data); 6001 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6002 } 6003 6004 /** 6005 * amdgpu_device_switch_gang - switch to a new gang 6006 * @adev: amdgpu_device pointer 6007 * @gang: the gang to switch to 6008 * 6009 * Try to switch to a new gang. 6010 * Returns: NULL if we switched to the new gang or a reference to the current 6011 * gang leader. 
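 *
 * If a fence is returned, the previous gang has not finished yet. Callers
 * typically treat the returned fence as a dependency to wait on and retry
 * the switch once it signals, e.g. (illustrative sketch only, not a
 * definitive usage pattern):
 *
 *	fence = amdgpu_device_switch_gang(adev, gang);
 *	if (fence)
 *		return fence;	// wait for the old gang, then retry the switch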
6012 */ 6013 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev, 6014 struct dma_fence *gang) 6015 { 6016 struct dma_fence *old = NULL; 6017 6018 do { 6019 dma_fence_put(old); 6020 rcu_read_lock(); 6021 old = dma_fence_get_rcu_safe(&adev->gang_submit); 6022 rcu_read_unlock(); 6023 6024 if (old == gang) 6025 break; 6026 6027 if (!dma_fence_is_signaled(old)) 6028 return old; 6029 6030 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit, 6031 old, gang) != old); 6032 6033 dma_fence_put(old); 6034 return NULL; 6035 } 6036 6037 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev) 6038 { 6039 switch (adev->asic_type) { 6040 #ifdef CONFIG_DRM_AMDGPU_SI 6041 case CHIP_HAINAN: 6042 #endif 6043 case CHIP_TOPAZ: 6044 /* chips with no display hardware */ 6045 return false; 6046 #ifdef CONFIG_DRM_AMDGPU_SI 6047 case CHIP_TAHITI: 6048 case CHIP_PITCAIRN: 6049 case CHIP_VERDE: 6050 case CHIP_OLAND: 6051 #endif 6052 #ifdef CONFIG_DRM_AMDGPU_CIK 6053 case CHIP_BONAIRE: 6054 case CHIP_HAWAII: 6055 case CHIP_KAVERI: 6056 case CHIP_KABINI: 6057 case CHIP_MULLINS: 6058 #endif 6059 case CHIP_TONGA: 6060 case CHIP_FIJI: 6061 case CHIP_POLARIS10: 6062 case CHIP_POLARIS11: 6063 case CHIP_POLARIS12: 6064 case CHIP_VEGAM: 6065 case CHIP_CARRIZO: 6066 case CHIP_STONEY: 6067 /* chips with display hardware */ 6068 return true; 6069 default: 6070 /* IP discovery */ 6071 if (!adev->ip_versions[DCE_HWIP][0] || 6072 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 6073 return false; 6074 return true; 6075 } 6076 } 6077 6078 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev, 6079 uint32_t inst, uint32_t reg_addr, char reg_name[], 6080 uint32_t expected_value, uint32_t mask) 6081 { 6082 uint32_t ret = 0; 6083 uint32_t old_ = 0; 6084 uint32_t tmp_ = RREG32(reg_addr); 6085 uint32_t loop = adev->usec_timeout; 6086 6087 while ((tmp_ & (mask)) != (expected_value)) { 6088 if (old_ != tmp_) { 6089 loop = adev->usec_timeout; 6090 old_ = tmp_; 6091 } else 6092 udelay(1); 6093 tmp_ = RREG32(reg_addr); 6094 loop--; 6095 if (!loop) { 6096 DRM_WARN("Register(%d) [%s] failed to reach expected value 0x%08x, got 0x%08x\n", 6097 inst, reg_name, (uint32_t)expected_value, 6098 (uint32_t)(tmp_ & (mask))); 6099 ret = -ETIMEDOUT; 6100 break; 6101 } 6102 } 6103 return ret; 6104 } 6105
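/*
 * Illustrative sketch (comment only, not built): how a caller might use
 * amdgpu_device_wait_on_rreg() to poll a register until a field reaches an
 * expected value. EXAMPLE_STATUS_OFFSET, EXAMPLE_STATUS_IDLE and
 * EXAMPLE_STATUS_BUSY_MASK are hypothetical placeholders, not real amdgpu
 * register definitions.
 *
 *	static int example_wait_for_idle(struct amdgpu_device *adev, u32 inst)
 *	{
 *		// Polls until (STATUS & BUSY_MASK) == IDLE, re-arming the
 *		// timeout whenever the read value changes; returns 0 on
 *		// success or -ETIMEDOUT (as reported by
 *		// amdgpu_device_wait_on_rreg) on timeout.
 *		return amdgpu_device_wait_on_rreg(adev, inst,
 *						  EXAMPLE_STATUS_OFFSET,
 *						  "EXAMPLE_STATUS",
 *						  EXAMPLE_STATUS_IDLE,
 *						  EXAMPLE_STATUS_BUSY_MASK);
 *	}
 */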