1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 34 #include <drm/drm_atomic_helper.h> 35 #include <drm/drm_probe_helper.h> 36 #include <drm/amdgpu_drm.h> 37 #include <linux/vgaarb.h> 38 #include <linux/vga_switcheroo.h> 39 #include <linux/efi.h> 40 #include "amdgpu.h" 41 #include "amdgpu_trace.h" 42 #include "amdgpu_i2c.h" 43 #include "atom.h" 44 #include "amdgpu_atombios.h" 45 #include "amdgpu_atomfirmware.h" 46 #include "amd_pcie.h" 47 #ifdef CONFIG_DRM_AMDGPU_SI 48 #include "si.h" 49 #endif 50 #ifdef CONFIG_DRM_AMDGPU_CIK 51 #include "cik.h" 52 #endif 53 #include "vi.h" 54 #include "soc15.h" 55 #include "nv.h" 56 #include "bif/bif_4_1_d.h" 57 #include <linux/pci.h> 58 #include <linux/firmware.h> 59 #include "amdgpu_vf_error.h" 60 61 #include "amdgpu_amdkfd.h" 62 #include "amdgpu_pm.h" 63 64 #include "amdgpu_xgmi.h" 65 #include "amdgpu_ras.h" 66 #include "amdgpu_pmu.h" 67 #include "amdgpu_fru_eeprom.h" 68 #include "amdgpu_reset.h" 69 70 #include <linux/suspend.h> 71 #include <drm/task_barrier.h> 72 #include <linux/pm_runtime.h> 73 74 #include <drm/drm_drv.h> 75 76 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 77 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 78 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 79 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 80 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 81 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 82 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin"); 83 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin"); 84 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin"); 85 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 86 MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin"); 87 MODULE_FIRMWARE("amdgpu/yellow_carp_gpu_info.bin"); 88 89 #define AMDGPU_RESUME_MS 2000 90 91 const char *amdgpu_asic_name[] = { 92 "TAHITI", 93 "PITCAIRN", 94 "VERDE", 95 "OLAND", 96 "HAINAN", 97 "BONAIRE", 98 "KAVERI", 99 "KABINI", 100 "HAWAII", 101 "MULLINS", 102 "TOPAZ", 103 "TONGA", 104 "FIJI", 105 "CARRIZO", 106 "STONEY", 107 "POLARIS10", 108 "POLARIS11", 109 "POLARIS12", 110 "VEGAM", 111 "VEGA10", 112 "VEGA12", 113 "VEGA20", 114 "RAVEN", 115 "ARCTURUS", 116 "RENOIR", 117 "ALDEBARAN", 118 "NAVI10", 
119 "NAVI14", 120 "NAVI12", 121 "SIENNA_CICHLID", 122 "NAVY_FLOUNDER", 123 "VANGOGH", 124 "DIMGREY_CAVEFISH", 125 "BEIGE_GOBY", 126 "YELLOW_CARP", 127 "LAST", 128 }; 129 130 /** 131 * DOC: pcie_replay_count 132 * 133 * The amdgpu driver provides a sysfs API for reporting the total number 134 * of PCIe replays (NAKs) 135 * The file pcie_replay_count is used for this and returns the total 136 * number of replays as a sum of the NAKs generated and NAKs received 137 */ 138 139 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 140 struct device_attribute *attr, char *buf) 141 { 142 struct drm_device *ddev = dev_get_drvdata(dev); 143 struct amdgpu_device *adev = drm_to_adev(ddev); 144 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 145 146 return sysfs_emit(buf, "%llu\n", cnt); 147 } 148 149 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 150 amdgpu_device_get_pcie_replay_count, NULL); 151 152 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 153 154 /** 155 * DOC: product_name 156 * 157 * The amdgpu driver provides a sysfs API for reporting the product name 158 * for the device 159 * The file serial_number is used for this and returns the product name 160 * as returned from the FRU. 161 * NOTE: This is only available for certain server cards 162 */ 163 164 static ssize_t amdgpu_device_get_product_name(struct device *dev, 165 struct device_attribute *attr, char *buf) 166 { 167 struct drm_device *ddev = dev_get_drvdata(dev); 168 struct amdgpu_device *adev = drm_to_adev(ddev); 169 170 return sysfs_emit(buf, "%s\n", adev->product_name); 171 } 172 173 static DEVICE_ATTR(product_name, S_IRUGO, 174 amdgpu_device_get_product_name, NULL); 175 176 /** 177 * DOC: product_number 178 * 179 * The amdgpu driver provides a sysfs API for reporting the part number 180 * for the device 181 * The file serial_number is used for this and returns the part number 182 * as returned from the FRU. 183 * NOTE: This is only available for certain server cards 184 */ 185 186 static ssize_t amdgpu_device_get_product_number(struct device *dev, 187 struct device_attribute *attr, char *buf) 188 { 189 struct drm_device *ddev = dev_get_drvdata(dev); 190 struct amdgpu_device *adev = drm_to_adev(ddev); 191 192 return sysfs_emit(buf, "%s\n", adev->product_number); 193 } 194 195 static DEVICE_ATTR(product_number, S_IRUGO, 196 amdgpu_device_get_product_number, NULL); 197 198 /** 199 * DOC: serial_number 200 * 201 * The amdgpu driver provides a sysfs API for reporting the serial number 202 * for the device 203 * The file serial_number is used for this and returns the serial number 204 * as returned from the FRU. 205 * NOTE: This is only available for certain server cards 206 */ 207 208 static ssize_t amdgpu_device_get_serial_number(struct device *dev, 209 struct device_attribute *attr, char *buf) 210 { 211 struct drm_device *ddev = dev_get_drvdata(dev); 212 struct amdgpu_device *adev = drm_to_adev(ddev); 213 214 return sysfs_emit(buf, "%s\n", adev->serial); 215 } 216 217 static DEVICE_ATTR(serial_number, S_IRUGO, 218 amdgpu_device_get_serial_number, NULL); 219 220 /** 221 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 222 * 223 * @dev: drm_device pointer 224 * 225 * Returns true if the device is a dGPU with ATPX power control, 226 * otherwise return false. 
227 */ 228 bool amdgpu_device_supports_px(struct drm_device *dev) 229 { 230 struct amdgpu_device *adev = drm_to_adev(dev); 231 232 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 233 return true; 234 return false; 235 } 236 237 /** 238 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 239 * 240 * @dev: drm_device pointer 241 * 242 * Returns true if the device is a dGPU with ACPI power control, 243 * otherwise return false. 244 */ 245 bool amdgpu_device_supports_boco(struct drm_device *dev) 246 { 247 struct amdgpu_device *adev = drm_to_adev(dev); 248 249 if (adev->has_pr3 || 250 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 251 return true; 252 return false; 253 } 254 255 /** 256 * amdgpu_device_supports_baco - Does the device support BACO 257 * 258 * @dev: drm_device pointer 259 * 260 * Returns true if the device supporte BACO, 261 * otherwise return false. 262 */ 263 bool amdgpu_device_supports_baco(struct drm_device *dev) 264 { 265 struct amdgpu_device *adev = drm_to_adev(dev); 266 267 return amdgpu_asic_supports_baco(adev); 268 } 269 270 /** 271 * amdgpu_device_supports_smart_shift - Is the device dGPU with 272 * smart shift support 273 * 274 * @dev: drm_device pointer 275 * 276 * Returns true if the device is a dGPU with Smart Shift support, 277 * otherwise returns false. 278 */ 279 bool amdgpu_device_supports_smart_shift(struct drm_device *dev) 280 { 281 return (amdgpu_device_supports_boco(dev) && 282 amdgpu_acpi_is_power_shift_control_supported()); 283 } 284 285 /* 286 * VRAM access helper functions 287 */ 288 289 /** 290 * amdgpu_device_vram_access - read/write a buffer in vram 291 * 292 * @adev: amdgpu_device pointer 293 * @pos: offset of the buffer in vram 294 * @buf: virtual address of the buffer in system memory 295 * @size: read/write size, sizeof(@buf) must > @size 296 * @write: true - write to vram, otherwise - read from vram 297 */ 298 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, 299 uint32_t *buf, size_t size, bool write) 300 { 301 unsigned long flags; 302 uint32_t hi = ~0; 303 uint64_t last; 304 int idx; 305 306 if (!drm_dev_enter(&adev->ddev, &idx)) 307 return; 308 309 #ifdef CONFIG_64BIT 310 last = min(pos + size, adev->gmc.visible_vram_size); 311 if (last > pos) { 312 void __iomem *addr = adev->mman.aper_base_kaddr + pos; 313 size_t count = last - pos; 314 315 if (write) { 316 memcpy_toio(addr, buf, count); 317 mb(); 318 amdgpu_device_flush_hdp(adev, NULL); 319 } else { 320 amdgpu_device_invalidate_hdp(adev, NULL); 321 mb(); 322 memcpy_fromio(buf, addr, count); 323 } 324 325 if (count == size) 326 goto exit; 327 328 pos += count; 329 buf += count / 4; 330 size -= count; 331 } 332 #endif 333 334 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 335 for (last = pos + size; pos < last; pos += 4) { 336 uint32_t tmp = pos >> 31; 337 338 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); 339 if (tmp != hi) { 340 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp); 341 hi = tmp; 342 } 343 if (write) 344 WREG32_NO_KIQ(mmMM_DATA, *buf++); 345 else 346 *buf++ = RREG32_NO_KIQ(mmMM_DATA); 347 } 348 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 349 350 #ifdef CONFIG_64BIT 351 exit: 352 #endif 353 drm_dev_exit(idx); 354 } 355 356 /* 357 * register access helper functions. 
358 */ 359 360 /* Check if hw access should be skipped because of hotplug or device error */ 361 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev) 362 { 363 if (adev->no_hw_access) 364 return true; 365 366 #ifdef CONFIG_LOCKDEP 367 /* 368 * This is a bit complicated to understand, so worth a comment. What we assert 369 * here is that the GPU reset is not running on another thread in parallel. 370 * 371 * For this we trylock the read side of the reset semaphore, if that succeeds 372 * we know that the reset is not running in paralell. 373 * 374 * If the trylock fails we assert that we are either already holding the read 375 * side of the lock or are the reset thread itself and hold the write side of 376 * the lock. 377 */ 378 if (in_task()) { 379 if (down_read_trylock(&adev->reset_sem)) 380 up_read(&adev->reset_sem); 381 else 382 lockdep_assert_held(&adev->reset_sem); 383 } 384 #endif 385 return false; 386 } 387 388 /** 389 * amdgpu_device_rreg - read a memory mapped IO or indirect register 390 * 391 * @adev: amdgpu_device pointer 392 * @reg: dword aligned register offset 393 * @acc_flags: access flags which require special behavior 394 * 395 * Returns the 32 bit value from the offset specified. 396 */ 397 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, 398 uint32_t reg, uint32_t acc_flags) 399 { 400 uint32_t ret; 401 402 if (amdgpu_device_skip_hw_access(adev)) 403 return 0; 404 405 if ((reg * 4) < adev->rmmio_size) { 406 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 407 amdgpu_sriov_runtime(adev) && 408 down_read_trylock(&adev->reset_sem)) { 409 ret = amdgpu_kiq_rreg(adev, reg); 410 up_read(&adev->reset_sem); 411 } else { 412 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 413 } 414 } else { 415 ret = adev->pcie_rreg(adev, reg * 4); 416 } 417 418 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret); 419 420 return ret; 421 } 422 423 /* 424 * MMIO register read with bytes helper functions 425 * @offset:bytes offset from MMIO start 426 * 427 */ 428 429 /** 430 * amdgpu_mm_rreg8 - read a memory mapped IO register 431 * 432 * @adev: amdgpu_device pointer 433 * @offset: byte aligned register offset 434 * 435 * Returns the 8 bit value from the offset specified. 436 */ 437 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) 438 { 439 if (amdgpu_device_skip_hw_access(adev)) 440 return 0; 441 442 if (offset < adev->rmmio_size) 443 return (readb(adev->rmmio + offset)); 444 BUG(); 445 } 446 447 /* 448 * MMIO register write with bytes helper functions 449 * @offset:bytes offset from MMIO start 450 * @value: the value want to be written to the register 451 * 452 */ 453 /** 454 * amdgpu_mm_wreg8 - read a memory mapped IO register 455 * 456 * @adev: amdgpu_device pointer 457 * @offset: byte aligned register offset 458 * @value: 8 bit value to write 459 * 460 * Writes the value specified to the offset specified. 461 */ 462 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 463 { 464 if (amdgpu_device_skip_hw_access(adev)) 465 return; 466 467 if (offset < adev->rmmio_size) 468 writeb(value, adev->rmmio + offset); 469 else 470 BUG(); 471 } 472 473 /** 474 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 475 * 476 * @adev: amdgpu_device pointer 477 * @reg: dword aligned register offset 478 * @v: 32 bit value to write to the register 479 * @acc_flags: access flags which require special behavior 480 * 481 * Writes the value specified to the offset specified. 
482 */ 483 void amdgpu_device_wreg(struct amdgpu_device *adev, 484 uint32_t reg, uint32_t v, 485 uint32_t acc_flags) 486 { 487 if (amdgpu_device_skip_hw_access(adev)) 488 return; 489 490 if ((reg * 4) < adev->rmmio_size) { 491 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 492 amdgpu_sriov_runtime(adev) && 493 down_read_trylock(&adev->reset_sem)) { 494 amdgpu_kiq_wreg(adev, reg, v); 495 up_read(&adev->reset_sem); 496 } else { 497 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 498 } 499 } else { 500 adev->pcie_wreg(adev, reg * 4, v); 501 } 502 503 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 504 } 505 506 /* 507 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range 508 * 509 * this function is invoked only the debugfs register access 510 * */ 511 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 512 uint32_t reg, uint32_t v) 513 { 514 if (amdgpu_device_skip_hw_access(adev)) 515 return; 516 517 if (amdgpu_sriov_fullaccess(adev) && 518 adev->gfx.rlc.funcs && 519 adev->gfx.rlc.funcs->is_rlcg_access_range) { 520 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 521 return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v, 0, 0); 522 } else { 523 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 524 } 525 } 526 527 /** 528 * amdgpu_mm_rdoorbell - read a doorbell dword 529 * 530 * @adev: amdgpu_device pointer 531 * @index: doorbell index 532 * 533 * Returns the value in the doorbell aperture at the 534 * requested doorbell index (CIK). 535 */ 536 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) 537 { 538 if (amdgpu_device_skip_hw_access(adev)) 539 return 0; 540 541 if (index < adev->doorbell.num_doorbells) { 542 return readl(adev->doorbell.ptr + index); 543 } else { 544 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 545 return 0; 546 } 547 } 548 549 /** 550 * amdgpu_mm_wdoorbell - write a doorbell dword 551 * 552 * @adev: amdgpu_device pointer 553 * @index: doorbell index 554 * @v: value to write 555 * 556 * Writes @v to the doorbell aperture at the 557 * requested doorbell index (CIK). 558 */ 559 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) 560 { 561 if (amdgpu_device_skip_hw_access(adev)) 562 return; 563 564 if (index < adev->doorbell.num_doorbells) { 565 writel(v, adev->doorbell.ptr + index); 566 } else { 567 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 568 } 569 } 570 571 /** 572 * amdgpu_mm_rdoorbell64 - read a doorbell Qword 573 * 574 * @adev: amdgpu_device pointer 575 * @index: doorbell index 576 * 577 * Returns the value in the doorbell aperture at the 578 * requested doorbell index (VEGA10+). 579 */ 580 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) 581 { 582 if (amdgpu_device_skip_hw_access(adev)) 583 return 0; 584 585 if (index < adev->doorbell.num_doorbells) { 586 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); 587 } else { 588 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 589 return 0; 590 } 591 } 592 593 /** 594 * amdgpu_mm_wdoorbell64 - write a doorbell Qword 595 * 596 * @adev: amdgpu_device pointer 597 * @index: doorbell index 598 * @v: value to write 599 * 600 * Writes @v to the doorbell aperture at the 601 * requested doorbell index (VEGA10+). 
602 */ 603 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) 604 { 605 if (amdgpu_device_skip_hw_access(adev)) 606 return; 607 608 if (index < adev->doorbell.num_doorbells) { 609 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); 610 } else { 611 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 612 } 613 } 614 615 /** 616 * amdgpu_device_indirect_rreg - read an indirect register 617 * 618 * @adev: amdgpu_device pointer 619 * @pcie_index: mmio register offset 620 * @pcie_data: mmio register offset 621 * @reg_addr: indirect register address to read from 622 * 623 * Returns the value of indirect register @reg_addr 624 */ 625 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 626 u32 pcie_index, u32 pcie_data, 627 u32 reg_addr) 628 { 629 unsigned long flags; 630 u32 r; 631 void __iomem *pcie_index_offset; 632 void __iomem *pcie_data_offset; 633 634 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 635 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 636 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 637 638 writel(reg_addr, pcie_index_offset); 639 readl(pcie_index_offset); 640 r = readl(pcie_data_offset); 641 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 642 643 return r; 644 } 645 646 /** 647 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 648 * 649 * @adev: amdgpu_device pointer 650 * @pcie_index: mmio register offset 651 * @pcie_data: mmio register offset 652 * @reg_addr: indirect register address to read from 653 * 654 * Returns the value of indirect register @reg_addr 655 */ 656 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 657 u32 pcie_index, u32 pcie_data, 658 u32 reg_addr) 659 { 660 unsigned long flags; 661 u64 r; 662 void __iomem *pcie_index_offset; 663 void __iomem *pcie_data_offset; 664 665 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 666 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 667 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 668 669 /* read low 32 bits */ 670 writel(reg_addr, pcie_index_offset); 671 readl(pcie_index_offset); 672 r = readl(pcie_data_offset); 673 /* read high 32 bits */ 674 writel(reg_addr + 4, pcie_index_offset); 675 readl(pcie_index_offset); 676 r |= ((u64)readl(pcie_data_offset) << 32); 677 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 678 679 return r; 680 } 681 682 /** 683 * amdgpu_device_indirect_wreg - write an indirect register address 684 * 685 * @adev: amdgpu_device pointer 686 * @pcie_index: mmio register offset 687 * @pcie_data: mmio register offset 688 * @reg_addr: indirect register offset 689 * @reg_data: indirect register data 690 * 691 */ 692 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 693 u32 pcie_index, u32 pcie_data, 694 u32 reg_addr, u32 reg_data) 695 { 696 unsigned long flags; 697 void __iomem *pcie_index_offset; 698 void __iomem *pcie_data_offset; 699 700 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 701 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 702 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 703 704 writel(reg_addr, pcie_index_offset); 705 readl(pcie_index_offset); 706 writel(reg_data, pcie_data_offset); 707 readl(pcie_data_offset); 708 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 709 } 710 711 /** 712 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 713 * 714 * @adev: amdgpu_device pointer 715 * @pcie_index: mmio register offset 716 * @pcie_data: mmio register 
offset 717 * @reg_addr: indirect register offset 718 * @reg_data: indirect register data 719 * 720 */ 721 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 722 u32 pcie_index, u32 pcie_data, 723 u32 reg_addr, u64 reg_data) 724 { 725 unsigned long flags; 726 void __iomem *pcie_index_offset; 727 void __iomem *pcie_data_offset; 728 729 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 730 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 731 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 732 733 /* write low 32 bits */ 734 writel(reg_addr, pcie_index_offset); 735 readl(pcie_index_offset); 736 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 737 readl(pcie_data_offset); 738 /* write high 32 bits */ 739 writel(reg_addr + 4, pcie_index_offset); 740 readl(pcie_index_offset); 741 writel((u32)(reg_data >> 32), pcie_data_offset); 742 readl(pcie_data_offset); 743 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 744 } 745 746 /** 747 * amdgpu_invalid_rreg - dummy reg read function 748 * 749 * @adev: amdgpu_device pointer 750 * @reg: offset of register 751 * 752 * Dummy register read function. Used for register blocks 753 * that certain asics don't have (all asics). 754 * Returns the value in the register. 755 */ 756 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 757 { 758 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 759 BUG(); 760 return 0; 761 } 762 763 /** 764 * amdgpu_invalid_wreg - dummy reg write function 765 * 766 * @adev: amdgpu_device pointer 767 * @reg: offset of register 768 * @v: value to write to the register 769 * 770 * Dummy register read function. Used for register blocks 771 * that certain asics don't have (all asics). 772 */ 773 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 774 { 775 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 776 reg, v); 777 BUG(); 778 } 779 780 /** 781 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 782 * 783 * @adev: amdgpu_device pointer 784 * @reg: offset of register 785 * 786 * Dummy register read function. Used for register blocks 787 * that certain asics don't have (all asics). 788 * Returns the value in the register. 789 */ 790 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 791 { 792 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 793 BUG(); 794 return 0; 795 } 796 797 /** 798 * amdgpu_invalid_wreg64 - dummy reg write function 799 * 800 * @adev: amdgpu_device pointer 801 * @reg: offset of register 802 * @v: value to write to the register 803 * 804 * Dummy register read function. Used for register blocks 805 * that certain asics don't have (all asics). 806 */ 807 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 808 { 809 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 810 reg, v); 811 BUG(); 812 } 813 814 /** 815 * amdgpu_block_invalid_rreg - dummy reg read function 816 * 817 * @adev: amdgpu_device pointer 818 * @block: offset of instance 819 * @reg: offset of register 820 * 821 * Dummy register read function. Used for register blocks 822 * that certain asics don't have (all asics). 823 * Returns the value in the register. 
824 */ 825 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 826 uint32_t block, uint32_t reg) 827 { 828 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 829 reg, block); 830 BUG(); 831 return 0; 832 } 833 834 /** 835 * amdgpu_block_invalid_wreg - dummy reg write function 836 * 837 * @adev: amdgpu_device pointer 838 * @block: offset of instance 839 * @reg: offset of register 840 * @v: value to write to the register 841 * 842 * Dummy register read function. Used for register blocks 843 * that certain asics don't have (all asics). 844 */ 845 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 846 uint32_t block, 847 uint32_t reg, uint32_t v) 848 { 849 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 850 reg, block, v); 851 BUG(); 852 } 853 854 /** 855 * amdgpu_device_asic_init - Wrapper for atom asic_init 856 * 857 * @adev: amdgpu_device pointer 858 * 859 * Does any asic specific work and then calls atom asic init. 860 */ 861 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 862 { 863 amdgpu_asic_pre_asic_init(adev); 864 865 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 866 } 867 868 /** 869 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page 870 * 871 * @adev: amdgpu_device pointer 872 * 873 * Allocates a scratch page of VRAM for use by various things in the 874 * driver. 875 */ 876 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev) 877 { 878 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, 879 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM, 880 &adev->vram_scratch.robj, 881 &adev->vram_scratch.gpu_addr, 882 (void **)&adev->vram_scratch.ptr); 883 } 884 885 /** 886 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page 887 * 888 * @adev: amdgpu_device pointer 889 * 890 * Frees the VRAM scratch page. 891 */ 892 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev) 893 { 894 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL); 895 } 896 897 /** 898 * amdgpu_device_program_register_sequence - program an array of registers. 899 * 900 * @adev: amdgpu_device pointer 901 * @registers: pointer to the register array 902 * @array_size: size of the register array 903 * 904 * Programs an array or registers with and and or masks. 905 * This is a helper for setting golden registers. 906 */ 907 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 908 const u32 *registers, 909 const u32 array_size) 910 { 911 u32 tmp, reg, and_mask, or_mask; 912 int i; 913 914 if (array_size % 3) 915 return; 916 917 for (i = 0; i < array_size; i +=3) { 918 reg = registers[i + 0]; 919 and_mask = registers[i + 1]; 920 or_mask = registers[i + 2]; 921 922 if (and_mask == 0xffffffff) { 923 tmp = or_mask; 924 } else { 925 tmp = RREG32(reg); 926 tmp &= ~and_mask; 927 if (adev->family >= AMDGPU_FAMILY_AI) 928 tmp |= (or_mask & and_mask); 929 else 930 tmp |= or_mask; 931 } 932 WREG32(reg, tmp); 933 } 934 } 935 936 /** 937 * amdgpu_device_pci_config_reset - reset the GPU 938 * 939 * @adev: amdgpu_device pointer 940 * 941 * Resets the GPU using the pci config reset sequence. 942 * Only applicable to asics prior to vega10. 
943 */ 944 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 945 { 946 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 947 } 948 949 /** 950 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 951 * 952 * @adev: amdgpu_device pointer 953 * 954 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 955 */ 956 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 957 { 958 return pci_reset_function(adev->pdev); 959 } 960 961 /* 962 * GPU doorbell aperture helpers function. 963 */ 964 /** 965 * amdgpu_device_doorbell_init - Init doorbell driver information. 966 * 967 * @adev: amdgpu_device pointer 968 * 969 * Init doorbell driver information (CIK) 970 * Returns 0 on success, error on failure. 971 */ 972 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) 973 { 974 975 /* No doorbell on SI hardware generation */ 976 if (adev->asic_type < CHIP_BONAIRE) { 977 adev->doorbell.base = 0; 978 adev->doorbell.size = 0; 979 adev->doorbell.num_doorbells = 0; 980 adev->doorbell.ptr = NULL; 981 return 0; 982 } 983 984 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) 985 return -EINVAL; 986 987 amdgpu_asic_init_doorbell_index(adev); 988 989 /* doorbell bar mapping */ 990 adev->doorbell.base = pci_resource_start(adev->pdev, 2); 991 adev->doorbell.size = pci_resource_len(adev->pdev, 2); 992 993 adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32), 994 adev->doorbell_index.max_assignment+1); 995 if (adev->doorbell.num_doorbells == 0) 996 return -EINVAL; 997 998 /* For Vega, reserve and map two pages on doorbell BAR since SDMA 999 * paging queue doorbell use the second page. The 1000 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the 1001 * doorbells are in the first page. So with paging queue enabled, 1002 * the max num_doorbells should + 1 page (0x400 in dword) 1003 */ 1004 if (adev->asic_type >= CHIP_VEGA10) 1005 adev->doorbell.num_doorbells += 0x400; 1006 1007 adev->doorbell.ptr = ioremap(adev->doorbell.base, 1008 adev->doorbell.num_doorbells * 1009 sizeof(u32)); 1010 if (adev->doorbell.ptr == NULL) 1011 return -ENOMEM; 1012 1013 return 0; 1014 } 1015 1016 /** 1017 * amdgpu_device_doorbell_fini - Tear down doorbell driver information. 1018 * 1019 * @adev: amdgpu_device pointer 1020 * 1021 * Tear down doorbell driver information (CIK) 1022 */ 1023 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) 1024 { 1025 iounmap(adev->doorbell.ptr); 1026 adev->doorbell.ptr = NULL; 1027 } 1028 1029 1030 1031 /* 1032 * amdgpu_device_wb_*() 1033 * Writeback is the method by which the GPU updates special pages in memory 1034 * with the status of certain GPU events (fences, ring pointers,etc.). 1035 */ 1036 1037 /** 1038 * amdgpu_device_wb_fini - Disable Writeback and free memory 1039 * 1040 * @adev: amdgpu_device pointer 1041 * 1042 * Disables Writeback and frees the Writeback memory (all asics). 1043 * Used at driver shutdown. 1044 */ 1045 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1046 { 1047 if (adev->wb.wb_obj) { 1048 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1049 &adev->wb.gpu_addr, 1050 (void **)&adev->wb.wb); 1051 adev->wb.wb_obj = NULL; 1052 } 1053 } 1054 1055 /** 1056 * amdgpu_device_wb_init- Init Writeback driver info and allocate memory 1057 * 1058 * @adev: amdgpu_device pointer 1059 * 1060 * Initializes writeback and allocates writeback memory (all asics). 1061 * Used at driver startup. 1062 * Returns 0 on success or an -error on failure. 
1063 */ 1064 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1065 { 1066 int r; 1067 1068 if (adev->wb.wb_obj == NULL) { 1069 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1070 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1071 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1072 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1073 (void **)&adev->wb.wb); 1074 if (r) { 1075 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1076 return r; 1077 } 1078 1079 adev->wb.num_wb = AMDGPU_MAX_WB; 1080 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1081 1082 /* clear wb memory */ 1083 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1084 } 1085 1086 return 0; 1087 } 1088 1089 /** 1090 * amdgpu_device_wb_get - Allocate a wb entry 1091 * 1092 * @adev: amdgpu_device pointer 1093 * @wb: wb index 1094 * 1095 * Allocate a wb slot for use by the driver (all asics). 1096 * Returns 0 on success or -EINVAL on failure. 1097 */ 1098 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1099 { 1100 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1101 1102 if (offset < adev->wb.num_wb) { 1103 __set_bit(offset, adev->wb.used); 1104 *wb = offset << 3; /* convert to dw offset */ 1105 return 0; 1106 } else { 1107 return -EINVAL; 1108 } 1109 } 1110 1111 /** 1112 * amdgpu_device_wb_free - Free a wb entry 1113 * 1114 * @adev: amdgpu_device pointer 1115 * @wb: wb index 1116 * 1117 * Free a wb slot allocated for use by the driver (all asics) 1118 */ 1119 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1120 { 1121 wb >>= 3; 1122 if (wb < adev->wb.num_wb) 1123 __clear_bit(wb, adev->wb.used); 1124 } 1125 1126 /** 1127 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1128 * 1129 * @adev: amdgpu_device pointer 1130 * 1131 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1132 * to fail, but if any of the BARs is not accessible after the size we abort 1133 * driver loading by returning -ENODEV. 1134 */ 1135 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1136 { 1137 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1138 struct pci_bus *root; 1139 struct resource *res; 1140 unsigned i; 1141 u16 cmd; 1142 int r; 1143 1144 /* Bypass for VF */ 1145 if (amdgpu_sriov_vf(adev)) 1146 return 0; 1147 1148 /* skip if the bios has already enabled large BAR */ 1149 if (adev->gmc.real_vram_size && 1150 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1151 return 0; 1152 1153 /* Check if the root BUS has 64bit memory resources */ 1154 root = adev->pdev->bus; 1155 while (root->parent) 1156 root = root->parent; 1157 1158 pci_bus_for_each_resource(root, res, i) { 1159 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1160 res->start > 0x100000000ull) 1161 break; 1162 } 1163 1164 /* Trying to resize is pointless without a root hub window above 4GB */ 1165 if (!res) 1166 return 0; 1167 1168 /* Limit the BAR size to what is available */ 1169 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1170 rbar_size); 1171 1172 /* Disable memory decoding while we change the BAR addresses and size */ 1173 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1174 pci_write_config_word(adev->pdev, PCI_COMMAND, 1175 cmd & ~PCI_COMMAND_MEMORY); 1176 1177 /* Free the VRAM and doorbell BAR, we most likely need to move both. 
*/ 1178 amdgpu_device_doorbell_fini(adev); 1179 if (adev->asic_type >= CHIP_BONAIRE) 1180 pci_release_resource(adev->pdev, 2); 1181 1182 pci_release_resource(adev->pdev, 0); 1183 1184 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1185 if (r == -ENOSPC) 1186 DRM_INFO("Not enough PCI address space for a large BAR."); 1187 else if (r && r != -ENOTSUPP) 1188 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1189 1190 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1191 1192 /* When the doorbell or fb BAR isn't available we have no chance of 1193 * using the device. 1194 */ 1195 r = amdgpu_device_doorbell_init(adev); 1196 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1197 return -ENODEV; 1198 1199 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1200 1201 return 0; 1202 } 1203 1204 /* 1205 * GPU helpers function. 1206 */ 1207 /** 1208 * amdgpu_device_need_post - check if the hw need post or not 1209 * 1210 * @adev: amdgpu_device pointer 1211 * 1212 * Check if the asic has been initialized (all asics) at driver startup 1213 * or post is needed if hw reset is performed. 1214 * Returns true if need or false if not. 1215 */ 1216 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1217 { 1218 uint32_t reg; 1219 1220 if (amdgpu_sriov_vf(adev)) 1221 return false; 1222 1223 if (amdgpu_passthrough(adev)) { 1224 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1225 * some old smc fw still need driver do vPost otherwise gpu hang, while 1226 * those smc fw version above 22.15 doesn't have this flaw, so we force 1227 * vpost executed for smc version below 22.15 1228 */ 1229 if (adev->asic_type == CHIP_FIJI) { 1230 int err; 1231 uint32_t fw_ver; 1232 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1233 /* force vPost if error occured */ 1234 if (err) 1235 return true; 1236 1237 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1238 if (fw_ver < 0x00160e00) 1239 return true; 1240 } 1241 } 1242 1243 /* Don't post if we need to reset whole hive on init */ 1244 if (adev->gmc.xgmi.pending_reset) 1245 return false; 1246 1247 if (adev->has_hw_reset) { 1248 adev->has_hw_reset = false; 1249 return true; 1250 } 1251 1252 /* bios scratch used on CIK+ */ 1253 if (adev->asic_type >= CHIP_BONAIRE) 1254 return amdgpu_atombios_scratch_need_asic_init(adev); 1255 1256 /* check MEM_SIZE for older asics */ 1257 reg = amdgpu_asic_get_config_memsize(adev); 1258 1259 if ((reg != 0) && (reg != 0xffffffff)) 1260 return false; 1261 1262 return true; 1263 } 1264 1265 /* if we get transitioned to only one device, take VGA back */ 1266 /** 1267 * amdgpu_device_vga_set_decode - enable/disable vga decode 1268 * 1269 * @cookie: amdgpu_device pointer 1270 * @state: enable/disable vga decode 1271 * 1272 * Enable/disable vga decode (all asics). 1273 * Returns VGA resource flags. 1274 */ 1275 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state) 1276 { 1277 struct amdgpu_device *adev = cookie; 1278 amdgpu_asic_set_vga_state(adev, state); 1279 if (state) 1280 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1281 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1282 else 1283 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1284 } 1285 1286 /** 1287 * amdgpu_device_check_block_size - validate the vm block size 1288 * 1289 * @adev: amdgpu_device pointer 1290 * 1291 * Validates the vm block size specified via module parameter. 
1292 * The vm block size defines number of bits in page table versus page directory, 1293 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1294 * page table and the remaining bits are in the page directory. 1295 */ 1296 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1297 { 1298 /* defines number of bits in page table versus page directory, 1299 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1300 * page table and the remaining bits are in the page directory */ 1301 if (amdgpu_vm_block_size == -1) 1302 return; 1303 1304 if (amdgpu_vm_block_size < 9) { 1305 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1306 amdgpu_vm_block_size); 1307 amdgpu_vm_block_size = -1; 1308 } 1309 } 1310 1311 /** 1312 * amdgpu_device_check_vm_size - validate the vm size 1313 * 1314 * @adev: amdgpu_device pointer 1315 * 1316 * Validates the vm size in GB specified via module parameter. 1317 * The VM size is the size of the GPU virtual memory space in GB. 1318 */ 1319 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1320 { 1321 /* no need to check the default value */ 1322 if (amdgpu_vm_size == -1) 1323 return; 1324 1325 if (amdgpu_vm_size < 1) { 1326 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1327 amdgpu_vm_size); 1328 amdgpu_vm_size = -1; 1329 } 1330 } 1331 1332 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1333 { 1334 struct sysinfo si; 1335 bool is_os_64 = (sizeof(void *) == 8); 1336 uint64_t total_memory; 1337 uint64_t dram_size_seven_GB = 0x1B8000000; 1338 uint64_t dram_size_three_GB = 0xB8000000; 1339 1340 if (amdgpu_smu_memory_pool_size == 0) 1341 return; 1342 1343 if (!is_os_64) { 1344 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1345 goto def_value; 1346 } 1347 si_meminfo(&si); 1348 total_memory = (uint64_t)si.totalram * si.mem_unit; 1349 1350 if ((amdgpu_smu_memory_pool_size == 1) || 1351 (amdgpu_smu_memory_pool_size == 2)) { 1352 if (total_memory < dram_size_three_GB) 1353 goto def_value1; 1354 } else if ((amdgpu_smu_memory_pool_size == 4) || 1355 (amdgpu_smu_memory_pool_size == 8)) { 1356 if (total_memory < dram_size_seven_GB) 1357 goto def_value1; 1358 } else { 1359 DRM_WARN("Smu memory pool size not supported\n"); 1360 goto def_value; 1361 } 1362 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1363 1364 return; 1365 1366 def_value1: 1367 DRM_WARN("No enough system memory\n"); 1368 def_value: 1369 adev->pm.smu_prv_buffer_size = 0; 1370 } 1371 1372 /** 1373 * amdgpu_device_check_arguments - validate module params 1374 * 1375 * @adev: amdgpu_device pointer 1376 * 1377 * Validates certain module parameters and updates 1378 * the associated values used by the driver (all asics). 
1379 */ 1380 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1381 { 1382 if (amdgpu_sched_jobs < 4) { 1383 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1384 amdgpu_sched_jobs); 1385 amdgpu_sched_jobs = 4; 1386 } else if (!is_power_of_2(amdgpu_sched_jobs)){ 1387 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1388 amdgpu_sched_jobs); 1389 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1390 } 1391 1392 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1393 /* gart size must be greater or equal to 32M */ 1394 dev_warn(adev->dev, "gart size (%d) too small\n", 1395 amdgpu_gart_size); 1396 amdgpu_gart_size = -1; 1397 } 1398 1399 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1400 /* gtt size must be greater or equal to 32M */ 1401 dev_warn(adev->dev, "gtt size (%d) too small\n", 1402 amdgpu_gtt_size); 1403 amdgpu_gtt_size = -1; 1404 } 1405 1406 /* valid range is between 4 and 9 inclusive */ 1407 if (amdgpu_vm_fragment_size != -1 && 1408 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1409 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1410 amdgpu_vm_fragment_size = -1; 1411 } 1412 1413 if (amdgpu_sched_hw_submission < 2) { 1414 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1415 amdgpu_sched_hw_submission); 1416 amdgpu_sched_hw_submission = 2; 1417 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1418 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1419 amdgpu_sched_hw_submission); 1420 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1421 } 1422 1423 amdgpu_device_check_smu_prv_buffer_size(adev); 1424 1425 amdgpu_device_check_vm_size(adev); 1426 1427 amdgpu_device_check_block_size(adev); 1428 1429 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1430 1431 amdgpu_gmc_tmz_set(adev); 1432 1433 amdgpu_gmc_noretry_set(adev); 1434 1435 return 0; 1436 } 1437 1438 /** 1439 * amdgpu_switcheroo_set_state - set switcheroo state 1440 * 1441 * @pdev: pci dev pointer 1442 * @state: vga_switcheroo state 1443 * 1444 * Callback for the switcheroo driver. Suspends or resumes the 1445 * the asics before or after it is powered up using ACPI methods. 1446 */ 1447 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1448 enum vga_switcheroo_state state) 1449 { 1450 struct drm_device *dev = pci_get_drvdata(pdev); 1451 int r; 1452 1453 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1454 return; 1455 1456 if (state == VGA_SWITCHEROO_ON) { 1457 pr_info("switched on\n"); 1458 /* don't suspend or resume card normally */ 1459 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1460 1461 pci_set_power_state(pdev, PCI_D0); 1462 amdgpu_device_load_pci_state(pdev); 1463 r = pci_enable_device(pdev); 1464 if (r) 1465 DRM_WARN("pci_enable_device failed (%d)\n", r); 1466 amdgpu_device_resume(dev, true); 1467 1468 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1469 } else { 1470 pr_info("switched off\n"); 1471 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1472 amdgpu_device_suspend(dev, true); 1473 amdgpu_device_cache_pci_state(pdev); 1474 /* Shut down the device */ 1475 pci_disable_device(pdev); 1476 pci_set_power_state(pdev, PCI_D3cold); 1477 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1478 } 1479 } 1480 1481 /** 1482 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1483 * 1484 * @pdev: pci dev pointer 1485 * 1486 * Callback for the switcheroo driver. 
Check of the switcheroo 1487 * state can be changed. 1488 * Returns true if the state can be changed, false if not. 1489 */ 1490 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1491 { 1492 struct drm_device *dev = pci_get_drvdata(pdev); 1493 1494 /* 1495 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1496 * locking inversion with the driver load path. And the access here is 1497 * completely racy anyway. So don't bother with locking for now. 1498 */ 1499 return atomic_read(&dev->open_count) == 0; 1500 } 1501 1502 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1503 .set_gpu_state = amdgpu_switcheroo_set_state, 1504 .reprobe = NULL, 1505 .can_switch = amdgpu_switcheroo_can_switch, 1506 }; 1507 1508 /** 1509 * amdgpu_device_ip_set_clockgating_state - set the CG state 1510 * 1511 * @dev: amdgpu_device pointer 1512 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1513 * @state: clockgating state (gate or ungate) 1514 * 1515 * Sets the requested clockgating state for all instances of 1516 * the hardware IP specified. 1517 * Returns the error code from the last instance. 1518 */ 1519 int amdgpu_device_ip_set_clockgating_state(void *dev, 1520 enum amd_ip_block_type block_type, 1521 enum amd_clockgating_state state) 1522 { 1523 struct amdgpu_device *adev = dev; 1524 int i, r = 0; 1525 1526 for (i = 0; i < adev->num_ip_blocks; i++) { 1527 if (!adev->ip_blocks[i].status.valid) 1528 continue; 1529 if (adev->ip_blocks[i].version->type != block_type) 1530 continue; 1531 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1532 continue; 1533 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1534 (void *)adev, state); 1535 if (r) 1536 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1537 adev->ip_blocks[i].version->funcs->name, r); 1538 } 1539 return r; 1540 } 1541 1542 /** 1543 * amdgpu_device_ip_set_powergating_state - set the PG state 1544 * 1545 * @dev: amdgpu_device pointer 1546 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1547 * @state: powergating state (gate or ungate) 1548 * 1549 * Sets the requested powergating state for all instances of 1550 * the hardware IP specified. 1551 * Returns the error code from the last instance. 1552 */ 1553 int amdgpu_device_ip_set_powergating_state(void *dev, 1554 enum amd_ip_block_type block_type, 1555 enum amd_powergating_state state) 1556 { 1557 struct amdgpu_device *adev = dev; 1558 int i, r = 0; 1559 1560 for (i = 0; i < adev->num_ip_blocks; i++) { 1561 if (!adev->ip_blocks[i].status.valid) 1562 continue; 1563 if (adev->ip_blocks[i].version->type != block_type) 1564 continue; 1565 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 1566 continue; 1567 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 1568 (void *)adev, state); 1569 if (r) 1570 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 1571 adev->ip_blocks[i].version->funcs->name, r); 1572 } 1573 return r; 1574 } 1575 1576 /** 1577 * amdgpu_device_ip_get_clockgating_state - get the CG state 1578 * 1579 * @adev: amdgpu_device pointer 1580 * @flags: clockgating feature flags 1581 * 1582 * Walks the list of IPs on the device and updates the clockgating 1583 * flags for each IP. 1584 * Updates @flags with the feature flags for each hardware IP where 1585 * clockgating is enabled. 
1586 */ 1587 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 1588 u32 *flags) 1589 { 1590 int i; 1591 1592 for (i = 0; i < adev->num_ip_blocks; i++) { 1593 if (!adev->ip_blocks[i].status.valid) 1594 continue; 1595 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 1596 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 1597 } 1598 } 1599 1600 /** 1601 * amdgpu_device_ip_wait_for_idle - wait for idle 1602 * 1603 * @adev: amdgpu_device pointer 1604 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1605 * 1606 * Waits for the request hardware IP to be idle. 1607 * Returns 0 for success or a negative error code on failure. 1608 */ 1609 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 1610 enum amd_ip_block_type block_type) 1611 { 1612 int i, r; 1613 1614 for (i = 0; i < adev->num_ip_blocks; i++) { 1615 if (!adev->ip_blocks[i].status.valid) 1616 continue; 1617 if (adev->ip_blocks[i].version->type == block_type) { 1618 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 1619 if (r) 1620 return r; 1621 break; 1622 } 1623 } 1624 return 0; 1625 1626 } 1627 1628 /** 1629 * amdgpu_device_ip_is_idle - is the hardware IP idle 1630 * 1631 * @adev: amdgpu_device pointer 1632 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1633 * 1634 * Check if the hardware IP is idle or not. 1635 * Returns true if it the IP is idle, false if not. 1636 */ 1637 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 1638 enum amd_ip_block_type block_type) 1639 { 1640 int i; 1641 1642 for (i = 0; i < adev->num_ip_blocks; i++) { 1643 if (!adev->ip_blocks[i].status.valid) 1644 continue; 1645 if (adev->ip_blocks[i].version->type == block_type) 1646 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 1647 } 1648 return true; 1649 1650 } 1651 1652 /** 1653 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 1654 * 1655 * @adev: amdgpu_device pointer 1656 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 1657 * 1658 * Returns a pointer to the hardware IP block structure 1659 * if it exists for the asic, otherwise NULL. 1660 */ 1661 struct amdgpu_ip_block * 1662 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 1663 enum amd_ip_block_type type) 1664 { 1665 int i; 1666 1667 for (i = 0; i < adev->num_ip_blocks; i++) 1668 if (adev->ip_blocks[i].version->type == type) 1669 return &adev->ip_blocks[i]; 1670 1671 return NULL; 1672 } 1673 1674 /** 1675 * amdgpu_device_ip_block_version_cmp 1676 * 1677 * @adev: amdgpu_device pointer 1678 * @type: enum amd_ip_block_type 1679 * @major: major version 1680 * @minor: minor version 1681 * 1682 * return 0 if equal or greater 1683 * return 1 if smaller or the ip_block doesn't exist 1684 */ 1685 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 1686 enum amd_ip_block_type type, 1687 u32 major, u32 minor) 1688 { 1689 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 1690 1691 if (ip_block && ((ip_block->version->major > major) || 1692 ((ip_block->version->major == major) && 1693 (ip_block->version->minor >= minor)))) 1694 return 0; 1695 1696 return 1; 1697 } 1698 1699 /** 1700 * amdgpu_device_ip_block_add 1701 * 1702 * @adev: amdgpu_device pointer 1703 * @ip_block_version: pointer to the IP to add 1704 * 1705 * Adds the IP block driver information to the collection of IPs 1706 * on the asic. 
1707 */ 1708 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 1709 const struct amdgpu_ip_block_version *ip_block_version) 1710 { 1711 if (!ip_block_version) 1712 return -EINVAL; 1713 1714 switch (ip_block_version->type) { 1715 case AMD_IP_BLOCK_TYPE_VCN: 1716 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 1717 return 0; 1718 break; 1719 case AMD_IP_BLOCK_TYPE_JPEG: 1720 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 1721 return 0; 1722 break; 1723 default: 1724 break; 1725 } 1726 1727 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 1728 ip_block_version->funcs->name); 1729 1730 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 1731 1732 return 0; 1733 } 1734 1735 /** 1736 * amdgpu_device_enable_virtual_display - enable virtual display feature 1737 * 1738 * @adev: amdgpu_device pointer 1739 * 1740 * Enabled the virtual display feature if the user has enabled it via 1741 * the module parameter virtual_display. This feature provides a virtual 1742 * display hardware on headless boards or in virtualized environments. 1743 * This function parses and validates the configuration string specified by 1744 * the user and configues the virtual display configuration (number of 1745 * virtual connectors, crtcs, etc.) specified. 1746 */ 1747 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 1748 { 1749 adev->enable_virtual_display = false; 1750 1751 if (amdgpu_virtual_display) { 1752 const char *pci_address_name = pci_name(adev->pdev); 1753 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 1754 1755 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 1756 pciaddstr_tmp = pciaddstr; 1757 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 1758 pciaddname = strsep(&pciaddname_tmp, ","); 1759 if (!strcmp("all", pciaddname) 1760 || !strcmp(pci_address_name, pciaddname)) { 1761 long num_crtc; 1762 int res = -1; 1763 1764 adev->enable_virtual_display = true; 1765 1766 if (pciaddname_tmp) 1767 res = kstrtol(pciaddname_tmp, 10, 1768 &num_crtc); 1769 1770 if (!res) { 1771 if (num_crtc < 1) 1772 num_crtc = 1; 1773 if (num_crtc > 6) 1774 num_crtc = 6; 1775 adev->mode_info.num_crtc = num_crtc; 1776 } else { 1777 adev->mode_info.num_crtc = 1; 1778 } 1779 break; 1780 } 1781 } 1782 1783 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 1784 amdgpu_virtual_display, pci_address_name, 1785 adev->enable_virtual_display, adev->mode_info.num_crtc); 1786 1787 kfree(pciaddstr); 1788 } 1789 } 1790 1791 /** 1792 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 1793 * 1794 * @adev: amdgpu_device pointer 1795 * 1796 * Parses the asic configuration parameters specified in the gpu info 1797 * firmware and makes them availale to the driver for use in configuring 1798 * the asic. 1799 * Returns 0 on success, -EINVAL on failure. 1800 */ 1801 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 1802 { 1803 const char *chip_name; 1804 char fw_name[40]; 1805 int err; 1806 const struct gpu_info_firmware_header_v1_0 *hdr; 1807 1808 adev->firmware.gpu_info_fw = NULL; 1809 1810 if (adev->mman.discovery_bin) { 1811 amdgpu_discovery_get_gfx_info(adev); 1812 1813 /* 1814 * FIXME: The bounding box is still needed by Navi12, so 1815 * temporarily read it from gpu_info firmware. Should be droped 1816 * when DAL no longer needs it. 
1817 */ 1818 if (adev->asic_type != CHIP_NAVI12) 1819 return 0; 1820 } 1821 1822 switch (adev->asic_type) { 1823 #ifdef CONFIG_DRM_AMDGPU_SI 1824 case CHIP_VERDE: 1825 case CHIP_TAHITI: 1826 case CHIP_PITCAIRN: 1827 case CHIP_OLAND: 1828 case CHIP_HAINAN: 1829 #endif 1830 #ifdef CONFIG_DRM_AMDGPU_CIK 1831 case CHIP_BONAIRE: 1832 case CHIP_HAWAII: 1833 case CHIP_KAVERI: 1834 case CHIP_KABINI: 1835 case CHIP_MULLINS: 1836 #endif 1837 case CHIP_TOPAZ: 1838 case CHIP_TONGA: 1839 case CHIP_FIJI: 1840 case CHIP_POLARIS10: 1841 case CHIP_POLARIS11: 1842 case CHIP_POLARIS12: 1843 case CHIP_VEGAM: 1844 case CHIP_CARRIZO: 1845 case CHIP_STONEY: 1846 case CHIP_VEGA20: 1847 case CHIP_ALDEBARAN: 1848 case CHIP_SIENNA_CICHLID: 1849 case CHIP_NAVY_FLOUNDER: 1850 case CHIP_DIMGREY_CAVEFISH: 1851 case CHIP_BEIGE_GOBY: 1852 default: 1853 return 0; 1854 case CHIP_VEGA10: 1855 chip_name = "vega10"; 1856 break; 1857 case CHIP_VEGA12: 1858 chip_name = "vega12"; 1859 break; 1860 case CHIP_RAVEN: 1861 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1862 chip_name = "raven2"; 1863 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1864 chip_name = "picasso"; 1865 else 1866 chip_name = "raven"; 1867 break; 1868 case CHIP_ARCTURUS: 1869 chip_name = "arcturus"; 1870 break; 1871 case CHIP_RENOIR: 1872 if (adev->apu_flags & AMD_APU_IS_RENOIR) 1873 chip_name = "renoir"; 1874 else 1875 chip_name = "green_sardine"; 1876 break; 1877 case CHIP_NAVI10: 1878 chip_name = "navi10"; 1879 break; 1880 case CHIP_NAVI14: 1881 chip_name = "navi14"; 1882 break; 1883 case CHIP_NAVI12: 1884 chip_name = "navi12"; 1885 break; 1886 case CHIP_VANGOGH: 1887 chip_name = "vangogh"; 1888 break; 1889 case CHIP_YELLOW_CARP: 1890 chip_name = "yellow_carp"; 1891 break; 1892 } 1893 1894 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 1895 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); 1896 if (err) { 1897 dev_err(adev->dev, 1898 "Failed to load gpu_info firmware \"%s\"\n", 1899 fw_name); 1900 goto out; 1901 } 1902 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); 1903 if (err) { 1904 dev_err(adev->dev, 1905 "Failed to validate gpu_info firmware \"%s\"\n", 1906 fw_name); 1907 goto out; 1908 } 1909 1910 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 1911 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 1912 1913 switch (hdr->version_major) { 1914 case 1: 1915 { 1916 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 1917 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 1918 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1919 1920 /* 1921 * Should be droped when DAL no longer needs it. 
1922 */ 1923 if (adev->asic_type == CHIP_NAVI12) 1924 goto parse_soc_bounding_box; 1925 1926 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 1927 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 1928 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 1929 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 1930 adev->gfx.config.max_texture_channel_caches = 1931 le32_to_cpu(gpu_info_fw->gc_num_tccs); 1932 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 1933 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 1934 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 1935 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 1936 adev->gfx.config.double_offchip_lds_buf = 1937 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 1938 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 1939 adev->gfx.cu_info.max_waves_per_simd = 1940 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 1941 adev->gfx.cu_info.max_scratch_slots_per_cu = 1942 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 1943 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 1944 if (hdr->version_minor >= 1) { 1945 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 1946 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 1947 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1948 adev->gfx.config.num_sc_per_sh = 1949 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 1950 adev->gfx.config.num_packer_per_sc = 1951 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 1952 } 1953 1954 parse_soc_bounding_box: 1955 /* 1956 * soc bounding box info is not integrated in disocovery table, 1957 * we always need to parse it from gpu info firmware if needed. 1958 */ 1959 if (hdr->version_minor == 2) { 1960 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 1961 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 1962 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1963 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 1964 } 1965 break; 1966 } 1967 default: 1968 dev_err(adev->dev, 1969 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 1970 err = -EINVAL; 1971 goto out; 1972 } 1973 out: 1974 return err; 1975 } 1976 1977 /** 1978 * amdgpu_device_ip_early_init - run early init for hardware IPs 1979 * 1980 * @adev: amdgpu_device pointer 1981 * 1982 * Early initialization pass for hardware IPs. The hardware IPs that make 1983 * up each asic are discovered each IP's early_init callback is run. This 1984 * is the first stage in initializing the asic. 1985 * Returns 0 on success, negative error code on failure. 
1986 */ 1987 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 1988 { 1989 int i, r; 1990 1991 amdgpu_device_enable_virtual_display(adev); 1992 1993 if (amdgpu_sriov_vf(adev)) { 1994 r = amdgpu_virt_request_full_gpu(adev, true); 1995 if (r) 1996 return r; 1997 } 1998 1999 switch (adev->asic_type) { 2000 #ifdef CONFIG_DRM_AMDGPU_SI 2001 case CHIP_VERDE: 2002 case CHIP_TAHITI: 2003 case CHIP_PITCAIRN: 2004 case CHIP_OLAND: 2005 case CHIP_HAINAN: 2006 adev->family = AMDGPU_FAMILY_SI; 2007 r = si_set_ip_blocks(adev); 2008 if (r) 2009 return r; 2010 break; 2011 #endif 2012 #ifdef CONFIG_DRM_AMDGPU_CIK 2013 case CHIP_BONAIRE: 2014 case CHIP_HAWAII: 2015 case CHIP_KAVERI: 2016 case CHIP_KABINI: 2017 case CHIP_MULLINS: 2018 if (adev->flags & AMD_IS_APU) 2019 adev->family = AMDGPU_FAMILY_KV; 2020 else 2021 adev->family = AMDGPU_FAMILY_CI; 2022 2023 r = cik_set_ip_blocks(adev); 2024 if (r) 2025 return r; 2026 break; 2027 #endif 2028 case CHIP_TOPAZ: 2029 case CHIP_TONGA: 2030 case CHIP_FIJI: 2031 case CHIP_POLARIS10: 2032 case CHIP_POLARIS11: 2033 case CHIP_POLARIS12: 2034 case CHIP_VEGAM: 2035 case CHIP_CARRIZO: 2036 case CHIP_STONEY: 2037 if (adev->flags & AMD_IS_APU) 2038 adev->family = AMDGPU_FAMILY_CZ; 2039 else 2040 adev->family = AMDGPU_FAMILY_VI; 2041 2042 r = vi_set_ip_blocks(adev); 2043 if (r) 2044 return r; 2045 break; 2046 case CHIP_VEGA10: 2047 case CHIP_VEGA12: 2048 case CHIP_VEGA20: 2049 case CHIP_RAVEN: 2050 case CHIP_ARCTURUS: 2051 case CHIP_RENOIR: 2052 case CHIP_ALDEBARAN: 2053 if (adev->flags & AMD_IS_APU) 2054 adev->family = AMDGPU_FAMILY_RV; 2055 else 2056 adev->family = AMDGPU_FAMILY_AI; 2057 2058 r = soc15_set_ip_blocks(adev); 2059 if (r) 2060 return r; 2061 break; 2062 case CHIP_NAVI10: 2063 case CHIP_NAVI14: 2064 case CHIP_NAVI12: 2065 case CHIP_SIENNA_CICHLID: 2066 case CHIP_NAVY_FLOUNDER: 2067 case CHIP_DIMGREY_CAVEFISH: 2068 case CHIP_BEIGE_GOBY: 2069 case CHIP_VANGOGH: 2070 case CHIP_YELLOW_CARP: 2071 if (adev->asic_type == CHIP_VANGOGH) 2072 adev->family = AMDGPU_FAMILY_VGH; 2073 else if (adev->asic_type == CHIP_YELLOW_CARP) 2074 adev->family = AMDGPU_FAMILY_YC; 2075 else 2076 adev->family = AMDGPU_FAMILY_NV; 2077 2078 r = nv_set_ip_blocks(adev); 2079 if (r) 2080 return r; 2081 break; 2082 default: 2083 /* FIXME: not supported yet */ 2084 return -EINVAL; 2085 } 2086 2087 amdgpu_amdkfd_device_probe(adev); 2088 2089 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2090 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2091 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2092 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2093 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2094 2095 for (i = 0; i < adev->num_ip_blocks; i++) { 2096 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2097 DRM_ERROR("disabled ip block: %d <%s>\n", 2098 i, adev->ip_blocks[i].version->funcs->name); 2099 adev->ip_blocks[i].status.valid = false; 2100 } else { 2101 if (adev->ip_blocks[i].version->funcs->early_init) { 2102 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2103 if (r == -ENOENT) { 2104 adev->ip_blocks[i].status.valid = false; 2105 } else if (r) { 2106 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2107 adev->ip_blocks[i].version->funcs->name, r); 2108 return r; 2109 } else { 2110 adev->ip_blocks[i].status.valid = true; 2111 } 2112 } else { 2113 adev->ip_blocks[i].status.valid = true; 2114 } 2115 } 2116 /* get the vbios after the asic_funcs are set up */ 2117 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) 
{ 2118 r = amdgpu_device_parse_gpu_info_fw(adev); 2119 if (r) 2120 return r; 2121 2122 /* Read BIOS */ 2123 if (!amdgpu_get_bios(adev)) 2124 return -EINVAL; 2125 2126 r = amdgpu_atombios_init(adev); 2127 if (r) { 2128 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2129 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2130 return r; 2131 } 2132 2133 /*get pf2vf msg info at it's earliest time*/ 2134 if (amdgpu_sriov_vf(adev)) 2135 amdgpu_virt_init_data_exchange(adev); 2136 2137 } 2138 } 2139 2140 adev->cg_flags &= amdgpu_cg_mask; 2141 adev->pg_flags &= amdgpu_pg_mask; 2142 2143 return 0; 2144 } 2145 2146 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2147 { 2148 int i, r; 2149 2150 for (i = 0; i < adev->num_ip_blocks; i++) { 2151 if (!adev->ip_blocks[i].status.sw) 2152 continue; 2153 if (adev->ip_blocks[i].status.hw) 2154 continue; 2155 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2156 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2157 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2158 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2159 if (r) { 2160 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2161 adev->ip_blocks[i].version->funcs->name, r); 2162 return r; 2163 } 2164 adev->ip_blocks[i].status.hw = true; 2165 } 2166 } 2167 2168 return 0; 2169 } 2170 2171 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2172 { 2173 int i, r; 2174 2175 for (i = 0; i < adev->num_ip_blocks; i++) { 2176 if (!adev->ip_blocks[i].status.sw) 2177 continue; 2178 if (adev->ip_blocks[i].status.hw) 2179 continue; 2180 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2181 if (r) { 2182 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2183 adev->ip_blocks[i].version->funcs->name, r); 2184 return r; 2185 } 2186 adev->ip_blocks[i].status.hw = true; 2187 } 2188 2189 return 0; 2190 } 2191 2192 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2193 { 2194 int r = 0; 2195 int i; 2196 uint32_t smu_version; 2197 2198 if (adev->asic_type >= CHIP_VEGA10) { 2199 for (i = 0; i < adev->num_ip_blocks; i++) { 2200 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2201 continue; 2202 2203 if (!adev->ip_blocks[i].status.sw) 2204 continue; 2205 2206 /* no need to do the fw loading again if already done*/ 2207 if (adev->ip_blocks[i].status.hw == true) 2208 break; 2209 2210 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2211 r = adev->ip_blocks[i].version->funcs->resume(adev); 2212 if (r) { 2213 DRM_ERROR("resume of IP block <%s> failed %d\n", 2214 adev->ip_blocks[i].version->funcs->name, r); 2215 return r; 2216 } 2217 } else { 2218 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2219 if (r) { 2220 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2221 adev->ip_blocks[i].version->funcs->name, r); 2222 return r; 2223 } 2224 } 2225 2226 adev->ip_blocks[i].status.hw = true; 2227 break; 2228 } 2229 } 2230 2231 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2232 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2233 2234 return r; 2235 } 2236 2237 /** 2238 * amdgpu_device_ip_init - run init for hardware IPs 2239 * 2240 * @adev: amdgpu_device pointer 2241 * 2242 * Main initialization pass for hardware IPs. The list of all the hardware 2243 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2244 * are run. 
sw_init initializes the software state associated with each IP 2245 * and hw_init initializes the hardware associated with each IP. 2246 * Returns 0 on success, negative error code on failure. 2247 */ 2248 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2249 { 2250 int i, r; 2251 2252 r = amdgpu_ras_init(adev); 2253 if (r) 2254 return r; 2255 2256 for (i = 0; i < adev->num_ip_blocks; i++) { 2257 if (!adev->ip_blocks[i].status.valid) 2258 continue; 2259 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2260 if (r) { 2261 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2262 adev->ip_blocks[i].version->funcs->name, r); 2263 goto init_failed; 2264 } 2265 adev->ip_blocks[i].status.sw = true; 2266 2267 /* need to do gmc hw init early so we can allocate gpu mem */ 2268 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2269 r = amdgpu_device_vram_scratch_init(adev); 2270 if (r) { 2271 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2272 goto init_failed; 2273 } 2274 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2275 if (r) { 2276 DRM_ERROR("hw_init %d failed %d\n", i, r); 2277 goto init_failed; 2278 } 2279 r = amdgpu_device_wb_init(adev); 2280 if (r) { 2281 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2282 goto init_failed; 2283 } 2284 adev->ip_blocks[i].status.hw = true; 2285 2286 /* right after GMC hw init, we create CSA */ 2287 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2288 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2289 AMDGPU_GEM_DOMAIN_VRAM, 2290 AMDGPU_CSA_SIZE); 2291 if (r) { 2292 DRM_ERROR("allocate CSA failed %d\n", r); 2293 goto init_failed; 2294 } 2295 } 2296 } 2297 } 2298 2299 if (amdgpu_sriov_vf(adev)) 2300 amdgpu_virt_init_data_exchange(adev); 2301 2302 r = amdgpu_ib_pool_init(adev); 2303 if (r) { 2304 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2305 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2306 goto init_failed; 2307 } 2308 2309 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2310 if (r) 2311 goto init_failed; 2312 2313 r = amdgpu_device_ip_hw_init_phase1(adev); 2314 if (r) 2315 goto init_failed; 2316 2317 r = amdgpu_device_fw_loading(adev); 2318 if (r) 2319 goto init_failed; 2320 2321 r = amdgpu_device_ip_hw_init_phase2(adev); 2322 if (r) 2323 goto init_failed; 2324 2325 /* 2326 * retired pages will be loaded from eeprom and reserved here, 2327 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2328 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2329 * for I2C communication which only true at this point. 2330 * 2331 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2332 * failure from bad gpu situation and stop amdgpu init process 2333 * accordingly. For other failed cases, it will still release all 2334 * the resource and print error message, rather than returning one 2335 * negative value to upper level. 
	 *
	 * Note: theoretically, this should be called before all vram allocations
	 * to protect retired pages from being abused.
	 */
	r = amdgpu_ras_recovery_init(adev);
	if (r)
		goto init_failed;

	if (adev->gmc.xgmi.num_physical_nodes > 1)
		amdgpu_xgmi_add_device(adev);

	/* Don't init kfd if the whole hive needs to be reset during init */
	if (!adev->gmc.xgmi.pending_reset)
		amdgpu_amdkfd_device_init(adev);

	amdgpu_fru_get_product_info(adev);

init_failed:
	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_release_full_gpu(adev, true);

	return r;
}

/**
 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
 *
 * @adev: amdgpu_device pointer
 *
 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
 * this function before a GPU reset. If the value is retained after a
 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
 */
static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
{
	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
}

/**
 * amdgpu_device_check_vram_lost - check if vram is valid
 *
 * @adev: amdgpu_device pointer
 *
 * Checks the reset magic value written to the gart pointer in VRAM.
 * The driver calls this after a GPU reset to see whether the contents of
 * VRAM have been lost or not.
 * Returns true if vram is lost, false if not.
 */
static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
{
	if (memcmp(adev->gart.ptr, adev->reset_magic,
		   AMDGPU_RESET_MAGIC_NUM))
		return true;

	if (!amdgpu_in_reset(adev))
		return false;

	/*
	 * For all ASICs with baco/mode1 reset, the VRAM is
	 * always assumed to be lost.
	 */
	switch (amdgpu_asic_reset_method(adev)) {
	case AMD_RESET_METHOD_BACO:
	case AMD_RESET_METHOD_MODE1:
		return true;
	default:
		return false;
	}
}

/**
 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
 *
 * @adev: amdgpu_device pointer
 * @state: clockgating state (gate or ungate)
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * set_clockgating_state callbacks are run.
 * The late initialization pass enables clockgating for hardware IPs;
 * the fini and suspend passes disable it.
 * Returns 0 on success, negative error code on failure.
 */
int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
			       enum amd_clockgating_state state)
{
	int i, j, r;

	if (amdgpu_emu_mode == 1)
		return 0;

	/* gate in the normal IP order, ungate in reverse order */
	for (j = 0; j < adev->num_ip_blocks; j++) {
		i = state == AMD_CG_STATE_GATE ?
			j : adev->num_ip_blocks - j - 1;
		if (!adev->ip_blocks[i].status.late_initialized)
			continue;
		/* skip CG for GFX on S0ix */
		if (adev->in_s0ix &&
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
			continue;
		/* skip CG for VCE/UVD, it's handled specially */
		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
		    adev->ip_blocks[i].version->funcs->set_clockgating_state) {
			/* enable clockgating to save power */
			r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
										     state);
			if (r) {
				DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
	}

	return 0;
}

int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
			       enum amd_powergating_state state)
{
	int i, j, r;

	if (amdgpu_emu_mode == 1)
		return 0;

	for (j = 0; j < adev->num_ip_blocks; j++) {
		i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
		if (!adev->ip_blocks[i].status.late_initialized)
			continue;
		/* skip PG for GFX on S0ix */
		if (adev->in_s0ix &&
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
			continue;
		/* skip PG for VCE/UVD, it's handled specially */
		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
			/* enable powergating to save power */
			r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
										      state);
			if (r) {
				DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
	}
	return 0;
}

static int amdgpu_device_enable_mgpu_fan_boost(void)
{
	struct amdgpu_gpu_instance *gpu_ins;
	struct amdgpu_device *adev;
	int i, ret = 0;

	mutex_lock(&mgpu_info.mutex);

	/*
	 * MGPU fan boost feature should be enabled
	 * only when there are two or more dGPUs in
	 * the system
	 */
	if (mgpu_info.num_dgpu < 2)
		goto out;

	for (i = 0; i < mgpu_info.num_dgpu; i++) {
		gpu_ins = &(mgpu_info.gpu_ins[i]);
		adev = gpu_ins->adev;
		if (!(adev->flags & AMD_IS_APU) &&
		    !gpu_ins->mgpu_fan_enabled) {
			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
			if (ret)
				break;

			gpu_ins->mgpu_fan_enabled = 1;
		}
	}

out:
	mutex_unlock(&mgpu_info.mutex);

	return ret;
}

/**
 * amdgpu_device_ip_late_init - run late init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Late initialization pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the late_init callbacks are run.
 * late_init covers any special initialization that an IP requires
 * after all of them have been initialized or something that needs to happen
 * late in the init process.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
{
	struct amdgpu_gpu_instance *gpu_instance;
	int i = 0, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_init) {
			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
			if (r) {
				DRM_ERROR("late_init of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
		adev->ip_blocks[i].status.late_initialized = true;
	}

	amdgpu_ras_set_error_query_ready(adev, true);

	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

	amdgpu_device_fill_reset_magic(adev);

	r = amdgpu_device_enable_mgpu_fan_boost();
	if (r)
		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

	/* For XGMI + passthrough configuration on arcturus, enable light SBR */
	if (adev->asic_type == CHIP_ARCTURUS &&
	    amdgpu_passthrough(adev) &&
	    adev->gmc.xgmi.num_physical_nodes > 1)
		smu_set_light_sbr(&adev->smu, true);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		mutex_lock(&mgpu_info.mutex);

		/*
		 * Reset the device p-state to low, as it was booted with the
		 * high state.
		 *
		 * This should be performed only after all devices from the same
		 * hive get initialized.
		 *
		 * However, we don't know in advance how many devices are in the
		 * hive; the count builds up one by one as the devices initialize.
		 *
		 * So we wait until all XGMI-interlinked devices have initialized.
		 * This may bring some delay, as those devices may come from
		 * different hives, but that should be OK.
2588 */ 2589 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2590 for (i = 0; i < mgpu_info.num_gpu; i++) { 2591 gpu_instance = &(mgpu_info.gpu_ins[i]); 2592 if (gpu_instance->adev->flags & AMD_IS_APU) 2593 continue; 2594 2595 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2596 AMDGPU_XGMI_PSTATE_MIN); 2597 if (r) { 2598 DRM_ERROR("pstate setting failed (%d).\n", r); 2599 break; 2600 } 2601 } 2602 } 2603 2604 mutex_unlock(&mgpu_info.mutex); 2605 } 2606 2607 return 0; 2608 } 2609 2610 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2611 { 2612 int i, r; 2613 2614 for (i = 0; i < adev->num_ip_blocks; i++) { 2615 if (!adev->ip_blocks[i].version->funcs->early_fini) 2616 continue; 2617 2618 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2619 if (r) { 2620 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2621 adev->ip_blocks[i].version->funcs->name, r); 2622 } 2623 } 2624 2625 amdgpu_amdkfd_suspend(adev, false); 2626 2627 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2628 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2629 2630 /* need to disable SMC first */ 2631 for (i = 0; i < adev->num_ip_blocks; i++) { 2632 if (!adev->ip_blocks[i].status.hw) 2633 continue; 2634 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2635 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2636 /* XXX handle errors */ 2637 if (r) { 2638 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2639 adev->ip_blocks[i].version->funcs->name, r); 2640 } 2641 adev->ip_blocks[i].status.hw = false; 2642 break; 2643 } 2644 } 2645 2646 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2647 if (!adev->ip_blocks[i].status.hw) 2648 continue; 2649 2650 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2651 /* XXX handle errors */ 2652 if (r) { 2653 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2654 adev->ip_blocks[i].version->funcs->name, r); 2655 } 2656 2657 adev->ip_blocks[i].status.hw = false; 2658 } 2659 2660 return 0; 2661 } 2662 2663 /** 2664 * amdgpu_device_ip_fini - run fini for hardware IPs 2665 * 2666 * @adev: amdgpu_device pointer 2667 * 2668 * Main teardown pass for hardware IPs. The list of all the hardware 2669 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2670 * are run. hw_fini tears down the hardware associated with each IP 2671 * and sw_fini tears down any software state associated with each IP. 2672 * Returns 0 on success, negative error code on failure. 
2673 */ 2674 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2675 { 2676 int i, r; 2677 2678 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2679 amdgpu_virt_release_ras_err_handler_data(adev); 2680 2681 amdgpu_ras_pre_fini(adev); 2682 2683 if (adev->gmc.xgmi.num_physical_nodes > 1) 2684 amdgpu_xgmi_remove_device(adev); 2685 2686 amdgpu_amdkfd_device_fini_sw(adev); 2687 2688 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2689 if (!adev->ip_blocks[i].status.sw) 2690 continue; 2691 2692 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2693 amdgpu_ucode_free_bo(adev); 2694 amdgpu_free_static_csa(&adev->virt.csa_obj); 2695 amdgpu_device_wb_fini(adev); 2696 amdgpu_device_vram_scratch_fini(adev); 2697 amdgpu_ib_pool_fini(adev); 2698 } 2699 2700 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2701 /* XXX handle errors */ 2702 if (r) { 2703 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2704 adev->ip_blocks[i].version->funcs->name, r); 2705 } 2706 adev->ip_blocks[i].status.sw = false; 2707 adev->ip_blocks[i].status.valid = false; 2708 } 2709 2710 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2711 if (!adev->ip_blocks[i].status.late_initialized) 2712 continue; 2713 if (adev->ip_blocks[i].version->funcs->late_fini) 2714 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2715 adev->ip_blocks[i].status.late_initialized = false; 2716 } 2717 2718 amdgpu_ras_fini(adev); 2719 2720 if (amdgpu_sriov_vf(adev)) 2721 if (amdgpu_virt_release_full_gpu(adev, false)) 2722 DRM_ERROR("failed to release exclusive mode on fini\n"); 2723 2724 return 0; 2725 } 2726 2727 /** 2728 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2729 * 2730 * @work: work_struct. 2731 */ 2732 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2733 { 2734 struct amdgpu_device *adev = 2735 container_of(work, struct amdgpu_device, delayed_init_work.work); 2736 int r; 2737 2738 r = amdgpu_ib_ring_tests(adev); 2739 if (r) 2740 DRM_ERROR("ib ring test failed (%d).\n", r); 2741 } 2742 2743 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2744 { 2745 struct amdgpu_device *adev = 2746 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2747 2748 mutex_lock(&adev->gfx.gfx_off_mutex); 2749 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { 2750 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2751 adev->gfx.gfx_off_state = true; 2752 } 2753 mutex_unlock(&adev->gfx.gfx_off_mutex); 2754 } 2755 2756 /** 2757 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2758 * 2759 * @adev: amdgpu_device pointer 2760 * 2761 * Main suspend function for hardware IPs. The list of all the hardware 2762 * IPs that make up the asic is walked, clockgating is disabled and the 2763 * suspend callbacks are run. suspend puts the hardware and software state 2764 * in each IP into a state suitable for suspend. 2765 * Returns 0 on success, negative error code on failure. 
2766 */ 2767 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2768 { 2769 int i, r; 2770 2771 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2772 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2773 2774 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2775 if (!adev->ip_blocks[i].status.valid) 2776 continue; 2777 2778 /* displays are handled separately */ 2779 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2780 continue; 2781 2782 /* XXX handle errors */ 2783 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2784 /* XXX handle errors */ 2785 if (r) { 2786 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2787 adev->ip_blocks[i].version->funcs->name, r); 2788 return r; 2789 } 2790 2791 adev->ip_blocks[i].status.hw = false; 2792 } 2793 2794 return 0; 2795 } 2796 2797 /** 2798 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2799 * 2800 * @adev: amdgpu_device pointer 2801 * 2802 * Main suspend function for hardware IPs. The list of all the hardware 2803 * IPs that make up the asic is walked, clockgating is disabled and the 2804 * suspend callbacks are run. suspend puts the hardware and software state 2805 * in each IP into a state suitable for suspend. 2806 * Returns 0 on success, negative error code on failure. 2807 */ 2808 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2809 { 2810 int i, r; 2811 2812 if (adev->in_s0ix) 2813 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry); 2814 2815 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2816 if (!adev->ip_blocks[i].status.valid) 2817 continue; 2818 /* displays are handled in phase1 */ 2819 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2820 continue; 2821 /* PSP lost connection when err_event_athub occurs */ 2822 if (amdgpu_ras_intr_triggered() && 2823 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2824 adev->ip_blocks[i].status.hw = false; 2825 continue; 2826 } 2827 2828 /* skip unnecessary suspend if we do not initialize them yet */ 2829 if (adev->gmc.xgmi.pending_reset && 2830 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2831 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 2832 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2833 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 2834 adev->ip_blocks[i].status.hw = false; 2835 continue; 2836 } 2837 2838 /* skip suspend of gfx and psp for S0ix 2839 * gfx is in gfxoff state, so on resume it will exit gfxoff just 2840 * like at runtime. PSP is also part of the always on hardware 2841 * so no need to suspend it. 
2842 */ 2843 if (adev->in_s0ix && 2844 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 2845 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)) 2846 continue; 2847 2848 /* XXX handle errors */ 2849 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2850 /* XXX handle errors */ 2851 if (r) { 2852 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2853 adev->ip_blocks[i].version->funcs->name, r); 2854 } 2855 adev->ip_blocks[i].status.hw = false; 2856 /* handle putting the SMC in the appropriate state */ 2857 if(!amdgpu_sriov_vf(adev)){ 2858 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2859 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 2860 if (r) { 2861 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 2862 adev->mp1_state, r); 2863 return r; 2864 } 2865 } 2866 } 2867 } 2868 2869 return 0; 2870 } 2871 2872 /** 2873 * amdgpu_device_ip_suspend - run suspend for hardware IPs 2874 * 2875 * @adev: amdgpu_device pointer 2876 * 2877 * Main suspend function for hardware IPs. The list of all the hardware 2878 * IPs that make up the asic is walked, clockgating is disabled and the 2879 * suspend callbacks are run. suspend puts the hardware and software state 2880 * in each IP into a state suitable for suspend. 2881 * Returns 0 on success, negative error code on failure. 2882 */ 2883 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 2884 { 2885 int r; 2886 2887 if (amdgpu_sriov_vf(adev)) { 2888 amdgpu_virt_fini_data_exchange(adev); 2889 amdgpu_virt_request_full_gpu(adev, false); 2890 } 2891 2892 r = amdgpu_device_ip_suspend_phase1(adev); 2893 if (r) 2894 return r; 2895 r = amdgpu_device_ip_suspend_phase2(adev); 2896 2897 if (amdgpu_sriov_vf(adev)) 2898 amdgpu_virt_release_full_gpu(adev, false); 2899 2900 return r; 2901 } 2902 2903 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 2904 { 2905 int i, r; 2906 2907 static enum amd_ip_block_type ip_order[] = { 2908 AMD_IP_BLOCK_TYPE_GMC, 2909 AMD_IP_BLOCK_TYPE_COMMON, 2910 AMD_IP_BLOCK_TYPE_PSP, 2911 AMD_IP_BLOCK_TYPE_IH, 2912 }; 2913 2914 for (i = 0; i < adev->num_ip_blocks; i++) { 2915 int j; 2916 struct amdgpu_ip_block *block; 2917 2918 block = &adev->ip_blocks[i]; 2919 block->status.hw = false; 2920 2921 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 2922 2923 if (block->version->type != ip_order[j] || 2924 !block->status.valid) 2925 continue; 2926 2927 r = block->version->funcs->hw_init(adev); 2928 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2929 if (r) 2930 return r; 2931 block->status.hw = true; 2932 } 2933 } 2934 2935 return 0; 2936 } 2937 2938 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 2939 { 2940 int i, r; 2941 2942 static enum amd_ip_block_type ip_order[] = { 2943 AMD_IP_BLOCK_TYPE_SMC, 2944 AMD_IP_BLOCK_TYPE_DCE, 2945 AMD_IP_BLOCK_TYPE_GFX, 2946 AMD_IP_BLOCK_TYPE_SDMA, 2947 AMD_IP_BLOCK_TYPE_UVD, 2948 AMD_IP_BLOCK_TYPE_VCE, 2949 AMD_IP_BLOCK_TYPE_VCN 2950 }; 2951 2952 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2953 int j; 2954 struct amdgpu_ip_block *block; 2955 2956 for (j = 0; j < adev->num_ip_blocks; j++) { 2957 block = &adev->ip_blocks[j]; 2958 2959 if (block->version->type != ip_order[i] || 2960 !block->status.valid || 2961 block->status.hw) 2962 continue; 2963 2964 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 2965 r = block->version->funcs->resume(adev); 2966 else 2967 r = block->version->funcs->hw_init(adev); 2968 2969 DRM_INFO("RE-INIT-late: %s %s\n", 
				 block->version->funcs->name, r ? "failed" : "succeeded");
			if (r)
				return r;
			block->status.hw = true;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * First resume function for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the resume callbacks are run for
 * COMMON, GMC, and IH. resume puts the hardware into a functional state
 * after a suspend and updates the software state as necessary. This
 * function is also used for restoring the GPU after a GPU reset.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {

			r = adev->ip_blocks[i].version->funcs->resume(adev);
			if (r) {
				DRM_ERROR("resume of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
			adev->ip_blocks[i].status.hw = true;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Second resume function for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the resume callbacks are run for
 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
 * functional state after a suspend and updates the software state as
 * necessary. This function is also used for restoring the GPU after a GPU
 * reset.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
			continue;
		r = adev->ip_blocks[i].version->funcs->resume(adev);
		if (r) {
			DRM_ERROR("resume of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
			return r;
		}
		adev->ip_blocks[i].status.hw = true;
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main resume function for hardware IPs. The hardware IPs
 * are split into two resume functions because they are
 * also used in recovering from a GPU reset and some additional
 * steps need to be taken between them. In this case (S3/S4) they are
 * run sequentially.
 * Returns 0 on success, negative error code on failure.
3063 */ 3064 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3065 { 3066 int r; 3067 3068 r = amdgpu_device_ip_resume_phase1(adev); 3069 if (r) 3070 return r; 3071 3072 r = amdgpu_device_fw_loading(adev); 3073 if (r) 3074 return r; 3075 3076 r = amdgpu_device_ip_resume_phase2(adev); 3077 3078 return r; 3079 } 3080 3081 /** 3082 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3083 * 3084 * @adev: amdgpu_device pointer 3085 * 3086 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3087 */ 3088 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3089 { 3090 if (amdgpu_sriov_vf(adev)) { 3091 if (adev->is_atom_fw) { 3092 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3093 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3094 } else { 3095 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3096 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3097 } 3098 3099 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3100 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3101 } 3102 } 3103 3104 /** 3105 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3106 * 3107 * @asic_type: AMD asic type 3108 * 3109 * Check if there is DC (new modesetting infrastructre) support for an asic. 3110 * returns true if DC has support, false if not. 3111 */ 3112 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3113 { 3114 switch (asic_type) { 3115 #if defined(CONFIG_DRM_AMD_DC) 3116 #if defined(CONFIG_DRM_AMD_DC_SI) 3117 case CHIP_TAHITI: 3118 case CHIP_PITCAIRN: 3119 case CHIP_VERDE: 3120 case CHIP_OLAND: 3121 #endif 3122 case CHIP_BONAIRE: 3123 case CHIP_KAVERI: 3124 case CHIP_KABINI: 3125 case CHIP_MULLINS: 3126 /* 3127 * We have systems in the wild with these ASICs that require 3128 * LVDS and VGA support which is not supported with DC. 3129 * 3130 * Fallback to the non-DC driver here by default so as not to 3131 * cause regressions. 
3132 */ 3133 return amdgpu_dc > 0; 3134 case CHIP_HAWAII: 3135 case CHIP_CARRIZO: 3136 case CHIP_STONEY: 3137 case CHIP_POLARIS10: 3138 case CHIP_POLARIS11: 3139 case CHIP_POLARIS12: 3140 case CHIP_VEGAM: 3141 case CHIP_TONGA: 3142 case CHIP_FIJI: 3143 case CHIP_VEGA10: 3144 case CHIP_VEGA12: 3145 case CHIP_VEGA20: 3146 #if defined(CONFIG_DRM_AMD_DC_DCN) 3147 case CHIP_RAVEN: 3148 case CHIP_NAVI10: 3149 case CHIP_NAVI14: 3150 case CHIP_NAVI12: 3151 case CHIP_RENOIR: 3152 case CHIP_SIENNA_CICHLID: 3153 case CHIP_NAVY_FLOUNDER: 3154 case CHIP_DIMGREY_CAVEFISH: 3155 case CHIP_BEIGE_GOBY: 3156 case CHIP_VANGOGH: 3157 case CHIP_YELLOW_CARP: 3158 #endif 3159 return amdgpu_dc != 0; 3160 #endif 3161 default: 3162 if (amdgpu_dc > 0) 3163 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3164 "but isn't supported by ASIC, ignoring\n"); 3165 return false; 3166 } 3167 } 3168 3169 /** 3170 * amdgpu_device_has_dc_support - check if dc is supported 3171 * 3172 * @adev: amdgpu_device pointer 3173 * 3174 * Returns true for supported, false for not supported 3175 */ 3176 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3177 { 3178 if (amdgpu_sriov_vf(adev) || 3179 adev->enable_virtual_display || 3180 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3181 return false; 3182 3183 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3184 } 3185 3186 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3187 { 3188 struct amdgpu_device *adev = 3189 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3190 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3191 3192 /* It's a bug to not have a hive within this function */ 3193 if (WARN_ON(!hive)) 3194 return; 3195 3196 /* 3197 * Use task barrier to synchronize all xgmi reset works across the 3198 * hive. task_barrier_enter and task_barrier_exit will block 3199 * until all the threads running the xgmi reset works reach 3200 * those points. task_barrier_full will do both blocks. 3201 */ 3202 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3203 3204 task_barrier_enter(&hive->tb); 3205 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3206 3207 if (adev->asic_reset_res) 3208 goto fail; 3209 3210 task_barrier_exit(&hive->tb); 3211 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3212 3213 if (adev->asic_reset_res) 3214 goto fail; 3215 3216 if (adev->mmhub.ras_funcs && 3217 adev->mmhub.ras_funcs->reset_ras_error_count) 3218 adev->mmhub.ras_funcs->reset_ras_error_count(adev); 3219 } else { 3220 3221 task_barrier_full(&hive->tb); 3222 adev->asic_reset_res = amdgpu_asic_reset(adev); 3223 } 3224 3225 fail: 3226 if (adev->asic_reset_res) 3227 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3228 adev->asic_reset_res, adev_to_drm(adev)->unique); 3229 amdgpu_put_xgmi_hive(hive); 3230 } 3231 3232 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3233 { 3234 char *input = amdgpu_lockup_timeout; 3235 char *timeout_setting = NULL; 3236 int index = 0; 3237 long timeout; 3238 int ret = 0; 3239 3240 /* 3241 * By default timeout for non compute jobs is 10000 3242 * and 60000 for compute jobs. 3243 * In SR-IOV or passthrough mode, timeout for compute 3244 * jobs are 60000 by default. 3245 */ 3246 adev->gfx_timeout = msecs_to_jiffies(10000); 3247 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3248 if (amdgpu_sriov_vf(adev)) 3249 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3250 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3251 else 3252 adev->compute_timeout = msecs_to_jiffies(60000); 3253 3254 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3255 while ((timeout_setting = strsep(&input, ",")) && 3256 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3257 ret = kstrtol(timeout_setting, 0, &timeout); 3258 if (ret) 3259 return ret; 3260 3261 if (timeout == 0) { 3262 index++; 3263 continue; 3264 } else if (timeout < 0) { 3265 timeout = MAX_SCHEDULE_TIMEOUT; 3266 } else { 3267 timeout = msecs_to_jiffies(timeout); 3268 } 3269 3270 switch (index++) { 3271 case 0: 3272 adev->gfx_timeout = timeout; 3273 break; 3274 case 1: 3275 adev->compute_timeout = timeout; 3276 break; 3277 case 2: 3278 adev->sdma_timeout = timeout; 3279 break; 3280 case 3: 3281 adev->video_timeout = timeout; 3282 break; 3283 default: 3284 break; 3285 } 3286 } 3287 /* 3288 * There is only one value specified and 3289 * it should apply to all non-compute jobs. 3290 */ 3291 if (index == 1) { 3292 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3293 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3294 adev->compute_timeout = adev->gfx_timeout; 3295 } 3296 } 3297 3298 return ret; 3299 } 3300 3301 static const struct attribute *amdgpu_dev_attributes[] = { 3302 &dev_attr_product_name.attr, 3303 &dev_attr_product_number.attr, 3304 &dev_attr_serial_number.attr, 3305 &dev_attr_pcie_replay_count.attr, 3306 NULL 3307 }; 3308 3309 /** 3310 * amdgpu_device_init - initialize the driver 3311 * 3312 * @adev: amdgpu_device pointer 3313 * @flags: driver flags 3314 * 3315 * Initializes the driver info and hw (all asics). 3316 * Returns 0 for success or an error on failure. 3317 * Called at driver startup. 3318 */ 3319 int amdgpu_device_init(struct amdgpu_device *adev, 3320 uint32_t flags) 3321 { 3322 struct drm_device *ddev = adev_to_drm(adev); 3323 struct pci_dev *pdev = adev->pdev; 3324 int r, i; 3325 bool px = false; 3326 u32 max_MBps; 3327 3328 adev->shutdown = false; 3329 adev->flags = flags; 3330 3331 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3332 adev->asic_type = amdgpu_force_asic_type; 3333 else 3334 adev->asic_type = flags & AMD_ASIC_MASK; 3335 3336 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3337 if (amdgpu_emu_mode == 1) 3338 adev->usec_timeout *= 10; 3339 adev->gmc.gart_size = 512 * 1024 * 1024; 3340 adev->accel_working = false; 3341 adev->num_rings = 0; 3342 adev->mman.buffer_funcs = NULL; 3343 adev->mman.buffer_funcs_ring = NULL; 3344 adev->vm_manager.vm_pte_funcs = NULL; 3345 adev->vm_manager.vm_pte_num_scheds = 0; 3346 adev->gmc.gmc_funcs = NULL; 3347 adev->harvest_ip_mask = 0x0; 3348 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3349 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3350 3351 adev->smc_rreg = &amdgpu_invalid_rreg; 3352 adev->smc_wreg = &amdgpu_invalid_wreg; 3353 adev->pcie_rreg = &amdgpu_invalid_rreg; 3354 adev->pcie_wreg = &amdgpu_invalid_wreg; 3355 adev->pciep_rreg = &amdgpu_invalid_rreg; 3356 adev->pciep_wreg = &amdgpu_invalid_wreg; 3357 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3358 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3359 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3360 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3361 adev->didt_rreg = &amdgpu_invalid_rreg; 3362 adev->didt_wreg = &amdgpu_invalid_wreg; 3363 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3364 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3365 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3366 
	adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;

	DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
		 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
		 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);

	/* mutex initializations are all done here so we
	 * can recall functions without having locking issues */
	mutex_init(&adev->firmware.mutex);
	mutex_init(&adev->pm.mutex);
	mutex_init(&adev->gfx.gpu_clock_mutex);
	mutex_init(&adev->srbm_mutex);
	mutex_init(&adev->gfx.pipe_reserve_mutex);
	mutex_init(&adev->gfx.gfx_off_mutex);
	mutex_init(&adev->grbm_idx_mutex);
	mutex_init(&adev->mn_lock);
	mutex_init(&adev->virt.vf_errors.lock);
	hash_init(adev->mn_hash);
	atomic_set(&adev->in_gpu_reset, 0);
	init_rwsem(&adev->reset_sem);
	mutex_init(&adev->psp.mutex);
	mutex_init(&adev->notifier_lock);

	r = amdgpu_device_check_arguments(adev);
	if (r)
		return r;

	spin_lock_init(&adev->mmio_idx_lock);
	spin_lock_init(&adev->smc_idx_lock);
	spin_lock_init(&adev->pcie_idx_lock);
	spin_lock_init(&adev->uvd_ctx_idx_lock);
	spin_lock_init(&adev->didt_idx_lock);
	spin_lock_init(&adev->gc_cac_idx_lock);
	spin_lock_init(&adev->se_cac_idx_lock);
	spin_lock_init(&adev->audio_endpt_idx_lock);
	spin_lock_init(&adev->mm_stats.lock);

	INIT_LIST_HEAD(&adev->shadow_list);
	mutex_init(&adev->shadow_list_lock);

	INIT_LIST_HEAD(&adev->reset_list);

	INIT_DELAYED_WORK(&adev->delayed_init_work,
			  amdgpu_device_delayed_init_work_handler);
	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
			  amdgpu_device_delay_enable_gfx_off);

	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);

	adev->gfx.gfx_off_req_count = 1;
	adev->pm.ac_power = power_supply_is_system_supplied() > 0;

	atomic_set(&adev->throttling_logging_enabled, 1);
	/*
	 * If throttling continues, logging will be performed every minute
	 * to avoid log flooding. "-1" is subtracted since the thermal
	 * throttling interrupt comes every second. Thus, the total logging
	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
	 * for the throttling interrupt) = 60 seconds.
3425 */ 3426 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3427 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3428 3429 /* Registers mapping */ 3430 /* TODO: block userspace mapping of io register */ 3431 if (adev->asic_type >= CHIP_BONAIRE) { 3432 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3433 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3434 } else { 3435 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3436 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3437 } 3438 3439 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3440 if (adev->rmmio == NULL) { 3441 return -ENOMEM; 3442 } 3443 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3444 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3445 3446 /* enable PCIE atomic ops */ 3447 r = pci_enable_atomic_ops_to_root(adev->pdev, 3448 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3449 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3450 if (r) { 3451 adev->have_atomics_support = false; 3452 DRM_INFO("PCIE atomic ops is not supported\n"); 3453 } else { 3454 adev->have_atomics_support = true; 3455 } 3456 3457 amdgpu_device_get_pcie_info(adev); 3458 3459 if (amdgpu_mcbp) 3460 DRM_INFO("MCBP is enabled\n"); 3461 3462 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) 3463 adev->enable_mes = true; 3464 3465 /* detect hw virtualization here */ 3466 amdgpu_detect_virtualization(adev); 3467 3468 r = amdgpu_device_get_job_timeout_settings(adev); 3469 if (r) { 3470 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3471 goto failed_unmap; 3472 } 3473 3474 /* early init functions */ 3475 r = amdgpu_device_ip_early_init(adev); 3476 if (r) 3477 goto failed_unmap; 3478 3479 /* doorbell bar mapping and doorbell index init*/ 3480 amdgpu_device_doorbell_init(adev); 3481 3482 if (amdgpu_emu_mode == 1) { 3483 /* post the asic on emulation mode */ 3484 emu_soc_asic_init(adev); 3485 goto fence_driver_init; 3486 } 3487 3488 amdgpu_reset_init(adev); 3489 3490 /* detect if we are with an SRIOV vbios */ 3491 amdgpu_device_detect_sriov_bios(adev); 3492 3493 /* check if we need to reset the asic 3494 * E.g., driver was not cleanly unloaded previously, etc. 
3495 */ 3496 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3497 if (adev->gmc.xgmi.num_physical_nodes) { 3498 dev_info(adev->dev, "Pending hive reset.\n"); 3499 adev->gmc.xgmi.pending_reset = true; 3500 /* Only need to init necessary block for SMU to handle the reset */ 3501 for (i = 0; i < adev->num_ip_blocks; i++) { 3502 if (!adev->ip_blocks[i].status.valid) 3503 continue; 3504 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3505 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3506 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3507 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3508 DRM_DEBUG("IP %s disabled for hw_init.\n", 3509 adev->ip_blocks[i].version->funcs->name); 3510 adev->ip_blocks[i].status.hw = true; 3511 } 3512 } 3513 } else { 3514 r = amdgpu_asic_reset(adev); 3515 if (r) { 3516 dev_err(adev->dev, "asic reset on init failed\n"); 3517 goto failed; 3518 } 3519 } 3520 } 3521 3522 pci_enable_pcie_error_reporting(adev->pdev); 3523 3524 /* Post card if necessary */ 3525 if (amdgpu_device_need_post(adev)) { 3526 if (!adev->bios) { 3527 dev_err(adev->dev, "no vBIOS found\n"); 3528 r = -EINVAL; 3529 goto failed; 3530 } 3531 DRM_INFO("GPU posting now...\n"); 3532 r = amdgpu_device_asic_init(adev); 3533 if (r) { 3534 dev_err(adev->dev, "gpu post error!\n"); 3535 goto failed; 3536 } 3537 } 3538 3539 if (adev->is_atom_fw) { 3540 /* Initialize clocks */ 3541 r = amdgpu_atomfirmware_get_clock_info(adev); 3542 if (r) { 3543 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3544 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3545 goto failed; 3546 } 3547 } else { 3548 /* Initialize clocks */ 3549 r = amdgpu_atombios_get_clock_info(adev); 3550 if (r) { 3551 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3552 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3553 goto failed; 3554 } 3555 /* init i2c buses */ 3556 if (!amdgpu_device_has_dc_support(adev)) 3557 amdgpu_atombios_i2c_init(adev); 3558 } 3559 3560 fence_driver_init: 3561 /* Fence driver */ 3562 r = amdgpu_fence_driver_init(adev); 3563 if (r) { 3564 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n"); 3565 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3566 goto failed; 3567 } 3568 3569 /* init the mode config */ 3570 drm_mode_config_init(adev_to_drm(adev)); 3571 3572 r = amdgpu_device_ip_init(adev); 3573 if (r) { 3574 /* failed in exclusive mode due to timeout */ 3575 if (amdgpu_sriov_vf(adev) && 3576 !amdgpu_sriov_runtime(adev) && 3577 amdgpu_virt_mmio_blocked(adev) && 3578 !amdgpu_virt_wait_reset(adev)) { 3579 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3580 /* Don't send request since VF is inactive. */ 3581 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3582 adev->virt.ops = NULL; 3583 r = -EAGAIN; 3584 goto release_ras_con; 3585 } 3586 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3587 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3588 goto release_ras_con; 3589 } 3590 3591 dev_info(adev->dev, 3592 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3593 adev->gfx.config.max_shader_engines, 3594 adev->gfx.config.max_sh_per_se, 3595 adev->gfx.config.max_cu_per_sh, 3596 adev->gfx.cu_info.number); 3597 3598 adev->accel_working = true; 3599 3600 amdgpu_vm_check_compute_bug(adev); 3601 3602 /* Initialize the buffer migration limit. 
*/ 3603 if (amdgpu_moverate >= 0) 3604 max_MBps = amdgpu_moverate; 3605 else 3606 max_MBps = 8; /* Allow 8 MB/s. */ 3607 /* Get a log2 for easy divisions. */ 3608 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3609 3610 amdgpu_fbdev_init(adev); 3611 3612 r = amdgpu_pm_sysfs_init(adev); 3613 if (r) { 3614 adev->pm_sysfs_en = false; 3615 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3616 } else 3617 adev->pm_sysfs_en = true; 3618 3619 r = amdgpu_ucode_sysfs_init(adev); 3620 if (r) { 3621 adev->ucode_sysfs_en = false; 3622 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3623 } else 3624 adev->ucode_sysfs_en = true; 3625 3626 if ((amdgpu_testing & 1)) { 3627 if (adev->accel_working) 3628 amdgpu_test_moves(adev); 3629 else 3630 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n"); 3631 } 3632 if (amdgpu_benchmarking) { 3633 if (adev->accel_working) 3634 amdgpu_benchmark(adev, amdgpu_benchmarking); 3635 else 3636 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); 3637 } 3638 3639 /* 3640 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3641 * Otherwise the mgpu fan boost feature will be skipped due to the 3642 * gpu instance is counted less. 3643 */ 3644 amdgpu_register_gpu_instance(adev); 3645 3646 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3647 * explicit gating rather than handling it automatically. 3648 */ 3649 if (!adev->gmc.xgmi.pending_reset) { 3650 r = amdgpu_device_ip_late_init(adev); 3651 if (r) { 3652 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3653 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3654 goto release_ras_con; 3655 } 3656 /* must succeed. */ 3657 amdgpu_ras_resume(adev); 3658 queue_delayed_work(system_wq, &adev->delayed_init_work, 3659 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3660 } 3661 3662 if (amdgpu_sriov_vf(adev)) 3663 flush_delayed_work(&adev->delayed_init_work); 3664 3665 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3666 if (r) 3667 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3668 3669 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3670 r = amdgpu_pmu_init(adev); 3671 if (r) 3672 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3673 3674 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3675 if (amdgpu_device_cache_pci_state(adev->pdev)) 3676 pci_restore_state(pdev); 3677 3678 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3679 /* this will fail for cards that aren't VGA class devices, just 3680 * ignore it */ 3681 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3682 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode); 3683 3684 if (amdgpu_device_supports_px(ddev)) { 3685 px = true; 3686 vga_switcheroo_register_client(adev->pdev, 3687 &amdgpu_switcheroo_ops, px); 3688 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3689 } 3690 3691 if (adev->gmc.xgmi.pending_reset) 3692 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3693 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3694 3695 return 0; 3696 3697 release_ras_con: 3698 amdgpu_release_ras_context(adev); 3699 3700 failed: 3701 amdgpu_vf_error_trans_all(adev); 3702 3703 failed_unmap: 3704 iounmap(adev->rmmio); 3705 adev->rmmio = NULL; 3706 3707 return r; 3708 } 3709 3710 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 3711 { 3712 /* Clear all CPU mappings pointing to this device */ 3713 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 
0, 1); 3714 3715 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 3716 amdgpu_device_doorbell_fini(adev); 3717 3718 iounmap(adev->rmmio); 3719 adev->rmmio = NULL; 3720 if (adev->mman.aper_base_kaddr) 3721 iounmap(adev->mman.aper_base_kaddr); 3722 adev->mman.aper_base_kaddr = NULL; 3723 3724 /* Memory manager related */ 3725 if (!adev->gmc.xgmi.connected_to_cpu) { 3726 arch_phys_wc_del(adev->gmc.vram_mtrr); 3727 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 3728 } 3729 } 3730 3731 /** 3732 * amdgpu_device_fini - tear down the driver 3733 * 3734 * @adev: amdgpu_device pointer 3735 * 3736 * Tear down the driver info (all asics). 3737 * Called at driver shutdown. 3738 */ 3739 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 3740 { 3741 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3742 flush_delayed_work(&adev->delayed_init_work); 3743 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); 3744 adev->shutdown = true; 3745 3746 /* make sure IB test finished before entering exclusive mode 3747 * to avoid preemption on IB test 3748 * */ 3749 if (amdgpu_sriov_vf(adev)) { 3750 amdgpu_virt_request_full_gpu(adev, false); 3751 amdgpu_virt_fini_data_exchange(adev); 3752 } 3753 3754 /* disable all interrupts */ 3755 amdgpu_irq_disable_all(adev); 3756 if (adev->mode_info.mode_config_initialized){ 3757 if (!amdgpu_device_has_dc_support(adev)) 3758 drm_helper_force_disable_all(adev_to_drm(adev)); 3759 else 3760 drm_atomic_helper_shutdown(adev_to_drm(adev)); 3761 } 3762 amdgpu_fence_driver_fini_hw(adev); 3763 3764 if (adev->pm_sysfs_en) 3765 amdgpu_pm_sysfs_fini(adev); 3766 if (adev->ucode_sysfs_en) 3767 amdgpu_ucode_sysfs_fini(adev); 3768 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 3769 3770 amdgpu_fbdev_fini(adev); 3771 3772 amdgpu_irq_fini_hw(adev); 3773 3774 amdgpu_device_ip_fini_early(adev); 3775 3776 amdgpu_gart_dummy_page_fini(adev); 3777 3778 amdgpu_device_unmap_mmio(adev); 3779 } 3780 3781 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 3782 { 3783 amdgpu_device_ip_fini(adev); 3784 amdgpu_fence_driver_fini_sw(adev); 3785 release_firmware(adev->firmware.gpu_info_fw); 3786 adev->firmware.gpu_info_fw = NULL; 3787 adev->accel_working = false; 3788 3789 amdgpu_reset_fini(adev); 3790 3791 /* free i2c buses */ 3792 if (!amdgpu_device_has_dc_support(adev)) 3793 amdgpu_i2c_fini(adev); 3794 3795 if (amdgpu_emu_mode != 1) 3796 amdgpu_atombios_fini(adev); 3797 3798 kfree(adev->bios); 3799 adev->bios = NULL; 3800 if (amdgpu_device_supports_px(adev_to_drm(adev))) { 3801 vga_switcheroo_unregister_client(adev->pdev); 3802 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3803 } 3804 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3805 vga_client_register(adev->pdev, NULL, NULL, NULL); 3806 3807 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3808 amdgpu_pmu_fini(adev); 3809 if (adev->mman.discovery_bin) 3810 amdgpu_discovery_fini(adev); 3811 3812 kfree(adev->pci_state); 3813 3814 } 3815 3816 3817 /* 3818 * Suspend & resume. 3819 */ 3820 /** 3821 * amdgpu_device_suspend - initiate device suspend 3822 * 3823 * @dev: drm dev pointer 3824 * @fbcon : notify the fbdev of suspend 3825 * 3826 * Puts the hw in the suspend state (all asics). 3827 * Returns 0 for success or an error on failure. 3828 * Called at driver suspend. 
3829 */ 3830 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 3831 { 3832 struct amdgpu_device *adev = drm_to_adev(dev); 3833 3834 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3835 return 0; 3836 3837 adev->in_suspend = true; 3838 3839 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 3840 DRM_WARN("smart shift update failed\n"); 3841 3842 drm_kms_helper_poll_disable(dev); 3843 3844 if (fbcon) 3845 amdgpu_fbdev_set_suspend(adev, 1); 3846 3847 cancel_delayed_work_sync(&adev->delayed_init_work); 3848 3849 amdgpu_ras_suspend(adev); 3850 3851 amdgpu_device_ip_suspend_phase1(adev); 3852 3853 if (!adev->in_s0ix) 3854 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 3855 3856 /* evict vram memory */ 3857 amdgpu_bo_evict_vram(adev); 3858 3859 amdgpu_fence_driver_suspend(adev); 3860 3861 amdgpu_device_ip_suspend_phase2(adev); 3862 /* evict remaining vram memory 3863 * This second call to evict vram is to evict the gart page table 3864 * using the CPU. 3865 */ 3866 amdgpu_bo_evict_vram(adev); 3867 3868 return 0; 3869 } 3870 3871 /** 3872 * amdgpu_device_resume - initiate device resume 3873 * 3874 * @dev: drm dev pointer 3875 * @fbcon : notify the fbdev of resume 3876 * 3877 * Bring the hw back to operating state (all asics). 3878 * Returns 0 for success or an error on failure. 3879 * Called at driver resume. 3880 */ 3881 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 3882 { 3883 struct amdgpu_device *adev = drm_to_adev(dev); 3884 int r = 0; 3885 3886 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3887 return 0; 3888 3889 if (adev->in_s0ix) 3890 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D0Entry); 3891 3892 /* post card */ 3893 if (amdgpu_device_need_post(adev)) { 3894 r = amdgpu_device_asic_init(adev); 3895 if (r) 3896 dev_err(adev->dev, "amdgpu asic init failed\n"); 3897 } 3898 3899 r = amdgpu_device_ip_resume(adev); 3900 if (r) { 3901 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 3902 return r; 3903 } 3904 amdgpu_fence_driver_resume(adev); 3905 3906 3907 r = amdgpu_device_ip_late_init(adev); 3908 if (r) 3909 return r; 3910 3911 queue_delayed_work(system_wq, &adev->delayed_init_work, 3912 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3913 3914 if (!adev->in_s0ix) { 3915 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 3916 if (r) 3917 return r; 3918 } 3919 3920 /* Make sure IB tests flushed */ 3921 flush_delayed_work(&adev->delayed_init_work); 3922 3923 if (fbcon) 3924 amdgpu_fbdev_set_suspend(adev, 0); 3925 3926 drm_kms_helper_poll_enable(dev); 3927 3928 amdgpu_ras_resume(adev); 3929 3930 /* 3931 * Most of the connector probing functions try to acquire runtime pm 3932 * refs to ensure that the GPU is powered on when connector polling is 3933 * performed. Since we're calling this from a runtime PM callback, 3934 * trying to acquire rpm refs will cause us to deadlock. 3935 * 3936 * Since we're guaranteed to be holding the rpm lock, it's safe to 3937 * temporarily disable the rpm helpers so this doesn't deadlock us. 
3938 */ 3939 #ifdef CONFIG_PM 3940 dev->dev->power.disable_depth++; 3941 #endif 3942 if (!amdgpu_device_has_dc_support(adev)) 3943 drm_helper_hpd_irq_event(dev); 3944 else 3945 drm_kms_helper_hotplug_event(dev); 3946 #ifdef CONFIG_PM 3947 dev->dev->power.disable_depth--; 3948 #endif 3949 adev->in_suspend = false; 3950 3951 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 3952 DRM_WARN("smart shift update failed\n"); 3953 3954 return 0; 3955 } 3956 3957 /** 3958 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 3959 * 3960 * @adev: amdgpu_device pointer 3961 * 3962 * The list of all the hardware IPs that make up the asic is walked and 3963 * the check_soft_reset callbacks are run. check_soft_reset determines 3964 * if the asic is still hung or not. 3965 * Returns true if any of the IPs are still in a hung state, false if not. 3966 */ 3967 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 3968 { 3969 int i; 3970 bool asic_hang = false; 3971 3972 if (amdgpu_sriov_vf(adev)) 3973 return true; 3974 3975 if (amdgpu_asic_need_full_reset(adev)) 3976 return true; 3977 3978 for (i = 0; i < adev->num_ip_blocks; i++) { 3979 if (!adev->ip_blocks[i].status.valid) 3980 continue; 3981 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 3982 adev->ip_blocks[i].status.hang = 3983 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 3984 if (adev->ip_blocks[i].status.hang) { 3985 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 3986 asic_hang = true; 3987 } 3988 } 3989 return asic_hang; 3990 } 3991 3992 /** 3993 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 3994 * 3995 * @adev: amdgpu_device pointer 3996 * 3997 * The list of all the hardware IPs that make up the asic is walked and the 3998 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 3999 * handles any IP specific hardware or software state changes that are 4000 * necessary for a soft reset to succeed. 4001 * Returns 0 on success, negative error code on failure. 4002 */ 4003 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4004 { 4005 int i, r = 0; 4006 4007 for (i = 0; i < adev->num_ip_blocks; i++) { 4008 if (!adev->ip_blocks[i].status.valid) 4009 continue; 4010 if (adev->ip_blocks[i].status.hang && 4011 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4012 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4013 if (r) 4014 return r; 4015 } 4016 } 4017 4018 return 0; 4019 } 4020 4021 /** 4022 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4023 * 4024 * @adev: amdgpu_device pointer 4025 * 4026 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4027 * reset is necessary to recover. 4028 * Returns true if a full asic reset is required, false if not. 
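 * (In practice this covers the GMC, SMC, ACP, DCE and PSP blocks, as
 * checked in the loop below.)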
4029 */ 4030 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4031 { 4032 int i; 4033 4034 if (amdgpu_asic_need_full_reset(adev)) 4035 return true; 4036 4037 for (i = 0; i < adev->num_ip_blocks; i++) { 4038 if (!adev->ip_blocks[i].status.valid) 4039 continue; 4040 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4041 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4042 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4043 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4044 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4045 if (adev->ip_blocks[i].status.hang) { 4046 dev_info(adev->dev, "Some block need full reset!\n"); 4047 return true; 4048 } 4049 } 4050 } 4051 return false; 4052 } 4053 4054 /** 4055 * amdgpu_device_ip_soft_reset - do a soft reset 4056 * 4057 * @adev: amdgpu_device pointer 4058 * 4059 * The list of all the hardware IPs that make up the asic is walked and the 4060 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4061 * IP specific hardware or software state changes that are necessary to soft 4062 * reset the IP. 4063 * Returns 0 on success, negative error code on failure. 4064 */ 4065 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4066 { 4067 int i, r = 0; 4068 4069 for (i = 0; i < adev->num_ip_blocks; i++) { 4070 if (!adev->ip_blocks[i].status.valid) 4071 continue; 4072 if (adev->ip_blocks[i].status.hang && 4073 adev->ip_blocks[i].version->funcs->soft_reset) { 4074 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4075 if (r) 4076 return r; 4077 } 4078 } 4079 4080 return 0; 4081 } 4082 4083 /** 4084 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4085 * 4086 * @adev: amdgpu_device pointer 4087 * 4088 * The list of all the hardware IPs that make up the asic is walked and the 4089 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4090 * handles any IP specific hardware or software state changes that are 4091 * necessary after the IP has been soft reset. 4092 * Returns 0 on success, negative error code on failure. 4093 */ 4094 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4095 { 4096 int i, r = 0; 4097 4098 for (i = 0; i < adev->num_ip_blocks; i++) { 4099 if (!adev->ip_blocks[i].status.valid) 4100 continue; 4101 if (adev->ip_blocks[i].status.hang && 4102 adev->ip_blocks[i].version->funcs->post_soft_reset) 4103 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4104 if (r) 4105 return r; 4106 } 4107 4108 return 0; 4109 } 4110 4111 /** 4112 * amdgpu_device_recover_vram - Recover some VRAM contents 4113 * 4114 * @adev: amdgpu_device pointer 4115 * 4116 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4117 * restore things like GPUVM page tables after a GPU reset where 4118 * the contents of VRAM might be lost. 4119 * 4120 * Returns: 4121 * 0 on success, negative error code on failure. 
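 * Only shadow copies that are still resident in GTT (and whose parent
 * buffer lives in VRAM) can be used; evicted shadows are skipped.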
4122 */ 4123 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4124 { 4125 struct dma_fence *fence = NULL, *next = NULL; 4126 struct amdgpu_bo *shadow; 4127 long r = 1, tmo; 4128 4129 if (amdgpu_sriov_runtime(adev)) 4130 tmo = msecs_to_jiffies(8000); 4131 else 4132 tmo = msecs_to_jiffies(100); 4133 4134 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4135 mutex_lock(&adev->shadow_list_lock); 4136 list_for_each_entry(shadow, &adev->shadow_list, shadow_list) { 4137 4138 /* No need to recover an evicted BO */ 4139 if (shadow->tbo.mem.mem_type != TTM_PL_TT || 4140 shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET || 4141 shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM) 4142 continue; 4143 4144 r = amdgpu_bo_restore_shadow(shadow, &next); 4145 if (r) 4146 break; 4147 4148 if (fence) { 4149 tmo = dma_fence_wait_timeout(fence, false, tmo); 4150 dma_fence_put(fence); 4151 fence = next; 4152 if (tmo == 0) { 4153 r = -ETIMEDOUT; 4154 break; 4155 } else if (tmo < 0) { 4156 r = tmo; 4157 break; 4158 } 4159 } else { 4160 fence = next; 4161 } 4162 } 4163 mutex_unlock(&adev->shadow_list_lock); 4164 4165 if (fence) 4166 tmo = dma_fence_wait_timeout(fence, false, tmo); 4167 dma_fence_put(fence); 4168 4169 if (r < 0 || tmo <= 0) { 4170 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4171 return -EIO; 4172 } 4173 4174 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4175 return 0; 4176 } 4177 4178 4179 /** 4180 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4181 * 4182 * @adev: amdgpu_device pointer 4183 * @from_hypervisor: request from hypervisor 4184 * 4185 * do VF FLR and reinitialize Asic 4186 * return 0 means succeeded otherwise failed 4187 */ 4188 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4189 bool from_hypervisor) 4190 { 4191 int r; 4192 4193 if (from_hypervisor) 4194 r = amdgpu_virt_request_full_gpu(adev, true); 4195 else 4196 r = amdgpu_virt_reset_gpu(adev); 4197 if (r) 4198 return r; 4199 4200 amdgpu_amdkfd_pre_reset(adev); 4201 4202 /* Resume IP prior to SMC */ 4203 r = amdgpu_device_ip_reinit_early_sriov(adev); 4204 if (r) 4205 goto error; 4206 4207 amdgpu_virt_init_data_exchange(adev); 4208 /* we need recover gart prior to run SMC/CP/SDMA resume */ 4209 amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT)); 4210 4211 r = amdgpu_device_fw_loading(adev); 4212 if (r) 4213 return r; 4214 4215 /* now we are okay to resume SMC/CP/SDMA */ 4216 r = amdgpu_device_ip_reinit_late_sriov(adev); 4217 if (r) 4218 goto error; 4219 4220 amdgpu_irq_gpu_reset_resume_helper(adev); 4221 r = amdgpu_ib_ring_tests(adev); 4222 amdgpu_amdkfd_post_reset(adev); 4223 4224 error: 4225 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4226 amdgpu_inc_vram_lost(adev); 4227 r = amdgpu_device_recover_vram(adev); 4228 } 4229 amdgpu_virt_release_full_gpu(adev, true); 4230 4231 return r; 4232 } 4233 4234 /** 4235 * amdgpu_device_has_job_running - check if there is any job in mirror list 4236 * 4237 * @adev: amdgpu_device pointer 4238 * 4239 * check if there is any job in mirror list 4240 */ 4241 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4242 { 4243 int i; 4244 struct drm_sched_job *job; 4245 4246 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4247 struct amdgpu_ring *ring = adev->rings[i]; 4248 4249 if (!ring || !ring->sched.thread) 4250 continue; 4251 4252 spin_lock(&ring->sched.job_list_lock); 4253 job = list_first_entry_or_null(&ring->sched.pending_list, 4254 struct 
drm_sched_job, list); 4255 spin_unlock(&ring->sched.job_list_lock); 4256 if (job) 4257 return true; 4258 } 4259 return false; 4260 } 4261 4262 /** 4263 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4264 * 4265 * @adev: amdgpu_device pointer 4266 * 4267 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4268 * a hung GPU. 4269 */ 4270 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4271 { 4272 if (!amdgpu_device_ip_check_soft_reset(adev)) { 4273 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n"); 4274 return false; 4275 } 4276 4277 if (amdgpu_gpu_recovery == 0) 4278 goto disabled; 4279 4280 if (amdgpu_sriov_vf(adev)) 4281 return true; 4282 4283 if (amdgpu_gpu_recovery == -1) { 4284 switch (adev->asic_type) { 4285 case CHIP_BONAIRE: 4286 case CHIP_HAWAII: 4287 case CHIP_TOPAZ: 4288 case CHIP_TONGA: 4289 case CHIP_FIJI: 4290 case CHIP_POLARIS10: 4291 case CHIP_POLARIS11: 4292 case CHIP_POLARIS12: 4293 case CHIP_VEGAM: 4294 case CHIP_VEGA20: 4295 case CHIP_VEGA10: 4296 case CHIP_VEGA12: 4297 case CHIP_RAVEN: 4298 case CHIP_ARCTURUS: 4299 case CHIP_RENOIR: 4300 case CHIP_NAVI10: 4301 case CHIP_NAVI14: 4302 case CHIP_NAVI12: 4303 case CHIP_SIENNA_CICHLID: 4304 case CHIP_NAVY_FLOUNDER: 4305 case CHIP_DIMGREY_CAVEFISH: 4306 case CHIP_VANGOGH: 4307 case CHIP_ALDEBARAN: 4308 break; 4309 default: 4310 goto disabled; 4311 } 4312 } 4313 4314 return true; 4315 4316 disabled: 4317 dev_info(adev->dev, "GPU recovery disabled.\n"); 4318 return false; 4319 } 4320 4321 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4322 { 4323 u32 i; 4324 int ret = 0; 4325 4326 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4327 4328 dev_info(adev->dev, "GPU mode1 reset\n"); 4329 4330 /* disable BM */ 4331 pci_clear_master(adev->pdev); 4332 4333 amdgpu_device_cache_pci_state(adev->pdev); 4334 4335 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4336 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4337 ret = amdgpu_dpm_mode1_reset(adev); 4338 } else { 4339 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4340 ret = psp_gpu_reset(adev); 4341 } 4342 4343 if (ret) 4344 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4345 4346 amdgpu_device_load_pci_state(adev->pdev); 4347 4348 /* wait for asic to come out of reset */ 4349 for (i = 0; i < adev->usec_timeout; i++) { 4350 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4351 4352 if (memsize != 0xffffffff) 4353 break; 4354 udelay(1); 4355 } 4356 4357 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4358 return ret; 4359 } 4360 4361 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4362 struct amdgpu_reset_context *reset_context) 4363 { 4364 int i, r = 0; 4365 struct amdgpu_job *job = NULL; 4366 bool need_full_reset = 4367 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4368 4369 if (reset_context->reset_req_dev == adev) 4370 job = reset_context->job; 4371 4372 /* no need to dump if device is not in good state during probe period */ 4373 if (!adev->gmc.xgmi.pending_reset) 4374 amdgpu_debugfs_wait_dump(adev); 4375 4376 if (amdgpu_sriov_vf(adev)) { 4377 /* stop the data exchange thread */ 4378 amdgpu_virt_fini_data_exchange(adev); 4379 } 4380 4381 /* block all schedulers and reset given job's ring */ 4382 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4383 struct amdgpu_ring *ring = adev->rings[i]; 4384 4385 if (!ring || !ring->sched.thread) 4386 continue; 4387 4388 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4389 
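		/*
		 * Forcing completion signals every fence still outstanding on
		 * this ring, so nothing stays blocked waiting on hardware
		 * state that the upcoming reset is going to throw away.
		 */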
amdgpu_fence_driver_force_completion(ring); 4390 } 4391 4392 if(job) 4393 drm_sched_increase_karma(&job->base); 4394 4395 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4396 /* If reset handler not implemented, continue; otherwise return */ 4397 if (r == -ENOSYS) 4398 r = 0; 4399 else 4400 return r; 4401 4402 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4403 if (!amdgpu_sriov_vf(adev)) { 4404 4405 if (!need_full_reset) 4406 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4407 4408 if (!need_full_reset) { 4409 amdgpu_device_ip_pre_soft_reset(adev); 4410 r = amdgpu_device_ip_soft_reset(adev); 4411 amdgpu_device_ip_post_soft_reset(adev); 4412 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4413 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4414 need_full_reset = true; 4415 } 4416 } 4417 4418 if (need_full_reset) 4419 r = amdgpu_device_ip_suspend(adev); 4420 if (need_full_reset) 4421 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4422 else 4423 clear_bit(AMDGPU_NEED_FULL_RESET, 4424 &reset_context->flags); 4425 } 4426 4427 return r; 4428 } 4429 4430 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4431 struct amdgpu_reset_context *reset_context) 4432 { 4433 struct amdgpu_device *tmp_adev = NULL; 4434 bool need_full_reset, skip_hw_reset, vram_lost = false; 4435 int r = 0; 4436 4437 /* Try reset handler method first */ 4438 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4439 reset_list); 4440 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4441 /* If reset handler not implemented, continue; otherwise return */ 4442 if (r == -ENOSYS) 4443 r = 0; 4444 else 4445 return r; 4446 4447 /* Reset handler not implemented, use the default method */ 4448 need_full_reset = 4449 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4450 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4451 4452 /* 4453 * ASIC reset has to be done on all XGMI hive nodes ASAP 4454 * to allow proper links negotiation in FW (within 1 sec) 4455 */ 4456 if (!skip_hw_reset && need_full_reset) { 4457 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4458 /* For XGMI run all resets in parallel to speed up the process */ 4459 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4460 tmp_adev->gmc.xgmi.pending_reset = false; 4461 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4462 r = -EALREADY; 4463 } else 4464 r = amdgpu_asic_reset(tmp_adev); 4465 4466 if (r) { 4467 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4468 r, adev_to_drm(tmp_adev)->unique); 4469 break; 4470 } 4471 } 4472 4473 /* For XGMI wait for all resets to complete before proceed */ 4474 if (!r) { 4475 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4476 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4477 flush_work(&tmp_adev->xgmi_reset_work); 4478 r = tmp_adev->asic_reset_res; 4479 if (r) 4480 break; 4481 } 4482 } 4483 } 4484 } 4485 4486 if (!r && amdgpu_ras_intr_triggered()) { 4487 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4488 if (tmp_adev->mmhub.ras_funcs && 4489 tmp_adev->mmhub.ras_funcs->reset_ras_error_count) 4490 tmp_adev->mmhub.ras_funcs->reset_ras_error_count(tmp_adev); 4491 } 4492 4493 amdgpu_ras_intr_cleared(); 4494 } 4495 4496 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4497 if (need_full_reset) { 4498 /* post card */ 4499 r = amdgpu_device_asic_init(tmp_adev); 4500 if (r) { 4501 
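				/*
				 * Re-posting the card via ATOM failed; only warn
				 * here, the error itself is propagated into
				 * asic_reset_res further down.
				 */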
dev_warn(tmp_adev->dev, "asic atom init failed!"); 4502 } else { 4503 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4504 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4505 if (r) 4506 goto out; 4507 4508 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4509 if (vram_lost) { 4510 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4511 amdgpu_inc_vram_lost(tmp_adev); 4512 } 4513 4514 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT)); 4515 if (r) 4516 goto out; 4517 4518 r = amdgpu_device_fw_loading(tmp_adev); 4519 if (r) 4520 return r; 4521 4522 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4523 if (r) 4524 goto out; 4525 4526 if (vram_lost) 4527 amdgpu_device_fill_reset_magic(tmp_adev); 4528 4529 /* 4530 * Add this ASIC as tracked as reset was already 4531 * complete successfully. 4532 */ 4533 amdgpu_register_gpu_instance(tmp_adev); 4534 4535 if (!reset_context->hive && 4536 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4537 amdgpu_xgmi_add_device(tmp_adev); 4538 4539 r = amdgpu_device_ip_late_init(tmp_adev); 4540 if (r) 4541 goto out; 4542 4543 amdgpu_fbdev_set_suspend(tmp_adev, 0); 4544 4545 /* 4546 * The GPU enters bad state once faulty pages 4547 * by ECC has reached the threshold, and ras 4548 * recovery is scheduled next. So add one check 4549 * here to break recovery if it indeed exceeds 4550 * bad page threshold, and remind user to 4551 * retire this GPU or setting one bigger 4552 * bad_page_threshold value to fix this once 4553 * probing driver again. 4554 */ 4555 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 4556 /* must succeed. */ 4557 amdgpu_ras_resume(tmp_adev); 4558 } else { 4559 r = -EINVAL; 4560 goto out; 4561 } 4562 4563 /* Update PSP FW topology after reset */ 4564 if (reset_context->hive && 4565 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4566 r = amdgpu_xgmi_update_topology( 4567 reset_context->hive, tmp_adev); 4568 } 4569 } 4570 4571 out: 4572 if (!r) { 4573 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 4574 r = amdgpu_ib_ring_tests(tmp_adev); 4575 if (r) { 4576 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 4577 need_full_reset = true; 4578 r = -EAGAIN; 4579 goto end; 4580 } 4581 } 4582 4583 if (!r) 4584 r = amdgpu_device_recover_vram(tmp_adev); 4585 else 4586 tmp_adev->asic_reset_res = r; 4587 } 4588 4589 end: 4590 if (need_full_reset) 4591 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4592 else 4593 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4594 return r; 4595 } 4596 4597 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, 4598 struct amdgpu_hive_info *hive) 4599 { 4600 if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0) 4601 return false; 4602 4603 if (hive) { 4604 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock); 4605 } else { 4606 down_write(&adev->reset_sem); 4607 } 4608 4609 switch (amdgpu_asic_reset_method(adev)) { 4610 case AMD_RESET_METHOD_MODE1: 4611 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 4612 break; 4613 case AMD_RESET_METHOD_MODE2: 4614 adev->mp1_state = PP_MP1_STATE_RESET; 4615 break; 4616 default: 4617 adev->mp1_state = PP_MP1_STATE_NONE; 4618 break; 4619 } 4620 4621 return true; 4622 } 4623 4624 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) 4625 { 4626 amdgpu_vf_error_trans_all(adev); 4627 adev->mp1_state = PP_MP1_STATE_NONE; 4628 atomic_set(&adev->in_gpu_reset, 0); 4629 up_write(&adev->reset_sem); 4630 } 4631 4632 /* 4633 * to lockup a list of amdgpu devices in a hive safely, if not a hive 4634 * with multiple nodes, it 
will be similar to amdgpu_device_lock_adev.
 *
 * unlock won't require a roll back.
 */
static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
{
	struct amdgpu_device *tmp_adev = NULL;

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (!hive) {
			dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
			return -ENODEV;
		}
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			if (!amdgpu_device_lock_adev(tmp_adev, hive))
				goto roll_back;
		}
	} else if (!amdgpu_device_lock_adev(adev, hive))
		return -EAGAIN;

	return 0;
roll_back:
	if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
		/*
		 * If the lock iteration broke in the middle of a hive,
		 * there may be a race issue, or a hive device may have
		 * locked up independently. We may or may not be in
		 * trouble, so roll back the locks already taken and
		 * give out a warning.
		 */
		dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
		list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			amdgpu_device_unlock_adev(tmp_adev);
		}
	}
	return -EAGAIN;
}

static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
{
	struct pci_dev *p = NULL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
			adev->pdev->bus->number, 1);
	if (p) {
		pm_runtime_enable(&(p->dev));
		pm_runtime_resume(&(p->dev));
	}
}

static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
{
	enum amd_reset_method reset_method;
	struct pci_dev *p = NULL;
	u64 expires;

	/*
	 * For now, only BACO and mode1 reset are confirmed to suffer
	 * from the audio issue if the codec is not properly suspended.
	 */
	reset_method = amdgpu_asic_reset_method(adev);
	if ((reset_method != AMD_RESET_METHOD_BACO) &&
	    (reset_method != AMD_RESET_METHOD_MODE1))
		return -EINVAL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
			adev->pdev->bus->number, 1);
	if (!p)
		return -ENODEV;

	expires = pm_runtime_autosuspend_expiration(&(p->dev));
	if (!expires)
		/*
		 * If we cannot get the audio device autosuspend delay,
		 * a fixed 4s interval is used instead. Since 3s is the
		 * audio controller's default autosuspend delay, 4s is
		 * guaranteed to cover it.
		 */
		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;

	while (!pm_runtime_status_suspended(&(p->dev))) {
		if (!pm_runtime_suspend(&(p->dev)))
			break;

		if (expires < ktime_get_mono_fast_ns()) {
			dev_warn(adev->dev, "failed to suspend display audio\n");
			/* TODO: abort the succeeding gpu reset?
			 */
			return -ETIMEDOUT;
		}
	}

	pm_runtime_disable(&(p->dev));

	return 0;
}

static void amdgpu_device_recheck_guilty_jobs(
	struct amdgpu_device *adev, struct list_head *device_list_handle,
	struct amdgpu_reset_context *reset_context)
{
	int i, r = 0;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];
		int ret = 0;
		struct drm_sched_job *s_job;

		if (!ring || !ring->sched.thread)
			continue;

		s_job = list_first_entry_or_null(&ring->sched.pending_list,
				struct drm_sched_job, list);
		if (s_job == NULL)
			continue;

		/* clear the job's guilty status and rely on the following step to decide the real one */
		drm_sched_reset_karma(s_job);
		drm_sched_resubmit_jobs_ext(&ring->sched, 1);

		ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
		if (ret == 0) { /* timeout */
			DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
				  ring->sched.name, s_job->id);

			/* set guilty */
			drm_sched_increase_karma(s_job);
retry:
			/* do hw reset */
			if (amdgpu_sriov_vf(adev)) {
				amdgpu_virt_fini_data_exchange(adev);
				r = amdgpu_device_reset_sriov(adev, false);
				if (r)
					adev->asic_reset_res = r;
			} else {
				clear_bit(AMDGPU_SKIP_HW_RESET,
					  &reset_context->flags);
				r = amdgpu_do_asic_reset(device_list_handle,
							 reset_context);
				if (r && r == -EAGAIN)
					goto retry;
			}

			/*
			 * add reset counter so that the following
			 * resubmitted job could flush vmid
			 */
			atomic_inc(&adev->gpu_reset_counter);
			continue;
		}

		/* got the hw fence, signal finished fence */
		atomic_dec(ring->sched.score);
		dma_fence_get(&s_job->s_fence->finished);
		dma_fence_signal(&s_job->s_fence->finished);
		dma_fence_put(&s_job->s_fence->finished);

		/* remove node from list and free the job */
		spin_lock(&ring->sched.job_list_lock);
		list_del_init(&s_job->list);
		spin_unlock(&ring->sched.job_list_lock);
		ring->sched.ops->free_job(s_job);
	}
}

/**
 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
 *
 * @adev: amdgpu_device pointer
 * @job: which job triggered the hang
 *
 * Attempt to reset the GPU if it has hung (all asics).
 * Attempt to do a soft reset or a full reset and reinitialize the asic.
 * Returns 0 for success or an error on failure.
 */

int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
			      struct amdgpu_job *job)
{
	struct list_head device_list, *device_list_handle = NULL;
	bool job_signaled = false;
	struct amdgpu_hive_info *hive = NULL;
	struct amdgpu_device *tmp_adev = NULL;
	int i, r = 0;
	bool need_emergency_restart = false;
	bool audio_suspended = false;
	int tmp_vram_lost_counter;
	struct amdgpu_reset_context reset_context;

	memset(&reset_context, 0, sizeof(reset_context));

	/*
	 * Special case: RAS triggered and full reset isn't supported
	 */
	need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);

	/*
	 * Flush RAM to disk so that after reboot
	 * the user can read the log and see why the system rebooted.
	 */
	if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
		DRM_WARN("Emergency reboot.");

		ksys_sync_helper();
		emergency_restart();
	}

	dev_info(adev->dev, "GPU %s begin!\n",
		need_emergency_restart ? "jobs stop":"reset");

	/*
	 * Here we trylock to avoid a chain of resets executing, whether
	 * triggered by jobs on different adevs in an XGMI hive or by jobs on
	 * different schedulers for the same device, while this TO handler is
	 * running. We always reset all schedulers for a device and all
	 * devices for an XGMI hive, so that should take care of them too.
	 */
	hive = amdgpu_get_xgmi_hive(adev);
	if (hive) {
		if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
			DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
				job ? job->base.id : -1, hive->hive_id);
			amdgpu_put_xgmi_hive(hive);
			if (job)
				drm_sched_increase_karma(&job->base);
			return 0;
		}
		mutex_lock(&hive->hive_lock);
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	reset_context.job = job;
	reset_context.hive = hive;
	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

	/*
	 * Lock the device before we try to operate on the linked list.
	 * If we didn't get the device lock, don't touch the linked list
	 * since others may be iterating over it.
	 */
	r = amdgpu_device_lock_hive_adev(adev, hive);
	if (r) {
		dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
					job ? job->base.id : -1);

		/* even though we skipped this reset, we still need to mark the job as guilty */
		if (job)
			drm_sched_increase_karma(&job->base);
		goto skip_recovery;
	}

	/*
	 * Build list of devices to reset.
	 * In case we are in XGMI hive mode, resort the device list
	 * to put adev in the 1st position.
	 */
	INIT_LIST_HEAD(&device_list);
	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
			list_add_tail(&tmp_adev->reset_list, &device_list);
		if (!list_is_first(&adev->reset_list, &device_list))
			list_rotate_to_front(&adev->reset_list, &device_list);
		device_list_handle = &device_list;
	} else {
		list_add_tail(&adev->reset_list, &device_list);
		device_list_handle = &device_list;
	}

	/* block all schedulers and reset given job's ring */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		/*
		 * Try to put the audio codec into suspend state
		 * before the gpu reset starts.
		 *
		 * The power domain of the graphics device is shared
		 * with the AZ power domain. Without this, we may
		 * change the audio hardware from behind the audio
		 * driver's back and trigger some audio codec errors.
		 */
		if (!amdgpu_device_suspend_display_audio(tmp_adev))
			audio_suspended = true;

		amdgpu_ras_set_error_query_ready(tmp_adev, false);

		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);

		if (!amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_pre_reset(tmp_adev);

		/*
		 * Mark these ASICs to be reset as untracked first
		 * and add them back after the reset completes.
		 */
		amdgpu_unregister_gpu_instance(tmp_adev);

		amdgpu_fbdev_set_suspend(tmp_adev, 1);

		/* disable ras on ALL IPs */
		if (!need_emergency_restart &&
		      amdgpu_device_ip_need_full_reset(tmp_adev))
			amdgpu_ras_suspend(tmp_adev);

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, job ? &job->base : NULL);

			if (need_emergency_restart)
				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
		}
		atomic_inc(&tmp_adev->gpu_reset_counter);
	}

	if (need_emergency_restart)
		goto skip_sched_resume;

	/*
	 * Must check guilty signal here since after this point all old
	 * HW fences are force signaled.
	 *
	 * job->base holds a reference to parent fence
	 */
	if (job && job->base.s_fence->parent &&
	    dma_fence_is_signaled(job->base.s_fence->parent)) {
		job_signaled = true;
		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
		goto skip_hw_reset;
	}

retry:	/* Rest of the adevs: pre asic reset for the whole XGMI hive. */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context);
		/*TODO Should we stop ?*/
		if (r) {
			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
				  r, adev_to_drm(tmp_adev)->unique);
			tmp_adev->asic_reset_res = r;
		}
	}

	tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
	/* Actual ASIC resets if needed. */
	/* TODO Implement XGMI hive reset logic for SRIOV */
	if (amdgpu_sriov_vf(adev)) {
		r = amdgpu_device_reset_sriov(adev, job ? false : true);
		if (r)
			adev->asic_reset_res = r;
	} else {
		r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
		if (r && r == -EAGAIN)
			goto retry;
	}

skip_hw_reset:

	/* Post ASIC reset for all devs. */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {

		/*
		 * Sometimes a later bad compute job can block a good gfx job,
		 * since the gfx and compute rings share internal GC hardware.
		 * We add an additional guilty-job recheck step to find the real
		 * guilty job: it synchronously resubmits the first pending job
		 * and waits for it to signal; if that times out,
		 * we identify it as the real guilty job.
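		 *
		 * This extra pass only runs when the amdgpu_gpu_recovery module
		 * parameter is set to 2 and no additional VRAM loss was detected
		 * during the reset, as checked just below.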
		 */
		if (amdgpu_gpu_recovery == 2 &&
			!(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
			amdgpu_device_recheck_guilty_jobs(
				tmp_adev, device_list_handle, &reset_context);

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			/* No point in resubmitting jobs if we didn't HW reset */
			if (!tmp_adev->asic_reset_res && !job_signaled)
				drm_sched_resubmit_jobs(&ring->sched);

			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
		}

		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
		}

		tmp_adev->asic_reset_res = 0;

		if (r) {
			/* bad news, how to tell it to userspace? */
			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
			if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
				DRM_WARN("smart shift update failed\n");
		}
	}

skip_sched_resume:
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		/* unlock kfd: SRIOV would do it separately */
		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);

		/* kfd_post_reset will do nothing if the kfd device is not initialized,
		 * so bring up kfd here if it was not initialized before
		 */
		if (!adev->kfd.init_complete)
			amdgpu_amdkfd_device_init(adev);

		if (audio_suspended)
			amdgpu_device_resume_display_audio(tmp_adev);
		amdgpu_device_unlock_adev(tmp_adev);
	}

skip_recovery:
	if (hive) {
		atomic_set(&hive->in_reset, 0);
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}

	if (r && r != -EAGAIN)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
	return r;
}

/**
 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIe capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIe config space may not be available.
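 *
 * The resulting pcie_gen_mask/pcie_mlw_mask values can be overridden
 * with the amdgpu_pcie_gen_cap and amdgpu_pcie_lane_cap module
 * parameters (see the checks at the top of the function); they are
 * later consumed by the power-management code when it selects link
 * speeds and widths.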
5075 */ 5076 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5077 { 5078 struct pci_dev *pdev; 5079 enum pci_bus_speed speed_cap, platform_speed_cap; 5080 enum pcie_link_width platform_link_width; 5081 5082 if (amdgpu_pcie_gen_cap) 5083 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5084 5085 if (amdgpu_pcie_lane_cap) 5086 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5087 5088 /* covers APUs as well */ 5089 if (pci_is_root_bus(adev->pdev->bus)) { 5090 if (adev->pm.pcie_gen_mask == 0) 5091 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5092 if (adev->pm.pcie_mlw_mask == 0) 5093 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5094 return; 5095 } 5096 5097 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5098 return; 5099 5100 pcie_bandwidth_available(adev->pdev, NULL, 5101 &platform_speed_cap, &platform_link_width); 5102 5103 if (adev->pm.pcie_gen_mask == 0) { 5104 /* asic caps */ 5105 pdev = adev->pdev; 5106 speed_cap = pcie_get_speed_cap(pdev); 5107 if (speed_cap == PCI_SPEED_UNKNOWN) { 5108 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5109 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5110 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5111 } else { 5112 if (speed_cap == PCIE_SPEED_32_0GT) 5113 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5114 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5115 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5116 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5117 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5118 else if (speed_cap == PCIE_SPEED_16_0GT) 5119 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5120 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5121 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5122 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5123 else if (speed_cap == PCIE_SPEED_8_0GT) 5124 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5125 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5126 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5127 else if (speed_cap == PCIE_SPEED_5_0GT) 5128 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5129 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5130 else 5131 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5132 } 5133 /* platform caps */ 5134 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5135 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5136 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5137 } else { 5138 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5139 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5140 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5141 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5142 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5143 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5144 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5145 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5146 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5147 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5148 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5149 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5150 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5151 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5152 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5153 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5154 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5155 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5156 else 5157 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5158 5159 } 5160 } 5161 if (adev->pm.pcie_mlw_mask == 0) { 5162 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 5163 adev->pm.pcie_mlw_mask 
|= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5164 } else { 5165 switch (platform_link_width) { 5166 case PCIE_LNK_X32: 5167 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5168 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5169 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5170 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5171 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5172 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5173 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5174 break; 5175 case PCIE_LNK_X16: 5176 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5177 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5178 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5179 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5180 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5181 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5182 break; 5183 case PCIE_LNK_X12: 5184 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5185 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5186 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5187 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5188 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5189 break; 5190 case PCIE_LNK_X8: 5191 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5192 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5193 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5194 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5195 break; 5196 case PCIE_LNK_X4: 5197 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5198 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5199 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5200 break; 5201 case PCIE_LNK_X2: 5202 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5203 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5204 break; 5205 case PCIE_LNK_X1: 5206 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5207 break; 5208 default: 5209 break; 5210 } 5211 } 5212 } 5213 } 5214 5215 int amdgpu_device_baco_enter(struct drm_device *dev) 5216 { 5217 struct amdgpu_device *adev = drm_to_adev(dev); 5218 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5219 5220 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5221 return -ENOTSUPP; 5222 5223 if (ras && adev->ras_enabled && 5224 adev->nbio.funcs->enable_doorbell_interrupt) 5225 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5226 5227 return amdgpu_dpm_baco_enter(adev); 5228 } 5229 5230 int amdgpu_device_baco_exit(struct drm_device *dev) 5231 { 5232 struct amdgpu_device *adev = drm_to_adev(dev); 5233 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5234 int ret = 0; 5235 5236 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5237 return -ENOTSUPP; 5238 5239 ret = amdgpu_dpm_baco_exit(adev); 5240 if (ret) 5241 return ret; 5242 5243 if (ras && adev->ras_enabled && 5244 adev->nbio.funcs->enable_doorbell_interrupt) 5245 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5246 5247 return 0; 5248 } 5249 5250 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev) 5251 { 5252 int i; 5253 5254 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5255 struct amdgpu_ring *ring = adev->rings[i]; 5256 5257 if (!ring || !ring->sched.thread) 5258 continue; 5259 5260 cancel_delayed_work_sync(&ring->sched.work_tdr); 5261 } 5262 } 5263 5264 /** 5265 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5266 * @pdev: PCI device struct 5267 * @state: PCI channel state 5268 * 5269 * Description: Called when a PCI error is detected. 5270 * 5271 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
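 *
 * These callbacks are wired into the PCI core through a struct
 * pci_error_handlers; a rough sketch of the hookup (the actual table
 * lives in amdgpu_drv.c) looks like:
 *
 *   static struct pci_error_handlers amdgpu_pci_err_handler = {
 *           .error_detected = amdgpu_pci_error_detected,
 *           .mmio_enabled   = amdgpu_pci_mmio_enabled,
 *           .slot_reset     = amdgpu_pci_slot_reset,
 *           .resume         = amdgpu_pci_resume,
 *   };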
5272 */ 5273 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 5274 { 5275 struct drm_device *dev = pci_get_drvdata(pdev); 5276 struct amdgpu_device *adev = drm_to_adev(dev); 5277 int i; 5278 5279 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 5280 5281 if (adev->gmc.xgmi.num_physical_nodes > 1) { 5282 DRM_WARN("No support for XGMI hive yet..."); 5283 return PCI_ERS_RESULT_DISCONNECT; 5284 } 5285 5286 switch (state) { 5287 case pci_channel_io_normal: 5288 return PCI_ERS_RESULT_CAN_RECOVER; 5289 /* Fatal error, prepare for slot reset */ 5290 case pci_channel_io_frozen: 5291 /* 5292 * Cancel and wait for all TDRs in progress if failing to 5293 * set adev->in_gpu_reset in amdgpu_device_lock_adev 5294 * 5295 * Locking adev->reset_sem will prevent any external access 5296 * to GPU during PCI error recovery 5297 */ 5298 while (!amdgpu_device_lock_adev(adev, NULL)) 5299 amdgpu_cancel_all_tdr(adev); 5300 5301 /* 5302 * Block any work scheduling as we do for regular GPU reset 5303 * for the duration of the recovery 5304 */ 5305 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5306 struct amdgpu_ring *ring = adev->rings[i]; 5307 5308 if (!ring || !ring->sched.thread) 5309 continue; 5310 5311 drm_sched_stop(&ring->sched, NULL); 5312 } 5313 atomic_inc(&adev->gpu_reset_counter); 5314 return PCI_ERS_RESULT_NEED_RESET; 5315 case pci_channel_io_perm_failure: 5316 /* Permanent error, prepare for device removal */ 5317 return PCI_ERS_RESULT_DISCONNECT; 5318 } 5319 5320 return PCI_ERS_RESULT_NEED_RESET; 5321 } 5322 5323 /** 5324 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 5325 * @pdev: pointer to PCI device 5326 */ 5327 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 5328 { 5329 5330 DRM_INFO("PCI error: mmio enabled callback!!\n"); 5331 5332 /* TODO - dump whatever for debugging purposes */ 5333 5334 /* This called only if amdgpu_pci_error_detected returns 5335 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 5336 * works, no need to reset slot. 5337 */ 5338 5339 return PCI_ERS_RESULT_RECOVERED; 5340 } 5341 5342 /** 5343 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 5344 * @pdev: PCI device struct 5345 * 5346 * Description: This routine is called by the pci error recovery 5347 * code after the PCI slot has been reset, just before we 5348 * should resume normal operations. 
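 *
 * Return: PCI_ERS_RESULT_RECOVERED if the reset and re-init succeeded,
 * PCI_ERS_RESULT_DISCONNECT otherwise.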
5349 */ 5350 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 5351 { 5352 struct drm_device *dev = pci_get_drvdata(pdev); 5353 struct amdgpu_device *adev = drm_to_adev(dev); 5354 int r, i; 5355 struct amdgpu_reset_context reset_context; 5356 u32 memsize; 5357 struct list_head device_list; 5358 5359 DRM_INFO("PCI error: slot reset callback!!\n"); 5360 5361 memset(&reset_context, 0, sizeof(reset_context)); 5362 5363 INIT_LIST_HEAD(&device_list); 5364 list_add_tail(&adev->reset_list, &device_list); 5365 5366 /* wait for asic to come out of reset */ 5367 msleep(500); 5368 5369 /* Restore PCI confspace */ 5370 amdgpu_device_load_pci_state(pdev); 5371 5372 /* confirm ASIC came out of reset */ 5373 for (i = 0; i < adev->usec_timeout; i++) { 5374 memsize = amdgpu_asic_get_config_memsize(adev); 5375 5376 if (memsize != 0xffffffff) 5377 break; 5378 udelay(1); 5379 } 5380 if (memsize == 0xffffffff) { 5381 r = -ETIME; 5382 goto out; 5383 } 5384 5385 reset_context.method = AMD_RESET_METHOD_NONE; 5386 reset_context.reset_req_dev = adev; 5387 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 5388 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 5389 5390 adev->no_hw_access = true; 5391 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 5392 adev->no_hw_access = false; 5393 if (r) 5394 goto out; 5395 5396 r = amdgpu_do_asic_reset(&device_list, &reset_context); 5397 5398 out: 5399 if (!r) { 5400 if (amdgpu_device_cache_pci_state(adev->pdev)) 5401 pci_restore_state(adev->pdev); 5402 5403 DRM_INFO("PCIe error recovery succeeded\n"); 5404 } else { 5405 DRM_ERROR("PCIe error recovery failed, err:%d", r); 5406 amdgpu_device_unlock_adev(adev); 5407 } 5408 5409 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 5410 } 5411 5412 /** 5413 * amdgpu_pci_resume() - resume normal ops after PCI reset 5414 * @pdev: pointer to PCI device 5415 * 5416 * Called when the error recovery driver tells us that its 5417 * OK to resume normal operation. 
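 *
 * This restarts the GPU schedulers that were stopped in
 * amdgpu_pci_error_detected() and drops the reset lock taken there.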
5418 */ 5419 void amdgpu_pci_resume(struct pci_dev *pdev) 5420 { 5421 struct drm_device *dev = pci_get_drvdata(pdev); 5422 struct amdgpu_device *adev = drm_to_adev(dev); 5423 int i; 5424 5425 5426 DRM_INFO("PCI error: resume callback!!\n"); 5427 5428 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5429 struct amdgpu_ring *ring = adev->rings[i]; 5430 5431 if (!ring || !ring->sched.thread) 5432 continue; 5433 5434 5435 drm_sched_resubmit_jobs(&ring->sched); 5436 drm_sched_start(&ring->sched, true); 5437 } 5438 5439 amdgpu_device_unlock_adev(adev); 5440 } 5441 5442 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 5443 { 5444 struct drm_device *dev = pci_get_drvdata(pdev); 5445 struct amdgpu_device *adev = drm_to_adev(dev); 5446 int r; 5447 5448 r = pci_save_state(pdev); 5449 if (!r) { 5450 kfree(adev->pci_state); 5451 5452 adev->pci_state = pci_store_saved_state(pdev); 5453 5454 if (!adev->pci_state) { 5455 DRM_ERROR("Failed to store PCI saved state"); 5456 return false; 5457 } 5458 } else { 5459 DRM_WARN("Failed to save PCI state, err:%d\n", r); 5460 return false; 5461 } 5462 5463 return true; 5464 } 5465 5466 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 5467 { 5468 struct drm_device *dev = pci_get_drvdata(pdev); 5469 struct amdgpu_device *adev = drm_to_adev(dev); 5470 int r; 5471 5472 if (!adev->pci_state) 5473 return false; 5474 5475 r = pci_load_saved_state(pdev, adev->pci_state); 5476 5477 if (!r) { 5478 pci_restore_state(pdev); 5479 } else { 5480 DRM_WARN("Failed to load PCI state, err:%d\n", r); 5481 return false; 5482 } 5483 5484 return true; 5485 } 5486 5487 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 5488 struct amdgpu_ring *ring) 5489 { 5490 #ifdef CONFIG_X86_64 5491 if (adev->flags & AMD_IS_APU) 5492 return; 5493 #endif 5494 if (adev->gmc.xgmi.connected_to_cpu) 5495 return; 5496 5497 if (ring && ring->funcs->emit_hdp_flush) 5498 amdgpu_ring_emit_hdp_flush(ring); 5499 else 5500 amdgpu_asic_flush_hdp(adev, ring); 5501 } 5502 5503 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 5504 struct amdgpu_ring *ring) 5505 { 5506 #ifdef CONFIG_X86_64 5507 if (adev->flags & AMD_IS_APU) 5508 return; 5509 #endif 5510 if (adev->gmc.xgmi.connected_to_cpu) 5511 return; 5512 5513 amdgpu_asic_invalidate_hdp(adev, ring); 5514 } 5515