1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 
23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 34 #include <drm/drm_atomic_helper.h> 35 #include <drm/drm_probe_helper.h> 36 #include <drm/amdgpu_drm.h> 37 #include <linux/vgaarb.h> 38 #include <linux/vga_switcheroo.h> 39 #include <linux/efi.h> 40 #include "amdgpu.h" 41 #include "amdgpu_trace.h" 42 #include "amdgpu_i2c.h" 43 #include "atom.h" 44 #include "amdgpu_atombios.h" 45 #include "amdgpu_atomfirmware.h" 46 #include "amd_pcie.h" 47 #ifdef CONFIG_DRM_AMDGPU_SI 48 #include "si.h" 49 #endif 50 #ifdef CONFIG_DRM_AMDGPU_CIK 51 #include "cik.h" 52 #endif 53 #include "vi.h" 54 #include "soc15.h" 55 #include "nv.h" 56 #include "bif/bif_4_1_d.h" 57 #include <linux/pci.h> 58 #include <linux/firmware.h> 59 #include "amdgpu_vf_error.h" 60 61 #include "amdgpu_amdkfd.h" 62 #include "amdgpu_pm.h" 63 64 #include "amdgpu_xgmi.h" 65 #include "amdgpu_ras.h" 66 #include "amdgpu_pmu.h" 67 #include "amdgpu_fru_eeprom.h" 68 #include "amdgpu_reset.h" 69 70 #include <linux/suspend.h> 71 #include <drm/task_barrier.h> 72 #include <linux/pm_runtime.h> 73 74 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 75 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 76 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 77 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 78 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 79 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 80 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin"); 81 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin"); 82 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin"); 83 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 84 MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin"); 85 86 #define AMDGPU_RESUME_MS 2000 87 88 const char *amdgpu_asic_name[] = { 89 "TAHITI", 90 "PITCAIRN", 91 "VERDE", 92 "OLAND", 93 "HAINAN", 94 "BONAIRE", 95 "KAVERI", 96 "KABINI", 97 "HAWAII", 98 "MULLINS", 99 
"TOPAZ", 100 "TONGA", 101 "FIJI", 102 "CARRIZO", 103 "STONEY", 104 "POLARIS10", 105 "POLARIS11", 106 "POLARIS12", 107 "VEGAM", 108 "VEGA10", 109 "VEGA12", 110 "VEGA20", 111 "RAVEN", 112 "ARCTURUS", 113 "RENOIR", 114 "ALDEBARAN", 115 "NAVI10", 116 "NAVI14", 117 "NAVI12", 118 "SIENNA_CICHLID", 119 "NAVY_FLOUNDER", 120 "VANGOGH", 121 "DIMGREY_CAVEFISH", 122 "LAST", 123 }; 124 125 /** 126 * DOC: pcie_replay_count 127 * 128 * The amdgpu driver provides a sysfs API for reporting the total number 129 * of PCIe replays (NAKs) 130 * The file pcie_replay_count is used for this and returns the total 131 * number of replays as a sum of the NAKs generated and NAKs received 132 */ 133 134 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 135 struct device_attribute *attr, char *buf) 136 { 137 struct drm_device *ddev = dev_get_drvdata(dev); 138 struct amdgpu_device *adev = drm_to_adev(ddev); 139 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 140 141 return sysfs_emit(buf, "%llu\n", cnt); 142 } 143 144 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 145 amdgpu_device_get_pcie_replay_count, NULL); 146 147 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 148 149 /** 150 * DOC: product_name 151 * 152 * The amdgpu driver provides a sysfs API for reporting the product name 153 * for the device 154 * The file serial_number is used for this and returns the product name 155 * as returned from the FRU. 
156 * NOTE: This is only available for certain server cards 157 */ 158 159 static ssize_t amdgpu_device_get_product_name(struct device *dev, 160 struct device_attribute *attr, char *buf) 161 { 162 struct drm_device *ddev = dev_get_drvdata(dev); 163 struct amdgpu_device *adev = drm_to_adev(ddev); 164 165 return sysfs_emit(buf, "%s\n", adev->product_name); 166 } 167 168 static DEVICE_ATTR(product_name, S_IRUGO, 169 amdgpu_device_get_product_name, NULL); 170 171 /** 172 * DOC: product_number 173 * 174 * The amdgpu driver provides a sysfs API for reporting the part number 175 * for the device 176 * The file serial_number is used for this and returns the part number 177 * as returned from the FRU. 178 * NOTE: This is only available for certain server cards 179 */ 180 181 static ssize_t amdgpu_device_get_product_number(struct device *dev, 182 struct device_attribute *attr, char *buf) 183 { 184 struct drm_device *ddev = dev_get_drvdata(dev); 185 struct amdgpu_device *adev = drm_to_adev(ddev); 186 187 return sysfs_emit(buf, "%s\n", adev->product_number); 188 } 189 190 static DEVICE_ATTR(product_number, S_IRUGO, 191 amdgpu_device_get_product_number, NULL); 192 193 /** 194 * DOC: serial_number 195 * 196 * The amdgpu driver provides a sysfs API for reporting the serial number 197 * for the device 198 * The file serial_number is used for this and returns the serial number 199 * as returned from the FRU. 
200 * NOTE: This is only available for certain server cards 201 */ 202 203 static ssize_t amdgpu_device_get_serial_number(struct device *dev, 204 struct device_attribute *attr, char *buf) 205 { 206 struct drm_device *ddev = dev_get_drvdata(dev); 207 struct amdgpu_device *adev = drm_to_adev(ddev); 208 209 return sysfs_emit(buf, "%s\n", adev->serial); 210 } 211 212 static DEVICE_ATTR(serial_number, S_IRUGO, 213 amdgpu_device_get_serial_number, NULL); 214 215 /** 216 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 217 * 218 * @dev: drm_device pointer 219 * 220 * Returns true if the device is a dGPU with ATPX power control, 221 * otherwise return false. 222 */ 223 bool amdgpu_device_supports_px(struct drm_device *dev) 224 { 225 struct amdgpu_device *adev = drm_to_adev(dev); 226 227 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 228 return true; 229 return false; 230 } 231 232 /** 233 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 234 * 235 * @dev: drm_device pointer 236 * 237 * Returns true if the device is a dGPU with ACPI power control, 238 * otherwise return false. 239 */ 240 bool amdgpu_device_supports_boco(struct drm_device *dev) 241 { 242 struct amdgpu_device *adev = drm_to_adev(dev); 243 244 if (adev->has_pr3 || 245 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 246 return true; 247 return false; 248 } 249 250 /** 251 * amdgpu_device_supports_baco - Does the device support BACO 252 * 253 * @dev: drm_device pointer 254 * 255 * Returns true if the device supporte BACO, 256 * otherwise return false. 
257 */ 258 bool amdgpu_device_supports_baco(struct drm_device *dev) 259 { 260 struct amdgpu_device *adev = drm_to_adev(dev); 261 262 return amdgpu_asic_supports_baco(adev); 263 } 264 265 /* 266 * VRAM access helper functions 267 */ 268 269 /** 270 * amdgpu_device_vram_access - read/write a buffer in vram 271 * 272 * @adev: amdgpu_device pointer 273 * @pos: offset of the buffer in vram 274 * @buf: virtual address of the buffer in system memory 275 * @size: read/write size, sizeof(@buf) must > @size 276 * @write: true - write to vram, otherwise - read from vram 277 */ 278 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, 279 uint32_t *buf, size_t size, bool write) 280 { 281 unsigned long flags; 282 uint32_t hi = ~0; 283 uint64_t last; 284 285 286 #ifdef CONFIG_64BIT 287 last = min(pos + size, adev->gmc.visible_vram_size); 288 if (last > pos) { 289 void __iomem *addr = adev->mman.aper_base_kaddr + pos; 290 size_t count = last - pos; 291 292 if (write) { 293 memcpy_toio(addr, buf, count); 294 mb(); 295 amdgpu_asic_flush_hdp(adev, NULL); 296 } else { 297 amdgpu_asic_invalidate_hdp(adev, NULL); 298 mb(); 299 memcpy_fromio(buf, addr, count); 300 } 301 302 if (count == size) 303 return; 304 305 pos += count; 306 buf += count / 4; 307 size -= count; 308 } 309 #endif 310 311 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 312 for (last = pos + size; pos < last; pos += 4) { 313 uint32_t tmp = pos >> 31; 314 315 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); 316 if (tmp != hi) { 317 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp); 318 hi = tmp; 319 } 320 if (write) 321 WREG32_NO_KIQ(mmMM_DATA, *buf++); 322 else 323 *buf++ = RREG32_NO_KIQ(mmMM_DATA); 324 } 325 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 326 } 327 328 /* 329 * register access helper functions. 
330 */ 331 332 /* Check if hw access should be skipped because of hotplug or device error */ 333 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev) 334 { 335 if (adev->in_pci_err_recovery) 336 return true; 337 338 #ifdef CONFIG_LOCKDEP 339 /* 340 * This is a bit complicated to understand, so worth a comment. What we assert 341 * here is that the GPU reset is not running on another thread in parallel. 342 * 343 * For this we trylock the read side of the reset semaphore, if that succeeds 344 * we know that the reset is not running in paralell. 345 * 346 * If the trylock fails we assert that we are either already holding the read 347 * side of the lock or are the reset thread itself and hold the write side of 348 * the lock. 349 */ 350 if (in_task()) { 351 if (down_read_trylock(&adev->reset_sem)) 352 up_read(&adev->reset_sem); 353 else 354 lockdep_assert_held(&adev->reset_sem); 355 } 356 #endif 357 return false; 358 } 359 360 /** 361 * amdgpu_device_rreg - read a memory mapped IO or indirect register 362 * 363 * @adev: amdgpu_device pointer 364 * @reg: dword aligned register offset 365 * @acc_flags: access flags which require special behavior 366 * 367 * Returns the 32 bit value from the offset specified. 
368 */ 369 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, 370 uint32_t reg, uint32_t acc_flags) 371 { 372 uint32_t ret; 373 374 if (amdgpu_device_skip_hw_access(adev)) 375 return 0; 376 377 if ((reg * 4) < adev->rmmio_size) { 378 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 379 amdgpu_sriov_runtime(adev) && 380 down_read_trylock(&adev->reset_sem)) { 381 ret = amdgpu_kiq_rreg(adev, reg); 382 up_read(&adev->reset_sem); 383 } else { 384 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 385 } 386 } else { 387 ret = adev->pcie_rreg(adev, reg * 4); 388 } 389 390 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret); 391 392 return ret; 393 } 394 395 /* 396 * MMIO register read with bytes helper functions 397 * @offset:bytes offset from MMIO start 398 * 399 */ 400 401 /** 402 * amdgpu_mm_rreg8 - read a memory mapped IO register 403 * 404 * @adev: amdgpu_device pointer 405 * @offset: byte aligned register offset 406 * 407 * Returns the 8 bit value from the offset specified. 408 */ 409 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) 410 { 411 if (amdgpu_device_skip_hw_access(adev)) 412 return 0; 413 414 if (offset < adev->rmmio_size) 415 return (readb(adev->rmmio + offset)); 416 BUG(); 417 } 418 419 /* 420 * MMIO register write with bytes helper functions 421 * @offset:bytes offset from MMIO start 422 * @value: the value want to be written to the register 423 * 424 */ 425 /** 426 * amdgpu_mm_wreg8 - read a memory mapped IO register 427 * 428 * @adev: amdgpu_device pointer 429 * @offset: byte aligned register offset 430 * @value: 8 bit value to write 431 * 432 * Writes the value specified to the offset specified. 
433 */ 434 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 435 { 436 if (amdgpu_device_skip_hw_access(adev)) 437 return; 438 439 if (offset < adev->rmmio_size) 440 writeb(value, adev->rmmio + offset); 441 else 442 BUG(); 443 } 444 445 /** 446 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 447 * 448 * @adev: amdgpu_device pointer 449 * @reg: dword aligned register offset 450 * @v: 32 bit value to write to the register 451 * @acc_flags: access flags which require special behavior 452 * 453 * Writes the value specified to the offset specified. 454 */ 455 void amdgpu_device_wreg(struct amdgpu_device *adev, 456 uint32_t reg, uint32_t v, 457 uint32_t acc_flags) 458 { 459 if (amdgpu_device_skip_hw_access(adev)) 460 return; 461 462 if ((reg * 4) < adev->rmmio_size) { 463 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 464 amdgpu_sriov_runtime(adev) && 465 down_read_trylock(&adev->reset_sem)) { 466 amdgpu_kiq_wreg(adev, reg, v); 467 up_read(&adev->reset_sem); 468 } else { 469 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 470 } 471 } else { 472 adev->pcie_wreg(adev, reg * 4, v); 473 } 474 475 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 476 } 477 478 /* 479 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range 480 * 481 * this function is invoked only the debugfs register access 482 * */ 483 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 484 uint32_t reg, uint32_t v) 485 { 486 if (amdgpu_device_skip_hw_access(adev)) 487 return; 488 489 if (amdgpu_sriov_fullaccess(adev) && 490 adev->gfx.rlc.funcs && 491 adev->gfx.rlc.funcs->is_rlcg_access_range) { 492 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 493 return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v, 0); 494 } else { 495 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 496 } 497 } 498 499 /** 500 * amdgpu_mm_rdoorbell - read a doorbell dword 501 * 502 * @adev: amdgpu_device pointer 503 * @index: 
doorbell index 504 * 505 * Returns the value in the doorbell aperture at the 506 * requested doorbell index (CIK). 507 */ 508 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) 509 { 510 if (amdgpu_device_skip_hw_access(adev)) 511 return 0; 512 513 if (index < adev->doorbell.num_doorbells) { 514 return readl(adev->doorbell.ptr + index); 515 } else { 516 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 517 return 0; 518 } 519 } 520 521 /** 522 * amdgpu_mm_wdoorbell - write a doorbell dword 523 * 524 * @adev: amdgpu_device pointer 525 * @index: doorbell index 526 * @v: value to write 527 * 528 * Writes @v to the doorbell aperture at the 529 * requested doorbell index (CIK). 530 */ 531 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) 532 { 533 if (amdgpu_device_skip_hw_access(adev)) 534 return; 535 536 if (index < adev->doorbell.num_doorbells) { 537 writel(v, adev->doorbell.ptr + index); 538 } else { 539 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 540 } 541 } 542 543 /** 544 * amdgpu_mm_rdoorbell64 - read a doorbell Qword 545 * 546 * @adev: amdgpu_device pointer 547 * @index: doorbell index 548 * 549 * Returns the value in the doorbell aperture at the 550 * requested doorbell index (VEGA10+). 551 */ 552 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) 553 { 554 if (amdgpu_device_skip_hw_access(adev)) 555 return 0; 556 557 if (index < adev->doorbell.num_doorbells) { 558 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); 559 } else { 560 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 561 return 0; 562 } 563 } 564 565 /** 566 * amdgpu_mm_wdoorbell64 - write a doorbell Qword 567 * 568 * @adev: amdgpu_device pointer 569 * @index: doorbell index 570 * @v: value to write 571 * 572 * Writes @v to the doorbell aperture at the 573 * requested doorbell index (VEGA10+). 
574 */ 575 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) 576 { 577 if (amdgpu_device_skip_hw_access(adev)) 578 return; 579 580 if (index < adev->doorbell.num_doorbells) { 581 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); 582 } else { 583 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 584 } 585 } 586 587 /** 588 * amdgpu_device_indirect_rreg - read an indirect register 589 * 590 * @adev: amdgpu_device pointer 591 * @pcie_index: mmio register offset 592 * @pcie_data: mmio register offset 593 * @reg_addr: indirect register address to read from 594 * 595 * Returns the value of indirect register @reg_addr 596 */ 597 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 598 u32 pcie_index, u32 pcie_data, 599 u32 reg_addr) 600 { 601 unsigned long flags; 602 u32 r; 603 void __iomem *pcie_index_offset; 604 void __iomem *pcie_data_offset; 605 606 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 607 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 608 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 609 610 writel(reg_addr, pcie_index_offset); 611 readl(pcie_index_offset); 612 r = readl(pcie_data_offset); 613 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 614 615 return r; 616 } 617 618 /** 619 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 620 * 621 * @adev: amdgpu_device pointer 622 * @pcie_index: mmio register offset 623 * @pcie_data: mmio register offset 624 * @reg_addr: indirect register address to read from 625 * 626 * Returns the value of indirect register @reg_addr 627 */ 628 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 629 u32 pcie_index, u32 pcie_data, 630 u32 reg_addr) 631 { 632 unsigned long flags; 633 u64 r; 634 void __iomem *pcie_index_offset; 635 void __iomem *pcie_data_offset; 636 637 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 638 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 639 pcie_data_offset = 
(void __iomem *)adev->rmmio + pcie_data * 4; 640 641 /* read low 32 bits */ 642 writel(reg_addr, pcie_index_offset); 643 readl(pcie_index_offset); 644 r = readl(pcie_data_offset); 645 /* read high 32 bits */ 646 writel(reg_addr + 4, pcie_index_offset); 647 readl(pcie_index_offset); 648 r |= ((u64)readl(pcie_data_offset) << 32); 649 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 650 651 return r; 652 } 653 654 /** 655 * amdgpu_device_indirect_wreg - write an indirect register address 656 * 657 * @adev: amdgpu_device pointer 658 * @pcie_index: mmio register offset 659 * @pcie_data: mmio register offset 660 * @reg_addr: indirect register offset 661 * @reg_data: indirect register data 662 * 663 */ 664 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 665 u32 pcie_index, u32 pcie_data, 666 u32 reg_addr, u32 reg_data) 667 { 668 unsigned long flags; 669 void __iomem *pcie_index_offset; 670 void __iomem *pcie_data_offset; 671 672 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 673 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 674 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 675 676 writel(reg_addr, pcie_index_offset); 677 readl(pcie_index_offset); 678 writel(reg_data, pcie_data_offset); 679 readl(pcie_data_offset); 680 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 681 } 682 683 /** 684 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 685 * 686 * @adev: amdgpu_device pointer 687 * @pcie_index: mmio register offset 688 * @pcie_data: mmio register offset 689 * @reg_addr: indirect register offset 690 * @reg_data: indirect register data 691 * 692 */ 693 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 694 u32 pcie_index, u32 pcie_data, 695 u32 reg_addr, u64 reg_data) 696 { 697 unsigned long flags; 698 void __iomem *pcie_index_offset; 699 void __iomem *pcie_data_offset; 700 701 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 702 pcie_index_offset = (void __iomem *)adev->rmmio + 
pcie_index * 4; 703 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 704 705 /* write low 32 bits */ 706 writel(reg_addr, pcie_index_offset); 707 readl(pcie_index_offset); 708 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 709 readl(pcie_data_offset); 710 /* write high 32 bits */ 711 writel(reg_addr + 4, pcie_index_offset); 712 readl(pcie_index_offset); 713 writel((u32)(reg_data >> 32), pcie_data_offset); 714 readl(pcie_data_offset); 715 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 716 } 717 718 /** 719 * amdgpu_invalid_rreg - dummy reg read function 720 * 721 * @adev: amdgpu_device pointer 722 * @reg: offset of register 723 * 724 * Dummy register read function. Used for register blocks 725 * that certain asics don't have (all asics). 726 * Returns the value in the register. 727 */ 728 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 729 { 730 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 731 BUG(); 732 return 0; 733 } 734 735 /** 736 * amdgpu_invalid_wreg - dummy reg write function 737 * 738 * @adev: amdgpu_device pointer 739 * @reg: offset of register 740 * @v: value to write to the register 741 * 742 * Dummy register read function. Used for register blocks 743 * that certain asics don't have (all asics). 744 */ 745 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 746 { 747 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 748 reg, v); 749 BUG(); 750 } 751 752 /** 753 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 754 * 755 * @adev: amdgpu_device pointer 756 * @reg: offset of register 757 * 758 * Dummy register read function. Used for register blocks 759 * that certain asics don't have (all asics). 760 * Returns the value in the register. 
761 */ 762 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 763 { 764 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 765 BUG(); 766 return 0; 767 } 768 769 /** 770 * amdgpu_invalid_wreg64 - dummy reg write function 771 * 772 * @adev: amdgpu_device pointer 773 * @reg: offset of register 774 * @v: value to write to the register 775 * 776 * Dummy register read function. Used for register blocks 777 * that certain asics don't have (all asics). 778 */ 779 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 780 { 781 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 782 reg, v); 783 BUG(); 784 } 785 786 /** 787 * amdgpu_block_invalid_rreg - dummy reg read function 788 * 789 * @adev: amdgpu_device pointer 790 * @block: offset of instance 791 * @reg: offset of register 792 * 793 * Dummy register read function. Used for register blocks 794 * that certain asics don't have (all asics). 795 * Returns the value in the register. 796 */ 797 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 798 uint32_t block, uint32_t reg) 799 { 800 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 801 reg, block); 802 BUG(); 803 return 0; 804 } 805 806 /** 807 * amdgpu_block_invalid_wreg - dummy reg write function 808 * 809 * @adev: amdgpu_device pointer 810 * @block: offset of instance 811 * @reg: offset of register 812 * @v: value to write to the register 813 * 814 * Dummy register read function. Used for register blocks 815 * that certain asics don't have (all asics). 
816 */ 817 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 818 uint32_t block, 819 uint32_t reg, uint32_t v) 820 { 821 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 822 reg, block, v); 823 BUG(); 824 } 825 826 /** 827 * amdgpu_device_asic_init - Wrapper for atom asic_init 828 * 829 * @adev: amdgpu_device pointer 830 * 831 * Does any asic specific work and then calls atom asic init. 832 */ 833 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 834 { 835 amdgpu_asic_pre_asic_init(adev); 836 837 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 838 } 839 840 /** 841 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page 842 * 843 * @adev: amdgpu_device pointer 844 * 845 * Allocates a scratch page of VRAM for use by various things in the 846 * driver. 847 */ 848 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev) 849 { 850 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, 851 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM, 852 &adev->vram_scratch.robj, 853 &adev->vram_scratch.gpu_addr, 854 (void **)&adev->vram_scratch.ptr); 855 } 856 857 /** 858 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page 859 * 860 * @adev: amdgpu_device pointer 861 * 862 * Frees the VRAM scratch page. 863 */ 864 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev) 865 { 866 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL); 867 } 868 869 /** 870 * amdgpu_device_program_register_sequence - program an array of registers. 871 * 872 * @adev: amdgpu_device pointer 873 * @registers: pointer to the register array 874 * @array_size: size of the register array 875 * 876 * Programs an array or registers with and and or masks. 877 * This is a helper for setting golden registers. 
878 */ 879 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 880 const u32 *registers, 881 const u32 array_size) 882 { 883 u32 tmp, reg, and_mask, or_mask; 884 int i; 885 886 if (array_size % 3) 887 return; 888 889 for (i = 0; i < array_size; i +=3) { 890 reg = registers[i + 0]; 891 and_mask = registers[i + 1]; 892 or_mask = registers[i + 2]; 893 894 if (and_mask == 0xffffffff) { 895 tmp = or_mask; 896 } else { 897 tmp = RREG32(reg); 898 tmp &= ~and_mask; 899 if (adev->family >= AMDGPU_FAMILY_AI) 900 tmp |= (or_mask & and_mask); 901 else 902 tmp |= or_mask; 903 } 904 WREG32(reg, tmp); 905 } 906 } 907 908 /** 909 * amdgpu_device_pci_config_reset - reset the GPU 910 * 911 * @adev: amdgpu_device pointer 912 * 913 * Resets the GPU using the pci config reset sequence. 914 * Only applicable to asics prior to vega10. 915 */ 916 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 917 { 918 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 919 } 920 921 /** 922 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 923 * 924 * @adev: amdgpu_device pointer 925 * 926 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 927 */ 928 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 929 { 930 return pci_reset_function(adev->pdev); 931 } 932 933 /* 934 * GPU doorbell aperture helpers function. 935 */ 936 /** 937 * amdgpu_device_doorbell_init - Init doorbell driver information. 938 * 939 * @adev: amdgpu_device pointer 940 * 941 * Init doorbell driver information (CIK) 942 * Returns 0 on success, error on failure. 
943 */ 944 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) 945 { 946 947 /* No doorbell on SI hardware generation */ 948 if (adev->asic_type < CHIP_BONAIRE) { 949 adev->doorbell.base = 0; 950 adev->doorbell.size = 0; 951 adev->doorbell.num_doorbells = 0; 952 adev->doorbell.ptr = NULL; 953 return 0; 954 } 955 956 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) 957 return -EINVAL; 958 959 amdgpu_asic_init_doorbell_index(adev); 960 961 /* doorbell bar mapping */ 962 adev->doorbell.base = pci_resource_start(adev->pdev, 2); 963 adev->doorbell.size = pci_resource_len(adev->pdev, 2); 964 965 adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32), 966 adev->doorbell_index.max_assignment+1); 967 if (adev->doorbell.num_doorbells == 0) 968 return -EINVAL; 969 970 /* For Vega, reserve and map two pages on doorbell BAR since SDMA 971 * paging queue doorbell use the second page. The 972 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the 973 * doorbells are in the first page. So with paging queue enabled, 974 * the max num_doorbells should + 1 page (0x400 in dword) 975 */ 976 if (adev->asic_type >= CHIP_VEGA10) 977 adev->doorbell.num_doorbells += 0x400; 978 979 adev->doorbell.ptr = ioremap(adev->doorbell.base, 980 adev->doorbell.num_doorbells * 981 sizeof(u32)); 982 if (adev->doorbell.ptr == NULL) 983 return -ENOMEM; 984 985 return 0; 986 } 987 988 /** 989 * amdgpu_device_doorbell_fini - Tear down doorbell driver information. 990 * 991 * @adev: amdgpu_device pointer 992 * 993 * Tear down doorbell driver information (CIK) 994 */ 995 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) 996 { 997 iounmap(adev->doorbell.ptr); 998 adev->doorbell.ptr = NULL; 999 } 1000 1001 1002 1003 /* 1004 * amdgpu_device_wb_*() 1005 * Writeback is the method by which the GPU updates special pages in memory 1006 * with the status of certain GPU events (fences, ring pointers,etc.). 
1007 */ 1008 1009 /** 1010 * amdgpu_device_wb_fini - Disable Writeback and free memory 1011 * 1012 * @adev: amdgpu_device pointer 1013 * 1014 * Disables Writeback and frees the Writeback memory (all asics). 1015 * Used at driver shutdown. 1016 */ 1017 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1018 { 1019 if (adev->wb.wb_obj) { 1020 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1021 &adev->wb.gpu_addr, 1022 (void **)&adev->wb.wb); 1023 adev->wb.wb_obj = NULL; 1024 } 1025 } 1026 1027 /** 1028 * amdgpu_device_wb_init- Init Writeback driver info and allocate memory 1029 * 1030 * @adev: amdgpu_device pointer 1031 * 1032 * Initializes writeback and allocates writeback memory (all asics). 1033 * Used at driver startup. 1034 * Returns 0 on success or an -error on failure. 1035 */ 1036 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1037 { 1038 int r; 1039 1040 if (adev->wb.wb_obj == NULL) { 1041 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1042 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1043 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1044 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1045 (void **)&adev->wb.wb); 1046 if (r) { 1047 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1048 return r; 1049 } 1050 1051 adev->wb.num_wb = AMDGPU_MAX_WB; 1052 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1053 1054 /* clear wb memory */ 1055 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1056 } 1057 1058 return 0; 1059 } 1060 1061 /** 1062 * amdgpu_device_wb_get - Allocate a wb entry 1063 * 1064 * @adev: amdgpu_device pointer 1065 * @wb: wb index 1066 * 1067 * Allocate a wb slot for use by the driver (all asics). 1068 * Returns 0 on success or -EINVAL on failure. 
1069 */ 1070 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1071 { 1072 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1073 1074 if (offset < adev->wb.num_wb) { 1075 __set_bit(offset, adev->wb.used); 1076 *wb = offset << 3; /* convert to dw offset */ 1077 return 0; 1078 } else { 1079 return -EINVAL; 1080 } 1081 } 1082 1083 /** 1084 * amdgpu_device_wb_free - Free a wb entry 1085 * 1086 * @adev: amdgpu_device pointer 1087 * @wb: wb index 1088 * 1089 * Free a wb slot allocated for use by the driver (all asics) 1090 */ 1091 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1092 { 1093 wb >>= 3; 1094 if (wb < adev->wb.num_wb) 1095 __clear_bit(wb, adev->wb.used); 1096 } 1097 1098 /** 1099 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1100 * 1101 * @adev: amdgpu_device pointer 1102 * 1103 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1104 * to fail, but if any of the BARs is not accessible after the size we abort 1105 * driver loading by returning -ENODEV. 
1106 */ 1107 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1108 { 1109 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1110 struct pci_bus *root; 1111 struct resource *res; 1112 unsigned i; 1113 u16 cmd; 1114 int r; 1115 1116 /* Bypass for VF */ 1117 if (amdgpu_sriov_vf(adev)) 1118 return 0; 1119 1120 /* skip if the bios has already enabled large BAR */ 1121 if (adev->gmc.real_vram_size && 1122 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1123 return 0; 1124 1125 /* Check if the root BUS has 64bit memory resources */ 1126 root = adev->pdev->bus; 1127 while (root->parent) 1128 root = root->parent; 1129 1130 pci_bus_for_each_resource(root, res, i) { 1131 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1132 res->start > 0x100000000ull) 1133 break; 1134 } 1135 1136 /* Trying to resize is pointless without a root hub window above 4GB */ 1137 if (!res) 1138 return 0; 1139 1140 /* Limit the BAR size to what is available */ 1141 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1142 rbar_size); 1143 1144 /* Disable memory decoding while we change the BAR addresses and size */ 1145 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1146 pci_write_config_word(adev->pdev, PCI_COMMAND, 1147 cmd & ~PCI_COMMAND_MEMORY); 1148 1149 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1150 amdgpu_device_doorbell_fini(adev); 1151 if (adev->asic_type >= CHIP_BONAIRE) 1152 pci_release_resource(adev->pdev, 2); 1153 1154 pci_release_resource(adev->pdev, 0); 1155 1156 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1157 if (r == -ENOSPC) 1158 DRM_INFO("Not enough PCI address space for a large BAR."); 1159 else if (r && r != -ENOTSUPP) 1160 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1161 1162 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1163 1164 /* When the doorbell or fb BAR isn't available we have no chance of 1165 * using the device. 
1166 */ 1167 r = amdgpu_device_doorbell_init(adev); 1168 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1169 return -ENODEV; 1170 1171 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1172 1173 return 0; 1174 } 1175 1176 /* 1177 * GPU helpers function. 1178 */ 1179 /** 1180 * amdgpu_device_need_post - check if the hw need post or not 1181 * 1182 * @adev: amdgpu_device pointer 1183 * 1184 * Check if the asic has been initialized (all asics) at driver startup 1185 * or post is needed if hw reset is performed. 1186 * Returns true if need or false if not. 1187 */ 1188 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1189 { 1190 uint32_t reg; 1191 1192 if (amdgpu_sriov_vf(adev)) 1193 return false; 1194 1195 if (amdgpu_passthrough(adev)) { 1196 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1197 * some old smc fw still need driver do vPost otherwise gpu hang, while 1198 * those smc fw version above 22.15 doesn't have this flaw, so we force 1199 * vpost executed for smc version below 22.15 1200 */ 1201 if (adev->asic_type == CHIP_FIJI) { 1202 int err; 1203 uint32_t fw_ver; 1204 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1205 /* force vPost if error occured */ 1206 if (err) 1207 return true; 1208 1209 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1210 if (fw_ver < 0x00160e00) 1211 return true; 1212 } 1213 } 1214 1215 /* Don't post if we need to reset whole hive on init */ 1216 if (adev->gmc.xgmi.pending_reset) 1217 return false; 1218 1219 if (adev->has_hw_reset) { 1220 adev->has_hw_reset = false; 1221 return true; 1222 } 1223 1224 /* bios scratch used on CIK+ */ 1225 if (adev->asic_type >= CHIP_BONAIRE) 1226 return amdgpu_atombios_scratch_need_asic_init(adev); 1227 1228 /* check MEM_SIZE for older asics */ 1229 reg = amdgpu_asic_get_config_memsize(adev); 1230 1231 if ((reg != 0) && (reg != 0xffffffff)) 1232 return false; 1233 1234 return true; 1235 } 1236 1237 /* if we get 
transitioned to only one device, take VGA back */ 1238 /** 1239 * amdgpu_device_vga_set_decode - enable/disable vga decode 1240 * 1241 * @cookie: amdgpu_device pointer 1242 * @state: enable/disable vga decode 1243 * 1244 * Enable/disable vga decode (all asics). 1245 * Returns VGA resource flags. 1246 */ 1247 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state) 1248 { 1249 struct amdgpu_device *adev = cookie; 1250 amdgpu_asic_set_vga_state(adev, state); 1251 if (state) 1252 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1253 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1254 else 1255 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1256 } 1257 1258 /** 1259 * amdgpu_device_check_block_size - validate the vm block size 1260 * 1261 * @adev: amdgpu_device pointer 1262 * 1263 * Validates the vm block size specified via module parameter. 1264 * The vm block size defines number of bits in page table versus page directory, 1265 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1266 * page table and the remaining bits are in the page directory. 1267 */ 1268 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1269 { 1270 /* defines number of bits in page table versus page directory, 1271 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1272 * page table and the remaining bits are in the page directory */ 1273 if (amdgpu_vm_block_size == -1) 1274 return; 1275 1276 if (amdgpu_vm_block_size < 9) { 1277 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1278 amdgpu_vm_block_size); 1279 amdgpu_vm_block_size = -1; 1280 } 1281 } 1282 1283 /** 1284 * amdgpu_device_check_vm_size - validate the vm size 1285 * 1286 * @adev: amdgpu_device pointer 1287 * 1288 * Validates the vm size in GB specified via module parameter. 1289 * The VM size is the size of the GPU virtual memory space in GB. 
1290 */ 1291 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1292 { 1293 /* no need to check the default value */ 1294 if (amdgpu_vm_size == -1) 1295 return; 1296 1297 if (amdgpu_vm_size < 1) { 1298 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1299 amdgpu_vm_size); 1300 amdgpu_vm_size = -1; 1301 } 1302 } 1303 1304 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1305 { 1306 struct sysinfo si; 1307 bool is_os_64 = (sizeof(void *) == 8); 1308 uint64_t total_memory; 1309 uint64_t dram_size_seven_GB = 0x1B8000000; 1310 uint64_t dram_size_three_GB = 0xB8000000; 1311 1312 if (amdgpu_smu_memory_pool_size == 0) 1313 return; 1314 1315 if (!is_os_64) { 1316 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1317 goto def_value; 1318 } 1319 si_meminfo(&si); 1320 total_memory = (uint64_t)si.totalram * si.mem_unit; 1321 1322 if ((amdgpu_smu_memory_pool_size == 1) || 1323 (amdgpu_smu_memory_pool_size == 2)) { 1324 if (total_memory < dram_size_three_GB) 1325 goto def_value1; 1326 } else if ((amdgpu_smu_memory_pool_size == 4) || 1327 (amdgpu_smu_memory_pool_size == 8)) { 1328 if (total_memory < dram_size_seven_GB) 1329 goto def_value1; 1330 } else { 1331 DRM_WARN("Smu memory pool size not supported\n"); 1332 goto def_value; 1333 } 1334 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1335 1336 return; 1337 1338 def_value1: 1339 DRM_WARN("No enough system memory\n"); 1340 def_value: 1341 adev->pm.smu_prv_buffer_size = 0; 1342 } 1343 1344 /** 1345 * amdgpu_device_check_arguments - validate module params 1346 * 1347 * @adev: amdgpu_device pointer 1348 * 1349 * Validates certain module parameters and updates 1350 * the associated values used by the driver (all asics). 
1351 */ 1352 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1353 { 1354 if (amdgpu_sched_jobs < 4) { 1355 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1356 amdgpu_sched_jobs); 1357 amdgpu_sched_jobs = 4; 1358 } else if (!is_power_of_2(amdgpu_sched_jobs)){ 1359 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1360 amdgpu_sched_jobs); 1361 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1362 } 1363 1364 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1365 /* gart size must be greater or equal to 32M */ 1366 dev_warn(adev->dev, "gart size (%d) too small\n", 1367 amdgpu_gart_size); 1368 amdgpu_gart_size = -1; 1369 } 1370 1371 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1372 /* gtt size must be greater or equal to 32M */ 1373 dev_warn(adev->dev, "gtt size (%d) too small\n", 1374 amdgpu_gtt_size); 1375 amdgpu_gtt_size = -1; 1376 } 1377 1378 /* valid range is between 4 and 9 inclusive */ 1379 if (amdgpu_vm_fragment_size != -1 && 1380 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1381 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1382 amdgpu_vm_fragment_size = -1; 1383 } 1384 1385 if (amdgpu_sched_hw_submission < 2) { 1386 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1387 amdgpu_sched_hw_submission); 1388 amdgpu_sched_hw_submission = 2; 1389 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1390 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1391 amdgpu_sched_hw_submission); 1392 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1393 } 1394 1395 amdgpu_device_check_smu_prv_buffer_size(adev); 1396 1397 amdgpu_device_check_vm_size(adev); 1398 1399 amdgpu_device_check_block_size(adev); 1400 1401 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1402 1403 amdgpu_gmc_tmz_set(adev); 1404 1405 amdgpu_gmc_noretry_set(adev); 1406 1407 return 0; 1408 } 1409 
1410 /** 1411 * amdgpu_switcheroo_set_state - set switcheroo state 1412 * 1413 * @pdev: pci dev pointer 1414 * @state: vga_switcheroo state 1415 * 1416 * Callback for the switcheroo driver. Suspends or resumes the 1417 * the asics before or after it is powered up using ACPI methods. 1418 */ 1419 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1420 enum vga_switcheroo_state state) 1421 { 1422 struct drm_device *dev = pci_get_drvdata(pdev); 1423 int r; 1424 1425 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1426 return; 1427 1428 if (state == VGA_SWITCHEROO_ON) { 1429 pr_info("switched on\n"); 1430 /* don't suspend or resume card normally */ 1431 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1432 1433 pci_set_power_state(pdev, PCI_D0); 1434 amdgpu_device_load_pci_state(pdev); 1435 r = pci_enable_device(pdev); 1436 if (r) 1437 DRM_WARN("pci_enable_device failed (%d)\n", r); 1438 amdgpu_device_resume(dev, true); 1439 1440 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1441 } else { 1442 pr_info("switched off\n"); 1443 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1444 amdgpu_device_suspend(dev, true); 1445 amdgpu_device_cache_pci_state(pdev); 1446 /* Shut down the device */ 1447 pci_disable_device(pdev); 1448 pci_set_power_state(pdev, PCI_D3cold); 1449 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1450 } 1451 } 1452 1453 /** 1454 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1455 * 1456 * @pdev: pci dev pointer 1457 * 1458 * Callback for the switcheroo driver. Check of the switcheroo 1459 * state can be changed. 1460 * Returns true if the state can be changed, false if not. 1461 */ 1462 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1463 { 1464 struct drm_device *dev = pci_get_drvdata(pdev); 1465 1466 /* 1467 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1468 * locking inversion with the driver load path. 
And the access here is 1469 * completely racy anyway. So don't bother with locking for now. 1470 */ 1471 return atomic_read(&dev->open_count) == 0; 1472 } 1473 1474 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1475 .set_gpu_state = amdgpu_switcheroo_set_state, 1476 .reprobe = NULL, 1477 .can_switch = amdgpu_switcheroo_can_switch, 1478 }; 1479 1480 /** 1481 * amdgpu_device_ip_set_clockgating_state - set the CG state 1482 * 1483 * @dev: amdgpu_device pointer 1484 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1485 * @state: clockgating state (gate or ungate) 1486 * 1487 * Sets the requested clockgating state for all instances of 1488 * the hardware IP specified. 1489 * Returns the error code from the last instance. 1490 */ 1491 int amdgpu_device_ip_set_clockgating_state(void *dev, 1492 enum amd_ip_block_type block_type, 1493 enum amd_clockgating_state state) 1494 { 1495 struct amdgpu_device *adev = dev; 1496 int i, r = 0; 1497 1498 for (i = 0; i < adev->num_ip_blocks; i++) { 1499 if (!adev->ip_blocks[i].status.valid) 1500 continue; 1501 if (adev->ip_blocks[i].version->type != block_type) 1502 continue; 1503 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1504 continue; 1505 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1506 (void *)adev, state); 1507 if (r) 1508 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1509 adev->ip_blocks[i].version->funcs->name, r); 1510 } 1511 return r; 1512 } 1513 1514 /** 1515 * amdgpu_device_ip_set_powergating_state - set the PG state 1516 * 1517 * @dev: amdgpu_device pointer 1518 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1519 * @state: powergating state (gate or ungate) 1520 * 1521 * Sets the requested powergating state for all instances of 1522 * the hardware IP specified. 1523 * Returns the error code from the last instance. 
1524 */ 1525 int amdgpu_device_ip_set_powergating_state(void *dev, 1526 enum amd_ip_block_type block_type, 1527 enum amd_powergating_state state) 1528 { 1529 struct amdgpu_device *adev = dev; 1530 int i, r = 0; 1531 1532 for (i = 0; i < adev->num_ip_blocks; i++) { 1533 if (!adev->ip_blocks[i].status.valid) 1534 continue; 1535 if (adev->ip_blocks[i].version->type != block_type) 1536 continue; 1537 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 1538 continue; 1539 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 1540 (void *)adev, state); 1541 if (r) 1542 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 1543 adev->ip_blocks[i].version->funcs->name, r); 1544 } 1545 return r; 1546 } 1547 1548 /** 1549 * amdgpu_device_ip_get_clockgating_state - get the CG state 1550 * 1551 * @adev: amdgpu_device pointer 1552 * @flags: clockgating feature flags 1553 * 1554 * Walks the list of IPs on the device and updates the clockgating 1555 * flags for each IP. 1556 * Updates @flags with the feature flags for each hardware IP where 1557 * clockgating is enabled. 1558 */ 1559 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 1560 u32 *flags) 1561 { 1562 int i; 1563 1564 for (i = 0; i < adev->num_ip_blocks; i++) { 1565 if (!adev->ip_blocks[i].status.valid) 1566 continue; 1567 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 1568 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 1569 } 1570 } 1571 1572 /** 1573 * amdgpu_device_ip_wait_for_idle - wait for idle 1574 * 1575 * @adev: amdgpu_device pointer 1576 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1577 * 1578 * Waits for the request hardware IP to be idle. 1579 * Returns 0 for success or a negative error code on failure. 
1580 */ 1581 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 1582 enum amd_ip_block_type block_type) 1583 { 1584 int i, r; 1585 1586 for (i = 0; i < adev->num_ip_blocks; i++) { 1587 if (!adev->ip_blocks[i].status.valid) 1588 continue; 1589 if (adev->ip_blocks[i].version->type == block_type) { 1590 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 1591 if (r) 1592 return r; 1593 break; 1594 } 1595 } 1596 return 0; 1597 1598 } 1599 1600 /** 1601 * amdgpu_device_ip_is_idle - is the hardware IP idle 1602 * 1603 * @adev: amdgpu_device pointer 1604 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1605 * 1606 * Check if the hardware IP is idle or not. 1607 * Returns true if it the IP is idle, false if not. 1608 */ 1609 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 1610 enum amd_ip_block_type block_type) 1611 { 1612 int i; 1613 1614 for (i = 0; i < adev->num_ip_blocks; i++) { 1615 if (!adev->ip_blocks[i].status.valid) 1616 continue; 1617 if (adev->ip_blocks[i].version->type == block_type) 1618 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 1619 } 1620 return true; 1621 1622 } 1623 1624 /** 1625 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 1626 * 1627 * @adev: amdgpu_device pointer 1628 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 1629 * 1630 * Returns a pointer to the hardware IP block structure 1631 * if it exists for the asic, otherwise NULL. 
1632 */ 1633 struct amdgpu_ip_block * 1634 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 1635 enum amd_ip_block_type type) 1636 { 1637 int i; 1638 1639 for (i = 0; i < adev->num_ip_blocks; i++) 1640 if (adev->ip_blocks[i].version->type == type) 1641 return &adev->ip_blocks[i]; 1642 1643 return NULL; 1644 } 1645 1646 /** 1647 * amdgpu_device_ip_block_version_cmp 1648 * 1649 * @adev: amdgpu_device pointer 1650 * @type: enum amd_ip_block_type 1651 * @major: major version 1652 * @minor: minor version 1653 * 1654 * return 0 if equal or greater 1655 * return 1 if smaller or the ip_block doesn't exist 1656 */ 1657 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 1658 enum amd_ip_block_type type, 1659 u32 major, u32 minor) 1660 { 1661 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 1662 1663 if (ip_block && ((ip_block->version->major > major) || 1664 ((ip_block->version->major == major) && 1665 (ip_block->version->minor >= minor)))) 1666 return 0; 1667 1668 return 1; 1669 } 1670 1671 /** 1672 * amdgpu_device_ip_block_add 1673 * 1674 * @adev: amdgpu_device pointer 1675 * @ip_block_version: pointer to the IP to add 1676 * 1677 * Adds the IP block driver information to the collection of IPs 1678 * on the asic. 
1679 */ 1680 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 1681 const struct amdgpu_ip_block_version *ip_block_version) 1682 { 1683 if (!ip_block_version) 1684 return -EINVAL; 1685 1686 switch (ip_block_version->type) { 1687 case AMD_IP_BLOCK_TYPE_VCN: 1688 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 1689 return 0; 1690 break; 1691 case AMD_IP_BLOCK_TYPE_JPEG: 1692 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 1693 return 0; 1694 break; 1695 default: 1696 break; 1697 } 1698 1699 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 1700 ip_block_version->funcs->name); 1701 1702 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 1703 1704 return 0; 1705 } 1706 1707 /** 1708 * amdgpu_device_enable_virtual_display - enable virtual display feature 1709 * 1710 * @adev: amdgpu_device pointer 1711 * 1712 * Enabled the virtual display feature if the user has enabled it via 1713 * the module parameter virtual_display. This feature provides a virtual 1714 * display hardware on headless boards or in virtualized environments. 1715 * This function parses and validates the configuration string specified by 1716 * the user and configues the virtual display configuration (number of 1717 * virtual connectors, crtcs, etc.) specified. 
1718 */ 1719 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 1720 { 1721 adev->enable_virtual_display = false; 1722 1723 if (amdgpu_virtual_display) { 1724 const char *pci_address_name = pci_name(adev->pdev); 1725 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 1726 1727 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 1728 pciaddstr_tmp = pciaddstr; 1729 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 1730 pciaddname = strsep(&pciaddname_tmp, ","); 1731 if (!strcmp("all", pciaddname) 1732 || !strcmp(pci_address_name, pciaddname)) { 1733 long num_crtc; 1734 int res = -1; 1735 1736 adev->enable_virtual_display = true; 1737 1738 if (pciaddname_tmp) 1739 res = kstrtol(pciaddname_tmp, 10, 1740 &num_crtc); 1741 1742 if (!res) { 1743 if (num_crtc < 1) 1744 num_crtc = 1; 1745 if (num_crtc > 6) 1746 num_crtc = 6; 1747 adev->mode_info.num_crtc = num_crtc; 1748 } else { 1749 adev->mode_info.num_crtc = 1; 1750 } 1751 break; 1752 } 1753 } 1754 1755 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 1756 amdgpu_virtual_display, pci_address_name, 1757 adev->enable_virtual_display, adev->mode_info.num_crtc); 1758 1759 kfree(pciaddstr); 1760 } 1761 } 1762 1763 /** 1764 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 1765 * 1766 * @adev: amdgpu_device pointer 1767 * 1768 * Parses the asic configuration parameters specified in the gpu info 1769 * firmware and makes them availale to the driver for use in configuring 1770 * the asic. 1771 * Returns 0 on success, -EINVAL on failure. 
1772 */ 1773 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 1774 { 1775 const char *chip_name; 1776 char fw_name[40]; 1777 int err; 1778 const struct gpu_info_firmware_header_v1_0 *hdr; 1779 1780 adev->firmware.gpu_info_fw = NULL; 1781 1782 if (adev->mman.discovery_bin) { 1783 amdgpu_discovery_get_gfx_info(adev); 1784 1785 /* 1786 * FIXME: The bounding box is still needed by Navi12, so 1787 * temporarily read it from gpu_info firmware. Should be droped 1788 * when DAL no longer needs it. 1789 */ 1790 if (adev->asic_type != CHIP_NAVI12) 1791 return 0; 1792 } 1793 1794 switch (adev->asic_type) { 1795 #ifdef CONFIG_DRM_AMDGPU_SI 1796 case CHIP_VERDE: 1797 case CHIP_TAHITI: 1798 case CHIP_PITCAIRN: 1799 case CHIP_OLAND: 1800 case CHIP_HAINAN: 1801 #endif 1802 #ifdef CONFIG_DRM_AMDGPU_CIK 1803 case CHIP_BONAIRE: 1804 case CHIP_HAWAII: 1805 case CHIP_KAVERI: 1806 case CHIP_KABINI: 1807 case CHIP_MULLINS: 1808 #endif 1809 case CHIP_TOPAZ: 1810 case CHIP_TONGA: 1811 case CHIP_FIJI: 1812 case CHIP_POLARIS10: 1813 case CHIP_POLARIS11: 1814 case CHIP_POLARIS12: 1815 case CHIP_VEGAM: 1816 case CHIP_CARRIZO: 1817 case CHIP_STONEY: 1818 case CHIP_VEGA20: 1819 case CHIP_ALDEBARAN: 1820 case CHIP_SIENNA_CICHLID: 1821 case CHIP_NAVY_FLOUNDER: 1822 case CHIP_DIMGREY_CAVEFISH: 1823 default: 1824 return 0; 1825 case CHIP_VEGA10: 1826 chip_name = "vega10"; 1827 break; 1828 case CHIP_VEGA12: 1829 chip_name = "vega12"; 1830 break; 1831 case CHIP_RAVEN: 1832 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1833 chip_name = "raven2"; 1834 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1835 chip_name = "picasso"; 1836 else 1837 chip_name = "raven"; 1838 break; 1839 case CHIP_ARCTURUS: 1840 chip_name = "arcturus"; 1841 break; 1842 case CHIP_RENOIR: 1843 if (adev->apu_flags & AMD_APU_IS_RENOIR) 1844 chip_name = "renoir"; 1845 else 1846 chip_name = "green_sardine"; 1847 break; 1848 case CHIP_NAVI10: 1849 chip_name = "navi10"; 1850 break; 1851 case CHIP_NAVI14: 1852 chip_name = 
"navi14"; 1853 break; 1854 case CHIP_NAVI12: 1855 chip_name = "navi12"; 1856 break; 1857 case CHIP_VANGOGH: 1858 chip_name = "vangogh"; 1859 break; 1860 } 1861 1862 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 1863 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); 1864 if (err) { 1865 dev_err(adev->dev, 1866 "Failed to load gpu_info firmware \"%s\"\n", 1867 fw_name); 1868 goto out; 1869 } 1870 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); 1871 if (err) { 1872 dev_err(adev->dev, 1873 "Failed to validate gpu_info firmware \"%s\"\n", 1874 fw_name); 1875 goto out; 1876 } 1877 1878 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 1879 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 1880 1881 switch (hdr->version_major) { 1882 case 1: 1883 { 1884 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 1885 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 1886 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1887 1888 /* 1889 * Should be droped when DAL no longer needs it. 
1890 */ 1891 if (adev->asic_type == CHIP_NAVI12) 1892 goto parse_soc_bounding_box; 1893 1894 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 1895 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 1896 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 1897 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 1898 adev->gfx.config.max_texture_channel_caches = 1899 le32_to_cpu(gpu_info_fw->gc_num_tccs); 1900 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 1901 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 1902 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 1903 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 1904 adev->gfx.config.double_offchip_lds_buf = 1905 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 1906 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 1907 adev->gfx.cu_info.max_waves_per_simd = 1908 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 1909 adev->gfx.cu_info.max_scratch_slots_per_cu = 1910 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 1911 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 1912 if (hdr->version_minor >= 1) { 1913 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 1914 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 1915 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1916 adev->gfx.config.num_sc_per_sh = 1917 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 1918 adev->gfx.config.num_packer_per_sc = 1919 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 1920 } 1921 1922 parse_soc_bounding_box: 1923 /* 1924 * soc bounding box info is not integrated in disocovery table, 1925 * we always need to parse it from gpu info firmware if needed. 
1926 */ 1927 if (hdr->version_minor == 2) { 1928 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 1929 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 1930 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1931 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 1932 } 1933 break; 1934 } 1935 default: 1936 dev_err(adev->dev, 1937 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 1938 err = -EINVAL; 1939 goto out; 1940 } 1941 out: 1942 return err; 1943 } 1944 1945 /** 1946 * amdgpu_device_ip_early_init - run early init for hardware IPs 1947 * 1948 * @adev: amdgpu_device pointer 1949 * 1950 * Early initialization pass for hardware IPs. The hardware IPs that make 1951 * up each asic are discovered each IP's early_init callback is run. This 1952 * is the first stage in initializing the asic. 1953 * Returns 0 on success, negative error code on failure. 1954 */ 1955 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 1956 { 1957 int i, r; 1958 1959 amdgpu_device_enable_virtual_display(adev); 1960 1961 if (amdgpu_sriov_vf(adev)) { 1962 r = amdgpu_virt_request_full_gpu(adev, true); 1963 if (r) 1964 return r; 1965 } 1966 1967 switch (adev->asic_type) { 1968 #ifdef CONFIG_DRM_AMDGPU_SI 1969 case CHIP_VERDE: 1970 case CHIP_TAHITI: 1971 case CHIP_PITCAIRN: 1972 case CHIP_OLAND: 1973 case CHIP_HAINAN: 1974 adev->family = AMDGPU_FAMILY_SI; 1975 r = si_set_ip_blocks(adev); 1976 if (r) 1977 return r; 1978 break; 1979 #endif 1980 #ifdef CONFIG_DRM_AMDGPU_CIK 1981 case CHIP_BONAIRE: 1982 case CHIP_HAWAII: 1983 case CHIP_KAVERI: 1984 case CHIP_KABINI: 1985 case CHIP_MULLINS: 1986 if (adev->flags & AMD_IS_APU) 1987 adev->family = AMDGPU_FAMILY_KV; 1988 else 1989 adev->family = AMDGPU_FAMILY_CI; 1990 1991 r = cik_set_ip_blocks(adev); 1992 if (r) 1993 return r; 1994 break; 1995 #endif 1996 case CHIP_TOPAZ: 1997 case CHIP_TONGA: 1998 case CHIP_FIJI: 1999 case CHIP_POLARIS10: 2000 case CHIP_POLARIS11: 2001 case 
CHIP_POLARIS12: 2002 case CHIP_VEGAM: 2003 case CHIP_CARRIZO: 2004 case CHIP_STONEY: 2005 if (adev->flags & AMD_IS_APU) 2006 adev->family = AMDGPU_FAMILY_CZ; 2007 else 2008 adev->family = AMDGPU_FAMILY_VI; 2009 2010 r = vi_set_ip_blocks(adev); 2011 if (r) 2012 return r; 2013 break; 2014 case CHIP_VEGA10: 2015 case CHIP_VEGA12: 2016 case CHIP_VEGA20: 2017 case CHIP_RAVEN: 2018 case CHIP_ARCTURUS: 2019 case CHIP_RENOIR: 2020 case CHIP_ALDEBARAN: 2021 if (adev->flags & AMD_IS_APU) 2022 adev->family = AMDGPU_FAMILY_RV; 2023 else 2024 adev->family = AMDGPU_FAMILY_AI; 2025 2026 r = soc15_set_ip_blocks(adev); 2027 if (r) 2028 return r; 2029 break; 2030 case CHIP_NAVI10: 2031 case CHIP_NAVI14: 2032 case CHIP_NAVI12: 2033 case CHIP_SIENNA_CICHLID: 2034 case CHIP_NAVY_FLOUNDER: 2035 case CHIP_DIMGREY_CAVEFISH: 2036 case CHIP_VANGOGH: 2037 if (adev->asic_type == CHIP_VANGOGH) 2038 adev->family = AMDGPU_FAMILY_VGH; 2039 else 2040 adev->family = AMDGPU_FAMILY_NV; 2041 2042 r = nv_set_ip_blocks(adev); 2043 if (r) 2044 return r; 2045 break; 2046 default: 2047 /* FIXME: not supported yet */ 2048 return -EINVAL; 2049 } 2050 2051 amdgpu_amdkfd_device_probe(adev); 2052 2053 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2054 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2055 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2056 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2057 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2058 2059 for (i = 0; i < adev->num_ip_blocks; i++) { 2060 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2061 DRM_ERROR("disabled ip block: %d <%s>\n", 2062 i, adev->ip_blocks[i].version->funcs->name); 2063 adev->ip_blocks[i].status.valid = false; 2064 } else { 2065 if (adev->ip_blocks[i].version->funcs->early_init) { 2066 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2067 if (r == -ENOENT) { 2068 adev->ip_blocks[i].status.valid = false; 2069 } else if (r) { 2070 DRM_ERROR("early_init of IP block <%s> failed 
%d\n", 2071 adev->ip_blocks[i].version->funcs->name, r); 2072 return r; 2073 } else { 2074 adev->ip_blocks[i].status.valid = true; 2075 } 2076 } else { 2077 adev->ip_blocks[i].status.valid = true; 2078 } 2079 } 2080 /* get the vbios after the asic_funcs are set up */ 2081 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2082 r = amdgpu_device_parse_gpu_info_fw(adev); 2083 if (r) 2084 return r; 2085 2086 /* Read BIOS */ 2087 if (!amdgpu_get_bios(adev)) 2088 return -EINVAL; 2089 2090 r = amdgpu_atombios_init(adev); 2091 if (r) { 2092 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2093 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2094 return r; 2095 } 2096 2097 /*get pf2vf msg info at it's earliest time*/ 2098 if (amdgpu_sriov_vf(adev)) 2099 amdgpu_virt_init_data_exchange(adev); 2100 2101 } 2102 } 2103 2104 adev->cg_flags &= amdgpu_cg_mask; 2105 adev->pg_flags &= amdgpu_pg_mask; 2106 2107 return 0; 2108 } 2109 2110 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2111 { 2112 int i, r; 2113 2114 for (i = 0; i < adev->num_ip_blocks; i++) { 2115 if (!adev->ip_blocks[i].status.sw) 2116 continue; 2117 if (adev->ip_blocks[i].status.hw) 2118 continue; 2119 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2120 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2121 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2122 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2123 if (r) { 2124 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2125 adev->ip_blocks[i].version->funcs->name, r); 2126 return r; 2127 } 2128 adev->ip_blocks[i].status.hw = true; 2129 } 2130 } 2131 2132 return 0; 2133 } 2134 2135 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2136 { 2137 int i, r; 2138 2139 for (i = 0; i < adev->num_ip_blocks; i++) { 2140 if (!adev->ip_blocks[i].status.sw) 2141 continue; 2142 if (adev->ip_blocks[i].status.hw) 
2143 continue; 2144 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2145 if (r) { 2146 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2147 adev->ip_blocks[i].version->funcs->name, r); 2148 return r; 2149 } 2150 adev->ip_blocks[i].status.hw = true; 2151 } 2152 2153 return 0; 2154 } 2155 2156 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2157 { 2158 int r = 0; 2159 int i; 2160 uint32_t smu_version; 2161 2162 if (adev->asic_type >= CHIP_VEGA10) { 2163 for (i = 0; i < adev->num_ip_blocks; i++) { 2164 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2165 continue; 2166 2167 if (!adev->ip_blocks[i].status.sw) 2168 continue; 2169 2170 /* no need to do the fw loading again if already done*/ 2171 if (adev->ip_blocks[i].status.hw == true) 2172 break; 2173 2174 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2175 r = adev->ip_blocks[i].version->funcs->resume(adev); 2176 if (r) { 2177 DRM_ERROR("resume of IP block <%s> failed %d\n", 2178 adev->ip_blocks[i].version->funcs->name, r); 2179 return r; 2180 } 2181 } else { 2182 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2183 if (r) { 2184 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2185 adev->ip_blocks[i].version->funcs->name, r); 2186 return r; 2187 } 2188 } 2189 2190 adev->ip_blocks[i].status.hw = true; 2191 break; 2192 } 2193 } 2194 2195 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2196 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2197 2198 return r; 2199 } 2200 2201 /** 2202 * amdgpu_device_ip_init - run init for hardware IPs 2203 * 2204 * @adev: amdgpu_device pointer 2205 * 2206 * Main initialization pass for hardware IPs. The list of all the hardware 2207 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2208 * are run. sw_init initializes the software state associated with each IP 2209 * and hw_init initializes the hardware associated with each IP. 2210 * Returns 0 on success, negative error code on failure. 
2211 */ 2212 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2213 { 2214 int i, r; 2215 2216 r = amdgpu_ras_init(adev); 2217 if (r) 2218 return r; 2219 2220 for (i = 0; i < adev->num_ip_blocks; i++) { 2221 if (!adev->ip_blocks[i].status.valid) 2222 continue; 2223 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2224 if (r) { 2225 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2226 adev->ip_blocks[i].version->funcs->name, r); 2227 goto init_failed; 2228 } 2229 adev->ip_blocks[i].status.sw = true; 2230 2231 /* need to do gmc hw init early so we can allocate gpu mem */ 2232 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2233 r = amdgpu_device_vram_scratch_init(adev); 2234 if (r) { 2235 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2236 goto init_failed; 2237 } 2238 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2239 if (r) { 2240 DRM_ERROR("hw_init %d failed %d\n", i, r); 2241 goto init_failed; 2242 } 2243 r = amdgpu_device_wb_init(adev); 2244 if (r) { 2245 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2246 goto init_failed; 2247 } 2248 adev->ip_blocks[i].status.hw = true; 2249 2250 /* right after GMC hw init, we create CSA */ 2251 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2252 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2253 AMDGPU_GEM_DOMAIN_VRAM, 2254 AMDGPU_CSA_SIZE); 2255 if (r) { 2256 DRM_ERROR("allocate CSA failed %d\n", r); 2257 goto init_failed; 2258 } 2259 } 2260 } 2261 } 2262 2263 if (amdgpu_sriov_vf(adev)) 2264 amdgpu_virt_init_data_exchange(adev); 2265 2266 r = amdgpu_ib_pool_init(adev); 2267 if (r) { 2268 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2269 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2270 goto init_failed; 2271 } 2272 2273 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2274 if (r) 2275 goto init_failed; 2276 2277 r = amdgpu_device_ip_hw_init_phase1(adev); 2278 if (r) 2279 goto init_failed; 
2280 2281 r = amdgpu_device_fw_loading(adev); 2282 if (r) 2283 goto init_failed; 2284 2285 r = amdgpu_device_ip_hw_init_phase2(adev); 2286 if (r) 2287 goto init_failed; 2288 2289 /* 2290 * retired pages will be loaded from eeprom and reserved here, 2291 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2292 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2293 * for I2C communication which only true at this point. 2294 * 2295 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2296 * failure from bad gpu situation and stop amdgpu init process 2297 * accordingly. For other failed cases, it will still release all 2298 * the resource and print error message, rather than returning one 2299 * negative value to upper level. 2300 * 2301 * Note: theoretically, this should be called before all vram allocations 2302 * to protect retired page from abusing 2303 */ 2304 r = amdgpu_ras_recovery_init(adev); 2305 if (r) 2306 goto init_failed; 2307 2308 if (adev->gmc.xgmi.num_physical_nodes > 1) 2309 amdgpu_xgmi_add_device(adev); 2310 2311 /* Don't init kfd if whole hive need to be reset during init */ 2312 if (!adev->gmc.xgmi.pending_reset) 2313 amdgpu_amdkfd_device_init(adev); 2314 2315 amdgpu_fru_get_product_info(adev); 2316 2317 init_failed: 2318 if (amdgpu_sriov_vf(adev)) 2319 amdgpu_virt_release_full_gpu(adev, true); 2320 2321 return r; 2322 } 2323 2324 /** 2325 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2326 * 2327 * @adev: amdgpu_device pointer 2328 * 2329 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2330 * this function before a GPU reset. If the value is retained after a 2331 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 
2332 */ 2333 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2334 { 2335 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2336 } 2337 2338 /** 2339 * amdgpu_device_check_vram_lost - check if vram is valid 2340 * 2341 * @adev: amdgpu_device pointer 2342 * 2343 * Checks the reset magic value written to the gart pointer in VRAM. 2344 * The driver calls this after a GPU reset to see if the contents of 2345 * VRAM is lost or now. 2346 * returns true if vram is lost, false if not. 2347 */ 2348 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2349 { 2350 if (memcmp(adev->gart.ptr, adev->reset_magic, 2351 AMDGPU_RESET_MAGIC_NUM)) 2352 return true; 2353 2354 if (!amdgpu_in_reset(adev)) 2355 return false; 2356 2357 /* 2358 * For all ASICs with baco/mode1 reset, the VRAM is 2359 * always assumed to be lost. 2360 */ 2361 switch (amdgpu_asic_reset_method(adev)) { 2362 case AMD_RESET_METHOD_BACO: 2363 case AMD_RESET_METHOD_MODE1: 2364 return true; 2365 default: 2366 return false; 2367 } 2368 } 2369 2370 /** 2371 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2372 * 2373 * @adev: amdgpu_device pointer 2374 * @state: clockgating state (gate or ungate) 2375 * 2376 * The list of all the hardware IPs that make up the asic is walked and the 2377 * set_clockgating_state callbacks are run. 2378 * Late initialization pass enabling clockgating for hardware IPs. 2379 * Fini or suspend, pass disabling clockgating for hardware IPs. 2380 * Returns 0 on success, negative error code on failure. 2381 */ 2382 2383 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2384 enum amd_clockgating_state state) 2385 { 2386 int i, j, r; 2387 2388 if (amdgpu_emu_mode == 1) 2389 return 0; 2390 2391 for (j = 0; j < adev->num_ip_blocks; j++) { 2392 i = state == AMD_CG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1; 2393 if (!adev->ip_blocks[i].status.late_initialized) 2394 continue; 2395 /* skip CG for GFX on S0ix */ 2396 if (adev->in_s0ix && 2397 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2398 continue; 2399 /* skip CG for VCE/UVD, it's handled specially */ 2400 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2401 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2402 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2403 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2404 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2405 /* enable clockgating to save power */ 2406 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2407 state); 2408 if (r) { 2409 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2410 adev->ip_blocks[i].version->funcs->name, r); 2411 return r; 2412 } 2413 } 2414 } 2415 2416 return 0; 2417 } 2418 2419 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2420 enum amd_powergating_state state) 2421 { 2422 int i, j, r; 2423 2424 if (amdgpu_emu_mode == 1) 2425 return 0; 2426 2427 for (j = 0; j < adev->num_ip_blocks; j++) { 2428 i = state == AMD_PG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1; 2429 if (!adev->ip_blocks[i].status.late_initialized) 2430 continue; 2431 /* skip PG for GFX on S0ix */ 2432 if (adev->in_s0ix && 2433 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2434 continue; 2435 /* skip CG for VCE/UVD, it's handled specially */ 2436 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2437 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2438 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2439 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2440 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2441 /* enable powergating to save power */ 2442 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2443 state); 2444 if (r) { 2445 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2446 adev->ip_blocks[i].version->funcs->name, r); 2447 return r; 2448 } 2449 } 2450 } 2451 return 0; 2452 } 2453 2454 static int amdgpu_device_enable_mgpu_fan_boost(void) 2455 { 2456 struct amdgpu_gpu_instance *gpu_ins; 2457 struct amdgpu_device *adev; 2458 int i, ret = 0; 2459 2460 mutex_lock(&mgpu_info.mutex); 2461 2462 /* 2463 * MGPU fan boost feature should be enabled 2464 * only when there are two or more dGPUs in 2465 * the system 2466 */ 2467 if (mgpu_info.num_dgpu < 2) 2468 goto out; 2469 2470 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2471 gpu_ins = &(mgpu_info.gpu_ins[i]); 2472 adev = gpu_ins->adev; 2473 if (!(adev->flags & AMD_IS_APU) && 2474 !gpu_ins->mgpu_fan_enabled) { 2475 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2476 if (ret) 2477 break; 2478 2479 gpu_ins->mgpu_fan_enabled = 1; 2480 } 2481 } 2482 2483 out: 2484 mutex_unlock(&mgpu_info.mutex); 2485 2486 return ret; 2487 } 2488 2489 /** 2490 * amdgpu_device_ip_late_init - run late init for hardware IPs 2491 * 2492 * @adev: amdgpu_device pointer 2493 * 2494 * Late initialization pass for hardware IPs. 
The list of all the hardware 2495 * IPs that make up the asic is walked and the late_init callbacks are run. 2496 * late_init covers any special initialization that an IP requires 2497 * after all of the have been initialized or something that needs to happen 2498 * late in the init process. 2499 * Returns 0 on success, negative error code on failure. 2500 */ 2501 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2502 { 2503 struct amdgpu_gpu_instance *gpu_instance; 2504 int i = 0, r; 2505 2506 for (i = 0; i < adev->num_ip_blocks; i++) { 2507 if (!adev->ip_blocks[i].status.hw) 2508 continue; 2509 if (adev->ip_blocks[i].version->funcs->late_init) { 2510 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2511 if (r) { 2512 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2513 adev->ip_blocks[i].version->funcs->name, r); 2514 return r; 2515 } 2516 } 2517 adev->ip_blocks[i].status.late_initialized = true; 2518 } 2519 2520 amdgpu_ras_set_error_query_ready(adev, true); 2521 2522 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2523 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2524 2525 amdgpu_device_fill_reset_magic(adev); 2526 2527 r = amdgpu_device_enable_mgpu_fan_boost(); 2528 if (r) 2529 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2530 2531 /* For XGMI + passthrough configuration on arcturus, enable light SBR */ 2532 if (adev->asic_type == CHIP_ARCTURUS && 2533 amdgpu_passthrough(adev) && 2534 adev->gmc.xgmi.num_physical_nodes > 1) 2535 smu_set_light_sbr(&adev->smu, true); 2536 2537 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2538 mutex_lock(&mgpu_info.mutex); 2539 2540 /* 2541 * Reset device p-state to low as this was booted with high. 2542 * 2543 * This should be performed only after all devices from the same 2544 * hive get initialized. 2545 * 2546 * However, it's unknown how many device in the hive in advance. 2547 * As this is counted one by one during devices initializations. 
2548 * 2549 * So, we wait for all XGMI interlinked devices initialized. 2550 * This may bring some delays as those devices may come from 2551 * different hives. But that should be OK. 2552 */ 2553 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2554 for (i = 0; i < mgpu_info.num_gpu; i++) { 2555 gpu_instance = &(mgpu_info.gpu_ins[i]); 2556 if (gpu_instance->adev->flags & AMD_IS_APU) 2557 continue; 2558 2559 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2560 AMDGPU_XGMI_PSTATE_MIN); 2561 if (r) { 2562 DRM_ERROR("pstate setting failed (%d).\n", r); 2563 break; 2564 } 2565 } 2566 } 2567 2568 mutex_unlock(&mgpu_info.mutex); 2569 } 2570 2571 return 0; 2572 } 2573 2574 /** 2575 * amdgpu_device_ip_fini - run fini for hardware IPs 2576 * 2577 * @adev: amdgpu_device pointer 2578 * 2579 * Main teardown pass for hardware IPs. The list of all the hardware 2580 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2581 * are run. hw_fini tears down the hardware associated with each IP 2582 * and sw_fini tears down any software state associated with each IP. 2583 * Returns 0 on success, negative error code on failure. 
2584 */ 2585 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2586 { 2587 int i, r; 2588 2589 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2590 amdgpu_virt_release_ras_err_handler_data(adev); 2591 2592 amdgpu_ras_pre_fini(adev); 2593 2594 if (adev->gmc.xgmi.num_physical_nodes > 1) 2595 amdgpu_xgmi_remove_device(adev); 2596 2597 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2598 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2599 2600 amdgpu_amdkfd_device_fini(adev); 2601 2602 /* need to disable SMC first */ 2603 for (i = 0; i < adev->num_ip_blocks; i++) { 2604 if (!adev->ip_blocks[i].status.hw) 2605 continue; 2606 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2607 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2608 /* XXX handle errors */ 2609 if (r) { 2610 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2611 adev->ip_blocks[i].version->funcs->name, r); 2612 } 2613 adev->ip_blocks[i].status.hw = false; 2614 break; 2615 } 2616 } 2617 2618 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2619 if (!adev->ip_blocks[i].status.hw) 2620 continue; 2621 2622 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2623 /* XXX handle errors */ 2624 if (r) { 2625 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2626 adev->ip_blocks[i].version->funcs->name, r); 2627 } 2628 2629 adev->ip_blocks[i].status.hw = false; 2630 } 2631 2632 2633 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2634 if (!adev->ip_blocks[i].status.sw) 2635 continue; 2636 2637 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2638 amdgpu_ucode_free_bo(adev); 2639 amdgpu_free_static_csa(&adev->virt.csa_obj); 2640 amdgpu_device_wb_fini(adev); 2641 amdgpu_device_vram_scratch_fini(adev); 2642 amdgpu_ib_pool_fini(adev); 2643 } 2644 2645 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2646 /* XXX handle errors */ 2647 if (r) { 2648 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2649 
adev->ip_blocks[i].version->funcs->name, r); 2650 } 2651 adev->ip_blocks[i].status.sw = false; 2652 adev->ip_blocks[i].status.valid = false; 2653 } 2654 2655 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2656 if (!adev->ip_blocks[i].status.late_initialized) 2657 continue; 2658 if (adev->ip_blocks[i].version->funcs->late_fini) 2659 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2660 adev->ip_blocks[i].status.late_initialized = false; 2661 } 2662 2663 amdgpu_ras_fini(adev); 2664 2665 if (amdgpu_sriov_vf(adev)) 2666 if (amdgpu_virt_release_full_gpu(adev, false)) 2667 DRM_ERROR("failed to release exclusive mode on fini\n"); 2668 2669 return 0; 2670 } 2671 2672 /** 2673 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2674 * 2675 * @work: work_struct. 2676 */ 2677 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2678 { 2679 struct amdgpu_device *adev = 2680 container_of(work, struct amdgpu_device, delayed_init_work.work); 2681 int r; 2682 2683 r = amdgpu_ib_ring_tests(adev); 2684 if (r) 2685 DRM_ERROR("ib ring test failed (%d).\n", r); 2686 } 2687 2688 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2689 { 2690 struct amdgpu_device *adev = 2691 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2692 2693 mutex_lock(&adev->gfx.gfx_off_mutex); 2694 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { 2695 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2696 adev->gfx.gfx_off_state = true; 2697 } 2698 mutex_unlock(&adev->gfx.gfx_off_mutex); 2699 } 2700 2701 /** 2702 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2703 * 2704 * @adev: amdgpu_device pointer 2705 * 2706 * Main suspend function for hardware IPs. The list of all the hardware 2707 * IPs that make up the asic is walked, clockgating is disabled and the 2708 * suspend callbacks are run. 
suspend puts the hardware and software state 2709 * in each IP into a state suitable for suspend. 2710 * Returns 0 on success, negative error code on failure. 2711 */ 2712 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2713 { 2714 int i, r; 2715 2716 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2717 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2718 2719 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2720 if (!adev->ip_blocks[i].status.valid) 2721 continue; 2722 2723 /* displays are handled separately */ 2724 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2725 continue; 2726 2727 /* XXX handle errors */ 2728 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2729 /* XXX handle errors */ 2730 if (r) { 2731 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2732 adev->ip_blocks[i].version->funcs->name, r); 2733 return r; 2734 } 2735 2736 adev->ip_blocks[i].status.hw = false; 2737 } 2738 2739 return 0; 2740 } 2741 2742 /** 2743 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2744 * 2745 * @adev: amdgpu_device pointer 2746 * 2747 * Main suspend function for hardware IPs. The list of all the hardware 2748 * IPs that make up the asic is walked, clockgating is disabled and the 2749 * suspend callbacks are run. suspend puts the hardware and software state 2750 * in each IP into a state suitable for suspend. 2751 * Returns 0 on success, negative error code on failure. 
2752 */ 2753 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2754 { 2755 int i, r; 2756 2757 if (adev->in_s0ix) 2758 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry); 2759 2760 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2761 if (!adev->ip_blocks[i].status.valid) 2762 continue; 2763 /* displays are handled in phase1 */ 2764 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2765 continue; 2766 /* PSP lost connection when err_event_athub occurs */ 2767 if (amdgpu_ras_intr_triggered() && 2768 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2769 adev->ip_blocks[i].status.hw = false; 2770 continue; 2771 } 2772 2773 /* skip unnecessary suspend if we do not initialize them yet */ 2774 if (adev->gmc.xgmi.pending_reset && 2775 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2776 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 2777 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2778 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 2779 adev->ip_blocks[i].status.hw = false; 2780 continue; 2781 } 2782 2783 /* skip suspend of gfx and psp for S0ix 2784 * gfx is in gfxoff state, so on resume it will exit gfxoff just 2785 * like at runtime. PSP is also part of the always on hardware 2786 * so no need to suspend it. 
2787 */ 2788 if (adev->in_s0ix && 2789 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 2790 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)) 2791 continue; 2792 2793 /* XXX handle errors */ 2794 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2795 /* XXX handle errors */ 2796 if (r) { 2797 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2798 adev->ip_blocks[i].version->funcs->name, r); 2799 } 2800 adev->ip_blocks[i].status.hw = false; 2801 /* handle putting the SMC in the appropriate state */ 2802 if(!amdgpu_sriov_vf(adev)){ 2803 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2804 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 2805 if (r) { 2806 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 2807 adev->mp1_state, r); 2808 return r; 2809 } 2810 } 2811 } 2812 } 2813 2814 return 0; 2815 } 2816 2817 /** 2818 * amdgpu_device_ip_suspend - run suspend for hardware IPs 2819 * 2820 * @adev: amdgpu_device pointer 2821 * 2822 * Main suspend function for hardware IPs. The list of all the hardware 2823 * IPs that make up the asic is walked, clockgating is disabled and the 2824 * suspend callbacks are run. suspend puts the hardware and software state 2825 * in each IP into a state suitable for suspend. 2826 * Returns 0 on success, negative error code on failure. 
2827 */ 2828 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 2829 { 2830 int r; 2831 2832 if (amdgpu_sriov_vf(adev)) { 2833 amdgpu_virt_fini_data_exchange(adev); 2834 amdgpu_virt_request_full_gpu(adev, false); 2835 } 2836 2837 r = amdgpu_device_ip_suspend_phase1(adev); 2838 if (r) 2839 return r; 2840 r = amdgpu_device_ip_suspend_phase2(adev); 2841 2842 if (amdgpu_sriov_vf(adev)) 2843 amdgpu_virt_release_full_gpu(adev, false); 2844 2845 return r; 2846 } 2847 2848 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 2849 { 2850 int i, r; 2851 2852 static enum amd_ip_block_type ip_order[] = { 2853 AMD_IP_BLOCK_TYPE_GMC, 2854 AMD_IP_BLOCK_TYPE_COMMON, 2855 AMD_IP_BLOCK_TYPE_PSP, 2856 AMD_IP_BLOCK_TYPE_IH, 2857 }; 2858 2859 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2860 int j; 2861 struct amdgpu_ip_block *block; 2862 2863 block = &adev->ip_blocks[i]; 2864 block->status.hw = false; 2865 2866 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 2867 2868 if (block->version->type != ip_order[j] || 2869 !block->status.valid) 2870 continue; 2871 2872 r = block->version->funcs->hw_init(adev); 2873 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2874 if (r) 2875 return r; 2876 block->status.hw = true; 2877 } 2878 } 2879 2880 return 0; 2881 } 2882 2883 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 2884 { 2885 int i, r; 2886 2887 static enum amd_ip_block_type ip_order[] = { 2888 AMD_IP_BLOCK_TYPE_SMC, 2889 AMD_IP_BLOCK_TYPE_DCE, 2890 AMD_IP_BLOCK_TYPE_GFX, 2891 AMD_IP_BLOCK_TYPE_SDMA, 2892 AMD_IP_BLOCK_TYPE_UVD, 2893 AMD_IP_BLOCK_TYPE_VCE, 2894 AMD_IP_BLOCK_TYPE_VCN 2895 }; 2896 2897 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2898 int j; 2899 struct amdgpu_ip_block *block; 2900 2901 for (j = 0; j < adev->num_ip_blocks; j++) { 2902 block = &adev->ip_blocks[j]; 2903 2904 if (block->version->type != ip_order[i] || 2905 !block->status.valid || 2906 block->status.hw) 2907 continue; 2908 
2909 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 2910 r = block->version->funcs->resume(adev); 2911 else 2912 r = block->version->funcs->hw_init(adev); 2913 2914 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2915 if (r) 2916 return r; 2917 block->status.hw = true; 2918 } 2919 } 2920 2921 return 0; 2922 } 2923 2924 /** 2925 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 2926 * 2927 * @adev: amdgpu_device pointer 2928 * 2929 * First resume function for hardware IPs. The list of all the hardware 2930 * IPs that make up the asic is walked and the resume callbacks are run for 2931 * COMMON, GMC, and IH. resume puts the hardware into a functional state 2932 * after a suspend and updates the software state as necessary. This 2933 * function is also used for restoring the GPU after a GPU reset. 2934 * Returns 0 on success, negative error code on failure. 2935 */ 2936 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 2937 { 2938 int i, r; 2939 2940 for (i = 0; i < adev->num_ip_blocks; i++) { 2941 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2942 continue; 2943 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2944 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2945 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2946 2947 r = adev->ip_blocks[i].version->funcs->resume(adev); 2948 if (r) { 2949 DRM_ERROR("resume of IP block <%s> failed %d\n", 2950 adev->ip_blocks[i].version->funcs->name, r); 2951 return r; 2952 } 2953 adev->ip_blocks[i].status.hw = true; 2954 } 2955 } 2956 2957 return 0; 2958 } 2959 2960 /** 2961 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 2962 * 2963 * @adev: amdgpu_device pointer 2964 * 2965 * First resume function for hardware IPs. The list of all the hardware 2966 * IPs that make up the asic is walked and the resume callbacks are run for 2967 * all blocks except COMMON, GMC, and IH. 
resume puts the hardware into a 2968 * functional state after a suspend and updates the software state as 2969 * necessary. This function is also used for restoring the GPU after a GPU 2970 * reset. 2971 * Returns 0 on success, negative error code on failure. 2972 */ 2973 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 2974 { 2975 int i, r; 2976 2977 for (i = 0; i < adev->num_ip_blocks; i++) { 2978 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2979 continue; 2980 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2981 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2982 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 2983 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 2984 continue; 2985 r = adev->ip_blocks[i].version->funcs->resume(adev); 2986 if (r) { 2987 DRM_ERROR("resume of IP block <%s> failed %d\n", 2988 adev->ip_blocks[i].version->funcs->name, r); 2989 return r; 2990 } 2991 adev->ip_blocks[i].status.hw = true; 2992 } 2993 2994 return 0; 2995 } 2996 2997 /** 2998 * amdgpu_device_ip_resume - run resume for hardware IPs 2999 * 3000 * @adev: amdgpu_device pointer 3001 * 3002 * Main resume function for hardware IPs. The hardware IPs 3003 * are split into two resume functions because they are 3004 * are also used in in recovering from a GPU reset and some additional 3005 * steps need to be take between them. In this case (S3/S4) they are 3006 * run sequentially. 3007 * Returns 0 on success, negative error code on failure. 
3008 */ 3009 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3010 { 3011 int r; 3012 3013 r = amdgpu_device_ip_resume_phase1(adev); 3014 if (r) 3015 return r; 3016 3017 r = amdgpu_device_fw_loading(adev); 3018 if (r) 3019 return r; 3020 3021 r = amdgpu_device_ip_resume_phase2(adev); 3022 3023 return r; 3024 } 3025 3026 /** 3027 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3028 * 3029 * @adev: amdgpu_device pointer 3030 * 3031 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3032 */ 3033 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3034 { 3035 if (amdgpu_sriov_vf(adev)) { 3036 if (adev->is_atom_fw) { 3037 if (amdgpu_atomfirmware_gpu_supports_virtualization(adev)) 3038 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3039 } else { 3040 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3041 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3042 } 3043 3044 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3045 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3046 } 3047 } 3048 3049 /** 3050 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3051 * 3052 * @asic_type: AMD asic type 3053 * 3054 * Check if there is DC (new modesetting infrastructre) support for an asic. 3055 * returns true if DC has support, false if not. 3056 */ 3057 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3058 { 3059 switch (asic_type) { 3060 #if defined(CONFIG_DRM_AMD_DC) 3061 #if defined(CONFIG_DRM_AMD_DC_SI) 3062 case CHIP_TAHITI: 3063 case CHIP_PITCAIRN: 3064 case CHIP_VERDE: 3065 case CHIP_OLAND: 3066 #endif 3067 case CHIP_BONAIRE: 3068 case CHIP_KAVERI: 3069 case CHIP_KABINI: 3070 case CHIP_MULLINS: 3071 /* 3072 * We have systems in the wild with these ASICs that require 3073 * LVDS and VGA support which is not supported with DC. 3074 * 3075 * Fallback to the non-DC driver here by default so as not to 3076 * cause regressions. 
3077 */ 3078 return amdgpu_dc > 0; 3079 case CHIP_HAWAII: 3080 case CHIP_CARRIZO: 3081 case CHIP_STONEY: 3082 case CHIP_POLARIS10: 3083 case CHIP_POLARIS11: 3084 case CHIP_POLARIS12: 3085 case CHIP_VEGAM: 3086 case CHIP_TONGA: 3087 case CHIP_FIJI: 3088 case CHIP_VEGA10: 3089 case CHIP_VEGA12: 3090 case CHIP_VEGA20: 3091 #if defined(CONFIG_DRM_AMD_DC_DCN) 3092 case CHIP_RAVEN: 3093 case CHIP_NAVI10: 3094 case CHIP_NAVI14: 3095 case CHIP_NAVI12: 3096 case CHIP_RENOIR: 3097 case CHIP_SIENNA_CICHLID: 3098 case CHIP_NAVY_FLOUNDER: 3099 case CHIP_DIMGREY_CAVEFISH: 3100 case CHIP_VANGOGH: 3101 #endif 3102 return amdgpu_dc != 0; 3103 #endif 3104 default: 3105 if (amdgpu_dc > 0) 3106 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3107 "but isn't supported by ASIC, ignoring\n"); 3108 return false; 3109 } 3110 } 3111 3112 /** 3113 * amdgpu_device_has_dc_support - check if dc is supported 3114 * 3115 * @adev: amdgpu_device pointer 3116 * 3117 * Returns true for supported, false for not supported 3118 */ 3119 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3120 { 3121 if (amdgpu_sriov_vf(adev) || 3122 adev->enable_virtual_display || 3123 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3124 return false; 3125 3126 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3127 } 3128 3129 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3130 { 3131 struct amdgpu_device *adev = 3132 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3133 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3134 3135 /* It's a bug to not have a hive within this function */ 3136 if (WARN_ON(!hive)) 3137 return; 3138 3139 /* 3140 * Use task barrier to synchronize all xgmi reset works across the 3141 * hive. task_barrier_enter and task_barrier_exit will block 3142 * until all the threads running the xgmi reset works reach 3143 * those points. task_barrier_full will do both blocks. 
3144 */ 3145 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3146 3147 task_barrier_enter(&hive->tb); 3148 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3149 3150 if (adev->asic_reset_res) 3151 goto fail; 3152 3153 task_barrier_exit(&hive->tb); 3154 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3155 3156 if (adev->asic_reset_res) 3157 goto fail; 3158 3159 if (adev->mmhub.ras_funcs && 3160 adev->mmhub.ras_funcs->reset_ras_error_count) 3161 adev->mmhub.ras_funcs->reset_ras_error_count(adev); 3162 } else { 3163 3164 task_barrier_full(&hive->tb); 3165 adev->asic_reset_res = amdgpu_asic_reset(adev); 3166 } 3167 3168 fail: 3169 if (adev->asic_reset_res) 3170 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3171 adev->asic_reset_res, adev_to_drm(adev)->unique); 3172 amdgpu_put_xgmi_hive(hive); 3173 } 3174 3175 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3176 { 3177 char *input = amdgpu_lockup_timeout; 3178 char *timeout_setting = NULL; 3179 int index = 0; 3180 long timeout; 3181 int ret = 0; 3182 3183 /* 3184 * By default timeout for non compute jobs is 10000. 3185 * And there is no timeout enforced on compute jobs. 3186 * In SR-IOV or passthrough mode, timeout for compute 3187 * jobs are 60000 by default. 3188 */ 3189 adev->gfx_timeout = msecs_to_jiffies(10000); 3190 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3191 if (amdgpu_sriov_vf(adev)) 3192 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3193 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3194 else if (amdgpu_passthrough(adev)) 3195 adev->compute_timeout = msecs_to_jiffies(60000); 3196 else 3197 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; 3198 3199 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3200 while ((timeout_setting = strsep(&input, ",")) && 3201 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3202 ret = kstrtol(timeout_setting, 0, &timeout); 3203 if (ret) 3204 return ret; 3205 3206 if (timeout == 0) { 3207 index++; 3208 continue; 3209 } else if (timeout < 0) { 3210 timeout = MAX_SCHEDULE_TIMEOUT; 3211 } else { 3212 timeout = msecs_to_jiffies(timeout); 3213 } 3214 3215 switch (index++) { 3216 case 0: 3217 adev->gfx_timeout = timeout; 3218 break; 3219 case 1: 3220 adev->compute_timeout = timeout; 3221 break; 3222 case 2: 3223 adev->sdma_timeout = timeout; 3224 break; 3225 case 3: 3226 adev->video_timeout = timeout; 3227 break; 3228 default: 3229 break; 3230 } 3231 } 3232 /* 3233 * There is only one value specified and 3234 * it should apply to all non-compute jobs. 3235 */ 3236 if (index == 1) { 3237 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3238 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3239 adev->compute_timeout = adev->gfx_timeout; 3240 } 3241 } 3242 3243 return ret; 3244 } 3245 3246 static const struct attribute *amdgpu_dev_attributes[] = { 3247 &dev_attr_product_name.attr, 3248 &dev_attr_product_number.attr, 3249 &dev_attr_serial_number.attr, 3250 &dev_attr_pcie_replay_count.attr, 3251 NULL 3252 }; 3253 3254 3255 /** 3256 * amdgpu_device_init - initialize the driver 3257 * 3258 * @adev: amdgpu_device pointer 3259 * @flags: driver flags 3260 * 3261 * Initializes the driver info and hw (all asics). 3262 * Returns 0 for success or an error on failure. 3263 * Called at driver startup. 
3264 */ 3265 int amdgpu_device_init(struct amdgpu_device *adev, 3266 uint32_t flags) 3267 { 3268 struct drm_device *ddev = adev_to_drm(adev); 3269 struct pci_dev *pdev = adev->pdev; 3270 int r, i; 3271 bool px = false; 3272 u32 max_MBps; 3273 3274 adev->shutdown = false; 3275 adev->flags = flags; 3276 3277 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3278 adev->asic_type = amdgpu_force_asic_type; 3279 else 3280 adev->asic_type = flags & AMD_ASIC_MASK; 3281 3282 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3283 if (amdgpu_emu_mode == 1) 3284 adev->usec_timeout *= 10; 3285 adev->gmc.gart_size = 512 * 1024 * 1024; 3286 adev->accel_working = false; 3287 adev->num_rings = 0; 3288 adev->mman.buffer_funcs = NULL; 3289 adev->mman.buffer_funcs_ring = NULL; 3290 adev->vm_manager.vm_pte_funcs = NULL; 3291 adev->vm_manager.vm_pte_num_scheds = 0; 3292 adev->gmc.gmc_funcs = NULL; 3293 adev->harvest_ip_mask = 0x0; 3294 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3295 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3296 3297 adev->smc_rreg = &amdgpu_invalid_rreg; 3298 adev->smc_wreg = &amdgpu_invalid_wreg; 3299 adev->pcie_rreg = &amdgpu_invalid_rreg; 3300 adev->pcie_wreg = &amdgpu_invalid_wreg; 3301 adev->pciep_rreg = &amdgpu_invalid_rreg; 3302 adev->pciep_wreg = &amdgpu_invalid_wreg; 3303 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3304 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3305 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3306 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3307 adev->didt_rreg = &amdgpu_invalid_rreg; 3308 adev->didt_wreg = &amdgpu_invalid_wreg; 3309 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3310 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3311 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3312 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3313 3314 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3315 amdgpu_asic_name[adev->asic_type], 
pdev->vendor, pdev->device, 3316 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3317 3318 /* mutex initialization are all done here so we 3319 * can recall function without having locking issues */ 3320 mutex_init(&adev->firmware.mutex); 3321 mutex_init(&adev->pm.mutex); 3322 mutex_init(&adev->gfx.gpu_clock_mutex); 3323 mutex_init(&adev->srbm_mutex); 3324 mutex_init(&adev->gfx.pipe_reserve_mutex); 3325 mutex_init(&adev->gfx.gfx_off_mutex); 3326 mutex_init(&adev->grbm_idx_mutex); 3327 mutex_init(&adev->mn_lock); 3328 mutex_init(&adev->virt.vf_errors.lock); 3329 hash_init(adev->mn_hash); 3330 atomic_set(&adev->in_gpu_reset, 0); 3331 init_rwsem(&adev->reset_sem); 3332 mutex_init(&adev->psp.mutex); 3333 mutex_init(&adev->notifier_lock); 3334 3335 r = amdgpu_device_check_arguments(adev); 3336 if (r) 3337 return r; 3338 3339 spin_lock_init(&adev->mmio_idx_lock); 3340 spin_lock_init(&adev->smc_idx_lock); 3341 spin_lock_init(&adev->pcie_idx_lock); 3342 spin_lock_init(&adev->uvd_ctx_idx_lock); 3343 spin_lock_init(&adev->didt_idx_lock); 3344 spin_lock_init(&adev->gc_cac_idx_lock); 3345 spin_lock_init(&adev->se_cac_idx_lock); 3346 spin_lock_init(&adev->audio_endpt_idx_lock); 3347 spin_lock_init(&adev->mm_stats.lock); 3348 3349 INIT_LIST_HEAD(&adev->shadow_list); 3350 mutex_init(&adev->shadow_list_lock); 3351 3352 INIT_LIST_HEAD(&adev->reset_list); 3353 3354 INIT_DELAYED_WORK(&adev->delayed_init_work, 3355 amdgpu_device_delayed_init_work_handler); 3356 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3357 amdgpu_device_delay_enable_gfx_off); 3358 3359 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3360 3361 adev->gfx.gfx_off_req_count = 1; 3362 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3363 3364 atomic_set(&adev->throttling_logging_enabled, 1); 3365 /* 3366 * If throttling continues, logging will be performed every minute 3367 * to avoid log flooding. 
"-1" is subtracted since the thermal 3368 * throttling interrupt comes every second. Thus, the total logging 3369 * interval is 59 seconds(retelimited printk interval) + 1(waiting 3370 * for throttling interrupt) = 60 seconds. 3371 */ 3372 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3373 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3374 3375 /* Registers mapping */ 3376 /* TODO: block userspace mapping of io register */ 3377 if (adev->asic_type >= CHIP_BONAIRE) { 3378 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3379 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3380 } else { 3381 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3382 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3383 } 3384 3385 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3386 if (adev->rmmio == NULL) { 3387 return -ENOMEM; 3388 } 3389 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3390 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3391 3392 /* enable PCIE atomic ops */ 3393 r = pci_enable_atomic_ops_to_root(adev->pdev, 3394 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3395 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3396 if (r) { 3397 adev->have_atomics_support = false; 3398 DRM_INFO("PCIE atomic ops is not supported\n"); 3399 } else { 3400 adev->have_atomics_support = true; 3401 } 3402 3403 amdgpu_device_get_pcie_info(adev); 3404 3405 if (amdgpu_mcbp) 3406 DRM_INFO("MCBP is enabled\n"); 3407 3408 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) 3409 adev->enable_mes = true; 3410 3411 /* detect hw virtualization here */ 3412 amdgpu_detect_virtualization(adev); 3413 3414 r = amdgpu_device_get_job_timeout_settings(adev); 3415 if (r) { 3416 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3417 goto failed_unmap; 3418 } 3419 3420 /* early init functions */ 3421 r = amdgpu_device_ip_early_init(adev); 3422 if (r) 3423 goto failed_unmap; 3424 3425 /* doorbell bar mapping 
and doorbell index init*/ 3426 amdgpu_device_doorbell_init(adev); 3427 3428 if (amdgpu_emu_mode == 1) { 3429 /* post the asic on emulation mode */ 3430 emu_soc_asic_init(adev); 3431 goto fence_driver_init; 3432 } 3433 3434 amdgpu_reset_init(adev); 3435 3436 /* detect if we are with an SRIOV vbios */ 3437 amdgpu_device_detect_sriov_bios(adev); 3438 3439 /* check if we need to reset the asic 3440 * E.g., driver was not cleanly unloaded previously, etc. 3441 */ 3442 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3443 if (adev->gmc.xgmi.num_physical_nodes) { 3444 dev_info(adev->dev, "Pending hive reset.\n"); 3445 adev->gmc.xgmi.pending_reset = true; 3446 /* Only need to init necessary block for SMU to handle the reset */ 3447 for (i = 0; i < adev->num_ip_blocks; i++) { 3448 if (!adev->ip_blocks[i].status.valid) 3449 continue; 3450 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3451 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3452 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3453 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3454 DRM_DEBUG("IP %s disabled for hw_init.\n", 3455 adev->ip_blocks[i].version->funcs->name); 3456 adev->ip_blocks[i].status.hw = true; 3457 } 3458 } 3459 } else { 3460 r = amdgpu_asic_reset(adev); 3461 if (r) { 3462 dev_err(adev->dev, "asic reset on init failed\n"); 3463 goto failed; 3464 } 3465 } 3466 } 3467 3468 pci_enable_pcie_error_reporting(adev->pdev); 3469 3470 /* Post card if necessary */ 3471 if (amdgpu_device_need_post(adev)) { 3472 if (!adev->bios) { 3473 dev_err(adev->dev, "no vBIOS found\n"); 3474 r = -EINVAL; 3475 goto failed; 3476 } 3477 DRM_INFO("GPU posting now...\n"); 3478 r = amdgpu_device_asic_init(adev); 3479 if (r) { 3480 dev_err(adev->dev, "gpu post error!\n"); 3481 goto failed; 3482 } 3483 } 3484 3485 if (adev->is_atom_fw) { 3486 /* Initialize clocks */ 3487 r = amdgpu_atomfirmware_get_clock_info(adev); 3488 if (r) { 3489 
dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3490 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3491 goto failed; 3492 } 3493 } else { 3494 /* Initialize clocks */ 3495 r = amdgpu_atombios_get_clock_info(adev); 3496 if (r) { 3497 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3498 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3499 goto failed; 3500 } 3501 /* init i2c buses */ 3502 if (!amdgpu_device_has_dc_support(adev)) 3503 amdgpu_atombios_i2c_init(adev); 3504 } 3505 3506 fence_driver_init: 3507 /* Fence driver */ 3508 r = amdgpu_fence_driver_init(adev); 3509 if (r) { 3510 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n"); 3511 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3512 goto failed; 3513 } 3514 3515 /* init the mode config */ 3516 drm_mode_config_init(adev_to_drm(adev)); 3517 3518 r = amdgpu_device_ip_init(adev); 3519 if (r) { 3520 /* failed in exclusive mode due to timeout */ 3521 if (amdgpu_sriov_vf(adev) && 3522 !amdgpu_sriov_runtime(adev) && 3523 amdgpu_virt_mmio_blocked(adev) && 3524 !amdgpu_virt_wait_reset(adev)) { 3525 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3526 /* Don't send request since VF is inactive. */ 3527 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3528 adev->virt.ops = NULL; 3529 r = -EAGAIN; 3530 goto release_ras_con; 3531 } 3532 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3533 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3534 goto release_ras_con; 3535 } 3536 3537 dev_info(adev->dev, 3538 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3539 adev->gfx.config.max_shader_engines, 3540 adev->gfx.config.max_sh_per_se, 3541 adev->gfx.config.max_cu_per_sh, 3542 adev->gfx.cu_info.number); 3543 3544 adev->accel_working = true; 3545 3546 amdgpu_vm_check_compute_bug(adev); 3547 3548 /* Initialize the buffer migration limit. 
*/ 3549 if (amdgpu_moverate >= 0) 3550 max_MBps = amdgpu_moverate; 3551 else 3552 max_MBps = 8; /* Allow 8 MB/s. */ 3553 /* Get a log2 for easy divisions. */ 3554 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3555 3556 amdgpu_fbdev_init(adev); 3557 3558 r = amdgpu_pm_sysfs_init(adev); 3559 if (r) { 3560 adev->pm_sysfs_en = false; 3561 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3562 } else 3563 adev->pm_sysfs_en = true; 3564 3565 r = amdgpu_ucode_sysfs_init(adev); 3566 if (r) { 3567 adev->ucode_sysfs_en = false; 3568 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3569 } else 3570 adev->ucode_sysfs_en = true; 3571 3572 if ((amdgpu_testing & 1)) { 3573 if (adev->accel_working) 3574 amdgpu_test_moves(adev); 3575 else 3576 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n"); 3577 } 3578 if (amdgpu_benchmarking) { 3579 if (adev->accel_working) 3580 amdgpu_benchmark(adev, amdgpu_benchmarking); 3581 else 3582 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); 3583 } 3584 3585 /* 3586 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3587 * Otherwise the mgpu fan boost feature will be skipped due to the 3588 * gpu instance is counted less. 3589 */ 3590 amdgpu_register_gpu_instance(adev); 3591 3592 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3593 * explicit gating rather than handling it automatically. 3594 */ 3595 if (!adev->gmc.xgmi.pending_reset) { 3596 r = amdgpu_device_ip_late_init(adev); 3597 if (r) { 3598 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3599 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3600 goto release_ras_con; 3601 } 3602 /* must succeed. 
*/ 3603 amdgpu_ras_resume(adev); 3604 queue_delayed_work(system_wq, &adev->delayed_init_work, 3605 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3606 } 3607 3608 if (amdgpu_sriov_vf(adev)) 3609 flush_delayed_work(&adev->delayed_init_work); 3610 3611 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3612 if (r) 3613 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3614 3615 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3616 r = amdgpu_pmu_init(adev); 3617 if (r) 3618 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3619 3620 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3621 if (amdgpu_device_cache_pci_state(adev->pdev)) 3622 pci_restore_state(pdev); 3623 3624 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3625 /* this will fail for cards that aren't VGA class devices, just 3626 * ignore it */ 3627 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3628 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode); 3629 3630 if (amdgpu_device_supports_px(ddev)) { 3631 px = true; 3632 vga_switcheroo_register_client(adev->pdev, 3633 &amdgpu_switcheroo_ops, px); 3634 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3635 } 3636 3637 if (adev->gmc.xgmi.pending_reset) 3638 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3639 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3640 3641 return 0; 3642 3643 release_ras_con: 3644 amdgpu_release_ras_context(adev); 3645 3646 failed: 3647 amdgpu_vf_error_trans_all(adev); 3648 3649 failed_unmap: 3650 iounmap(adev->rmmio); 3651 adev->rmmio = NULL; 3652 3653 return r; 3654 } 3655 3656 /** 3657 * amdgpu_device_fini - tear down the driver 3658 * 3659 * @adev: amdgpu_device pointer 3660 * 3661 * Tear down the driver info (all asics). 3662 * Called at driver shutdown. 
 */
void amdgpu_device_fini(struct amdgpu_device *adev)
{
	dev_info(adev->dev, "amdgpu: finishing device.\n");
	/* Let any queued delayed-init work run to completion first. */
	flush_delayed_work(&adev->delayed_init_work);
	ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
	adev->shutdown = true;

	kfree(adev->pci_state);

	/* make sure IB test finished before entering exclusive mode
	 * to avoid preemption on IB test
	 */
	if (amdgpu_sriov_vf(adev)) {
		amdgpu_virt_request_full_gpu(adev, false);
		amdgpu_virt_fini_data_exchange(adev);
	}

	/* disable all interrupts */
	amdgpu_irq_disable_all(adev);
	/* Shut the displays down: legacy helper without DC, atomic helper with DC. */
	if (adev->mode_info.mode_config_initialized){
		if (!amdgpu_device_has_dc_support(adev))
			drm_helper_force_disable_all(adev_to_drm(adev));
		else
			drm_atomic_helper_shutdown(adev_to_drm(adev));
	}
	amdgpu_fence_driver_fini(adev);
	/* Only remove the pm sysfs entries if they were registered. */
	if (adev->pm_sysfs_en)
		amdgpu_pm_sysfs_fini(adev);
	amdgpu_fbdev_fini(adev);
	amdgpu_device_ip_fini(adev);
	release_firmware(adev->firmware.gpu_info_fw);
	adev->firmware.gpu_info_fw = NULL;
	adev->accel_working = false;

	amdgpu_reset_fini(adev);

	/* free i2c buses */
	if (!amdgpu_device_has_dc_support(adev))
		amdgpu_i2c_fini(adev);

	/* Skipped in emulation mode (amdgpu_emu_mode == 1). */
	if (amdgpu_emu_mode != 1)
		amdgpu_atombios_fini(adev);

	kfree(adev->bios);
	adev->bios = NULL;
	if (amdgpu_device_supports_px(adev_to_drm(adev))) {
		vga_switcheroo_unregister_client(adev->pdev);
		vga_switcheroo_fini_domain_pm_ops(adev->dev);
	}
	/* Re-register with NULL cookie and callbacks for VGA-class devices. */
	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
		vga_client_register(adev->pdev, NULL, NULL, NULL);
	/* Drop the register mapping and clear the pointer. */
	iounmap(adev->rmmio);
	adev->rmmio = NULL;
	amdgpu_device_doorbell_fini(adev);

	/* Only remove the firmware sysfs entries if they were registered. */
	if (adev->ucode_sysfs_en)
		amdgpu_ucode_sysfs_fini(adev);

	sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
	if (IS_ENABLED(CONFIG_PERF_EVENTS))
		amdgpu_pmu_fini(adev);
	if (adev->mman.discovery_bin)
		amdgpu_discovery_fini(adev);
}


/*
 * Suspend & resume.
 */
/**
 * amdgpu_device_suspend - initiate device suspend
 *
 * @dev: drm dev pointer
 * @fbcon : notify the fbdev of suspend
 *
 * Puts the hw in the suspend state (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver suspend.
 *
 * NOTE(review): as written this function returns 0 on every path; the
 * results of the two IP suspend phases are stored in a local and then
 * discarded. Confirm whether that is intentional.
 */
int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	/* Nothing to do when the device is marked DRM_SWITCH_POWER_OFF. */
	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
		return 0;

	/* Set before anything else; the resume path clears it again. */
	adev->in_suspend = true;
	drm_kms_helper_poll_disable(dev);

	if (fbcon)
		amdgpu_fbdev_set_suspend(adev, 1);

	/* Cancel pending delayed-init work and wait for a running instance. */
	cancel_delayed_work_sync(&adev->delayed_init_work);

	amdgpu_ras_suspend(adev);

	/* Return value is overwritten below without being checked. */
	r = amdgpu_device_ip_suspend_phase1(adev);

	/* KFD suspend is skipped when in_s0ix is set. */
	if (!adev->in_s0ix)
		amdgpu_amdkfd_suspend(adev, adev->in_runpm);

	/* evict vram memory */
	amdgpu_bo_evict_vram(adev);

	amdgpu_fence_driver_suspend(adev);

	/* Return value is not checked either; see the note in the header. */
	r = amdgpu_device_ip_suspend_phase2(adev);
	/* evict remaining vram memory
	 * This second call to evict vram is to evict the gart page table
	 * using the CPU.
	 */
	amdgpu_bo_evict_vram(adev);

	return 0;
}

/**
 * amdgpu_device_resume - initiate device resume
 *
 * @dev: drm dev pointer
 * @fbcon : notify the fbdev of resume
 *
 * Bring the hw back to operating state (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver resume.
3790 */ 3791 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 3792 { 3793 struct amdgpu_device *adev = drm_to_adev(dev); 3794 int r = 0; 3795 3796 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3797 return 0; 3798 3799 if (adev->in_s0ix) 3800 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D0Entry); 3801 3802 /* post card */ 3803 if (amdgpu_device_need_post(adev)) { 3804 r = amdgpu_device_asic_init(adev); 3805 if (r) 3806 dev_err(adev->dev, "amdgpu asic init failed\n"); 3807 } 3808 3809 r = amdgpu_device_ip_resume(adev); 3810 if (r) { 3811 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 3812 return r; 3813 } 3814 amdgpu_fence_driver_resume(adev); 3815 3816 3817 r = amdgpu_device_ip_late_init(adev); 3818 if (r) 3819 return r; 3820 3821 queue_delayed_work(system_wq, &adev->delayed_init_work, 3822 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3823 3824 if (!adev->in_s0ix) { 3825 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 3826 if (r) 3827 return r; 3828 } 3829 3830 /* Make sure IB tests flushed */ 3831 flush_delayed_work(&adev->delayed_init_work); 3832 3833 if (fbcon) 3834 amdgpu_fbdev_set_suspend(adev, 0); 3835 3836 drm_kms_helper_poll_enable(dev); 3837 3838 amdgpu_ras_resume(adev); 3839 3840 /* 3841 * Most of the connector probing functions try to acquire runtime pm 3842 * refs to ensure that the GPU is powered on when connector polling is 3843 * performed. Since we're calling this from a runtime PM callback, 3844 * trying to acquire rpm refs will cause us to deadlock. 3845 * 3846 * Since we're guaranteed to be holding the rpm lock, it's safe to 3847 * temporarily disable the rpm helpers so this doesn't deadlock us. 
3848 */ 3849 #ifdef CONFIG_PM 3850 dev->dev->power.disable_depth++; 3851 #endif 3852 if (!amdgpu_device_has_dc_support(adev)) 3853 drm_helper_hpd_irq_event(dev); 3854 else 3855 drm_kms_helper_hotplug_event(dev); 3856 #ifdef CONFIG_PM 3857 dev->dev->power.disable_depth--; 3858 #endif 3859 adev->in_suspend = false; 3860 3861 return 0; 3862 } 3863 3864 /** 3865 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 3866 * 3867 * @adev: amdgpu_device pointer 3868 * 3869 * The list of all the hardware IPs that make up the asic is walked and 3870 * the check_soft_reset callbacks are run. check_soft_reset determines 3871 * if the asic is still hung or not. 3872 * Returns true if any of the IPs are still in a hung state, false if not. 3873 */ 3874 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 3875 { 3876 int i; 3877 bool asic_hang = false; 3878 3879 if (amdgpu_sriov_vf(adev)) 3880 return true; 3881 3882 if (amdgpu_asic_need_full_reset(adev)) 3883 return true; 3884 3885 for (i = 0; i < adev->num_ip_blocks; i++) { 3886 if (!adev->ip_blocks[i].status.valid) 3887 continue; 3888 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 3889 adev->ip_blocks[i].status.hang = 3890 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 3891 if (adev->ip_blocks[i].status.hang) { 3892 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 3893 asic_hang = true; 3894 } 3895 } 3896 return asic_hang; 3897 } 3898 3899 /** 3900 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 3901 * 3902 * @adev: amdgpu_device pointer 3903 * 3904 * The list of all the hardware IPs that make up the asic is walked and the 3905 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 3906 * handles any IP specific hardware or software state changes that are 3907 * necessary for a soft reset to succeed. 3908 * Returns 0 on success, negative error code on failure. 
3909 */ 3910 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 3911 { 3912 int i, r = 0; 3913 3914 for (i = 0; i < adev->num_ip_blocks; i++) { 3915 if (!adev->ip_blocks[i].status.valid) 3916 continue; 3917 if (adev->ip_blocks[i].status.hang && 3918 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 3919 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 3920 if (r) 3921 return r; 3922 } 3923 } 3924 3925 return 0; 3926 } 3927 3928 /** 3929 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 3930 * 3931 * @adev: amdgpu_device pointer 3932 * 3933 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 3934 * reset is necessary to recover. 3935 * Returns true if a full asic reset is required, false if not. 3936 */ 3937 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 3938 { 3939 int i; 3940 3941 if (amdgpu_asic_need_full_reset(adev)) 3942 return true; 3943 3944 for (i = 0; i < adev->num_ip_blocks; i++) { 3945 if (!adev->ip_blocks[i].status.valid) 3946 continue; 3947 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 3948 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 3949 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 3950 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 3951 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3952 if (adev->ip_blocks[i].status.hang) { 3953 dev_info(adev->dev, "Some block need full reset!\n"); 3954 return true; 3955 } 3956 } 3957 } 3958 return false; 3959 } 3960 3961 /** 3962 * amdgpu_device_ip_soft_reset - do a soft reset 3963 * 3964 * @adev: amdgpu_device pointer 3965 * 3966 * The list of all the hardware IPs that make up the asic is walked and the 3967 * soft_reset callbacks are run if the block is hung. soft_reset handles any 3968 * IP specific hardware or software state changes that are necessary to soft 3969 * reset the IP. 
3970 * Returns 0 on success, negative error code on failure. 3971 */ 3972 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 3973 { 3974 int i, r = 0; 3975 3976 for (i = 0; i < adev->num_ip_blocks; i++) { 3977 if (!adev->ip_blocks[i].status.valid) 3978 continue; 3979 if (adev->ip_blocks[i].status.hang && 3980 adev->ip_blocks[i].version->funcs->soft_reset) { 3981 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 3982 if (r) 3983 return r; 3984 } 3985 } 3986 3987 return 0; 3988 } 3989 3990 /** 3991 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 3992 * 3993 * @adev: amdgpu_device pointer 3994 * 3995 * The list of all the hardware IPs that make up the asic is walked and the 3996 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 3997 * handles any IP specific hardware or software state changes that are 3998 * necessary after the IP has been soft reset. 3999 * Returns 0 on success, negative error code on failure. 4000 */ 4001 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4002 { 4003 int i, r = 0; 4004 4005 for (i = 0; i < adev->num_ip_blocks; i++) { 4006 if (!adev->ip_blocks[i].status.valid) 4007 continue; 4008 if (adev->ip_blocks[i].status.hang && 4009 adev->ip_blocks[i].version->funcs->post_soft_reset) 4010 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4011 if (r) 4012 return r; 4013 } 4014 4015 return 0; 4016 } 4017 4018 /** 4019 * amdgpu_device_recover_vram - Recover some VRAM contents 4020 * 4021 * @adev: amdgpu_device pointer 4022 * 4023 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4024 * restore things like GPUVM page tables after a GPU reset where 4025 * the contents of VRAM might be lost. 4026 * 4027 * Returns: 4028 * 0 on success, negative error code on failure. 
4029 */ 4030 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4031 { 4032 struct dma_fence *fence = NULL, *next = NULL; 4033 struct amdgpu_bo *shadow; 4034 long r = 1, tmo; 4035 4036 if (amdgpu_sriov_runtime(adev)) 4037 tmo = msecs_to_jiffies(8000); 4038 else 4039 tmo = msecs_to_jiffies(100); 4040 4041 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4042 mutex_lock(&adev->shadow_list_lock); 4043 list_for_each_entry(shadow, &adev->shadow_list, shadow_list) { 4044 4045 /* No need to recover an evicted BO */ 4046 if (shadow->tbo.mem.mem_type != TTM_PL_TT || 4047 shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET || 4048 shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM) 4049 continue; 4050 4051 r = amdgpu_bo_restore_shadow(shadow, &next); 4052 if (r) 4053 break; 4054 4055 if (fence) { 4056 tmo = dma_fence_wait_timeout(fence, false, tmo); 4057 dma_fence_put(fence); 4058 fence = next; 4059 if (tmo == 0) { 4060 r = -ETIMEDOUT; 4061 break; 4062 } else if (tmo < 0) { 4063 r = tmo; 4064 break; 4065 } 4066 } else { 4067 fence = next; 4068 } 4069 } 4070 mutex_unlock(&adev->shadow_list_lock); 4071 4072 if (fence) 4073 tmo = dma_fence_wait_timeout(fence, false, tmo); 4074 dma_fence_put(fence); 4075 4076 if (r < 0 || tmo <= 0) { 4077 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4078 return -EIO; 4079 } 4080 4081 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4082 return 0; 4083 } 4084 4085 4086 /** 4087 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4088 * 4089 * @adev: amdgpu_device pointer 4090 * @from_hypervisor: request from hypervisor 4091 * 4092 * do VF FLR and reinitialize Asic 4093 * return 0 means succeeded otherwise failed 4094 */ 4095 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4096 bool from_hypervisor) 4097 { 4098 int r; 4099 4100 if (from_hypervisor) 4101 r = amdgpu_virt_request_full_gpu(adev, true); 4102 else 4103 r = amdgpu_virt_reset_gpu(adev); 4104 if 
(r) 4105 return r; 4106 4107 amdgpu_amdkfd_pre_reset(adev); 4108 4109 /* Resume IP prior to SMC */ 4110 r = amdgpu_device_ip_reinit_early_sriov(adev); 4111 if (r) 4112 goto error; 4113 4114 amdgpu_virt_init_data_exchange(adev); 4115 /* we need recover gart prior to run SMC/CP/SDMA resume */ 4116 amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT)); 4117 4118 r = amdgpu_device_fw_loading(adev); 4119 if (r) 4120 return r; 4121 4122 /* now we are okay to resume SMC/CP/SDMA */ 4123 r = amdgpu_device_ip_reinit_late_sriov(adev); 4124 if (r) 4125 goto error; 4126 4127 amdgpu_irq_gpu_reset_resume_helper(adev); 4128 r = amdgpu_ib_ring_tests(adev); 4129 amdgpu_amdkfd_post_reset(adev); 4130 4131 error: 4132 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4133 amdgpu_inc_vram_lost(adev); 4134 r = amdgpu_device_recover_vram(adev); 4135 } 4136 amdgpu_virt_release_full_gpu(adev, true); 4137 4138 return r; 4139 } 4140 4141 /** 4142 * amdgpu_device_has_job_running - check if there is any job in mirror list 4143 * 4144 * @adev: amdgpu_device pointer 4145 * 4146 * check if there is any job in mirror list 4147 */ 4148 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4149 { 4150 int i; 4151 struct drm_sched_job *job; 4152 4153 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4154 struct amdgpu_ring *ring = adev->rings[i]; 4155 4156 if (!ring || !ring->sched.thread) 4157 continue; 4158 4159 spin_lock(&ring->sched.job_list_lock); 4160 job = list_first_entry_or_null(&ring->sched.pending_list, 4161 struct drm_sched_job, list); 4162 spin_unlock(&ring->sched.job_list_lock); 4163 if (job) 4164 return true; 4165 } 4166 return false; 4167 } 4168 4169 /** 4170 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4171 * 4172 * @adev: amdgpu_device pointer 4173 * 4174 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4175 * a hung GPU. 
 */
bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
{
	/* No IP block reports a hang: the timeout is not treated as a GPU hang. */
	if (!amdgpu_device_ip_check_soft_reset(adev)) {
		dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
		return false;
	}

	/* amdgpu_gpu_recovery == 0 turns recovery off for every device. */
	if (amdgpu_gpu_recovery == 0)
		goto disabled;

	/* SR-IOV VFs recover for any non-zero amdgpu_gpu_recovery value. */
	if (amdgpu_sriov_vf(adev))
		return true;

	/* -1: recover only on the ASIC types listed below. */
	if (amdgpu_gpu_recovery == -1) {
		switch (adev->asic_type) {
		case CHIP_BONAIRE:
		case CHIP_HAWAII:
		case CHIP_TOPAZ:
		case CHIP_TONGA:
		case CHIP_FIJI:
		case CHIP_POLARIS10:
		case CHIP_POLARIS11:
		case CHIP_POLARIS12:
		case CHIP_VEGAM:
		case CHIP_VEGA20:
		case CHIP_VEGA10:
		case CHIP_VEGA12:
		case CHIP_RAVEN:
		case CHIP_ARCTURUS:
		case CHIP_RENOIR:
		case CHIP_NAVI10:
		case CHIP_NAVI14:
		case CHIP_NAVI12:
		case CHIP_SIENNA_CICHLID:
		case CHIP_NAVY_FLOUNDER:
		case CHIP_DIMGREY_CAVEFISH:
		case CHIP_VANGOGH:
		case CHIP_ALDEBARAN:
			break;
		default:
			goto disabled;
		}
	}

	return true;

disabled:
		dev_info(adev->dev, "GPU recovery disabled.\n");
		return false;
}

/**
 * amdgpu_device_mode1_reset - perform a mode1 reset of the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Clears bus mastering, caches the PCI state, issues the reset through the
 * SMU when amdgpu_dpm_is_mode1_reset_supported() says so and through the PSP
 * otherwise, reloads the PCI state and then polls the memsize register.
 * Returns the result of the reset request. The polling loop does not change
 * the return value, and the PCI state reload and polling run even when the
 * request failed.
 */
int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
{
	u32 i;
	int ret = 0;

	amdgpu_atombios_scratch_regs_engine_hung(adev, true);

	dev_info(adev->dev, "GPU mode1 reset\n");

	/* disable BM */
	pci_clear_master(adev->pdev);

	amdgpu_device_cache_pci_state(adev->pdev);

	if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
		dev_info(adev->dev, "GPU smu mode1 reset\n");
		ret = amdgpu_dpm_mode1_reset(adev);
	} else {
		dev_info(adev->dev, "GPU psp mode1 reset\n");
		ret = psp_gpu_reset(adev);
	}

	if (ret)
		dev_err(adev->dev, "GPU mode1 reset failed\n");

	amdgpu_device_load_pci_state(adev->pdev);

	/* wait for asic to come out of reset */
	/* Poll once per microsecond, up to usec_timeout, until memsize != 0xffffffff. */
	for (i = 0; i < adev->usec_timeout; i++) {
		u32 memsize = adev->nbio.funcs->get_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}

	amdgpu_atombios_scratch_regs_engine_hung(adev, false);
	return ret;
}

/**
 * amdgpu_device_pre_asic_reset - prepare one device for an ASIC reset
 *
 * @adev: amdgpu_device pointer
 * @reset_context: reset request; its AMDGPU_NEED_FULL_RESET flag is read
 *                 and, on bare metal, written back
 *
 * Forces completion of the hardware fences on every ring that has a
 * scheduler thread and increases the karma of the job in @reset_context when
 * @adev is the requesting device. If amdgpu_reset_prepare_hwcontext()
 * returns anything other than -ENOSYS, that value is returned immediately.
 * Otherwise, on bare metal, a soft reset is tried unless a full reset is
 * needed, and the IPs are suspended when a full reset is needed.
 * Returns 0 on success or a negative error code.
 */
int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
				 struct amdgpu_reset_context *reset_context)
{
	int i, r = 0;
	struct amdgpu_job *job = NULL;
	bool need_full_reset =
		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);

	/* Only the device that requested the reset uses the context's job. */
	if (reset_context->reset_req_dev == adev)
		job = reset_context->job;

	/* no need to dump if device is not in good state during probe period */
	if (!adev->gmc.xgmi.pending_reset)
		amdgpu_debugfs_wait_dump(adev);

	if (amdgpu_sriov_vf(adev)) {
		/* stop the data exchange thread */
		amdgpu_virt_fini_data_exchange(adev);
	}

	/* block all schedulers and reset given job's ring */
	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
		amdgpu_fence_driver_force_completion(ring);
	}

	if(job)
		drm_sched_increase_karma(&job->base);

	r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
	/* If reset handler not implemented, continue; otherwise return */
	if (r == -ENOSYS)
		r = 0;
	else
		return r;

	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
	if (!amdgpu_sriov_vf(adev)) {

		if (!need_full_reset)
			need_full_reset = amdgpu_device_ip_need_full_reset(adev);

		if (!need_full_reset) {
			/*
			 * NOTE(review): the pre/post soft reset return values
			 * are ignored; only the soft reset result and the
			 * follow-up hang check decide on the fallback.
			 */
			amdgpu_device_ip_pre_soft_reset(adev);
			r = amdgpu_device_ip_soft_reset(adev);
			amdgpu_device_ip_post_soft_reset(adev);
			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
				dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
				need_full_reset = true;
			}
		}

		if (need_full_reset)
			r = amdgpu_device_ip_suspend(adev);
		/* Publish the final decision through the reset context. */
		if (need_full_reset)
			set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
		else
			clear_bit(AMDGPU_NEED_FULL_RESET,
				  &reset_context->flags);
	}

	return r;
}

/**
 * amdgpu_do_asic_reset - reset and re-initialize a list of devices
 *
 * @device_list_handle: list of amdgpu devices linked through reset_list
 * @reset_context: reset request; AMDGPU_NEED_FULL_RESET and
 *                 AMDGPU_SKIP_HW_RESET are read from its flags, and
 *                 AMDGPU_NEED_FULL_RESET is written back at the "end" label
 *
 * The reset handler of the first device in the list is tried first; unless
 * it returns -ENOSYS its result is returned as is. Otherwise the default
 * path runs: an optional hardware reset of every device, followed by a
 * per-device resume, IB ring tests and VRAM recovery.
 * Returns 0 on success or a negative error code. -EAGAIN is returned when
 * the IB ring tests fail, and AMDGPU_NEED_FULL_RESET is set in that case.
 */
int amdgpu_do_asic_reset(struct list_head *device_list_handle,
			 struct amdgpu_reset_context *reset_context)
{
	struct amdgpu_device *tmp_adev = NULL;
	bool need_full_reset, skip_hw_reset, vram_lost = false;
	int r = 0;

	/* Try reset handler method first */
	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
				    reset_list);
	r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
	/* If reset handler not implemented, continue; otherwise return */
	if (r == -ENOSYS)
		r = 0;
	else
		return r;

	/* Reset handler not implemented, use the default method */
	need_full_reset =
		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
	skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);

	/*
	 * ASIC reset has to be done on all XGMI hive nodes ASAP
	 * to allow proper links negotiation in FW (within 1 sec)
	 */
	if (!skip_hw_reset && need_full_reset) {
		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
			/* For XGMI run all resets in parallel to speed up the process */
			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
				tmp_adev->gmc.xgmi.pending_reset = false;
				/* queue_work() returning false is reported as -EALREADY. */
				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
					r = -EALREADY;
			} else
				r = amdgpu_asic_reset(tmp_adev);

			if (r) {
				dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
					 r, adev_to_drm(tmp_adev)->unique);
				break;
			}
		}

		/* For XGMI wait for all resets to complete before proceed */
		if (!r) {
			list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
					flush_work(&tmp_adev->xgmi_reset_work);
					/* Result of the queued reset work for this device. */
					r = tmp_adev->asic_reset_res;
					if (r)
						break;
				}
			}
		}
	}

	/* Clear the RAS interrupt state once the hardware reset step went fine. */
	if (!r && amdgpu_ras_intr_triggered()) {
		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
			if (tmp_adev->mmhub.ras_funcs &&
			    tmp_adev->mmhub.ras_funcs->reset_ras_error_count)
				tmp_adev->mmhub.ras_funcs->reset_ras_error_count(tmp_adev);
		}

		amdgpu_ras_intr_cleared();
	}

	/*
	 * Per-device re-initialization. The "out" label sits inside this loop,
	 * so "goto out" skips the rest of the current device's resume steps
	 * and the loop then carries on with the next device.
	 */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		if (need_full_reset) {
			/* post card */
			r = amdgpu_device_asic_init(tmp_adev);
			if (r) {
				dev_warn(tmp_adev->dev, "asic atom init failed!");
			} else {
				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
				r = amdgpu_device_ip_resume_phase1(tmp_adev);
				if (r)
					goto out;

				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
				if (vram_lost) {
					DRM_INFO("VRAM is lost due to GPU reset!\n");
					amdgpu_inc_vram_lost(tmp_adev);
				}

				r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
				if (r)
					goto out;

				/*
				 * NOTE(review): unlike the neighbouring steps
				 * this returns directly, so asic_reset_res is
				 * not updated and the remaining devices are
				 * skipped. Confirm this is intended.
				 */
				r = amdgpu_device_fw_loading(tmp_adev);
				if (r)
					return r;

				r = amdgpu_device_ip_resume_phase2(tmp_adev);
				if (r)
					goto out;

				if (vram_lost)
					amdgpu_device_fill_reset_magic(tmp_adev);

				/*
				 * Add this ASIC as tracked as reset was already
				 * complete successfully.
				 */
				amdgpu_register_gpu_instance(tmp_adev);

				if (!reset_context->hive &&
				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
					amdgpu_xgmi_add_device(tmp_adev);

				r = amdgpu_device_ip_late_init(tmp_adev);
				if (r)
					goto out;

				amdgpu_fbdev_set_suspend(tmp_adev, 0);

				/*
				 * The GPU enters bad state once faulty pages
				 * by ECC has reached the threshold, and ras
				 * recovery is scheduled next. So add one check
				 * here to break recovery if it indeed exceeds
				 * bad page threshold, and remind user to
				 * retire this GPU or setting one bigger
				 * bad_page_threshold value to fix this once
				 * probing driver again.
				 */
				if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
					/* must succeed. */
					amdgpu_ras_resume(tmp_adev);
				} else {
					r = -EINVAL;
					goto out;
				}

				/* Update PSP FW topology after reset */
				if (reset_context->hive &&
				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
					r = amdgpu_xgmi_update_topology(
						reset_context->hive, tmp_adev);
			}
		}

out:
		if (!r) {
			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
			r = amdgpu_ib_ring_tests(tmp_adev);
			if (r) {
				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
				/* Ask for a full reset and stop processing the list. */
				need_full_reset = true;
				r = -EAGAIN;
				goto end;
			}
		}

		/* Success: restore VRAM contents; failure: record it on the device. */
		if (!r)
			r = amdgpu_device_recover_vram(tmp_adev);
		else
			tmp_adev->asic_reset_res = r;
	}

end:
	/* Write the final full-reset decision back into the context. */
	if (need_full_reset)
		set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
	else
		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
	return r;
}

static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
				struct amdgpu_hive_info *hive)
{
	if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
		return false;

	if (hive) {
		down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
	} else {
down_write(&adev->reset_sem); 4514 } 4515 4516 switch (amdgpu_asic_reset_method(adev)) { 4517 case AMD_RESET_METHOD_MODE1: 4518 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 4519 break; 4520 case AMD_RESET_METHOD_MODE2: 4521 adev->mp1_state = PP_MP1_STATE_RESET; 4522 break; 4523 default: 4524 adev->mp1_state = PP_MP1_STATE_NONE; 4525 break; 4526 } 4527 4528 return true; 4529 } 4530 4531 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) 4532 { 4533 amdgpu_vf_error_trans_all(adev); 4534 adev->mp1_state = PP_MP1_STATE_NONE; 4535 atomic_set(&adev->in_gpu_reset, 0); 4536 up_write(&adev->reset_sem); 4537 } 4538 4539 /* 4540 * to lockup a list of amdgpu devices in a hive safely, if not a hive 4541 * with multiple nodes, it will be similar as amdgpu_device_lock_adev. 4542 * 4543 * unlock won't require roll back. 4544 */ 4545 static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive) 4546 { 4547 struct amdgpu_device *tmp_adev = NULL; 4548 4549 if (adev->gmc.xgmi.num_physical_nodes > 1) { 4550 if (!hive) { 4551 dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes"); 4552 return -ENODEV; 4553 } 4554 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 4555 if (!amdgpu_device_lock_adev(tmp_adev, hive)) 4556 goto roll_back; 4557 } 4558 } else if (!amdgpu_device_lock_adev(adev, hive)) 4559 return -EAGAIN; 4560 4561 return 0; 4562 roll_back: 4563 if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) { 4564 /* 4565 * if the lockup iteration break in the middle of a hive, 4566 * it may means there may has a race issue, 4567 * or a hive device locked up independently. 4568 * we may be in trouble and may not, so will try to roll back 4569 * the lock and give out a warnning. 4570 */ 4571 dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. 
Rolling back to unlock"); 4572 list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) { 4573 amdgpu_device_unlock_adev(tmp_adev); 4574 } 4575 } 4576 return -EAGAIN; 4577 } 4578 4579 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 4580 { 4581 struct pci_dev *p = NULL; 4582 4583 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4584 adev->pdev->bus->number, 1); 4585 if (p) { 4586 pm_runtime_enable(&(p->dev)); 4587 pm_runtime_resume(&(p->dev)); 4588 } 4589 } 4590 4591 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 4592 { 4593 enum amd_reset_method reset_method; 4594 struct pci_dev *p = NULL; 4595 u64 expires; 4596 4597 /* 4598 * For now, only BACO and mode1 reset are confirmed 4599 * to suffer the audio issue without proper suspended. 4600 */ 4601 reset_method = amdgpu_asic_reset_method(adev); 4602 if ((reset_method != AMD_RESET_METHOD_BACO) && 4603 (reset_method != AMD_RESET_METHOD_MODE1)) 4604 return -EINVAL; 4605 4606 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4607 adev->pdev->bus->number, 1); 4608 if (!p) 4609 return -ENODEV; 4610 4611 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 4612 if (!expires) 4613 /* 4614 * If we cannot get the audio device autosuspend delay, 4615 * a fixed 4S interval will be used. Considering 3S is 4616 * the audio controller default autosuspend delay setting. 4617 * 4S used here is guaranteed to cover that. 4618 */ 4619 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 4620 4621 while (!pm_runtime_status_suspended(&(p->dev))) { 4622 if (!pm_runtime_suspend(&(p->dev))) 4623 break; 4624 4625 if (expires < ktime_get_mono_fast_ns()) { 4626 dev_warn(adev->dev, "failed to suspend display audio\n"); 4627 /* TODO: abort the succeeding gpu reset? 
*/ 4628 return -ETIMEDOUT; 4629 } 4630 } 4631 4632 pm_runtime_disable(&(p->dev)); 4633 4634 return 0; 4635 } 4636 4637 void amdgpu_device_recheck_guilty_jobs( 4638 struct amdgpu_device *adev, struct list_head *device_list_handle, 4639 struct amdgpu_reset_context *reset_context) 4640 { 4641 int i, r = 0; 4642 4643 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4644 struct amdgpu_ring *ring = adev->rings[i]; 4645 int ret = 0; 4646 struct drm_sched_job *s_job; 4647 4648 if (!ring || !ring->sched.thread) 4649 continue; 4650 4651 s_job = list_first_entry_or_null(&ring->sched.pending_list, 4652 struct drm_sched_job, list); 4653 if (s_job == NULL) 4654 continue; 4655 4656 /* clear job's guilty and depend the folowing step to decide the real one */ 4657 drm_sched_reset_karma(s_job); 4658 drm_sched_resubmit_jobs_ext(&ring->sched, 1); 4659 4660 ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout); 4661 if (ret == 0) { /* timeout */ 4662 DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n", 4663 ring->sched.name, s_job->id); 4664 4665 /* set guilty */ 4666 drm_sched_increase_karma(s_job); 4667 retry: 4668 /* do hw reset */ 4669 if (amdgpu_sriov_vf(adev)) { 4670 amdgpu_virt_fini_data_exchange(adev); 4671 r = amdgpu_device_reset_sriov(adev, false); 4672 if (r) 4673 adev->asic_reset_res = r; 4674 } else { 4675 clear_bit(AMDGPU_SKIP_HW_RESET, 4676 &reset_context->flags); 4677 r = amdgpu_do_asic_reset(device_list_handle, 4678 reset_context); 4679 if (r && r == -EAGAIN) 4680 goto retry; 4681 } 4682 4683 /* 4684 * add reset counter so that the following 4685 * resubmitted job could flush vmid 4686 */ 4687 atomic_inc(&adev->gpu_reset_counter); 4688 continue; 4689 } 4690 4691 /* got the hw fence, signal finished fence */ 4692 atomic_dec(ring->sched.score); 4693 dma_fence_get(&s_job->s_fence->finished); 4694 dma_fence_signal(&s_job->s_fence->finished); 4695 dma_fence_put(&s_job->s_fence->finished); 4696 4697 /* remove node from list and free the job */ 4698 
spin_lock(&ring->sched.job_list_lock); 4699 list_del_init(&s_job->list); 4700 spin_unlock(&ring->sched.job_list_lock); 4701 ring->sched.ops->free_job(s_job); 4702 } 4703 } 4704 4705 /** 4706 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 4707 * 4708 * @adev: amdgpu_device pointer 4709 * @job: which job trigger hang 4710 * 4711 * Attempt to reset the GPU if it has hung (all asics). 4712 * Attempt to do soft-reset or full-reset and reinitialize Asic 4713 * Returns 0 for success or an error on failure. 4714 */ 4715 4716 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 4717 struct amdgpu_job *job) 4718 { 4719 struct list_head device_list, *device_list_handle = NULL; 4720 bool job_signaled = false; 4721 struct amdgpu_hive_info *hive = NULL; 4722 struct amdgpu_device *tmp_adev = NULL; 4723 int i, r = 0; 4724 bool need_emergency_restart = false; 4725 bool audio_suspended = false; 4726 int tmp_vram_lost_counter; 4727 struct amdgpu_reset_context reset_context; 4728 4729 memset(&reset_context, 0, sizeof(reset_context)); 4730 4731 /* 4732 * Special case: RAS triggered and full reset isn't supported 4733 */ 4734 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 4735 4736 /* 4737 * Flush RAM to disk so that after reboot 4738 * the user can read log and see why the system rebooted. 4739 */ 4740 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 4741 DRM_WARN("Emergency reboot."); 4742 4743 ksys_sync_helper(); 4744 emergency_restart(); 4745 } 4746 4747 dev_info(adev->dev, "GPU %s begin!\n", 4748 need_emergency_restart ? "jobs stop":"reset"); 4749 4750 /* 4751 * Here we trylock to avoid chain of resets executing from 4752 * either trigger by jobs on different adevs in XGMI hive or jobs on 4753 * different schedulers for same device while this TO handler is running. 4754 * We always reset all schedulers for device and all devices for XGMI 4755 * hive so that should take care of them too. 
4756 */ 4757 hive = amdgpu_get_xgmi_hive(adev); 4758 if (hive) { 4759 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) { 4760 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", 4761 job ? job->base.id : -1, hive->hive_id); 4762 amdgpu_put_xgmi_hive(hive); 4763 if (job) 4764 drm_sched_increase_karma(&job->base); 4765 return 0; 4766 } 4767 mutex_lock(&hive->hive_lock); 4768 } 4769 4770 reset_context.method = AMD_RESET_METHOD_NONE; 4771 reset_context.reset_req_dev = adev; 4772 reset_context.job = job; 4773 reset_context.hive = hive; 4774 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 4775 4776 /* 4777 * lock the device before we try to operate the linked list 4778 * if didn't get the device lock, don't touch the linked list since 4779 * others may iterating it. 4780 */ 4781 r = amdgpu_device_lock_hive_adev(adev, hive); 4782 if (r) { 4783 dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress", 4784 job ? job->base.id : -1); 4785 4786 /* even we skipped this reset, still need to set the job to guilty */ 4787 if (job) 4788 drm_sched_increase_karma(&job->base); 4789 goto skip_recovery; 4790 } 4791 4792 /* 4793 * Build list of devices to reset. 4794 * In case we are in XGMI hive mode, resort the device list 4795 * to put adev in the 1st position. 
4796 */ 4797 INIT_LIST_HEAD(&device_list); 4798 if (adev->gmc.xgmi.num_physical_nodes > 1) { 4799 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) 4800 list_add_tail(&tmp_adev->reset_list, &device_list); 4801 if (!list_is_first(&adev->reset_list, &device_list)) 4802 list_rotate_to_front(&adev->reset_list, &device_list); 4803 device_list_handle = &device_list; 4804 } else { 4805 list_add_tail(&adev->reset_list, &device_list); 4806 device_list_handle = &device_list; 4807 } 4808 4809 /* block all schedulers and reset given job's ring */ 4810 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4811 /* 4812 * Try to put the audio codec into suspend state 4813 * before gpu reset started. 4814 * 4815 * Due to the power domain of the graphics device 4816 * is shared with AZ power domain. Without this, 4817 * we may change the audio hardware from behind 4818 * the audio driver's back. That will trigger 4819 * some audio codec errors. 4820 */ 4821 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 4822 audio_suspended = true; 4823 4824 amdgpu_ras_set_error_query_ready(tmp_adev, false); 4825 4826 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 4827 4828 if (!amdgpu_sriov_vf(tmp_adev)) 4829 amdgpu_amdkfd_pre_reset(tmp_adev); 4830 4831 /* 4832 * Mark these ASICs to be reseted as untracked first 4833 * And add them back after reset completed 4834 */ 4835 amdgpu_unregister_gpu_instance(tmp_adev); 4836 4837 amdgpu_fbdev_set_suspend(tmp_adev, 1); 4838 4839 /* disable ras on ALL IPs */ 4840 if (!need_emergency_restart && 4841 amdgpu_device_ip_need_full_reset(tmp_adev)) 4842 amdgpu_ras_suspend(tmp_adev); 4843 4844 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4845 struct amdgpu_ring *ring = tmp_adev->rings[i]; 4846 4847 if (!ring || !ring->sched.thread) 4848 continue; 4849 4850 drm_sched_stop(&ring->sched, job ? 
&job->base : NULL); 4851 4852 if (need_emergency_restart) 4853 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 4854 } 4855 atomic_inc(&tmp_adev->gpu_reset_counter); 4856 } 4857 4858 if (need_emergency_restart) 4859 goto skip_sched_resume; 4860 4861 /* 4862 * Must check guilty signal here since after this point all old 4863 * HW fences are force signaled. 4864 * 4865 * job->base holds a reference to parent fence 4866 */ 4867 if (job && job->base.s_fence->parent && 4868 dma_fence_is_signaled(job->base.s_fence->parent)) { 4869 job_signaled = true; 4870 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 4871 goto skip_hw_reset; 4872 } 4873 4874 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 4875 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4876 r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context); 4877 /*TODO Should we stop ?*/ 4878 if (r) { 4879 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 4880 r, adev_to_drm(tmp_adev)->unique); 4881 tmp_adev->asic_reset_res = r; 4882 } 4883 } 4884 4885 tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter)); 4886 /* Actual ASIC resets if needed.*/ 4887 /* TODO Implement XGMI hive reset logic for SRIOV */ 4888 if (amdgpu_sriov_vf(adev)) { 4889 r = amdgpu_device_reset_sriov(adev, job ? false : true); 4890 if (r) 4891 adev->asic_reset_res = r; 4892 } else { 4893 r = amdgpu_do_asic_reset(device_list_handle, &reset_context); 4894 if (r && r == -EAGAIN) 4895 goto retry; 4896 } 4897 4898 skip_hw_reset: 4899 4900 /* Post ASIC reset for all devs .*/ 4901 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4902 4903 /* 4904 * Sometimes a later bad compute job can block a good gfx job as gfx 4905 * and compute ring share internal GC HW mutually. We add an additional 4906 * guilty jobs recheck step to find the real guilty job, it synchronously 4907 * submits and pends for the first job being signaled. 
If it gets timeout, 4908 * we identify it as a real guilty job. 4909 */ 4910 if (amdgpu_gpu_recovery == 2 && 4911 !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter))) 4912 amdgpu_device_recheck_guilty_jobs( 4913 tmp_adev, device_list_handle, &reset_context); 4914 4915 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4916 struct amdgpu_ring *ring = tmp_adev->rings[i]; 4917 4918 if (!ring || !ring->sched.thread) 4919 continue; 4920 4921 /* No point to resubmit jobs if we didn't HW reset*/ 4922 if (!tmp_adev->asic_reset_res && !job_signaled) 4923 drm_sched_resubmit_jobs(&ring->sched); 4924 4925 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); 4926 } 4927 4928 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) { 4929 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 4930 } 4931 4932 tmp_adev->asic_reset_res = 0; 4933 4934 if (r) { 4935 /* bad news, how to tell it to userspace ? */ 4936 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 4937 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 4938 } else { 4939 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 4940 } 4941 } 4942 4943 skip_sched_resume: 4944 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4945 /* unlock kfd: SRIOV would do it separately */ 4946 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 4947 amdgpu_amdkfd_post_reset(tmp_adev); 4948 4949 /* kfd_post_reset will do nothing if kfd device is not initialized, 4950 * need to bring up kfd here if it's not be initialized before 4951 */ 4952 if (!adev->kfd.init_complete) 4953 amdgpu_amdkfd_device_init(adev); 4954 4955 if (audio_suspended) 4956 amdgpu_device_resume_display_audio(tmp_adev); 4957 amdgpu_device_unlock_adev(tmp_adev); 4958 } 4959 4960 skip_recovery: 4961 if (hive) { 4962 atomic_set(&hive->in_reset, 0); 4963 mutex_unlock(&hive->hive_lock); 4964 amdgpu_put_xgmi_hive(hive); 4965 } 4966 4967 
if (r && r != -EAGAIN) 4968 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 4969 return r; 4970 } 4971 4972 /** 4973 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 4974 * 4975 * @adev: amdgpu_device pointer 4976 * 4977 * Fetchs and stores in the driver the PCIE capabilities (gen speed 4978 * and lanes) of the slot the device is in. Handles APUs and 4979 * virtualized environments where PCIE config space may not be available. 4980 */ 4981 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 4982 { 4983 struct pci_dev *pdev; 4984 enum pci_bus_speed speed_cap, platform_speed_cap; 4985 enum pcie_link_width platform_link_width; 4986 4987 if (amdgpu_pcie_gen_cap) 4988 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 4989 4990 if (amdgpu_pcie_lane_cap) 4991 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 4992 4993 /* covers APUs as well */ 4994 if (pci_is_root_bus(adev->pdev->bus)) { 4995 if (adev->pm.pcie_gen_mask == 0) 4996 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 4997 if (adev->pm.pcie_mlw_mask == 0) 4998 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 4999 return; 5000 } 5001 5002 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5003 return; 5004 5005 pcie_bandwidth_available(adev->pdev, NULL, 5006 &platform_speed_cap, &platform_link_width); 5007 5008 if (adev->pm.pcie_gen_mask == 0) { 5009 /* asic caps */ 5010 pdev = adev->pdev; 5011 speed_cap = pcie_get_speed_cap(pdev); 5012 if (speed_cap == PCI_SPEED_UNKNOWN) { 5013 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5014 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5015 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5016 } else { 5017 if (speed_cap == PCIE_SPEED_32_0GT) 5018 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5019 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5020 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5021 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5022 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5023 else if 
(speed_cap == PCIE_SPEED_16_0GT) 5024 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5025 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5026 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5027 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5028 else if (speed_cap == PCIE_SPEED_8_0GT) 5029 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5030 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5031 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5032 else if (speed_cap == PCIE_SPEED_5_0GT) 5033 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5034 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5035 else 5036 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5037 } 5038 /* platform caps */ 5039 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5040 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5041 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5042 } else { 5043 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5044 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5045 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5046 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5047 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5048 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5049 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5050 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5051 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5052 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5053 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5054 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5055 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5056 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5057 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5058 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5059 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5060 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5061 else 5062 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5063 5064 } 5065 } 5066 if (adev->pm.pcie_mlw_mask == 0) { 5067 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 5068 
adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5069 } else { 5070 switch (platform_link_width) { 5071 case PCIE_LNK_X32: 5072 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5073 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5074 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5075 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5076 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5077 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5078 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5079 break; 5080 case PCIE_LNK_X16: 5081 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5082 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5083 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5084 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5085 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5086 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5087 break; 5088 case PCIE_LNK_X12: 5089 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5090 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5091 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5092 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5093 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5094 break; 5095 case PCIE_LNK_X8: 5096 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5097 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5098 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5099 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5100 break; 5101 case PCIE_LNK_X4: 5102 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5103 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5104 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5105 break; 5106 case PCIE_LNK_X2: 5107 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5108 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5109 break; 5110 case PCIE_LNK_X1: 5111 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5112 break; 5113 default: 5114 break; 5115 } 5116 } 5117 } 5118 } 5119 5120 int amdgpu_device_baco_enter(struct drm_device *dev) 5121 { 5122 struct amdgpu_device *adev = drm_to_adev(dev); 5123 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5124 5125 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5126 return -ENOTSUPP; 5127 5128 if (ras && ras->supported && 
adev->nbio.funcs->enable_doorbell_interrupt) 5129 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5130 5131 return amdgpu_dpm_baco_enter(adev); 5132 } 5133 5134 int amdgpu_device_baco_exit(struct drm_device *dev) 5135 { 5136 struct amdgpu_device *adev = drm_to_adev(dev); 5137 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5138 int ret = 0; 5139 5140 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5141 return -ENOTSUPP; 5142 5143 ret = amdgpu_dpm_baco_exit(adev); 5144 if (ret) 5145 return ret; 5146 5147 if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt) 5148 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5149 5150 return 0; 5151 } 5152 5153 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev) 5154 { 5155 int i; 5156 5157 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5158 struct amdgpu_ring *ring = adev->rings[i]; 5159 5160 if (!ring || !ring->sched.thread) 5161 continue; 5162 5163 cancel_delayed_work_sync(&ring->sched.work_tdr); 5164 } 5165 } 5166 5167 /** 5168 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5169 * @pdev: PCI device struct 5170 * @state: PCI channel state 5171 * 5172 * Description: Called when a PCI error is detected. 5173 * 5174 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
5175 */ 5176 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 5177 { 5178 struct drm_device *dev = pci_get_drvdata(pdev); 5179 struct amdgpu_device *adev = drm_to_adev(dev); 5180 int i; 5181 5182 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 5183 5184 if (adev->gmc.xgmi.num_physical_nodes > 1) { 5185 DRM_WARN("No support for XGMI hive yet..."); 5186 return PCI_ERS_RESULT_DISCONNECT; 5187 } 5188 5189 switch (state) { 5190 case pci_channel_io_normal: 5191 return PCI_ERS_RESULT_CAN_RECOVER; 5192 /* Fatal error, prepare for slot reset */ 5193 case pci_channel_io_frozen: 5194 /* 5195 * Cancel and wait for all TDRs in progress if failing to 5196 * set adev->in_gpu_reset in amdgpu_device_lock_adev 5197 * 5198 * Locking adev->reset_sem will prevent any external access 5199 * to GPU during PCI error recovery 5200 */ 5201 while (!amdgpu_device_lock_adev(adev, NULL)) 5202 amdgpu_cancel_all_tdr(adev); 5203 5204 /* 5205 * Block any work scheduling as we do for regular GPU reset 5206 * for the duration of the recovery 5207 */ 5208 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5209 struct amdgpu_ring *ring = adev->rings[i]; 5210 5211 if (!ring || !ring->sched.thread) 5212 continue; 5213 5214 drm_sched_stop(&ring->sched, NULL); 5215 } 5216 atomic_inc(&adev->gpu_reset_counter); 5217 return PCI_ERS_RESULT_NEED_RESET; 5218 case pci_channel_io_perm_failure: 5219 /* Permanent error, prepare for device removal */ 5220 return PCI_ERS_RESULT_DISCONNECT; 5221 } 5222 5223 return PCI_ERS_RESULT_NEED_RESET; 5224 } 5225 5226 /** 5227 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 5228 * @pdev: pointer to PCI device 5229 */ 5230 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 5231 { 5232 5233 DRM_INFO("PCI error: mmio enabled callback!!\n"); 5234 5235 /* TODO - dump whatever for debugging purposes */ 5236 5237 /* This called only if amdgpu_pci_error_detected returns 5238 * PCI_ERS_RESULT_CAN_RECOVER. 
Read/write to the device still 5239 * works, no need to reset slot. 5240 */ 5241 5242 return PCI_ERS_RESULT_RECOVERED; 5243 } 5244 5245 /** 5246 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 5247 * @pdev: PCI device struct 5248 * 5249 * Description: This routine is called by the pci error recovery 5250 * code after the PCI slot has been reset, just before we 5251 * should resume normal operations. 5252 */ 5253 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 5254 { 5255 struct drm_device *dev = pci_get_drvdata(pdev); 5256 struct amdgpu_device *adev = drm_to_adev(dev); 5257 int r, i; 5258 struct amdgpu_reset_context reset_context; 5259 u32 memsize; 5260 struct list_head device_list; 5261 5262 DRM_INFO("PCI error: slot reset callback!!\n"); 5263 5264 memset(&reset_context, 0, sizeof(reset_context)); 5265 5266 INIT_LIST_HEAD(&device_list); 5267 list_add_tail(&adev->reset_list, &device_list); 5268 5269 /* wait for asic to come out of reset */ 5270 msleep(500); 5271 5272 /* Restore PCI confspace */ 5273 amdgpu_device_load_pci_state(pdev); 5274 5275 /* confirm ASIC came out of reset */ 5276 for (i = 0; i < adev->usec_timeout; i++) { 5277 memsize = amdgpu_asic_get_config_memsize(adev); 5278 5279 if (memsize != 0xffffffff) 5280 break; 5281 udelay(1); 5282 } 5283 if (memsize == 0xffffffff) { 5284 r = -ETIME; 5285 goto out; 5286 } 5287 5288 reset_context.method = AMD_RESET_METHOD_NONE; 5289 reset_context.reset_req_dev = adev; 5290 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 5291 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 5292 5293 adev->in_pci_err_recovery = true; 5294 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 5295 adev->in_pci_err_recovery = false; 5296 if (r) 5297 goto out; 5298 5299 r = amdgpu_do_asic_reset(&device_list, &reset_context); 5300 5301 out: 5302 if (!r) { 5303 if (amdgpu_device_cache_pci_state(adev->pdev)) 5304 pci_restore_state(adev->pdev); 5305 5306 DRM_INFO("PCIe error recovery succeeded\n"); 
5307 } else { 5308 DRM_ERROR("PCIe error recovery failed, err:%d", r); 5309 amdgpu_device_unlock_adev(adev); 5310 } 5311 5312 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 5313 } 5314 5315 /** 5316 * amdgpu_pci_resume() - resume normal ops after PCI reset 5317 * @pdev: pointer to PCI device 5318 * 5319 * Called when the error recovery driver tells us that its 5320 * OK to resume normal operation. 5321 */ 5322 void amdgpu_pci_resume(struct pci_dev *pdev) 5323 { 5324 struct drm_device *dev = pci_get_drvdata(pdev); 5325 struct amdgpu_device *adev = drm_to_adev(dev); 5326 int i; 5327 5328 5329 DRM_INFO("PCI error: resume callback!!\n"); 5330 5331 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5332 struct amdgpu_ring *ring = adev->rings[i]; 5333 5334 if (!ring || !ring->sched.thread) 5335 continue; 5336 5337 5338 drm_sched_resubmit_jobs(&ring->sched); 5339 drm_sched_start(&ring->sched, true); 5340 } 5341 5342 amdgpu_device_unlock_adev(adev); 5343 } 5344 5345 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 5346 { 5347 struct drm_device *dev = pci_get_drvdata(pdev); 5348 struct amdgpu_device *adev = drm_to_adev(dev); 5349 int r; 5350 5351 r = pci_save_state(pdev); 5352 if (!r) { 5353 kfree(adev->pci_state); 5354 5355 adev->pci_state = pci_store_saved_state(pdev); 5356 5357 if (!adev->pci_state) { 5358 DRM_ERROR("Failed to store PCI saved state"); 5359 return false; 5360 } 5361 } else { 5362 DRM_WARN("Failed to save PCI state, err:%d\n", r); 5363 return false; 5364 } 5365 5366 return true; 5367 } 5368 5369 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 5370 { 5371 struct drm_device *dev = pci_get_drvdata(pdev); 5372 struct amdgpu_device *adev = drm_to_adev(dev); 5373 int r; 5374 5375 if (!adev->pci_state) 5376 return false; 5377 5378 r = pci_load_saved_state(pdev, adev->pci_state); 5379 5380 if (!r) { 5381 pci_restore_state(pdev); 5382 } else { 5383 DRM_WARN("Failed to load PCI state, err:%d\n", r); 5384 return false; 5385 } 5386 5387 return 
true; 5388 } 5389 5390 5391