/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/pci.h>
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/sienna_cichlid_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navy_flounder_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"NAVI10",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
119 "LAST", 120 }; 121 122 /** 123 * DOC: pcie_replay_count 124 * 125 * The amdgpu driver provides a sysfs API for reporting the total number 126 * of PCIe replays (NAKs) 127 * The file pcie_replay_count is used for this and returns the total 128 * number of replays as a sum of the NAKs generated and NAKs received 129 */ 130 131 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 132 struct device_attribute *attr, char *buf) 133 { 134 struct drm_device *ddev = dev_get_drvdata(dev); 135 struct amdgpu_device *adev = drm_to_adev(ddev); 136 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 137 138 return snprintf(buf, PAGE_SIZE, "%llu\n", cnt); 139 } 140 141 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 142 amdgpu_device_get_pcie_replay_count, NULL); 143 144 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 145 146 /** 147 * DOC: product_name 148 * 149 * The amdgpu driver provides a sysfs API for reporting the product name 150 * for the device 151 * The file serial_number is used for this and returns the product name 152 * as returned from the FRU. 153 * NOTE: This is only available for certain server cards 154 */ 155 156 static ssize_t amdgpu_device_get_product_name(struct device *dev, 157 struct device_attribute *attr, char *buf) 158 { 159 struct drm_device *ddev = dev_get_drvdata(dev); 160 struct amdgpu_device *adev = drm_to_adev(ddev); 161 162 return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name); 163 } 164 165 static DEVICE_ATTR(product_name, S_IRUGO, 166 amdgpu_device_get_product_name, NULL); 167 168 /** 169 * DOC: product_number 170 * 171 * The amdgpu driver provides a sysfs API for reporting the part number 172 * for the device 173 * The file serial_number is used for this and returns the part number 174 * as returned from the FRU. 175 * NOTE: This is only available for certain server cards 176 */ 177 178 static ssize_t amdgpu_device_get_product_number(struct device *dev, 179 struct device_attribute *attr, char *buf) 180 { 181 struct drm_device *ddev = dev_get_drvdata(dev); 182 struct amdgpu_device *adev = drm_to_adev(ddev); 183 184 return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number); 185 } 186 187 static DEVICE_ATTR(product_number, S_IRUGO, 188 amdgpu_device_get_product_number, NULL); 189 190 /** 191 * DOC: serial_number 192 * 193 * The amdgpu driver provides a sysfs API for reporting the serial number 194 * for the device 195 * The file serial_number is used for this and returns the serial number 196 * as returned from the FRU. 197 * NOTE: This is only available for certain server cards 198 */ 199 200 static ssize_t amdgpu_device_get_serial_number(struct device *dev, 201 struct device_attribute *attr, char *buf) 202 { 203 struct drm_device *ddev = dev_get_drvdata(dev); 204 struct amdgpu_device *adev = drm_to_adev(ddev); 205 206 return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial); 207 } 208 209 static DEVICE_ATTR(serial_number, S_IRUGO, 210 amdgpu_device_get_serial_number, NULL); 211 212 /** 213 * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control 214 * 215 * @dev: drm_device pointer 216 * 217 * Returns true if the device is a dGPU with HG/PX power control, 218 * otherwise return false. 

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with HG/PX power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->flags & AMD_IS_PX)
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise returns false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * VRAM access helper functions.
 *
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size in bytes; the buffer at @buf must hold at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       uint32_t *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0;
	uint64_t last;


#ifdef CONFIG_64BIT
	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		void __iomem *addr = adev->mman.aper_base_kaddr + pos;
		size_t count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_asic_flush_hdp(adev, NULL);
		} else {
			amdgpu_asic_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

		if (count == size)
			return;

		pos += count;
		buf += count / 4;
		size -= count;
	}
#endif

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		uint32_t tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *buf++);
		else
			*buf++ = RREG32_NO_KIQ(mmMM_DATA);
	}
	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
}
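
/*
 * Illustrative sketch: a caller that already holds an amdgpu_device
 * pointer could, for example, copy the first 256 bytes of VRAM into a
 * local buffer with:
 *
 *	uint32_t data[64];
 *
 *	amdgpu_device_vram_access(adev, 0, data, sizeof(data), false);
 *
 * The helper uses the CPU-visible aperture when it can and falls back
 * to the indirect MM_INDEX/MM_DATA window for the remainder.
 */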

/*
 * MMIO register access helper functions.
 */
/**
 * amdgpu_mm_rreg - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
			uint32_t acc_flags)
{
	uint32_t ret;

	if (adev->in_pci_err_recovery)
		return 0;

	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) &&
	    down_read_trylock(&adev->reset_sem)) {
		ret = amdgpu_kiq_rreg(adev, reg);
		up_read(&adev->reset_sem);
		return ret;
	}

	if ((reg * 4) < adev->rmmio_size)
		ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
	else {
		unsigned long flags;

		spin_lock_irqsave(&adev->mmio_idx_lock, flags);
		writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
		ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	}

	trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
	return ret;
}

/*
 * MMIO register byte read helper function
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (adev->in_pci_err_recovery)
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register byte write helper function
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */
/**
 * amdgpu_mm_wreg8 - write to a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (adev->in_pci_err_recovery)
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

static inline void amdgpu_mm_wreg_mmio(struct amdgpu_device *adev,
				       uint32_t reg, uint32_t v,
				       uint32_t acc_flags)
{
	if (adev->in_pci_err_recovery)
		return;

	trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);

	if ((reg * 4) < adev->rmmio_size)
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	else {
		unsigned long flags;

		spin_lock_irqsave(&adev->mmio_idx_lock, flags);
		writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
		writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	}
}
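
/*
 * Illustrative sketch: driver code normally goes through the RREG32()/
 * WREG32() macros (defined in amdgpu.h), which wrap the helpers above.
 * A typical read-modify-write, with hypothetical register/mask/shift
 * names, looks like:
 *
 *	uint32_t tmp = RREG32(mmSOME_REG);
 *
 *	tmp &= ~SOME_FIELD_MASK;
 *	tmp |= (val << SOME_FIELD_SHIFT) & SOME_FIELD_MASK;
 *	WREG32(mmSOME_REG, tmp);
 */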

/**
 * amdgpu_mm_wreg - write to a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
		    uint32_t acc_flags)
{
	if (adev->in_pci_err_recovery)
		return;

	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) &&
	    down_read_trylock(&adev->reset_sem)) {
		amdgpu_kiq_wreg(adev, reg, v);
		up_read(&adev->reset_sem);
		return;
	}

	amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
}

/*
 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range
 *
 * this function is invoked only for the debugfs register access
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
			     uint32_t acc_flags)
{
	if (adev->in_pci_err_recovery)
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {

		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
	}

	amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
}

/**
 * amdgpu_io_rreg - read an IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 *
 * Returns the 32 bit value from the offset specified.
 */
u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
{
	if (adev->in_pci_err_recovery)
		return 0;

	if ((reg * 4) < adev->rio_mem_size)
		return ioread32(adev->rio_mem + (reg * 4));
	else {
		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
		return ioread32(adev->rio_mem + (mmMM_DATA * 4));
	}
}

/**
 * amdgpu_io_wreg - write to an IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
{
	if (adev->in_pci_err_recovery)
		return;

	if ((reg * 4) < adev->rio_mem_size)
		iowrite32(v, adev->rio_mem + (reg * 4));
	else {
		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
		iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
	}
}

/**
 * amdgpu_mm_rdoorbell - read a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (CIK).
 */
u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
{
	if (adev->in_pci_err_recovery)
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return readl(adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell - write a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (CIK).
 */
void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
{
	if (adev->in_pci_err_recovery)
		return;

	if (index < adev->doorbell.num_doorbells) {
		writel(v, adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
{
	if (adev->in_pci_err_recovery)
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
{
	if (adev->in_pci_err_recovery)
		return;

	if (index < adev->doorbell.num_doorbells) {
		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}
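
/*
 * Illustrative sketch: ring code normally reaches these helpers through
 * the WDOORBELL32()/WDOORBELL64() style macros. Conceptually, kicking a
 * ring whose doorbell index was assigned at init time looks like (the
 * wptr handling is an assumption, trimmed for illustration):
 *
 *	if (ring->use_doorbell)
 *		amdgpu_mm_wdoorbell(adev, ring->doorbell_index,
 *				    lower_32_bits(ring->wptr));
 */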

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy 64 bit reg write function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	amdgpu_asic_pre_asic_init(adev);

	return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}

/**
 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
				       &adev->vram_scratch.robj,
				       &adev->vram_scratch.gpu_addr,
				       (void **)&adev->vram_scratch.ptr);
}

/**
 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}
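
/*
 * Illustrative sketch: a golden-register table is a flat array of
 * {register, AND mask, OR mask} triples (the register names and values
 * below are hypothetical), consumed three entries at a time by the
 * helper above:
 *
 *	static const u32 example_golden_regs[] = {
 *		mmSOME_REG_A, 0xffffffff, 0x00000100,	// full overwrite
 *		mmSOME_REG_B, 0x0000000f, 0x00000002,	// RMW of the low nibble
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, example_golden_regs,
 *						ARRAY_SIZE(example_golden_regs));
 */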

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/*
 * GPU doorbell aperture helper functions.
 */
/**
 * amdgpu_device_doorbell_init - Init doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Init doorbell driver information (CIK)
 * Returns 0 on success, error on failure.
 */
static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
{

	/* No doorbell on SI hardware generation */
	if (adev->asic_type < CHIP_BONAIRE) {
		adev->doorbell.base = 0;
		adev->doorbell.size = 0;
		adev->doorbell.num_doorbells = 0;
		adev->doorbell.ptr = NULL;
		return 0;
	}

	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
		return -EINVAL;

	amdgpu_asic_init_doorbell_index(adev);

	/* doorbell bar mapping */
	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
	adev->doorbell.size = pci_resource_len(adev->pdev, 2);

	adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
					     adev->doorbell_index.max_assignment+1);
	if (adev->doorbell.num_doorbells == 0)
		return -EINVAL;

	/* For Vega, reserve and map two pages on doorbell BAR since SDMA
	 * paging queue doorbell use the second page. The
	 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
	 * doorbells are in the first page. So with paging queue enabled,
	 * the max num_doorbells should be increased by one page (0x400 in dwords).
	 */
	if (adev->asic_type >= CHIP_VEGA10)
		adev->doorbell.num_doorbells += 0x400;

	adev->doorbell.ptr = ioremap(adev->doorbell.base,
				     adev->doorbell.num_doorbells *
				     sizeof(u32));
	if (adev->doorbell.ptr == NULL)
		return -ENOMEM;

	return 0;
}

/**
 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down doorbell driver information (CIK)
 */
static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
{
	iounmap(adev->doorbell.ptr);
	adev->doorbell.ptr = NULL;
}



/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}
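
/*
 * Illustrative sketch: a ring or IP block that needs a writeback slot
 * typically pairs the two helpers above like this (error handling
 * trimmed):
 *
 *	u32 wb;
 *
 *	if (amdgpu_device_wb_get(adev, &wb))
 *		return -EINVAL;
 *	// ... the hardware writes status at adev->wb.wb[wb] ...
 *	amdgpu_device_wb_free(adev, wb);
 *
 * The returned index is in dwords; each allocated slot is 256 bits wide.
 */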

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
	u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
	struct pci_bus *root;
	struct resource *res;
	unsigned i;
	u16 cmd;
	int r;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_device_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_device_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw needs to be posted or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or if a post is needed because a hw reset was performed.
 * Returns true if a post is needed or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: in the whole-GPU pass-through virtualization case,
		 * some old smc fw still needs the driver to do a vPost after VM
		 * reboot, otherwise the gpu hangs. smc fw versions above 22.15
		 * don't have this flaw, so we force vPost only for smc versions
		 * below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;
			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if an error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @cookie: amdgpu_device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
{
	struct amdgpu_device *adev = cookie;
	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in the page table versus the
 * page directory: a page is 4KB, so we have 12 bits of offset, a minimum of
 * 9 bits in the page table, and the remaining bits in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}
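
/*
 * Worked example (illustrative): with the minimum amdgpu_vm_block_size
 * of 9, a page table has 2^9 = 512 entries and, at 4KB per page, maps
 * 2MB of address space per table; the remaining VA bits above the
 * 12-bit page offset and the 9 page table bits are resolved by the
 * page directory levels.
 */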

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	amdgpu_gmc_tmz_set(adev);

	if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) {
		amdgpu_num_kcq = 8;
		dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid parameter provided by user\n");
	}

	return 0;
}
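
/*
 * Illustrative example of the clamping above: booting with
 * amdgpu.sched_jobs=6 warns and rounds the value up to
 * roundup_pow_of_two(6) == 8, while amdgpu.sched_jobs=2 is raised to
 * the minimum of 4.
 */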

/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes the
 * asic before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(dev->pdev, PCI_D0);
		amdgpu_device_load_pci_state(dev->pdev);
		r = pci_enable_device(dev->pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
		drm_kms_helper_poll_enable(dev);
	} else {
		pr_info("switched off\n");
		drm_kms_helper_poll_disable(dev);
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(dev->pdev);
		/* Shut down the device */
		pci_disable_device(dev->pdev);
		pci_set_power_state(dev->pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u32 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}
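
/*
 * Illustrative sketch: callers gate or ungate a block by passing the
 * IP type and the desired state, e.g.:
 *
 *	amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *					       AMD_CG_STATE_GATE);
 *	amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_UVD,
 *					       AMD_PG_STATE_UNGATE);
 */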

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * Returns 0 if the IP block version is equal to or greater than the
 * given version, or 1 if it is smaller or the ip_block doesn't exist.
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		  ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}
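
/*
 * Illustrative sketch: version-gating a code path with the compare
 * helper above, which returns 0 when the block is at least the given
 * version:
 *
 *	if (!amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX,
 *						9, 0)) {
 *		// GFX v9.0 or newer is present on this asic
 *	}
 */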

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		struct drm_device *ddev = adev_to_drm(adev);
		const char *pci_address_name = pci_name(ddev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}
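
/*
 * Illustrative example: the parameter is a semicolon-separated list of
 * "<pci address>,<crtc count>" entries, or "all". For instance (the
 * address shown is hypothetical):
 *
 *	amdgpu.virtual_display=0000:04:00.0,2
 *
 * enables the feature on that device with two virtual crtcs.
 */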

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	char fw_name[40];
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	if (adev->mman.discovery_bin) {
		amdgpu_discovery_get_gfx_info(adev);

		/*
		 * FIXME: The bounding box is still needed by Navi12, so
		 * temporarily read it from gpu_info firmware. Should be dropped
		 * when DAL no longer needs it.
		 */
		if (adev->asic_type != CHIP_NAVI12)
			return 0;
	}

	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_VERDE:
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_OLAND:
	case CHIP_HAINAN:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TOPAZ:
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
	case CHIP_VEGA20:
	default:
		return 0;
	case CHIP_VEGA10:
		chip_name = "vega10";
		break;
	case CHIP_VEGA12:
		chip_name = "vega12";
		break;
	case CHIP_RAVEN:
		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
			chip_name = "raven2";
		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
			chip_name = "picasso";
		else
			chip_name = "raven";
		break;
	case CHIP_ARCTURUS:
		chip_name = "arcturus";
		break;
	case CHIP_RENOIR:
		chip_name = "renoir";
		break;
	case CHIP_NAVI10:
		chip_name = "navi10";
		break;
	case CHIP_NAVI14:
		chip_name = "navi14";
		break;
	case CHIP_NAVI12:
		chip_name = "navi12";
		break;
	case CHIP_SIENNA_CICHLID:
		chip_name = "sienna_cichlid";
		break;
	case CHIP_NAVY_FLOUNDER:
		chip_name = "navy_flounder";
		break;
	}

	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
	err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
	if (err) {
		dev_err(adev->dev,
			"Failed to load gpu_info firmware \"%s\"\n",
			fw_name);
		goto out;
	}
	err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
	if (err) {
		dev_err(adev->dev,
			"Failed to validate gpu_info firmware \"%s\"\n",
			fw_name);
		goto out;
	}

	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);

	switch (hdr->version_major) {
	case 1:
	{
		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
								le32_to_cpu(hdr->header.ucode_array_offset_bytes));

		/*
		 * Should be dropped when DAL no longer needs it.
		 */
		if (adev->asic_type == CHIP_NAVI12)
			goto parse_soc_bounding_box;

		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
		adev->gfx.config.max_texture_channel_caches =
			le32_to_cpu(gpu_info_fw->gc_num_tccs);
		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
		adev->gfx.config.double_offchip_lds_buf =
			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
		adev->gfx.cu_info.max_waves_per_simd =
			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
		adev->gfx.cu_info.max_scratch_slots_per_cu =
			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
		if (hdr->version_minor >= 1) {
			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->gfx.config.num_sc_per_sh =
				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
			adev->gfx.config.num_packer_per_sc =
				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
		}

parse_soc_bounding_box:
		/*
		 * soc bounding box info is not integrated in discovery table,
		 * we always need to parse it from gpu info firmware if needed.
		 */
		if (hdr->version_minor == 2) {
			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
		}
		break;
	}
	default:
		dev_err(adev->dev,
			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
		err = -EINVAL;
		goto out;
	}
out:
	return err;
}

/**
 * amdgpu_device_ip_early_init - run early init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Early initialization pass for hardware IPs. The hardware IPs that make
 * up each asic are discovered and each IP's early_init callback is run. This
 * is the first stage in initializing the asic.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
{
	int i, r;

	amdgpu_device_enable_virtual_display(adev);

	if (amdgpu_sriov_vf(adev)) {
		r = amdgpu_virt_request_full_gpu(adev, true);
		if (r)
			return r;
	}

	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_VERDE:
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_OLAND:
	case CHIP_HAINAN:
		adev->family = AMDGPU_FAMILY_SI;
		r = si_set_ip_blocks(adev);
		if (r)
			return r;
		break;
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
		if (adev->flags & AMD_IS_APU)
			adev->family = AMDGPU_FAMILY_KV;
		else
			adev->family = AMDGPU_FAMILY_CI;

		r = cik_set_ip_blocks(adev);
		if (r)
			return r;
		break;
#endif
	case CHIP_TOPAZ:
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		if (adev->flags & AMD_IS_APU)
			adev->family = AMDGPU_FAMILY_CZ;
		else
			adev->family = AMDGPU_FAMILY_VI;

		r = vi_set_ip_blocks(adev);
		if (r)
			return r;
		break;
	case CHIP_VEGA10:
	case CHIP_VEGA12:
	case CHIP_VEGA20:
	case CHIP_RAVEN:
	case CHIP_ARCTURUS:
	case CHIP_RENOIR:
		if (adev->flags & AMD_IS_APU)
			adev->family = AMDGPU_FAMILY_RV;
		else
			adev->family = AMDGPU_FAMILY_AI;

		r = soc15_set_ip_blocks(adev);
		if (r)
			return r;
		break;
	case CHIP_NAVI10:
	case CHIP_NAVI14:
	case CHIP_NAVI12:
	case CHIP_SIENNA_CICHLID:
	case CHIP_NAVY_FLOUNDER:
		adev->family = AMDGPU_FAMILY_NV;

		r = nv_set_ip_blocks(adev);
		if (r)
			return r;
		break;
	default:
		/* FIXME: not supported yet */
		return -EINVAL;
	}

	amdgpu_amdkfd_device_probe(adev);

	adev->pm.pp_feature = amdgpu_pp_feature_mask;
	if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
		adev->pm.pp_feature &= ~PP_GFXOFF_MASK;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
			DRM_ERROR("disabled ip block: %d <%s>\n",
				  i, adev->ip_blocks[i].version->funcs->name);
			adev->ip_blocks[i].status.valid = false;
		} else {
			if (adev->ip_blocks[i].version->funcs->early_init) {
				r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
				if (r == -ENOENT) {
					adev->ip_blocks[i].status.valid = false;
				} else if (r) {
					DRM_ERROR("early_init of IP block <%s> failed %d\n",
						  adev->ip_blocks[i].version->funcs->name, r);
					return r;
				} else {
					adev->ip_blocks[i].status.valid = true;
				}
			} else {
				adev->ip_blocks[i].status.valid = true;
			}
		}
		/* get the vbios after the asic_funcs are set up */
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
			r = amdgpu_device_parse_gpu_info_fw(adev);
			if (r)
				return r;

			/* Read BIOS */
			if (!amdgpu_get_bios(adev))
				return -EINVAL;

			r = amdgpu_atombios_init(adev);
			if (r) {
				dev_err(adev->dev, "amdgpu_atombios_init failed\n");
				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
				return r;
			}
		}
	}

	adev->cg_flags &= amdgpu_cg_mask;
adev->pg_flags &= amdgpu_pg_mask; 1942 1943 return 0; 1944 } 1945 1946 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 1947 { 1948 int i, r; 1949 1950 for (i = 0; i < adev->num_ip_blocks; i++) { 1951 if (!adev->ip_blocks[i].status.sw) 1952 continue; 1953 if (adev->ip_blocks[i].status.hw) 1954 continue; 1955 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 1956 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 1957 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 1958 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 1959 if (r) { 1960 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 1961 adev->ip_blocks[i].version->funcs->name, r); 1962 return r; 1963 } 1964 adev->ip_blocks[i].status.hw = true; 1965 } 1966 } 1967 1968 return 0; 1969 } 1970 1971 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 1972 { 1973 int i, r; 1974 1975 for (i = 0; i < adev->num_ip_blocks; i++) { 1976 if (!adev->ip_blocks[i].status.sw) 1977 continue; 1978 if (adev->ip_blocks[i].status.hw) 1979 continue; 1980 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 1981 if (r) { 1982 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 1983 adev->ip_blocks[i].version->funcs->name, r); 1984 return r; 1985 } 1986 adev->ip_blocks[i].status.hw = true; 1987 } 1988 1989 return 0; 1990 } 1991 1992 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 1993 { 1994 int r = 0; 1995 int i; 1996 uint32_t smu_version; 1997 1998 if (adev->asic_type >= CHIP_VEGA10) { 1999 for (i = 0; i < adev->num_ip_blocks; i++) { 2000 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2001 continue; 2002 2003 /* no need to do the fw loading again if already done*/ 2004 if (adev->ip_blocks[i].status.hw == true) 2005 break; 2006 2007 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2008 r = adev->ip_blocks[i].version->funcs->resume(adev); 2009 if (r) { 2010 DRM_ERROR("resume of IP block <%s> failed %d\n", 2011 adev->ip_blocks[i].version->funcs->name, r); 2012 return r; 2013 } 2014 } else { 2015 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2016 if (r) { 2017 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2018 adev->ip_blocks[i].version->funcs->name, r); 2019 return r; 2020 } 2021 } 2022 2023 adev->ip_blocks[i].status.hw = true; 2024 break; 2025 } 2026 } 2027 2028 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2029 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2030 2031 return r; 2032 } 2033 2034 /** 2035 * amdgpu_device_ip_init - run init for hardware IPs 2036 * 2037 * @adev: amdgpu_device pointer 2038 * 2039 * Main initialization pass for hardware IPs. The list of all the hardware 2040 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2041 * are run. sw_init initializes the software state associated with each IP 2042 * and hw_init initializes the hardware associated with each IP. 2043 * Returns 0 on success, negative error code on failure. 
2044  */
2045 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2046 {
2047 	int i, r;
2048 
2049 	r = amdgpu_ras_init(adev);
2050 	if (r)
2051 		return r;
2052 
2053 	for (i = 0; i < adev->num_ip_blocks; i++) {
2054 		if (!adev->ip_blocks[i].status.valid)
2055 			continue;
2056 		r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2057 		if (r) {
2058 			DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2059 				  adev->ip_blocks[i].version->funcs->name, r);
2060 			goto init_failed;
2061 		}
2062 		adev->ip_blocks[i].status.sw = true;
2063 
2064 		/* need to do gmc hw init early so we can allocate gpu mem */
2065 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2066 			r = amdgpu_device_vram_scratch_init(adev);
2067 			if (r) {
2068 				DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2069 				goto init_failed;
2070 			}
2071 			r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2072 			if (r) {
2073 				DRM_ERROR("hw_init %d failed %d\n", i, r);
2074 				goto init_failed;
2075 			}
2076 			r = amdgpu_device_wb_init(adev);
2077 			if (r) {
2078 				DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2079 				goto init_failed;
2080 			}
2081 			adev->ip_blocks[i].status.hw = true;
2082 
2083 			/* right after GMC hw init, we create CSA */
2084 			if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2085 				r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2086 							       AMDGPU_GEM_DOMAIN_VRAM,
2087 							       AMDGPU_CSA_SIZE);
2088 				if (r) {
2089 					DRM_ERROR("allocate CSA failed %d\n", r);
2090 					goto init_failed;
2091 				}
2092 			}
2093 		}
2094 	}
2095 
2096 	if (amdgpu_sriov_vf(adev))
2097 		amdgpu_virt_init_data_exchange(adev);
2098 
2099 	r = amdgpu_ib_pool_init(adev);
2100 	if (r) {
2101 		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2102 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2103 		goto init_failed;
2104 	}
2105 
2106 	r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init is complete */
2107 	if (r)
2108 		goto init_failed;
2109 
2110 	r = amdgpu_device_ip_hw_init_phase1(adev);
2111 	if (r)
2112 		goto init_failed;
2113 
2114 	r = amdgpu_device_fw_loading(adev);
2115 	if (r)
2116 		goto init_failed;
2117 
2118 	r = amdgpu_device_ip_hw_init_phase2(adev);
2119 	if (r)
2120 		goto init_failed;
2121 
2122 	/*
2123 	 * Retired pages will be loaded from eeprom and reserved here;
2124 	 * this must be done after amdgpu_device_ip_hw_init_phase2, since
2125 	 * for some ASICs the RAS EEPROM code relies on the SMU being fully
2126 	 * functional for I2C communication, which is only true at this point.
2127 	 *
2128 	 * amdgpu_ras_recovery_init may fail, but the caller only cares about
2129 	 * failures caused by a bad gpu state, which stop the amdgpu init
2130 	 * process accordingly. For other failures it still releases all
2131 	 * the resources and prints an error message rather than returning a
2132 	 * negative value to the upper level.
2133 	 *
2134 	 * Note: theoretically, this should be called before all vram allocations
2135 	 * to keep retired pages from being reused.
2136 	 */
2137 	r = amdgpu_ras_recovery_init(adev);
2138 	if (r)
2139 		goto init_failed;
2140 
2141 	if (adev->gmc.xgmi.num_physical_nodes > 1)
2142 		amdgpu_xgmi_add_device(adev);
2143 	amdgpu_amdkfd_device_init(adev);
2144 
2145 	amdgpu_fru_get_product_info(adev);
2146 
2147 init_failed:
2148 	if (amdgpu_sriov_vf(adev))
2149 		amdgpu_virt_release_full_gpu(adev, true);
2150 
2151 	return r;
2152 }
2153 
2154 /**
2155  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2156  *
2157  * @adev: amdgpu_device pointer
2158  *
2159  * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2160  * this function before a GPU reset.
If the value is retained after a
2161  * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2162  */
2163 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2164 {
2165 	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2166 }
2167 
2168 /**
2169  * amdgpu_device_check_vram_lost - check if vram is valid
2170  *
2171  * @adev: amdgpu_device pointer
2172  *
2173  * Checks the reset magic value written to the gart pointer in VRAM.
2174  * The driver calls this after a GPU reset to see if the contents of
2175  * VRAM are lost or not.
2176  * Returns true if vram is lost, false if not.
2177  */
2178 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2179 {
2180 	if (memcmp(adev->gart.ptr, adev->reset_magic,
2181 			AMDGPU_RESET_MAGIC_NUM))
2182 		return true;
2183 
2184 	if (!amdgpu_in_reset(adev))
2185 		return false;
2186 
2187 	/*
2188 	 * For all ASICs with baco/mode1 reset, the VRAM is
2189 	 * always assumed to be lost.
2190 	 */
2191 	switch (amdgpu_asic_reset_method(adev)) {
2192 	case AMD_RESET_METHOD_BACO:
2193 	case AMD_RESET_METHOD_MODE1:
2194 		return true;
2195 	default:
2196 		return false;
2197 	}
2198 }
2199 
2200 /**
2201  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2202  *
2203  * @adev: amdgpu_device pointer
2204  * @state: clockgating state (gate or ungate)
2205  *
2206  * The list of all the hardware IPs that make up the asic is walked and the
2207  * set_clockgating_state callbacks are run. In the late init pass this
2208  * enables clockgating for the hardware IPs; in the fini or suspend pass
2209  * it disables clockgating.
2210  * Returns 0 on success, negative error code on failure.
2211  */
2212 
2213 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2214 				      enum amd_clockgating_state state)
2215 {
2216 	int i, j, r;
2217 
2218 	if (amdgpu_emu_mode == 1)
2219 		return 0;
2220 
2221 	for (j = 0; j < adev->num_ip_blocks; j++) {
2222 		i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2223 		if (!adev->ip_blocks[i].status.late_initialized)
2224 			continue;
2225 		/* skip CG for VCE/UVD, it's handled specially */
2226 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2227 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2228 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2229 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2230 		    adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2231 			/* enable clockgating to save power */
2232 			r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2233 										     state);
2234 			if (r) {
2235 				DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2236 					  adev->ip_blocks[i].version->funcs->name, r);
2237 				return r;
2238 			}
2239 		}
2240 	}
2241 
2242 	return 0;
2243 }
2244 
2245 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2246 {
2247 	int i, j, r;
2248 
2249 	if (amdgpu_emu_mode == 1)
2250 		return 0;
2251 
2252 	for (j = 0; j < adev->num_ip_blocks; j++) {
2253 		i = state == AMD_PG_STATE_GATE ?
j : adev->num_ip_blocks - j - 1;
2254 		if (!adev->ip_blocks[i].status.late_initialized)
2255 			continue;
2256 		/* skip PG for VCE/UVD, it's handled specially */
2257 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2258 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2259 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2260 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2261 		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
2262 			/* enable powergating to save power */
2263 			r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2264 										      state);
2265 			if (r) {
2266 				DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2267 					  adev->ip_blocks[i].version->funcs->name, r);
2268 				return r;
2269 			}
2270 		}
2271 	}
2272 	return 0;
2273 }
2274 
2275 static int amdgpu_device_enable_mgpu_fan_boost(void)
2276 {
2277 	struct amdgpu_gpu_instance *gpu_ins;
2278 	struct amdgpu_device *adev;
2279 	int i, ret = 0;
2280 
2281 	mutex_lock(&mgpu_info.mutex);
2282 
2283 	/*
2284 	 * MGPU fan boost feature should be enabled
2285 	 * only when there are two or more dGPUs in
2286 	 * the system
2287 	 */
2288 	if (mgpu_info.num_dgpu < 2)
2289 		goto out;
2290 
2291 	for (i = 0; i < mgpu_info.num_dgpu; i++) {
2292 		gpu_ins = &(mgpu_info.gpu_ins[i]);
2293 		adev = gpu_ins->adev;
2294 		if (!(adev->flags & AMD_IS_APU) &&
2295 		    !gpu_ins->mgpu_fan_enabled) {
2296 			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2297 			if (ret)
2298 				break;
2299 
2300 			gpu_ins->mgpu_fan_enabled = 1;
2301 		}
2302 	}
2303 
2304 out:
2305 	mutex_unlock(&mgpu_info.mutex);
2306 
2307 	return ret;
2308 }
2309 
2310 /**
2311  * amdgpu_device_ip_late_init - run late init for hardware IPs
2312  *
2313  * @adev: amdgpu_device pointer
2314  *
2315  * Late initialization pass for hardware IPs. The list of all the hardware
2316  * IPs that make up the asic is walked and the late_init callbacks are run.
2317  * late_init covers any special initialization that an IP requires
2318  * after all of the IPs have been initialized or anything that needs to happen
2319  * late in the init process.
2320  * Returns 0 on success, negative error code on failure.
2321  */
2322 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2323 {
2324 	struct amdgpu_gpu_instance *gpu_instance;
2325 	int i = 0, r;
2326 
2327 	for (i = 0; i < adev->num_ip_blocks; i++) {
2328 		if (!adev->ip_blocks[i].status.hw)
2329 			continue;
2330 		if (adev->ip_blocks[i].version->funcs->late_init) {
2331 			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2332 			if (r) {
2333 				DRM_ERROR("late_init of IP block <%s> failed %d\n",
2334 					  adev->ip_blocks[i].version->funcs->name, r);
2335 				return r;
2336 			}
2337 		}
2338 		adev->ip_blocks[i].status.late_initialized = true;
2339 	}
2340 
2341 	amdgpu_ras_set_error_query_ready(adev, true);
2342 
2343 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2344 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2345 
2346 	amdgpu_device_fill_reset_magic(adev);
2347 
2348 	r = amdgpu_device_enable_mgpu_fan_boost();
2349 	if (r)
2350 		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2351 
2352 
2353 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
2354 		mutex_lock(&mgpu_info.mutex);
2355 
2356 		/*
2357 		 * Reset device p-state to low as this was booted with high.
2358 		 *
2359 		 * This should be performed only after all devices from the same
2360 		 * hive get initialized.
2361 		 *
2362 		 * However, the number of devices in the hive is not known in
2363 		 * advance; it is counted one by one as the devices initialize.
2364 		 *
2365 		 * So we wait for all XGMI interlinked devices to finish
2366 		 * initializing. This may add some delay, since those devices
2367 		 * may come from different hives, but that should be OK.
2368 		 */
2369 		if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2370 			for (i = 0; i < mgpu_info.num_gpu; i++) {
2371 				gpu_instance = &(mgpu_info.gpu_ins[i]);
2372 				if (gpu_instance->adev->flags & AMD_IS_APU)
2373 					continue;
2374 
2375 				r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2376 							   AMDGPU_XGMI_PSTATE_MIN);
2377 				if (r) {
2378 					DRM_ERROR("pstate setting failed (%d).\n", r);
2379 					break;
2380 				}
2381 			}
2382 		}
2383 
2384 		mutex_unlock(&mgpu_info.mutex);
2385 	}
2386 
2387 	return 0;
2388 }
2389 
2390 /**
2391  * amdgpu_device_ip_fini - run fini for hardware IPs
2392  *
2393  * @adev: amdgpu_device pointer
2394  *
2395  * Main teardown pass for hardware IPs. The list of all the hardware
2396  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2397  * are run. hw_fini tears down the hardware associated with each IP
2398  * and sw_fini tears down any software state associated with each IP.
2399  * Returns 0 on success, negative error code on failure.
2400  */
2401 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2402 {
2403 	int i, r;
2404 
2405 	if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2406 		amdgpu_virt_release_ras_err_handler_data(adev);
2407 
2408 	amdgpu_ras_pre_fini(adev);
2409 
2410 	if (adev->gmc.xgmi.num_physical_nodes > 1)
2411 		amdgpu_xgmi_remove_device(adev);
2412 
2413 	amdgpu_amdkfd_device_fini(adev);
2414 
2415 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2416 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2417 
2418 	/* need to disable SMC first */
2419 	for (i = 0; i < adev->num_ip_blocks; i++) {
2420 		if (!adev->ip_blocks[i].status.hw)
2421 			continue;
2422 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2423 			r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2424 			/* XXX handle errors */
2425 			if (r) {
2426 				DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2427 					  adev->ip_blocks[i].version->funcs->name, r);
2428 			}
2429 			adev->ip_blocks[i].status.hw = false;
2430 			break;
2431 		}
2432 	}
2433 
2434 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2435 		if (!adev->ip_blocks[i].status.hw)
2436 			continue;
2437 
2438 		r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2439 		/* XXX handle errors */
2440 		if (r) {
2441 			DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2442 				  adev->ip_blocks[i].version->funcs->name, r);
2443 		}
2444 
2445 		adev->ip_blocks[i].status.hw = false;
2446 	}
2447 
2448 
2449 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2450 		if (!adev->ip_blocks[i].status.sw)
2451 			continue;
2452 
2453 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2454 			amdgpu_ucode_free_bo(adev);
2455 			amdgpu_free_static_csa(&adev->virt.csa_obj);
2456 			amdgpu_device_wb_fini(adev);
2457 			amdgpu_device_vram_scratch_fini(adev);
2458 			amdgpu_ib_pool_fini(adev);
2459 		}
2460 
2461 		r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2462 		/* XXX handle errors */
2463 		if (r) {
2464 			DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2465 				  adev->ip_blocks[i].version->funcs->name, r);
2466 		}
2467 		adev->ip_blocks[i].status.sw = false;
2468 		adev->ip_blocks[i].status.valid = false;
2469 	}
2470 
2471 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2472 		if (!adev->ip_blocks[i].status.late_initialized)
2473 			continue;
2474 		if (adev->ip_blocks[i].version->funcs->late_fini)
2475 			adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2476 
adev->ip_blocks[i].status.late_initialized = false; 2477 } 2478 2479 amdgpu_ras_fini(adev); 2480 2481 if (amdgpu_sriov_vf(adev)) 2482 if (amdgpu_virt_release_full_gpu(adev, false)) 2483 DRM_ERROR("failed to release exclusive mode on fini\n"); 2484 2485 return 0; 2486 } 2487 2488 /** 2489 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2490 * 2491 * @work: work_struct. 2492 */ 2493 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2494 { 2495 struct amdgpu_device *adev = 2496 container_of(work, struct amdgpu_device, delayed_init_work.work); 2497 int r; 2498 2499 r = amdgpu_ib_ring_tests(adev); 2500 if (r) 2501 DRM_ERROR("ib ring test failed (%d).\n", r); 2502 } 2503 2504 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2505 { 2506 struct amdgpu_device *adev = 2507 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2508 2509 mutex_lock(&adev->gfx.gfx_off_mutex); 2510 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { 2511 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2512 adev->gfx.gfx_off_state = true; 2513 } 2514 mutex_unlock(&adev->gfx.gfx_off_mutex); 2515 } 2516 2517 /** 2518 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2519 * 2520 * @adev: amdgpu_device pointer 2521 * 2522 * Main suspend function for hardware IPs. The list of all the hardware 2523 * IPs that make up the asic is walked, clockgating is disabled and the 2524 * suspend callbacks are run. suspend puts the hardware and software state 2525 * in each IP into a state suitable for suspend. 2526 * Returns 0 on success, negative error code on failure. 2527 */ 2528 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2529 { 2530 int i, r; 2531 2532 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2533 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2534 2535 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2536 if (!adev->ip_blocks[i].status.valid) 2537 continue; 2538 2539 /* displays are handled separately */ 2540 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2541 continue; 2542 2543 /* XXX handle errors */ 2544 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2545 /* XXX handle errors */ 2546 if (r) { 2547 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2548 adev->ip_blocks[i].version->funcs->name, r); 2549 return r; 2550 } 2551 2552 adev->ip_blocks[i].status.hw = false; 2553 } 2554 2555 return 0; 2556 } 2557 2558 /** 2559 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2560 * 2561 * @adev: amdgpu_device pointer 2562 * 2563 * Main suspend function for hardware IPs. The list of all the hardware 2564 * IPs that make up the asic is walked, clockgating is disabled and the 2565 * suspend callbacks are run. suspend puts the hardware and software state 2566 * in each IP into a state suitable for suspend. 2567 * Returns 0 on success, negative error code on failure. 
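 *
 * Phase 1 handles only the display (DCE) blocks; this phase walks all
 * remaining blocks in reverse order, as the loop below shows.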
2568  */
2569 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2570 {
2571 	int i, r;
2572 
2573 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2574 		if (!adev->ip_blocks[i].status.valid)
2575 			continue;
2576 		/* displays are handled in phase1 */
2577 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2578 			continue;
2579 		/* PSP lost connection when err_event_athub occurs */
2580 		if (amdgpu_ras_intr_triggered() &&
2581 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2582 			adev->ip_blocks[i].status.hw = false;
2583 			continue;
2584 		}
2585 		/* XXX handle errors */
2586 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
2587 		/* XXX handle errors */
2588 		if (r) {
2589 			DRM_ERROR("suspend of IP block <%s> failed %d\n",
2590 				  adev->ip_blocks[i].version->funcs->name, r);
2591 		}
2592 		adev->ip_blocks[i].status.hw = false;
2593 		/* handle putting the SMC in the appropriate state */
2594 		if (!amdgpu_sriov_vf(adev)) {
2595 			if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2596 				r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2597 				if (r) {
2598 					DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2599 						  adev->mp1_state, r);
2600 					return r;
2601 				}
2602 			}
2603 		}
2605 	}
2606 
2607 	return 0;
2608 }
2609 
2610 /**
2611  * amdgpu_device_ip_suspend - run suspend for hardware IPs
2612  *
2613  * @adev: amdgpu_device pointer
2614  *
2615  * Main suspend function for hardware IPs. The list of all the hardware
2616  * IPs that make up the asic is walked, clockgating is disabled and the
2617  * suspend callbacks are run. suspend puts the hardware and software state
2618  * in each IP into a state suitable for suspend.
2619  * Returns 0 on success, negative error code on failure.
2620  */
2621 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2622 {
2623 	int r;
2624 
2625 	if (amdgpu_sriov_vf(adev))
2626 		amdgpu_virt_request_full_gpu(adev, false);
2627 
2628 	r = amdgpu_device_ip_suspend_phase1(adev);
2629 	if (r)
2630 		return r;
2631 	r = amdgpu_device_ip_suspend_phase2(adev);
2632 
2633 	if (amdgpu_sriov_vf(adev))
2634 		amdgpu_virt_release_full_gpu(adev, false);
2635 
2636 	return r;
2637 }
2638 
2639 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
2640 {
2641 	int i, r;
2642 
2643 	static enum amd_ip_block_type ip_order[] = {
2644 		AMD_IP_BLOCK_TYPE_GMC,
2645 		AMD_IP_BLOCK_TYPE_COMMON,
2646 		AMD_IP_BLOCK_TYPE_PSP,
2647 		AMD_IP_BLOCK_TYPE_IH,
2648 	};
2649 
2650 	for (i = 0; i < adev->num_ip_blocks; i++) {
2651 		int j;
2652 		struct amdgpu_ip_block *block;
2653 
2654 		block = &adev->ip_blocks[i];
2655 		block->status.hw = false;
2656 
2657 		for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2658 
2659 			if (block->version->type != ip_order[j] ||
2660 			    !block->status.valid)
2661 				continue;
2662 
2663 			r = block->version->funcs->hw_init(adev);
2664 			DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r ? "failed" : "succeeded");
2665 			if (r)
2666 				return r;
2667 			block->status.hw = true;
2668 		}
2669 	}
2670 
2671 	return 0;
2672 }
2673 
2674 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
2675 {
2676 	int i, r;
2677 
2678 	static enum amd_ip_block_type ip_order[] = {
2679 		AMD_IP_BLOCK_TYPE_SMC,
2680 		AMD_IP_BLOCK_TYPE_DCE,
2681 		AMD_IP_BLOCK_TYPE_GFX,
2682 		AMD_IP_BLOCK_TYPE_SDMA,
2683 		AMD_IP_BLOCK_TYPE_UVD,
2684 		AMD_IP_BLOCK_TYPE_VCE,
2685 		AMD_IP_BLOCK_TYPE_VCN
2686 	};
2687 
2688 	for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2689 		int j;
2690 		struct amdgpu_ip_block *block;
2691 
2692 		for (j = 0; j < adev->num_ip_blocks; j++) {
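			/* find each block of the current ip_order type that still needs hw init */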
2693 			block = &adev->ip_blocks[j];
2694 
2695 			if (block->version->type != ip_order[i] ||
2696 			    !block->status.valid ||
2697 			    block->status.hw)
2698 				continue;
2699 
2700 			if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2701 				r = block->version->funcs->resume(adev);
2702 			else
2703 				r = block->version->funcs->hw_init(adev);
2704 
2705 			DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r ? "failed" : "succeeded");
2706 			if (r)
2707 				return r;
2708 			block->status.hw = true;
2709 		}
2710 	}
2711 
2712 	return 0;
2713 }
2714 
2715 /**
2716  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2717  *
2718  * @adev: amdgpu_device pointer
2719  *
2720  * First resume function for hardware IPs. The list of all the hardware
2721  * IPs that make up the asic is walked and the resume callbacks are run for
2722  * COMMON, GMC, and IH. resume puts the hardware into a functional state
2723  * after a suspend and updates the software state as necessary. This
2724  * function is also used for restoring the GPU after a GPU reset.
2725  * Returns 0 on success, negative error code on failure.
2726  */
2727 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2728 {
2729 	int i, r;
2730 
2731 	for (i = 0; i < adev->num_ip_blocks; i++) {
2732 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2733 			continue;
2734 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2735 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2736 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2737 
2738 			r = adev->ip_blocks[i].version->funcs->resume(adev);
2739 			if (r) {
2740 				DRM_ERROR("resume of IP block <%s> failed %d\n",
2741 					  adev->ip_blocks[i].version->funcs->name, r);
2742 				return r;
2743 			}
2744 			adev->ip_blocks[i].status.hw = true;
2745 		}
2746 	}
2747 
2748 	return 0;
2749 }
2750 
2751 /**
2752  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2753  *
2754  * @adev: amdgpu_device pointer
2755  *
2756  * Second resume function for hardware IPs. The list of all the hardware
2757  * IPs that make up the asic is walked and the resume callbacks are run for
2758  * all blocks except COMMON, GMC, IH, and PSP (PSP is brought back up by
2759  * amdgpu_device_fw_loading between the two phases). resume puts the
2759  * hardware into a functional state after a suspend and updates the
2760  * software state as necessary. This function is also used for restoring
2761  * the GPU after a GPU reset.
2762  * Returns 0 on success, negative error code on failure.
2763  */
2764 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2765 {
2766 	int i, r;
2767 
2768 	for (i = 0; i < adev->num_ip_blocks; i++) {
2769 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2770 			continue;
2771 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2772 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2773 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2774 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2775 			continue;
2776 		r = adev->ip_blocks[i].version->funcs->resume(adev);
2777 		if (r) {
2778 			DRM_ERROR("resume of IP block <%s> failed %d\n",
2779 				  adev->ip_blocks[i].version->funcs->name, r);
2780 			return r;
2781 		}
2782 		adev->ip_blocks[i].status.hw = true;
2783 	}
2784 
2785 	return 0;
2786 }
2787 
2788 /**
2789  * amdgpu_device_ip_resume - run resume for hardware IPs
2790  *
2791  * @adev: amdgpu_device pointer
2792  *
2793  * Main resume function for hardware IPs.
The hardware IPs
2794  * are split into two resume functions because they are
2795  * also used in recovering from a GPU reset, where some additional
2796  * steps need to be taken between them. In this case (S3/S4) they are
2797  * run sequentially.
2798  * Returns 0 on success, negative error code on failure.
2799  */
2800 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
2801 {
2802 	int r;
2803 
2804 	r = amdgpu_device_ip_resume_phase1(adev);
2805 	if (r)
2806 		return r;
2807 
2808 	r = amdgpu_device_fw_loading(adev);
2809 	if (r)
2810 		return r;
2811 
2812 	r = amdgpu_device_ip_resume_phase2(adev);
2813 
2814 	return r;
2815 }
2816 
2817 /**
2818  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2819  *
2820  * @adev: amdgpu_device pointer
2821  *
2822  * Query the VBIOS data tables to determine if the board supports SR-IOV.
2823  */
2824 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
2825 {
2826 	if (amdgpu_sriov_vf(adev)) {
2827 		if (adev->is_atom_fw) {
2828 			if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2829 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2830 		} else {
2831 			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2832 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2833 		}
2834 
2835 		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2836 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
2837 	}
2838 }
2839 
2840 /**
2841  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2842  *
2843  * @asic_type: AMD asic type
2844  *
2845  * Check if there is DC (new modesetting infrastructure) support for an asic.
2846  * Returns true if DC has support, false if not.
2847  */
2848 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
2849 {
2850 	switch (asic_type) {
2851 #if defined(CONFIG_DRM_AMD_DC)
2852 #if defined(CONFIG_DRM_AMD_DC_SI)
2853 	case CHIP_TAHITI:
2854 	case CHIP_PITCAIRN:
2855 	case CHIP_VERDE:
2856 	case CHIP_OLAND:
2857 #endif
2858 	case CHIP_BONAIRE:
2859 	case CHIP_KAVERI:
2860 	case CHIP_KABINI:
2861 	case CHIP_MULLINS:
2862 		/*
2863 		 * We have systems in the wild with these ASICs that require
2864 		 * LVDS and VGA support which is not supported with DC.
2865 		 *
2866 		 * Fallback to the non-DC driver here by default so as not to
2867 		 * cause regressions.
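		 *
		 * (Passing amdgpu.dc=1 on the kernel command line still opts
		 * these ASICs in to DC, as the return statement below shows.)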
2868 		 */
2869 		return amdgpu_dc > 0;
2870 	case CHIP_HAWAII:
2871 	case CHIP_CARRIZO:
2872 	case CHIP_STONEY:
2873 	case CHIP_POLARIS10:
2874 	case CHIP_POLARIS11:
2875 	case CHIP_POLARIS12:
2876 	case CHIP_VEGAM:
2877 	case CHIP_TONGA:
2878 	case CHIP_FIJI:
2879 	case CHIP_VEGA10:
2880 	case CHIP_VEGA12:
2881 	case CHIP_VEGA20:
2882 #if defined(CONFIG_DRM_AMD_DC_DCN)
2883 	case CHIP_RAVEN:
2884 	case CHIP_NAVI10:
2885 	case CHIP_NAVI14:
2886 	case CHIP_NAVI12:
2887 	case CHIP_RENOIR:
2888 #endif
2889 #if defined(CONFIG_DRM_AMD_DC_DCN3_0)
2890 	case CHIP_SIENNA_CICHLID:
2891 	case CHIP_NAVY_FLOUNDER:
2892 #endif
2893 		return amdgpu_dc != 0;
2894 #endif
2895 	default:
2896 		if (amdgpu_dc > 0)
2897 			DRM_INFO("Display Core has been requested via kernel parameter "
2898 				 "but isn't supported by ASIC, ignoring\n");
2899 		return false;
2900 	}
2901 }
2902 
2903 /**
2904  * amdgpu_device_has_dc_support - check if dc is supported
2905  *
2906  * @adev: amdgpu_device pointer
2907  *
2908  * Returns true for supported, false for not supported
2909  */
2910 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
2911 {
2912 	if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display)
2913 		return false;
2914 
2915 	return amdgpu_device_asic_has_dc_support(adev->asic_type);
2916 }
2917 
2918 
2919 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
2920 {
2921 	struct amdgpu_device *adev =
2922 		container_of(__work, struct amdgpu_device, xgmi_reset_work);
2923 	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2924 
2925 	/* It's a bug to not have a hive within this function */
2926 	if (WARN_ON(!hive))
2927 		return;
2928 
2929 	/*
2930 	 * Use task barrier to synchronize all xgmi reset works across the
2931 	 * hive. task_barrier_enter and task_barrier_exit will block
2932 	 * until all the threads running the xgmi reset works reach
2933 	 * those points. task_barrier_full will do both blocks.
2934 	 */
2935 	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
2936 
2937 		task_barrier_enter(&hive->tb);
2938 		adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
2939 
2940 		if (adev->asic_reset_res)
2941 			goto fail;
2942 
2943 		task_barrier_exit(&hive->tb);
2944 		adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
2945 
2946 		if (adev->asic_reset_res)
2947 			goto fail;
2948 
2949 		if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
2950 			adev->mmhub.funcs->reset_ras_error_count(adev);
2951 	} else {
2952 
2953 		task_barrier_full(&hive->tb);
2954 		adev->asic_reset_res = amdgpu_asic_reset(adev);
2955 	}
2956 
2957 fail:
2958 	if (adev->asic_reset_res)
2959 		DRM_WARN("ASIC reset failed with error %d for drm dev %s",
2960 			 adev->asic_reset_res, adev_to_drm(adev)->unique);
2961 	amdgpu_put_xgmi_hive(hive);
2962 }
2963 
2964 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
2965 {
2966 	char *input = amdgpu_lockup_timeout;
2967 	char *timeout_setting = NULL;
2968 	int index = 0;
2969 	long timeout;
2970 	int ret = 0;
2971 
2972 	/*
2973 	 * By default the timeout for non-compute jobs is 10000 ms,
2974 	 * and there is no timeout enforced on compute jobs.
2975 	 * In SR-IOV or passthrough mode, the timeout for compute
2976 	 * jobs is 60000 ms by default.
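	 *
	 * The comma-separated fields of the amdgpu.lockup_timeout module
	 * parameter map onto the switch below in the order gfx, compute,
	 * sdma, video. An illustrative (not prescriptive) setting, in ms:
	 *
	 *   modprobe amdgpu lockup_timeout=10000,60000,10000,10000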
2977 */ 2978 adev->gfx_timeout = msecs_to_jiffies(10000); 2979 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 2980 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 2981 adev->compute_timeout = msecs_to_jiffies(60000); 2982 else 2983 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; 2984 2985 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 2986 while ((timeout_setting = strsep(&input, ",")) && 2987 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 2988 ret = kstrtol(timeout_setting, 0, &timeout); 2989 if (ret) 2990 return ret; 2991 2992 if (timeout == 0) { 2993 index++; 2994 continue; 2995 } else if (timeout < 0) { 2996 timeout = MAX_SCHEDULE_TIMEOUT; 2997 } else { 2998 timeout = msecs_to_jiffies(timeout); 2999 } 3000 3001 switch (index++) { 3002 case 0: 3003 adev->gfx_timeout = timeout; 3004 break; 3005 case 1: 3006 adev->compute_timeout = timeout; 3007 break; 3008 case 2: 3009 adev->sdma_timeout = timeout; 3010 break; 3011 case 3: 3012 adev->video_timeout = timeout; 3013 break; 3014 default: 3015 break; 3016 } 3017 } 3018 /* 3019 * There is only one value specified and 3020 * it should apply to all non-compute jobs. 3021 */ 3022 if (index == 1) { 3023 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3024 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3025 adev->compute_timeout = adev->gfx_timeout; 3026 } 3027 } 3028 3029 return ret; 3030 } 3031 3032 static const struct attribute *amdgpu_dev_attributes[] = { 3033 &dev_attr_product_name.attr, 3034 &dev_attr_product_number.attr, 3035 &dev_attr_serial_number.attr, 3036 &dev_attr_pcie_replay_count.attr, 3037 NULL 3038 }; 3039 3040 3041 /** 3042 * amdgpu_device_init - initialize the driver 3043 * 3044 * @adev: amdgpu_device pointer 3045 * @flags: driver flags 3046 * 3047 * Initializes the driver info and hw (all asics). 3048 * Returns 0 for success or an error on failure. 3049 * Called at driver startup. 
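 *
 * The low bits of @flags carry the ASIC type (masked out via
 * AMD_ASIC_MASK below), while feature flags such as AMD_IS_APU live in
 * the higher bits.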
3050  */
3051 int amdgpu_device_init(struct amdgpu_device *adev,
3052 		       uint32_t flags)
3053 {
3054 	struct drm_device *ddev = adev_to_drm(adev);
3055 	struct pci_dev *pdev = adev->pdev;
3056 	int r, i;
3057 	bool boco = false;
3058 	u32 max_MBps;
3059 
3060 	adev->shutdown = false;
3061 	adev->flags = flags;
3062 
3063 	if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3064 		adev->asic_type = amdgpu_force_asic_type;
3065 	else
3066 		adev->asic_type = flags & AMD_ASIC_MASK;
3067 
3068 	adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3069 	if (amdgpu_emu_mode == 1)
3070 		adev->usec_timeout *= 10;
3071 	adev->gmc.gart_size = 512 * 1024 * 1024;
3072 	adev->accel_working = false;
3073 	adev->num_rings = 0;
3074 	adev->mman.buffer_funcs = NULL;
3075 	adev->mman.buffer_funcs_ring = NULL;
3076 	adev->vm_manager.vm_pte_funcs = NULL;
3077 	adev->vm_manager.vm_pte_num_scheds = 0;
3078 	adev->gmc.gmc_funcs = NULL;
3079 	adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3080 	bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3081 
3082 	adev->smc_rreg = &amdgpu_invalid_rreg;
3083 	adev->smc_wreg = &amdgpu_invalid_wreg;
3084 	adev->pcie_rreg = &amdgpu_invalid_rreg;
3085 	adev->pcie_wreg = &amdgpu_invalid_wreg;
3086 	adev->pciep_rreg = &amdgpu_invalid_rreg;
3087 	adev->pciep_wreg = &amdgpu_invalid_wreg;
3088 	adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3089 	adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3090 	adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3091 	adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3092 	adev->didt_rreg = &amdgpu_invalid_rreg;
3093 	adev->didt_wreg = &amdgpu_invalid_wreg;
3094 	adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3095 	adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3096 	adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3097 	adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3098 
3099 	DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3100 		 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3101 		 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3102 
3103 	/* mutex initializations are all done here so we
3104 	 * can recall the functions without having locking issues */
3105 	atomic_set(&adev->irq.ih.lock, 0);
3106 	mutex_init(&adev->firmware.mutex);
3107 	mutex_init(&adev->pm.mutex);
3108 	mutex_init(&adev->gfx.gpu_clock_mutex);
3109 	mutex_init(&adev->srbm_mutex);
3110 	mutex_init(&adev->gfx.pipe_reserve_mutex);
3111 	mutex_init(&adev->gfx.gfx_off_mutex);
3112 	mutex_init(&adev->grbm_idx_mutex);
3113 	mutex_init(&adev->mn_lock);
3114 	mutex_init(&adev->virt.vf_errors.lock);
3115 	hash_init(adev->mn_hash);
3116 	atomic_set(&adev->in_gpu_reset, 0);
3117 	init_rwsem(&adev->reset_sem);
3118 	mutex_init(&adev->psp.mutex);
3119 	mutex_init(&adev->notifier_lock);
3120 
3121 	r = amdgpu_device_check_arguments(adev);
3122 	if (r)
3123 		return r;
3124 
3125 	spin_lock_init(&adev->mmio_idx_lock);
3126 	spin_lock_init(&adev->smc_idx_lock);
3127 	spin_lock_init(&adev->pcie_idx_lock);
3128 	spin_lock_init(&adev->uvd_ctx_idx_lock);
3129 	spin_lock_init(&adev->didt_idx_lock);
3130 	spin_lock_init(&adev->gc_cac_idx_lock);
3131 	spin_lock_init(&adev->se_cac_idx_lock);
3132 	spin_lock_init(&adev->audio_endpt_idx_lock);
3133 	spin_lock_init(&adev->mm_stats.lock);
3134 
3135 	INIT_LIST_HEAD(&adev->shadow_list);
3136 	mutex_init(&adev->shadow_list_lock);
3137 
3138 	INIT_DELAYED_WORK(&adev->delayed_init_work,
3139 			  amdgpu_device_delayed_init_work_handler);
3140 	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3141 			  amdgpu_device_delay_enable_gfx_off);
3142 
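	/* xgmi_reset_work lets every device in an XGMI hive run its part of a
	 * hive-wide reset in lockstep (see amdgpu_device_xgmi_reset_func) */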
INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3144 
3145 	adev->gfx.gfx_off_req_count = 1;
3146 	adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3147 
3148 	atomic_set(&adev->throttling_logging_enabled, 1);
3149 	/*
3150 	 * If throttling continues, logging will be performed every minute
3151 	 * to avoid log flooding. "-1" is subtracted since the thermal
3152 	 * throttling interrupt comes every second. Thus, the total logging
3153 	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3154 	 * for the throttling interrupt) = 60 seconds.
3155 	 */
3156 	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3157 	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3158 
3159 	/* Registers mapping */
3160 	/* TODO: block userspace mapping of io register */
3161 	if (adev->asic_type >= CHIP_BONAIRE) {
3162 		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3163 		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3164 	} else {
3165 		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3166 		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3167 	}
3168 
3169 	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3170 	if (adev->rmmio == NULL) {
3171 		return -ENOMEM;
3172 	}
3173 	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3174 	DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3175 
3176 	/* io port mapping */
3177 	for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
3178 		if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
3179 			adev->rio_mem_size = pci_resource_len(adev->pdev, i);
3180 			adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
3181 			break;
3182 		}
3183 	}
3184 	if (adev->rio_mem == NULL)
3185 		DRM_INFO("PCI I/O BAR is not found.\n");
3186 
3187 	/* enable PCIE atomic ops */
3188 	r = pci_enable_atomic_ops_to_root(adev->pdev,
3189 					  PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3190 					  PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3191 	if (r) {
3192 		adev->have_atomics_support = false;
3193 		DRM_INFO("PCIE atomic ops are not supported\n");
3194 	} else {
3195 		adev->have_atomics_support = true;
3196 	}
3197 
3198 	amdgpu_device_get_pcie_info(adev);
3199 
3200 	if (amdgpu_mcbp)
3201 		DRM_INFO("MCBP is enabled\n");
3202 
3203 	if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3204 		adev->enable_mes = true;
3205 
3206 	/* detect hw virtualization here */
3207 	amdgpu_detect_virtualization(adev);
3208 
3209 	r = amdgpu_device_get_job_timeout_settings(adev);
3210 	if (r) {
3211 		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3212 		goto failed_unmap;
3213 	}
3214 
3215 	/* early init functions */
3216 	r = amdgpu_device_ip_early_init(adev);
3217 	if (r)
3218 		goto failed_unmap;
3219 
3220 	/* doorbell bar mapping and doorbell index init */
3221 	amdgpu_device_doorbell_init(adev);
3222 
3223 	/* if we have more than one VGA card, then disable the amdgpu VGA resources */
3224 	/* this will fail for cards that aren't VGA class devices, just
3225 	 * ignore it */
3226 	vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3227 
3228 	if (amdgpu_device_supports_boco(ddev))
3229 		boco = true;
3230 	if (amdgpu_has_atpx() &&
3231 	    (amdgpu_is_atpx_hybrid() ||
3232 	     amdgpu_has_atpx_dgpu_power_cntl()) &&
3233 	    !pci_is_thunderbolt_attached(adev->pdev))
3234 		vga_switcheroo_register_client(adev->pdev,
3235 					       &amdgpu_switcheroo_ops, boco);
3236 	if (boco)
3237 		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3238 
3239 	if (amdgpu_emu_mode == 1) {
3240 		/* post the asic on emulation mode */
3241 		emu_soc_asic_init(adev);
3242 		goto
fence_driver_init; 3243 } 3244 3245 /* detect if we are with an SRIOV vbios */ 3246 amdgpu_device_detect_sriov_bios(adev); 3247 3248 /* check if we need to reset the asic 3249 * E.g., driver was not cleanly unloaded previously, etc. 3250 */ 3251 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3252 r = amdgpu_asic_reset(adev); 3253 if (r) { 3254 dev_err(adev->dev, "asic reset on init failed\n"); 3255 goto failed; 3256 } 3257 } 3258 3259 pci_enable_pcie_error_reporting(adev->ddev.pdev); 3260 3261 /* Post card if necessary */ 3262 if (amdgpu_device_need_post(adev)) { 3263 if (!adev->bios) { 3264 dev_err(adev->dev, "no vBIOS found\n"); 3265 r = -EINVAL; 3266 goto failed; 3267 } 3268 DRM_INFO("GPU posting now...\n"); 3269 r = amdgpu_device_asic_init(adev); 3270 if (r) { 3271 dev_err(adev->dev, "gpu post error!\n"); 3272 goto failed; 3273 } 3274 } 3275 3276 if (adev->is_atom_fw) { 3277 /* Initialize clocks */ 3278 r = amdgpu_atomfirmware_get_clock_info(adev); 3279 if (r) { 3280 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3281 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3282 goto failed; 3283 } 3284 } else { 3285 /* Initialize clocks */ 3286 r = amdgpu_atombios_get_clock_info(adev); 3287 if (r) { 3288 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3289 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3290 goto failed; 3291 } 3292 /* init i2c buses */ 3293 if (!amdgpu_device_has_dc_support(adev)) 3294 amdgpu_atombios_i2c_init(adev); 3295 } 3296 3297 fence_driver_init: 3298 /* Fence driver */ 3299 r = amdgpu_fence_driver_init(adev); 3300 if (r) { 3301 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n"); 3302 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3303 goto failed; 3304 } 3305 3306 /* init the mode config */ 3307 drm_mode_config_init(adev_to_drm(adev)); 3308 3309 r = amdgpu_device_ip_init(adev); 3310 if (r) { 3311 /* failed in exclusive mode due to timeout */ 3312 if (amdgpu_sriov_vf(adev) && 3313 !amdgpu_sriov_runtime(adev) && 3314 amdgpu_virt_mmio_blocked(adev) && 3315 !amdgpu_virt_wait_reset(adev)) { 3316 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3317 /* Don't send request since VF is inactive. */ 3318 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3319 adev->virt.ops = NULL; 3320 r = -EAGAIN; 3321 goto failed; 3322 } 3323 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3324 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3325 goto failed; 3326 } 3327 3328 dev_info(adev->dev, 3329 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3330 adev->gfx.config.max_shader_engines, 3331 adev->gfx.config.max_sh_per_se, 3332 adev->gfx.config.max_cu_per_sh, 3333 adev->gfx.cu_info.number); 3334 3335 adev->accel_working = true; 3336 3337 amdgpu_vm_check_compute_bug(adev); 3338 3339 /* Initialize the buffer migration limit. */ 3340 if (amdgpu_moverate >= 0) 3341 max_MBps = amdgpu_moverate; 3342 else 3343 max_MBps = 8; /* Allow 8 MB/s. */ 3344 /* Get a log2 for easy divisions. 
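	 * e.g. the default of 8 MB/s gives log2_max_MBps = 3, so later
	 * bandwidth math can shift instead of divide.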
 */
3345 	adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3346 
3347 	amdgpu_fbdev_init(adev);
3348 
3349 	r = amdgpu_pm_sysfs_init(adev);
3350 	if (r) {
3351 		adev->pm_sysfs_en = false;
3352 		DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3353 	} else
3354 		adev->pm_sysfs_en = true;
3355 
3356 	r = amdgpu_ucode_sysfs_init(adev);
3357 	if (r) {
3358 		adev->ucode_sysfs_en = false;
3359 		DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3360 	} else
3361 		adev->ucode_sysfs_en = true;
3362 
3363 	if ((amdgpu_testing & 1)) {
3364 		if (adev->accel_working)
3365 			amdgpu_test_moves(adev);
3366 		else
3367 			DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3368 	}
3369 	if (amdgpu_benchmarking) {
3370 		if (adev->accel_working)
3371 			amdgpu_benchmark(adev, amdgpu_benchmarking);
3372 		else
3373 			DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3374 	}
3375 
3376 	/*
3377 	 * Register the gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3378 	 * Otherwise the mgpu fan boost feature would be skipped because the
3379 	 * gpu instance count would be too low.
3380 	 */
3381 	amdgpu_register_gpu_instance(adev);
3382 
3383 	/* enable clockgating, etc. after ib tests, etc. since some blocks require
3384 	 * explicit gating rather than handling it automatically.
3385 	 */
3386 	r = amdgpu_device_ip_late_init(adev);
3387 	if (r) {
3388 		dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3389 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3390 		goto failed;
3391 	}
3392 
3393 	/* must succeed. */
3394 	amdgpu_ras_resume(adev);
3395 
3396 	queue_delayed_work(system_wq, &adev->delayed_init_work,
3397 			   msecs_to_jiffies(AMDGPU_RESUME_MS));
3398 
3399 	if (amdgpu_sriov_vf(adev))
3400 		flush_delayed_work(&adev->delayed_init_work);
3401 
3402 	r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3403 	if (r)
3404 		dev_err(adev->dev, "Could not create amdgpu device attr\n");
3405 
3406 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
3407 		r = amdgpu_pmu_init(adev);
3408 	if (r)
3409 		dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3410 
3411 	/* keep the stored PCI config space at hand for restore after a sudden PCI error */
3412 	if (amdgpu_device_cache_pci_state(adev->pdev))
3413 		pci_restore_state(pdev);
3414 
3415 	return 0;
3416 
3417 failed:
3418 	amdgpu_vf_error_trans_all(adev);
3419 	if (boco)
3420 		vga_switcheroo_fini_domain_pm_ops(adev->dev);
3421 
3422 failed_unmap:
3423 	iounmap(adev->rmmio);
3424 	adev->rmmio = NULL;
3425 
3426 	return r;
3427 }
3428 
3429 /**
3430  * amdgpu_device_fini - tear down the driver
3431  *
3432  * @adev: amdgpu_device pointer
3433  *
3434  * Tear down the driver info (all asics).
3435  * Called at driver shutdown.
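 *
 * This is the inverse of amdgpu_device_init(): it tears down the IP
 * blocks, removes the sysfs attributes and switcheroo registration, and
 * unmaps the MMIO and doorbell BARs set up at init time.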
3436  */
3437 void amdgpu_device_fini(struct amdgpu_device *adev)
3438 {
3439 	dev_info(adev->dev, "amdgpu: finishing device.\n");
3440 	flush_delayed_work(&adev->delayed_init_work);
3441 	adev->shutdown = true;
3442 
3443 	kfree(adev->pci_state);
3444 
3445 	/* make sure IB tests have finished before entering exclusive mode
3446 	 * to avoid preemption on IB tests
3447 	 */
3448 	if (amdgpu_sriov_vf(adev))
3449 		amdgpu_virt_request_full_gpu(adev, false);
3450 
3451 	/* disable all interrupts */
3452 	amdgpu_irq_disable_all(adev);
3453 	if (adev->mode_info.mode_config_initialized) {
3454 		if (!amdgpu_device_has_dc_support(adev))
3455 			drm_helper_force_disable_all(adev_to_drm(adev));
3456 		else
3457 			drm_atomic_helper_shutdown(adev_to_drm(adev));
3458 	}
3459 	amdgpu_fence_driver_fini(adev);
3460 	if (adev->pm_sysfs_en)
3461 		amdgpu_pm_sysfs_fini(adev);
3462 	amdgpu_fbdev_fini(adev);
3463 	amdgpu_device_ip_fini(adev);
3464 	release_firmware(adev->firmware.gpu_info_fw);
3465 	adev->firmware.gpu_info_fw = NULL;
3466 	adev->accel_working = false;
3467 	/* free i2c buses */
3468 	if (!amdgpu_device_has_dc_support(adev))
3469 		amdgpu_i2c_fini(adev);
3470 
3471 	if (amdgpu_emu_mode != 1)
3472 		amdgpu_atombios_fini(adev);
3473 
3474 	kfree(adev->bios);
3475 	adev->bios = NULL;
3476 	if (amdgpu_has_atpx() &&
3477 	    (amdgpu_is_atpx_hybrid() ||
3478 	     amdgpu_has_atpx_dgpu_power_cntl()) &&
3479 	    !pci_is_thunderbolt_attached(adev->pdev))
3480 		vga_switcheroo_unregister_client(adev->pdev);
3481 	if (amdgpu_device_supports_boco(adev_to_drm(adev)))
3482 		vga_switcheroo_fini_domain_pm_ops(adev->dev);
3483 	vga_client_register(adev->pdev, NULL, NULL, NULL);
3484 	if (adev->rio_mem)
3485 		pci_iounmap(adev->pdev, adev->rio_mem);
3486 	adev->rio_mem = NULL;
3487 	iounmap(adev->rmmio);
3488 	adev->rmmio = NULL;
3489 	amdgpu_device_doorbell_fini(adev);
3490 
3491 	if (adev->ucode_sysfs_en)
3492 		amdgpu_ucode_sysfs_fini(adev);
3493 
3494 	sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3495 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
3496 		amdgpu_pmu_fini(adev);
3497 	if (adev->mman.discovery_bin)
3498 		amdgpu_discovery_fini(adev);
3499 }
3500 
3501 
3502 /*
3503  * Suspend & resume.
3504  */
3505 /**
3506  * amdgpu_device_suspend - initiate device suspend
3507  *
3508  * @dev: drm dev pointer
3509  * @fbcon: notify the fbdev of suspend
3510  *
3511  * Puts the hw in the suspend state (all asics).
3512  * Returns 0 for success or an error on failure.
3513  * Called at driver suspend.
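 *
 * The sequence below: disable display hw and unpin framebuffers and
 * cursors, suspend the display IPs (phase 1), evict VRAM, suspend the
 * fence driver, suspend the remaining IPs (phase 2), then evict VRAM
 * again to move the GART page table out with the CPU.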
3514  */
3515 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
3516 {
3517 	struct amdgpu_device *adev;
3518 	struct drm_crtc *crtc;
3519 	struct drm_connector *connector;
3520 	struct drm_connector_list_iter iter;
3521 	int r;
3522 
3523 	adev = drm_to_adev(dev);
3524 
3525 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3526 		return 0;
3527 
3528 	adev->in_suspend = true;
3529 	drm_kms_helper_poll_disable(dev);
3530 
3531 	if (fbcon)
3532 		amdgpu_fbdev_set_suspend(adev, 1);
3533 
3534 	cancel_delayed_work_sync(&adev->delayed_init_work);
3535 
3536 	if (!amdgpu_device_has_dc_support(adev)) {
3537 		/* turn off display hw */
3538 		drm_modeset_lock_all(dev);
3539 		drm_connector_list_iter_begin(dev, &iter);
3540 		drm_for_each_connector_iter(connector, &iter)
3541 			drm_helper_connector_dpms(connector,
3542 						  DRM_MODE_DPMS_OFF);
3543 		drm_connector_list_iter_end(&iter);
3544 		drm_modeset_unlock_all(dev);
3545 		/* unpin the front buffers and cursors */
3546 		list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3547 			struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3548 			struct drm_framebuffer *fb = crtc->primary->fb;
3549 			struct amdgpu_bo *robj;
3550 
3551 			if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3552 				struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3553 				r = amdgpu_bo_reserve(aobj, true);
3554 				if (r == 0) {
3555 					amdgpu_bo_unpin(aobj);
3556 					amdgpu_bo_unreserve(aobj);
3557 				}
3558 			}
3559 
3560 			if (fb == NULL || fb->obj[0] == NULL) {
3561 				continue;
3562 			}
3563 			robj = gem_to_amdgpu_bo(fb->obj[0]);
3564 			/* don't unpin kernel fb objects */
3565 			if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
3566 				r = amdgpu_bo_reserve(robj, true);
3567 				if (r == 0) {
3568 					amdgpu_bo_unpin(robj);
3569 					amdgpu_bo_unreserve(robj);
3570 				}
3571 			}
3572 		}
3573 	}
3574 
3575 	amdgpu_ras_suspend(adev);
3576 
3577 	r = amdgpu_device_ip_suspend_phase1(adev);
3578 
3579 	amdgpu_amdkfd_suspend(adev, !fbcon);
3580 
3581 	/* evict vram memory */
3582 	amdgpu_bo_evict_vram(adev);
3583 
3584 	amdgpu_fence_driver_suspend(adev);
3585 
3586 	r = amdgpu_device_ip_suspend_phase2(adev);
3587 
3588 	/* evict remaining vram memory.
3589 	 * This second call to evict vram is to evict the gart page table
3590 	 * using the CPU.
3591 	 */
3592 	amdgpu_bo_evict_vram(adev);
3593 
3594 	return 0;
3595 }
3596 
3597 /**
3598  * amdgpu_device_resume - initiate device resume
3599  *
3600  * @dev: drm dev pointer
3601  * @fbcon: notify the fbdev of resume
3602  *
3603  * Bring the hw back to operating state (all asics).
3604  * Returns 0 for success or an error on failure.
3605  * Called at driver resume.
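 *
 * Roughly the reverse of amdgpu_device_suspend(): post the card if
 * needed, resume the IP blocks, re-pin the cursors, restore the mode
 * and re-enable connector polling and hotplug handling.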
3606 */ 3607 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 3608 { 3609 struct drm_connector *connector; 3610 struct drm_connector_list_iter iter; 3611 struct amdgpu_device *adev = drm_to_adev(dev); 3612 struct drm_crtc *crtc; 3613 int r = 0; 3614 3615 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3616 return 0; 3617 3618 /* post card */ 3619 if (amdgpu_device_need_post(adev)) { 3620 r = amdgpu_device_asic_init(adev); 3621 if (r) 3622 dev_err(adev->dev, "amdgpu asic init failed\n"); 3623 } 3624 3625 r = amdgpu_device_ip_resume(adev); 3626 if (r) { 3627 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 3628 return r; 3629 } 3630 amdgpu_fence_driver_resume(adev); 3631 3632 3633 r = amdgpu_device_ip_late_init(adev); 3634 if (r) 3635 return r; 3636 3637 queue_delayed_work(system_wq, &adev->delayed_init_work, 3638 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3639 3640 if (!amdgpu_device_has_dc_support(adev)) { 3641 /* pin cursors */ 3642 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3643 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3644 3645 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3646 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3647 r = amdgpu_bo_reserve(aobj, true); 3648 if (r == 0) { 3649 r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM); 3650 if (r != 0) 3651 dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r); 3652 amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj); 3653 amdgpu_bo_unreserve(aobj); 3654 } 3655 } 3656 } 3657 } 3658 r = amdgpu_amdkfd_resume(adev, !fbcon); 3659 if (r) 3660 return r; 3661 3662 /* Make sure IB tests flushed */ 3663 flush_delayed_work(&adev->delayed_init_work); 3664 3665 /* blat the mode back in */ 3666 if (fbcon) { 3667 if (!amdgpu_device_has_dc_support(adev)) { 3668 /* pre DCE11 */ 3669 drm_helper_resume_force_mode(dev); 3670 3671 /* turn on display hw */ 3672 drm_modeset_lock_all(dev); 3673 3674 drm_connector_list_iter_begin(dev, &iter); 3675 drm_for_each_connector_iter(connector, &iter) 3676 drm_helper_connector_dpms(connector, 3677 DRM_MODE_DPMS_ON); 3678 drm_connector_list_iter_end(&iter); 3679 3680 drm_modeset_unlock_all(dev); 3681 } 3682 amdgpu_fbdev_set_suspend(adev, 0); 3683 } 3684 3685 drm_kms_helper_poll_enable(dev); 3686 3687 amdgpu_ras_resume(adev); 3688 3689 /* 3690 * Most of the connector probing functions try to acquire runtime pm 3691 * refs to ensure that the GPU is powered on when connector polling is 3692 * performed. Since we're calling this from a runtime PM callback, 3693 * trying to acquire rpm refs will cause us to deadlock. 3694 * 3695 * Since we're guaranteed to be holding the rpm lock, it's safe to 3696 * temporarily disable the rpm helpers so this doesn't deadlock us. 3697 */ 3698 #ifdef CONFIG_PM 3699 dev->dev->power.disable_depth++; 3700 #endif 3701 if (!amdgpu_device_has_dc_support(adev)) 3702 drm_helper_hpd_irq_event(dev); 3703 else 3704 drm_kms_helper_hotplug_event(dev); 3705 #ifdef CONFIG_PM 3706 dev->dev->power.disable_depth--; 3707 #endif 3708 adev->in_suspend = false; 3709 3710 return 0; 3711 } 3712 3713 /** 3714 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 3715 * 3716 * @adev: amdgpu_device pointer 3717 * 3718 * The list of all the hardware IPs that make up the asic is walked and 3719 * the check_soft_reset callbacks are run. check_soft_reset determines 3720 * if the asic is still hung or not. 3721 * Returns true if any of the IPs are still in a hung state, false if not. 
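 *
 * Note that SR-IOV VFs and ASICs that need a full reset always report
 * a hang here, as the early returns below show.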
3722 */ 3723 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 3724 { 3725 int i; 3726 bool asic_hang = false; 3727 3728 if (amdgpu_sriov_vf(adev)) 3729 return true; 3730 3731 if (amdgpu_asic_need_full_reset(adev)) 3732 return true; 3733 3734 for (i = 0; i < adev->num_ip_blocks; i++) { 3735 if (!adev->ip_blocks[i].status.valid) 3736 continue; 3737 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 3738 adev->ip_blocks[i].status.hang = 3739 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 3740 if (adev->ip_blocks[i].status.hang) { 3741 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 3742 asic_hang = true; 3743 } 3744 } 3745 return asic_hang; 3746 } 3747 3748 /** 3749 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 3750 * 3751 * @adev: amdgpu_device pointer 3752 * 3753 * The list of all the hardware IPs that make up the asic is walked and the 3754 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 3755 * handles any IP specific hardware or software state changes that are 3756 * necessary for a soft reset to succeed. 3757 * Returns 0 on success, negative error code on failure. 3758 */ 3759 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 3760 { 3761 int i, r = 0; 3762 3763 for (i = 0; i < adev->num_ip_blocks; i++) { 3764 if (!adev->ip_blocks[i].status.valid) 3765 continue; 3766 if (adev->ip_blocks[i].status.hang && 3767 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 3768 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 3769 if (r) 3770 return r; 3771 } 3772 } 3773 3774 return 0; 3775 } 3776 3777 /** 3778 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 3779 * 3780 * @adev: amdgpu_device pointer 3781 * 3782 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 3783 * reset is necessary to recover. 3784 * Returns true if a full asic reset is required, false if not. 3785 */ 3786 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 3787 { 3788 int i; 3789 3790 if (amdgpu_asic_need_full_reset(adev)) 3791 return true; 3792 3793 for (i = 0; i < adev->num_ip_blocks; i++) { 3794 if (!adev->ip_blocks[i].status.valid) 3795 continue; 3796 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 3797 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 3798 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 3799 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 3800 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3801 if (adev->ip_blocks[i].status.hang) { 3802 dev_info(adev->dev, "Some block need full reset!\n"); 3803 return true; 3804 } 3805 } 3806 } 3807 return false; 3808 } 3809 3810 /** 3811 * amdgpu_device_ip_soft_reset - do a soft reset 3812 * 3813 * @adev: amdgpu_device pointer 3814 * 3815 * The list of all the hardware IPs that make up the asic is walked and the 3816 * soft_reset callbacks are run if the block is hung. soft_reset handles any 3817 * IP specific hardware or software state changes that are necessary to soft 3818 * reset the IP. 3819 * Returns 0 on success, negative error code on failure. 
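 *
 * Only blocks that amdgpu_device_ip_check_soft_reset() flagged as hung
 * (status.hang) and that provide a soft_reset callback are touched here.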
/**
 * amdgpu_device_ip_soft_reset - do a soft reset
 *
 * @adev: amdgpu_device pointer
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * soft_reset callbacks are run if the block is hung. soft_reset handles any
 * IP specific hardware or software state changes that are necessary to soft
 * reset the IP.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
{
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].status.hang &&
		    adev->ip_blocks[i].version->funcs->soft_reset) {
			r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
			if (r)
				return r;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
 *
 * @adev: amdgpu_device pointer
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
 * handles any IP specific hardware or software state changes that are
 * necessary after the IP has been soft reset.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
{
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].status.hang &&
		    adev->ip_blocks[i].version->funcs->post_soft_reset)
			r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
		if (r)
			return r;
	}

	return 0;
}

/**
 * amdgpu_device_recover_vram - Recover some VRAM contents
 *
 * @adev: amdgpu_device pointer
 *
 * Restores the contents of VRAM buffers from the shadows in GTT. Used to
 * restore things like GPUVM page tables after a GPU reset where
 * the contents of VRAM might be lost.
 *
 * Returns:
 * 0 on success, negative error code on failure.
 */
static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
{
	struct dma_fence *fence = NULL, *next = NULL;
	struct amdgpu_bo *shadow;
	long r = 1, tmo;

	if (amdgpu_sriov_runtime(adev))
		tmo = msecs_to_jiffies(8000);
	else
		tmo = msecs_to_jiffies(100);

	dev_info(adev->dev, "recover vram bo from shadow start\n");
	mutex_lock(&adev->shadow_list_lock);
	list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {

		/* No need to recover an evicted BO */
		if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
		    shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
		    shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
			continue;

		r = amdgpu_bo_restore_shadow(shadow, &next);
		if (r)
			break;

		if (fence) {
			tmo = dma_fence_wait_timeout(fence, false, tmo);
			dma_fence_put(fence);
			fence = next;
			if (tmo == 0) {
				r = -ETIMEDOUT;
				break;
			} else if (tmo < 0) {
				r = tmo;
				break;
			}
		} else {
			fence = next;
		}
	}
	mutex_unlock(&adev->shadow_list_lock);

	if (fence)
		tmo = dma_fence_wait_timeout(fence, false, tmo);
	dma_fence_put(fence);

	if (r < 0 || tmo <= 0) {
		dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
		return -EIO;
	}

	dev_info(adev->dev, "recover vram bo from shadow done\n");
	return 0;
}

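/*
 * SR-IOV reset path: on a virtual function the host performs the actual
 * ASIC reset. The VF only requests the FLR, re-initializes its IP blocks
 * in two phases around firmware loading, and recovers VRAM from the GTT
 * shadows if the host reports that VRAM was lost during the FLR.
 */
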
/**
 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
 *
 * @adev: amdgpu device pointer
 * @from_hypervisor: request from hypervisor
 *
 * Do VF FLR and reinitialize the ASIC.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
				     bool from_hypervisor)
{
	int r;

	if (from_hypervisor)
		r = amdgpu_virt_request_full_gpu(adev, true);
	else
		r = amdgpu_virt_reset_gpu(adev);
	if (r)
		return r;

	amdgpu_amdkfd_pre_reset(adev);

	/* Resume IP prior to SMC */
	r = amdgpu_device_ip_reinit_early_sriov(adev);
	if (r)
		goto error;

	amdgpu_virt_init_data_exchange(adev);
	/* we need to recover the GART prior to running SMC/CP/SDMA resume */
	amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));

	r = amdgpu_device_fw_loading(adev);
	if (r)
		return r;

	/* now we are okay to resume SMC/CP/SDMA */
	r = amdgpu_device_ip_reinit_late_sriov(adev);
	if (r)
		goto error;

	amdgpu_irq_gpu_reset_resume_helper(adev);
	r = amdgpu_ib_ring_tests(adev);
	amdgpu_amdkfd_post_reset(adev);

error:
	amdgpu_virt_release_full_gpu(adev, true);
	if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
		amdgpu_inc_vram_lost(adev);
		r = amdgpu_device_recover_vram(adev);
	}

	return r;
}

/**
 * amdgpu_device_has_job_running - check if there is any job in mirror list
 *
 * @adev: amdgpu device pointer
 *
 * Check if there is any job in the mirror list.
 */
bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
{
	int i;
	struct drm_sched_job *job;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		spin_lock(&ring->sched.job_list_lock);
		job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
					       struct drm_sched_job, node);
		spin_unlock(&ring->sched.job_list_lock);
		if (job)
			return true;
	}
	return false;
}

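/*
 * Note: a scheduler's mirror list holds the jobs that have been pushed to
 * the hardware ring but whose fences have not signaled yet, so a non-empty
 * list on any ring means the GPU still has work in flight.
 */
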
/**
 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
 *
 * @adev: amdgpu device pointer
 *
 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
 * a hung GPU.
 */
bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
{
	if (!amdgpu_device_ip_check_soft_reset(adev)) {
		dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
		return false;
	}

	if (amdgpu_gpu_recovery == 0)
		goto disabled;

	if (amdgpu_sriov_vf(adev))
		return true;

	if (amdgpu_gpu_recovery == -1) {
		switch (adev->asic_type) {
		case CHIP_BONAIRE:
		case CHIP_HAWAII:
		case CHIP_TOPAZ:
		case CHIP_TONGA:
		case CHIP_FIJI:
		case CHIP_POLARIS10:
		case CHIP_POLARIS11:
		case CHIP_POLARIS12:
		case CHIP_VEGAM:
		case CHIP_VEGA20:
		case CHIP_VEGA10:
		case CHIP_VEGA12:
		case CHIP_RAVEN:
		case CHIP_ARCTURUS:
		case CHIP_RENOIR:
		case CHIP_NAVI10:
		case CHIP_NAVI14:
		case CHIP_NAVI12:
		case CHIP_SIENNA_CICHLID:
			break;
		default:
			goto disabled;
		}
	}

	return true;

disabled:
	dev_info(adev->dev, "GPU recovery disabled.\n");
	return false;
}

static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
					struct amdgpu_job *job,
					bool *need_full_reset_arg)
{
	int i, r = 0;
	bool need_full_reset = *need_full_reset_arg;

	amdgpu_debugfs_wait_dump(adev);

	/* block all schedulers and reset given job's ring */
	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
		amdgpu_fence_driver_force_completion(ring);
	}

	if (job)
		drm_sched_increase_karma(&job->base);

	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
	if (!amdgpu_sriov_vf(adev)) {

		if (!need_full_reset)
			need_full_reset = amdgpu_device_ip_need_full_reset(adev);

		if (!need_full_reset) {
			amdgpu_device_ip_pre_soft_reset(adev);
			r = amdgpu_device_ip_soft_reset(adev);
			amdgpu_device_ip_post_soft_reset(adev);
			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
				dev_info(adev->dev, "soft reset failed, will fall back to full reset!\n");
				need_full_reset = true;
			}
		}

		if (need_full_reset)
			r = amdgpu_device_ip_suspend(adev);

		*need_full_reset_arg = need_full_reset;
	}

	return r;
}

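/*
 * amdgpu_do_asic_reset() performs the actual hardware reset and re-init
 * for a single device or for every node of an XGMI hive. For hives the
 * resets are kicked off in parallel (the firmware expects all nodes to
 * reset within about a second of each other for the links to renegotiate)
 * and each node is then resumed in two phases around firmware loading.
 */
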
static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
				struct list_head *device_list_handle,
				bool *need_full_reset_arg,
				bool skip_hw_reset)
{
	struct amdgpu_device *tmp_adev = NULL;
	bool need_full_reset = *need_full_reset_arg, vram_lost = false;
	int r = 0;

	/*
	 * ASIC reset has to be done on all XGMI hive nodes ASAP
	 * to allow proper links negotiation in FW (within 1 sec)
	 */
	if (!skip_hw_reset && need_full_reset) {
		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
			/* For XGMI run all resets in parallel to speed up the process */
			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
					r = -EALREADY;
			} else
				r = amdgpu_asic_reset(tmp_adev);

			if (r) {
				dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
					r, adev_to_drm(tmp_adev)->unique);
				break;
			}
		}

		/* For XGMI wait for all resets to complete before proceeding */
		if (!r) {
			list_for_each_entry(tmp_adev, device_list_handle,
					    gmc.xgmi.head) {
				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
					flush_work(&tmp_adev->xgmi_reset_work);
					r = tmp_adev->asic_reset_res;
					if (r)
						break;
				}
			}
		}
	}

	if (!r && amdgpu_ras_intr_triggered()) {
		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
			if (tmp_adev->mmhub.funcs &&
			    tmp_adev->mmhub.funcs->reset_ras_error_count)
				tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
		}

		amdgpu_ras_intr_cleared();
	}

	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		if (need_full_reset) {
			/* post card */
			if (amdgpu_device_asic_init(tmp_adev))
				dev_warn(tmp_adev->dev, "asic atom init failed!");

			if (!r) {
				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
				r = amdgpu_device_ip_resume_phase1(tmp_adev);
				if (r)
					goto out;

				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
				if (vram_lost) {
					DRM_INFO("VRAM is lost due to GPU reset!\n");
					amdgpu_inc_vram_lost(tmp_adev);
				}

				r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
				if (r)
					goto out;

				r = amdgpu_device_fw_loading(tmp_adev);
				if (r)
					return r;

				r = amdgpu_device_ip_resume_phase2(tmp_adev);
				if (r)
					goto out;

				if (vram_lost)
					amdgpu_device_fill_reset_magic(tmp_adev);

				/*
				 * Add this ASIC back as tracked since the reset
				 * has already completed successfully.
				 */
				amdgpu_register_gpu_instance(tmp_adev);

				r = amdgpu_device_ip_late_init(tmp_adev);
				if (r)
					goto out;

				amdgpu_fbdev_set_suspend(tmp_adev, 0);

				/*
				 * The GPU enters a bad state once the number of
				 * faulty pages retired by ECC reaches the
				 * threshold, and RAS recovery is scheduled next.
				 * Check here to break recovery if the bad page
				 * threshold has indeed been exceeded, and remind
				 * the user to retire this GPU or set a bigger
				 * bad_page_threshold value when probing the
				 * driver again.
				 */
				if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
					/* must succeed. */
					amdgpu_ras_resume(tmp_adev);
				} else {
					r = -EINVAL;
					goto out;
				}

				/* Update PSP FW topology after reset */
				if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
					r = amdgpu_xgmi_update_topology(hive, tmp_adev);
			}
		}

out:
		if (!r) {
			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
			r = amdgpu_ib_ring_tests(tmp_adev);
			if (r) {
				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
				r = amdgpu_device_ip_suspend(tmp_adev);
				need_full_reset = true;
				r = -EAGAIN;
				goto end;
			}
		}

		if (!r)
			r = amdgpu_device_recover_vram(tmp_adev);
		else
			tmp_adev->asic_reset_res = r;
	}

end:
	*need_full_reset_arg = need_full_reset;
	return r;
}

static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
				    struct amdgpu_hive_info *hive)
{
	if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
		return false;

	if (hive) {
		down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
	} else {
		down_write(&adev->reset_sem);
	}

	atomic_inc(&adev->gpu_reset_counter);
	switch (amdgpu_asic_reset_method(adev)) {
	case AMD_RESET_METHOD_MODE1:
		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
		break;
	case AMD_RESET_METHOD_MODE2:
		adev->mp1_state = PP_MP1_STATE_RESET;
		break;
	default:
		adev->mp1_state = PP_MP1_STATE_NONE;
		break;
	}

	return true;
}

static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
{
	amdgpu_vf_error_trans_all(adev);
	adev->mp1_state = PP_MP1_STATE_NONE;
	atomic_set(&adev->in_gpu_reset, 0);
	up_write(&adev->reset_sem);
}

static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
{
	struct pci_dev *p = NULL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
			adev->pdev->bus->number, 1);
	if (p) {
		pm_runtime_enable(&(p->dev));
		pm_runtime_resume(&(p->dev));
	}
}

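/*
 * Counterpart of amdgpu_device_resume_display_audio() above: put the HDMI/DP
 * audio function (function 1 of the GPU's PCI device) into runtime suspend
 * before the GPU reset starts, so the reset cannot yank the hardware out
 * from under the audio driver.
 */
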
static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
{
	enum amd_reset_method reset_method;
	struct pci_dev *p = NULL;
	u64 expires;

	/*
	 * For now, only BACO and mode1 reset are confirmed
	 * to suffer from the audio issue if the audio device
	 * is not properly suspended.
	 */
	reset_method = amdgpu_asic_reset_method(adev);
	if ((reset_method != AMD_RESET_METHOD_BACO) &&
	    (reset_method != AMD_RESET_METHOD_MODE1))
		return -EINVAL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
			adev->pdev->bus->number, 1);
	if (!p)
		return -ENODEV;

	expires = pm_runtime_autosuspend_expiration(&(p->dev));
	if (!expires)
		/*
		 * If we cannot get the audio device autosuspend delay,
		 * a fixed 4S interval will be used. Considering 3S is
		 * the audio controller's default autosuspend delay setting,
		 * the 4S used here is guaranteed to cover that.
		 */
		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;

	while (!pm_runtime_status_suspended(&(p->dev))) {
		if (!pm_runtime_suspend(&(p->dev)))
			break;

		if (expires < ktime_get_mono_fast_ns()) {
			dev_warn(adev->dev, "failed to suspend display audio\n");
			/* TODO: abort the succeeding gpu reset? */
			return -ETIMEDOUT;
		}
	}

	pm_runtime_disable(&(p->dev));

	return 0;
}

/**
 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
 *
 * @adev: amdgpu device pointer
 * @job: which job triggered the hang
 *
 * Attempt to reset the GPU if it has hung (all asics).
 * Attempt to do soft-reset or full-reset and reinitialize the ASIC.
 * Returns 0 for success or an error on failure.
 */

int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
			      struct amdgpu_job *job)
{
	struct list_head device_list, *device_list_handle = NULL;
	bool need_full_reset = false;
	bool job_signaled = false;
	struct amdgpu_hive_info *hive = NULL;
	struct amdgpu_device *tmp_adev = NULL;
	int i, r = 0;
	bool need_emergency_restart = false;
	bool audio_suspended = false;

	/*
	 * Special case: RAS triggered and full reset isn't supported
	 */
	need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);

	/*
	 * Flush RAM to disk so that after reboot
	 * the user can read the log and see why the system rebooted.
	 */
	if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
		DRM_WARN("Emergency reboot.");

		ksys_sync_helper();
		emergency_restart();
	}

	dev_info(adev->dev, "GPU %s begin!\n",
		need_emergency_restart ? "jobs stop" : "reset");

	/*
	 * Here we trylock to avoid a chain of resets executing, triggered
	 * either by jobs on different adevs in an XGMI hive or by jobs on
	 * different schedulers for the same device, while this TO handler
	 * is running. We always reset all schedulers for a device and all
	 * devices in an XGMI hive, so that should take care of them too.
	 */
	hive = amdgpu_get_xgmi_hive(adev);
	if (hive) {
		if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
			DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
				job ? job->base.id : -1, hive->hive_id);
			amdgpu_put_xgmi_hive(hive);
			return 0;
		}
		mutex_lock(&hive->hive_lock);
	}

	/*
	 * Build list of devices to reset.
	 * In case we are in XGMI hive mode, resort the device list
	 * to put adev in the 1st position.
	 */
	INIT_LIST_HEAD(&device_list);
	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (!hive)
			return -ENODEV;
		if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
			list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
		device_list_handle = &hive->device_list;
	} else {
		list_add_tail(&adev->gmc.xgmi.head, &device_list);
		device_list_handle = &device_list;
	}

	/* block all schedulers and reset given job's ring */
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
			dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
				  job ? job->base.id : -1);
			r = 0;
			goto skip_recovery;
		}

		/*
		 * Try to put the audio codec into suspend state
		 * before gpu reset started.
		 *
		 * Since the power domain of the graphics device
		 * is shared with the AZ power domain, without this
		 * we may change the audio hardware from behind
		 * the audio driver's back and trigger audio codec
		 * errors.
		 */
		if (!amdgpu_device_suspend_display_audio(tmp_adev))
			audio_suspended = true;

		amdgpu_ras_set_error_query_ready(tmp_adev, false);

		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);

		if (!amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_pre_reset(tmp_adev);

		/*
		 * Mark these ASICs to be reset as untracked first,
		 * and add them back after the reset has completed.
		 */
		amdgpu_unregister_gpu_instance(tmp_adev);

		amdgpu_fbdev_set_suspend(tmp_adev, 1);

		/* disable ras on ALL IPs */
		if (!need_emergency_restart &&
		      amdgpu_device_ip_need_full_reset(tmp_adev))
			amdgpu_ras_suspend(tmp_adev);

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, job ? &job->base : NULL);

			if (need_emergency_restart)
				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
		}
	}

	if (need_emergency_restart)
		goto skip_sched_resume;

	/*
	 * Must check guilty signal here since after this point all old
	 * HW fences are force signaled.
	 *
	 * job->base holds a reference to parent fence
	 */
	if (job && job->base.s_fence->parent &&
	    dma_fence_is_signaled(job->base.s_fence->parent)) {
		job_signaled = true;
		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
		goto skip_hw_reset;
	}

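	/*
	 * If amdgpu_do_asic_reset() below fails its post-reset IB tests it
	 * returns -EAGAIN with need_full_reset forced on, and we jump back
	 * to the retry label to run the pre-reset steps again.
	 */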
retry:	/* Rest of adevs pre asic reset from XGMI hive. */
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		r = amdgpu_device_pre_asic_reset(tmp_adev,
						 NULL,
						 &need_full_reset);
		/* TODO: Should we stop? */
		if (r) {
			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
				r, adev_to_drm(tmp_adev)->unique);
			tmp_adev->asic_reset_res = r;
		}
	}

	/* Actual ASIC resets if needed. */
	/* TODO: Implement XGMI hive reset logic for SRIOV */
	if (amdgpu_sriov_vf(adev)) {
		r = amdgpu_device_reset_sriov(adev, job ? false : true);
		if (r)
			adev->asic_reset_res = r;
	} else {
		r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
		if (r && r == -EAGAIN)
			goto retry;
	}

skip_hw_reset:

	/* Post ASIC reset for all devs. */
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			/* No point in resubmitting jobs if we didn't HW reset */
			if (!tmp_adev->asic_reset_res && !job_signaled)
				drm_sched_resubmit_jobs(&ring->sched);

			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
		}

		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
		}

		tmp_adev->asic_reset_res = 0;

		if (r) {
			/* bad news, how to tell it to userspace ? */
			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
		}
	}

skip_sched_resume:
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		/* unlock kfd: SRIOV would do it separately */
		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);
		if (audio_suspended)
			amdgpu_device_resume_display_audio(tmp_adev);
		amdgpu_device_unlock_adev(tmp_adev);
	}

skip_recovery:
	if (hive) {
		atomic_set(&hive->in_reset, 0);
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}

	if (r)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
	return r;
}

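/*
 * Note on the masks below: pcie_gen_mask carries both the ASIC's supported
 * link speeds (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_*) and the platform's
 * (CAIL_PCIE_LINK_SPEED_SUPPORT_*), while pcie_mlw_mask carries the
 * supported link widths. The amdgpu_pcie_gen_cap and amdgpu_pcie_lane_cap
 * module parameters override the probed values.
 */
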
/**
 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
 */
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
{
	struct pci_dev *pdev;
	enum pci_bus_speed speed_cap, platform_speed_cap;
	enum pcie_link_width platform_link_width;

	if (amdgpu_pcie_gen_cap)
		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;

	if (amdgpu_pcie_lane_cap)
		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;

	/* covers APUs as well */
	if (pci_is_root_bus(adev->pdev->bus)) {
		if (adev->pm.pcie_gen_mask == 0)
			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
		if (adev->pm.pcie_mlw_mask == 0)
			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
		return;
	}

	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
		return;

	pcie_bandwidth_available(adev->pdev, NULL,
				 &platform_speed_cap, &platform_link_width);

	if (adev->pm.pcie_gen_mask == 0) {
		/* asic caps */
		pdev = adev->pdev;
		speed_cap = pcie_get_speed_cap(pdev);
		if (speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
		} else {
			if (speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
		}
		/* platform caps */
		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
		} else {
			if (platform_speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
		}
	}
	if (adev->pm.pcie_mlw_mask == 0) {
		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
		} else {
			switch (platform_link_width) {
			case PCIE_LNK_X32:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X16:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X12:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X8:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X4:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X2:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X1:
				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
				break;
			default:
				break;
			}
		}
	}
}

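/*
 * BACO ("Bus Active, Chip Off") powers the chip down while keeping the PCI
 * link active. When RAS is supported, doorbell interrupts are disabled
 * across BACO entry and re-enabled again on exit.
 */
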
int amdgpu_device_baco_enter(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
		return -ENOTSUPP;

	if (ras && ras->supported)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

	return amdgpu_dpm_baco_enter(adev);
}

int amdgpu_device_baco_exit(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
		return -ENOTSUPP;

	ret = amdgpu_dpm_baco_exit(adev);
	if (ret)
		return ret;

	if (ras && ras->supported)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

	return 0;
}

static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
{
	int i;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		cancel_delayed_work_sync(&ring->sched.work_tdr);
	}
}

/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called when a PCI error is detected.
 *
 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		DRM_WARN("No support for XGMI hive yet...");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Cancel and wait for all TDRs in progress if failing to
		 * set adev->in_gpu_reset in amdgpu_device_lock_adev
		 *
		 * Locking adev->reset_sem will prevent any external access
		 * to GPU during PCI error recovery
		 */
		while (!amdgpu_device_lock_adev(adev, NULL))
			amdgpu_cancel_all_tdr(adev);

		/*
		 * Block any work scheduling as we do for regular GPU reset
		 * for the duration of the recovery
		 */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, NULL);
		}
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{
	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/*
	 * This is called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, no need to reset slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}

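/*
 * Note: during AER recovery the PCI core drives these callbacks in order:
 * error_detected, then either mmio_enabled (if the channel is still usable)
 * or slot_reset (after the slot has been reset), and finally resume.
 */
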
/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the pci error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	bool need_full_reset = true;
	u32 memsize;
	struct list_head device_list;

	DRM_INFO("PCI error: slot reset callback!!\n");

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->gmc.xgmi.head, &device_list);

	/* wait for asic to come out of reset */
	msleep(500);

	/* Restore PCI confspace */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	adev->in_pci_err_recovery = true;
	r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
	adev->in_pci_err_recovery = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d", r);
		amdgpu_device_unlock_adev(adev);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that
 * it's OK to resume normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		drm_sched_resubmit_jobs(&ring->sched);
		drm_sched_start(&ring->sched, true);
	}

	amdgpu_device_unlock_adev(adev);
}

bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}