1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 34 #include <drm/drm_atomic_helper.h> 35 #include <drm/drm_probe_helper.h> 36 #include <drm/amdgpu_drm.h> 37 #include <linux/vgaarb.h> 38 #include <linux/vga_switcheroo.h> 39 #include <linux/efi.h> 40 #include "amdgpu.h" 41 #include "amdgpu_trace.h" 42 #include "amdgpu_i2c.h" 43 #include "atom.h" 44 #include "amdgpu_atombios.h" 45 #include "amdgpu_atomfirmware.h" 46 #include "amd_pcie.h" 47 #ifdef CONFIG_DRM_AMDGPU_SI 48 #include "si.h" 49 #endif 50 #ifdef CONFIG_DRM_AMDGPU_CIK 51 #include "cik.h" 52 #endif 53 #include "vi.h" 54 #include "soc15.h" 55 #include "nv.h" 56 #include "bif/bif_4_1_d.h" 57 #include <linux/pci.h> 58 #include <linux/firmware.h> 59 #include "amdgpu_vf_error.h" 60 61 #include "amdgpu_amdkfd.h" 62 #include "amdgpu_pm.h" 63 64 #include "amdgpu_xgmi.h" 65 #include "amdgpu_ras.h" 66 #include "amdgpu_pmu.h" 67 #include "amdgpu_fru_eeprom.h" 68 69 #include <linux/suspend.h> 70 #include <drm/task_barrier.h> 71 #include <linux/pm_runtime.h> 72 73 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 74 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 75 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 76 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 77 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 78 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 79 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin"); 80 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin"); 81 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin"); 82 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 83 MODULE_FIRMWARE("amdgpu/sienna_cichlid_gpu_info.bin"); 84 MODULE_FIRMWARE("amdgpu/navy_flounder_gpu_info.bin"); 85 86 #define AMDGPU_RESUME_MS 2000 87 88 const char *amdgpu_asic_name[] = { 89 "TAHITI", 90 "PITCAIRN", 91 "VERDE", 92 "OLAND", 93 "HAINAN", 94 "BONAIRE", 95 "KAVERI", 96 "KABINI", 97 "HAWAII", 98 "MULLINS", 99 "TOPAZ", 100 "TONGA", 101 "FIJI", 102 "CARRIZO", 103 "STONEY", 104 "POLARIS10", 105 "POLARIS11", 106 "POLARIS12", 107 "VEGAM", 108 "VEGA10", 109 "VEGA12", 110 "VEGA20", 111 "RAVEN", 112 "ARCTURUS", 113 "RENOIR", 114 "NAVI10", 115 "NAVI14", 116 "NAVI12", 117 "SIENNA_CICHLID", 118 "NAVY_FLOUNDER", 
119 "LAST", 120 }; 121 122 /** 123 * DOC: pcie_replay_count 124 * 125 * The amdgpu driver provides a sysfs API for reporting the total number 126 * of PCIe replays (NAKs) 127 * The file pcie_replay_count is used for this and returns the total 128 * number of replays as a sum of the NAKs generated and NAKs received 129 */ 130 131 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 132 struct device_attribute *attr, char *buf) 133 { 134 struct drm_device *ddev = dev_get_drvdata(dev); 135 struct amdgpu_device *adev = drm_to_adev(ddev); 136 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 137 138 return snprintf(buf, PAGE_SIZE, "%llu\n", cnt); 139 } 140 141 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 142 amdgpu_device_get_pcie_replay_count, NULL); 143 144 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 145 146 /** 147 * DOC: product_name 148 * 149 * The amdgpu driver provides a sysfs API for reporting the product name 150 * for the device 151 * The file serial_number is used for this and returns the product name 152 * as returned from the FRU. 153 * NOTE: This is only available for certain server cards 154 */ 155 156 static ssize_t amdgpu_device_get_product_name(struct device *dev, 157 struct device_attribute *attr, char *buf) 158 { 159 struct drm_device *ddev = dev_get_drvdata(dev); 160 struct amdgpu_device *adev = drm_to_adev(ddev); 161 162 return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name); 163 } 164 165 static DEVICE_ATTR(product_name, S_IRUGO, 166 amdgpu_device_get_product_name, NULL); 167 168 /** 169 * DOC: product_number 170 * 171 * The amdgpu driver provides a sysfs API for reporting the part number 172 * for the device 173 * The file serial_number is used for this and returns the part number 174 * as returned from the FRU. 175 * NOTE: This is only available for certain server cards 176 */ 177 178 static ssize_t amdgpu_device_get_product_number(struct device *dev, 179 struct device_attribute *attr, char *buf) 180 { 181 struct drm_device *ddev = dev_get_drvdata(dev); 182 struct amdgpu_device *adev = drm_to_adev(ddev); 183 184 return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number); 185 } 186 187 static DEVICE_ATTR(product_number, S_IRUGO, 188 amdgpu_device_get_product_number, NULL); 189 190 /** 191 * DOC: serial_number 192 * 193 * The amdgpu driver provides a sysfs API for reporting the serial number 194 * for the device 195 * The file serial_number is used for this and returns the serial number 196 * as returned from the FRU. 197 * NOTE: This is only available for certain server cards 198 */ 199 200 static ssize_t amdgpu_device_get_serial_number(struct device *dev, 201 struct device_attribute *attr, char *buf) 202 { 203 struct drm_device *ddev = dev_get_drvdata(dev); 204 struct amdgpu_device *adev = drm_to_adev(ddev); 205 206 return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial); 207 } 208 209 static DEVICE_ATTR(serial_number, S_IRUGO, 210 amdgpu_device_get_serial_number, NULL); 211 212 /** 213 * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control 214 * 215 * @dev: drm_device pointer 216 * 217 * Returns true if the device is a dGPU with HG/PX power control, 218 * otherwise return false. 
219 */ 220 bool amdgpu_device_supports_boco(struct drm_device *dev) 221 { 222 struct amdgpu_device *adev = drm_to_adev(dev); 223 224 if (adev->flags & AMD_IS_PX) 225 return true; 226 return false; 227 } 228 229 /** 230 * amdgpu_device_supports_baco - Does the device support BACO 231 * 232 * @dev: drm_device pointer 233 * 234 * Returns true if the device supporte BACO, 235 * otherwise return false. 236 */ 237 bool amdgpu_device_supports_baco(struct drm_device *dev) 238 { 239 struct amdgpu_device *adev = drm_to_adev(dev); 240 241 return amdgpu_asic_supports_baco(adev); 242 } 243 244 /** 245 * VRAM access helper functions. 246 * 247 * amdgpu_device_vram_access - read/write a buffer in vram 248 * 249 * @adev: amdgpu_device pointer 250 * @pos: offset of the buffer in vram 251 * @buf: virtual address of the buffer in system memory 252 * @size: read/write size, sizeof(@buf) must > @size 253 * @write: true - write to vram, otherwise - read from vram 254 */ 255 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, 256 uint32_t *buf, size_t size, bool write) 257 { 258 unsigned long flags; 259 uint32_t hi = ~0; 260 uint64_t last; 261 262 263 #ifdef CONFIG_64BIT 264 last = min(pos + size, adev->gmc.visible_vram_size); 265 if (last > pos) { 266 void __iomem *addr = adev->mman.aper_base_kaddr + pos; 267 size_t count = last - pos; 268 269 if (write) { 270 memcpy_toio(addr, buf, count); 271 mb(); 272 amdgpu_asic_flush_hdp(adev, NULL); 273 } else { 274 amdgpu_asic_invalidate_hdp(adev, NULL); 275 mb(); 276 memcpy_fromio(buf, addr, count); 277 } 278 279 if (count == size) 280 return; 281 282 pos += count; 283 buf += count / 4; 284 size -= count; 285 } 286 #endif 287 288 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 289 for (last = pos + size; pos < last; pos += 4) { 290 uint32_t tmp = pos >> 31; 291 292 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); 293 if (tmp != hi) { 294 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp); 295 hi = tmp; 296 } 297 if (write) 298 WREG32_NO_KIQ(mmMM_DATA, *buf++); 299 else 300 *buf++ = RREG32_NO_KIQ(mmMM_DATA); 301 } 302 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 303 } 304 305 /* 306 * MMIO register access helper functions. 307 */ 308 /** 309 * amdgpu_mm_rreg - read a memory mapped IO register 310 * 311 * @adev: amdgpu_device pointer 312 * @reg: dword aligned register offset 313 * @acc_flags: access flags which require special behavior 314 * 315 * Returns the 32 bit value from the offset specified. 
316 */ 317 uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg, 318 uint32_t acc_flags) 319 { 320 uint32_t ret; 321 322 if (adev->in_pci_err_recovery) 323 return 0; 324 325 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) && 326 down_read_trylock(&adev->reset_sem)) { 327 ret = amdgpu_kiq_rreg(adev, reg); 328 up_read(&adev->reset_sem); 329 return ret; 330 } 331 332 if ((reg * 4) < adev->rmmio_size) 333 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 334 else { 335 unsigned long flags; 336 337 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 338 writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4)); 339 ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4)); 340 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 341 } 342 343 trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret); 344 return ret; 345 } 346 347 /* 348 * MMIO register read with bytes helper functions 349 * @offset:bytes offset from MMIO start 350 * 351 */ 352 353 /** 354 * amdgpu_mm_rreg8 - read a memory mapped IO register 355 * 356 * @adev: amdgpu_device pointer 357 * @offset: byte aligned register offset 358 * 359 * Returns the 8 bit value from the offset specified. 360 */ 361 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) 362 { 363 if (adev->in_pci_err_recovery) 364 return 0; 365 366 if (offset < adev->rmmio_size) 367 return (readb(adev->rmmio + offset)); 368 BUG(); 369 } 370 371 /* 372 * MMIO register write with bytes helper functions 373 * @offset:bytes offset from MMIO start 374 * @value: the value want to be written to the register 375 * 376 */ 377 /** 378 * amdgpu_mm_wreg8 - read a memory mapped IO register 379 * 380 * @adev: amdgpu_device pointer 381 * @offset: byte aligned register offset 382 * @value: 8 bit value to write 383 * 384 * Writes the value specified to the offset specified. 385 */ 386 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 387 { 388 if (adev->in_pci_err_recovery) 389 return; 390 391 if (offset < adev->rmmio_size) 392 writeb(value, adev->rmmio + offset); 393 else 394 BUG(); 395 } 396 397 static inline void amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, 398 uint32_t reg, uint32_t v, 399 uint32_t acc_flags) 400 { 401 if (adev->in_pci_err_recovery) 402 return; 403 404 trace_amdgpu_mm_wreg(adev->pdev->device, reg, v); 405 406 if ((reg * 4) < adev->rmmio_size) 407 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 408 else { 409 unsigned long flags; 410 411 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 412 writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4)); 413 writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4)); 414 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 415 } 416 } 417 418 /** 419 * amdgpu_mm_wreg - write to a memory mapped IO register 420 * 421 * @adev: amdgpu_device pointer 422 * @reg: dword aligned register offset 423 * @v: 32 bit value to write to the register 424 * @acc_flags: access flags which require special behavior 425 * 426 * Writes the value specified to the offset specified. 
427 */ 428 void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, 429 uint32_t acc_flags) 430 { 431 if (adev->in_pci_err_recovery) 432 return; 433 434 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) && 435 down_read_trylock(&adev->reset_sem)) { 436 amdgpu_kiq_wreg(adev, reg, v); 437 up_read(&adev->reset_sem); 438 return; 439 } 440 441 amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags); 442 } 443 444 /* 445 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range 446 * 447 * this function is invoked only the debugfs register access 448 * */ 449 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v, 450 uint32_t acc_flags) 451 { 452 if (adev->in_pci_err_recovery) 453 return; 454 455 if (amdgpu_sriov_fullaccess(adev) && 456 adev->gfx.rlc.funcs && 457 adev->gfx.rlc.funcs->is_rlcg_access_range) { 458 459 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 460 return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v); 461 } 462 463 amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags); 464 } 465 466 /** 467 * amdgpu_io_rreg - read an IO register 468 * 469 * @adev: amdgpu_device pointer 470 * @reg: dword aligned register offset 471 * 472 * Returns the 32 bit value from the offset specified. 473 */ 474 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg) 475 { 476 if (adev->in_pci_err_recovery) 477 return 0; 478 479 if ((reg * 4) < adev->rio_mem_size) 480 return ioread32(adev->rio_mem + (reg * 4)); 481 else { 482 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4)); 483 return ioread32(adev->rio_mem + (mmMM_DATA * 4)); 484 } 485 } 486 487 /** 488 * amdgpu_io_wreg - write to an IO register 489 * 490 * @adev: amdgpu_device pointer 491 * @reg: dword aligned register offset 492 * @v: 32 bit value to write to the register 493 * 494 * Writes the value specified to the offset specified. 495 */ 496 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v) 497 { 498 if (adev->in_pci_err_recovery) 499 return; 500 501 if ((reg * 4) < adev->rio_mem_size) 502 iowrite32(v, adev->rio_mem + (reg * 4)); 503 else { 504 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4)); 505 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4)); 506 } 507 } 508 509 /** 510 * amdgpu_mm_rdoorbell - read a doorbell dword 511 * 512 * @adev: amdgpu_device pointer 513 * @index: doorbell index 514 * 515 * Returns the value in the doorbell aperture at the 516 * requested doorbell index (CIK). 517 */ 518 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) 519 { 520 if (adev->in_pci_err_recovery) 521 return 0; 522 523 if (index < adev->doorbell.num_doorbells) { 524 return readl(adev->doorbell.ptr + index); 525 } else { 526 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 527 return 0; 528 } 529 } 530 531 /** 532 * amdgpu_mm_wdoorbell - write a doorbell dword 533 * 534 * @adev: amdgpu_device pointer 535 * @index: doorbell index 536 * @v: value to write 537 * 538 * Writes @v to the doorbell aperture at the 539 * requested doorbell index (CIK). 
540 */ 541 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) 542 { 543 if (adev->in_pci_err_recovery) 544 return; 545 546 if (index < adev->doorbell.num_doorbells) { 547 writel(v, adev->doorbell.ptr + index); 548 } else { 549 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 550 } 551 } 552 553 /** 554 * amdgpu_mm_rdoorbell64 - read a doorbell Qword 555 * 556 * @adev: amdgpu_device pointer 557 * @index: doorbell index 558 * 559 * Returns the value in the doorbell aperture at the 560 * requested doorbell index (VEGA10+). 561 */ 562 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) 563 { 564 if (adev->in_pci_err_recovery) 565 return 0; 566 567 if (index < adev->doorbell.num_doorbells) { 568 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); 569 } else { 570 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 571 return 0; 572 } 573 } 574 575 /** 576 * amdgpu_mm_wdoorbell64 - write a doorbell Qword 577 * 578 * @adev: amdgpu_device pointer 579 * @index: doorbell index 580 * @v: value to write 581 * 582 * Writes @v to the doorbell aperture at the 583 * requested doorbell index (VEGA10+). 584 */ 585 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) 586 { 587 if (adev->in_pci_err_recovery) 588 return; 589 590 if (index < adev->doorbell.num_doorbells) { 591 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); 592 } else { 593 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 594 } 595 } 596 597 /** 598 * amdgpu_invalid_rreg - dummy reg read function 599 * 600 * @adev: amdgpu device pointer 601 * @reg: offset of register 602 * 603 * Dummy register read function. Used for register blocks 604 * that certain asics don't have (all asics). 605 * Returns the value in the register. 606 */ 607 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 608 { 609 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 610 BUG(); 611 return 0; 612 } 613 614 /** 615 * amdgpu_invalid_wreg - dummy reg write function 616 * 617 * @adev: amdgpu device pointer 618 * @reg: offset of register 619 * @v: value to write to the register 620 * 621 * Dummy register read function. Used for register blocks 622 * that certain asics don't have (all asics). 623 */ 624 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 625 { 626 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 627 reg, v); 628 BUG(); 629 } 630 631 /** 632 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 633 * 634 * @adev: amdgpu device pointer 635 * @reg: offset of register 636 * 637 * Dummy register read function. Used for register blocks 638 * that certain asics don't have (all asics). 639 * Returns the value in the register. 640 */ 641 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 642 { 643 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 644 BUG(); 645 return 0; 646 } 647 648 /** 649 * amdgpu_invalid_wreg64 - dummy reg write function 650 * 651 * @adev: amdgpu device pointer 652 * @reg: offset of register 653 * @v: value to write to the register 654 * 655 * Dummy register read function. Used for register blocks 656 * that certain asics don't have (all asics). 
657 */ 658 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 659 { 660 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 661 reg, v); 662 BUG(); 663 } 664 665 /** 666 * amdgpu_block_invalid_rreg - dummy reg read function 667 * 668 * @adev: amdgpu device pointer 669 * @block: offset of instance 670 * @reg: offset of register 671 * 672 * Dummy register read function. Used for register blocks 673 * that certain asics don't have (all asics). 674 * Returns the value in the register. 675 */ 676 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 677 uint32_t block, uint32_t reg) 678 { 679 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 680 reg, block); 681 BUG(); 682 return 0; 683 } 684 685 /** 686 * amdgpu_block_invalid_wreg - dummy reg write function 687 * 688 * @adev: amdgpu device pointer 689 * @block: offset of instance 690 * @reg: offset of register 691 * @v: value to write to the register 692 * 693 * Dummy register read function. Used for register blocks 694 * that certain asics don't have (all asics). 695 */ 696 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 697 uint32_t block, 698 uint32_t reg, uint32_t v) 699 { 700 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 701 reg, block, v); 702 BUG(); 703 } 704 705 /** 706 * amdgpu_device_asic_init - Wrapper for atom asic_init 707 * 708 * @dev: drm_device pointer 709 * 710 * Does any asic specific work and then calls atom asic init. 711 */ 712 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 713 { 714 amdgpu_asic_pre_asic_init(adev); 715 716 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 717 } 718 719 /** 720 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page 721 * 722 * @adev: amdgpu device pointer 723 * 724 * Allocates a scratch page of VRAM for use by various things in the 725 * driver. 726 */ 727 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev) 728 { 729 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, 730 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM, 731 &adev->vram_scratch.robj, 732 &adev->vram_scratch.gpu_addr, 733 (void **)&adev->vram_scratch.ptr); 734 } 735 736 /** 737 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page 738 * 739 * @adev: amdgpu device pointer 740 * 741 * Frees the VRAM scratch page. 742 */ 743 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev) 744 { 745 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL); 746 } 747 748 /** 749 * amdgpu_device_program_register_sequence - program an array of registers. 750 * 751 * @adev: amdgpu_device pointer 752 * @registers: pointer to the register array 753 * @array_size: size of the register array 754 * 755 * Programs an array or registers with and and or masks. 756 * This is a helper for setting golden registers. 
757 */ 758 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 759 const u32 *registers, 760 const u32 array_size) 761 { 762 u32 tmp, reg, and_mask, or_mask; 763 int i; 764 765 if (array_size % 3) 766 return; 767 768 for (i = 0; i < array_size; i +=3) { 769 reg = registers[i + 0]; 770 and_mask = registers[i + 1]; 771 or_mask = registers[i + 2]; 772 773 if (and_mask == 0xffffffff) { 774 tmp = or_mask; 775 } else { 776 tmp = RREG32(reg); 777 tmp &= ~and_mask; 778 if (adev->family >= AMDGPU_FAMILY_AI) 779 tmp |= (or_mask & and_mask); 780 else 781 tmp |= or_mask; 782 } 783 WREG32(reg, tmp); 784 } 785 } 786 787 /** 788 * amdgpu_device_pci_config_reset - reset the GPU 789 * 790 * @adev: amdgpu_device pointer 791 * 792 * Resets the GPU using the pci config reset sequence. 793 * Only applicable to asics prior to vega10. 794 */ 795 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 796 { 797 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 798 } 799 800 /* 801 * GPU doorbell aperture helpers function. 802 */ 803 /** 804 * amdgpu_device_doorbell_init - Init doorbell driver information. 805 * 806 * @adev: amdgpu_device pointer 807 * 808 * Init doorbell driver information (CIK) 809 * Returns 0 on success, error on failure. 810 */ 811 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) 812 { 813 814 /* No doorbell on SI hardware generation */ 815 if (adev->asic_type < CHIP_BONAIRE) { 816 adev->doorbell.base = 0; 817 adev->doorbell.size = 0; 818 adev->doorbell.num_doorbells = 0; 819 adev->doorbell.ptr = NULL; 820 return 0; 821 } 822 823 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) 824 return -EINVAL; 825 826 amdgpu_asic_init_doorbell_index(adev); 827 828 /* doorbell bar mapping */ 829 adev->doorbell.base = pci_resource_start(adev->pdev, 2); 830 adev->doorbell.size = pci_resource_len(adev->pdev, 2); 831 832 adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32), 833 adev->doorbell_index.max_assignment+1); 834 if (adev->doorbell.num_doorbells == 0) 835 return -EINVAL; 836 837 /* For Vega, reserve and map two pages on doorbell BAR since SDMA 838 * paging queue doorbell use the second page. The 839 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the 840 * doorbells are in the first page. So with paging queue enabled, 841 * the max num_doorbells should + 1 page (0x400 in dword) 842 */ 843 if (adev->asic_type >= CHIP_VEGA10) 844 adev->doorbell.num_doorbells += 0x400; 845 846 adev->doorbell.ptr = ioremap(adev->doorbell.base, 847 adev->doorbell.num_doorbells * 848 sizeof(u32)); 849 if (adev->doorbell.ptr == NULL) 850 return -ENOMEM; 851 852 return 0; 853 } 854 855 /** 856 * amdgpu_device_doorbell_fini - Tear down doorbell driver information. 857 * 858 * @adev: amdgpu_device pointer 859 * 860 * Tear down doorbell driver information (CIK) 861 */ 862 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) 863 { 864 iounmap(adev->doorbell.ptr); 865 adev->doorbell.ptr = NULL; 866 } 867 868 869 870 /* 871 * amdgpu_device_wb_*() 872 * Writeback is the method by which the GPU updates special pages in memory 873 * with the status of certain GPU events (fences, ring pointers,etc.). 874 */ 875 876 /** 877 * amdgpu_device_wb_fini - Disable Writeback and free memory 878 * 879 * @adev: amdgpu_device pointer 880 * 881 * Disables Writeback and frees the Writeback memory (all asics). 882 * Used at driver shutdown. 
883 */ 884 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 885 { 886 if (adev->wb.wb_obj) { 887 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 888 &adev->wb.gpu_addr, 889 (void **)&adev->wb.wb); 890 adev->wb.wb_obj = NULL; 891 } 892 } 893 894 /** 895 * amdgpu_device_wb_init- Init Writeback driver info and allocate memory 896 * 897 * @adev: amdgpu_device pointer 898 * 899 * Initializes writeback and allocates writeback memory (all asics). 900 * Used at driver startup. 901 * Returns 0 on success or an -error on failure. 902 */ 903 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 904 { 905 int r; 906 907 if (adev->wb.wb_obj == NULL) { 908 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 909 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 910 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 911 &adev->wb.wb_obj, &adev->wb.gpu_addr, 912 (void **)&adev->wb.wb); 913 if (r) { 914 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 915 return r; 916 } 917 918 adev->wb.num_wb = AMDGPU_MAX_WB; 919 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 920 921 /* clear wb memory */ 922 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 923 } 924 925 return 0; 926 } 927 928 /** 929 * amdgpu_device_wb_get - Allocate a wb entry 930 * 931 * @adev: amdgpu_device pointer 932 * @wb: wb index 933 * 934 * Allocate a wb slot for use by the driver (all asics). 935 * Returns 0 on success or -EINVAL on failure. 936 */ 937 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 938 { 939 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 940 941 if (offset < adev->wb.num_wb) { 942 __set_bit(offset, adev->wb.used); 943 *wb = offset << 3; /* convert to dw offset */ 944 return 0; 945 } else { 946 return -EINVAL; 947 } 948 } 949 950 /** 951 * amdgpu_device_wb_free - Free a wb entry 952 * 953 * @adev: amdgpu_device pointer 954 * @wb: wb index 955 * 956 * Free a wb slot allocated for use by the driver (all asics) 957 */ 958 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 959 { 960 wb >>= 3; 961 if (wb < adev->wb.num_wb) 962 __clear_bit(wb, adev->wb.used); 963 } 964 965 /** 966 * amdgpu_device_resize_fb_bar - try to resize FB BAR 967 * 968 * @adev: amdgpu_device pointer 969 * 970 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 971 * to fail, but if any of the BARs is not accessible after the size we abort 972 * driver loading by returning -ENODEV. 
973 */ 974 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 975 { 976 u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size); 977 u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1; 978 struct pci_bus *root; 979 struct resource *res; 980 unsigned i; 981 u16 cmd; 982 int r; 983 984 /* Bypass for VF */ 985 if (amdgpu_sriov_vf(adev)) 986 return 0; 987 988 /* skip if the bios has already enabled large BAR */ 989 if (adev->gmc.real_vram_size && 990 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 991 return 0; 992 993 /* Check if the root BUS has 64bit memory resources */ 994 root = adev->pdev->bus; 995 while (root->parent) 996 root = root->parent; 997 998 pci_bus_for_each_resource(root, res, i) { 999 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1000 res->start > 0x100000000ull) 1001 break; 1002 } 1003 1004 /* Trying to resize is pointless without a root hub window above 4GB */ 1005 if (!res) 1006 return 0; 1007 1008 /* Disable memory decoding while we change the BAR addresses and size */ 1009 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1010 pci_write_config_word(adev->pdev, PCI_COMMAND, 1011 cmd & ~PCI_COMMAND_MEMORY); 1012 1013 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1014 amdgpu_device_doorbell_fini(adev); 1015 if (adev->asic_type >= CHIP_BONAIRE) 1016 pci_release_resource(adev->pdev, 2); 1017 1018 pci_release_resource(adev->pdev, 0); 1019 1020 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1021 if (r == -ENOSPC) 1022 DRM_INFO("Not enough PCI address space for a large BAR."); 1023 else if (r && r != -ENOTSUPP) 1024 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1025 1026 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1027 1028 /* When the doorbell or fb BAR isn't available we have no chance of 1029 * using the device. 1030 */ 1031 r = amdgpu_device_doorbell_init(adev); 1032 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1033 return -ENODEV; 1034 1035 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1036 1037 return 0; 1038 } 1039 1040 /* 1041 * GPU helpers function. 1042 */ 1043 /** 1044 * amdgpu_device_need_post - check if the hw need post or not 1045 * 1046 * @adev: amdgpu_device pointer 1047 * 1048 * Check if the asic has been initialized (all asics) at driver startup 1049 * or post is needed if hw reset is performed. 1050 * Returns true if need or false if not. 
1051 */ 1052 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1053 { 1054 uint32_t reg; 1055 1056 if (amdgpu_sriov_vf(adev)) 1057 return false; 1058 1059 if (amdgpu_passthrough(adev)) { 1060 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1061 * some old smc fw still need driver do vPost otherwise gpu hang, while 1062 * those smc fw version above 22.15 doesn't have this flaw, so we force 1063 * vpost executed for smc version below 22.15 1064 */ 1065 if (adev->asic_type == CHIP_FIJI) { 1066 int err; 1067 uint32_t fw_ver; 1068 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1069 /* force vPost if error occured */ 1070 if (err) 1071 return true; 1072 1073 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1074 if (fw_ver < 0x00160e00) 1075 return true; 1076 } 1077 } 1078 1079 if (adev->has_hw_reset) { 1080 adev->has_hw_reset = false; 1081 return true; 1082 } 1083 1084 /* bios scratch used on CIK+ */ 1085 if (adev->asic_type >= CHIP_BONAIRE) 1086 return amdgpu_atombios_scratch_need_asic_init(adev); 1087 1088 /* check MEM_SIZE for older asics */ 1089 reg = amdgpu_asic_get_config_memsize(adev); 1090 1091 if ((reg != 0) && (reg != 0xffffffff)) 1092 return false; 1093 1094 return true; 1095 } 1096 1097 /* if we get transitioned to only one device, take VGA back */ 1098 /** 1099 * amdgpu_device_vga_set_decode - enable/disable vga decode 1100 * 1101 * @cookie: amdgpu_device pointer 1102 * @state: enable/disable vga decode 1103 * 1104 * Enable/disable vga decode (all asics). 1105 * Returns VGA resource flags. 1106 */ 1107 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state) 1108 { 1109 struct amdgpu_device *adev = cookie; 1110 amdgpu_asic_set_vga_state(adev, state); 1111 if (state) 1112 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1113 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1114 else 1115 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1116 } 1117 1118 /** 1119 * amdgpu_device_check_block_size - validate the vm block size 1120 * 1121 * @adev: amdgpu_device pointer 1122 * 1123 * Validates the vm block size specified via module parameter. 1124 * The vm block size defines number of bits in page table versus page directory, 1125 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1126 * page table and the remaining bits are in the page directory. 1127 */ 1128 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1129 { 1130 /* defines number of bits in page table versus page directory, 1131 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1132 * page table and the remaining bits are in the page directory */ 1133 if (amdgpu_vm_block_size == -1) 1134 return; 1135 1136 if (amdgpu_vm_block_size < 9) { 1137 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1138 amdgpu_vm_block_size); 1139 amdgpu_vm_block_size = -1; 1140 } 1141 } 1142 1143 /** 1144 * amdgpu_device_check_vm_size - validate the vm size 1145 * 1146 * @adev: amdgpu_device pointer 1147 * 1148 * Validates the vm size in GB specified via module parameter. 1149 * The VM size is the size of the GPU virtual memory space in GB. 
1150 */ 1151 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1152 { 1153 /* no need to check the default value */ 1154 if (amdgpu_vm_size == -1) 1155 return; 1156 1157 if (amdgpu_vm_size < 1) { 1158 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1159 amdgpu_vm_size); 1160 amdgpu_vm_size = -1; 1161 } 1162 } 1163 1164 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1165 { 1166 struct sysinfo si; 1167 bool is_os_64 = (sizeof(void *) == 8); 1168 uint64_t total_memory; 1169 uint64_t dram_size_seven_GB = 0x1B8000000; 1170 uint64_t dram_size_three_GB = 0xB8000000; 1171 1172 if (amdgpu_smu_memory_pool_size == 0) 1173 return; 1174 1175 if (!is_os_64) { 1176 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1177 goto def_value; 1178 } 1179 si_meminfo(&si); 1180 total_memory = (uint64_t)si.totalram * si.mem_unit; 1181 1182 if ((amdgpu_smu_memory_pool_size == 1) || 1183 (amdgpu_smu_memory_pool_size == 2)) { 1184 if (total_memory < dram_size_three_GB) 1185 goto def_value1; 1186 } else if ((amdgpu_smu_memory_pool_size == 4) || 1187 (amdgpu_smu_memory_pool_size == 8)) { 1188 if (total_memory < dram_size_seven_GB) 1189 goto def_value1; 1190 } else { 1191 DRM_WARN("Smu memory pool size not supported\n"); 1192 goto def_value; 1193 } 1194 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1195 1196 return; 1197 1198 def_value1: 1199 DRM_WARN("No enough system memory\n"); 1200 def_value: 1201 adev->pm.smu_prv_buffer_size = 0; 1202 } 1203 1204 /** 1205 * amdgpu_device_check_arguments - validate module params 1206 * 1207 * @adev: amdgpu_device pointer 1208 * 1209 * Validates certain module parameters and updates 1210 * the associated values used by the driver (all asics). 1211 */ 1212 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1213 { 1214 if (amdgpu_sched_jobs < 4) { 1215 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1216 amdgpu_sched_jobs); 1217 amdgpu_sched_jobs = 4; 1218 } else if (!is_power_of_2(amdgpu_sched_jobs)){ 1219 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1220 amdgpu_sched_jobs); 1221 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1222 } 1223 1224 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1225 /* gart size must be greater or equal to 32M */ 1226 dev_warn(adev->dev, "gart size (%d) too small\n", 1227 amdgpu_gart_size); 1228 amdgpu_gart_size = -1; 1229 } 1230 1231 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1232 /* gtt size must be greater or equal to 32M */ 1233 dev_warn(adev->dev, "gtt size (%d) too small\n", 1234 amdgpu_gtt_size); 1235 amdgpu_gtt_size = -1; 1236 } 1237 1238 /* valid range is between 4 and 9 inclusive */ 1239 if (amdgpu_vm_fragment_size != -1 && 1240 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1241 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1242 amdgpu_vm_fragment_size = -1; 1243 } 1244 1245 if (amdgpu_sched_hw_submission < 2) { 1246 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1247 amdgpu_sched_hw_submission); 1248 amdgpu_sched_hw_submission = 2; 1249 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1250 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1251 amdgpu_sched_hw_submission); 1252 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1253 } 1254 1255 amdgpu_device_check_smu_prv_buffer_size(adev); 1256 1257 amdgpu_device_check_vm_size(adev); 1258 1259 
amdgpu_device_check_block_size(adev); 1260 1261 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1262 1263 amdgpu_gmc_tmz_set(adev); 1264 1265 if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) { 1266 amdgpu_num_kcq = 8; 1267 dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid parameter provided by user\n"); 1268 } 1269 1270 return 0; 1271 } 1272 1273 /** 1274 * amdgpu_switcheroo_set_state - set switcheroo state 1275 * 1276 * @pdev: pci dev pointer 1277 * @state: vga_switcheroo state 1278 * 1279 * Callback for the switcheroo driver. Suspends or resumes the 1280 * the asics before or after it is powered up using ACPI methods. 1281 */ 1282 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1283 enum vga_switcheroo_state state) 1284 { 1285 struct drm_device *dev = pci_get_drvdata(pdev); 1286 int r; 1287 1288 if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF) 1289 return; 1290 1291 if (state == VGA_SWITCHEROO_ON) { 1292 pr_info("switched on\n"); 1293 /* don't suspend or resume card normally */ 1294 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1295 1296 pci_set_power_state(dev->pdev, PCI_D0); 1297 amdgpu_device_load_pci_state(dev->pdev); 1298 r = pci_enable_device(dev->pdev); 1299 if (r) 1300 DRM_WARN("pci_enable_device failed (%d)\n", r); 1301 amdgpu_device_resume(dev, true); 1302 1303 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1304 drm_kms_helper_poll_enable(dev); 1305 } else { 1306 pr_info("switched off\n"); 1307 drm_kms_helper_poll_disable(dev); 1308 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1309 amdgpu_device_suspend(dev, true); 1310 amdgpu_device_cache_pci_state(dev->pdev); 1311 /* Shut down the device */ 1312 pci_disable_device(dev->pdev); 1313 pci_set_power_state(dev->pdev, PCI_D3cold); 1314 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1315 } 1316 } 1317 1318 /** 1319 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1320 * 1321 * @pdev: pci dev pointer 1322 * 1323 * Callback for the switcheroo driver. Check of the switcheroo 1324 * state can be changed. 1325 * Returns true if the state can be changed, false if not. 1326 */ 1327 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1328 { 1329 struct drm_device *dev = pci_get_drvdata(pdev); 1330 1331 /* 1332 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1333 * locking inversion with the driver load path. And the access here is 1334 * completely racy anyway. So don't bother with locking for now. 1335 */ 1336 return atomic_read(&dev->open_count) == 0; 1337 } 1338 1339 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1340 .set_gpu_state = amdgpu_switcheroo_set_state, 1341 .reprobe = NULL, 1342 .can_switch = amdgpu_switcheroo_can_switch, 1343 }; 1344 1345 /** 1346 * amdgpu_device_ip_set_clockgating_state - set the CG state 1347 * 1348 * @dev: amdgpu_device pointer 1349 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1350 * @state: clockgating state (gate or ungate) 1351 * 1352 * Sets the requested clockgating state for all instances of 1353 * the hardware IP specified. 1354 * Returns the error code from the last instance. 
1355 */ 1356 int amdgpu_device_ip_set_clockgating_state(void *dev, 1357 enum amd_ip_block_type block_type, 1358 enum amd_clockgating_state state) 1359 { 1360 struct amdgpu_device *adev = dev; 1361 int i, r = 0; 1362 1363 for (i = 0; i < adev->num_ip_blocks; i++) { 1364 if (!adev->ip_blocks[i].status.valid) 1365 continue; 1366 if (adev->ip_blocks[i].version->type != block_type) 1367 continue; 1368 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1369 continue; 1370 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1371 (void *)adev, state); 1372 if (r) 1373 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1374 adev->ip_blocks[i].version->funcs->name, r); 1375 } 1376 return r; 1377 } 1378 1379 /** 1380 * amdgpu_device_ip_set_powergating_state - set the PG state 1381 * 1382 * @dev: amdgpu_device pointer 1383 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1384 * @state: powergating state (gate or ungate) 1385 * 1386 * Sets the requested powergating state for all instances of 1387 * the hardware IP specified. 1388 * Returns the error code from the last instance. 1389 */ 1390 int amdgpu_device_ip_set_powergating_state(void *dev, 1391 enum amd_ip_block_type block_type, 1392 enum amd_powergating_state state) 1393 { 1394 struct amdgpu_device *adev = dev; 1395 int i, r = 0; 1396 1397 for (i = 0; i < adev->num_ip_blocks; i++) { 1398 if (!adev->ip_blocks[i].status.valid) 1399 continue; 1400 if (adev->ip_blocks[i].version->type != block_type) 1401 continue; 1402 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 1403 continue; 1404 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 1405 (void *)adev, state); 1406 if (r) 1407 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 1408 adev->ip_blocks[i].version->funcs->name, r); 1409 } 1410 return r; 1411 } 1412 1413 /** 1414 * amdgpu_device_ip_get_clockgating_state - get the CG state 1415 * 1416 * @adev: amdgpu_device pointer 1417 * @flags: clockgating feature flags 1418 * 1419 * Walks the list of IPs on the device and updates the clockgating 1420 * flags for each IP. 1421 * Updates @flags with the feature flags for each hardware IP where 1422 * clockgating is enabled. 1423 */ 1424 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 1425 u32 *flags) 1426 { 1427 int i; 1428 1429 for (i = 0; i < adev->num_ip_blocks; i++) { 1430 if (!adev->ip_blocks[i].status.valid) 1431 continue; 1432 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 1433 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 1434 } 1435 } 1436 1437 /** 1438 * amdgpu_device_ip_wait_for_idle - wait for idle 1439 * 1440 * @adev: amdgpu_device pointer 1441 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1442 * 1443 * Waits for the request hardware IP to be idle. 1444 * Returns 0 for success or a negative error code on failure. 
1445 */ 1446 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 1447 enum amd_ip_block_type block_type) 1448 { 1449 int i, r; 1450 1451 for (i = 0; i < adev->num_ip_blocks; i++) { 1452 if (!adev->ip_blocks[i].status.valid) 1453 continue; 1454 if (adev->ip_blocks[i].version->type == block_type) { 1455 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 1456 if (r) 1457 return r; 1458 break; 1459 } 1460 } 1461 return 0; 1462 1463 } 1464 1465 /** 1466 * amdgpu_device_ip_is_idle - is the hardware IP idle 1467 * 1468 * @adev: amdgpu_device pointer 1469 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1470 * 1471 * Check if the hardware IP is idle or not. 1472 * Returns true if it the IP is idle, false if not. 1473 */ 1474 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 1475 enum amd_ip_block_type block_type) 1476 { 1477 int i; 1478 1479 for (i = 0; i < adev->num_ip_blocks; i++) { 1480 if (!adev->ip_blocks[i].status.valid) 1481 continue; 1482 if (adev->ip_blocks[i].version->type == block_type) 1483 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 1484 } 1485 return true; 1486 1487 } 1488 1489 /** 1490 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 1491 * 1492 * @adev: amdgpu_device pointer 1493 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 1494 * 1495 * Returns a pointer to the hardware IP block structure 1496 * if it exists for the asic, otherwise NULL. 1497 */ 1498 struct amdgpu_ip_block * 1499 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 1500 enum amd_ip_block_type type) 1501 { 1502 int i; 1503 1504 for (i = 0; i < adev->num_ip_blocks; i++) 1505 if (adev->ip_blocks[i].version->type == type) 1506 return &adev->ip_blocks[i]; 1507 1508 return NULL; 1509 } 1510 1511 /** 1512 * amdgpu_device_ip_block_version_cmp 1513 * 1514 * @adev: amdgpu_device pointer 1515 * @type: enum amd_ip_block_type 1516 * @major: major version 1517 * @minor: minor version 1518 * 1519 * return 0 if equal or greater 1520 * return 1 if smaller or the ip_block doesn't exist 1521 */ 1522 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 1523 enum amd_ip_block_type type, 1524 u32 major, u32 minor) 1525 { 1526 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 1527 1528 if (ip_block && ((ip_block->version->major > major) || 1529 ((ip_block->version->major == major) && 1530 (ip_block->version->minor >= minor)))) 1531 return 0; 1532 1533 return 1; 1534 } 1535 1536 /** 1537 * amdgpu_device_ip_block_add 1538 * 1539 * @adev: amdgpu_device pointer 1540 * @ip_block_version: pointer to the IP to add 1541 * 1542 * Adds the IP block driver information to the collection of IPs 1543 * on the asic. 1544 */ 1545 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 1546 const struct amdgpu_ip_block_version *ip_block_version) 1547 { 1548 if (!ip_block_version) 1549 return -EINVAL; 1550 1551 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 1552 ip_block_version->funcs->name); 1553 1554 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 1555 1556 return 0; 1557 } 1558 1559 /** 1560 * amdgpu_device_enable_virtual_display - enable virtual display feature 1561 * 1562 * @adev: amdgpu_device pointer 1563 * 1564 * Enabled the virtual display feature if the user has enabled it via 1565 * the module parameter virtual_display. This feature provides a virtual 1566 * display hardware on headless boards or in virtualized environments. 
1567 * This function parses and validates the configuration string specified by 1568 * the user and configues the virtual display configuration (number of 1569 * virtual connectors, crtcs, etc.) specified. 1570 */ 1571 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 1572 { 1573 adev->enable_virtual_display = false; 1574 1575 if (amdgpu_virtual_display) { 1576 struct drm_device *ddev = adev_to_drm(adev); 1577 const char *pci_address_name = pci_name(ddev->pdev); 1578 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 1579 1580 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 1581 pciaddstr_tmp = pciaddstr; 1582 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 1583 pciaddname = strsep(&pciaddname_tmp, ","); 1584 if (!strcmp("all", pciaddname) 1585 || !strcmp(pci_address_name, pciaddname)) { 1586 long num_crtc; 1587 int res = -1; 1588 1589 adev->enable_virtual_display = true; 1590 1591 if (pciaddname_tmp) 1592 res = kstrtol(pciaddname_tmp, 10, 1593 &num_crtc); 1594 1595 if (!res) { 1596 if (num_crtc < 1) 1597 num_crtc = 1; 1598 if (num_crtc > 6) 1599 num_crtc = 6; 1600 adev->mode_info.num_crtc = num_crtc; 1601 } else { 1602 adev->mode_info.num_crtc = 1; 1603 } 1604 break; 1605 } 1606 } 1607 1608 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 1609 amdgpu_virtual_display, pci_address_name, 1610 adev->enable_virtual_display, adev->mode_info.num_crtc); 1611 1612 kfree(pciaddstr); 1613 } 1614 } 1615 1616 /** 1617 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 1618 * 1619 * @adev: amdgpu_device pointer 1620 * 1621 * Parses the asic configuration parameters specified in the gpu info 1622 * firmware and makes them availale to the driver for use in configuring 1623 * the asic. 1624 * Returns 0 on success, -EINVAL on failure. 1625 */ 1626 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 1627 { 1628 const char *chip_name; 1629 char fw_name[40]; 1630 int err; 1631 const struct gpu_info_firmware_header_v1_0 *hdr; 1632 1633 adev->firmware.gpu_info_fw = NULL; 1634 1635 if (adev->mman.discovery_bin) { 1636 amdgpu_discovery_get_gfx_info(adev); 1637 1638 /* 1639 * FIXME: The bounding box is still needed by Navi12, so 1640 * temporarily read it from gpu_info firmware. Should be droped 1641 * when DAL no longer needs it. 
1642 */ 1643 if (adev->asic_type != CHIP_NAVI12) 1644 return 0; 1645 } 1646 1647 switch (adev->asic_type) { 1648 #ifdef CONFIG_DRM_AMDGPU_SI 1649 case CHIP_VERDE: 1650 case CHIP_TAHITI: 1651 case CHIP_PITCAIRN: 1652 case CHIP_OLAND: 1653 case CHIP_HAINAN: 1654 #endif 1655 #ifdef CONFIG_DRM_AMDGPU_CIK 1656 case CHIP_BONAIRE: 1657 case CHIP_HAWAII: 1658 case CHIP_KAVERI: 1659 case CHIP_KABINI: 1660 case CHIP_MULLINS: 1661 #endif 1662 case CHIP_TOPAZ: 1663 case CHIP_TONGA: 1664 case CHIP_FIJI: 1665 case CHIP_POLARIS10: 1666 case CHIP_POLARIS11: 1667 case CHIP_POLARIS12: 1668 case CHIP_VEGAM: 1669 case CHIP_CARRIZO: 1670 case CHIP_STONEY: 1671 case CHIP_VEGA20: 1672 default: 1673 return 0; 1674 case CHIP_VEGA10: 1675 chip_name = "vega10"; 1676 break; 1677 case CHIP_VEGA12: 1678 chip_name = "vega12"; 1679 break; 1680 case CHIP_RAVEN: 1681 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1682 chip_name = "raven2"; 1683 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1684 chip_name = "picasso"; 1685 else 1686 chip_name = "raven"; 1687 break; 1688 case CHIP_ARCTURUS: 1689 chip_name = "arcturus"; 1690 break; 1691 case CHIP_RENOIR: 1692 chip_name = "renoir"; 1693 break; 1694 case CHIP_NAVI10: 1695 chip_name = "navi10"; 1696 break; 1697 case CHIP_NAVI14: 1698 chip_name = "navi14"; 1699 break; 1700 case CHIP_NAVI12: 1701 chip_name = "navi12"; 1702 break; 1703 case CHIP_SIENNA_CICHLID: 1704 chip_name = "sienna_cichlid"; 1705 break; 1706 case CHIP_NAVY_FLOUNDER: 1707 chip_name = "navy_flounder"; 1708 break; 1709 } 1710 1711 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 1712 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); 1713 if (err) { 1714 dev_err(adev->dev, 1715 "Failed to load gpu_info firmware \"%s\"\n", 1716 fw_name); 1717 goto out; 1718 } 1719 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); 1720 if (err) { 1721 dev_err(adev->dev, 1722 "Failed to validate gpu_info firmware \"%s\"\n", 1723 fw_name); 1724 goto out; 1725 } 1726 1727 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 1728 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 1729 1730 switch (hdr->version_major) { 1731 case 1: 1732 { 1733 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 1734 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 1735 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1736 1737 /* 1738 * Should be droped when DAL no longer needs it. 
1739 */ 1740 if (adev->asic_type == CHIP_NAVI12) 1741 goto parse_soc_bounding_box; 1742 1743 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 1744 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 1745 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 1746 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 1747 adev->gfx.config.max_texture_channel_caches = 1748 le32_to_cpu(gpu_info_fw->gc_num_tccs); 1749 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 1750 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 1751 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 1752 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 1753 adev->gfx.config.double_offchip_lds_buf = 1754 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 1755 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 1756 adev->gfx.cu_info.max_waves_per_simd = 1757 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 1758 adev->gfx.cu_info.max_scratch_slots_per_cu = 1759 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 1760 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 1761 if (hdr->version_minor >= 1) { 1762 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 1763 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 1764 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1765 adev->gfx.config.num_sc_per_sh = 1766 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 1767 adev->gfx.config.num_packer_per_sc = 1768 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 1769 } 1770 1771 parse_soc_bounding_box: 1772 /* 1773 * soc bounding box info is not integrated in disocovery table, 1774 * we always need to parse it from gpu info firmware if needed. 1775 */ 1776 if (hdr->version_minor == 2) { 1777 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 1778 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 1779 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1780 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 1781 } 1782 break; 1783 } 1784 default: 1785 dev_err(adev->dev, 1786 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 1787 err = -EINVAL; 1788 goto out; 1789 } 1790 out: 1791 return err; 1792 } 1793 1794 /** 1795 * amdgpu_device_ip_early_init - run early init for hardware IPs 1796 * 1797 * @adev: amdgpu_device pointer 1798 * 1799 * Early initialization pass for hardware IPs. The hardware IPs that make 1800 * up each asic are discovered each IP's early_init callback is run. This 1801 * is the first stage in initializing the asic. 1802 * Returns 0 on success, negative error code on failure. 
1803 */ 1804 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 1805 { 1806 int i, r; 1807 1808 amdgpu_device_enable_virtual_display(adev); 1809 1810 if (amdgpu_sriov_vf(adev)) { 1811 r = amdgpu_virt_request_full_gpu(adev, true); 1812 if (r) 1813 return r; 1814 } 1815 1816 switch (adev->asic_type) { 1817 #ifdef CONFIG_DRM_AMDGPU_SI 1818 case CHIP_VERDE: 1819 case CHIP_TAHITI: 1820 case CHIP_PITCAIRN: 1821 case CHIP_OLAND: 1822 case CHIP_HAINAN: 1823 adev->family = AMDGPU_FAMILY_SI; 1824 r = si_set_ip_blocks(adev); 1825 if (r) 1826 return r; 1827 break; 1828 #endif 1829 #ifdef CONFIG_DRM_AMDGPU_CIK 1830 case CHIP_BONAIRE: 1831 case CHIP_HAWAII: 1832 case CHIP_KAVERI: 1833 case CHIP_KABINI: 1834 case CHIP_MULLINS: 1835 if (adev->flags & AMD_IS_APU) 1836 adev->family = AMDGPU_FAMILY_KV; 1837 else 1838 adev->family = AMDGPU_FAMILY_CI; 1839 1840 r = cik_set_ip_blocks(adev); 1841 if (r) 1842 return r; 1843 break; 1844 #endif 1845 case CHIP_TOPAZ: 1846 case CHIP_TONGA: 1847 case CHIP_FIJI: 1848 case CHIP_POLARIS10: 1849 case CHIP_POLARIS11: 1850 case CHIP_POLARIS12: 1851 case CHIP_VEGAM: 1852 case CHIP_CARRIZO: 1853 case CHIP_STONEY: 1854 if (adev->flags & AMD_IS_APU) 1855 adev->family = AMDGPU_FAMILY_CZ; 1856 else 1857 adev->family = AMDGPU_FAMILY_VI; 1858 1859 r = vi_set_ip_blocks(adev); 1860 if (r) 1861 return r; 1862 break; 1863 case CHIP_VEGA10: 1864 case CHIP_VEGA12: 1865 case CHIP_VEGA20: 1866 case CHIP_RAVEN: 1867 case CHIP_ARCTURUS: 1868 case CHIP_RENOIR: 1869 if (adev->flags & AMD_IS_APU) 1870 adev->family = AMDGPU_FAMILY_RV; 1871 else 1872 adev->family = AMDGPU_FAMILY_AI; 1873 1874 r = soc15_set_ip_blocks(adev); 1875 if (r) 1876 return r; 1877 break; 1878 case CHIP_NAVI10: 1879 case CHIP_NAVI14: 1880 case CHIP_NAVI12: 1881 case CHIP_SIENNA_CICHLID: 1882 case CHIP_NAVY_FLOUNDER: 1883 adev->family = AMDGPU_FAMILY_NV; 1884 1885 r = nv_set_ip_blocks(adev); 1886 if (r) 1887 return r; 1888 break; 1889 default: 1890 /* FIXME: not supported yet */ 1891 return -EINVAL; 1892 } 1893 1894 amdgpu_amdkfd_device_probe(adev); 1895 1896 adev->pm.pp_feature = amdgpu_pp_feature_mask; 1897 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 1898 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 1899 1900 for (i = 0; i < adev->num_ip_blocks; i++) { 1901 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 1902 DRM_ERROR("disabled ip block: %d <%s>\n", 1903 i, adev->ip_blocks[i].version->funcs->name); 1904 adev->ip_blocks[i].status.valid = false; 1905 } else { 1906 if (adev->ip_blocks[i].version->funcs->early_init) { 1907 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 1908 if (r == -ENOENT) { 1909 adev->ip_blocks[i].status.valid = false; 1910 } else if (r) { 1911 DRM_ERROR("early_init of IP block <%s> failed %d\n", 1912 adev->ip_blocks[i].version->funcs->name, r); 1913 return r; 1914 } else { 1915 adev->ip_blocks[i].status.valid = true; 1916 } 1917 } else { 1918 adev->ip_blocks[i].status.valid = true; 1919 } 1920 } 1921 /* get the vbios after the asic_funcs are set up */ 1922 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 1923 r = amdgpu_device_parse_gpu_info_fw(adev); 1924 if (r) 1925 return r; 1926 1927 /* Read BIOS */ 1928 if (!amdgpu_get_bios(adev)) 1929 return -EINVAL; 1930 1931 r = amdgpu_atombios_init(adev); 1932 if (r) { 1933 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 1934 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 1935 return r; 1936 } 1937 } 1938 } 1939 1940 adev->cg_flags &= amdgpu_cg_mask; 1941 
adev->pg_flags &= amdgpu_pg_mask; 1942 1943 return 0; 1944 } 1945 1946 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 1947 { 1948 int i, r; 1949 1950 for (i = 0; i < adev->num_ip_blocks; i++) { 1951 if (!adev->ip_blocks[i].status.sw) 1952 continue; 1953 if (adev->ip_blocks[i].status.hw) 1954 continue; 1955 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 1956 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 1957 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 1958 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 1959 if (r) { 1960 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 1961 adev->ip_blocks[i].version->funcs->name, r); 1962 return r; 1963 } 1964 adev->ip_blocks[i].status.hw = true; 1965 } 1966 } 1967 1968 return 0; 1969 } 1970 1971 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 1972 { 1973 int i, r; 1974 1975 for (i = 0; i < adev->num_ip_blocks; i++) { 1976 if (!adev->ip_blocks[i].status.sw) 1977 continue; 1978 if (adev->ip_blocks[i].status.hw) 1979 continue; 1980 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 1981 if (r) { 1982 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 1983 adev->ip_blocks[i].version->funcs->name, r); 1984 return r; 1985 } 1986 adev->ip_blocks[i].status.hw = true; 1987 } 1988 1989 return 0; 1990 } 1991 1992 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 1993 { 1994 int r = 0; 1995 int i; 1996 uint32_t smu_version; 1997 1998 if (adev->asic_type >= CHIP_VEGA10) { 1999 for (i = 0; i < adev->num_ip_blocks; i++) { 2000 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2001 continue; 2002 2003 /* no need to do the fw loading again if already done*/ 2004 if (adev->ip_blocks[i].status.hw == true) 2005 break; 2006 2007 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2008 r = adev->ip_blocks[i].version->funcs->resume(adev); 2009 if (r) { 2010 DRM_ERROR("resume of IP block <%s> failed %d\n", 2011 adev->ip_blocks[i].version->funcs->name, r); 2012 return r; 2013 } 2014 } else { 2015 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2016 if (r) { 2017 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2018 adev->ip_blocks[i].version->funcs->name, r); 2019 return r; 2020 } 2021 } 2022 2023 adev->ip_blocks[i].status.hw = true; 2024 break; 2025 } 2026 } 2027 2028 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2029 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2030 2031 return r; 2032 } 2033 2034 /** 2035 * amdgpu_device_ip_init - run init for hardware IPs 2036 * 2037 * @adev: amdgpu_device pointer 2038 * 2039 * Main initialization pass for hardware IPs. The list of all the hardware 2040 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2041 * are run. sw_init initializes the software state associated with each IP 2042 * and hw_init initializes the hardware associated with each IP. 2043 * Returns 0 on success, negative error code on failure. 
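 *
 * In rough order: sw_init runs for every valid block, with the GMC block
 * additionally getting an early hw_init so VRAM scratch, writeback and (for
 * SR-IOV/MCBP) the static CSA can be allocated; hw_init for the remaining
 * blocks then runs in two phases (phase 1 covers COMMON and IH, plus PSP on
 * SR-IOV VFs), with SMU/PSP firmware loading in between.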
2044 */
2045 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2046 {
2047 int i, r;
2048
2049 r = amdgpu_ras_init(adev);
2050 if (r)
2051 return r;
2052
2053 for (i = 0; i < adev->num_ip_blocks; i++) {
2054 if (!adev->ip_blocks[i].status.valid)
2055 continue;
2056 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2057 if (r) {
2058 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2059 adev->ip_blocks[i].version->funcs->name, r);
2060 goto init_failed;
2061 }
2062 adev->ip_blocks[i].status.sw = true;
2063
2064 /* need to do gmc hw init early so we can allocate gpu mem */
2065 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2066 r = amdgpu_device_vram_scratch_init(adev);
2067 if (r) {
2068 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2069 goto init_failed;
2070 }
2071 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2072 if (r) {
2073 DRM_ERROR("hw_init %d failed %d\n", i, r);
2074 goto init_failed;
2075 }
2076 r = amdgpu_device_wb_init(adev);
2077 if (r) {
2078 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2079 goto init_failed;
2080 }
2081 adev->ip_blocks[i].status.hw = true;
2082
2083 /* right after GMC hw init, we create CSA */
2084 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2085 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2086 AMDGPU_GEM_DOMAIN_VRAM,
2087 AMDGPU_CSA_SIZE);
2088 if (r) {
2089 DRM_ERROR("allocate CSA failed %d\n", r);
2090 goto init_failed;
2091 }
2092 }
2093 }
2094 }
2095
2096 if (amdgpu_sriov_vf(adev))
2097 amdgpu_virt_init_data_exchange(adev);
2098
2099 r = amdgpu_ib_pool_init(adev);
2100 if (r) {
2101 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2102 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2103 goto init_failed;
2104 }
2105
2106 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete */
2107 if (r)
2108 goto init_failed;
2109
2110 r = amdgpu_device_ip_hw_init_phase1(adev);
2111 if (r)
2112 goto init_failed;
2113
2114 r = amdgpu_device_fw_loading(adev);
2115 if (r)
2116 goto init_failed;
2117
2118 r = amdgpu_device_ip_hw_init_phase2(adev);
2119 if (r)
2120 goto init_failed;
2121
2122 /*
2123 * retired pages will be loaded from eeprom and reserved here,
2124 * it should be called after amdgpu_device_ip_hw_init_phase2 since
2125 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
2126 * for I2C communication, which is only true at this point.
2127 *
2128 * amdgpu_ras_recovery_init may fail, but the caller only cares about
2129 * failures caused by a bad gpu state and stops the amdgpu init process
2130 * accordingly. For other failure cases, it will still release all
2131 * the resources and print an error message, rather than returning a
2132 * negative value to the upper level.
2133 *
2134 * Note: theoretically, this should be called before all vram allocations
2135 * to prevent retired pages from being used again.
2136 */
2137 r = amdgpu_ras_recovery_init(adev);
2138 if (r)
2139 goto init_failed;
2140
2141 if (adev->gmc.xgmi.num_physical_nodes > 1)
2142 amdgpu_xgmi_add_device(adev);
2143 amdgpu_amdkfd_device_init(adev);
2144
2145 amdgpu_fru_get_product_info(adev);
2146
2147 init_failed:
2148 if (amdgpu_sriov_vf(adev))
2149 amdgpu_virt_release_full_gpu(adev, true);
2150
2151 return r;
2152 }
2153
2154 /**
2155 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2156 *
2157 * @adev: amdgpu_device pointer
2158 *
2159 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2160 * this function before a GPU reset.
If the value is retained after a
2161 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2162 */
2163 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2164 {
2165 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2166 }
2167
2168 /**
2169 * amdgpu_device_check_vram_lost - check if vram is valid
2170 *
2171 * @adev: amdgpu_device pointer
2172 *
2173 * Checks the reset magic value written to the gart pointer in VRAM.
2174 * The driver calls this after a GPU reset to see if the contents of
2175 * VRAM have been lost or not.
2176 * Returns true if vram is lost, false if not.
2177 */
2178 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2179 {
2180 if (memcmp(adev->gart.ptr, adev->reset_magic,
2181 AMDGPU_RESET_MAGIC_NUM))
2182 return true;
2183
2184 if (!amdgpu_in_reset(adev))
2185 return false;
2186
2187 /*
2188 * For all ASICs with baco/mode1 reset, the VRAM is
2189 * always assumed to be lost.
2190 */
2191 switch (amdgpu_asic_reset_method(adev)) {
2192 case AMD_RESET_METHOD_BACO:
2193 case AMD_RESET_METHOD_MODE1:
2194 return true;
2195 default:
2196 return false;
2197 }
2198 }
2199
2200 /**
2201 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2202 *
2203 * @adev: amdgpu_device pointer
2204 * @state: clockgating state (gate or ungate)
2205 *
2206 * The list of all the hardware IPs that make up the asic is walked and the
2207 * set_clockgating_state callbacks are run.
2208 * In the late initialization pass this enables clockgating for hardware IPs;
2209 * in the fini or suspend pass it disables clockgating for hardware IPs.
2210 * Returns 0 on success, negative error code on failure.
2211 */
2212
2213 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2214 enum amd_clockgating_state state)
2215 {
2216 int i, j, r;
2217
2218 if (amdgpu_emu_mode == 1)
2219 return 0;
2220
2221 for (j = 0; j < adev->num_ip_blocks; j++) {
2222 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2223 if (!adev->ip_blocks[i].status.late_initialized)
2224 continue;
2225 /* skip CG for VCE/UVD, it's handled specially */
2226 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2227 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2228 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2229 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2230 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2231 /* enable clockgating to save power */
2232 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2233 state);
2234 if (r) {
2235 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2236 adev->ip_blocks[i].version->funcs->name, r);
2237 return r;
2238 }
2239 }
2240 }
2241
2242 return 0;
2243 }
2244
2245 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2246 {
2247 int i, j, r;
2248
2249 if (amdgpu_emu_mode == 1)
2250 return 0;
2251
2252 for (j = 0; j < adev->num_ip_blocks; j++) {
2253 i = state == AMD_PG_STATE_GATE ?
j : adev->num_ip_blocks - j - 1;
2254 if (!adev->ip_blocks[i].status.late_initialized)
2255 continue;
2256 /* skip PG for VCE/UVD, it's handled specially */
2257 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2258 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2259 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2260 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2261 adev->ip_blocks[i].version->funcs->set_powergating_state) {
2262 /* enable powergating to save power */
2263 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2264 state);
2265 if (r) {
2266 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2267 adev->ip_blocks[i].version->funcs->name, r);
2268 return r;
2269 }
2270 }
2271 }
2272 return 0;
2273 }
2274
2275 static int amdgpu_device_enable_mgpu_fan_boost(void)
2276 {
2277 struct amdgpu_gpu_instance *gpu_ins;
2278 struct amdgpu_device *adev;
2279 int i, ret = 0;
2280
2281 mutex_lock(&mgpu_info.mutex);
2282
2283 /*
2284 * MGPU fan boost feature should be enabled
2285 * only when there are two or more dGPUs in
2286 * the system
2287 */
2288 if (mgpu_info.num_dgpu < 2)
2289 goto out;
2290
2291 for (i = 0; i < mgpu_info.num_dgpu; i++) {
2292 gpu_ins = &(mgpu_info.gpu_ins[i]);
2293 adev = gpu_ins->adev;
2294 if (!(adev->flags & AMD_IS_APU) &&
2295 !gpu_ins->mgpu_fan_enabled) {
2296 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2297 if (ret)
2298 break;
2299
2300 gpu_ins->mgpu_fan_enabled = 1;
2301 }
2302 }
2303
2304 out:
2305 mutex_unlock(&mgpu_info.mutex);
2306
2307 return ret;
2308 }
2309
2310 /**
2311 * amdgpu_device_ip_late_init - run late init for hardware IPs
2312 *
2313 * @adev: amdgpu_device pointer
2314 *
2315 * Late initialization pass for hardware IPs. The list of all the hardware
2316 * IPs that make up the asic is walked and the late_init callbacks are run.
2317 * late_init covers any special initialization that an IP requires
2318 * after all of the IPs have been initialized or something that needs to happen
2319 * late in the init process.
2320 * Returns 0 on success, negative error code on failure.
2321 */
2322 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2323 {
2324 struct amdgpu_gpu_instance *gpu_instance;
2325 int i = 0, r;
2326
2327 for (i = 0; i < adev->num_ip_blocks; i++) {
2328 if (!adev->ip_blocks[i].status.hw)
2329 continue;
2330 if (adev->ip_blocks[i].version->funcs->late_init) {
2331 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2332 if (r) {
2333 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2334 adev->ip_blocks[i].version->funcs->name, r);
2335 return r;
2336 }
2337 }
2338 adev->ip_blocks[i].status.late_initialized = true;
2339 }
2340
2341 amdgpu_ras_set_error_query_ready(adev, true);
2342
2343 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2344 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2345
2346 amdgpu_device_fill_reset_magic(adev);
2347
2348 r = amdgpu_device_enable_mgpu_fan_boost();
2349 if (r)
2350 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2351
2352
2353 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2354 mutex_lock(&mgpu_info.mutex);
2355
2356 /*
2357 * Reset device p-state to low as this was booted with high.
2358 *
2359 * This should be performed only after all devices from the same
2360 * hive get initialized.
2361 *
2362 * However, we don't know in advance how many devices are in the hive,
2363 * as they are counted one by one during device initialization.
2364 * 2365 * So, we wait for all XGMI interlinked devices initialized. 2366 * This may bring some delays as those devices may come from 2367 * different hives. But that should be OK. 2368 */ 2369 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2370 for (i = 0; i < mgpu_info.num_gpu; i++) { 2371 gpu_instance = &(mgpu_info.gpu_ins[i]); 2372 if (gpu_instance->adev->flags & AMD_IS_APU) 2373 continue; 2374 2375 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2376 AMDGPU_XGMI_PSTATE_MIN); 2377 if (r) { 2378 DRM_ERROR("pstate setting failed (%d).\n", r); 2379 break; 2380 } 2381 } 2382 } 2383 2384 mutex_unlock(&mgpu_info.mutex); 2385 } 2386 2387 return 0; 2388 } 2389 2390 /** 2391 * amdgpu_device_ip_fini - run fini for hardware IPs 2392 * 2393 * @adev: amdgpu_device pointer 2394 * 2395 * Main teardown pass for hardware IPs. The list of all the hardware 2396 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2397 * are run. hw_fini tears down the hardware associated with each IP 2398 * and sw_fini tears down any software state associated with each IP. 2399 * Returns 0 on success, negative error code on failure. 2400 */ 2401 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2402 { 2403 int i, r; 2404 2405 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2406 amdgpu_virt_release_ras_err_handler_data(adev); 2407 2408 amdgpu_ras_pre_fini(adev); 2409 2410 if (adev->gmc.xgmi.num_physical_nodes > 1) 2411 amdgpu_xgmi_remove_device(adev); 2412 2413 amdgpu_amdkfd_device_fini(adev); 2414 2415 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2416 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2417 2418 /* need to disable SMC first */ 2419 for (i = 0; i < adev->num_ip_blocks; i++) { 2420 if (!adev->ip_blocks[i].status.hw) 2421 continue; 2422 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2423 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2424 /* XXX handle errors */ 2425 if (r) { 2426 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2427 adev->ip_blocks[i].version->funcs->name, r); 2428 } 2429 adev->ip_blocks[i].status.hw = false; 2430 break; 2431 } 2432 } 2433 2434 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2435 if (!adev->ip_blocks[i].status.hw) 2436 continue; 2437 2438 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2439 /* XXX handle errors */ 2440 if (r) { 2441 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2442 adev->ip_blocks[i].version->funcs->name, r); 2443 } 2444 2445 adev->ip_blocks[i].status.hw = false; 2446 } 2447 2448 2449 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2450 if (!adev->ip_blocks[i].status.sw) 2451 continue; 2452 2453 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2454 amdgpu_ucode_free_bo(adev); 2455 amdgpu_free_static_csa(&adev->virt.csa_obj); 2456 amdgpu_device_wb_fini(adev); 2457 amdgpu_device_vram_scratch_fini(adev); 2458 amdgpu_ib_pool_fini(adev); 2459 } 2460 2461 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2462 /* XXX handle errors */ 2463 if (r) { 2464 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2465 adev->ip_blocks[i].version->funcs->name, r); 2466 } 2467 adev->ip_blocks[i].status.sw = false; 2468 adev->ip_blocks[i].status.valid = false; 2469 } 2470 2471 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2472 if (!adev->ip_blocks[i].status.late_initialized) 2473 continue; 2474 if (adev->ip_blocks[i].version->funcs->late_fini) 2475 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2476 
adev->ip_blocks[i].status.late_initialized = false; 2477 } 2478 2479 amdgpu_ras_fini(adev); 2480 2481 if (amdgpu_sriov_vf(adev)) 2482 if (amdgpu_virt_release_full_gpu(adev, false)) 2483 DRM_ERROR("failed to release exclusive mode on fini\n"); 2484 2485 return 0; 2486 } 2487 2488 /** 2489 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2490 * 2491 * @work: work_struct. 2492 */ 2493 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2494 { 2495 struct amdgpu_device *adev = 2496 container_of(work, struct amdgpu_device, delayed_init_work.work); 2497 int r; 2498 2499 r = amdgpu_ib_ring_tests(adev); 2500 if (r) 2501 DRM_ERROR("ib ring test failed (%d).\n", r); 2502 } 2503 2504 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2505 { 2506 struct amdgpu_device *adev = 2507 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2508 2509 mutex_lock(&adev->gfx.gfx_off_mutex); 2510 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { 2511 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2512 adev->gfx.gfx_off_state = true; 2513 } 2514 mutex_unlock(&adev->gfx.gfx_off_mutex); 2515 } 2516 2517 /** 2518 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2519 * 2520 * @adev: amdgpu_device pointer 2521 * 2522 * Main suspend function for hardware IPs. The list of all the hardware 2523 * IPs that make up the asic is walked, clockgating is disabled and the 2524 * suspend callbacks are run. suspend puts the hardware and software state 2525 * in each IP into a state suitable for suspend. 2526 * Returns 0 on success, negative error code on failure. 2527 */ 2528 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2529 { 2530 int i, r; 2531 2532 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2533 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2534 2535 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2536 if (!adev->ip_blocks[i].status.valid) 2537 continue; 2538 2539 /* displays are handled separately */ 2540 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2541 continue; 2542 2543 /* XXX handle errors */ 2544 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2545 /* XXX handle errors */ 2546 if (r) { 2547 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2548 adev->ip_blocks[i].version->funcs->name, r); 2549 return r; 2550 } 2551 2552 adev->ip_blocks[i].status.hw = false; 2553 } 2554 2555 return 0; 2556 } 2557 2558 /** 2559 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2560 * 2561 * @adev: amdgpu_device pointer 2562 * 2563 * Main suspend function for hardware IPs. The list of all the hardware 2564 * IPs that make up the asic is walked, clockgating is disabled and the 2565 * suspend callbacks are run. suspend puts the hardware and software state 2566 * in each IP into a state suitable for suspend. 2567 * Returns 0 on success, negative error code on failure. 
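 *
 * Phase 1 (above) only suspends the display (DCE) blocks; this phase 2 pass
 * suspends everything else and, on bare metal, also moves the SMC into the
 * appropriate mp1 state.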
2568 */ 2569 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2570 { 2571 int i, r; 2572 2573 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2574 if (!adev->ip_blocks[i].status.valid) 2575 continue; 2576 /* displays are handled in phase1 */ 2577 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2578 continue; 2579 /* PSP lost connection when err_event_athub occurs */ 2580 if (amdgpu_ras_intr_triggered() && 2581 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2582 adev->ip_blocks[i].status.hw = false; 2583 continue; 2584 } 2585 /* XXX handle errors */ 2586 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2587 /* XXX handle errors */ 2588 if (r) { 2589 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2590 adev->ip_blocks[i].version->funcs->name, r); 2591 } 2592 adev->ip_blocks[i].status.hw = false; 2593 /* handle putting the SMC in the appropriate state */ 2594 if(!amdgpu_sriov_vf(adev)){ 2595 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2596 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 2597 if (r) { 2598 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 2599 adev->mp1_state, r); 2600 return r; 2601 } 2602 } 2603 } 2604 adev->ip_blocks[i].status.hw = false; 2605 } 2606 2607 return 0; 2608 } 2609 2610 /** 2611 * amdgpu_device_ip_suspend - run suspend for hardware IPs 2612 * 2613 * @adev: amdgpu_device pointer 2614 * 2615 * Main suspend function for hardware IPs. The list of all the hardware 2616 * IPs that make up the asic is walked, clockgating is disabled and the 2617 * suspend callbacks are run. suspend puts the hardware and software state 2618 * in each IP into a state suitable for suspend. 2619 * Returns 0 on success, negative error code on failure. 2620 */ 2621 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 2622 { 2623 int r; 2624 2625 if (amdgpu_sriov_vf(adev)) 2626 amdgpu_virt_request_full_gpu(adev, false); 2627 2628 r = amdgpu_device_ip_suspend_phase1(adev); 2629 if (r) 2630 return r; 2631 r = amdgpu_device_ip_suspend_phase2(adev); 2632 2633 if (amdgpu_sriov_vf(adev)) 2634 amdgpu_virt_release_full_gpu(adev, false); 2635 2636 return r; 2637 } 2638 2639 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 2640 { 2641 int i, r; 2642 2643 static enum amd_ip_block_type ip_order[] = { 2644 AMD_IP_BLOCK_TYPE_GMC, 2645 AMD_IP_BLOCK_TYPE_COMMON, 2646 AMD_IP_BLOCK_TYPE_PSP, 2647 AMD_IP_BLOCK_TYPE_IH, 2648 }; 2649 2650 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2651 int j; 2652 struct amdgpu_ip_block *block; 2653 2654 block = &adev->ip_blocks[i]; 2655 block->status.hw = false; 2656 2657 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 2658 2659 if (block->version->type != ip_order[j] || 2660 !block->status.valid) 2661 continue; 2662 2663 r = block->version->funcs->hw_init(adev); 2664 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2665 if (r) 2666 return r; 2667 block->status.hw = true; 2668 } 2669 } 2670 2671 return 0; 2672 } 2673 2674 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 2675 { 2676 int i, r; 2677 2678 static enum amd_ip_block_type ip_order[] = { 2679 AMD_IP_BLOCK_TYPE_SMC, 2680 AMD_IP_BLOCK_TYPE_DCE, 2681 AMD_IP_BLOCK_TYPE_GFX, 2682 AMD_IP_BLOCK_TYPE_SDMA, 2683 AMD_IP_BLOCK_TYPE_UVD, 2684 AMD_IP_BLOCK_TYPE_VCE, 2685 AMD_IP_BLOCK_TYPE_VCN 2686 }; 2687 2688 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2689 int j; 2690 struct amdgpu_ip_block *block; 2691 2692 for (j = 0; j < adev->num_ip_blocks; j++) { 
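/* Find the block implementing this step of the SR-IOV resume order:
 * the SMC is resumed while every other block gets a fresh hw_init. */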
2693 block = &adev->ip_blocks[j]; 2694 2695 if (block->version->type != ip_order[i] || 2696 !block->status.valid || 2697 block->status.hw) 2698 continue; 2699 2700 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 2701 r = block->version->funcs->resume(adev); 2702 else 2703 r = block->version->funcs->hw_init(adev); 2704 2705 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2706 if (r) 2707 return r; 2708 block->status.hw = true; 2709 } 2710 } 2711 2712 return 0; 2713 } 2714 2715 /** 2716 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 2717 * 2718 * @adev: amdgpu_device pointer 2719 * 2720 * First resume function for hardware IPs. The list of all the hardware 2721 * IPs that make up the asic is walked and the resume callbacks are run for 2722 * COMMON, GMC, and IH. resume puts the hardware into a functional state 2723 * after a suspend and updates the software state as necessary. This 2724 * function is also used for restoring the GPU after a GPU reset. 2725 * Returns 0 on success, negative error code on failure. 2726 */ 2727 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 2728 { 2729 int i, r; 2730 2731 for (i = 0; i < adev->num_ip_blocks; i++) { 2732 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2733 continue; 2734 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2735 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2736 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2737 2738 r = adev->ip_blocks[i].version->funcs->resume(adev); 2739 if (r) { 2740 DRM_ERROR("resume of IP block <%s> failed %d\n", 2741 adev->ip_blocks[i].version->funcs->name, r); 2742 return r; 2743 } 2744 adev->ip_blocks[i].status.hw = true; 2745 } 2746 } 2747 2748 return 0; 2749 } 2750 2751 /** 2752 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 2753 * 2754 * @adev: amdgpu_device pointer 2755 * 2756 * First resume function for hardware IPs. The list of all the hardware 2757 * IPs that make up the asic is walked and the resume callbacks are run for 2758 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 2759 * functional state after a suspend and updates the software state as 2760 * necessary. This function is also used for restoring the GPU after a GPU 2761 * reset. 2762 * Returns 0 on success, negative error code on failure. 2763 */ 2764 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 2765 { 2766 int i, r; 2767 2768 for (i = 0; i < adev->num_ip_blocks; i++) { 2769 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2770 continue; 2771 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2772 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2773 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 2774 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 2775 continue; 2776 r = adev->ip_blocks[i].version->funcs->resume(adev); 2777 if (r) { 2778 DRM_ERROR("resume of IP block <%s> failed %d\n", 2779 adev->ip_blocks[i].version->funcs->name, r); 2780 return r; 2781 } 2782 adev->ip_blocks[i].status.hw = true; 2783 } 2784 2785 return 0; 2786 } 2787 2788 /** 2789 * amdgpu_device_ip_resume - run resume for hardware IPs 2790 * 2791 * @adev: amdgpu_device pointer 2792 * 2793 * Main resume function for hardware IPs. 
The hardware IPs
2794 * are split into two resume functions because they are
2795 * also used in recovering from a GPU reset and some additional
2796 * steps need to be taken between them. In this case (S3/S4) they are
2797 * run sequentially.
2798 * Returns 0 on success, negative error code on failure.
2799 */
2800 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
2801 {
2802 int r;
2803
2804 r = amdgpu_device_ip_resume_phase1(adev);
2805 if (r)
2806 return r;
2807
2808 r = amdgpu_device_fw_loading(adev);
2809 if (r)
2810 return r;
2811
2812 r = amdgpu_device_ip_resume_phase2(adev);
2813
2814 return r;
2815 }
2816
2817 /**
2818 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2819 *
2820 * @adev: amdgpu_device pointer
2821 *
2822 * Query the VBIOS data tables to determine if the board supports SR-IOV.
2823 */
2824 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
2825 {
2826 if (amdgpu_sriov_vf(adev)) {
2827 if (adev->is_atom_fw) {
2828 if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2829 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2830 } else {
2831 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2832 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2833 }
2834
2835 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2836 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
2837 }
2838 }
2839
2840 /**
2841 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2842 *
2843 * @asic_type: AMD asic type
2844 *
2845 * Check if there is DC (new modesetting infrastructure) support for an asic.
2846 * Returns true if DC has support, false if not.
2847 */
2848 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
2849 {
2850 switch (asic_type) {
2851 #if defined(CONFIG_DRM_AMD_DC)
2852 #if defined(CONFIG_DRM_AMD_DC_SI)
2853 case CHIP_TAHITI:
2854 case CHIP_PITCAIRN:
2855 case CHIP_VERDE:
2856 case CHIP_OLAND:
2857 #endif
2858 case CHIP_BONAIRE:
2859 case CHIP_KAVERI:
2860 case CHIP_KABINI:
2861 case CHIP_MULLINS:
2862 /*
2863 * We have systems in the wild with these ASICs that require
2864 * LVDS and VGA support which is not supported with DC.
2865 *
2866 * Fallback to the non-DC driver here by default so as not to
2867 * cause regressions.
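 *
 * (Users can still opt in explicitly on these parts, e.g. with the
 * amdgpu.dc=1 module parameter, which is what the amdgpu_dc > 0 check
 * below honors.)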
2868 */ 2869 return amdgpu_dc > 0; 2870 case CHIP_HAWAII: 2871 case CHIP_CARRIZO: 2872 case CHIP_STONEY: 2873 case CHIP_POLARIS10: 2874 case CHIP_POLARIS11: 2875 case CHIP_POLARIS12: 2876 case CHIP_VEGAM: 2877 case CHIP_TONGA: 2878 case CHIP_FIJI: 2879 case CHIP_VEGA10: 2880 case CHIP_VEGA12: 2881 case CHIP_VEGA20: 2882 #if defined(CONFIG_DRM_AMD_DC_DCN) 2883 case CHIP_RAVEN: 2884 case CHIP_NAVI10: 2885 case CHIP_NAVI14: 2886 case CHIP_NAVI12: 2887 case CHIP_RENOIR: 2888 #endif 2889 #if defined(CONFIG_DRM_AMD_DC_DCN3_0) 2890 case CHIP_SIENNA_CICHLID: 2891 case CHIP_NAVY_FLOUNDER: 2892 #endif 2893 return amdgpu_dc != 0; 2894 #endif 2895 default: 2896 if (amdgpu_dc > 0) 2897 DRM_INFO("Display Core has been requested via kernel parameter " 2898 "but isn't supported by ASIC, ignoring\n"); 2899 return false; 2900 } 2901 } 2902 2903 /** 2904 * amdgpu_device_has_dc_support - check if dc is supported 2905 * 2906 * @adev: amdgpu_device_pointer 2907 * 2908 * Returns true for supported, false for not supported 2909 */ 2910 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 2911 { 2912 if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display) 2913 return false; 2914 2915 return amdgpu_device_asic_has_dc_support(adev->asic_type); 2916 } 2917 2918 2919 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 2920 { 2921 struct amdgpu_device *adev = 2922 container_of(__work, struct amdgpu_device, xgmi_reset_work); 2923 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2924 2925 /* It's a bug to not have a hive within this function */ 2926 if (WARN_ON(!hive)) 2927 return; 2928 2929 /* 2930 * Use task barrier to synchronize all xgmi reset works across the 2931 * hive. task_barrier_enter and task_barrier_exit will block 2932 * until all the threads running the xgmi reset works reach 2933 * those points. task_barrier_full will do both blocks. 2934 */ 2935 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 2936 2937 task_barrier_enter(&hive->tb); 2938 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 2939 2940 if (adev->asic_reset_res) 2941 goto fail; 2942 2943 task_barrier_exit(&hive->tb); 2944 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 2945 2946 if (adev->asic_reset_res) 2947 goto fail; 2948 2949 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count) 2950 adev->mmhub.funcs->reset_ras_error_count(adev); 2951 } else { 2952 2953 task_barrier_full(&hive->tb); 2954 adev->asic_reset_res = amdgpu_asic_reset(adev); 2955 } 2956 2957 fail: 2958 if (adev->asic_reset_res) 2959 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 2960 adev->asic_reset_res, adev_to_drm(adev)->unique); 2961 amdgpu_put_xgmi_hive(hive); 2962 } 2963 2964 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 2965 { 2966 char *input = amdgpu_lockup_timeout; 2967 char *timeout_setting = NULL; 2968 int index = 0; 2969 long timeout; 2970 int ret = 0; 2971 2972 /* 2973 * By default timeout for non compute jobs is 10000. 2974 * And there is no timeout enforced on compute jobs. 2975 * In SR-IOV or passthrough mode, timeout for compute 2976 * jobs are 60000 by default. 
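 *
 * These defaults can be overridden with the amdgpu.lockup_timeout module
 * parameter; with illustrative values, lockup_timeout=10000,60000,10000,10000
 * sets the gfx, compute, sdma and video timeouts (in ms) in that order, while
 * a single value is applied to all non-compute queues (and to compute as well
 * under SR-IOV/passthrough).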
2977 */ 2978 adev->gfx_timeout = msecs_to_jiffies(10000); 2979 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 2980 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 2981 adev->compute_timeout = msecs_to_jiffies(60000); 2982 else 2983 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; 2984 2985 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 2986 while ((timeout_setting = strsep(&input, ",")) && 2987 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 2988 ret = kstrtol(timeout_setting, 0, &timeout); 2989 if (ret) 2990 return ret; 2991 2992 if (timeout == 0) { 2993 index++; 2994 continue; 2995 } else if (timeout < 0) { 2996 timeout = MAX_SCHEDULE_TIMEOUT; 2997 } else { 2998 timeout = msecs_to_jiffies(timeout); 2999 } 3000 3001 switch (index++) { 3002 case 0: 3003 adev->gfx_timeout = timeout; 3004 break; 3005 case 1: 3006 adev->compute_timeout = timeout; 3007 break; 3008 case 2: 3009 adev->sdma_timeout = timeout; 3010 break; 3011 case 3: 3012 adev->video_timeout = timeout; 3013 break; 3014 default: 3015 break; 3016 } 3017 } 3018 /* 3019 * There is only one value specified and 3020 * it should apply to all non-compute jobs. 3021 */ 3022 if (index == 1) { 3023 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3024 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3025 adev->compute_timeout = adev->gfx_timeout; 3026 } 3027 } 3028 3029 return ret; 3030 } 3031 3032 static const struct attribute *amdgpu_dev_attributes[] = { 3033 &dev_attr_product_name.attr, 3034 &dev_attr_product_number.attr, 3035 &dev_attr_serial_number.attr, 3036 &dev_attr_pcie_replay_count.attr, 3037 NULL 3038 }; 3039 3040 3041 /** 3042 * amdgpu_device_init - initialize the driver 3043 * 3044 * @adev: amdgpu_device pointer 3045 * @flags: driver flags 3046 * 3047 * Initializes the driver info and hw (all asics). 3048 * Returns 0 for success or an error on failure. 3049 * Called at driver startup. 
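 *
 * Roughly: locks and register accessors are set up, the MMIO BAR is mapped,
 * the IP blocks run early_init, the doorbell BAR is initialized, the vBIOS
 * is posted and parsed for clocks, amdgpu_device_ip_init brings the IPs up,
 * late init enables clock/powergating, and the IB ring tests are queued as
 * delayed work.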
3050 */ 3051 int amdgpu_device_init(struct amdgpu_device *adev, 3052 uint32_t flags) 3053 { 3054 struct drm_device *ddev = adev_to_drm(adev); 3055 struct pci_dev *pdev = adev->pdev; 3056 int r, i; 3057 bool boco = false; 3058 u32 max_MBps; 3059 3060 adev->shutdown = false; 3061 adev->flags = flags; 3062 3063 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3064 adev->asic_type = amdgpu_force_asic_type; 3065 else 3066 adev->asic_type = flags & AMD_ASIC_MASK; 3067 3068 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3069 if (amdgpu_emu_mode == 1) 3070 adev->usec_timeout *= 10; 3071 adev->gmc.gart_size = 512 * 1024 * 1024; 3072 adev->accel_working = false; 3073 adev->num_rings = 0; 3074 adev->mman.buffer_funcs = NULL; 3075 adev->mman.buffer_funcs_ring = NULL; 3076 adev->vm_manager.vm_pte_funcs = NULL; 3077 adev->vm_manager.vm_pte_num_scheds = 0; 3078 adev->gmc.gmc_funcs = NULL; 3079 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3080 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3081 3082 adev->smc_rreg = &amdgpu_invalid_rreg; 3083 adev->smc_wreg = &amdgpu_invalid_wreg; 3084 adev->pcie_rreg = &amdgpu_invalid_rreg; 3085 adev->pcie_wreg = &amdgpu_invalid_wreg; 3086 adev->pciep_rreg = &amdgpu_invalid_rreg; 3087 adev->pciep_wreg = &amdgpu_invalid_wreg; 3088 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3089 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3090 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3091 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3092 adev->didt_rreg = &amdgpu_invalid_rreg; 3093 adev->didt_wreg = &amdgpu_invalid_wreg; 3094 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3095 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3096 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3097 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3098 3099 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3100 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3101 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3102 3103 /* mutex initialization are all done here so we 3104 * can recall function without having locking issues */ 3105 atomic_set(&adev->irq.ih.lock, 0); 3106 mutex_init(&adev->firmware.mutex); 3107 mutex_init(&adev->pm.mutex); 3108 mutex_init(&adev->gfx.gpu_clock_mutex); 3109 mutex_init(&adev->srbm_mutex); 3110 mutex_init(&adev->gfx.pipe_reserve_mutex); 3111 mutex_init(&adev->gfx.gfx_off_mutex); 3112 mutex_init(&adev->grbm_idx_mutex); 3113 mutex_init(&adev->mn_lock); 3114 mutex_init(&adev->virt.vf_errors.lock); 3115 hash_init(adev->mn_hash); 3116 atomic_set(&adev->in_gpu_reset, 0); 3117 init_rwsem(&adev->reset_sem); 3118 mutex_init(&adev->psp.mutex); 3119 mutex_init(&adev->notifier_lock); 3120 3121 r = amdgpu_device_check_arguments(adev); 3122 if (r) 3123 return r; 3124 3125 spin_lock_init(&adev->mmio_idx_lock); 3126 spin_lock_init(&adev->smc_idx_lock); 3127 spin_lock_init(&adev->pcie_idx_lock); 3128 spin_lock_init(&adev->uvd_ctx_idx_lock); 3129 spin_lock_init(&adev->didt_idx_lock); 3130 spin_lock_init(&adev->gc_cac_idx_lock); 3131 spin_lock_init(&adev->se_cac_idx_lock); 3132 spin_lock_init(&adev->audio_endpt_idx_lock); 3133 spin_lock_init(&adev->mm_stats.lock); 3134 3135 INIT_LIST_HEAD(&adev->shadow_list); 3136 mutex_init(&adev->shadow_list_lock); 3137 3138 INIT_DELAYED_WORK(&adev->delayed_init_work, 3139 amdgpu_device_delayed_init_work_handler); 3140 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3141 amdgpu_device_delay_enable_gfx_off); 3142 3143 
INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3144
3145 adev->gfx.gfx_off_req_count = 1;
3146 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3147
3148 atomic_set(&adev->throttling_logging_enabled, 1);
3149 /*
3150 * If throttling continues, logging will be performed every minute
3151 * to avoid log flooding. "-1" is subtracted since the thermal
3152 * throttling interrupt comes every second. Thus, the total logging
3153 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3154 * for throttling interrupt) = 60 seconds.
3155 */
3156 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3157 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3158
3159 /* Registers mapping */
3160 /* TODO: block userspace mapping of io register */
3161 if (adev->asic_type >= CHIP_BONAIRE) {
3162 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3163 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3164 } else {
3165 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3166 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3167 }
3168
3169 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3170 if (adev->rmmio == NULL) {
3171 return -ENOMEM;
3172 }
3173 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3174 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3175
3176 /* io port mapping */
3177 for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
3178 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
3179 adev->rio_mem_size = pci_resource_len(adev->pdev, i);
3180 adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
3181 break;
3182 }
3183 }
3184 if (adev->rio_mem == NULL)
3185 DRM_INFO("PCI I/O BAR is not found.\n");
3186
3187 /* enable PCIE atomic ops */
3188 r = pci_enable_atomic_ops_to_root(adev->pdev,
3189 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3190 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3191 if (r) {
3192 adev->have_atomics_support = false;
3193 DRM_INFO("PCIE atomic ops is not supported\n");
3194 } else {
3195 adev->have_atomics_support = true;
3196 }
3197
3198 amdgpu_device_get_pcie_info(adev);
3199
3200 if (amdgpu_mcbp)
3201 DRM_INFO("MCBP is enabled\n");
3202
3203 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3204 adev->enable_mes = true;
3205
3206 /* detect hw virtualization here */
3207 amdgpu_detect_virtualization(adev);
3208
3209 r = amdgpu_device_get_job_timeout_settings(adev);
3210 if (r) {
3211 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3212 return r;
3213 }
3214
3215 /* early init functions */
3216 r = amdgpu_device_ip_early_init(adev);
3217 if (r)
3218 return r;
3219
3220 /* doorbell bar mapping and doorbell index init */
3221 amdgpu_device_doorbell_init(adev);
3222
3223 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3224 /* this will fail for cards that aren't VGA class devices, just
3225 * ignore it */
3226 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3227
3228 if (amdgpu_device_supports_boco(ddev))
3229 boco = true;
3230 if (amdgpu_has_atpx() &&
3231 (amdgpu_is_atpx_hybrid() ||
3232 amdgpu_has_atpx_dgpu_power_cntl()) &&
3233 !pci_is_thunderbolt_attached(adev->pdev))
3234 vga_switcheroo_register_client(adev->pdev,
3235 &amdgpu_switcheroo_ops, boco);
3236 if (boco)
3237 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3238
3239 if (amdgpu_emu_mode == 1) {
3240 /* post the asic on emulation mode */
3241 emu_soc_asic_init(adev);
3242 goto fence_driver_init;
3243 }
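/* In emulation mode we jumped straight to fence_driver_init above; on real
 * hardware, continue with vBIOS/SR-IOV detection, an optional reset, GPU
 * posting and clock setup before the fence driver is initialized. */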
3244 3245 /* detect if we are with an SRIOV vbios */ 3246 amdgpu_device_detect_sriov_bios(adev); 3247 3248 /* check if we need to reset the asic 3249 * E.g., driver was not cleanly unloaded previously, etc. 3250 */ 3251 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3252 r = amdgpu_asic_reset(adev); 3253 if (r) { 3254 dev_err(adev->dev, "asic reset on init failed\n"); 3255 goto failed; 3256 } 3257 } 3258 3259 pci_enable_pcie_error_reporting(adev->ddev.pdev); 3260 3261 /* Post card if necessary */ 3262 if (amdgpu_device_need_post(adev)) { 3263 if (!adev->bios) { 3264 dev_err(adev->dev, "no vBIOS found\n"); 3265 r = -EINVAL; 3266 goto failed; 3267 } 3268 DRM_INFO("GPU posting now...\n"); 3269 r = amdgpu_device_asic_init(adev); 3270 if (r) { 3271 dev_err(adev->dev, "gpu post error!\n"); 3272 goto failed; 3273 } 3274 } 3275 3276 if (adev->is_atom_fw) { 3277 /* Initialize clocks */ 3278 r = amdgpu_atomfirmware_get_clock_info(adev); 3279 if (r) { 3280 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3281 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3282 goto failed; 3283 } 3284 } else { 3285 /* Initialize clocks */ 3286 r = amdgpu_atombios_get_clock_info(adev); 3287 if (r) { 3288 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3289 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3290 goto failed; 3291 } 3292 /* init i2c buses */ 3293 if (!amdgpu_device_has_dc_support(adev)) 3294 amdgpu_atombios_i2c_init(adev); 3295 } 3296 3297 fence_driver_init: 3298 /* Fence driver */ 3299 r = amdgpu_fence_driver_init(adev); 3300 if (r) { 3301 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n"); 3302 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3303 goto failed; 3304 } 3305 3306 /* init the mode config */ 3307 drm_mode_config_init(adev_to_drm(adev)); 3308 3309 r = amdgpu_device_ip_init(adev); 3310 if (r) { 3311 /* failed in exclusive mode due to timeout */ 3312 if (amdgpu_sriov_vf(adev) && 3313 !amdgpu_sriov_runtime(adev) && 3314 amdgpu_virt_mmio_blocked(adev) && 3315 !amdgpu_virt_wait_reset(adev)) { 3316 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3317 /* Don't send request since VF is inactive. */ 3318 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3319 adev->virt.ops = NULL; 3320 r = -EAGAIN; 3321 goto failed; 3322 } 3323 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3324 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3325 goto failed; 3326 } 3327 3328 dev_info(adev->dev, 3329 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3330 adev->gfx.config.max_shader_engines, 3331 adev->gfx.config.max_sh_per_se, 3332 adev->gfx.config.max_cu_per_sh, 3333 adev->gfx.cu_info.number); 3334 3335 adev->accel_working = true; 3336 3337 amdgpu_vm_check_compute_bug(adev); 3338 3339 /* Initialize the buffer migration limit. */ 3340 if (amdgpu_moverate >= 0) 3341 max_MBps = amdgpu_moverate; 3342 else 3343 max_MBps = 8; /* Allow 8 MB/s. */ 3344 /* Get a log2 for easy divisions. 
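 * (e.g. the default of 8 MB/s gives log2_max_MBps = 3, letting the
 * throttling code shift instead of divide).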
*/
3345 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3346
3347 amdgpu_fbdev_init(adev);
3348
3349 r = amdgpu_pm_sysfs_init(adev);
3350 if (r) {
3351 adev->pm_sysfs_en = false;
3352 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3353 } else
3354 adev->pm_sysfs_en = true;
3355
3356 r = amdgpu_ucode_sysfs_init(adev);
3357 if (r) {
3358 adev->ucode_sysfs_en = false;
3359 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3360 } else
3361 adev->ucode_sysfs_en = true;
3362
3363 if ((amdgpu_testing & 1)) {
3364 if (adev->accel_working)
3365 amdgpu_test_moves(adev);
3366 else
3367 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3368 }
3369 if (amdgpu_benchmarking) {
3370 if (adev->accel_working)
3371 amdgpu_benchmark(adev, amdgpu_benchmarking);
3372 else
3373 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3374 }
3375
3376 /*
3377 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3378 * Otherwise the mgpu fan boost feature will be skipped because the
3379 * gpu instance count would be too low.
3380 */
3381 amdgpu_register_gpu_instance(adev);
3382
3383 /* enable clockgating, etc. after ib tests, etc. since some blocks require
3384 * explicit gating rather than handling it automatically.
3385 */
3386 r = amdgpu_device_ip_late_init(adev);
3387 if (r) {
3388 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3389 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3390 goto failed;
3391 }
3392
3393 /* must succeed. */
3394 amdgpu_ras_resume(adev);
3395
3396 queue_delayed_work(system_wq, &adev->delayed_init_work,
3397 msecs_to_jiffies(AMDGPU_RESUME_MS));
3398
3399 if (amdgpu_sriov_vf(adev))
3400 flush_delayed_work(&adev->delayed_init_work);
3401
3402 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3403 if (r) {
3404 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3405 return r;
3406 }
3407
3408 if (IS_ENABLED(CONFIG_PERF_EVENTS))
3409 r = amdgpu_pmu_init(adev);
3410 if (r)
3411 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3412
3413 /* Have stored pci confspace at hand for restore in sudden PCI error */
3414 if (amdgpu_device_cache_pci_state(adev->pdev))
3415 pci_restore_state(pdev);
3416
3417 return 0;
3418
3419 failed:
3420 amdgpu_vf_error_trans_all(adev);
3421 if (boco)
3422 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3423
3424 return r;
3425 }
3426
3427 /**
3428 * amdgpu_device_fini - tear down the driver
3429 *
3430 * @adev: amdgpu_device pointer
3431 *
3432 * Tear down the driver info (all asics).
3433 * Called at driver shutdown.
3434 */ 3435 void amdgpu_device_fini(struct amdgpu_device *adev) 3436 { 3437 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3438 flush_delayed_work(&adev->delayed_init_work); 3439 adev->shutdown = true; 3440 3441 kfree(adev->pci_state); 3442 3443 /* make sure IB test finished before entering exclusive mode 3444 * to avoid preemption on IB test 3445 * */ 3446 if (amdgpu_sriov_vf(adev)) 3447 amdgpu_virt_request_full_gpu(adev, false); 3448 3449 /* disable all interrupts */ 3450 amdgpu_irq_disable_all(adev); 3451 if (adev->mode_info.mode_config_initialized){ 3452 if (!amdgpu_device_has_dc_support(adev)) 3453 drm_helper_force_disable_all(adev_to_drm(adev)); 3454 else 3455 drm_atomic_helper_shutdown(adev_to_drm(adev)); 3456 } 3457 amdgpu_fence_driver_fini(adev); 3458 if (adev->pm_sysfs_en) 3459 amdgpu_pm_sysfs_fini(adev); 3460 amdgpu_fbdev_fini(adev); 3461 amdgpu_device_ip_fini(adev); 3462 release_firmware(adev->firmware.gpu_info_fw); 3463 adev->firmware.gpu_info_fw = NULL; 3464 adev->accel_working = false; 3465 /* free i2c buses */ 3466 if (!amdgpu_device_has_dc_support(adev)) 3467 amdgpu_i2c_fini(adev); 3468 3469 if (amdgpu_emu_mode != 1) 3470 amdgpu_atombios_fini(adev); 3471 3472 kfree(adev->bios); 3473 adev->bios = NULL; 3474 if (amdgpu_has_atpx() && 3475 (amdgpu_is_atpx_hybrid() || 3476 amdgpu_has_atpx_dgpu_power_cntl()) && 3477 !pci_is_thunderbolt_attached(adev->pdev)) 3478 vga_switcheroo_unregister_client(adev->pdev); 3479 if (amdgpu_device_supports_boco(adev_to_drm(adev))) 3480 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3481 vga_client_register(adev->pdev, NULL, NULL, NULL); 3482 if (adev->rio_mem) 3483 pci_iounmap(adev->pdev, adev->rio_mem); 3484 adev->rio_mem = NULL; 3485 iounmap(adev->rmmio); 3486 adev->rmmio = NULL; 3487 amdgpu_device_doorbell_fini(adev); 3488 3489 if (adev->ucode_sysfs_en) 3490 amdgpu_ucode_sysfs_fini(adev); 3491 3492 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 3493 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3494 amdgpu_pmu_fini(adev); 3495 if (adev->mman.discovery_bin) 3496 amdgpu_discovery_fini(adev); 3497 } 3498 3499 3500 /* 3501 * Suspend & resume. 3502 */ 3503 /** 3504 * amdgpu_device_suspend - initiate device suspend 3505 * 3506 * @dev: drm dev pointer 3507 * @fbcon : notify the fbdev of suspend 3508 * 3509 * Puts the hw in the suspend state (all asics). 3510 * Returns 0 for success or an error on failure. 3511 * Called at driver suspend. 
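 *
 * This is typically reached from the driver's system and runtime PM
 * callbacks (for example the amdgpu_pmops_* handlers in amdgpu_drv.c).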
3512 */ 3513 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 3514 { 3515 struct amdgpu_device *adev; 3516 struct drm_crtc *crtc; 3517 struct drm_connector *connector; 3518 struct drm_connector_list_iter iter; 3519 int r; 3520 3521 adev = drm_to_adev(dev); 3522 3523 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3524 return 0; 3525 3526 adev->in_suspend = true; 3527 drm_kms_helper_poll_disable(dev); 3528 3529 if (fbcon) 3530 amdgpu_fbdev_set_suspend(adev, 1); 3531 3532 cancel_delayed_work_sync(&adev->delayed_init_work); 3533 3534 if (!amdgpu_device_has_dc_support(adev)) { 3535 /* turn off display hw */ 3536 drm_modeset_lock_all(dev); 3537 drm_connector_list_iter_begin(dev, &iter); 3538 drm_for_each_connector_iter(connector, &iter) 3539 drm_helper_connector_dpms(connector, 3540 DRM_MODE_DPMS_OFF); 3541 drm_connector_list_iter_end(&iter); 3542 drm_modeset_unlock_all(dev); 3543 /* unpin the front buffers and cursors */ 3544 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3545 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3546 struct drm_framebuffer *fb = crtc->primary->fb; 3547 struct amdgpu_bo *robj; 3548 3549 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3550 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3551 r = amdgpu_bo_reserve(aobj, true); 3552 if (r == 0) { 3553 amdgpu_bo_unpin(aobj); 3554 amdgpu_bo_unreserve(aobj); 3555 } 3556 } 3557 3558 if (fb == NULL || fb->obj[0] == NULL) { 3559 continue; 3560 } 3561 robj = gem_to_amdgpu_bo(fb->obj[0]); 3562 /* don't unpin kernel fb objects */ 3563 if (!amdgpu_fbdev_robj_is_fb(adev, robj)) { 3564 r = amdgpu_bo_reserve(robj, true); 3565 if (r == 0) { 3566 amdgpu_bo_unpin(robj); 3567 amdgpu_bo_unreserve(robj); 3568 } 3569 } 3570 } 3571 } 3572 3573 amdgpu_ras_suspend(adev); 3574 3575 r = amdgpu_device_ip_suspend_phase1(adev); 3576 3577 amdgpu_amdkfd_suspend(adev, !fbcon); 3578 3579 /* evict vram memory */ 3580 amdgpu_bo_evict_vram(adev); 3581 3582 amdgpu_fence_driver_suspend(adev); 3583 3584 r = amdgpu_device_ip_suspend_phase2(adev); 3585 3586 /* evict remaining vram memory 3587 * This second call to evict vram is to evict the gart page table 3588 * using the CPU. 3589 */ 3590 amdgpu_bo_evict_vram(adev); 3591 3592 return 0; 3593 } 3594 3595 /** 3596 * amdgpu_device_resume - initiate device resume 3597 * 3598 * @dev: drm dev pointer 3599 * @fbcon : notify the fbdev of resume 3600 * 3601 * Bring the hw back to operating state (all asics). 3602 * Returns 0 for success or an error on failure. 3603 * Called at driver resume. 
3604 */ 3605 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 3606 { 3607 struct drm_connector *connector; 3608 struct drm_connector_list_iter iter; 3609 struct amdgpu_device *adev = drm_to_adev(dev); 3610 struct drm_crtc *crtc; 3611 int r = 0; 3612 3613 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3614 return 0; 3615 3616 /* post card */ 3617 if (amdgpu_device_need_post(adev)) { 3618 r = amdgpu_device_asic_init(adev); 3619 if (r) 3620 dev_err(adev->dev, "amdgpu asic init failed\n"); 3621 } 3622 3623 r = amdgpu_device_ip_resume(adev); 3624 if (r) { 3625 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 3626 return r; 3627 } 3628 amdgpu_fence_driver_resume(adev); 3629 3630 3631 r = amdgpu_device_ip_late_init(adev); 3632 if (r) 3633 return r; 3634 3635 queue_delayed_work(system_wq, &adev->delayed_init_work, 3636 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3637 3638 if (!amdgpu_device_has_dc_support(adev)) { 3639 /* pin cursors */ 3640 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3641 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3642 3643 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3644 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3645 r = amdgpu_bo_reserve(aobj, true); 3646 if (r == 0) { 3647 r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM); 3648 if (r != 0) 3649 dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r); 3650 amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj); 3651 amdgpu_bo_unreserve(aobj); 3652 } 3653 } 3654 } 3655 } 3656 r = amdgpu_amdkfd_resume(adev, !fbcon); 3657 if (r) 3658 return r; 3659 3660 /* Make sure IB tests flushed */ 3661 flush_delayed_work(&adev->delayed_init_work); 3662 3663 /* blat the mode back in */ 3664 if (fbcon) { 3665 if (!amdgpu_device_has_dc_support(adev)) { 3666 /* pre DCE11 */ 3667 drm_helper_resume_force_mode(dev); 3668 3669 /* turn on display hw */ 3670 drm_modeset_lock_all(dev); 3671 3672 drm_connector_list_iter_begin(dev, &iter); 3673 drm_for_each_connector_iter(connector, &iter) 3674 drm_helper_connector_dpms(connector, 3675 DRM_MODE_DPMS_ON); 3676 drm_connector_list_iter_end(&iter); 3677 3678 drm_modeset_unlock_all(dev); 3679 } 3680 amdgpu_fbdev_set_suspend(adev, 0); 3681 } 3682 3683 drm_kms_helper_poll_enable(dev); 3684 3685 amdgpu_ras_resume(adev); 3686 3687 /* 3688 * Most of the connector probing functions try to acquire runtime pm 3689 * refs to ensure that the GPU is powered on when connector polling is 3690 * performed. Since we're calling this from a runtime PM callback, 3691 * trying to acquire rpm refs will cause us to deadlock. 3692 * 3693 * Since we're guaranteed to be holding the rpm lock, it's safe to 3694 * temporarily disable the rpm helpers so this doesn't deadlock us. 3695 */ 3696 #ifdef CONFIG_PM 3697 dev->dev->power.disable_depth++; 3698 #endif 3699 if (!amdgpu_device_has_dc_support(adev)) 3700 drm_helper_hpd_irq_event(dev); 3701 else 3702 drm_kms_helper_hotplug_event(dev); 3703 #ifdef CONFIG_PM 3704 dev->dev->power.disable_depth--; 3705 #endif 3706 adev->in_suspend = false; 3707 3708 return 0; 3709 } 3710 3711 /** 3712 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 3713 * 3714 * @adev: amdgpu_device pointer 3715 * 3716 * The list of all the hardware IPs that make up the asic is walked and 3717 * the check_soft_reset callbacks are run. check_soft_reset determines 3718 * if the asic is still hung or not. 3719 * Returns true if any of the IPs are still in a hung state, false if not. 
3720 */ 3721 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 3722 { 3723 int i; 3724 bool asic_hang = false; 3725 3726 if (amdgpu_sriov_vf(adev)) 3727 return true; 3728 3729 if (amdgpu_asic_need_full_reset(adev)) 3730 return true; 3731 3732 for (i = 0; i < adev->num_ip_blocks; i++) { 3733 if (!adev->ip_blocks[i].status.valid) 3734 continue; 3735 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 3736 adev->ip_blocks[i].status.hang = 3737 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 3738 if (adev->ip_blocks[i].status.hang) { 3739 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 3740 asic_hang = true; 3741 } 3742 } 3743 return asic_hang; 3744 } 3745 3746 /** 3747 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 3748 * 3749 * @adev: amdgpu_device pointer 3750 * 3751 * The list of all the hardware IPs that make up the asic is walked and the 3752 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 3753 * handles any IP specific hardware or software state changes that are 3754 * necessary for a soft reset to succeed. 3755 * Returns 0 on success, negative error code on failure. 3756 */ 3757 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 3758 { 3759 int i, r = 0; 3760 3761 for (i = 0; i < adev->num_ip_blocks; i++) { 3762 if (!adev->ip_blocks[i].status.valid) 3763 continue; 3764 if (adev->ip_blocks[i].status.hang && 3765 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 3766 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 3767 if (r) 3768 return r; 3769 } 3770 } 3771 3772 return 0; 3773 } 3774 3775 /** 3776 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 3777 * 3778 * @adev: amdgpu_device pointer 3779 * 3780 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 3781 * reset is necessary to recover. 3782 * Returns true if a full asic reset is required, false if not. 3783 */ 3784 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 3785 { 3786 int i; 3787 3788 if (amdgpu_asic_need_full_reset(adev)) 3789 return true; 3790 3791 for (i = 0; i < adev->num_ip_blocks; i++) { 3792 if (!adev->ip_blocks[i].status.valid) 3793 continue; 3794 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 3795 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 3796 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 3797 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 3798 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3799 if (adev->ip_blocks[i].status.hang) { 3800 dev_info(adev->dev, "Some block need full reset!\n"); 3801 return true; 3802 } 3803 } 3804 } 3805 return false; 3806 } 3807 3808 /** 3809 * amdgpu_device_ip_soft_reset - do a soft reset 3810 * 3811 * @adev: amdgpu_device pointer 3812 * 3813 * The list of all the hardware IPs that make up the asic is walked and the 3814 * soft_reset callbacks are run if the block is hung. soft_reset handles any 3815 * IP specific hardware or software state changes that are necessary to soft 3816 * reset the IP. 3817 * Returns 0 on success, negative error code on failure. 
3818 */ 3819 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 3820 { 3821 int i, r = 0; 3822 3823 for (i = 0; i < adev->num_ip_blocks; i++) { 3824 if (!adev->ip_blocks[i].status.valid) 3825 continue; 3826 if (adev->ip_blocks[i].status.hang && 3827 adev->ip_blocks[i].version->funcs->soft_reset) { 3828 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 3829 if (r) 3830 return r; 3831 } 3832 } 3833 3834 return 0; 3835 } 3836 3837 /** 3838 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 3839 * 3840 * @adev: amdgpu_device pointer 3841 * 3842 * The list of all the hardware IPs that make up the asic is walked and the 3843 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 3844 * handles any IP specific hardware or software state changes that are 3845 * necessary after the IP has been soft reset. 3846 * Returns 0 on success, negative error code on failure. 3847 */ 3848 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 3849 { 3850 int i, r = 0; 3851 3852 for (i = 0; i < adev->num_ip_blocks; i++) { 3853 if (!adev->ip_blocks[i].status.valid) 3854 continue; 3855 if (adev->ip_blocks[i].status.hang && 3856 adev->ip_blocks[i].version->funcs->post_soft_reset) 3857 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 3858 if (r) 3859 return r; 3860 } 3861 3862 return 0; 3863 } 3864 3865 /** 3866 * amdgpu_device_recover_vram - Recover some VRAM contents 3867 * 3868 * @adev: amdgpu_device pointer 3869 * 3870 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 3871 * restore things like GPUVM page tables after a GPU reset where 3872 * the contents of VRAM might be lost. 3873 * 3874 * Returns: 3875 * 0 on success, negative error code on failure. 3876 */ 3877 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 3878 { 3879 struct dma_fence *fence = NULL, *next = NULL; 3880 struct amdgpu_bo *shadow; 3881 long r = 1, tmo; 3882 3883 if (amdgpu_sriov_runtime(adev)) 3884 tmo = msecs_to_jiffies(8000); 3885 else 3886 tmo = msecs_to_jiffies(100); 3887 3888 dev_info(adev->dev, "recover vram bo from shadow start\n"); 3889 mutex_lock(&adev->shadow_list_lock); 3890 list_for_each_entry(shadow, &adev->shadow_list, shadow_list) { 3891 3892 /* No need to recover an evicted BO */ 3893 if (shadow->tbo.mem.mem_type != TTM_PL_TT || 3894 shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET || 3895 shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM) 3896 continue; 3897 3898 r = amdgpu_bo_restore_shadow(shadow, &next); 3899 if (r) 3900 break; 3901 3902 if (fence) { 3903 tmo = dma_fence_wait_timeout(fence, false, tmo); 3904 dma_fence_put(fence); 3905 fence = next; 3906 if (tmo == 0) { 3907 r = -ETIMEDOUT; 3908 break; 3909 } else if (tmo < 0) { 3910 r = tmo; 3911 break; 3912 } 3913 } else { 3914 fence = next; 3915 } 3916 } 3917 mutex_unlock(&adev->shadow_list_lock); 3918 3919 if (fence) 3920 tmo = dma_fence_wait_timeout(fence, false, tmo); 3921 dma_fence_put(fence); 3922 3923 if (r < 0 || tmo <= 0) { 3924 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 3925 return -EIO; 3926 } 3927 3928 dev_info(adev->dev, "recover vram bo from shadow done\n"); 3929 return 0; 3930 } 3931 3932 3933 /** 3934 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 3935 * 3936 * @adev: amdgpu device pointer 3937 * @from_hypervisor: request from hypervisor 3938 * 3939 * do VF FLR and reinitialize Asic 3940 * return 0 means succeeded otherwise failed 3941 */ 3942 static int 
amdgpu_device_reset_sriov(struct amdgpu_device *adev, 3943 bool from_hypervisor) 3944 { 3945 int r; 3946 3947 if (from_hypervisor) 3948 r = amdgpu_virt_request_full_gpu(adev, true); 3949 else 3950 r = amdgpu_virt_reset_gpu(adev); 3951 if (r) 3952 return r; 3953 3954 amdgpu_amdkfd_pre_reset(adev); 3955 3956 /* Resume IP prior to SMC */ 3957 r = amdgpu_device_ip_reinit_early_sriov(adev); 3958 if (r) 3959 goto error; 3960 3961 amdgpu_virt_init_data_exchange(adev); 3962 /* we need recover gart prior to run SMC/CP/SDMA resume */ 3963 amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT)); 3964 3965 r = amdgpu_device_fw_loading(adev); 3966 if (r) 3967 return r; 3968 3969 /* now we are okay to resume SMC/CP/SDMA */ 3970 r = amdgpu_device_ip_reinit_late_sriov(adev); 3971 if (r) 3972 goto error; 3973 3974 amdgpu_irq_gpu_reset_resume_helper(adev); 3975 r = amdgpu_ib_ring_tests(adev); 3976 amdgpu_amdkfd_post_reset(adev); 3977 3978 error: 3979 amdgpu_virt_release_full_gpu(adev, true); 3980 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 3981 amdgpu_inc_vram_lost(adev); 3982 r = amdgpu_device_recover_vram(adev); 3983 } 3984 3985 return r; 3986 } 3987 3988 /** 3989 * amdgpu_device_has_job_running - check if there is any job in mirror list 3990 * 3991 * @adev: amdgpu device pointer 3992 * 3993 * check if there is any job in mirror list 3994 */ 3995 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 3996 { 3997 int i; 3998 struct drm_sched_job *job; 3999 4000 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4001 struct amdgpu_ring *ring = adev->rings[i]; 4002 4003 if (!ring || !ring->sched.thread) 4004 continue; 4005 4006 spin_lock(&ring->sched.job_list_lock); 4007 job = list_first_entry_or_null(&ring->sched.ring_mirror_list, 4008 struct drm_sched_job, node); 4009 spin_unlock(&ring->sched.job_list_lock); 4010 if (job) 4011 return true; 4012 } 4013 return false; 4014 } 4015 4016 /** 4017 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4018 * 4019 * @adev: amdgpu device pointer 4020 * 4021 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4022 * a hung GPU. 
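 *
 * Returns true if recovery should be attempted, false if it is disabled or
 * no hardware hang was detected.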
4023 */ 4024 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4025 { 4026 if (!amdgpu_device_ip_check_soft_reset(adev)) { 4027 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n"); 4028 return false; 4029 } 4030 4031 if (amdgpu_gpu_recovery == 0) 4032 goto disabled; 4033 4034 if (amdgpu_sriov_vf(adev)) 4035 return true; 4036 4037 if (amdgpu_gpu_recovery == -1) { 4038 switch (adev->asic_type) { 4039 case CHIP_BONAIRE: 4040 case CHIP_HAWAII: 4041 case CHIP_TOPAZ: 4042 case CHIP_TONGA: 4043 case CHIP_FIJI: 4044 case CHIP_POLARIS10: 4045 case CHIP_POLARIS11: 4046 case CHIP_POLARIS12: 4047 case CHIP_VEGAM: 4048 case CHIP_VEGA20: 4049 case CHIP_VEGA10: 4050 case CHIP_VEGA12: 4051 case CHIP_RAVEN: 4052 case CHIP_ARCTURUS: 4053 case CHIP_RENOIR: 4054 case CHIP_NAVI10: 4055 case CHIP_NAVI14: 4056 case CHIP_NAVI12: 4057 case CHIP_SIENNA_CICHLID: 4058 break; 4059 default: 4060 goto disabled; 4061 } 4062 } 4063 4064 return true; 4065 4066 disabled: 4067 dev_info(adev->dev, "GPU recovery disabled.\n"); 4068 return false; 4069 } 4070 4071 4072 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4073 struct amdgpu_job *job, 4074 bool *need_full_reset_arg) 4075 { 4076 int i, r = 0; 4077 bool need_full_reset = *need_full_reset_arg; 4078 4079 amdgpu_debugfs_wait_dump(adev); 4080 4081 /* block all schedulers and reset given job's ring */ 4082 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4083 struct amdgpu_ring *ring = adev->rings[i]; 4084 4085 if (!ring || !ring->sched.thread) 4086 continue; 4087 4088 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4089 amdgpu_fence_driver_force_completion(ring); 4090 } 4091 4092 if(job) 4093 drm_sched_increase_karma(&job->base); 4094 4095 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4096 if (!amdgpu_sriov_vf(adev)) { 4097 4098 if (!need_full_reset) 4099 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4100 4101 if (!need_full_reset) { 4102 amdgpu_device_ip_pre_soft_reset(adev); 4103 r = amdgpu_device_ip_soft_reset(adev); 4104 amdgpu_device_ip_post_soft_reset(adev); 4105 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4106 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4107 need_full_reset = true; 4108 } 4109 } 4110 4111 if (need_full_reset) 4112 r = amdgpu_device_ip_suspend(adev); 4113 4114 *need_full_reset_arg = need_full_reset; 4115 } 4116 4117 return r; 4118 } 4119 4120 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, 4121 struct list_head *device_list_handle, 4122 bool *need_full_reset_arg, 4123 bool skip_hw_reset) 4124 { 4125 struct amdgpu_device *tmp_adev = NULL; 4126 bool need_full_reset = *need_full_reset_arg, vram_lost = false; 4127 int r = 0; 4128 4129 /* 4130 * ASIC reset has to be done on all HGMI hive nodes ASAP 4131 * to allow proper links negotiation in FW (within 1 sec) 4132 */ 4133 if (!skip_hw_reset && need_full_reset) { 4134 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4135 /* For XGMI run all resets in parallel to speed up the process */ 4136 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4137 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4138 r = -EALREADY; 4139 } else 4140 r = amdgpu_asic_reset(tmp_adev); 4141 4142 if (r) { 4143 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4144 r, adev_to_drm(tmp_adev)->unique); 4145 break; 4146 } 4147 } 4148 4149 /* For XGMI wait for all resets to complete before proceed */ 4150 if (!r) { 4151 
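			/*
			 * Each node queued its reset on system_unbound_wq above;
			 * flush those works and pick up any failure recorded in
			 * asic_reset_res.
			 */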
			list_for_each_entry(tmp_adev, device_list_handle,
					    gmc.xgmi.head) {
				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
					flush_work(&tmp_adev->xgmi_reset_work);
					r = tmp_adev->asic_reset_res;
					if (r)
						break;
				}
			}
		}
	}

	if (!r && amdgpu_ras_intr_triggered()) {
		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
			if (tmp_adev->mmhub.funcs &&
			    tmp_adev->mmhub.funcs->reset_ras_error_count)
				tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
		}

		amdgpu_ras_intr_cleared();
	}

	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		if (need_full_reset) {
			/* post card */
			if (amdgpu_device_asic_init(tmp_adev))
				dev_warn(tmp_adev->dev, "asic atom init failed!");

			if (!r) {
				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
				r = amdgpu_device_ip_resume_phase1(tmp_adev);
				if (r)
					goto out;

				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
				if (vram_lost) {
					DRM_INFO("VRAM is lost due to GPU reset!\n");
					amdgpu_inc_vram_lost(tmp_adev);
				}

				r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
				if (r)
					goto out;

				r = amdgpu_device_fw_loading(tmp_adev);
				if (r)
					return r;

				r = amdgpu_device_ip_resume_phase2(tmp_adev);
				if (r)
					goto out;

				if (vram_lost)
					amdgpu_device_fill_reset_magic(tmp_adev);

				/*
				 * Add this ASIC back to the tracked list now
				 * that the reset has completed successfully.
				 */
				amdgpu_register_gpu_instance(tmp_adev);

				r = amdgpu_device_ip_late_init(tmp_adev);
				if (r)
					goto out;

				amdgpu_fbdev_set_suspend(tmp_adev, 0);

				/*
				 * The GPU enters a bad state once the number of
				 * faulty pages retired by ECC reaches the
				 * threshold, and RAS recovery is scheduled next.
				 * So check here to break out of recovery if the
				 * bad page threshold has indeed been exceeded,
				 * and remind the user to either retire this GPU
				 * or set a bigger bad_page_threshold value the
				 * next time the driver is probed.
				 */
				if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
					/* must succeed.
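					 * amdgpu_ras_resume() below restores the RAS
					 * state that amdgpu_ras_suspend() disabled
					 * before the reset.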
*/ 4230 amdgpu_ras_resume(tmp_adev); 4231 } else { 4232 r = -EINVAL; 4233 goto out; 4234 } 4235 4236 /* Update PSP FW topology after reset */ 4237 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4238 r = amdgpu_xgmi_update_topology(hive, tmp_adev); 4239 } 4240 } 4241 4242 out: 4243 if (!r) { 4244 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 4245 r = amdgpu_ib_ring_tests(tmp_adev); 4246 if (r) { 4247 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 4248 r = amdgpu_device_ip_suspend(tmp_adev); 4249 need_full_reset = true; 4250 r = -EAGAIN; 4251 goto end; 4252 } 4253 } 4254 4255 if (!r) 4256 r = amdgpu_device_recover_vram(tmp_adev); 4257 else 4258 tmp_adev->asic_reset_res = r; 4259 } 4260 4261 end: 4262 *need_full_reset_arg = need_full_reset; 4263 return r; 4264 } 4265 4266 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, 4267 struct amdgpu_hive_info *hive) 4268 { 4269 if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0) 4270 return false; 4271 4272 if (hive) { 4273 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock); 4274 } else { 4275 down_write(&adev->reset_sem); 4276 } 4277 4278 atomic_inc(&adev->gpu_reset_counter); 4279 switch (amdgpu_asic_reset_method(adev)) { 4280 case AMD_RESET_METHOD_MODE1: 4281 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 4282 break; 4283 case AMD_RESET_METHOD_MODE2: 4284 adev->mp1_state = PP_MP1_STATE_RESET; 4285 break; 4286 default: 4287 adev->mp1_state = PP_MP1_STATE_NONE; 4288 break; 4289 } 4290 4291 return true; 4292 } 4293 4294 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) 4295 { 4296 amdgpu_vf_error_trans_all(adev); 4297 adev->mp1_state = PP_MP1_STATE_NONE; 4298 atomic_set(&adev->in_gpu_reset, 0); 4299 up_write(&adev->reset_sem); 4300 } 4301 4302 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 4303 { 4304 struct pci_dev *p = NULL; 4305 4306 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4307 adev->pdev->bus->number, 1); 4308 if (p) { 4309 pm_runtime_enable(&(p->dev)); 4310 pm_runtime_resume(&(p->dev)); 4311 } 4312 } 4313 4314 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 4315 { 4316 enum amd_reset_method reset_method; 4317 struct pci_dev *p = NULL; 4318 u64 expires; 4319 4320 /* 4321 * For now, only BACO and mode1 reset are confirmed 4322 * to suffer the audio issue without proper suspended. 4323 */ 4324 reset_method = amdgpu_asic_reset_method(adev); 4325 if ((reset_method != AMD_RESET_METHOD_BACO) && 4326 (reset_method != AMD_RESET_METHOD_MODE1)) 4327 return -EINVAL; 4328 4329 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4330 adev->pdev->bus->number, 1); 4331 if (!p) 4332 return -ENODEV; 4333 4334 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 4335 if (!expires) 4336 /* 4337 * If we cannot get the audio device autosuspend delay, 4338 * a fixed 4S interval will be used. Considering 3S is 4339 * the audio controller default autosuspend delay setting. 4340 * 4S used here is guaranteed to cover that. 4341 */ 4342 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 4343 4344 while (!pm_runtime_status_suspended(&(p->dev))) { 4345 if (!pm_runtime_suspend(&(p->dev))) 4346 break; 4347 4348 if (expires < ktime_get_mono_fast_ns()) { 4349 dev_warn(adev->dev, "failed to suspend display audio\n"); 4350 /* TODO: abort the succeeding gpu reset? 
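	 * Note: the caller (amdgpu_device_gpu_recover()) only uses this
	 * error to decide whether display audio needs to be resumed
	 * afterwards; the GPU reset itself still proceeds.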
			 */
			return -ETIMEDOUT;
		}
	}

	pm_runtime_disable(&(p->dev));

	return 0;
}

/**
 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
 *
 * @adev: amdgpu device pointer
 * @job: which job triggered the hang
 *
 * Attempt to reset the GPU if it has hung (all asics).
 * Attempt a soft reset or a full reset and reinitialize the ASIC.
 * Returns 0 for success or an error on failure.
 */

int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
			      struct amdgpu_job *job)
{
	struct list_head device_list, *device_list_handle =  NULL;
	bool need_full_reset = false;
	bool job_signaled = false;
	struct amdgpu_hive_info *hive = NULL;
	struct amdgpu_device *tmp_adev = NULL;
	int i, r = 0;
	bool need_emergency_restart = false;
	bool audio_suspended = false;

	/*
	 * Special case: RAS triggered and full reset isn't supported
	 */
	need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);

	/*
	 * Flush RAM to disk so that after reboot
	 * the user can read the log and see why the system rebooted.
	 */
	if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
		DRM_WARN("Emergency reboot.");

		ksys_sync_helper();
		emergency_restart();
	}

	dev_info(adev->dev, "GPU %s begin!\n",
		need_emergency_restart ? "jobs stop":"reset");

	/*
	 * Here we trylock to avoid a chain of resets executing from either
	 * jobs triggered on different adevs in an XGMI hive or jobs on
	 * different schedulers for the same device while this timeout
	 * handler is running.
	 * We always reset all schedulers for a device and all devices in an
	 * XGMI hive, so that should take care of them too.
	 */
	hive = amdgpu_get_xgmi_hive(adev);
	if (hive) {
		if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
			DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
				job ? job->base.id : -1, hive->hive_id);
			amdgpu_put_xgmi_hive(hive);
			return 0;
		}
		mutex_lock(&hive->hive_lock);
	}

	/*
	 * Build the list of devices to reset.
	 * In case we are in XGMI hive mode, resort the device list
	 * to put adev in the 1st position.
	 */
	INIT_LIST_HEAD(&device_list);
	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (!hive)
			return -ENODEV;
		if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
			list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
		device_list_handle = &hive->device_list;
	} else {
		list_add_tail(&adev->gmc.xgmi.head, &device_list);
		device_list_handle = &device_list;
	}

	/* block all schedulers and reset given job's ring */
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
			dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
				  job ? job->base.id : -1);
			r = 0;
			goto skip_recovery;
		}

		/*
		 * Try to put the audio codec into the suspend state
		 * before the gpu reset starts.
		 *
		 * The power domain of the graphics device is shared with
		 * the AZ (audio) power domain. Without this, we may change
		 * the audio hardware from behind the audio driver's back
		 * and trigger audio codec errors.
4455 */ 4456 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 4457 audio_suspended = true; 4458 4459 amdgpu_ras_set_error_query_ready(tmp_adev, false); 4460 4461 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 4462 4463 if (!amdgpu_sriov_vf(tmp_adev)) 4464 amdgpu_amdkfd_pre_reset(tmp_adev); 4465 4466 /* 4467 * Mark these ASICs to be reseted as untracked first 4468 * And add them back after reset completed 4469 */ 4470 amdgpu_unregister_gpu_instance(tmp_adev); 4471 4472 amdgpu_fbdev_set_suspend(tmp_adev, 1); 4473 4474 /* disable ras on ALL IPs */ 4475 if (!need_emergency_restart && 4476 amdgpu_device_ip_need_full_reset(tmp_adev)) 4477 amdgpu_ras_suspend(tmp_adev); 4478 4479 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4480 struct amdgpu_ring *ring = tmp_adev->rings[i]; 4481 4482 if (!ring || !ring->sched.thread) 4483 continue; 4484 4485 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 4486 4487 if (need_emergency_restart) 4488 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 4489 } 4490 } 4491 4492 if (need_emergency_restart) 4493 goto skip_sched_resume; 4494 4495 /* 4496 * Must check guilty signal here since after this point all old 4497 * HW fences are force signaled. 4498 * 4499 * job->base holds a reference to parent fence 4500 */ 4501 if (job && job->base.s_fence->parent && 4502 dma_fence_is_signaled(job->base.s_fence->parent)) { 4503 job_signaled = true; 4504 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 4505 goto skip_hw_reset; 4506 } 4507 4508 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 4509 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4510 r = amdgpu_device_pre_asic_reset(tmp_adev, 4511 NULL, 4512 &need_full_reset); 4513 /*TODO Should we stop ?*/ 4514 if (r) { 4515 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 4516 r, adev_to_drm(tmp_adev)->unique); 4517 tmp_adev->asic_reset_res = r; 4518 } 4519 } 4520 4521 /* Actual ASIC resets if needed.*/ 4522 /* TODO Implement XGMI hive reset logic for SRIOV */ 4523 if (amdgpu_sriov_vf(adev)) { 4524 r = amdgpu_device_reset_sriov(adev, job ? false : true); 4525 if (r) 4526 adev->asic_reset_res = r; 4527 } else { 4528 r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false); 4529 if (r && r == -EAGAIN) 4530 goto retry; 4531 } 4532 4533 skip_hw_reset: 4534 4535 /* Post ASIC reset for all devs .*/ 4536 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4537 4538 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4539 struct amdgpu_ring *ring = tmp_adev->rings[i]; 4540 4541 if (!ring || !ring->sched.thread) 4542 continue; 4543 4544 /* No point to resubmit jobs if we didn't HW reset*/ 4545 if (!tmp_adev->asic_reset_res && !job_signaled) 4546 drm_sched_resubmit_jobs(&ring->sched); 4547 4548 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); 4549 } 4550 4551 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) { 4552 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 4553 } 4554 4555 tmp_adev->asic_reset_res = 0; 4556 4557 if (r) { 4558 /* bad news, how to tell it to userspace ? 
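			 * For now we just log it and record it via
			 * amdgpu_vf_error_put(); there is no dedicated
			 * notification to userspace here.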
			 */
			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
		}
	}

skip_sched_resume:
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		/* unlock kfd: SR-IOV would do it separately */
		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);
		if (audio_suspended)
			amdgpu_device_resume_display_audio(tmp_adev);
		amdgpu_device_unlock_adev(tmp_adev);
	}

skip_recovery:
	if (hive) {
		atomic_set(&hive->in_reset, 0);
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}

	if (r)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
	return r;
}

/**
 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIe capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIe config space may not be available.
 */
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
{
	struct pci_dev *pdev;
	enum pci_bus_speed speed_cap, platform_speed_cap;
	enum pcie_link_width platform_link_width;

	if (amdgpu_pcie_gen_cap)
		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;

	if (amdgpu_pcie_lane_cap)
		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;

	/* covers APUs as well */
	if (pci_is_root_bus(adev->pdev->bus)) {
		if (adev->pm.pcie_gen_mask == 0)
			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
		if (adev->pm.pcie_mlw_mask == 0)
			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
		return;
	}

	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
		return;

	pcie_bandwidth_available(adev->pdev, NULL,
				 &platform_speed_cap, &platform_link_width);

	if (adev->pm.pcie_gen_mask == 0) {
		/* asic caps */
		pdev = adev->pdev;
		speed_cap = pcie_get_speed_cap(pdev);
		if (speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
		} else {
			if (speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
		}
		/* platform caps */
		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
		} else {
			if (platform_speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4655 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4656 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 4657 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 4658 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 4659 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4660 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4661 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 4662 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 4663 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4664 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 4665 else 4666 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 4667 4668 } 4669 } 4670 if (adev->pm.pcie_mlw_mask == 0) { 4671 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 4672 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 4673 } else { 4674 switch (platform_link_width) { 4675 case PCIE_LNK_X32: 4676 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 4677 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 4678 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 4679 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4680 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4681 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4682 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4683 break; 4684 case PCIE_LNK_X16: 4685 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 4686 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 4687 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4688 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4689 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4690 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4691 break; 4692 case PCIE_LNK_X12: 4693 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 4694 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4695 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4696 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4697 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4698 break; 4699 case PCIE_LNK_X8: 4700 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4701 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4702 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4703 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4704 break; 4705 case PCIE_LNK_X4: 4706 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4707 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4708 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4709 break; 4710 case PCIE_LNK_X2: 4711 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4712 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4713 break; 4714 case PCIE_LNK_X1: 4715 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 4716 break; 4717 default: 4718 break; 4719 } 4720 } 4721 } 4722 } 4723 4724 int amdgpu_device_baco_enter(struct drm_device *dev) 4725 { 4726 struct amdgpu_device *adev = drm_to_adev(dev); 4727 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 4728 4729 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 4730 return -ENOTSUPP; 4731 4732 if (ras && ras->supported) 4733 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 4734 4735 return amdgpu_dpm_baco_enter(adev); 4736 } 4737 4738 int amdgpu_device_baco_exit(struct drm_device *dev) 4739 { 4740 struct amdgpu_device *adev = drm_to_adev(dev); 4741 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 4742 int ret = 0; 4743 4744 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 4745 return -ENOTSUPP; 4746 4747 ret = amdgpu_dpm_baco_exit(adev); 4748 if (ret) 4749 return ret; 4750 4751 if (ras && ras->supported) 4752 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 4753 4754 return 0; 4755 } 4756 4757 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev) 4758 { 4759 int i; 4760 4761 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4762 struct amdgpu_ring *ring = adev->rings[i]; 4763 4764 if (!ring || !ring->sched.thread) 4765 continue; 4766 4767 
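		/* cancel this ring's pending scheduler timeout (TDR) work */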
cancel_delayed_work_sync(&ring->sched.work_tdr); 4768 } 4769 } 4770 4771 /** 4772 * amdgpu_pci_error_detected - Called when a PCI error is detected. 4773 * @pdev: PCI device struct 4774 * @state: PCI channel state 4775 * 4776 * Description: Called when a PCI error is detected. 4777 * 4778 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 4779 */ 4780 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 4781 { 4782 struct drm_device *dev = pci_get_drvdata(pdev); 4783 struct amdgpu_device *adev = drm_to_adev(dev); 4784 int i; 4785 4786 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 4787 4788 if (adev->gmc.xgmi.num_physical_nodes > 1) { 4789 DRM_WARN("No support for XGMI hive yet..."); 4790 return PCI_ERS_RESULT_DISCONNECT; 4791 } 4792 4793 switch (state) { 4794 case pci_channel_io_normal: 4795 return PCI_ERS_RESULT_CAN_RECOVER; 4796 /* Fatal error, prepare for slot reset */ 4797 case pci_channel_io_frozen: 4798 /* 4799 * Cancel and wait for all TDRs in progress if failing to 4800 * set adev->in_gpu_reset in amdgpu_device_lock_adev 4801 * 4802 * Locking adev->reset_sem will prevent any external access 4803 * to GPU during PCI error recovery 4804 */ 4805 while (!amdgpu_device_lock_adev(adev, NULL)) 4806 amdgpu_cancel_all_tdr(adev); 4807 4808 /* 4809 * Block any work scheduling as we do for regular GPU reset 4810 * for the duration of the recovery 4811 */ 4812 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4813 struct amdgpu_ring *ring = adev->rings[i]; 4814 4815 if (!ring || !ring->sched.thread) 4816 continue; 4817 4818 drm_sched_stop(&ring->sched, NULL); 4819 } 4820 return PCI_ERS_RESULT_NEED_RESET; 4821 case pci_channel_io_perm_failure: 4822 /* Permanent error, prepare for device removal */ 4823 return PCI_ERS_RESULT_DISCONNECT; 4824 } 4825 4826 return PCI_ERS_RESULT_NEED_RESET; 4827 } 4828 4829 /** 4830 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 4831 * @pdev: pointer to PCI device 4832 */ 4833 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 4834 { 4835 4836 DRM_INFO("PCI error: mmio enabled callback!!\n"); 4837 4838 /* TODO - dump whatever for debugging purposes */ 4839 4840 /* This called only if amdgpu_pci_error_detected returns 4841 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 4842 * works, no need to reset slot. 4843 */ 4844 4845 return PCI_ERS_RESULT_RECOVERED; 4846 } 4847 4848 /** 4849 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 4850 * @pdev: PCI device struct 4851 * 4852 * Description: This routine is called by the pci error recovery 4853 * code after the PCI slot has been reset, just before we 4854 * should resume normal operations. 
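 *
 * Return: PCI_ERS_RESULT_RECOVERED if the device recovered, otherwise
 * PCI_ERS_RESULT_DISCONNECT.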
4855 */ 4856 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 4857 { 4858 struct drm_device *dev = pci_get_drvdata(pdev); 4859 struct amdgpu_device *adev = drm_to_adev(dev); 4860 int r, i; 4861 bool need_full_reset = true; 4862 u32 memsize; 4863 struct list_head device_list; 4864 4865 DRM_INFO("PCI error: slot reset callback!!\n"); 4866 4867 INIT_LIST_HEAD(&device_list); 4868 list_add_tail(&adev->gmc.xgmi.head, &device_list); 4869 4870 /* wait for asic to come out of reset */ 4871 msleep(500); 4872 4873 /* Restore PCI confspace */ 4874 amdgpu_device_load_pci_state(pdev); 4875 4876 /* confirm ASIC came out of reset */ 4877 for (i = 0; i < adev->usec_timeout; i++) { 4878 memsize = amdgpu_asic_get_config_memsize(adev); 4879 4880 if (memsize != 0xffffffff) 4881 break; 4882 udelay(1); 4883 } 4884 if (memsize == 0xffffffff) { 4885 r = -ETIME; 4886 goto out; 4887 } 4888 4889 adev->in_pci_err_recovery = true; 4890 r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset); 4891 adev->in_pci_err_recovery = false; 4892 if (r) 4893 goto out; 4894 4895 r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true); 4896 4897 out: 4898 if (!r) { 4899 if (amdgpu_device_cache_pci_state(adev->pdev)) 4900 pci_restore_state(adev->pdev); 4901 4902 DRM_INFO("PCIe error recovery succeeded\n"); 4903 } else { 4904 DRM_ERROR("PCIe error recovery failed, err:%d", r); 4905 amdgpu_device_unlock_adev(adev); 4906 } 4907 4908 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 4909 } 4910 4911 /** 4912 * amdgpu_pci_resume() - resume normal ops after PCI reset 4913 * @pdev: pointer to PCI device 4914 * 4915 * Called when the error recovery driver tells us that its 4916 * OK to resume normal operation. Use completion to allow 4917 * halted scsi ops to resume. 4918 */ 4919 void amdgpu_pci_resume(struct pci_dev *pdev) 4920 { 4921 struct drm_device *dev = pci_get_drvdata(pdev); 4922 struct amdgpu_device *adev = drm_to_adev(dev); 4923 int i; 4924 4925 4926 DRM_INFO("PCI error: resume callback!!\n"); 4927 4928 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4929 struct amdgpu_ring *ring = adev->rings[i]; 4930 4931 if (!ring || !ring->sched.thread) 4932 continue; 4933 4934 4935 drm_sched_resubmit_jobs(&ring->sched); 4936 drm_sched_start(&ring->sched, true); 4937 } 4938 4939 amdgpu_device_unlock_adev(adev); 4940 } 4941 4942 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 4943 { 4944 struct drm_device *dev = pci_get_drvdata(pdev); 4945 struct amdgpu_device *adev = drm_to_adev(dev); 4946 int r; 4947 4948 r = pci_save_state(pdev); 4949 if (!r) { 4950 kfree(adev->pci_state); 4951 4952 adev->pci_state = pci_store_saved_state(pdev); 4953 4954 if (!adev->pci_state) { 4955 DRM_ERROR("Failed to store PCI saved state"); 4956 return false; 4957 } 4958 } else { 4959 DRM_WARN("Failed to save PCI state, err:%d\n", r); 4960 return false; 4961 } 4962 4963 return true; 4964 } 4965 4966 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 4967 { 4968 struct drm_device *dev = pci_get_drvdata(pdev); 4969 struct amdgpu_device *adev = drm_to_adev(dev); 4970 int r; 4971 4972 if (!adev->pci_state) 4973 return false; 4974 4975 r = pci_load_saved_state(pdev, adev->pci_state); 4976 4977 if (!r) { 4978 pci_restore_state(pdev); 4979 } else { 4980 DRM_WARN("Failed to load PCI state, err:%d\n", r); 4981 return false; 4982 } 4983 4984 return true; 4985 } 4986 4987 4988