1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 34 #include <drm/drm_atomic_helper.h> 35 #include <drm/drm_probe_helper.h> 36 #include <drm/amdgpu_drm.h> 37 #include <linux/vgaarb.h> 38 #include <linux/vga_switcheroo.h> 39 #include <linux/efi.h> 40 #include "amdgpu.h" 41 #include "amdgpu_trace.h" 42 #include "amdgpu_i2c.h" 43 #include "atom.h" 44 #include "amdgpu_atombios.h" 45 #include "amdgpu_atomfirmware.h" 46 #include "amd_pcie.h" 47 #ifdef CONFIG_DRM_AMDGPU_SI 48 #include "si.h" 49 #endif 50 #ifdef CONFIG_DRM_AMDGPU_CIK 51 #include "cik.h" 52 #endif 53 #include "vi.h" 54 #include "soc15.h" 55 #include "nv.h" 56 #include "bif/bif_4_1_d.h" 57 #include <linux/pci.h> 58 #include <linux/firmware.h> 59 #include "amdgpu_vf_error.h" 60 61 #include "amdgpu_amdkfd.h" 62 #include "amdgpu_pm.h" 63 64 #include "amdgpu_xgmi.h" 65 #include "amdgpu_ras.h" 66 #include "amdgpu_pmu.h" 67 #include "amdgpu_fru_eeprom.h" 68 69 #include <linux/suspend.h> 70 #include <drm/task_barrier.h> 71 #include <linux/pm_runtime.h> 72 73 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 74 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 75 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 76 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 77 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 78 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 79 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin"); 80 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin"); 81 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin"); 82 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 83 MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin"); 84 85 #define AMDGPU_RESUME_MS 2000 86 87 const char *amdgpu_asic_name[] = { 88 "TAHITI", 89 "PITCAIRN", 90 "VERDE", 91 "OLAND", 92 "HAINAN", 93 "BONAIRE", 94 "KAVERI", 95 "KABINI", 96 "HAWAII", 97 "MULLINS", 98 "TOPAZ", 99 "TONGA", 100 "FIJI", 101 "CARRIZO", 102 "STONEY", 103 "POLARIS10", 104 "POLARIS11", 105 "POLARIS12", 106 "VEGAM", 107 "VEGA10", 108 "VEGA12", 109 "VEGA20", 110 "RAVEN", 111 "ARCTURUS", 112 "RENOIR", 113 "NAVI10", 114 "NAVI14", 115 "NAVI12", 116 "SIENNA_CICHLID", 117 "NAVY_FLOUNDER", 118 "VANGOGH", 119 "LAST", 120 }; 121 122 /** 123 * DOC: 
pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
		amdgpu_device_get_pcie_replay_count, NULL);

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * DOC: product_name
 *
 * The amdgpu driver provides a sysfs API for reporting the product name
 * for the device.
 * The file product_name is used for this and returns the product name
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_name(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
}

static DEVICE_ATTR(product_name, S_IRUGO,
		amdgpu_device_get_product_name, NULL);

/**
 * DOC: product_number
 *
 * The amdgpu driver provides a sysfs API for reporting the part number
 * for the device.
 * The file product_number is used for this and returns the part number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
}

static DEVICE_ATTR(product_number, S_IRUGO,
		amdgpu_device_get_product_number, NULL);

/**
 * DOC: serial_number
 *
 * The amdgpu driver provides a sysfs API for reporting the serial number
 * for the device.
 * The file serial_number is used for this and returns the serial number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_serial_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
}

static DEVICE_ATTR(serial_number, S_IRUGO,
		amdgpu_device_get_serial_number, NULL);
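/*
 * Illustrative sketch (not part of the driver): the attributes above are
 * plain read-only sysfs files, so userspace can poll them with ordinary
 * file I/O. The card0 path below is an assumption for a single-GPU system.
 *
 *	char buf[64] = {0};
 *	int fd = open("/sys/class/drm/card0/device/pcie_replay_count", O_RDONLY);
 *
 *	if (fd >= 0) {
 *		read(fd, buf, sizeof(buf) - 1);
 *		close(fd);
 *	}
 */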
/**
 * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with HG/PX power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->flags & AMD_IS_PX)
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise returns false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * VRAM access helper functions.
 *
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       uint32_t *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0;
	uint64_t last;


#ifdef CONFIG_64BIT
	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		void __iomem *addr = adev->mman.aper_base_kaddr + pos;
		size_t count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_asic_flush_hdp(adev, NULL);
		} else {
			amdgpu_asic_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

		if (count == size)
			return;

		pos += count;
		buf += count / 4;
		size -= count;
	}
#endif

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		uint32_t tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *buf++);
		else
			*buf++ = RREG32_NO_KIQ(mmMM_DATA);
	}
	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
}

/*
 * register access helper functions.
 */
/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (adev->in_pci_err_recovery)
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}
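/*
 * Illustrative usage sketch (assumption, not code from this driver): dumping
 * a few dwords of VRAM with the helper above. The offset is arbitrary and
 * error handling is omitted.
 *
 *	uint32_t data[16];
 *
 *	amdgpu_device_vram_access(adev, 0x1000, data, sizeof(data), false);
 */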
/*
 * MMIO register read with bytes helper functions
 * @offset:bytes offset from MMIO start
 *
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (adev->in_pci_err_recovery)
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset:bytes offset from MMIO start
 * @value: the value to be written to the register
 *
 */
/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (adev->in_pci_err_recovery)
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (adev->in_pci_err_recovery)
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_sem)) {
			amdgpu_kiq_wreg(adev, reg, v);
			up_read(&adev->reset_sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/*
 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range
 *
 * this function is invoked only for debugfs register access
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v)
{
	if (adev->in_pci_err_recovery)
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_io_rreg - read an IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 *
 * Returns the 32 bit value from the offset specified.
 */
u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
{
	if (adev->in_pci_err_recovery)
		return 0;

	if ((reg * 4) < adev->rio_mem_size)
		return ioread32(adev->rio_mem + (reg * 4));
	else {
		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
		return ioread32(adev->rio_mem + (mmMM_DATA * 4));
	}
}

/**
 * amdgpu_io_wreg - write to an IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 *
 * Writes the value specified to the offset specified.
476 */ 477 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v) 478 { 479 if (adev->in_pci_err_recovery) 480 return; 481 482 if ((reg * 4) < adev->rio_mem_size) 483 iowrite32(v, adev->rio_mem + (reg * 4)); 484 else { 485 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4)); 486 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4)); 487 } 488 } 489 490 /** 491 * amdgpu_mm_rdoorbell - read a doorbell dword 492 * 493 * @adev: amdgpu_device pointer 494 * @index: doorbell index 495 * 496 * Returns the value in the doorbell aperture at the 497 * requested doorbell index (CIK). 498 */ 499 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) 500 { 501 if (adev->in_pci_err_recovery) 502 return 0; 503 504 if (index < adev->doorbell.num_doorbells) { 505 return readl(adev->doorbell.ptr + index); 506 } else { 507 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 508 return 0; 509 } 510 } 511 512 /** 513 * amdgpu_mm_wdoorbell - write a doorbell dword 514 * 515 * @adev: amdgpu_device pointer 516 * @index: doorbell index 517 * @v: value to write 518 * 519 * Writes @v to the doorbell aperture at the 520 * requested doorbell index (CIK). 521 */ 522 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) 523 { 524 if (adev->in_pci_err_recovery) 525 return; 526 527 if (index < adev->doorbell.num_doorbells) { 528 writel(v, adev->doorbell.ptr + index); 529 } else { 530 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 531 } 532 } 533 534 /** 535 * amdgpu_mm_rdoorbell64 - read a doorbell Qword 536 * 537 * @adev: amdgpu_device pointer 538 * @index: doorbell index 539 * 540 * Returns the value in the doorbell aperture at the 541 * requested doorbell index (VEGA10+). 542 */ 543 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) 544 { 545 if (adev->in_pci_err_recovery) 546 return 0; 547 548 if (index < adev->doorbell.num_doorbells) { 549 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); 550 } else { 551 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 552 return 0; 553 } 554 } 555 556 /** 557 * amdgpu_mm_wdoorbell64 - write a doorbell Qword 558 * 559 * @adev: amdgpu_device pointer 560 * @index: doorbell index 561 * @v: value to write 562 * 563 * Writes @v to the doorbell aperture at the 564 * requested doorbell index (VEGA10+). 
565 */ 566 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) 567 { 568 if (adev->in_pci_err_recovery) 569 return; 570 571 if (index < adev->doorbell.num_doorbells) { 572 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); 573 } else { 574 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 575 } 576 } 577 578 /** 579 * amdgpu_device_indirect_rreg - read an indirect register 580 * 581 * @adev: amdgpu_device pointer 582 * @pcie_index: mmio register offset 583 * @pcie_data: mmio register offset 584 * 585 * Returns the value of indirect register @reg_addr 586 */ 587 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 588 u32 pcie_index, u32 pcie_data, 589 u32 reg_addr) 590 { 591 unsigned long flags; 592 u32 r; 593 void __iomem *pcie_index_offset; 594 void __iomem *pcie_data_offset; 595 596 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 597 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 598 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 599 600 writel(reg_addr, pcie_index_offset); 601 readl(pcie_index_offset); 602 r = readl(pcie_data_offset); 603 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 604 605 return r; 606 } 607 608 /** 609 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 610 * 611 * @adev: amdgpu_device pointer 612 * @pcie_index: mmio register offset 613 * @pcie_data: mmio register offset 614 * 615 * Returns the value of indirect register @reg_addr 616 */ 617 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 618 u32 pcie_index, u32 pcie_data, 619 u32 reg_addr) 620 { 621 unsigned long flags; 622 u64 r; 623 void __iomem *pcie_index_offset; 624 void __iomem *pcie_data_offset; 625 626 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 627 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 628 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 629 630 /* read low 32 bits */ 631 writel(reg_addr, pcie_index_offset); 632 readl(pcie_index_offset); 633 r = readl(pcie_data_offset); 634 /* read high 32 bits */ 635 writel(reg_addr + 4, pcie_index_offset); 636 readl(pcie_index_offset); 637 r |= ((u64)readl(pcie_data_offset) << 32); 638 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 639 640 return r; 641 } 642 643 /** 644 * amdgpu_device_indirect_wreg - write an indirect register address 645 * 646 * @adev: amdgpu_device pointer 647 * @pcie_index: mmio register offset 648 * @pcie_data: mmio register offset 649 * @reg_addr: indirect register offset 650 * @reg_data: indirect register data 651 * 652 */ 653 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 654 u32 pcie_index, u32 pcie_data, 655 u32 reg_addr, u32 reg_data) 656 { 657 unsigned long flags; 658 void __iomem *pcie_index_offset; 659 void __iomem *pcie_data_offset; 660 661 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 662 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 663 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 664 665 writel(reg_addr, pcie_index_offset); 666 readl(pcie_index_offset); 667 writel(reg_data, pcie_data_offset); 668 readl(pcie_data_offset); 669 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 670 } 671 672 /** 673 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 674 * 675 * @adev: amdgpu_device pointer 676 * @pcie_index: mmio register offset 677 * @pcie_data: mmio register offset 678 * @reg_addr: indirect register offset 679 * @reg_data: indirect register data 680 * 681 */ 682 void 
amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
			      u32 pcie_index, u32 pcie_data,
			      u32 reg_addr, u64 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy 64 bit reg write function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}
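/*
 * Illustrative sketch (assumption about wiring, not a verbatim excerpt): the
 * dummy handlers above are meant to be installed as the default register
 * callbacks early in device init, so that touching a register block the asic
 * does not expose fails loudly instead of silently returning garbage:
 *
 *	adev->pcie_rreg = &amdgpu_invalid_rreg;
 *	adev->pcie_wreg = &amdgpu_invalid_wreg;
 *	adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
 *	adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
 */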
/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	amdgpu_asic_pre_asic_init(adev);

	return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}

/**
 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
				       &adev->vram_scratch.robj,
				       &adev->vram_scratch.gpu_addr,
				       (void **)&adev->vram_scratch.ptr);
}

/**
 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}
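/*
 * Illustrative example (hypothetical register names): a golden register table
 * for amdgpu_device_program_register_sequence() is a flat array of
 * {offset, AND mask, OR value} triplets, applied in order:
 *
 *	static const u32 example_golden_settings[] = {
 *		mmEXAMPLE_REG_A, 0xffffffff, 0x00000001,
 *		mmEXAMPLE_REG_B, 0x0000ff00, 0x00001200,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *						ARRAY_SIZE(example_golden_settings));
 */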
/*
 * GPU doorbell aperture helper functions.
 */
/**
 * amdgpu_device_doorbell_init - Init doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Init doorbell driver information (CIK)
 * Returns 0 on success, error on failure.
 */
static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
{

	/* No doorbell on SI hardware generation */
	if (adev->asic_type < CHIP_BONAIRE) {
		adev->doorbell.base = 0;
		adev->doorbell.size = 0;
		adev->doorbell.num_doorbells = 0;
		adev->doorbell.ptr = NULL;
		return 0;
	}

	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
		return -EINVAL;

	amdgpu_asic_init_doorbell_index(adev);

	/* doorbell bar mapping */
	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
	adev->doorbell.size = pci_resource_len(adev->pdev, 2);

	adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
					     adev->doorbell_index.max_assignment+1);
	if (adev->doorbell.num_doorbells == 0)
		return -EINVAL;

	/* For Vega, reserve and map two pages on doorbell BAR since SDMA
	 * paging queue doorbell use the second page. The
	 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
	 * doorbells are in the first page. So with paging queue enabled,
	 * the max num_doorbells should be increased by one page (0x400 dwords).
	 */
	if (adev->asic_type >= CHIP_VEGA10)
		adev->doorbell.num_doorbells += 0x400;

	adev->doorbell.ptr = ioremap(adev->doorbell.base,
				     adev->doorbell.num_doorbells *
				     sizeof(u32));
	if (adev->doorbell.ptr == NULL)
		return -ENOMEM;

	return 0;
}

/**
 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down doorbell driver information (CIK)
 */
static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
{
	iounmap(adev->doorbell.ptr);
	adev->doorbell.ptr = NULL;
}



/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}
/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}
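/*
 * Illustrative usage sketch (assumption, error handling trimmed): a caller
 * grabs one 256-bit writeback slot, derives the CPU and GPU views of it, and
 * releases it again when done. 'wb' is a dword offset into adev->wb.wb.
 *
 *	u32 wb;
 *
 *	if (!amdgpu_device_wb_get(adev, &wb)) {
 *		volatile u32 *cpu_ptr = &adev->wb.wb[wb];
 *		u64 gpu_addr = adev->wb.gpu_addr + (wb * 4);
 *
 *		... let the GPU write status into gpu_addr, poll *cpu_ptr ...
 *
 *		amdgpu_device_wb_free(adev, wb);
 *	}
 */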
/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
	u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
	struct pci_bus *root;
	struct resource *res;
	unsigned i;
	u16 cmd;
	int r;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_device_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_device_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if post is needed or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In the whole GPU pass-through virtualization case, after VM reboot
		 * some old smc fw still needs the driver to do a vPost, otherwise the gpu hangs.
		 * smc fw versions above 22.15 don't have this flaw, so we force
		 * vpost to be executed for smc versions below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;
			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @cookie: amdgpu_device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
{
	struct amdgpu_device *adev = cookie;
	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}
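/*
 * Illustrative sketch (assumption, not a verbatim excerpt from this file):
 * the decode callback above is the kind of hook handed to the VGA arbiter
 * during device init, with the amdgpu_device as the opaque cookie:
 *
 *	vga_client_register(adev->pdev, adev, NULL,
 *			    amdgpu_device_vga_set_decode);
 */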
/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines number of bits in page table versus page directory,
 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
 * page table and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}
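/*
 * Worked example for the pool-size check above (values follow directly from
 * the code): amdgpu.smu_memory_pool_size=2 reserves 2 << 28 bytes = 512 MiB
 * and requires roughly 3 GiB of system RAM, while a value of 8 reserves
 * 2 GiB and requires roughly 7 GiB; any other non-zero value falls back to 0.
 */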
/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	amdgpu_gmc_tmz_set(adev);

	if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) {
		amdgpu_num_kcq = 8;
		dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid parameter provided by user\n");
	}

	amdgpu_gmc_noretry_set(adev);

	return 0;
}
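/*
 * Worked example for the validation above (assumed command line): booting
 * with amdgpu.sched_jobs=48 triggers the power-of-two check, so the value is
 * rounded up with roundup_pow_of_two(48) and the driver ends up using 64;
 * amdgpu.sched_jobs=3 is bumped to the minimum of 4 instead.
 */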
/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asic before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(dev->pdev, PCI_D0);
		amdgpu_device_load_pci_state(dev->pdev);
		r = pci_enable_device(dev->pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
		drm_kms_helper_poll_enable(dev);
	} else {
		pr_info("switched off\n");
		drm_kms_helper_poll_disable(dev);
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(dev->pdev);
		/* Shut down the device */
		pci_disable_device(dev->pdev);
		pci_set_power_state(dev->pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}
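/*
 * Illustrative usage sketch (assumption, not from this file): gating clocks
 * for every GFX IP instance and ignoring the aggregated return code:
 *
 *	amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *					       AMD_CG_STATE_GATE);
 */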
/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u32 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}
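/*
 * Illustrative usage sketch (assumption): quiescing one IP before touching
 * its registers, using the helpers above:
 *
 *	if (!amdgpu_device_ip_is_idle(adev, AMD_IP_BLOCK_TYPE_GMC))
 *		amdgpu_device_ip_wait_for_idle(adev, AMD_IP_BLOCK_TYPE_GMC);
 */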
/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * Returns 0 if the IP block version is equal or greater,
 * 1 if it is smaller or the ip_block doesn't exist.
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}
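/*
 * Illustrative usage sketch (assumption): an asic setup path registers its IP
 * blocks with amdgpu_device_ip_block_add() and can later gate features on a
 * minimum block version with the comparison helper above:
 *
 *	amdgpu_device_ip_block_add(adev, &gmc_v8_0_ip_block);
 *
 *	if (!amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GMC, 8, 0))
 *		DRM_DEBUG("GMC 8.0 or newer present\n");
 */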
/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		struct drm_device *ddev = adev_to_drm(adev);
		const char *pci_address_name = pci_name(ddev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}
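/*
 * Worked example for the parsing above (hypothetical PCI address): loading
 * with amdgpu.virtual_display=0000:01:00.0,2 enables virtual display on that
 * device only, with two CRTCs; "all,1" would enable it on every amdgpu
 * device with a single CRTC. Entries are separated by ';'.
 */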
/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	char fw_name[40];
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	if (adev->mman.discovery_bin) {
		amdgpu_discovery_get_gfx_info(adev);

		/*
		 * FIXME: The bounding box is still needed by Navi12, so
		 * temporarily read it from gpu_info firmware. Should be dropped
		 * when DAL no longer needs it.
		 */
		if (adev->asic_type != CHIP_NAVI12)
			return 0;
	}

	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_VERDE:
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_OLAND:
	case CHIP_HAINAN:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TOPAZ:
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
	case CHIP_VEGA20:
	case CHIP_SIENNA_CICHLID:
	case CHIP_NAVY_FLOUNDER:
	default:
		return 0;
	case CHIP_VEGA10:
		chip_name = "vega10";
		break;
	case CHIP_VEGA12:
		chip_name = "vega12";
		break;
	case CHIP_RAVEN:
		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
			chip_name = "raven2";
		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
			chip_name = "picasso";
		else
			chip_name = "raven";
		break;
	case CHIP_ARCTURUS:
		chip_name = "arcturus";
		break;
	case CHIP_RENOIR:
		chip_name = "renoir";
		break;
	case CHIP_NAVI10:
		chip_name = "navi10";
		break;
	case CHIP_NAVI14:
		chip_name = "navi14";
		break;
	case CHIP_NAVI12:
		chip_name = "navi12";
		break;
	case CHIP_VANGOGH:
		chip_name = "vangogh";
		break;
	}

	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
	err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
	if (err) {
		dev_err(adev->dev,
			"Failed to load gpu_info firmware \"%s\"\n",
			fw_name);
		goto out;
	}
	err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
	if (err) {
		dev_err(adev->dev,
			"Failed to validate gpu_info firmware \"%s\"\n",
			fw_name);
		goto out;
	}

	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);

	switch (hdr->version_major) {
	case 1:
	{
		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
								le32_to_cpu(hdr->header.ucode_array_offset_bytes));

		/*
		 * Should be dropped when DAL no longer needs it.
		 */
		if (adev->asic_type == CHIP_NAVI12)
			goto parse_soc_bounding_box;

		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
		adev->gfx.config.max_texture_channel_caches =
			le32_to_cpu(gpu_info_fw->gc_num_tccs);
		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
		adev->gfx.config.double_offchip_lds_buf =
			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
		adev->gfx.cu_info.max_waves_per_simd =
			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
		adev->gfx.cu_info.max_scratch_slots_per_cu =
			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
		if (hdr->version_minor >= 1) {
			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->gfx.config.num_sc_per_sh =
				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
			adev->gfx.config.num_packer_per_sc =
				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
		}

parse_soc_bounding_box:
		/*
		 * soc bounding box info is not integrated in discovery table,
		 * we always need to parse it from gpu info firmware if needed.
		 */
		if (hdr->version_minor == 2) {
			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
		}
		break;
	}
	default:
		dev_err(adev->dev,
			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
		err = -EINVAL;
		goto out;
	}
out:
	return err;
}

/**
 * amdgpu_device_ip_early_init - run early init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Early initialization pass for hardware IPs. The hardware IPs that make
 * up each asic are discovered, and each IP's early_init callback is run. This
 * is the first stage in initializing the asic.
 * Returns 0 on success, negative error code on failure.
1914 */ 1915 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 1916 { 1917 int i, r; 1918 1919 amdgpu_device_enable_virtual_display(adev); 1920 1921 if (amdgpu_sriov_vf(adev)) { 1922 r = amdgpu_virt_request_full_gpu(adev, true); 1923 if (r) 1924 return r; 1925 } 1926 1927 switch (adev->asic_type) { 1928 #ifdef CONFIG_DRM_AMDGPU_SI 1929 case CHIP_VERDE: 1930 case CHIP_TAHITI: 1931 case CHIP_PITCAIRN: 1932 case CHIP_OLAND: 1933 case CHIP_HAINAN: 1934 adev->family = AMDGPU_FAMILY_SI; 1935 r = si_set_ip_blocks(adev); 1936 if (r) 1937 return r; 1938 break; 1939 #endif 1940 #ifdef CONFIG_DRM_AMDGPU_CIK 1941 case CHIP_BONAIRE: 1942 case CHIP_HAWAII: 1943 case CHIP_KAVERI: 1944 case CHIP_KABINI: 1945 case CHIP_MULLINS: 1946 if (adev->flags & AMD_IS_APU) 1947 adev->family = AMDGPU_FAMILY_KV; 1948 else 1949 adev->family = AMDGPU_FAMILY_CI; 1950 1951 r = cik_set_ip_blocks(adev); 1952 if (r) 1953 return r; 1954 break; 1955 #endif 1956 case CHIP_TOPAZ: 1957 case CHIP_TONGA: 1958 case CHIP_FIJI: 1959 case CHIP_POLARIS10: 1960 case CHIP_POLARIS11: 1961 case CHIP_POLARIS12: 1962 case CHIP_VEGAM: 1963 case CHIP_CARRIZO: 1964 case CHIP_STONEY: 1965 if (adev->flags & AMD_IS_APU) 1966 adev->family = AMDGPU_FAMILY_CZ; 1967 else 1968 adev->family = AMDGPU_FAMILY_VI; 1969 1970 r = vi_set_ip_blocks(adev); 1971 if (r) 1972 return r; 1973 break; 1974 case CHIP_VEGA10: 1975 case CHIP_VEGA12: 1976 case CHIP_VEGA20: 1977 case CHIP_RAVEN: 1978 case CHIP_ARCTURUS: 1979 case CHIP_RENOIR: 1980 if (adev->flags & AMD_IS_APU) 1981 adev->family = AMDGPU_FAMILY_RV; 1982 else 1983 adev->family = AMDGPU_FAMILY_AI; 1984 1985 r = soc15_set_ip_blocks(adev); 1986 if (r) 1987 return r; 1988 break; 1989 case CHIP_NAVI10: 1990 case CHIP_NAVI14: 1991 case CHIP_NAVI12: 1992 case CHIP_SIENNA_CICHLID: 1993 case CHIP_NAVY_FLOUNDER: 1994 case CHIP_VANGOGH: 1995 if (adev->asic_type == CHIP_VANGOGH) 1996 adev->family = AMDGPU_FAMILY_VGH; 1997 else 1998 adev->family = AMDGPU_FAMILY_NV; 1999 2000 r = nv_set_ip_blocks(adev); 2001 if (r) 2002 return r; 2003 break; 2004 default: 2005 /* FIXME: not supported yet */ 2006 return -EINVAL; 2007 } 2008 2009 amdgpu_amdkfd_device_probe(adev); 2010 2011 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2012 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2013 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2014 2015 for (i = 0; i < adev->num_ip_blocks; i++) { 2016 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2017 DRM_ERROR("disabled ip block: %d <%s>\n", 2018 i, adev->ip_blocks[i].version->funcs->name); 2019 adev->ip_blocks[i].status.valid = false; 2020 } else { 2021 if (adev->ip_blocks[i].version->funcs->early_init) { 2022 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2023 if (r == -ENOENT) { 2024 adev->ip_blocks[i].status.valid = false; 2025 } else if (r) { 2026 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2027 adev->ip_blocks[i].version->funcs->name, r); 2028 return r; 2029 } else { 2030 adev->ip_blocks[i].status.valid = true; 2031 } 2032 } else { 2033 adev->ip_blocks[i].status.valid = true; 2034 } 2035 } 2036 /* get the vbios after the asic_funcs are set up */ 2037 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2038 r = amdgpu_device_parse_gpu_info_fw(adev); 2039 if (r) 2040 return r; 2041 2042 /* Read BIOS */ 2043 if (!amdgpu_get_bios(adev)) 2044 return -EINVAL; 2045 2046 r = amdgpu_atombios_init(adev); 2047 if (r) { 2048 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2049 amdgpu_vf_error_put(adev, 
AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2050 return r; 2051 } 2052 } 2053 } 2054 2055 adev->cg_flags &= amdgpu_cg_mask; 2056 adev->pg_flags &= amdgpu_pg_mask; 2057 2058 return 0; 2059 } 2060 2061 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2062 { 2063 int i, r; 2064 2065 for (i = 0; i < adev->num_ip_blocks; i++) { 2066 if (!adev->ip_blocks[i].status.sw) 2067 continue; 2068 if (adev->ip_blocks[i].status.hw) 2069 continue; 2070 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2071 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2072 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2073 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2074 if (r) { 2075 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2076 adev->ip_blocks[i].version->funcs->name, r); 2077 return r; 2078 } 2079 adev->ip_blocks[i].status.hw = true; 2080 } 2081 } 2082 2083 return 0; 2084 } 2085 2086 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2087 { 2088 int i, r; 2089 2090 for (i = 0; i < adev->num_ip_blocks; i++) { 2091 if (!adev->ip_blocks[i].status.sw) 2092 continue; 2093 if (adev->ip_blocks[i].status.hw) 2094 continue; 2095 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2096 if (r) { 2097 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2098 adev->ip_blocks[i].version->funcs->name, r); 2099 return r; 2100 } 2101 adev->ip_blocks[i].status.hw = true; 2102 } 2103 2104 return 0; 2105 } 2106 2107 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2108 { 2109 int r = 0; 2110 int i; 2111 uint32_t smu_version; 2112 2113 if (adev->asic_type >= CHIP_VEGA10) { 2114 for (i = 0; i < adev->num_ip_blocks; i++) { 2115 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2116 continue; 2117 2118 /* no need to do the fw loading again if already done*/ 2119 if (adev->ip_blocks[i].status.hw == true) 2120 break; 2121 2122 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2123 r = adev->ip_blocks[i].version->funcs->resume(adev); 2124 if (r) { 2125 DRM_ERROR("resume of IP block <%s> failed %d\n", 2126 adev->ip_blocks[i].version->funcs->name, r); 2127 return r; 2128 } 2129 } else { 2130 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2131 if (r) { 2132 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2133 adev->ip_blocks[i].version->funcs->name, r); 2134 return r; 2135 } 2136 } 2137 2138 adev->ip_blocks[i].status.hw = true; 2139 break; 2140 } 2141 } 2142 2143 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2144 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2145 2146 return r; 2147 } 2148 2149 /** 2150 * amdgpu_device_ip_init - run init for hardware IPs 2151 * 2152 * @adev: amdgpu_device pointer 2153 * 2154 * Main initialization pass for hardware IPs. The list of all the hardware 2155 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2156 * are run. sw_init initializes the software state associated with each IP 2157 * and hw_init initializes the hardware associated with each IP. 2158 * Returns 0 on success, negative error code on failure. 
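 *
 * A rough, descriptive sketch of the ordering implemented below (not a
 * contract): sw_init for every valid block; GMC hw_init early, together
 * with the VRAM scratch, writeback and (for MCBP/SR-IOV) CSA allocations;
 * the IB pool and ucode BO; hw_init phase 1 (COMMON, IH, and PSP under
 * SR-IOV); SMU/PSP firmware loading; hw_init phase 2 for the remaining
 * blocks; RAS recovery init; and finally XGMI and KFD bring-up.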
2159  */
2160 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2161 {
2162     int i, r;
2163 
2164     r = amdgpu_ras_init(adev);
2165     if (r)
2166         return r;
2167 
2168     for (i = 0; i < adev->num_ip_blocks; i++) {
2169         if (!adev->ip_blocks[i].status.valid)
2170             continue;
2171         r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2172         if (r) {
2173             DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2174                   adev->ip_blocks[i].version->funcs->name, r);
2175             goto init_failed;
2176         }
2177         adev->ip_blocks[i].status.sw = true;
2178 
2179         /* need to do gmc hw init early so we can allocate gpu mem */
2180         if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2181             r = amdgpu_device_vram_scratch_init(adev);
2182             if (r) {
2183                 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2184                 goto init_failed;
2185             }
2186             r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2187             if (r) {
2188                 DRM_ERROR("hw_init %d failed %d\n", i, r);
2189                 goto init_failed;
2190             }
2191             r = amdgpu_device_wb_init(adev);
2192             if (r) {
2193                 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2194                 goto init_failed;
2195             }
2196             adev->ip_blocks[i].status.hw = true;
2197 
2198             /* right after GMC hw init, we create CSA */
2199             if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2200                 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2201                                 AMDGPU_GEM_DOMAIN_VRAM,
2202                                 AMDGPU_CSA_SIZE);
2203                 if (r) {
2204                     DRM_ERROR("allocate CSA failed %d\n", r);
2205                     goto init_failed;
2206                 }
2207             }
2208         }
2209     }
2210 
2211     if (amdgpu_sriov_vf(adev))
2212         amdgpu_virt_init_data_exchange(adev);
2213 
2214     r = amdgpu_ib_pool_init(adev);
2215     if (r) {
2216         dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2217         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2218         goto init_failed;
2219     }
2220 
2221     r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init is complete */
2222     if (r)
2223         goto init_failed;
2224 
2225     r = amdgpu_device_ip_hw_init_phase1(adev);
2226     if (r)
2227         goto init_failed;
2228 
2229     r = amdgpu_device_fw_loading(adev);
2230     if (r)
2231         goto init_failed;
2232 
2233     r = amdgpu_device_ip_hw_init_phase2(adev);
2234     if (r)
2235         goto init_failed;
2236 
2237     /*
2238      * Retired pages will be loaded from eeprom and reserved here; this must
2239      * be done after amdgpu_device_ip_hw_init_phase2, since for some ASICs
2240      * the RAS EEPROM code relies on the SMU being fully functional for I2C
2241      * communication, which is only true at this point.
2242      *
2243      * amdgpu_ras_recovery_init may fail, but the upper layer only cares
2244      * about failures caused by a bad gpu situation and stops the amdgpu
2245      * init process accordingly. For other failure cases it still releases
2246      * all the resources and prints an error message, rather than returning
2247      * a negative value to the upper level.
2248      *
2249      * Note: theoretically, this should be called before all vram allocations
2250      * to protect retired pages from being abused.
2251      */
2252     r = amdgpu_ras_recovery_init(adev);
2253     if (r)
2254         goto init_failed;
2255 
2256     if (adev->gmc.xgmi.num_physical_nodes > 1)
2257         amdgpu_xgmi_add_device(adev);
2258     amdgpu_amdkfd_device_init(adev);
2259 
2260     amdgpu_fru_get_product_info(adev);
2261 
2262 init_failed:
2263     if (amdgpu_sriov_vf(adev))
2264         amdgpu_virt_release_full_gpu(adev, true);
2265 
2266     return r;
2267 }
2268 
2269 /**
2270  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2271  *
2272  * @adev: amdgpu_device pointer
2273  *
2274  * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2275  * this function before a GPU reset. If the value is retained after a
2276  * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2277  */
2278 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2279 {
2280     memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2281 }
2282 
2283 /**
2284  * amdgpu_device_check_vram_lost - check if vram is valid
2285  *
2286  * @adev: amdgpu_device pointer
2287  *
2288  * Checks the reset magic value written to the gart pointer in VRAM.
2289  * The driver calls this after a GPU reset to see if the contents of
2290  * VRAM were lost or not.
2291  * Returns true if vram is lost, false if not.
2292  */
2293 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2294 {
2295     if (memcmp(adev->gart.ptr, adev->reset_magic,
2296             AMDGPU_RESET_MAGIC_NUM))
2297         return true;
2298 
2299     if (!amdgpu_in_reset(adev))
2300         return false;
2301 
2302     /*
2303      * For all ASICs with baco/mode1 reset, the VRAM is
2304      * always assumed to be lost.
2305      */
2306     switch (amdgpu_asic_reset_method(adev)) {
2307     case AMD_RESET_METHOD_BACO:
2308     case AMD_RESET_METHOD_MODE1:
2309         return true;
2310     default:
2311         return false;
2312     }
2313 }
2314 
2315 /**
2316  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2317  *
2318  * @adev: amdgpu_device pointer
2319  * @state: clockgating state (gate or ungate)
2320  *
2321  * The list of all the hardware IPs that make up the asic is walked and the
2322  * set_clockgating_state callbacks are run. In the late init path this
2323  * enables clockgating to save power; in the fini or suspend path it is
2324  * used to disable clockgating again.
2325  * Returns 0 on success, negative error code on failure.
2326  */
2327 
2328 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2329                       enum amd_clockgating_state state)
2330 {
2331     int i, j, r;
2332 
2333     if (amdgpu_emu_mode == 1)
2334         return 0;
2335 
2336     for (j = 0; j < adev->num_ip_blocks; j++) {
2337         i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2338         if (!adev->ip_blocks[i].status.late_initialized)
2339             continue;
2340         /* skip CG for VCE/UVD, it's handled specially */
2341         if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2342             adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2343             adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2344             adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2345             adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2346             /* enable clockgating to save power */
2347             r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2348                                           state);
2349             if (r) {
2350                 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2351                       adev->ip_blocks[i].version->funcs->name, r);
2352                 return r;
2353             }
2354         }
2355     }
2356 
2357     return 0;
2358 }
2359 
2360 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2361 {
2362     int i, j, r;
2363 
2364     if (amdgpu_emu_mode == 1)
2365         return 0;
2366 
2367     for (j = 0; j < adev->num_ip_blocks; j++) {
2368         i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2369         if (!adev->ip_blocks[i].status.late_initialized)
2370             continue;
2371         /* skip PG for VCE/UVD, it's handled specially */
2372         if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2373             adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2374             adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2375             adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2376             adev->ip_blocks[i].version->funcs->set_powergating_state) {
2377             /* enable powergating to save power */
2378             r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2379                                           state);
2380             if (r) {
2381                 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2382                       adev->ip_blocks[i].version->funcs->name, r);
2383                 return r;
2384             }
2385         }
2386     }
2387     return 0;
2388 }
2389 
2390 static int amdgpu_device_enable_mgpu_fan_boost(void)
2391 {
2392     struct amdgpu_gpu_instance *gpu_ins;
2393     struct amdgpu_device *adev;
2394     int i, ret = 0;
2395 
2396     mutex_lock(&mgpu_info.mutex);
2397 
2398     /*
2399      * MGPU fan boost feature should be enabled
2400      * only when there are two or more dGPUs in
2401      * the system
2402      */
2403     if (mgpu_info.num_dgpu < 2)
2404         goto out;
2405 
2406     for (i = 0; i < mgpu_info.num_dgpu; i++) {
2407         gpu_ins = &(mgpu_info.gpu_ins[i]);
2408         adev = gpu_ins->adev;
2409         if (!(adev->flags & AMD_IS_APU) &&
2410             !gpu_ins->mgpu_fan_enabled) {
2411             ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2412             if (ret)
2413                 break;
2414 
2415             gpu_ins->mgpu_fan_enabled = 1;
2416         }
2417     }
2418 
2419 out:
2420     mutex_unlock(&mgpu_info.mutex);
2421 
2422     return ret;
2423 }
2424 
2425 /**
2426  * amdgpu_device_ip_late_init - run late init for hardware IPs
2427  *
2428  * @adev: amdgpu_device pointer
2429  *
2430  * Late initialization pass for hardware IPs. The list of all the hardware
2431  * IPs that make up the asic is walked and the late_init callbacks are run.
2432  * late_init covers any special initialization that an IP requires
2433  * after all of the other IPs have been initialized, or anything that needs
2434  * to happen late in the init process.
2435  * Returns 0 on success, negative error code on failure.
2436  */
2437 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2438 {
2439     struct amdgpu_gpu_instance *gpu_instance;
2440     int i = 0, r;
2441 
2442     for (i = 0; i < adev->num_ip_blocks; i++) {
2443         if (!adev->ip_blocks[i].status.hw)
2444             continue;
2445         if (adev->ip_blocks[i].version->funcs->late_init) {
2446             r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2447             if (r) {
2448                 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2449                       adev->ip_blocks[i].version->funcs->name, r);
2450                 return r;
2451             }
2452         }
2453         adev->ip_blocks[i].status.late_initialized = true;
2454     }
2455 
2456     amdgpu_ras_set_error_query_ready(adev, true);
2457 
2458     amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2459     amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2460 
2461     amdgpu_device_fill_reset_magic(adev);
2462 
2463     r = amdgpu_device_enable_mgpu_fan_boost();
2464     if (r)
2465         DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2466 
2467 
2468     if (adev->gmc.xgmi.num_physical_nodes > 1) {
2469         mutex_lock(&mgpu_info.mutex);
2470 
2471         /*
2472          * Reset device p-state to low as this was booted with high.
2473          *
2474          * This should be performed only after all devices from the same
2475          * hive get initialized.
2476          *
2477          * However, the number of devices in a hive is not known in advance;
2478          * it is counted one by one as the devices initialize.
2479 * 2480 * So, we wait for all XGMI interlinked devices initialized. 2481 * This may bring some delays as those devices may come from 2482 * different hives. But that should be OK. 2483 */ 2484 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2485 for (i = 0; i < mgpu_info.num_gpu; i++) { 2486 gpu_instance = &(mgpu_info.gpu_ins[i]); 2487 if (gpu_instance->adev->flags & AMD_IS_APU) 2488 continue; 2489 2490 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2491 AMDGPU_XGMI_PSTATE_MIN); 2492 if (r) { 2493 DRM_ERROR("pstate setting failed (%d).\n", r); 2494 break; 2495 } 2496 } 2497 } 2498 2499 mutex_unlock(&mgpu_info.mutex); 2500 } 2501 2502 return 0; 2503 } 2504 2505 /** 2506 * amdgpu_device_ip_fini - run fini for hardware IPs 2507 * 2508 * @adev: amdgpu_device pointer 2509 * 2510 * Main teardown pass for hardware IPs. The list of all the hardware 2511 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2512 * are run. hw_fini tears down the hardware associated with each IP 2513 * and sw_fini tears down any software state associated with each IP. 2514 * Returns 0 on success, negative error code on failure. 2515 */ 2516 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2517 { 2518 int i, r; 2519 2520 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2521 amdgpu_virt_release_ras_err_handler_data(adev); 2522 2523 amdgpu_ras_pre_fini(adev); 2524 2525 if (adev->gmc.xgmi.num_physical_nodes > 1) 2526 amdgpu_xgmi_remove_device(adev); 2527 2528 amdgpu_amdkfd_device_fini(adev); 2529 2530 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2531 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2532 2533 /* need to disable SMC first */ 2534 for (i = 0; i < adev->num_ip_blocks; i++) { 2535 if (!adev->ip_blocks[i].status.hw) 2536 continue; 2537 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2538 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2539 /* XXX handle errors */ 2540 if (r) { 2541 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2542 adev->ip_blocks[i].version->funcs->name, r); 2543 } 2544 adev->ip_blocks[i].status.hw = false; 2545 break; 2546 } 2547 } 2548 2549 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2550 if (!adev->ip_blocks[i].status.hw) 2551 continue; 2552 2553 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2554 /* XXX handle errors */ 2555 if (r) { 2556 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2557 adev->ip_blocks[i].version->funcs->name, r); 2558 } 2559 2560 adev->ip_blocks[i].status.hw = false; 2561 } 2562 2563 2564 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2565 if (!adev->ip_blocks[i].status.sw) 2566 continue; 2567 2568 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2569 amdgpu_ucode_free_bo(adev); 2570 amdgpu_free_static_csa(&adev->virt.csa_obj); 2571 amdgpu_device_wb_fini(adev); 2572 amdgpu_device_vram_scratch_fini(adev); 2573 amdgpu_ib_pool_fini(adev); 2574 } 2575 2576 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2577 /* XXX handle errors */ 2578 if (r) { 2579 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2580 adev->ip_blocks[i].version->funcs->name, r); 2581 } 2582 adev->ip_blocks[i].status.sw = false; 2583 adev->ip_blocks[i].status.valid = false; 2584 } 2585 2586 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2587 if (!adev->ip_blocks[i].status.late_initialized) 2588 continue; 2589 if (adev->ip_blocks[i].version->funcs->late_fini) 2590 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2591 
adev->ip_blocks[i].status.late_initialized = false; 2592 } 2593 2594 amdgpu_ras_fini(adev); 2595 2596 if (amdgpu_sriov_vf(adev)) 2597 if (amdgpu_virt_release_full_gpu(adev, false)) 2598 DRM_ERROR("failed to release exclusive mode on fini\n"); 2599 2600 return 0; 2601 } 2602 2603 /** 2604 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2605 * 2606 * @work: work_struct. 2607 */ 2608 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2609 { 2610 struct amdgpu_device *adev = 2611 container_of(work, struct amdgpu_device, delayed_init_work.work); 2612 int r; 2613 2614 r = amdgpu_ib_ring_tests(adev); 2615 if (r) 2616 DRM_ERROR("ib ring test failed (%d).\n", r); 2617 } 2618 2619 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2620 { 2621 struct amdgpu_device *adev = 2622 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2623 2624 mutex_lock(&adev->gfx.gfx_off_mutex); 2625 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { 2626 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2627 adev->gfx.gfx_off_state = true; 2628 } 2629 mutex_unlock(&adev->gfx.gfx_off_mutex); 2630 } 2631 2632 /** 2633 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2634 * 2635 * @adev: amdgpu_device pointer 2636 * 2637 * Main suspend function for hardware IPs. The list of all the hardware 2638 * IPs that make up the asic is walked, clockgating is disabled and the 2639 * suspend callbacks are run. suspend puts the hardware and software state 2640 * in each IP into a state suitable for suspend. 2641 * Returns 0 on success, negative error code on failure. 2642 */ 2643 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2644 { 2645 int i, r; 2646 2647 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2648 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2649 2650 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2651 if (!adev->ip_blocks[i].status.valid) 2652 continue; 2653 2654 /* displays are handled separately */ 2655 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2656 continue; 2657 2658 /* XXX handle errors */ 2659 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2660 /* XXX handle errors */ 2661 if (r) { 2662 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2663 adev->ip_blocks[i].version->funcs->name, r); 2664 return r; 2665 } 2666 2667 adev->ip_blocks[i].status.hw = false; 2668 } 2669 2670 return 0; 2671 } 2672 2673 /** 2674 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2675 * 2676 * @adev: amdgpu_device pointer 2677 * 2678 * Main suspend function for hardware IPs. The list of all the hardware 2679 * IPs that make up the asic is walked, clockgating is disabled and the 2680 * suspend callbacks are run. suspend puts the hardware and software state 2681 * in each IP into a state suitable for suspend. 2682 * Returns 0 on success, negative error code on failure. 
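 *
 * Hedged usage sketch: in the suspend path further below (see
 * amdgpu_device_suspend()) phase 1 quiesces the display blocks first, VRAM
 * is evicted and the fence drivers are suspended, and only then does
 * phase 2 suspend the remaining blocks, followed by a second eviction for
 * the GART page table.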
2683 */ 2684 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2685 { 2686 int i, r; 2687 2688 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2689 if (!adev->ip_blocks[i].status.valid) 2690 continue; 2691 /* displays are handled in phase1 */ 2692 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2693 continue; 2694 /* PSP lost connection when err_event_athub occurs */ 2695 if (amdgpu_ras_intr_triggered() && 2696 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2697 adev->ip_blocks[i].status.hw = false; 2698 continue; 2699 } 2700 /* XXX handle errors */ 2701 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2702 /* XXX handle errors */ 2703 if (r) { 2704 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2705 adev->ip_blocks[i].version->funcs->name, r); 2706 } 2707 adev->ip_blocks[i].status.hw = false; 2708 /* handle putting the SMC in the appropriate state */ 2709 if(!amdgpu_sriov_vf(adev)){ 2710 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2711 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 2712 if (r) { 2713 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 2714 adev->mp1_state, r); 2715 return r; 2716 } 2717 } 2718 } 2719 adev->ip_blocks[i].status.hw = false; 2720 } 2721 2722 return 0; 2723 } 2724 2725 /** 2726 * amdgpu_device_ip_suspend - run suspend for hardware IPs 2727 * 2728 * @adev: amdgpu_device pointer 2729 * 2730 * Main suspend function for hardware IPs. The list of all the hardware 2731 * IPs that make up the asic is walked, clockgating is disabled and the 2732 * suspend callbacks are run. suspend puts the hardware and software state 2733 * in each IP into a state suitable for suspend. 2734 * Returns 0 on success, negative error code on failure. 2735 */ 2736 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 2737 { 2738 int r; 2739 2740 if (amdgpu_sriov_vf(adev)) 2741 amdgpu_virt_request_full_gpu(adev, false); 2742 2743 r = amdgpu_device_ip_suspend_phase1(adev); 2744 if (r) 2745 return r; 2746 r = amdgpu_device_ip_suspend_phase2(adev); 2747 2748 if (amdgpu_sriov_vf(adev)) 2749 amdgpu_virt_release_full_gpu(adev, false); 2750 2751 return r; 2752 } 2753 2754 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 2755 { 2756 int i, r; 2757 2758 static enum amd_ip_block_type ip_order[] = { 2759 AMD_IP_BLOCK_TYPE_GMC, 2760 AMD_IP_BLOCK_TYPE_COMMON, 2761 AMD_IP_BLOCK_TYPE_PSP, 2762 AMD_IP_BLOCK_TYPE_IH, 2763 }; 2764 2765 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2766 int j; 2767 struct amdgpu_ip_block *block; 2768 2769 block = &adev->ip_blocks[i]; 2770 block->status.hw = false; 2771 2772 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 2773 2774 if (block->version->type != ip_order[j] || 2775 !block->status.valid) 2776 continue; 2777 2778 r = block->version->funcs->hw_init(adev); 2779 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2780 if (r) 2781 return r; 2782 block->status.hw = true; 2783 } 2784 } 2785 2786 return 0; 2787 } 2788 2789 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 2790 { 2791 int i, r; 2792 2793 static enum amd_ip_block_type ip_order[] = { 2794 AMD_IP_BLOCK_TYPE_SMC, 2795 AMD_IP_BLOCK_TYPE_DCE, 2796 AMD_IP_BLOCK_TYPE_GFX, 2797 AMD_IP_BLOCK_TYPE_SDMA, 2798 AMD_IP_BLOCK_TYPE_UVD, 2799 AMD_IP_BLOCK_TYPE_VCE, 2800 AMD_IP_BLOCK_TYPE_VCN 2801 }; 2802 2803 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2804 int j; 2805 struct amdgpu_ip_block *block; 2806 2807 for (j = 0; j < adev->num_ip_blocks; j++) { 
2808 block = &adev->ip_blocks[j]; 2809 2810 if (block->version->type != ip_order[i] || 2811 !block->status.valid || 2812 block->status.hw) 2813 continue; 2814 2815 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 2816 r = block->version->funcs->resume(adev); 2817 else 2818 r = block->version->funcs->hw_init(adev); 2819 2820 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2821 if (r) 2822 return r; 2823 block->status.hw = true; 2824 } 2825 } 2826 2827 return 0; 2828 } 2829 2830 /** 2831 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 2832 * 2833 * @adev: amdgpu_device pointer 2834 * 2835 * First resume function for hardware IPs. The list of all the hardware 2836 * IPs that make up the asic is walked and the resume callbacks are run for 2837 * COMMON, GMC, and IH. resume puts the hardware into a functional state 2838 * after a suspend and updates the software state as necessary. This 2839 * function is also used for restoring the GPU after a GPU reset. 2840 * Returns 0 on success, negative error code on failure. 2841 */ 2842 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 2843 { 2844 int i, r; 2845 2846 for (i = 0; i < adev->num_ip_blocks; i++) { 2847 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2848 continue; 2849 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2850 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2851 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2852 2853 r = adev->ip_blocks[i].version->funcs->resume(adev); 2854 if (r) { 2855 DRM_ERROR("resume of IP block <%s> failed %d\n", 2856 adev->ip_blocks[i].version->funcs->name, r); 2857 return r; 2858 } 2859 adev->ip_blocks[i].status.hw = true; 2860 } 2861 } 2862 2863 return 0; 2864 } 2865 2866 /** 2867 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 2868 * 2869 * @adev: amdgpu_device pointer 2870 * 2871 * First resume function for hardware IPs. The list of all the hardware 2872 * IPs that make up the asic is walked and the resume callbacks are run for 2873 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 2874 * functional state after a suspend and updates the software state as 2875 * necessary. This function is also used for restoring the GPU after a GPU 2876 * reset. 2877 * Returns 0 on success, negative error code on failure. 2878 */ 2879 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 2880 { 2881 int i, r; 2882 2883 for (i = 0; i < adev->num_ip_blocks; i++) { 2884 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2885 continue; 2886 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2887 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2888 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 2889 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 2890 continue; 2891 r = adev->ip_blocks[i].version->funcs->resume(adev); 2892 if (r) { 2893 DRM_ERROR("resume of IP block <%s> failed %d\n", 2894 adev->ip_blocks[i].version->funcs->name, r); 2895 return r; 2896 } 2897 adev->ip_blocks[i].status.hw = true; 2898 } 2899 2900 return 0; 2901 } 2902 2903 /** 2904 * amdgpu_device_ip_resume - run resume for hardware IPs 2905 * 2906 * @adev: amdgpu_device pointer 2907 * 2908 * Main resume function for hardware IPs. 
The hardware IPs
2909  * are split into two resume functions because they are also used in
2910  * recovering from a GPU reset, and some additional steps need to be
2911  * taken between them. In this case (S3/S4) they are
2912  * run sequentially.
2913  * Returns 0 on success, negative error code on failure.
2914  */
2915 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
2916 {
2917     int r;
2918 
2919     r = amdgpu_device_ip_resume_phase1(adev);
2920     if (r)
2921         return r;
2922 
2923     r = amdgpu_device_fw_loading(adev);
2924     if (r)
2925         return r;
2926 
2927     r = amdgpu_device_ip_resume_phase2(adev);
2928 
2929     return r;
2930 }
2931 
2932 /**
2933  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2934  *
2935  * @adev: amdgpu_device pointer
2936  *
2937  * Query the VBIOS data tables to determine if the board supports SR-IOV.
2938  */
2939 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
2940 {
2941     if (amdgpu_sriov_vf(adev)) {
2942         if (adev->is_atom_fw) {
2943             if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2944                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2945         } else {
2946             if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2947                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2948         }
2949 
2950         if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2951             amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
2952     }
2953 }
2954 
2955 /**
2956  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2957  *
2958  * @asic_type: AMD asic type
2959  *
2960  * Check if there is DC (new modesetting infrastructure) support for an asic.
2961  * Returns true if DC has support, false if not.
2962  */
2963 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
2964 {
2965     switch (asic_type) {
2966 #if defined(CONFIG_DRM_AMD_DC)
2967 #if defined(CONFIG_DRM_AMD_DC_SI)
2968     case CHIP_TAHITI:
2969     case CHIP_PITCAIRN:
2970     case CHIP_VERDE:
2971     case CHIP_OLAND:
2972 #endif
2973     case CHIP_BONAIRE:
2974     case CHIP_KAVERI:
2975     case CHIP_KABINI:
2976     case CHIP_MULLINS:
2977         /*
2978          * We have systems in the wild with these ASICs that require
2979          * LVDS and VGA support which is not supported with DC.
2980          *
2981          * Fallback to the non-DC driver here by default so as not to
2982          * cause regressions.
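         *
         * Illustrative, hedged examples of the override (the option is the
         * existing amdgpu.dc module parameter; the values are shown only as
         * examples): booting with amdgpu.dc=1 opts these ASICs into DC (the
         * "amdgpu_dc > 0" test below), while amdgpu.dc=0 or the auto default
         * keeps them on the non-DC path.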
2983 */ 2984 return amdgpu_dc > 0; 2985 case CHIP_HAWAII: 2986 case CHIP_CARRIZO: 2987 case CHIP_STONEY: 2988 case CHIP_POLARIS10: 2989 case CHIP_POLARIS11: 2990 case CHIP_POLARIS12: 2991 case CHIP_VEGAM: 2992 case CHIP_TONGA: 2993 case CHIP_FIJI: 2994 case CHIP_VEGA10: 2995 case CHIP_VEGA12: 2996 case CHIP_VEGA20: 2997 #if defined(CONFIG_DRM_AMD_DC_DCN) 2998 case CHIP_RAVEN: 2999 case CHIP_NAVI10: 3000 case CHIP_NAVI14: 3001 case CHIP_NAVI12: 3002 case CHIP_RENOIR: 3003 #endif 3004 #if defined(CONFIG_DRM_AMD_DC_DCN3_0) 3005 case CHIP_SIENNA_CICHLID: 3006 case CHIP_NAVY_FLOUNDER: 3007 #endif 3008 return amdgpu_dc != 0; 3009 #endif 3010 default: 3011 if (amdgpu_dc > 0) 3012 DRM_INFO("Display Core has been requested via kernel parameter " 3013 "but isn't supported by ASIC, ignoring\n"); 3014 return false; 3015 } 3016 } 3017 3018 /** 3019 * amdgpu_device_has_dc_support - check if dc is supported 3020 * 3021 * @adev: amdgpu_device_pointer 3022 * 3023 * Returns true for supported, false for not supported 3024 */ 3025 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3026 { 3027 if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display) 3028 return false; 3029 3030 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3031 } 3032 3033 3034 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3035 { 3036 struct amdgpu_device *adev = 3037 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3038 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3039 3040 /* It's a bug to not have a hive within this function */ 3041 if (WARN_ON(!hive)) 3042 return; 3043 3044 /* 3045 * Use task barrier to synchronize all xgmi reset works across the 3046 * hive. task_barrier_enter and task_barrier_exit will block 3047 * until all the threads running the xgmi reset works reach 3048 * those points. task_barrier_full will do both blocks. 3049 */ 3050 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3051 3052 task_barrier_enter(&hive->tb); 3053 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3054 3055 if (adev->asic_reset_res) 3056 goto fail; 3057 3058 task_barrier_exit(&hive->tb); 3059 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3060 3061 if (adev->asic_reset_res) 3062 goto fail; 3063 3064 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count) 3065 adev->mmhub.funcs->reset_ras_error_count(adev); 3066 } else { 3067 3068 task_barrier_full(&hive->tb); 3069 adev->asic_reset_res = amdgpu_asic_reset(adev); 3070 } 3071 3072 fail: 3073 if (adev->asic_reset_res) 3074 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3075 adev->asic_reset_res, adev_to_drm(adev)->unique); 3076 amdgpu_put_xgmi_hive(hive); 3077 } 3078 3079 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3080 { 3081 char *input = amdgpu_lockup_timeout; 3082 char *timeout_setting = NULL; 3083 int index = 0; 3084 long timeout; 3085 int ret = 0; 3086 3087 /* 3088 * By default timeout for non compute jobs is 10000. 3089 * And there is no timeout enforced on compute jobs. 3090 * In SR-IOV or passthrough mode, timeout for compute 3091 * jobs are 60000 by default. 
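     *
     * Illustrative, hedged examples of the accepted format, matching the
     * parsing order implemented below (gfx, compute, sdma, video):
     *
     *   amdgpu.lockup_timeout=10000
     *       apply 10 seconds to all non-compute queues (and to compute as
     *       well under SR-IOV/passthrough);
     *   amdgpu.lockup_timeout=10000,60000,10000,10000
     *       set the gfx, compute, sdma and video timeouts individually.
     *
     * A value of 0 keeps the default for that slot and a negative value
     * means an infinite timeout.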
3092 */ 3093 adev->gfx_timeout = msecs_to_jiffies(10000); 3094 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3095 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3096 adev->compute_timeout = msecs_to_jiffies(60000); 3097 else 3098 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; 3099 3100 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3101 while ((timeout_setting = strsep(&input, ",")) && 3102 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3103 ret = kstrtol(timeout_setting, 0, &timeout); 3104 if (ret) 3105 return ret; 3106 3107 if (timeout == 0) { 3108 index++; 3109 continue; 3110 } else if (timeout < 0) { 3111 timeout = MAX_SCHEDULE_TIMEOUT; 3112 } else { 3113 timeout = msecs_to_jiffies(timeout); 3114 } 3115 3116 switch (index++) { 3117 case 0: 3118 adev->gfx_timeout = timeout; 3119 break; 3120 case 1: 3121 adev->compute_timeout = timeout; 3122 break; 3123 case 2: 3124 adev->sdma_timeout = timeout; 3125 break; 3126 case 3: 3127 adev->video_timeout = timeout; 3128 break; 3129 default: 3130 break; 3131 } 3132 } 3133 /* 3134 * There is only one value specified and 3135 * it should apply to all non-compute jobs. 3136 */ 3137 if (index == 1) { 3138 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3139 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3140 adev->compute_timeout = adev->gfx_timeout; 3141 } 3142 } 3143 3144 return ret; 3145 } 3146 3147 static const struct attribute *amdgpu_dev_attributes[] = { 3148 &dev_attr_product_name.attr, 3149 &dev_attr_product_number.attr, 3150 &dev_attr_serial_number.attr, 3151 &dev_attr_pcie_replay_count.attr, 3152 NULL 3153 }; 3154 3155 3156 /** 3157 * amdgpu_device_init - initialize the driver 3158 * 3159 * @adev: amdgpu_device pointer 3160 * @flags: driver flags 3161 * 3162 * Initializes the driver info and hw (all asics). 3163 * Returns 0 for success or an error on failure. 3164 * Called at driver startup. 
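 *
 * Hedged call sketch (the caller shown is assumed, not taken from this
 * file): the PCI probe path is expected to invoke this roughly as
 *
 *   r = amdgpu_device_init(adev, ent->driver_data);
 *
 * where driver_data from the PCI ID table encodes the asic type under
 * AMD_ASIC_MASK plus flags such as AMD_IS_APU, matching how @flags is
 * decoded at the top of this function.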
3165 */ 3166 int amdgpu_device_init(struct amdgpu_device *adev, 3167 uint32_t flags) 3168 { 3169 struct drm_device *ddev = adev_to_drm(adev); 3170 struct pci_dev *pdev = adev->pdev; 3171 int r, i; 3172 bool boco = false; 3173 u32 max_MBps; 3174 3175 adev->shutdown = false; 3176 adev->flags = flags; 3177 3178 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3179 adev->asic_type = amdgpu_force_asic_type; 3180 else 3181 adev->asic_type = flags & AMD_ASIC_MASK; 3182 3183 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3184 if (amdgpu_emu_mode == 1) 3185 adev->usec_timeout *= 10; 3186 adev->gmc.gart_size = 512 * 1024 * 1024; 3187 adev->accel_working = false; 3188 adev->num_rings = 0; 3189 adev->mman.buffer_funcs = NULL; 3190 adev->mman.buffer_funcs_ring = NULL; 3191 adev->vm_manager.vm_pte_funcs = NULL; 3192 adev->vm_manager.vm_pte_num_scheds = 0; 3193 adev->gmc.gmc_funcs = NULL; 3194 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3195 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3196 3197 adev->smc_rreg = &amdgpu_invalid_rreg; 3198 adev->smc_wreg = &amdgpu_invalid_wreg; 3199 adev->pcie_rreg = &amdgpu_invalid_rreg; 3200 adev->pcie_wreg = &amdgpu_invalid_wreg; 3201 adev->pciep_rreg = &amdgpu_invalid_rreg; 3202 adev->pciep_wreg = &amdgpu_invalid_wreg; 3203 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3204 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3205 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3206 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3207 adev->didt_rreg = &amdgpu_invalid_rreg; 3208 adev->didt_wreg = &amdgpu_invalid_wreg; 3209 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3210 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3211 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3212 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3213 3214 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3215 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3216 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3217 3218 /* mutex initialization are all done here so we 3219 * can recall function without having locking issues */ 3220 atomic_set(&adev->irq.ih.lock, 0); 3221 mutex_init(&adev->firmware.mutex); 3222 mutex_init(&adev->pm.mutex); 3223 mutex_init(&adev->gfx.gpu_clock_mutex); 3224 mutex_init(&adev->srbm_mutex); 3225 mutex_init(&adev->gfx.pipe_reserve_mutex); 3226 mutex_init(&adev->gfx.gfx_off_mutex); 3227 mutex_init(&adev->grbm_idx_mutex); 3228 mutex_init(&adev->mn_lock); 3229 mutex_init(&adev->virt.vf_errors.lock); 3230 hash_init(adev->mn_hash); 3231 atomic_set(&adev->in_gpu_reset, 0); 3232 init_rwsem(&adev->reset_sem); 3233 mutex_init(&adev->psp.mutex); 3234 mutex_init(&adev->notifier_lock); 3235 3236 r = amdgpu_device_check_arguments(adev); 3237 if (r) 3238 return r; 3239 3240 spin_lock_init(&adev->mmio_idx_lock); 3241 spin_lock_init(&adev->smc_idx_lock); 3242 spin_lock_init(&adev->pcie_idx_lock); 3243 spin_lock_init(&adev->uvd_ctx_idx_lock); 3244 spin_lock_init(&adev->didt_idx_lock); 3245 spin_lock_init(&adev->gc_cac_idx_lock); 3246 spin_lock_init(&adev->se_cac_idx_lock); 3247 spin_lock_init(&adev->audio_endpt_idx_lock); 3248 spin_lock_init(&adev->mm_stats.lock); 3249 3250 INIT_LIST_HEAD(&adev->shadow_list); 3251 mutex_init(&adev->shadow_list_lock); 3252 3253 INIT_DELAYED_WORK(&adev->delayed_init_work, 3254 amdgpu_device_delayed_init_work_handler); 3255 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3256 amdgpu_device_delay_enable_gfx_off); 3257 3258 
INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3259 3260 adev->gfx.gfx_off_req_count = 1; 3261 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3262 3263 atomic_set(&adev->throttling_logging_enabled, 1); 3264 /* 3265 * If throttling continues, logging will be performed every minute 3266 * to avoid log flooding. "-1" is subtracted since the thermal 3267 * throttling interrupt comes every second. Thus, the total logging 3268 * interval is 59 seconds(retelimited printk interval) + 1(waiting 3269 * for throttling interrupt) = 60 seconds. 3270 */ 3271 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3272 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3273 3274 /* Registers mapping */ 3275 /* TODO: block userspace mapping of io register */ 3276 if (adev->asic_type >= CHIP_BONAIRE) { 3277 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3278 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3279 } else { 3280 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3281 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3282 } 3283 3284 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3285 if (adev->rmmio == NULL) { 3286 return -ENOMEM; 3287 } 3288 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3289 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3290 3291 /* io port mapping */ 3292 for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { 3293 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) { 3294 adev->rio_mem_size = pci_resource_len(adev->pdev, i); 3295 adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size); 3296 break; 3297 } 3298 } 3299 if (adev->rio_mem == NULL) 3300 DRM_INFO("PCI I/O BAR is not found.\n"); 3301 3302 /* enable PCIE atomic ops */ 3303 r = pci_enable_atomic_ops_to_root(adev->pdev, 3304 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3305 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3306 if (r) { 3307 adev->have_atomics_support = false; 3308 DRM_INFO("PCIE atomic ops is not supported\n"); 3309 } else { 3310 adev->have_atomics_support = true; 3311 } 3312 3313 amdgpu_device_get_pcie_info(adev); 3314 3315 if (amdgpu_mcbp) 3316 DRM_INFO("MCBP is enabled\n"); 3317 3318 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) 3319 adev->enable_mes = true; 3320 3321 /* detect hw virtualization here */ 3322 amdgpu_detect_virtualization(adev); 3323 3324 r = amdgpu_device_get_job_timeout_settings(adev); 3325 if (r) { 3326 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3327 goto failed_unmap; 3328 } 3329 3330 /* early init functions */ 3331 r = amdgpu_device_ip_early_init(adev); 3332 if (r) 3333 goto failed_unmap; 3334 3335 /* doorbell bar mapping and doorbell index init*/ 3336 amdgpu_device_doorbell_init(adev); 3337 3338 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3339 /* this will fail for cards that aren't VGA class devices, just 3340 * ignore it */ 3341 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode); 3342 3343 if (amdgpu_device_supports_boco(ddev)) 3344 boco = true; 3345 if (amdgpu_has_atpx() && 3346 (amdgpu_is_atpx_hybrid() || 3347 amdgpu_has_atpx_dgpu_power_cntl()) && 3348 !pci_is_thunderbolt_attached(adev->pdev)) 3349 vga_switcheroo_register_client(adev->pdev, 3350 &amdgpu_switcheroo_ops, boco); 3351 if (boco) 3352 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3353 3354 if (amdgpu_emu_mode == 1) { 3355 /* post the asic on emulation mode */ 3356 emu_soc_asic_init(adev); 3357 goto 
fence_driver_init; 3358 } 3359 3360 /* detect if we are with an SRIOV vbios */ 3361 amdgpu_device_detect_sriov_bios(adev); 3362 3363 /* check if we need to reset the asic 3364 * E.g., driver was not cleanly unloaded previously, etc. 3365 */ 3366 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3367 r = amdgpu_asic_reset(adev); 3368 if (r) { 3369 dev_err(adev->dev, "asic reset on init failed\n"); 3370 goto failed; 3371 } 3372 } 3373 3374 pci_enable_pcie_error_reporting(adev->ddev.pdev); 3375 3376 /* Post card if necessary */ 3377 if (amdgpu_device_need_post(adev)) { 3378 if (!adev->bios) { 3379 dev_err(adev->dev, "no vBIOS found\n"); 3380 r = -EINVAL; 3381 goto failed; 3382 } 3383 DRM_INFO("GPU posting now...\n"); 3384 r = amdgpu_device_asic_init(adev); 3385 if (r) { 3386 dev_err(adev->dev, "gpu post error!\n"); 3387 goto failed; 3388 } 3389 } 3390 3391 if (adev->is_atom_fw) { 3392 /* Initialize clocks */ 3393 r = amdgpu_atomfirmware_get_clock_info(adev); 3394 if (r) { 3395 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3396 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3397 goto failed; 3398 } 3399 } else { 3400 /* Initialize clocks */ 3401 r = amdgpu_atombios_get_clock_info(adev); 3402 if (r) { 3403 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3404 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3405 goto failed; 3406 } 3407 /* init i2c buses */ 3408 if (!amdgpu_device_has_dc_support(adev)) 3409 amdgpu_atombios_i2c_init(adev); 3410 } 3411 3412 fence_driver_init: 3413 /* Fence driver */ 3414 r = amdgpu_fence_driver_init(adev); 3415 if (r) { 3416 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n"); 3417 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3418 goto failed; 3419 } 3420 3421 /* init the mode config */ 3422 drm_mode_config_init(adev_to_drm(adev)); 3423 3424 r = amdgpu_device_ip_init(adev); 3425 if (r) { 3426 /* failed in exclusive mode due to timeout */ 3427 if (amdgpu_sriov_vf(adev) && 3428 !amdgpu_sriov_runtime(adev) && 3429 amdgpu_virt_mmio_blocked(adev) && 3430 !amdgpu_virt_wait_reset(adev)) { 3431 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3432 /* Don't send request since VF is inactive. */ 3433 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3434 adev->virt.ops = NULL; 3435 r = -EAGAIN; 3436 goto failed; 3437 } 3438 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3439 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3440 goto failed; 3441 } 3442 3443 dev_info(adev->dev, 3444 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3445 adev->gfx.config.max_shader_engines, 3446 adev->gfx.config.max_sh_per_se, 3447 adev->gfx.config.max_cu_per_sh, 3448 adev->gfx.cu_info.number); 3449 3450 adev->accel_working = true; 3451 3452 amdgpu_vm_check_compute_bug(adev); 3453 3454 /* Initialize the buffer migration limit. */ 3455 if (amdgpu_moverate >= 0) 3456 max_MBps = amdgpu_moverate; 3457 else 3458 max_MBps = 8; /* Allow 8 MB/s. */ 3459 /* Get a log2 for easy divisions. 
*/ 3460 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3461 3462 amdgpu_fbdev_init(adev); 3463 3464 r = amdgpu_pm_sysfs_init(adev); 3465 if (r) { 3466 adev->pm_sysfs_en = false; 3467 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3468 } else 3469 adev->pm_sysfs_en = true; 3470 3471 r = amdgpu_ucode_sysfs_init(adev); 3472 if (r) { 3473 adev->ucode_sysfs_en = false; 3474 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3475 } else 3476 adev->ucode_sysfs_en = true; 3477 3478 if ((amdgpu_testing & 1)) { 3479 if (adev->accel_working) 3480 amdgpu_test_moves(adev); 3481 else 3482 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n"); 3483 } 3484 if (amdgpu_benchmarking) { 3485 if (adev->accel_working) 3486 amdgpu_benchmark(adev, amdgpu_benchmarking); 3487 else 3488 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); 3489 } 3490 3491 /* 3492 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3493 * Otherwise the mgpu fan boost feature will be skipped due to the 3494 * gpu instance is counted less. 3495 */ 3496 amdgpu_register_gpu_instance(adev); 3497 3498 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3499 * explicit gating rather than handling it automatically. 3500 */ 3501 r = amdgpu_device_ip_late_init(adev); 3502 if (r) { 3503 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3504 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3505 goto failed; 3506 } 3507 3508 /* must succeed. */ 3509 amdgpu_ras_resume(adev); 3510 3511 queue_delayed_work(system_wq, &adev->delayed_init_work, 3512 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3513 3514 if (amdgpu_sriov_vf(adev)) 3515 flush_delayed_work(&adev->delayed_init_work); 3516 3517 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3518 if (r) 3519 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3520 3521 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3522 r = amdgpu_pmu_init(adev); 3523 if (r) 3524 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3525 3526 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3527 if (amdgpu_device_cache_pci_state(adev->pdev)) 3528 pci_restore_state(pdev); 3529 3530 return 0; 3531 3532 failed: 3533 amdgpu_vf_error_trans_all(adev); 3534 if (boco) 3535 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3536 3537 failed_unmap: 3538 iounmap(adev->rmmio); 3539 adev->rmmio = NULL; 3540 3541 return r; 3542 } 3543 3544 /** 3545 * amdgpu_device_fini - tear down the driver 3546 * 3547 * @adev: amdgpu_device pointer 3548 * 3549 * Tear down the driver info (all asics). 3550 * Called at driver shutdown. 
3551 */ 3552 void amdgpu_device_fini(struct amdgpu_device *adev) 3553 { 3554 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3555 flush_delayed_work(&adev->delayed_init_work); 3556 adev->shutdown = true; 3557 3558 kfree(adev->pci_state); 3559 3560 /* make sure IB test finished before entering exclusive mode 3561 * to avoid preemption on IB test 3562 * */ 3563 if (amdgpu_sriov_vf(adev)) { 3564 amdgpu_virt_request_full_gpu(adev, false); 3565 amdgpu_virt_fini_data_exchange(adev); 3566 } 3567 3568 /* disable all interrupts */ 3569 amdgpu_irq_disable_all(adev); 3570 if (adev->mode_info.mode_config_initialized){ 3571 if (!amdgpu_device_has_dc_support(adev)) 3572 drm_helper_force_disable_all(adev_to_drm(adev)); 3573 else 3574 drm_atomic_helper_shutdown(adev_to_drm(adev)); 3575 } 3576 amdgpu_fence_driver_fini(adev); 3577 if (adev->pm_sysfs_en) 3578 amdgpu_pm_sysfs_fini(adev); 3579 amdgpu_fbdev_fini(adev); 3580 amdgpu_device_ip_fini(adev); 3581 release_firmware(adev->firmware.gpu_info_fw); 3582 adev->firmware.gpu_info_fw = NULL; 3583 adev->accel_working = false; 3584 /* free i2c buses */ 3585 if (!amdgpu_device_has_dc_support(adev)) 3586 amdgpu_i2c_fini(adev); 3587 3588 if (amdgpu_emu_mode != 1) 3589 amdgpu_atombios_fini(adev); 3590 3591 kfree(adev->bios); 3592 adev->bios = NULL; 3593 if (amdgpu_has_atpx() && 3594 (amdgpu_is_atpx_hybrid() || 3595 amdgpu_has_atpx_dgpu_power_cntl()) && 3596 !pci_is_thunderbolt_attached(adev->pdev)) 3597 vga_switcheroo_unregister_client(adev->pdev); 3598 if (amdgpu_device_supports_boco(adev_to_drm(adev))) 3599 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3600 vga_client_register(adev->pdev, NULL, NULL, NULL); 3601 if (adev->rio_mem) 3602 pci_iounmap(adev->pdev, adev->rio_mem); 3603 adev->rio_mem = NULL; 3604 iounmap(adev->rmmio); 3605 adev->rmmio = NULL; 3606 amdgpu_device_doorbell_fini(adev); 3607 3608 if (adev->ucode_sysfs_en) 3609 amdgpu_ucode_sysfs_fini(adev); 3610 3611 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 3612 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3613 amdgpu_pmu_fini(adev); 3614 if (adev->mman.discovery_bin) 3615 amdgpu_discovery_fini(adev); 3616 } 3617 3618 3619 /* 3620 * Suspend & resume. 3621 */ 3622 /** 3623 * amdgpu_device_suspend - initiate device suspend 3624 * 3625 * @dev: drm dev pointer 3626 * @fbcon : notify the fbdev of suspend 3627 * 3628 * Puts the hw in the suspend state (all asics). 3629 * Returns 0 for success or an error on failure. 3630 * Called at driver suspend. 
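 *
 * Hedged usage sketch (callers are assumed, not shown in this file): a
 * system sleep callback would typically call
 *
 *   amdgpu_device_suspend(drm_dev, true);
 *
 * so the fbdev console is notified, while a runtime-PM style caller can
 * pass fbcon=false to leave the console untouched.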
3631 */ 3632 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 3633 { 3634 struct amdgpu_device *adev; 3635 struct drm_crtc *crtc; 3636 struct drm_connector *connector; 3637 struct drm_connector_list_iter iter; 3638 int r; 3639 3640 adev = drm_to_adev(dev); 3641 3642 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3643 return 0; 3644 3645 adev->in_suspend = true; 3646 drm_kms_helper_poll_disable(dev); 3647 3648 if (fbcon) 3649 amdgpu_fbdev_set_suspend(adev, 1); 3650 3651 cancel_delayed_work_sync(&adev->delayed_init_work); 3652 3653 if (!amdgpu_device_has_dc_support(adev)) { 3654 /* turn off display hw */ 3655 drm_modeset_lock_all(dev); 3656 drm_connector_list_iter_begin(dev, &iter); 3657 drm_for_each_connector_iter(connector, &iter) 3658 drm_helper_connector_dpms(connector, 3659 DRM_MODE_DPMS_OFF); 3660 drm_connector_list_iter_end(&iter); 3661 drm_modeset_unlock_all(dev); 3662 /* unpin the front buffers and cursors */ 3663 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3664 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3665 struct drm_framebuffer *fb = crtc->primary->fb; 3666 struct amdgpu_bo *robj; 3667 3668 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3669 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3670 r = amdgpu_bo_reserve(aobj, true); 3671 if (r == 0) { 3672 amdgpu_bo_unpin(aobj); 3673 amdgpu_bo_unreserve(aobj); 3674 } 3675 } 3676 3677 if (fb == NULL || fb->obj[0] == NULL) { 3678 continue; 3679 } 3680 robj = gem_to_amdgpu_bo(fb->obj[0]); 3681 /* don't unpin kernel fb objects */ 3682 if (!amdgpu_fbdev_robj_is_fb(adev, robj)) { 3683 r = amdgpu_bo_reserve(robj, true); 3684 if (r == 0) { 3685 amdgpu_bo_unpin(robj); 3686 amdgpu_bo_unreserve(robj); 3687 } 3688 } 3689 } 3690 } 3691 3692 amdgpu_ras_suspend(adev); 3693 3694 r = amdgpu_device_ip_suspend_phase1(adev); 3695 3696 amdgpu_amdkfd_suspend(adev, !fbcon); 3697 3698 /* evict vram memory */ 3699 amdgpu_bo_evict_vram(adev); 3700 3701 amdgpu_fence_driver_suspend(adev); 3702 3703 r = amdgpu_device_ip_suspend_phase2(adev); 3704 3705 /* evict remaining vram memory 3706 * This second call to evict vram is to evict the gart page table 3707 * using the CPU. 3708 */ 3709 amdgpu_bo_evict_vram(adev); 3710 3711 return 0; 3712 } 3713 3714 /** 3715 * amdgpu_device_resume - initiate device resume 3716 * 3717 * @dev: drm dev pointer 3718 * @fbcon : notify the fbdev of resume 3719 * 3720 * Bring the hw back to operating state (all asics). 3721 * Returns 0 for success or an error on failure. 3722 * Called at driver resume. 
3723 */ 3724 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 3725 { 3726 struct drm_connector *connector; 3727 struct drm_connector_list_iter iter; 3728 struct amdgpu_device *adev = drm_to_adev(dev); 3729 struct drm_crtc *crtc; 3730 int r = 0; 3731 3732 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3733 return 0; 3734 3735 /* post card */ 3736 if (amdgpu_device_need_post(adev)) { 3737 r = amdgpu_device_asic_init(adev); 3738 if (r) 3739 dev_err(adev->dev, "amdgpu asic init failed\n"); 3740 } 3741 3742 r = amdgpu_device_ip_resume(adev); 3743 if (r) { 3744 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 3745 return r; 3746 } 3747 amdgpu_fence_driver_resume(adev); 3748 3749 3750 r = amdgpu_device_ip_late_init(adev); 3751 if (r) 3752 return r; 3753 3754 queue_delayed_work(system_wq, &adev->delayed_init_work, 3755 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3756 3757 if (!amdgpu_device_has_dc_support(adev)) { 3758 /* pin cursors */ 3759 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3760 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3761 3762 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3763 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3764 r = amdgpu_bo_reserve(aobj, true); 3765 if (r == 0) { 3766 r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM); 3767 if (r != 0) 3768 dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r); 3769 amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj); 3770 amdgpu_bo_unreserve(aobj); 3771 } 3772 } 3773 } 3774 } 3775 r = amdgpu_amdkfd_resume(adev, !fbcon); 3776 if (r) 3777 return r; 3778 3779 /* Make sure IB tests flushed */ 3780 flush_delayed_work(&adev->delayed_init_work); 3781 3782 /* blat the mode back in */ 3783 if (fbcon) { 3784 if (!amdgpu_device_has_dc_support(adev)) { 3785 /* pre DCE11 */ 3786 drm_helper_resume_force_mode(dev); 3787 3788 /* turn on display hw */ 3789 drm_modeset_lock_all(dev); 3790 3791 drm_connector_list_iter_begin(dev, &iter); 3792 drm_for_each_connector_iter(connector, &iter) 3793 drm_helper_connector_dpms(connector, 3794 DRM_MODE_DPMS_ON); 3795 drm_connector_list_iter_end(&iter); 3796 3797 drm_modeset_unlock_all(dev); 3798 } 3799 amdgpu_fbdev_set_suspend(adev, 0); 3800 } 3801 3802 drm_kms_helper_poll_enable(dev); 3803 3804 amdgpu_ras_resume(adev); 3805 3806 /* 3807 * Most of the connector probing functions try to acquire runtime pm 3808 * refs to ensure that the GPU is powered on when connector polling is 3809 * performed. Since we're calling this from a runtime PM callback, 3810 * trying to acquire rpm refs will cause us to deadlock. 3811 * 3812 * Since we're guaranteed to be holding the rpm lock, it's safe to 3813 * temporarily disable the rpm helpers so this doesn't deadlock us. 3814 */ 3815 #ifdef CONFIG_PM 3816 dev->dev->power.disable_depth++; 3817 #endif 3818 if (!amdgpu_device_has_dc_support(adev)) 3819 drm_helper_hpd_irq_event(dev); 3820 else 3821 drm_kms_helper_hotplug_event(dev); 3822 #ifdef CONFIG_PM 3823 dev->dev->power.disable_depth--; 3824 #endif 3825 adev->in_suspend = false; 3826 3827 return 0; 3828 } 3829 3830 /** 3831 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 3832 * 3833 * @adev: amdgpu_device pointer 3834 * 3835 * The list of all the hardware IPs that make up the asic is walked and 3836 * the check_soft_reset callbacks are run. check_soft_reset determines 3837 * if the asic is still hung or not. 3838 * Returns true if any of the IPs are still in a hung state, false if not. 
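 *
 * Hedged sketch of how the soft-reset helpers below are expected to be
 * tied together by the GPU recovery path (simplified, not verbatim):
 *
 *   if (!amdgpu_device_ip_need_full_reset(adev)) {
 *       amdgpu_device_ip_pre_soft_reset(adev);
 *       r = amdgpu_device_ip_soft_reset(adev);
 *       amdgpu_device_ip_post_soft_reset(adev);
 *       if (r || amdgpu_device_ip_check_soft_reset(adev))
 *           need_full_reset = true;
 *   }
 *
 * with a full ASIC reset used when a soft reset is not sufficient.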
3839 */ 3840 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 3841 { 3842 int i; 3843 bool asic_hang = false; 3844 3845 if (amdgpu_sriov_vf(adev)) 3846 return true; 3847 3848 if (amdgpu_asic_need_full_reset(adev)) 3849 return true; 3850 3851 for (i = 0; i < adev->num_ip_blocks; i++) { 3852 if (!adev->ip_blocks[i].status.valid) 3853 continue; 3854 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 3855 adev->ip_blocks[i].status.hang = 3856 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 3857 if (adev->ip_blocks[i].status.hang) { 3858 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 3859 asic_hang = true; 3860 } 3861 } 3862 return asic_hang; 3863 } 3864 3865 /** 3866 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 3867 * 3868 * @adev: amdgpu_device pointer 3869 * 3870 * The list of all the hardware IPs that make up the asic is walked and the 3871 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 3872 * handles any IP specific hardware or software state changes that are 3873 * necessary for a soft reset to succeed. 3874 * Returns 0 on success, negative error code on failure. 3875 */ 3876 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 3877 { 3878 int i, r = 0; 3879 3880 for (i = 0; i < adev->num_ip_blocks; i++) { 3881 if (!adev->ip_blocks[i].status.valid) 3882 continue; 3883 if (adev->ip_blocks[i].status.hang && 3884 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 3885 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 3886 if (r) 3887 return r; 3888 } 3889 } 3890 3891 return 0; 3892 } 3893 3894 /** 3895 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 3896 * 3897 * @adev: amdgpu_device pointer 3898 * 3899 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 3900 * reset is necessary to recover. 3901 * Returns true if a full asic reset is required, false if not. 3902 */ 3903 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 3904 { 3905 int i; 3906 3907 if (amdgpu_asic_need_full_reset(adev)) 3908 return true; 3909 3910 for (i = 0; i < adev->num_ip_blocks; i++) { 3911 if (!adev->ip_blocks[i].status.valid) 3912 continue; 3913 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 3914 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 3915 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 3916 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 3917 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3918 if (adev->ip_blocks[i].status.hang) { 3919 dev_info(adev->dev, "Some block need full reset!\n"); 3920 return true; 3921 } 3922 } 3923 } 3924 return false; 3925 } 3926 3927 /** 3928 * amdgpu_device_ip_soft_reset - do a soft reset 3929 * 3930 * @adev: amdgpu_device pointer 3931 * 3932 * The list of all the hardware IPs that make up the asic is walked and the 3933 * soft_reset callbacks are run if the block is hung. soft_reset handles any 3934 * IP specific hardware or software state changes that are necessary to soft 3935 * reset the IP. 3936 * Returns 0 on success, negative error code on failure. 
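 *
 * The usual sequence, as driven by amdgpu_device_pre_asic_reset() below, is
 * check_soft_reset -> pre_soft_reset -> soft_reset -> post_soft_reset, with
 * a fallback to a full ASIC reset if the hang persists afterwards.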
3937 */ 3938 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 3939 { 3940 int i, r = 0; 3941 3942 for (i = 0; i < adev->num_ip_blocks; i++) { 3943 if (!adev->ip_blocks[i].status.valid) 3944 continue; 3945 if (adev->ip_blocks[i].status.hang && 3946 adev->ip_blocks[i].version->funcs->soft_reset) { 3947 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 3948 if (r) 3949 return r; 3950 } 3951 } 3952 3953 return 0; 3954 } 3955 3956 /** 3957 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 3958 * 3959 * @adev: amdgpu_device pointer 3960 * 3961 * The list of all the hardware IPs that make up the asic is walked and the 3962 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 3963 * handles any IP specific hardware or software state changes that are 3964 * necessary after the IP has been soft reset. 3965 * Returns 0 on success, negative error code on failure. 3966 */ 3967 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 3968 { 3969 int i, r = 0; 3970 3971 for (i = 0; i < adev->num_ip_blocks; i++) { 3972 if (!adev->ip_blocks[i].status.valid) 3973 continue; 3974 if (adev->ip_blocks[i].status.hang && 3975 adev->ip_blocks[i].version->funcs->post_soft_reset) 3976 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 3977 if (r) 3978 return r; 3979 } 3980 3981 return 0; 3982 } 3983 3984 /** 3985 * amdgpu_device_recover_vram - Recover some VRAM contents 3986 * 3987 * @adev: amdgpu_device pointer 3988 * 3989 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 3990 * restore things like GPUVM page tables after a GPU reset where 3991 * the contents of VRAM might be lost. 3992 * 3993 * Returns: 3994 * 0 on success, negative error code on failure. 3995 */ 3996 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 3997 { 3998 struct dma_fence *fence = NULL, *next = NULL; 3999 struct amdgpu_bo *shadow; 4000 long r = 1, tmo; 4001 4002 if (amdgpu_sriov_runtime(adev)) 4003 tmo = msecs_to_jiffies(8000); 4004 else 4005 tmo = msecs_to_jiffies(100); 4006 4007 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4008 mutex_lock(&adev->shadow_list_lock); 4009 list_for_each_entry(shadow, &adev->shadow_list, shadow_list) { 4010 4011 /* No need to recover an evicted BO */ 4012 if (shadow->tbo.mem.mem_type != TTM_PL_TT || 4013 shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET || 4014 shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM) 4015 continue; 4016 4017 r = amdgpu_bo_restore_shadow(shadow, &next); 4018 if (r) 4019 break; 4020 4021 if (fence) { 4022 tmo = dma_fence_wait_timeout(fence, false, tmo); 4023 dma_fence_put(fence); 4024 fence = next; 4025 if (tmo == 0) { 4026 r = -ETIMEDOUT; 4027 break; 4028 } else if (tmo < 0) { 4029 r = tmo; 4030 break; 4031 } 4032 } else { 4033 fence = next; 4034 } 4035 } 4036 mutex_unlock(&adev->shadow_list_lock); 4037 4038 if (fence) 4039 tmo = dma_fence_wait_timeout(fence, false, tmo); 4040 dma_fence_put(fence); 4041 4042 if (r < 0 || tmo <= 0) { 4043 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4044 return -EIO; 4045 } 4046 4047 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4048 return 0; 4049 } 4050 4051 4052 /** 4053 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4054 * 4055 * @adev: amdgpu device pointer 4056 * @from_hypervisor: request from hypervisor 4057 * 4058 * do VF FLR and reinitialize Asic 4059 * return 0 means succeeded otherwise failed 4060 */ 4061 static int 
amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4062 bool from_hypervisor) 4063 { 4064 int r; 4065 4066 if (from_hypervisor) 4067 r = amdgpu_virt_request_full_gpu(adev, true); 4068 else 4069 r = amdgpu_virt_reset_gpu(adev); 4070 if (r) 4071 return r; 4072 4073 amdgpu_amdkfd_pre_reset(adev); 4074 4075 /* Resume IP prior to SMC */ 4076 r = amdgpu_device_ip_reinit_early_sriov(adev); 4077 if (r) 4078 goto error; 4079 4080 amdgpu_virt_init_data_exchange(adev); 4081 /* we need recover gart prior to run SMC/CP/SDMA resume */ 4082 amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT)); 4083 4084 r = amdgpu_device_fw_loading(adev); 4085 if (r) 4086 return r; 4087 4088 /* now we are okay to resume SMC/CP/SDMA */ 4089 r = amdgpu_device_ip_reinit_late_sriov(adev); 4090 if (r) 4091 goto error; 4092 4093 amdgpu_irq_gpu_reset_resume_helper(adev); 4094 r = amdgpu_ib_ring_tests(adev); 4095 amdgpu_amdkfd_post_reset(adev); 4096 4097 error: 4098 amdgpu_virt_release_full_gpu(adev, true); 4099 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4100 amdgpu_inc_vram_lost(adev); 4101 r = amdgpu_device_recover_vram(adev); 4102 } 4103 4104 return r; 4105 } 4106 4107 /** 4108 * amdgpu_device_has_job_running - check if there is any job in mirror list 4109 * 4110 * @adev: amdgpu device pointer 4111 * 4112 * check if there is any job in mirror list 4113 */ 4114 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4115 { 4116 int i; 4117 struct drm_sched_job *job; 4118 4119 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4120 struct amdgpu_ring *ring = adev->rings[i]; 4121 4122 if (!ring || !ring->sched.thread) 4123 continue; 4124 4125 spin_lock(&ring->sched.job_list_lock); 4126 job = list_first_entry_or_null(&ring->sched.ring_mirror_list, 4127 struct drm_sched_job, node); 4128 spin_unlock(&ring->sched.job_list_lock); 4129 if (job) 4130 return true; 4131 } 4132 return false; 4133 } 4134 4135 /** 4136 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4137 * 4138 * @adev: amdgpu device pointer 4139 * 4140 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4141 * a hung GPU. 
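 *
 * The decision is driven by the amdgpu_gpu_recovery module parameter:
 * 0 disables recovery entirely, -1 (the default) enables it only for the
 * ASIC families listed in the switch below, and any other value forces it
 * on. SR-IOV VFs always attempt recovery. (This is a summary of the logic
 * that follows; the function body is authoritative.)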
 */
bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
{
	if (!amdgpu_device_ip_check_soft_reset(adev)) {
		dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
		return false;
	}

	if (amdgpu_gpu_recovery == 0)
		goto disabled;

	if (amdgpu_sriov_vf(adev))
		return true;

	if (amdgpu_gpu_recovery == -1) {
		switch (adev->asic_type) {
		case CHIP_BONAIRE:
		case CHIP_HAWAII:
		case CHIP_TOPAZ:
		case CHIP_TONGA:
		case CHIP_FIJI:
		case CHIP_POLARIS10:
		case CHIP_POLARIS11:
		case CHIP_POLARIS12:
		case CHIP_VEGAM:
		case CHIP_VEGA20:
		case CHIP_VEGA10:
		case CHIP_VEGA12:
		case CHIP_RAVEN:
		case CHIP_ARCTURUS:
		case CHIP_RENOIR:
		case CHIP_NAVI10:
		case CHIP_NAVI14:
		case CHIP_NAVI12:
		case CHIP_SIENNA_CICHLID:
			break;
		default:
			goto disabled;
		}
	}

	return true;

disabled:
	dev_info(adev->dev, "GPU recovery disabled.\n");
	return false;
}


static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
					struct amdgpu_job *job,
					bool *need_full_reset_arg)
{
	int i, r = 0;
	bool need_full_reset = *need_full_reset_arg;

	amdgpu_debugfs_wait_dump(adev);

	if (amdgpu_sriov_vf(adev)) {
		/* stop the data exchange thread */
		amdgpu_virt_fini_data_exchange(adev);
	}

	/* block all schedulers and reset given job's ring */
	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
		amdgpu_fence_driver_force_completion(ring);
	}

	if (job)
		drm_sched_increase_karma(&job->base);

	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
	if (!amdgpu_sriov_vf(adev)) {

		if (!need_full_reset)
			need_full_reset = amdgpu_device_ip_need_full_reset(adev);

		if (!need_full_reset) {
			amdgpu_device_ip_pre_soft_reset(adev);
			r = amdgpu_device_ip_soft_reset(adev);
			amdgpu_device_ip_post_soft_reset(adev);
			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
				dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
				need_full_reset = true;
			}
		}

		if (need_full_reset)
			r = amdgpu_device_ip_suspend(adev);

		*need_full_reset_arg = need_full_reset;
	}

	return r;
}
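
/*
 * The bare-metal recovery path is split in two stages (see
 * amdgpu_device_gpu_recover() below): amdgpu_device_pre_asic_reset() runs
 * per device to quiesce the schedulers and attempt a soft reset, while
 * amdgpu_do_asic_reset() performs the full ASIC reset and re-init for the
 * whole device list when a soft reset is not sufficient. Roughly (a
 * condensed, illustrative sketch of the caller with error handling
 * omitted):
 *
 *	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head)
 *		amdgpu_device_pre_asic_reset(tmp_adev, NULL, &need_full_reset);
 *	amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
 */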

static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
				struct list_head *device_list_handle,
				bool *need_full_reset_arg,
				bool skip_hw_reset)
{
	struct amdgpu_device *tmp_adev = NULL;
	bool need_full_reset = *need_full_reset_arg, vram_lost = false;
	int r = 0;

	/*
	 * ASIC reset has to be done on all XGMI hive nodes ASAP
	 * to allow proper links negotiation in FW (within 1 sec)
	 */
	if (!skip_hw_reset && need_full_reset) {
		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
			/* For XGMI run all resets in parallel to speed up the process */
			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
					r = -EALREADY;
			} else
				r = amdgpu_asic_reset(tmp_adev);

			if (r) {
				dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
					r, adev_to_drm(tmp_adev)->unique);
				break;
			}
		}

		/* For XGMI wait for all resets to complete before proceeding */
		if (!r) {
			list_for_each_entry(tmp_adev, device_list_handle,
					    gmc.xgmi.head) {
				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
					flush_work(&tmp_adev->xgmi_reset_work);
					r = tmp_adev->asic_reset_res;
					if (r)
						break;
				}
			}
		}
	}

	if (!r && amdgpu_ras_intr_triggered()) {
		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
			if (tmp_adev->mmhub.funcs &&
			    tmp_adev->mmhub.funcs->reset_ras_error_count)
				tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
		}

		amdgpu_ras_intr_cleared();
	}

	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		if (need_full_reset) {
			/* post card */
			if (amdgpu_device_asic_init(tmp_adev))
				dev_warn(tmp_adev->dev, "asic atom init failed!");

			if (!r) {
				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
				r = amdgpu_device_ip_resume_phase1(tmp_adev);
				if (r)
					goto out;

				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
				if (vram_lost) {
					DRM_INFO("VRAM is lost due to GPU reset!\n");
					amdgpu_inc_vram_lost(tmp_adev);
				}

				r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
				if (r)
					goto out;

				r = amdgpu_device_fw_loading(tmp_adev);
				if (r)
					return r;

				r = amdgpu_device_ip_resume_phase2(tmp_adev);
				if (r)
					goto out;

				if (vram_lost)
					amdgpu_device_fill_reset_magic(tmp_adev);

				/*
				 * Add this ASIC back to the tracked list now
				 * that the reset has completed successfully.
				 */
				amdgpu_register_gpu_instance(tmp_adev);

				r = amdgpu_device_ip_late_init(tmp_adev);
				if (r)
					goto out;

				amdgpu_fbdev_set_suspend(tmp_adev, 0);

				/*
				 * Once the number of pages retired due to ECC
				 * faults reaches the bad-page threshold, the GPU
				 * is in a bad state and RAS recovery is scheduled
				 * next. Check for that here and abort the recovery
				 * if the threshold has indeed been exceeded, so the
				 * user is reminded to either retire this GPU or
				 * raise bad_page_threshold the next time the
				 * driver is probed.
				 */
				if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
					/* must succeed */
					amdgpu_ras_resume(tmp_adev);
				} else {
					r = -EINVAL;
					goto out;
				}

				/* Update PSP FW topology after reset */
				if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
					r = amdgpu_xgmi_update_topology(hive, tmp_adev);
			}
		}

out:
		if (!r) {
			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
			r = amdgpu_ib_ring_tests(tmp_adev);
			if (r) {
				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
				r = amdgpu_device_ip_suspend(tmp_adev);
				need_full_reset = true;
				r = -EAGAIN;
				goto end;
			}
		}

		if (!r)
			r = amdgpu_device_recover_vram(tmp_adev);
		else
			tmp_adev->asic_reset_res = r;
	}

end:
	*need_full_reset_arg = need_full_reset;
	return r;
}

static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
				    struct amdgpu_hive_info *hive)
{
	if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
		return false;

	if (hive) {
		down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
	} else {
		down_write(&adev->reset_sem);
	}

	atomic_inc(&adev->gpu_reset_counter);
	switch (amdgpu_asic_reset_method(adev)) {
	case AMD_RESET_METHOD_MODE1:
		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
		break;
	case AMD_RESET_METHOD_MODE2:
		adev->mp1_state = PP_MP1_STATE_RESET;
		break;
	default:
		adev->mp1_state = PP_MP1_STATE_NONE;
		break;
	}

	return true;
}

static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
{
	amdgpu_vf_error_trans_all(adev);
	adev->mp1_state = PP_MP1_STATE_NONE;
	atomic_set(&adev->in_gpu_reset, 0);
	up_write(&adev->reset_sem);
}

static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
{
	struct pci_dev *p = NULL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
			adev->pdev->bus->number, 1);
	if (p) {
		pm_runtime_enable(&(p->dev));
		pm_runtime_resume(&(p->dev));
	}
}

static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
{
	enum amd_reset_method reset_method;
	struct pci_dev *p = NULL;
	u64 expires;

	/*
	 * For now, only BACO and mode1 reset are confirmed
	 * to suffer from the audio issue if the audio device
	 * is not properly suspended first.
	 */
	reset_method = amdgpu_asic_reset_method(adev);
	if ((reset_method != AMD_RESET_METHOD_BACO) &&
	    (reset_method != AMD_RESET_METHOD_MODE1))
		return -EINVAL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
			adev->pdev->bus->number, 1);
	if (!p)
		return -ENODEV;

	expires = pm_runtime_autosuspend_expiration(&(p->dev));
	if (!expires)
		/*
		 * If we cannot get the audio device autosuspend delay,
		 * fall back to a fixed 4s interval. The audio controller
		 * defaults to a 3s autosuspend delay, so 4s is guaranteed
		 * to cover it.
		 */
		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;

	while (!pm_runtime_status_suspended(&(p->dev))) {
		if (!pm_runtime_suspend(&(p->dev)))
			break;

		if (expires < ktime_get_mono_fast_ns()) {
			dev_warn(adev->dev, "failed to suspend display audio\n");
			/* TODO: abort the succeeding gpu reset? */
			return -ETIMEDOUT;
		}
	}

	pm_runtime_disable(&(p->dev));

	return 0;
}

/**
 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
 *
 * @adev: amdgpu device pointer
 * @job: which job triggered the hang
 *
 * Attempt to reset the GPU if it has hung (all asics).
 * Attempt to do soft-reset or full-reset and reinitialize the asic.
 * Returns 0 for success or an error on failure.
 */

int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
			      struct amdgpu_job *job)
{
	struct list_head device_list, *device_list_handle = NULL;
	bool need_full_reset = false;
	bool job_signaled = false;
	struct amdgpu_hive_info *hive = NULL;
	struct amdgpu_device *tmp_adev = NULL;
	int i, r = 0;
	bool need_emergency_restart = false;
	bool audio_suspended = false;

	/*
	 * Special case: RAS triggered and full reset isn't supported
	 */
	need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);

	/*
	 * Flush RAM to disk so that after reboot
	 * the user can read the log and see why the system rebooted.
	 */
	if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
		DRM_WARN("Emergency reboot.");

		ksys_sync_helper();
		emergency_restart();
	}

	dev_info(adev->dev, "GPU %s begin!\n",
		need_emergency_restart ? "jobs stop":"reset");

	/*
	 * We use a trylock here to avoid a chain of resets executing from
	 * either jobs triggering on different adevs in an XGMI hive or jobs
	 * on different schedulers for the same device while this TO handler
	 * is running. We always reset all schedulers for a device and all
	 * devices for an XGMI hive, so that should take care of them too.
	 */
	hive = amdgpu_get_xgmi_hive(adev);
	if (hive) {
		if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
			DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
				job ? job->base.id : -1, hive->hive_id);
			amdgpu_put_xgmi_hive(hive);
			return 0;
		}
		mutex_lock(&hive->hive_lock);
	}

	/*
	 * Build the list of devices to reset.
	 * In case we are in XGMI hive mode, reorder the device list
	 * to put adev in the 1st position.
	 */
	INIT_LIST_HEAD(&device_list);
	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (!hive)
			return -ENODEV;
		if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
			list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
		device_list_handle = &hive->device_list;
	} else {
		list_add_tail(&adev->gmc.xgmi.head, &device_list);
		device_list_handle = &device_list;
	}

	/* block all schedulers and reset given job's ring */
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
			dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
				  job ? job->base.id : -1);
			r = 0;
			goto skip_recovery;
		}

		/*
		 * Try to put the audio codec into suspend state
		 * before the gpu reset is started.
		 *
		 * The power domain of the graphics device is shared
		 * with the AZ power domain. Without this, we may
		 * change the audio hardware behind the audio
		 * driver's back and trigger audio codec errors.
		 */
		if (!amdgpu_device_suspend_display_audio(tmp_adev))
			audio_suspended = true;

		amdgpu_ras_set_error_query_ready(tmp_adev, false);

		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);

		if (!amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_pre_reset(tmp_adev);

		/*
		 * Mark these ASICs to be reset as untracked first,
		 * and add them back after the reset has completed.
		 */
		amdgpu_unregister_gpu_instance(tmp_adev);

		amdgpu_fbdev_set_suspend(tmp_adev, 1);

		/* disable ras on ALL IPs */
		if (!need_emergency_restart &&
		      amdgpu_device_ip_need_full_reset(tmp_adev))
			amdgpu_ras_suspend(tmp_adev);

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, job ? &job->base : NULL);

			if (need_emergency_restart)
				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
		}
	}

	if (need_emergency_restart)
		goto skip_sched_resume;

	/*
	 * Must check the guilty signal here since after this point all old
	 * HW fences are force signaled.
	 *
	 * job->base holds a reference to the parent fence
	 */
	if (job && job->base.s_fence->parent &&
	    dma_fence_is_signaled(job->base.s_fence->parent)) {
		job_signaled = true;
		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
		goto skip_hw_reset;
	}

retry:	/* Rest of adevs pre asic reset from XGMI hive. */
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		r = amdgpu_device_pre_asic_reset(tmp_adev,
						 NULL,
						 &need_full_reset);
		/* TODO: should we stop? */
		if (r) {
			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
				  r, adev_to_drm(tmp_adev)->unique);
			tmp_adev->asic_reset_res = r;
		}
	}

	/* Actual ASIC resets if needed. */
	/* TODO: implement XGMI hive reset logic for SRIOV */
	if (amdgpu_sriov_vf(adev)) {
		r = amdgpu_device_reset_sriov(adev, job ? false : true);
		if (r)
			adev->asic_reset_res = r;
	} else {
		r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
		if (r && r == -EAGAIN)
			goto retry;
	}

skip_hw_reset:

	/* Post ASIC reset for all devs. */
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			/* No point in resubmitting jobs if we didn't HW reset */
			if (!tmp_adev->asic_reset_res && !job_signaled)
				drm_sched_resubmit_jobs(&ring->sched);

			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
		}

		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
		}

		tmp_adev->asic_reset_res = 0;

		if (r) {
			/* bad news, how do we tell it to userspace? */
			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
		}
	}

skip_sched_resume:
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		/* unlock kfd: SRIOV would do it separately */
		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);
		if (audio_suspended)
			amdgpu_device_resume_display_audio(tmp_adev);
		amdgpu_device_unlock_adev(tmp_adev);
	}

skip_recovery:
	if (hive) {
		atomic_set(&hive->in_reset, 0);
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}

	if (r)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
	return r;
}

/**
 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIE slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
 */
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
{
	struct pci_dev *pdev;
	enum pci_bus_speed speed_cap, platform_speed_cap;
	enum pcie_link_width platform_link_width;

	if (amdgpu_pcie_gen_cap)
		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;

	if (amdgpu_pcie_lane_cap)
		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;

	/* covers APUs as well */
	if (pci_is_root_bus(adev->pdev->bus)) {
		if (adev->pm.pcie_gen_mask == 0)
			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
		if (adev->pm.pcie_mlw_mask == 0)
			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
		return;
	}

	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
		return;

	pcie_bandwidth_available(adev->pdev, NULL,
				 &platform_speed_cap, &platform_link_width);

	if (adev->pm.pcie_gen_mask == 0) {
		/* asic caps */
		pdev = adev->pdev;
		speed_cap = pcie_get_speed_cap(pdev);
		if (speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
		} else {
			if (speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
		}
		/* platform caps */
		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
		} else {
			if (platform_speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4779 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4780 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 4781 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 4782 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 4783 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4784 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4785 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 4786 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 4787 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4788 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 4789 else 4790 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 4791 4792 } 4793 } 4794 if (adev->pm.pcie_mlw_mask == 0) { 4795 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 4796 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 4797 } else { 4798 switch (platform_link_width) { 4799 case PCIE_LNK_X32: 4800 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 4801 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 4802 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 4803 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4804 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4805 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4806 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4807 break; 4808 case PCIE_LNK_X16: 4809 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 4810 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 4811 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4812 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4813 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4814 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4815 break; 4816 case PCIE_LNK_X12: 4817 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 4818 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4819 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4820 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4821 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4822 break; 4823 case PCIE_LNK_X8: 4824 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4825 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4826 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4827 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4828 break; 4829 case PCIE_LNK_X4: 4830 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4831 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4832 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4833 break; 4834 case PCIE_LNK_X2: 4835 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4836 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4837 break; 4838 case PCIE_LNK_X1: 4839 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 4840 break; 4841 default: 4842 break; 4843 } 4844 } 4845 } 4846 } 4847 4848 int amdgpu_device_baco_enter(struct drm_device *dev) 4849 { 4850 struct amdgpu_device *adev = drm_to_adev(dev); 4851 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 4852 4853 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 4854 return -ENOTSUPP; 4855 4856 if (ras && ras->supported) 4857 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 4858 4859 return amdgpu_dpm_baco_enter(adev); 4860 } 4861 4862 int amdgpu_device_baco_exit(struct drm_device *dev) 4863 { 4864 struct amdgpu_device *adev = drm_to_adev(dev); 4865 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 4866 int ret = 0; 4867 4868 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 4869 return -ENOTSUPP; 4870 4871 ret = amdgpu_dpm_baco_exit(adev); 4872 if (ret) 4873 return ret; 4874 4875 if (ras && ras->supported) 4876 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 4877 4878 return 0; 4879 } 4880 4881 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev) 4882 { 4883 int i; 4884 4885 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4886 struct amdgpu_ring *ring = adev->rings[i]; 4887 4888 if (!ring || !ring->sched.thread) 4889 continue; 4890 4891 
cancel_delayed_work_sync(&ring->sched.work_tdr); 4892 } 4893 } 4894 4895 /** 4896 * amdgpu_pci_error_detected - Called when a PCI error is detected. 4897 * @pdev: PCI device struct 4898 * @state: PCI channel state 4899 * 4900 * Description: Called when a PCI error is detected. 4901 * 4902 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 4903 */ 4904 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 4905 { 4906 struct drm_device *dev = pci_get_drvdata(pdev); 4907 struct amdgpu_device *adev = drm_to_adev(dev); 4908 int i; 4909 4910 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 4911 4912 if (adev->gmc.xgmi.num_physical_nodes > 1) { 4913 DRM_WARN("No support for XGMI hive yet..."); 4914 return PCI_ERS_RESULT_DISCONNECT; 4915 } 4916 4917 switch (state) { 4918 case pci_channel_io_normal: 4919 return PCI_ERS_RESULT_CAN_RECOVER; 4920 /* Fatal error, prepare for slot reset */ 4921 case pci_channel_io_frozen: 4922 /* 4923 * Cancel and wait for all TDRs in progress if failing to 4924 * set adev->in_gpu_reset in amdgpu_device_lock_adev 4925 * 4926 * Locking adev->reset_sem will prevent any external access 4927 * to GPU during PCI error recovery 4928 */ 4929 while (!amdgpu_device_lock_adev(adev, NULL)) 4930 amdgpu_cancel_all_tdr(adev); 4931 4932 /* 4933 * Block any work scheduling as we do for regular GPU reset 4934 * for the duration of the recovery 4935 */ 4936 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4937 struct amdgpu_ring *ring = adev->rings[i]; 4938 4939 if (!ring || !ring->sched.thread) 4940 continue; 4941 4942 drm_sched_stop(&ring->sched, NULL); 4943 } 4944 return PCI_ERS_RESULT_NEED_RESET; 4945 case pci_channel_io_perm_failure: 4946 /* Permanent error, prepare for device removal */ 4947 return PCI_ERS_RESULT_DISCONNECT; 4948 } 4949 4950 return PCI_ERS_RESULT_NEED_RESET; 4951 } 4952 4953 /** 4954 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 4955 * @pdev: pointer to PCI device 4956 */ 4957 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 4958 { 4959 4960 DRM_INFO("PCI error: mmio enabled callback!!\n"); 4961 4962 /* TODO - dump whatever for debugging purposes */ 4963 4964 /* This called only if amdgpu_pci_error_detected returns 4965 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 4966 * works, no need to reset slot. 4967 */ 4968 4969 return PCI_ERS_RESULT_RECOVERED; 4970 } 4971 4972 /** 4973 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 4974 * @pdev: PCI device struct 4975 * 4976 * Description: This routine is called by the pci error recovery 4977 * code after the PCI slot has been reset, just before we 4978 * should resume normal operations. 
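 *
 * The handler restores the saved PCI config space, waits for the ASIC to
 * become accessible again and then runs the regular full-reset path
 * (amdgpu_device_pre_asic_reset() / amdgpu_do_asic_reset()); it returns
 * PCI_ERS_RESULT_RECOVERED on success and PCI_ERS_RESULT_DISCONNECT
 * otherwise.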
4979 */ 4980 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 4981 { 4982 struct drm_device *dev = pci_get_drvdata(pdev); 4983 struct amdgpu_device *adev = drm_to_adev(dev); 4984 int r, i; 4985 bool need_full_reset = true; 4986 u32 memsize; 4987 struct list_head device_list; 4988 4989 DRM_INFO("PCI error: slot reset callback!!\n"); 4990 4991 INIT_LIST_HEAD(&device_list); 4992 list_add_tail(&adev->gmc.xgmi.head, &device_list); 4993 4994 /* wait for asic to come out of reset */ 4995 msleep(500); 4996 4997 /* Restore PCI confspace */ 4998 amdgpu_device_load_pci_state(pdev); 4999 5000 /* confirm ASIC came out of reset */ 5001 for (i = 0; i < adev->usec_timeout; i++) { 5002 memsize = amdgpu_asic_get_config_memsize(adev); 5003 5004 if (memsize != 0xffffffff) 5005 break; 5006 udelay(1); 5007 } 5008 if (memsize == 0xffffffff) { 5009 r = -ETIME; 5010 goto out; 5011 } 5012 5013 adev->in_pci_err_recovery = true; 5014 r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset); 5015 adev->in_pci_err_recovery = false; 5016 if (r) 5017 goto out; 5018 5019 r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true); 5020 5021 out: 5022 if (!r) { 5023 if (amdgpu_device_cache_pci_state(adev->pdev)) 5024 pci_restore_state(adev->pdev); 5025 5026 DRM_INFO("PCIe error recovery succeeded\n"); 5027 } else { 5028 DRM_ERROR("PCIe error recovery failed, err:%d", r); 5029 amdgpu_device_unlock_adev(adev); 5030 } 5031 5032 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 5033 } 5034 5035 /** 5036 * amdgpu_pci_resume() - resume normal ops after PCI reset 5037 * @pdev: pointer to PCI device 5038 * 5039 * Called when the error recovery driver tells us that its 5040 * OK to resume normal operation. Use completion to allow 5041 * halted scsi ops to resume. 5042 */ 5043 void amdgpu_pci_resume(struct pci_dev *pdev) 5044 { 5045 struct drm_device *dev = pci_get_drvdata(pdev); 5046 struct amdgpu_device *adev = drm_to_adev(dev); 5047 int i; 5048 5049 5050 DRM_INFO("PCI error: resume callback!!\n"); 5051 5052 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5053 struct amdgpu_ring *ring = adev->rings[i]; 5054 5055 if (!ring || !ring->sched.thread) 5056 continue; 5057 5058 5059 drm_sched_resubmit_jobs(&ring->sched); 5060 drm_sched_start(&ring->sched, true); 5061 } 5062 5063 amdgpu_device_unlock_adev(adev); 5064 } 5065 5066 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 5067 { 5068 struct drm_device *dev = pci_get_drvdata(pdev); 5069 struct amdgpu_device *adev = drm_to_adev(dev); 5070 int r; 5071 5072 r = pci_save_state(pdev); 5073 if (!r) { 5074 kfree(adev->pci_state); 5075 5076 adev->pci_state = pci_store_saved_state(pdev); 5077 5078 if (!adev->pci_state) { 5079 DRM_ERROR("Failed to store PCI saved state"); 5080 return false; 5081 } 5082 } else { 5083 DRM_WARN("Failed to save PCI state, err:%d\n", r); 5084 return false; 5085 } 5086 5087 return true; 5088 } 5089 5090 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 5091 { 5092 struct drm_device *dev = pci_get_drvdata(pdev); 5093 struct amdgpu_device *adev = drm_to_adev(dev); 5094 int r; 5095 5096 if (!adev->pci_state) 5097 return false; 5098 5099 r = pci_load_saved_state(pdev, adev->pci_state); 5100 5101 if (!r) { 5102 pci_restore_state(pdev); 5103 } else { 5104 DRM_WARN("Failed to load PCI state, err:%d\n", r); 5105 return false; 5106 } 5107 5108 return true; 5109 } 5110 5111 5112
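
/*
 * amdgpu_device_cache_pci_state() and amdgpu_device_load_pci_state() are
 * used as a pair around PCI error recovery: amdgpu_pci_slot_reset() above
 * loads the previously cached state before re-initializing the ASIC and,
 * on success, caches and restores it again. Condensed from
 * amdgpu_pci_slot_reset() (illustrative only, error handling omitted):
 *
 *	amdgpu_device_load_pci_state(pdev);
 *	... reset and re-initialize the ASIC ...
 *	if (amdgpu_device_cache_pci_state(adev->pdev))
 *		pci_restore_state(adev->pdev);
 */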