1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 34 #include <drm/drm_atomic_helper.h> 35 #include <drm/drm_probe_helper.h> 36 #include <drm/amdgpu_drm.h> 37 #include <linux/vgaarb.h> 38 #include <linux/vga_switcheroo.h> 39 #include <linux/efi.h> 40 #include "amdgpu.h" 41 #include "amdgpu_trace.h" 42 #include "amdgpu_i2c.h" 43 #include "atom.h" 44 #include "amdgpu_atombios.h" 45 #include "amdgpu_atomfirmware.h" 46 #include "amd_pcie.h" 47 #ifdef CONFIG_DRM_AMDGPU_SI 48 #include "si.h" 49 #endif 50 #ifdef CONFIG_DRM_AMDGPU_CIK 51 #include "cik.h" 52 #endif 53 #include "vi.h" 54 #include "soc15.h" 55 #include "nv.h" 56 #include "bif/bif_4_1_d.h" 57 #include <linux/pci.h> 58 #include <linux/firmware.h> 59 #include "amdgpu_vf_error.h" 60 61 #include "amdgpu_amdkfd.h" 62 #include "amdgpu_pm.h" 63 64 #include "amdgpu_xgmi.h" 65 #include "amdgpu_ras.h" 66 #include "amdgpu_pmu.h" 67 #include "amdgpu_fru_eeprom.h" 68 69 #include <linux/suspend.h> 70 #include <drm/task_barrier.h> 71 #include <linux/pm_runtime.h> 72 73 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 74 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 75 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 76 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 77 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 78 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 79 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin"); 80 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin"); 81 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin"); 82 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 83 MODULE_FIRMWARE("amdgpu/green_sardine_gpu_info.bin"); 84 85 #define AMDGPU_RESUME_MS 2000 86 87 const char *amdgpu_asic_name[] = { 88 "TAHITI", 89 "PITCAIRN", 90 "VERDE", 91 "OLAND", 92 "HAINAN", 93 "BONAIRE", 94 "KAVERI", 95 "KABINI", 96 "HAWAII", 97 "MULLINS", 98 "TOPAZ", 99 "TONGA", 100 "FIJI", 101 "CARRIZO", 102 "STONEY", 103 "POLARIS10", 104 "POLARIS11", 105 "POLARIS12", 106 "VEGAM", 107 "VEGA10", 108 "VEGA12", 109 "VEGA20", 110 "RAVEN", 111 "ARCTURUS", 112 "RENOIR", 113 "NAVI10", 114 "NAVI14", 115 "NAVI12", 116 "SIENNA_CICHLID", 117 "NAVY_FLOUNDER", 118 "LAST", 119 }; 120 121 /** 122 * DOC: 
pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
		amdgpu_device_get_pcie_replay_count, NULL);

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * DOC: product_name
 *
 * The amdgpu driver provides a sysfs API for reporting the product name
 * for the device.
 * The file product_name is used for this and returns the product name
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_name(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
}

static DEVICE_ATTR(product_name, S_IRUGO,
		amdgpu_device_get_product_name, NULL);

/**
 * DOC: product_number
 *
 * The amdgpu driver provides a sysfs API for reporting the part number
 * for the device.
 * The file product_number is used for this and returns the part number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
}

static DEVICE_ATTR(product_number, S_IRUGO,
		amdgpu_device_get_product_number, NULL);

/**
 * DOC: serial_number
 *
 * The amdgpu driver provides a sysfs API for reporting the serial number
 * for the device.
 * The file serial_number is used for this and returns the serial number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_serial_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
}

static DEVICE_ATTR(serial_number, S_IRUGO,
		amdgpu_device_get_serial_number, NULL);

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with HG/PX power control,
 * otherwise return false.
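 *
 * Illustrative use (a sketch, not the driver's exact logic): callers can
 * combine this check with amdgpu_device_supports_baco() when deciding
 * whether to enable runtime power management, e.g.
 *
 *   bool runtime_pm = amdgpu_device_supports_boco(ddev) ||
 *                     amdgpu_device_supports_baco(ddev);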
218 */ 219 bool amdgpu_device_supports_boco(struct drm_device *dev) 220 { 221 struct amdgpu_device *adev = drm_to_adev(dev); 222 223 if (adev->flags & AMD_IS_PX) 224 return true; 225 return false; 226 } 227 228 /** 229 * amdgpu_device_supports_baco - Does the device support BACO 230 * 231 * @dev: drm_device pointer 232 * 233 * Returns true if the device supporte BACO, 234 * otherwise return false. 235 */ 236 bool amdgpu_device_supports_baco(struct drm_device *dev) 237 { 238 struct amdgpu_device *adev = drm_to_adev(dev); 239 240 return amdgpu_asic_supports_baco(adev); 241 } 242 243 /* 244 * VRAM access helper functions 245 */ 246 247 /** 248 * amdgpu_device_vram_access - read/write a buffer in vram 249 * 250 * @adev: amdgpu_device pointer 251 * @pos: offset of the buffer in vram 252 * @buf: virtual address of the buffer in system memory 253 * @size: read/write size, sizeof(@buf) must > @size 254 * @write: true - write to vram, otherwise - read from vram 255 */ 256 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, 257 uint32_t *buf, size_t size, bool write) 258 { 259 unsigned long flags; 260 uint32_t hi = ~0; 261 uint64_t last; 262 263 264 #ifdef CONFIG_64BIT 265 last = min(pos + size, adev->gmc.visible_vram_size); 266 if (last > pos) { 267 void __iomem *addr = adev->mman.aper_base_kaddr + pos; 268 size_t count = last - pos; 269 270 if (write) { 271 memcpy_toio(addr, buf, count); 272 mb(); 273 amdgpu_asic_flush_hdp(adev, NULL); 274 } else { 275 amdgpu_asic_invalidate_hdp(adev, NULL); 276 mb(); 277 memcpy_fromio(buf, addr, count); 278 } 279 280 if (count == size) 281 return; 282 283 pos += count; 284 buf += count / 4; 285 size -= count; 286 } 287 #endif 288 289 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 290 for (last = pos + size; pos < last; pos += 4) { 291 uint32_t tmp = pos >> 31; 292 293 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); 294 if (tmp != hi) { 295 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp); 296 hi = tmp; 297 } 298 if (write) 299 WREG32_NO_KIQ(mmMM_DATA, *buf++); 300 else 301 *buf++ = RREG32_NO_KIQ(mmMM_DATA); 302 } 303 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 304 } 305 306 /* 307 * register access helper functions. 308 */ 309 /** 310 * amdgpu_device_rreg - read a memory mapped IO or indirect register 311 * 312 * @adev: amdgpu_device pointer 313 * @reg: dword aligned register offset 314 * @acc_flags: access flags which require special behavior 315 * 316 * Returns the 32 bit value from the offset specified. 317 */ 318 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, 319 uint32_t reg, uint32_t acc_flags) 320 { 321 uint32_t ret; 322 323 if (adev->in_pci_err_recovery) 324 return 0; 325 326 if ((reg * 4) < adev->rmmio_size) { 327 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 328 amdgpu_sriov_runtime(adev) && 329 down_read_trylock(&adev->reset_sem)) { 330 ret = amdgpu_kiq_rreg(adev, reg); 331 up_read(&adev->reset_sem); 332 } else { 333 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 334 } 335 } else { 336 ret = adev->pcie_rreg(adev, reg * 4); 337 } 338 339 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret); 340 341 return ret; 342 } 343 344 /* 345 * MMIO register read with bytes helper functions 346 * @offset:bytes offset from MMIO start 347 * 348 */ 349 350 /** 351 * amdgpu_mm_rreg8 - read a memory mapped IO register 352 * 353 * @adev: amdgpu_device pointer 354 * @offset: byte aligned register offset 355 * 356 * Returns the 8 bit value from the offset specified. 
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (adev->in_pci_err_recovery)
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset:bytes offset from MMIO start
 * @value: the value to be written to the register
 *
 */
/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (adev->in_pci_err_recovery)
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (adev->in_pci_err_recovery)
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_sem)) {
			amdgpu_kiq_wreg(adev, reg, v);
			up_read(&adev->reset_sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/*
 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range
 *
 * This function is invoked only for debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v)
{
	if (adev->in_pci_err_recovery)
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_io_rreg - read an IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 *
 * Returns the 32 bit value from the offset specified.
 */
u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
{
	if (adev->in_pci_err_recovery)
		return 0;

	if ((reg * 4) < adev->rio_mem_size)
		return ioread32(adev->rio_mem + (reg * 4));
	else {
		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
		return ioread32(adev->rio_mem + (mmMM_DATA * 4));
	}
}

/**
 * amdgpu_io_wreg - write to an IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 *
 * Writes the value specified to the offset specified.
477 */ 478 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v) 479 { 480 if (adev->in_pci_err_recovery) 481 return; 482 483 if ((reg * 4) < adev->rio_mem_size) 484 iowrite32(v, adev->rio_mem + (reg * 4)); 485 else { 486 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4)); 487 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4)); 488 } 489 } 490 491 /** 492 * amdgpu_mm_rdoorbell - read a doorbell dword 493 * 494 * @adev: amdgpu_device pointer 495 * @index: doorbell index 496 * 497 * Returns the value in the doorbell aperture at the 498 * requested doorbell index (CIK). 499 */ 500 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) 501 { 502 if (adev->in_pci_err_recovery) 503 return 0; 504 505 if (index < adev->doorbell.num_doorbells) { 506 return readl(adev->doorbell.ptr + index); 507 } else { 508 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 509 return 0; 510 } 511 } 512 513 /** 514 * amdgpu_mm_wdoorbell - write a doorbell dword 515 * 516 * @adev: amdgpu_device pointer 517 * @index: doorbell index 518 * @v: value to write 519 * 520 * Writes @v to the doorbell aperture at the 521 * requested doorbell index (CIK). 522 */ 523 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) 524 { 525 if (adev->in_pci_err_recovery) 526 return; 527 528 if (index < adev->doorbell.num_doorbells) { 529 writel(v, adev->doorbell.ptr + index); 530 } else { 531 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 532 } 533 } 534 535 /** 536 * amdgpu_mm_rdoorbell64 - read a doorbell Qword 537 * 538 * @adev: amdgpu_device pointer 539 * @index: doorbell index 540 * 541 * Returns the value in the doorbell aperture at the 542 * requested doorbell index (VEGA10+). 543 */ 544 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) 545 { 546 if (adev->in_pci_err_recovery) 547 return 0; 548 549 if (index < adev->doorbell.num_doorbells) { 550 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); 551 } else { 552 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 553 return 0; 554 } 555 } 556 557 /** 558 * amdgpu_mm_wdoorbell64 - write a doorbell Qword 559 * 560 * @adev: amdgpu_device pointer 561 * @index: doorbell index 562 * @v: value to write 563 * 564 * Writes @v to the doorbell aperture at the 565 * requested doorbell index (VEGA10+). 
566 */ 567 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) 568 { 569 if (adev->in_pci_err_recovery) 570 return; 571 572 if (index < adev->doorbell.num_doorbells) { 573 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); 574 } else { 575 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 576 } 577 } 578 579 /** 580 * amdgpu_device_indirect_rreg - read an indirect register 581 * 582 * @adev: amdgpu_device pointer 583 * @pcie_index: mmio register offset 584 * @pcie_data: mmio register offset 585 * 586 * Returns the value of indirect register @reg_addr 587 */ 588 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 589 u32 pcie_index, u32 pcie_data, 590 u32 reg_addr) 591 { 592 unsigned long flags; 593 u32 r; 594 void __iomem *pcie_index_offset; 595 void __iomem *pcie_data_offset; 596 597 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 598 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 599 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 600 601 writel(reg_addr, pcie_index_offset); 602 readl(pcie_index_offset); 603 r = readl(pcie_data_offset); 604 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 605 606 return r; 607 } 608 609 /** 610 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 611 * 612 * @adev: amdgpu_device pointer 613 * @pcie_index: mmio register offset 614 * @pcie_data: mmio register offset 615 * 616 * Returns the value of indirect register @reg_addr 617 */ 618 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 619 u32 pcie_index, u32 pcie_data, 620 u32 reg_addr) 621 { 622 unsigned long flags; 623 u64 r; 624 void __iomem *pcie_index_offset; 625 void __iomem *pcie_data_offset; 626 627 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 628 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 629 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 630 631 /* read low 32 bits */ 632 writel(reg_addr, pcie_index_offset); 633 readl(pcie_index_offset); 634 r = readl(pcie_data_offset); 635 /* read high 32 bits */ 636 writel(reg_addr + 4, pcie_index_offset); 637 readl(pcie_index_offset); 638 r |= ((u64)readl(pcie_data_offset) << 32); 639 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 640 641 return r; 642 } 643 644 /** 645 * amdgpu_device_indirect_wreg - write an indirect register address 646 * 647 * @adev: amdgpu_device pointer 648 * @pcie_index: mmio register offset 649 * @pcie_data: mmio register offset 650 * @reg_addr: indirect register offset 651 * @reg_data: indirect register data 652 * 653 */ 654 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 655 u32 pcie_index, u32 pcie_data, 656 u32 reg_addr, u32 reg_data) 657 { 658 unsigned long flags; 659 void __iomem *pcie_index_offset; 660 void __iomem *pcie_data_offset; 661 662 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 663 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 664 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 665 666 writel(reg_addr, pcie_index_offset); 667 readl(pcie_index_offset); 668 writel(reg_data, pcie_data_offset); 669 readl(pcie_data_offset); 670 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 671 } 672 673 /** 674 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 675 * 676 * @adev: amdgpu_device pointer 677 * @pcie_index: mmio register offset 678 * @pcie_data: mmio register offset 679 * @reg_addr: indirect register offset 680 * @reg_data: indirect register data 681 * 682 */ 683 void 
amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
			      u32 pcie_index, u32 pcie_data,
			      u32 reg_addr, u64 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	amdgpu_asic_pre_asic_init(adev);

	return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}

/**
 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
				       &adev->vram_scratch.robj,
				       &adev->vram_scratch.gpu_addr,
				       (void **)&adev->vram_scratch.ptr);
}

/**
 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/*
 * GPU doorbell aperture helpers function.
 */
/**
 * amdgpu_device_doorbell_init - Init doorbell driver information.
916 * 917 * @adev: amdgpu_device pointer 918 * 919 * Init doorbell driver information (CIK) 920 * Returns 0 on success, error on failure. 921 */ 922 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) 923 { 924 925 /* No doorbell on SI hardware generation */ 926 if (adev->asic_type < CHIP_BONAIRE) { 927 adev->doorbell.base = 0; 928 adev->doorbell.size = 0; 929 adev->doorbell.num_doorbells = 0; 930 adev->doorbell.ptr = NULL; 931 return 0; 932 } 933 934 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) 935 return -EINVAL; 936 937 amdgpu_asic_init_doorbell_index(adev); 938 939 /* doorbell bar mapping */ 940 adev->doorbell.base = pci_resource_start(adev->pdev, 2); 941 adev->doorbell.size = pci_resource_len(adev->pdev, 2); 942 943 adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32), 944 adev->doorbell_index.max_assignment+1); 945 if (adev->doorbell.num_doorbells == 0) 946 return -EINVAL; 947 948 /* For Vega, reserve and map two pages on doorbell BAR since SDMA 949 * paging queue doorbell use the second page. The 950 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the 951 * doorbells are in the first page. So with paging queue enabled, 952 * the max num_doorbells should + 1 page (0x400 in dword) 953 */ 954 if (adev->asic_type >= CHIP_VEGA10) 955 adev->doorbell.num_doorbells += 0x400; 956 957 adev->doorbell.ptr = ioremap(adev->doorbell.base, 958 adev->doorbell.num_doorbells * 959 sizeof(u32)); 960 if (adev->doorbell.ptr == NULL) 961 return -ENOMEM; 962 963 return 0; 964 } 965 966 /** 967 * amdgpu_device_doorbell_fini - Tear down doorbell driver information. 968 * 969 * @adev: amdgpu_device pointer 970 * 971 * Tear down doorbell driver information (CIK) 972 */ 973 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) 974 { 975 iounmap(adev->doorbell.ptr); 976 adev->doorbell.ptr = NULL; 977 } 978 979 980 981 /* 982 * amdgpu_device_wb_*() 983 * Writeback is the method by which the GPU updates special pages in memory 984 * with the status of certain GPU events (fences, ring pointers,etc.). 985 */ 986 987 /** 988 * amdgpu_device_wb_fini - Disable Writeback and free memory 989 * 990 * @adev: amdgpu_device pointer 991 * 992 * Disables Writeback and frees the Writeback memory (all asics). 993 * Used at driver shutdown. 994 */ 995 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 996 { 997 if (adev->wb.wb_obj) { 998 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 999 &adev->wb.gpu_addr, 1000 (void **)&adev->wb.wb); 1001 adev->wb.wb_obj = NULL; 1002 } 1003 } 1004 1005 /** 1006 * amdgpu_device_wb_init- Init Writeback driver info and allocate memory 1007 * 1008 * @adev: amdgpu_device pointer 1009 * 1010 * Initializes writeback and allocates writeback memory (all asics). 1011 * Used at driver startup. 1012 * Returns 0 on success or an -error on failure. 
1013 */ 1014 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1015 { 1016 int r; 1017 1018 if (adev->wb.wb_obj == NULL) { 1019 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1020 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1021 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1022 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1023 (void **)&adev->wb.wb); 1024 if (r) { 1025 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1026 return r; 1027 } 1028 1029 adev->wb.num_wb = AMDGPU_MAX_WB; 1030 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1031 1032 /* clear wb memory */ 1033 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1034 } 1035 1036 return 0; 1037 } 1038 1039 /** 1040 * amdgpu_device_wb_get - Allocate a wb entry 1041 * 1042 * @adev: amdgpu_device pointer 1043 * @wb: wb index 1044 * 1045 * Allocate a wb slot for use by the driver (all asics). 1046 * Returns 0 on success or -EINVAL on failure. 1047 */ 1048 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1049 { 1050 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1051 1052 if (offset < adev->wb.num_wb) { 1053 __set_bit(offset, adev->wb.used); 1054 *wb = offset << 3; /* convert to dw offset */ 1055 return 0; 1056 } else { 1057 return -EINVAL; 1058 } 1059 } 1060 1061 /** 1062 * amdgpu_device_wb_free - Free a wb entry 1063 * 1064 * @adev: amdgpu_device pointer 1065 * @wb: wb index 1066 * 1067 * Free a wb slot allocated for use by the driver (all asics) 1068 */ 1069 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1070 { 1071 wb >>= 3; 1072 if (wb < adev->wb.num_wb) 1073 __clear_bit(wb, adev->wb.used); 1074 } 1075 1076 /** 1077 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1078 * 1079 * @adev: amdgpu_device pointer 1080 * 1081 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1082 * to fail, but if any of the BARs is not accessible after the size we abort 1083 * driver loading by returning -ENODEV. 1084 */ 1085 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1086 { 1087 u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size); 1088 u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1; 1089 struct pci_bus *root; 1090 struct resource *res; 1091 unsigned i; 1092 u16 cmd; 1093 int r; 1094 1095 /* Bypass for VF */ 1096 if (amdgpu_sriov_vf(adev)) 1097 return 0; 1098 1099 /* skip if the bios has already enabled large BAR */ 1100 if (adev->gmc.real_vram_size && 1101 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1102 return 0; 1103 1104 /* Check if the root BUS has 64bit memory resources */ 1105 root = adev->pdev->bus; 1106 while (root->parent) 1107 root = root->parent; 1108 1109 pci_bus_for_each_resource(root, res, i) { 1110 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1111 res->start > 0x100000000ull) 1112 break; 1113 } 1114 1115 /* Trying to resize is pointless without a root hub window above 4GB */ 1116 if (!res) 1117 return 0; 1118 1119 /* Disable memory decoding while we change the BAR addresses and size */ 1120 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1121 pci_write_config_word(adev->pdev, PCI_COMMAND, 1122 cmd & ~PCI_COMMAND_MEMORY); 1123 1124 /* Free the VRAM and doorbell BAR, we most likely need to move both. 
 */
	amdgpu_device_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_device_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup,
 * or whether a post is needed because a hw reset was performed.
 * Returns true if a post is needed, false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* For FIJI: In the whole-GPU pass-through virtualization case,
		 * after a VM reboot some old SMC firmware still needs the
		 * driver to do a vPost, otherwise the GPU hangs. SMC firmware
		 * versions above 22.15 don't have this flaw, so we force a
		 * vPost for versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;
			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if an error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @cookie: amdgpu_device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
{
	struct amdgpu_device *adev = cookie;
	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines number of bits in page table versus page directory,
 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
 * page table and the remaining bits are in the page directory.
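 *
 * For example (illustrative): with the minimum 9-bit block size, one page
 * table covers 2^9 pages * 4KB = 2MB of GPU virtual address space.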
1238 */ 1239 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1240 { 1241 /* defines number of bits in page table versus page directory, 1242 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1243 * page table and the remaining bits are in the page directory */ 1244 if (amdgpu_vm_block_size == -1) 1245 return; 1246 1247 if (amdgpu_vm_block_size < 9) { 1248 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1249 amdgpu_vm_block_size); 1250 amdgpu_vm_block_size = -1; 1251 } 1252 } 1253 1254 /** 1255 * amdgpu_device_check_vm_size - validate the vm size 1256 * 1257 * @adev: amdgpu_device pointer 1258 * 1259 * Validates the vm size in GB specified via module parameter. 1260 * The VM size is the size of the GPU virtual memory space in GB. 1261 */ 1262 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1263 { 1264 /* no need to check the default value */ 1265 if (amdgpu_vm_size == -1) 1266 return; 1267 1268 if (amdgpu_vm_size < 1) { 1269 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1270 amdgpu_vm_size); 1271 amdgpu_vm_size = -1; 1272 } 1273 } 1274 1275 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1276 { 1277 struct sysinfo si; 1278 bool is_os_64 = (sizeof(void *) == 8); 1279 uint64_t total_memory; 1280 uint64_t dram_size_seven_GB = 0x1B8000000; 1281 uint64_t dram_size_three_GB = 0xB8000000; 1282 1283 if (amdgpu_smu_memory_pool_size == 0) 1284 return; 1285 1286 if (!is_os_64) { 1287 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1288 goto def_value; 1289 } 1290 si_meminfo(&si); 1291 total_memory = (uint64_t)si.totalram * si.mem_unit; 1292 1293 if ((amdgpu_smu_memory_pool_size == 1) || 1294 (amdgpu_smu_memory_pool_size == 2)) { 1295 if (total_memory < dram_size_three_GB) 1296 goto def_value1; 1297 } else if ((amdgpu_smu_memory_pool_size == 4) || 1298 (amdgpu_smu_memory_pool_size == 8)) { 1299 if (total_memory < dram_size_seven_GB) 1300 goto def_value1; 1301 } else { 1302 DRM_WARN("Smu memory pool size not supported\n"); 1303 goto def_value; 1304 } 1305 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1306 1307 return; 1308 1309 def_value1: 1310 DRM_WARN("No enough system memory\n"); 1311 def_value: 1312 adev->pm.smu_prv_buffer_size = 0; 1313 } 1314 1315 /** 1316 * amdgpu_device_check_arguments - validate module params 1317 * 1318 * @adev: amdgpu_device pointer 1319 * 1320 * Validates certain module parameters and updates 1321 * the associated values used by the driver (all asics). 
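 *
 * For example (illustrative): an out-of-range amdgpu.sched_jobs=5 is rounded
 * up to the next power of two, 8, rather than being rejected.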
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	amdgpu_gmc_tmz_set(adev);

	if (amdgpu_num_kcq == -1) {
		amdgpu_num_kcq = 8;
	} else if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) {
		amdgpu_num_kcq = 8;
		dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid parameter provided by user\n");
	}

	amdgpu_gmc_noretry_set(adev);

	return 0;
}

/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asic before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(dev->pdev, PCI_D0);
		amdgpu_device_load_pci_state(dev->pdev);
		r = pci_enable_device(dev->pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
		drm_kms_helper_poll_enable(dev);
	} else {
		pr_info("switched off\n");
		drm_kms_helper_poll_disable(dev);
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(dev->pdev);
		/* Shut down the device */
		pci_disable_device(dev->pdev);
		pci_set_power_state(dev->pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Checks if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u32 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Checks if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
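 *
 * Illustrative use (hypothetical, not taken from this file):
 *
 *   struct amdgpu_ip_block *ip =
 *           amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *   if (ip)
 *           DRM_INFO("GFX IP v%u.%u\n", ip->version->major,
 *                    ip->version->minor);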
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * Returns 0 if the IP block version is equal or greater,
 * 1 if it is smaller or the ip_block doesn't exist.
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
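 *
 * The expected format, as parsed below, is a semicolon-separated list of
 * "<pci address or all>,<number of crtcs>" entries; an illustrative example
 * would be amdgpu.virtual_display=0000:04:00.0,2 or
 * amdgpu.virtual_display=all,1.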
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		struct drm_device *ddev = adev_to_drm(adev);
		const char *pci_address_name = pci_name(ddev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	char fw_name[40];
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	if (adev->mman.discovery_bin) {
		amdgpu_discovery_get_gfx_info(adev);

		/*
		 * FIXME: The bounding box is still needed by Navi12, so
		 * temporarily read it from gpu_info firmware. Should be dropped
		 * when DAL no longer needs it.
1757 */ 1758 if (adev->asic_type != CHIP_NAVI12) 1759 return 0; 1760 } 1761 1762 switch (adev->asic_type) { 1763 #ifdef CONFIG_DRM_AMDGPU_SI 1764 case CHIP_VERDE: 1765 case CHIP_TAHITI: 1766 case CHIP_PITCAIRN: 1767 case CHIP_OLAND: 1768 case CHIP_HAINAN: 1769 #endif 1770 #ifdef CONFIG_DRM_AMDGPU_CIK 1771 case CHIP_BONAIRE: 1772 case CHIP_HAWAII: 1773 case CHIP_KAVERI: 1774 case CHIP_KABINI: 1775 case CHIP_MULLINS: 1776 #endif 1777 case CHIP_TOPAZ: 1778 case CHIP_TONGA: 1779 case CHIP_FIJI: 1780 case CHIP_POLARIS10: 1781 case CHIP_POLARIS11: 1782 case CHIP_POLARIS12: 1783 case CHIP_VEGAM: 1784 case CHIP_CARRIZO: 1785 case CHIP_STONEY: 1786 case CHIP_VEGA20: 1787 case CHIP_SIENNA_CICHLID: 1788 case CHIP_NAVY_FLOUNDER: 1789 default: 1790 return 0; 1791 case CHIP_VEGA10: 1792 chip_name = "vega10"; 1793 break; 1794 case CHIP_VEGA12: 1795 chip_name = "vega12"; 1796 break; 1797 case CHIP_RAVEN: 1798 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1799 chip_name = "raven2"; 1800 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1801 chip_name = "picasso"; 1802 else 1803 chip_name = "raven"; 1804 break; 1805 case CHIP_ARCTURUS: 1806 chip_name = "arcturus"; 1807 break; 1808 case CHIP_RENOIR: 1809 if (adev->apu_flags & AMD_APU_IS_RENOIR) 1810 chip_name = "renoir"; 1811 else 1812 chip_name = "green_sardine"; 1813 break; 1814 case CHIP_NAVI10: 1815 chip_name = "navi10"; 1816 break; 1817 case CHIP_NAVI14: 1818 chip_name = "navi14"; 1819 break; 1820 case CHIP_NAVI12: 1821 chip_name = "navi12"; 1822 break; 1823 } 1824 1825 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 1826 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); 1827 if (err) { 1828 dev_err(adev->dev, 1829 "Failed to load gpu_info firmware \"%s\"\n", 1830 fw_name); 1831 goto out; 1832 } 1833 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); 1834 if (err) { 1835 dev_err(adev->dev, 1836 "Failed to validate gpu_info firmware \"%s\"\n", 1837 fw_name); 1838 goto out; 1839 } 1840 1841 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 1842 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 1843 1844 switch (hdr->version_major) { 1845 case 1: 1846 { 1847 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 1848 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 1849 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1850 1851 /* 1852 * Should be droped when DAL no longer needs it. 
 */
		if (adev->asic_type == CHIP_NAVI12)
			goto parse_soc_bounding_box;

		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
		adev->gfx.config.max_texture_channel_caches =
			le32_to_cpu(gpu_info_fw->gc_num_tccs);
		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
		adev->gfx.config.double_offchip_lds_buf =
			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
		adev->gfx.cu_info.max_waves_per_simd =
			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
		adev->gfx.cu_info.max_scratch_slots_per_cu =
			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
		if (hdr->version_minor >= 1) {
			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
					le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->gfx.config.num_sc_per_sh =
				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
			adev->gfx.config.num_packer_per_sc =
				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
		}

parse_soc_bounding_box:
		/*
		 * soc bounding box info is not integrated in the discovery table,
		 * we always need to parse it from gpu info firmware if needed.
		 */
		if (hdr->version_minor == 2) {
			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
					le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
		}
		break;
	}
	default:
		dev_err(adev->dev,
			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
		err = -EINVAL;
		goto out;
	}
out:
	return err;
}

/**
 * amdgpu_device_ip_early_init - run early init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Early initialization pass for hardware IPs. The hardware IPs that make
 * up each asic are discovered and each IP's early_init callback is run. This
 * is the first stage in initializing the asic.
 * Returns 0 on success, negative error code on failure.
1917 */ 1918 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 1919 { 1920 int i, r; 1921 1922 amdgpu_device_enable_virtual_display(adev); 1923 1924 if (amdgpu_sriov_vf(adev)) { 1925 r = amdgpu_virt_request_full_gpu(adev, true); 1926 if (r) 1927 return r; 1928 } 1929 1930 switch (adev->asic_type) { 1931 #ifdef CONFIG_DRM_AMDGPU_SI 1932 case CHIP_VERDE: 1933 case CHIP_TAHITI: 1934 case CHIP_PITCAIRN: 1935 case CHIP_OLAND: 1936 case CHIP_HAINAN: 1937 adev->family = AMDGPU_FAMILY_SI; 1938 r = si_set_ip_blocks(adev); 1939 if (r) 1940 return r; 1941 break; 1942 #endif 1943 #ifdef CONFIG_DRM_AMDGPU_CIK 1944 case CHIP_BONAIRE: 1945 case CHIP_HAWAII: 1946 case CHIP_KAVERI: 1947 case CHIP_KABINI: 1948 case CHIP_MULLINS: 1949 if (adev->flags & AMD_IS_APU) 1950 adev->family = AMDGPU_FAMILY_KV; 1951 else 1952 adev->family = AMDGPU_FAMILY_CI; 1953 1954 r = cik_set_ip_blocks(adev); 1955 if (r) 1956 return r; 1957 break; 1958 #endif 1959 case CHIP_TOPAZ: 1960 case CHIP_TONGA: 1961 case CHIP_FIJI: 1962 case CHIP_POLARIS10: 1963 case CHIP_POLARIS11: 1964 case CHIP_POLARIS12: 1965 case CHIP_VEGAM: 1966 case CHIP_CARRIZO: 1967 case CHIP_STONEY: 1968 if (adev->flags & AMD_IS_APU) 1969 adev->family = AMDGPU_FAMILY_CZ; 1970 else 1971 adev->family = AMDGPU_FAMILY_VI; 1972 1973 r = vi_set_ip_blocks(adev); 1974 if (r) 1975 return r; 1976 break; 1977 case CHIP_VEGA10: 1978 case CHIP_VEGA12: 1979 case CHIP_VEGA20: 1980 case CHIP_RAVEN: 1981 case CHIP_ARCTURUS: 1982 case CHIP_RENOIR: 1983 if (adev->flags & AMD_IS_APU) 1984 adev->family = AMDGPU_FAMILY_RV; 1985 else 1986 adev->family = AMDGPU_FAMILY_AI; 1987 1988 r = soc15_set_ip_blocks(adev); 1989 if (r) 1990 return r; 1991 break; 1992 case CHIP_NAVI10: 1993 case CHIP_NAVI14: 1994 case CHIP_NAVI12: 1995 case CHIP_SIENNA_CICHLID: 1996 case CHIP_NAVY_FLOUNDER: 1997 adev->family = AMDGPU_FAMILY_NV; 1998 1999 r = nv_set_ip_blocks(adev); 2000 if (r) 2001 return r; 2002 break; 2003 default: 2004 /* FIXME: not supported yet */ 2005 return -EINVAL; 2006 } 2007 2008 amdgpu_amdkfd_device_probe(adev); 2009 2010 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2011 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2012 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2013 2014 for (i = 0; i < adev->num_ip_blocks; i++) { 2015 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2016 DRM_ERROR("disabled ip block: %d <%s>\n", 2017 i, adev->ip_blocks[i].version->funcs->name); 2018 adev->ip_blocks[i].status.valid = false; 2019 } else { 2020 if (adev->ip_blocks[i].version->funcs->early_init) { 2021 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2022 if (r == -ENOENT) { 2023 adev->ip_blocks[i].status.valid = false; 2024 } else if (r) { 2025 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2026 adev->ip_blocks[i].version->funcs->name, r); 2027 return r; 2028 } else { 2029 adev->ip_blocks[i].status.valid = true; 2030 } 2031 } else { 2032 adev->ip_blocks[i].status.valid = true; 2033 } 2034 } 2035 /* get the vbios after the asic_funcs are set up */ 2036 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2037 r = amdgpu_device_parse_gpu_info_fw(adev); 2038 if (r) 2039 return r; 2040 2041 /* Read BIOS */ 2042 if (!amdgpu_get_bios(adev)) 2043 return -EINVAL; 2044 2045 r = amdgpu_atombios_init(adev); 2046 if (r) { 2047 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2048 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2049 return r; 2050 } 2051 } 2052 } 2053 2054 adev->cg_flags &= amdgpu_cg_mask; 2055 
adev->pg_flags &= amdgpu_pg_mask; 2056 2057 return 0; 2058 } 2059 2060 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2061 { 2062 int i, r; 2063 2064 for (i = 0; i < adev->num_ip_blocks; i++) { 2065 if (!adev->ip_blocks[i].status.sw) 2066 continue; 2067 if (adev->ip_blocks[i].status.hw) 2068 continue; 2069 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2070 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2071 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2072 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2073 if (r) { 2074 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2075 adev->ip_blocks[i].version->funcs->name, r); 2076 return r; 2077 } 2078 adev->ip_blocks[i].status.hw = true; 2079 } 2080 } 2081 2082 return 0; 2083 } 2084 2085 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2086 { 2087 int i, r; 2088 2089 for (i = 0; i < adev->num_ip_blocks; i++) { 2090 if (!adev->ip_blocks[i].status.sw) 2091 continue; 2092 if (adev->ip_blocks[i].status.hw) 2093 continue; 2094 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2095 if (r) { 2096 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2097 adev->ip_blocks[i].version->funcs->name, r); 2098 return r; 2099 } 2100 adev->ip_blocks[i].status.hw = true; 2101 } 2102 2103 return 0; 2104 } 2105 2106 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2107 { 2108 int r = 0; 2109 int i; 2110 uint32_t smu_version; 2111 2112 if (adev->asic_type >= CHIP_VEGA10) { 2113 for (i = 0; i < adev->num_ip_blocks; i++) { 2114 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2115 continue; 2116 2117 /* no need to do the fw loading again if already done*/ 2118 if (adev->ip_blocks[i].status.hw == true) 2119 break; 2120 2121 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2122 r = adev->ip_blocks[i].version->funcs->resume(adev); 2123 if (r) { 2124 DRM_ERROR("resume of IP block <%s> failed %d\n", 2125 adev->ip_blocks[i].version->funcs->name, r); 2126 return r; 2127 } 2128 } else { 2129 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2130 if (r) { 2131 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2132 adev->ip_blocks[i].version->funcs->name, r); 2133 return r; 2134 } 2135 } 2136 2137 adev->ip_blocks[i].status.hw = true; 2138 break; 2139 } 2140 } 2141 2142 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2143 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2144 2145 return r; 2146 } 2147 2148 /** 2149 * amdgpu_device_ip_init - run init for hardware IPs 2150 * 2151 * @adev: amdgpu_device pointer 2152 * 2153 * Main initialization pass for hardware IPs. The list of all the hardware 2154 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2155 * are run. sw_init initializes the software state associated with each IP 2156 * and hw_init initializes the hardware associated with each IP. 2157 * Returns 0 on success, negative error code on failure. 
2158 */ 2159 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2160 { 2161 int i, r; 2162 2163 r = amdgpu_ras_init(adev); 2164 if (r) 2165 return r; 2166 2167 for (i = 0; i < adev->num_ip_blocks; i++) { 2168 if (!adev->ip_blocks[i].status.valid) 2169 continue; 2170 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2171 if (r) { 2172 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2173 adev->ip_blocks[i].version->funcs->name, r); 2174 goto init_failed; 2175 } 2176 adev->ip_blocks[i].status.sw = true; 2177 2178 /* need to do gmc hw init early so we can allocate gpu mem */ 2179 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2180 r = amdgpu_device_vram_scratch_init(adev); 2181 if (r) { 2182 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2183 goto init_failed; 2184 } 2185 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2186 if (r) { 2187 DRM_ERROR("hw_init %d failed %d\n", i, r); 2188 goto init_failed; 2189 } 2190 r = amdgpu_device_wb_init(adev); 2191 if (r) { 2192 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2193 goto init_failed; 2194 } 2195 adev->ip_blocks[i].status.hw = true; 2196 2197 /* right after GMC hw init, we create CSA */ 2198 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2199 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2200 AMDGPU_GEM_DOMAIN_VRAM, 2201 AMDGPU_CSA_SIZE); 2202 if (r) { 2203 DRM_ERROR("allocate CSA failed %d\n", r); 2204 goto init_failed; 2205 } 2206 } 2207 } 2208 } 2209 2210 if (amdgpu_sriov_vf(adev)) 2211 amdgpu_virt_init_data_exchange(adev); 2212 2213 r = amdgpu_ib_pool_init(adev); 2214 if (r) { 2215 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2216 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2217 goto init_failed; 2218 } 2219 2220 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init completes */ 2221 if (r) 2222 goto init_failed; 2223 2224 r = amdgpu_device_ip_hw_init_phase1(adev); 2225 if (r) 2226 goto init_failed; 2227 2228 r = amdgpu_device_fw_loading(adev); 2229 if (r) 2230 goto init_failed; 2231 2232 r = amdgpu_device_ip_hw_init_phase2(adev); 2233 if (r) 2234 goto init_failed; 2235 2236 /* 2237 * Retired pages will be loaded from eeprom and reserved here; 2238 * this should be called after amdgpu_device_ip_hw_init_phase2, since 2239 * for some ASICs the RAS EEPROM code relies on the SMU being fully 2240 * functional for I2C communication, which is only true at this point. 2241 * 2242 * amdgpu_ras_recovery_init may fail, but the upper layer only cares 2243 * about failures caused by a bad gpu situation and stops the amdgpu 2244 * init process accordingly. For other failures, it still releases all 2245 * the resources and prints an error message, rather than returning a 2246 * negative value to the upper level. 2247 * 2248 * Note: theoretically, this should be called before all vram allocations 2249 * to protect retired pages from being reused. 2250 */ 2251 r = amdgpu_ras_recovery_init(adev); 2252 if (r) 2253 goto init_failed; 2254 2255 if (adev->gmc.xgmi.num_physical_nodes > 1) 2256 amdgpu_xgmi_add_device(adev); 2257 amdgpu_amdkfd_device_init(adev); 2258 2259 amdgpu_fru_get_product_info(adev); 2260 2261 init_failed: 2262 if (amdgpu_sriov_vf(adev)) 2263 amdgpu_virt_release_full_gpu(adev, true); 2264 2265 return r; 2266 } 2267 2268 /** 2269 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2270 * 2271 * @adev: amdgpu_device pointer 2272 * 2273 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2274 * this function before a GPU reset. 
If the value is retained after a 2275 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 2276 */ 2277 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2278 { 2279 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2280 } 2281 2282 /** 2283 * amdgpu_device_check_vram_lost - check if vram is valid 2284 * 2285 * @adev: amdgpu_device pointer 2286 * 2287 * Checks the reset magic value written to the gart pointer in VRAM. 2288 * The driver calls this after a GPU reset to see if the contents of 2289 * VRAM have been lost or not. 2290 * returns true if vram is lost, false if not. 2291 */ 2292 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2293 { 2294 if (memcmp(adev->gart.ptr, adev->reset_magic, 2295 AMDGPU_RESET_MAGIC_NUM)) 2296 return true; 2297 2298 if (!amdgpu_in_reset(adev)) 2299 return false; 2300 2301 /* 2302 * For all ASICs with baco/mode1 reset, the VRAM is 2303 * always assumed to be lost. 2304 */ 2305 switch (amdgpu_asic_reset_method(adev)) { 2306 case AMD_RESET_METHOD_BACO: 2307 case AMD_RESET_METHOD_MODE1: 2308 return true; 2309 default: 2310 return false; 2311 } 2312 } 2313 2314 /** 2315 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2316 * 2317 * @adev: amdgpu_device pointer 2318 * @state: clockgating state (gate or ungate) 2319 * 2320 * The list of all the hardware IPs that make up the asic is walked and the 2321 * set_clockgating_state callbacks are run. The late initialization pass 2322 * enables clockgating for hardware IPs; the fini or suspend pass disables 2323 * clockgating for hardware IPs. 2324 * Returns 0 on success, negative error code on failure. 2325 */ 2326 2327 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2328 enum amd_clockgating_state state) 2329 { 2330 int i, j, r; 2331 2332 if (amdgpu_emu_mode == 1) 2333 return 0; 2334 2335 for (j = 0; j < adev->num_ip_blocks; j++) { 2336 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2337 if (!adev->ip_blocks[i].status.late_initialized) 2338 continue; 2339 /* skip CG for VCE/UVD, it's handled specially */ 2340 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2341 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2342 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2343 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2344 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2345 /* enable clockgating to save power */ 2346 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2347 state); 2348 if (r) { 2349 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2350 adev->ip_blocks[i].version->funcs->name, r); 2351 return r; 2352 } 2353 } 2354 } 2355 2356 return 0; 2357 } 2358 2359 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state) 2360 { 2361 int i, j, r; 2362 2363 if (amdgpu_emu_mode == 1) 2364 return 0; 2365 2366 for (j = 0; j < adev->num_ip_blocks; j++) { 2367 i = state == AMD_PG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1; 2368 if (!adev->ip_blocks[i].status.late_initialized) 2369 continue; 2370 /* skip PG for VCE/UVD, it's handled specially */ 2371 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2372 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2373 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2374 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2375 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2376 /* enable powergating to save power */ 2377 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2378 state); 2379 if (r) { 2380 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2381 adev->ip_blocks[i].version->funcs->name, r); 2382 return r; 2383 } 2384 } 2385 } 2386 return 0; 2387 } 2388 2389 static int amdgpu_device_enable_mgpu_fan_boost(void) 2390 { 2391 struct amdgpu_gpu_instance *gpu_ins; 2392 struct amdgpu_device *adev; 2393 int i, ret = 0; 2394 2395 mutex_lock(&mgpu_info.mutex); 2396 2397 /* 2398 * MGPU fan boost feature should be enabled 2399 * only when there are two or more dGPUs in 2400 * the system 2401 */ 2402 if (mgpu_info.num_dgpu < 2) 2403 goto out; 2404 2405 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2406 gpu_ins = &(mgpu_info.gpu_ins[i]); 2407 adev = gpu_ins->adev; 2408 if (!(adev->flags & AMD_IS_APU) && 2409 !gpu_ins->mgpu_fan_enabled) { 2410 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2411 if (ret) 2412 break; 2413 2414 gpu_ins->mgpu_fan_enabled = 1; 2415 } 2416 } 2417 2418 out: 2419 mutex_unlock(&mgpu_info.mutex); 2420 2421 return ret; 2422 } 2423 2424 /** 2425 * amdgpu_device_ip_late_init - run late init for hardware IPs 2426 * 2427 * @adev: amdgpu_device pointer 2428 * 2429 * Late initialization pass for hardware IPs. The list of all the hardware 2430 * IPs that make up the asic is walked and the late_init callbacks are run. 2431 * late_init covers any special initialization that an IP requires 2432 * after all of the IP blocks have been initialized or something that needs to happen 2433 * late in the init process. 2434 * Returns 0 on success, negative error code on failure. 2435 */ 2436 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2437 { 2438 struct amdgpu_gpu_instance *gpu_instance; 2439 int i = 0, r; 2440 2441 for (i = 0; i < adev->num_ip_blocks; i++) { 2442 if (!adev->ip_blocks[i].status.hw) 2443 continue; 2444 if (adev->ip_blocks[i].version->funcs->late_init) { 2445 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2446 if (r) { 2447 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2448 adev->ip_blocks[i].version->funcs->name, r); 2449 return r; 2450 } 2451 } 2452 adev->ip_blocks[i].status.late_initialized = true; 2453 } 2454 2455 amdgpu_ras_set_error_query_ready(adev, true); 2456 2457 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2458 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2459 2460 amdgpu_device_fill_reset_magic(adev); 2461 2462 r = amdgpu_device_enable_mgpu_fan_boost(); 2463 if (r) 2464 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2465 2466 2467 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2468 mutex_lock(&mgpu_info.mutex); 2469 2470 /* 2471 * Reset device p-state to low as this was booted with high. 2472 * 2473 * This should be performed only after all devices from the same 2474 * hive get initialized. 2475 * 2476 * However, the number of devices in the hive is not known in 2477 * advance, as it is counted one by one during device initialization. 
2478 * 2479 * So, we wait for all XGMI interlinked devices initialized. 2480 * This may bring some delays as those devices may come from 2481 * different hives. But that should be OK. 2482 */ 2483 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2484 for (i = 0; i < mgpu_info.num_gpu; i++) { 2485 gpu_instance = &(mgpu_info.gpu_ins[i]); 2486 if (gpu_instance->adev->flags & AMD_IS_APU) 2487 continue; 2488 2489 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2490 AMDGPU_XGMI_PSTATE_MIN); 2491 if (r) { 2492 DRM_ERROR("pstate setting failed (%d).\n", r); 2493 break; 2494 } 2495 } 2496 } 2497 2498 mutex_unlock(&mgpu_info.mutex); 2499 } 2500 2501 return 0; 2502 } 2503 2504 /** 2505 * amdgpu_device_ip_fini - run fini for hardware IPs 2506 * 2507 * @adev: amdgpu_device pointer 2508 * 2509 * Main teardown pass for hardware IPs. The list of all the hardware 2510 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2511 * are run. hw_fini tears down the hardware associated with each IP 2512 * and sw_fini tears down any software state associated with each IP. 2513 * Returns 0 on success, negative error code on failure. 2514 */ 2515 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2516 { 2517 int i, r; 2518 2519 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2520 amdgpu_virt_release_ras_err_handler_data(adev); 2521 2522 amdgpu_ras_pre_fini(adev); 2523 2524 if (adev->gmc.xgmi.num_physical_nodes > 1) 2525 amdgpu_xgmi_remove_device(adev); 2526 2527 amdgpu_amdkfd_device_fini(adev); 2528 2529 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2530 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2531 2532 /* need to disable SMC first */ 2533 for (i = 0; i < adev->num_ip_blocks; i++) { 2534 if (!adev->ip_blocks[i].status.hw) 2535 continue; 2536 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2537 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2538 /* XXX handle errors */ 2539 if (r) { 2540 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2541 adev->ip_blocks[i].version->funcs->name, r); 2542 } 2543 adev->ip_blocks[i].status.hw = false; 2544 break; 2545 } 2546 } 2547 2548 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2549 if (!adev->ip_blocks[i].status.hw) 2550 continue; 2551 2552 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2553 /* XXX handle errors */ 2554 if (r) { 2555 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2556 adev->ip_blocks[i].version->funcs->name, r); 2557 } 2558 2559 adev->ip_blocks[i].status.hw = false; 2560 } 2561 2562 2563 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2564 if (!adev->ip_blocks[i].status.sw) 2565 continue; 2566 2567 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2568 amdgpu_ucode_free_bo(adev); 2569 amdgpu_free_static_csa(&adev->virt.csa_obj); 2570 amdgpu_device_wb_fini(adev); 2571 amdgpu_device_vram_scratch_fini(adev); 2572 amdgpu_ib_pool_fini(adev); 2573 } 2574 2575 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2576 /* XXX handle errors */ 2577 if (r) { 2578 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2579 adev->ip_blocks[i].version->funcs->name, r); 2580 } 2581 adev->ip_blocks[i].status.sw = false; 2582 adev->ip_blocks[i].status.valid = false; 2583 } 2584 2585 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2586 if (!adev->ip_blocks[i].status.late_initialized) 2587 continue; 2588 if (adev->ip_blocks[i].version->funcs->late_fini) 2589 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2590 
adev->ip_blocks[i].status.late_initialized = false; 2591 } 2592 2593 amdgpu_ras_fini(adev); 2594 2595 if (amdgpu_sriov_vf(adev)) 2596 if (amdgpu_virt_release_full_gpu(adev, false)) 2597 DRM_ERROR("failed to release exclusive mode on fini\n"); 2598 2599 return 0; 2600 } 2601 2602 /** 2603 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2604 * 2605 * @work: work_struct. 2606 */ 2607 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2608 { 2609 struct amdgpu_device *adev = 2610 container_of(work, struct amdgpu_device, delayed_init_work.work); 2611 int r; 2612 2613 r = amdgpu_ib_ring_tests(adev); 2614 if (r) 2615 DRM_ERROR("ib ring test failed (%d).\n", r); 2616 } 2617 2618 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2619 { 2620 struct amdgpu_device *adev = 2621 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2622 2623 mutex_lock(&adev->gfx.gfx_off_mutex); 2624 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { 2625 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2626 adev->gfx.gfx_off_state = true; 2627 } 2628 mutex_unlock(&adev->gfx.gfx_off_mutex); 2629 } 2630 2631 /** 2632 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2633 * 2634 * @adev: amdgpu_device pointer 2635 * 2636 * Main suspend function for hardware IPs. The list of all the hardware 2637 * IPs that make up the asic is walked, clockgating is disabled and the 2638 * suspend callbacks are run. suspend puts the hardware and software state 2639 * in each IP into a state suitable for suspend. 2640 * Returns 0 on success, negative error code on failure. 2641 */ 2642 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2643 { 2644 int i, r; 2645 2646 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2647 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2648 2649 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2650 if (!adev->ip_blocks[i].status.valid) 2651 continue; 2652 2653 /* displays are handled separately */ 2654 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2655 continue; 2656 2657 /* XXX handle errors */ 2658 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2659 /* XXX handle errors */ 2660 if (r) { 2661 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2662 adev->ip_blocks[i].version->funcs->name, r); 2663 return r; 2664 } 2665 2666 adev->ip_blocks[i].status.hw = false; 2667 } 2668 2669 return 0; 2670 } 2671 2672 /** 2673 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2674 * 2675 * @adev: amdgpu_device pointer 2676 * 2677 * Main suspend function for hardware IPs. The list of all the hardware 2678 * IPs that make up the asic is walked, clockgating is disabled and the 2679 * suspend callbacks are run. suspend puts the hardware and software state 2680 * in each IP into a state suitable for suspend. 2681 * Returns 0 on success, negative error code on failure. 
2682 */ 2683 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2684 { 2685 int i, r; 2686 2687 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2688 if (!adev->ip_blocks[i].status.valid) 2689 continue; 2690 /* displays are handled in phase1 */ 2691 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2692 continue; 2693 /* PSP lost connection when err_event_athub occurs */ 2694 if (amdgpu_ras_intr_triggered() && 2695 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2696 adev->ip_blocks[i].status.hw = false; 2697 continue; 2698 } 2699 /* XXX handle errors */ 2700 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2701 /* XXX handle errors */ 2702 if (r) { 2703 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2704 adev->ip_blocks[i].version->funcs->name, r); 2705 } 2706 adev->ip_blocks[i].status.hw = false; 2707 /* handle putting the SMC in the appropriate state */ 2708 if(!amdgpu_sriov_vf(adev)){ 2709 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2710 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 2711 if (r) { 2712 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 2713 adev->mp1_state, r); 2714 return r; 2715 } 2716 } 2717 } 2718 adev->ip_blocks[i].status.hw = false; 2719 } 2720 2721 return 0; 2722 } 2723 2724 /** 2725 * amdgpu_device_ip_suspend - run suspend for hardware IPs 2726 * 2727 * @adev: amdgpu_device pointer 2728 * 2729 * Main suspend function for hardware IPs. The list of all the hardware 2730 * IPs that make up the asic is walked, clockgating is disabled and the 2731 * suspend callbacks are run. suspend puts the hardware and software state 2732 * in each IP into a state suitable for suspend. 2733 * Returns 0 on success, negative error code on failure. 2734 */ 2735 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 2736 { 2737 int r; 2738 2739 if (amdgpu_sriov_vf(adev)) 2740 amdgpu_virt_request_full_gpu(adev, false); 2741 2742 r = amdgpu_device_ip_suspend_phase1(adev); 2743 if (r) 2744 return r; 2745 r = amdgpu_device_ip_suspend_phase2(adev); 2746 2747 if (amdgpu_sriov_vf(adev)) 2748 amdgpu_virt_release_full_gpu(adev, false); 2749 2750 return r; 2751 } 2752 2753 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 2754 { 2755 int i, r; 2756 2757 static enum amd_ip_block_type ip_order[] = { 2758 AMD_IP_BLOCK_TYPE_GMC, 2759 AMD_IP_BLOCK_TYPE_COMMON, 2760 AMD_IP_BLOCK_TYPE_PSP, 2761 AMD_IP_BLOCK_TYPE_IH, 2762 }; 2763 2764 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2765 int j; 2766 struct amdgpu_ip_block *block; 2767 2768 block = &adev->ip_blocks[i]; 2769 block->status.hw = false; 2770 2771 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 2772 2773 if (block->version->type != ip_order[j] || 2774 !block->status.valid) 2775 continue; 2776 2777 r = block->version->funcs->hw_init(adev); 2778 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2779 if (r) 2780 return r; 2781 block->status.hw = true; 2782 } 2783 } 2784 2785 return 0; 2786 } 2787 2788 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 2789 { 2790 int i, r; 2791 2792 static enum amd_ip_block_type ip_order[] = { 2793 AMD_IP_BLOCK_TYPE_SMC, 2794 AMD_IP_BLOCK_TYPE_DCE, 2795 AMD_IP_BLOCK_TYPE_GFX, 2796 AMD_IP_BLOCK_TYPE_SDMA, 2797 AMD_IP_BLOCK_TYPE_UVD, 2798 AMD_IP_BLOCK_TYPE_VCE, 2799 AMD_IP_BLOCK_TYPE_VCN 2800 }; 2801 2802 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2803 int j; 2804 struct amdgpu_ip_block *block; 2805 2806 for (j = 0; j < adev->num_ip_blocks; j++) { 
2807 block = &adev->ip_blocks[j]; 2808 2809 if (block->version->type != ip_order[i] || 2810 !block->status.valid || 2811 block->status.hw) 2812 continue; 2813 2814 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 2815 r = block->version->funcs->resume(adev); 2816 else 2817 r = block->version->funcs->hw_init(adev); 2818 2819 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2820 if (r) 2821 return r; 2822 block->status.hw = true; 2823 } 2824 } 2825 2826 return 0; 2827 } 2828 2829 /** 2830 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 2831 * 2832 * @adev: amdgpu_device pointer 2833 * 2834 * First resume function for hardware IPs. The list of all the hardware 2835 * IPs that make up the asic is walked and the resume callbacks are run for 2836 * COMMON, GMC, and IH. resume puts the hardware into a functional state 2837 * after a suspend and updates the software state as necessary. This 2838 * function is also used for restoring the GPU after a GPU reset. 2839 * Returns 0 on success, negative error code on failure. 2840 */ 2841 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 2842 { 2843 int i, r; 2844 2845 for (i = 0; i < adev->num_ip_blocks; i++) { 2846 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2847 continue; 2848 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2849 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2850 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2851 2852 r = adev->ip_blocks[i].version->funcs->resume(adev); 2853 if (r) { 2854 DRM_ERROR("resume of IP block <%s> failed %d\n", 2855 adev->ip_blocks[i].version->funcs->name, r); 2856 return r; 2857 } 2858 adev->ip_blocks[i].status.hw = true; 2859 } 2860 } 2861 2862 return 0; 2863 } 2864 2865 /** 2866 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 2867 * 2868 * @adev: amdgpu_device pointer 2869 * 2870 * First resume function for hardware IPs. The list of all the hardware 2871 * IPs that make up the asic is walked and the resume callbacks are run for 2872 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 2873 * functional state after a suspend and updates the software state as 2874 * necessary. This function is also used for restoring the GPU after a GPU 2875 * reset. 2876 * Returns 0 on success, negative error code on failure. 2877 */ 2878 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 2879 { 2880 int i, r; 2881 2882 for (i = 0; i < adev->num_ip_blocks; i++) { 2883 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2884 continue; 2885 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2886 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2887 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 2888 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 2889 continue; 2890 r = adev->ip_blocks[i].version->funcs->resume(adev); 2891 if (r) { 2892 DRM_ERROR("resume of IP block <%s> failed %d\n", 2893 adev->ip_blocks[i].version->funcs->name, r); 2894 return r; 2895 } 2896 adev->ip_blocks[i].status.hw = true; 2897 } 2898 2899 return 0; 2900 } 2901 2902 /** 2903 * amdgpu_device_ip_resume - run resume for hardware IPs 2904 * 2905 * @adev: amdgpu_device pointer 2906 * 2907 * Main resume function for hardware IPs. 
The hardware IPs 2908 * are split into two resume functions because they are 2909 * also used in recovering from a GPU reset and some additional 2910 * steps need to be taken between them. In this case (S3/S4) they are 2911 * run sequentially. 2912 * Returns 0 on success, negative error code on failure. 2913 */ 2914 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 2915 { 2916 int r; 2917 2918 r = amdgpu_device_ip_resume_phase1(adev); 2919 if (r) 2920 return r; 2921 2922 r = amdgpu_device_fw_loading(adev); 2923 if (r) 2924 return r; 2925 2926 r = amdgpu_device_ip_resume_phase2(adev); 2927 2928 return r; 2929 } 2930 2931 /** 2932 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 2933 * 2934 * @adev: amdgpu_device pointer 2935 * 2936 * Query the VBIOS data tables to determine if the board supports SR-IOV. 2937 */ 2938 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 2939 { 2940 if (amdgpu_sriov_vf(adev)) { 2941 if (adev->is_atom_fw) { 2942 if (amdgpu_atomfirmware_gpu_supports_virtualization(adev)) 2943 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 2944 } else { 2945 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 2946 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 2947 } 2948 2949 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 2950 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 2951 } 2952 } 2953 2954 /** 2955 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 2956 * 2957 * @asic_type: AMD asic type 2958 * 2959 * Check if there is DC (new modesetting infrastructure) support for an asic. 2960 * returns true if DC has support, false if not. 2961 */ 2962 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 2963 { 2964 switch (asic_type) { 2965 #if defined(CONFIG_DRM_AMD_DC) 2966 #if defined(CONFIG_DRM_AMD_DC_SI) 2967 case CHIP_TAHITI: 2968 case CHIP_PITCAIRN: 2969 case CHIP_VERDE: 2970 case CHIP_OLAND: 2971 #endif 2972 case CHIP_BONAIRE: 2973 case CHIP_KAVERI: 2974 case CHIP_KABINI: 2975 case CHIP_MULLINS: 2976 /* 2977 * We have systems in the wild with these ASICs that require 2978 * LVDS and VGA support which is not supported with DC. 2979 * 2980 * Fall back to the non-DC driver here by default so as not to 2981 * cause regressions. 
2982 */ 2983 return amdgpu_dc > 0; 2984 case CHIP_HAWAII: 2985 case CHIP_CARRIZO: 2986 case CHIP_STONEY: 2987 case CHIP_POLARIS10: 2988 case CHIP_POLARIS11: 2989 case CHIP_POLARIS12: 2990 case CHIP_VEGAM: 2991 case CHIP_TONGA: 2992 case CHIP_FIJI: 2993 case CHIP_VEGA10: 2994 case CHIP_VEGA12: 2995 case CHIP_VEGA20: 2996 #if defined(CONFIG_DRM_AMD_DC_DCN) 2997 case CHIP_RAVEN: 2998 case CHIP_NAVI10: 2999 case CHIP_NAVI14: 3000 case CHIP_NAVI12: 3001 case CHIP_RENOIR: 3002 #endif 3003 #if defined(CONFIG_DRM_AMD_DC_DCN3_0) 3004 case CHIP_SIENNA_CICHLID: 3005 case CHIP_NAVY_FLOUNDER: 3006 #endif 3007 return amdgpu_dc != 0; 3008 #endif 3009 default: 3010 if (amdgpu_dc > 0) 3011 DRM_INFO("Display Core has been requested via kernel parameter " 3012 "but isn't supported by ASIC, ignoring\n"); 3013 return false; 3014 } 3015 } 3016 3017 /** 3018 * amdgpu_device_has_dc_support - check if dc is supported 3019 * 3020 * @adev: amdgpu_device pointer 3021 * 3022 * Returns true for supported, false for not supported 3023 */ 3024 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3025 { 3026 if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display) 3027 return false; 3028 3029 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3030 } 3031 3032 3033 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3034 { 3035 struct amdgpu_device *adev = 3036 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3037 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3038 3039 /* It's a bug to not have a hive within this function */ 3040 if (WARN_ON(!hive)) 3041 return; 3042 3043 /* 3044 * Use task barrier to synchronize all xgmi reset works across the 3045 * hive. task_barrier_enter and task_barrier_exit will block 3046 * until all the threads running the xgmi reset works reach 3047 * those points. task_barrier_full will do both blocks. 3048 */ 3049 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3050 3051 task_barrier_enter(&hive->tb); 3052 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3053 3054 if (adev->asic_reset_res) 3055 goto fail; 3056 3057 task_barrier_exit(&hive->tb); 3058 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3059 3060 if (adev->asic_reset_res) 3061 goto fail; 3062 3063 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count) 3064 adev->mmhub.funcs->reset_ras_error_count(adev); 3065 } else { 3066 3067 task_barrier_full(&hive->tb); 3068 adev->asic_reset_res = amdgpu_asic_reset(adev); 3069 } 3070 3071 fail: 3072 if (adev->asic_reset_res) 3073 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3074 adev->asic_reset_res, adev_to_drm(adev)->unique); 3075 amdgpu_put_xgmi_hive(hive); 3076 } 3077 3078 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3079 { 3080 char *input = amdgpu_lockup_timeout; 3081 char *timeout_setting = NULL; 3082 int index = 0; 3083 long timeout; 3084 int ret = 0; 3085 3086 /* 3087 * By default timeout for non compute jobs is 10000. 3088 * And there is no timeout enforced on compute jobs. 3089 * In SR-IOV or passthrough mode, timeout for compute 3090 * jobs are 60000 by default. 
3091 */ 3092 adev->gfx_timeout = msecs_to_jiffies(10000); 3093 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3094 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3095 adev->compute_timeout = msecs_to_jiffies(60000); 3096 else 3097 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; 3098 3099 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3100 while ((timeout_setting = strsep(&input, ",")) && 3101 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3102 ret = kstrtol(timeout_setting, 0, &timeout); 3103 if (ret) 3104 return ret; 3105 3106 if (timeout == 0) { 3107 index++; 3108 continue; 3109 } else if (timeout < 0) { 3110 timeout = MAX_SCHEDULE_TIMEOUT; 3111 } else { 3112 timeout = msecs_to_jiffies(timeout); 3113 } 3114 3115 switch (index++) { 3116 case 0: 3117 adev->gfx_timeout = timeout; 3118 break; 3119 case 1: 3120 adev->compute_timeout = timeout; 3121 break; 3122 case 2: 3123 adev->sdma_timeout = timeout; 3124 break; 3125 case 3: 3126 adev->video_timeout = timeout; 3127 break; 3128 default: 3129 break; 3130 } 3131 } 3132 /* 3133 * There is only one value specified and 3134 * it should apply to all non-compute jobs. 3135 */ 3136 if (index == 1) { 3137 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3138 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3139 adev->compute_timeout = adev->gfx_timeout; 3140 } 3141 } 3142 3143 return ret; 3144 } 3145 3146 static const struct attribute *amdgpu_dev_attributes[] = { 3147 &dev_attr_product_name.attr, 3148 &dev_attr_product_number.attr, 3149 &dev_attr_serial_number.attr, 3150 &dev_attr_pcie_replay_count.attr, 3151 NULL 3152 }; 3153 3154 3155 /** 3156 * amdgpu_device_init - initialize the driver 3157 * 3158 * @adev: amdgpu_device pointer 3159 * @flags: driver flags 3160 * 3161 * Initializes the driver info and hw (all asics). 3162 * Returns 0 for success or an error on failure. 3163 * Called at driver startup. 
3164 */ 3165 int amdgpu_device_init(struct amdgpu_device *adev, 3166 uint32_t flags) 3167 { 3168 struct drm_device *ddev = adev_to_drm(adev); 3169 struct pci_dev *pdev = adev->pdev; 3170 int r, i; 3171 bool boco = false; 3172 u32 max_MBps; 3173 3174 adev->shutdown = false; 3175 adev->flags = flags; 3176 3177 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3178 adev->asic_type = amdgpu_force_asic_type; 3179 else 3180 adev->asic_type = flags & AMD_ASIC_MASK; 3181 3182 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3183 if (amdgpu_emu_mode == 1) 3184 adev->usec_timeout *= 10; 3185 adev->gmc.gart_size = 512 * 1024 * 1024; 3186 adev->accel_working = false; 3187 adev->num_rings = 0; 3188 adev->mman.buffer_funcs = NULL; 3189 adev->mman.buffer_funcs_ring = NULL; 3190 adev->vm_manager.vm_pte_funcs = NULL; 3191 adev->vm_manager.vm_pte_num_scheds = 0; 3192 adev->gmc.gmc_funcs = NULL; 3193 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3194 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3195 3196 adev->smc_rreg = &amdgpu_invalid_rreg; 3197 adev->smc_wreg = &amdgpu_invalid_wreg; 3198 adev->pcie_rreg = &amdgpu_invalid_rreg; 3199 adev->pcie_wreg = &amdgpu_invalid_wreg; 3200 adev->pciep_rreg = &amdgpu_invalid_rreg; 3201 adev->pciep_wreg = &amdgpu_invalid_wreg; 3202 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3203 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3204 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3205 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3206 adev->didt_rreg = &amdgpu_invalid_rreg; 3207 adev->didt_wreg = &amdgpu_invalid_wreg; 3208 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3209 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3210 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3211 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3212 3213 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3214 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3215 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3216 3217 /* mutex initialization are all done here so we 3218 * can recall function without having locking issues */ 3219 atomic_set(&adev->irq.ih.lock, 0); 3220 mutex_init(&adev->firmware.mutex); 3221 mutex_init(&adev->pm.mutex); 3222 mutex_init(&adev->gfx.gpu_clock_mutex); 3223 mutex_init(&adev->srbm_mutex); 3224 mutex_init(&adev->gfx.pipe_reserve_mutex); 3225 mutex_init(&adev->gfx.gfx_off_mutex); 3226 mutex_init(&adev->grbm_idx_mutex); 3227 mutex_init(&adev->mn_lock); 3228 mutex_init(&adev->virt.vf_errors.lock); 3229 hash_init(adev->mn_hash); 3230 atomic_set(&adev->in_gpu_reset, 0); 3231 init_rwsem(&adev->reset_sem); 3232 mutex_init(&adev->psp.mutex); 3233 mutex_init(&adev->notifier_lock); 3234 3235 r = amdgpu_device_check_arguments(adev); 3236 if (r) 3237 return r; 3238 3239 spin_lock_init(&adev->mmio_idx_lock); 3240 spin_lock_init(&adev->smc_idx_lock); 3241 spin_lock_init(&adev->pcie_idx_lock); 3242 spin_lock_init(&adev->uvd_ctx_idx_lock); 3243 spin_lock_init(&adev->didt_idx_lock); 3244 spin_lock_init(&adev->gc_cac_idx_lock); 3245 spin_lock_init(&adev->se_cac_idx_lock); 3246 spin_lock_init(&adev->audio_endpt_idx_lock); 3247 spin_lock_init(&adev->mm_stats.lock); 3248 3249 INIT_LIST_HEAD(&adev->shadow_list); 3250 mutex_init(&adev->shadow_list_lock); 3251 3252 INIT_DELAYED_WORK(&adev->delayed_init_work, 3253 amdgpu_device_delayed_init_work_handler); 3254 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3255 amdgpu_device_delay_enable_gfx_off); 3256 3257 
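/*
 * Note: xgmi_reset_work runs amdgpu_device_xgmi_reset_func() (defined
 * above), which uses the hive's task barrier to synchronize a BACO or
 * full ASIC reset across all devices in the same XGMI hive.
 */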
INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3258 3259 adev->gfx.gfx_off_req_count = 1; 3260 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3261 3262 atomic_set(&adev->throttling_logging_enabled, 1); 3263 /* 3264 * If throttling continues, logging will be performed every minute 3265 * to avoid log flooding. "-1" is subtracted since the thermal 3266 * throttling interrupt comes every second. Thus, the total logging 3267 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 3268 * for the throttling interrupt) = 60 seconds. 3269 */ 3270 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3271 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3272 3273 /* Registers mapping */ 3274 /* TODO: block userspace mapping of io register */ 3275 if (adev->asic_type >= CHIP_BONAIRE) { 3276 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3277 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3278 } else { 3279 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3280 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3281 } 3282 3283 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3284 if (adev->rmmio == NULL) { 3285 return -ENOMEM; 3286 } 3287 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3288 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3289 3290 /* io port mapping */ 3291 for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { 3292 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) { 3293 adev->rio_mem_size = pci_resource_len(adev->pdev, i); 3294 adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size); 3295 break; 3296 } 3297 } 3298 if (adev->rio_mem == NULL) 3299 DRM_INFO("PCI I/O BAR is not found.\n"); 3300 3301 /* enable PCIE atomic ops */ 3302 r = pci_enable_atomic_ops_to_root(adev->pdev, 3303 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3304 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3305 if (r) { 3306 adev->have_atomics_support = false; 3307 DRM_INFO("PCIE atomic ops is not supported\n"); 3308 } else { 3309 adev->have_atomics_support = true; 3310 } 3311 3312 amdgpu_device_get_pcie_info(adev); 3313 3314 if (amdgpu_mcbp) 3315 DRM_INFO("MCBP is enabled\n"); 3316 3317 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) 3318 adev->enable_mes = true; 3319 3320 /* detect hw virtualization here */ 3321 amdgpu_detect_virtualization(adev); 3322 3323 r = amdgpu_device_get_job_timeout_settings(adev); 3324 if (r) { 3325 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3326 goto failed_unmap; 3327 } 3328 3329 /* early init functions */ 3330 r = amdgpu_device_ip_early_init(adev); 3331 if (r) 3332 goto failed_unmap; 3333 3334 /* doorbell bar mapping and doorbell index init */ 3335 amdgpu_device_doorbell_init(adev); 3336 3337 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3338 /* this will fail for cards that aren't VGA class devices, just 3339 * ignore it */ 3340 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode); 3341 3342 if (amdgpu_device_supports_boco(ddev)) 3343 boco = true; 3344 if (amdgpu_has_atpx() && 3345 (amdgpu_is_atpx_hybrid() || 3346 amdgpu_has_atpx_dgpu_power_cntl()) && 3347 !pci_is_thunderbolt_attached(adev->pdev)) 3348 vga_switcheroo_register_client(adev->pdev, 3349 &amdgpu_switcheroo_ops, boco); 3350 if (boco) 3351 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3352 3353 if (amdgpu_emu_mode == 1) { 3354 /* post the asic on emulation mode */ 3355 emu_soc_asic_init(adev); 3356 goto 
fence_driver_init; 3357 } 3358 3359 /* detect if we are with an SRIOV vbios */ 3360 amdgpu_device_detect_sriov_bios(adev); 3361 3362 /* check if we need to reset the asic 3363 * E.g., driver was not cleanly unloaded previously, etc. 3364 */ 3365 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3366 r = amdgpu_asic_reset(adev); 3367 if (r) { 3368 dev_err(adev->dev, "asic reset on init failed\n"); 3369 goto failed; 3370 } 3371 } 3372 3373 pci_enable_pcie_error_reporting(adev->ddev.pdev); 3374 3375 /* Post card if necessary */ 3376 if (amdgpu_device_need_post(adev)) { 3377 if (!adev->bios) { 3378 dev_err(adev->dev, "no vBIOS found\n"); 3379 r = -EINVAL; 3380 goto failed; 3381 } 3382 DRM_INFO("GPU posting now...\n"); 3383 r = amdgpu_device_asic_init(adev); 3384 if (r) { 3385 dev_err(adev->dev, "gpu post error!\n"); 3386 goto failed; 3387 } 3388 } 3389 3390 if (adev->is_atom_fw) { 3391 /* Initialize clocks */ 3392 r = amdgpu_atomfirmware_get_clock_info(adev); 3393 if (r) { 3394 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3395 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3396 goto failed; 3397 } 3398 } else { 3399 /* Initialize clocks */ 3400 r = amdgpu_atombios_get_clock_info(adev); 3401 if (r) { 3402 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3403 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3404 goto failed; 3405 } 3406 /* init i2c buses */ 3407 if (!amdgpu_device_has_dc_support(adev)) 3408 amdgpu_atombios_i2c_init(adev); 3409 } 3410 3411 fence_driver_init: 3412 /* Fence driver */ 3413 r = amdgpu_fence_driver_init(adev); 3414 if (r) { 3415 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n"); 3416 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3417 goto failed; 3418 } 3419 3420 /* init the mode config */ 3421 drm_mode_config_init(adev_to_drm(adev)); 3422 3423 r = amdgpu_device_ip_init(adev); 3424 if (r) { 3425 /* failed in exclusive mode due to timeout */ 3426 if (amdgpu_sriov_vf(adev) && 3427 !amdgpu_sriov_runtime(adev) && 3428 amdgpu_virt_mmio_blocked(adev) && 3429 !amdgpu_virt_wait_reset(adev)) { 3430 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3431 /* Don't send request since VF is inactive. */ 3432 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3433 adev->virt.ops = NULL; 3434 r = -EAGAIN; 3435 goto failed; 3436 } 3437 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3438 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3439 goto failed; 3440 } 3441 3442 dev_info(adev->dev, 3443 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3444 adev->gfx.config.max_shader_engines, 3445 adev->gfx.config.max_sh_per_se, 3446 adev->gfx.config.max_cu_per_sh, 3447 adev->gfx.cu_info.number); 3448 3449 adev->accel_working = true; 3450 3451 amdgpu_vm_check_compute_bug(adev); 3452 3453 /* Initialize the buffer migration limit. */ 3454 if (amdgpu_moverate >= 0) 3455 max_MBps = amdgpu_moverate; 3456 else 3457 max_MBps = 8; /* Allow 8 MB/s. */ 3458 /* Get a log2 for easy divisions. 
*/ 3459 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3460 3461 amdgpu_fbdev_init(adev); 3462 3463 r = amdgpu_pm_sysfs_init(adev); 3464 if (r) { 3465 adev->pm_sysfs_en = false; 3466 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3467 } else 3468 adev->pm_sysfs_en = true; 3469 3470 r = amdgpu_ucode_sysfs_init(adev); 3471 if (r) { 3472 adev->ucode_sysfs_en = false; 3473 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3474 } else 3475 adev->ucode_sysfs_en = true; 3476 3477 if ((amdgpu_testing & 1)) { 3478 if (adev->accel_working) 3479 amdgpu_test_moves(adev); 3480 else 3481 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n"); 3482 } 3483 if (amdgpu_benchmarking) { 3484 if (adev->accel_working) 3485 amdgpu_benchmark(adev, amdgpu_benchmarking); 3486 else 3487 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); 3488 } 3489 3490 /* 3491 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3492 * Otherwise the mgpu fan boost feature will be skipped due to the 3493 * gpu instance is counted less. 3494 */ 3495 amdgpu_register_gpu_instance(adev); 3496 3497 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3498 * explicit gating rather than handling it automatically. 3499 */ 3500 r = amdgpu_device_ip_late_init(adev); 3501 if (r) { 3502 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3503 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3504 goto failed; 3505 } 3506 3507 /* must succeed. */ 3508 amdgpu_ras_resume(adev); 3509 3510 queue_delayed_work(system_wq, &adev->delayed_init_work, 3511 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3512 3513 if (amdgpu_sriov_vf(adev)) 3514 flush_delayed_work(&adev->delayed_init_work); 3515 3516 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3517 if (r) 3518 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3519 3520 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3521 r = amdgpu_pmu_init(adev); 3522 if (r) 3523 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3524 3525 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3526 if (amdgpu_device_cache_pci_state(adev->pdev)) 3527 pci_restore_state(pdev); 3528 3529 return 0; 3530 3531 failed: 3532 amdgpu_vf_error_trans_all(adev); 3533 if (boco) 3534 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3535 3536 failed_unmap: 3537 iounmap(adev->rmmio); 3538 adev->rmmio = NULL; 3539 3540 return r; 3541 } 3542 3543 /** 3544 * amdgpu_device_fini - tear down the driver 3545 * 3546 * @adev: amdgpu_device pointer 3547 * 3548 * Tear down the driver info (all asics). 3549 * Called at driver shutdown. 
3550 */ 3551 void amdgpu_device_fini(struct amdgpu_device *adev) 3552 { 3553 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3554 flush_delayed_work(&adev->delayed_init_work); 3555 adev->shutdown = true; 3556 3557 kfree(adev->pci_state); 3558 3559 /* make sure IB test finished before entering exclusive mode 3560 * to avoid preemption on IB test 3561 * */ 3562 if (amdgpu_sriov_vf(adev)) { 3563 amdgpu_virt_request_full_gpu(adev, false); 3564 amdgpu_virt_fini_data_exchange(adev); 3565 } 3566 3567 /* disable all interrupts */ 3568 amdgpu_irq_disable_all(adev); 3569 if (adev->mode_info.mode_config_initialized){ 3570 if (!amdgpu_device_has_dc_support(adev)) 3571 drm_helper_force_disable_all(adev_to_drm(adev)); 3572 else 3573 drm_atomic_helper_shutdown(adev_to_drm(adev)); 3574 } 3575 amdgpu_fence_driver_fini(adev); 3576 if (adev->pm_sysfs_en) 3577 amdgpu_pm_sysfs_fini(adev); 3578 amdgpu_fbdev_fini(adev); 3579 amdgpu_device_ip_fini(adev); 3580 release_firmware(adev->firmware.gpu_info_fw); 3581 adev->firmware.gpu_info_fw = NULL; 3582 adev->accel_working = false; 3583 /* free i2c buses */ 3584 if (!amdgpu_device_has_dc_support(adev)) 3585 amdgpu_i2c_fini(adev); 3586 3587 if (amdgpu_emu_mode != 1) 3588 amdgpu_atombios_fini(adev); 3589 3590 kfree(adev->bios); 3591 adev->bios = NULL; 3592 if (amdgpu_has_atpx() && 3593 (amdgpu_is_atpx_hybrid() || 3594 amdgpu_has_atpx_dgpu_power_cntl()) && 3595 !pci_is_thunderbolt_attached(adev->pdev)) 3596 vga_switcheroo_unregister_client(adev->pdev); 3597 if (amdgpu_device_supports_boco(adev_to_drm(adev))) 3598 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3599 vga_client_register(adev->pdev, NULL, NULL, NULL); 3600 if (adev->rio_mem) 3601 pci_iounmap(adev->pdev, adev->rio_mem); 3602 adev->rio_mem = NULL; 3603 iounmap(adev->rmmio); 3604 adev->rmmio = NULL; 3605 amdgpu_device_doorbell_fini(adev); 3606 3607 if (adev->ucode_sysfs_en) 3608 amdgpu_ucode_sysfs_fini(adev); 3609 3610 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 3611 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3612 amdgpu_pmu_fini(adev); 3613 if (adev->mman.discovery_bin) 3614 amdgpu_discovery_fini(adev); 3615 } 3616 3617 3618 /* 3619 * Suspend & resume. 3620 */ 3621 /** 3622 * amdgpu_device_suspend - initiate device suspend 3623 * 3624 * @dev: drm dev pointer 3625 * @fbcon : notify the fbdev of suspend 3626 * 3627 * Puts the hw in the suspend state (all asics). 3628 * Returns 0 for success or an error on failure. 3629 * Called at driver suspend. 
3630 */ 3631 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 3632 { 3633 struct amdgpu_device *adev; 3634 struct drm_crtc *crtc; 3635 struct drm_connector *connector; 3636 struct drm_connector_list_iter iter; 3637 int r; 3638 3639 adev = drm_to_adev(dev); 3640 3641 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3642 return 0; 3643 3644 adev->in_suspend = true; 3645 drm_kms_helper_poll_disable(dev); 3646 3647 if (fbcon) 3648 amdgpu_fbdev_set_suspend(adev, 1); 3649 3650 cancel_delayed_work_sync(&adev->delayed_init_work); 3651 3652 if (!amdgpu_device_has_dc_support(adev)) { 3653 /* turn off display hw */ 3654 drm_modeset_lock_all(dev); 3655 drm_connector_list_iter_begin(dev, &iter); 3656 drm_for_each_connector_iter(connector, &iter) 3657 drm_helper_connector_dpms(connector, 3658 DRM_MODE_DPMS_OFF); 3659 drm_connector_list_iter_end(&iter); 3660 drm_modeset_unlock_all(dev); 3661 /* unpin the front buffers and cursors */ 3662 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3663 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3664 struct drm_framebuffer *fb = crtc->primary->fb; 3665 struct amdgpu_bo *robj; 3666 3667 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3668 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3669 r = amdgpu_bo_reserve(aobj, true); 3670 if (r == 0) { 3671 amdgpu_bo_unpin(aobj); 3672 amdgpu_bo_unreserve(aobj); 3673 } 3674 } 3675 3676 if (fb == NULL || fb->obj[0] == NULL) { 3677 continue; 3678 } 3679 robj = gem_to_amdgpu_bo(fb->obj[0]); 3680 /* don't unpin kernel fb objects */ 3681 if (!amdgpu_fbdev_robj_is_fb(adev, robj)) { 3682 r = amdgpu_bo_reserve(robj, true); 3683 if (r == 0) { 3684 amdgpu_bo_unpin(robj); 3685 amdgpu_bo_unreserve(robj); 3686 } 3687 } 3688 } 3689 } 3690 3691 amdgpu_ras_suspend(adev); 3692 3693 r = amdgpu_device_ip_suspend_phase1(adev); 3694 3695 amdgpu_amdkfd_suspend(adev, !fbcon); 3696 3697 /* evict vram memory */ 3698 amdgpu_bo_evict_vram(adev); 3699 3700 amdgpu_fence_driver_suspend(adev); 3701 3702 r = amdgpu_device_ip_suspend_phase2(adev); 3703 3704 /* evict remaining vram memory 3705 * This second call to evict vram is to evict the gart page table 3706 * using the CPU. 3707 */ 3708 amdgpu_bo_evict_vram(adev); 3709 3710 return 0; 3711 } 3712 3713 /** 3714 * amdgpu_device_resume - initiate device resume 3715 * 3716 * @dev: drm dev pointer 3717 * @fbcon : notify the fbdev of resume 3718 * 3719 * Bring the hw back to operating state (all asics). 3720 * Returns 0 for success or an error on failure. 3721 * Called at driver resume. 
3722 */ 3723 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 3724 { 3725 struct drm_connector *connector; 3726 struct drm_connector_list_iter iter; 3727 struct amdgpu_device *adev = drm_to_adev(dev); 3728 struct drm_crtc *crtc; 3729 int r = 0; 3730 3731 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3732 return 0; 3733 3734 /* post card */ 3735 if (amdgpu_device_need_post(adev)) { 3736 r = amdgpu_device_asic_init(adev); 3737 if (r) 3738 dev_err(adev->dev, "amdgpu asic init failed\n"); 3739 } 3740 3741 r = amdgpu_device_ip_resume(adev); 3742 if (r) { 3743 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 3744 return r; 3745 } 3746 amdgpu_fence_driver_resume(adev); 3747 3748 3749 r = amdgpu_device_ip_late_init(adev); 3750 if (r) 3751 return r; 3752 3753 queue_delayed_work(system_wq, &adev->delayed_init_work, 3754 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3755 3756 if (!amdgpu_device_has_dc_support(adev)) { 3757 /* pin cursors */ 3758 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3759 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3760 3761 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3762 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3763 r = amdgpu_bo_reserve(aobj, true); 3764 if (r == 0) { 3765 r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM); 3766 if (r != 0) 3767 dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r); 3768 amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj); 3769 amdgpu_bo_unreserve(aobj); 3770 } 3771 } 3772 } 3773 } 3774 r = amdgpu_amdkfd_resume(adev, !fbcon); 3775 if (r) 3776 return r; 3777 3778 /* Make sure IB tests flushed */ 3779 flush_delayed_work(&adev->delayed_init_work); 3780 3781 /* blat the mode back in */ 3782 if (fbcon) { 3783 if (!amdgpu_device_has_dc_support(adev)) { 3784 /* pre DCE11 */ 3785 drm_helper_resume_force_mode(dev); 3786 3787 /* turn on display hw */ 3788 drm_modeset_lock_all(dev); 3789 3790 drm_connector_list_iter_begin(dev, &iter); 3791 drm_for_each_connector_iter(connector, &iter) 3792 drm_helper_connector_dpms(connector, 3793 DRM_MODE_DPMS_ON); 3794 drm_connector_list_iter_end(&iter); 3795 3796 drm_modeset_unlock_all(dev); 3797 } 3798 amdgpu_fbdev_set_suspend(adev, 0); 3799 } 3800 3801 drm_kms_helper_poll_enable(dev); 3802 3803 amdgpu_ras_resume(adev); 3804 3805 /* 3806 * Most of the connector probing functions try to acquire runtime pm 3807 * refs to ensure that the GPU is powered on when connector polling is 3808 * performed. Since we're calling this from a runtime PM callback, 3809 * trying to acquire rpm refs will cause us to deadlock. 3810 * 3811 * Since we're guaranteed to be holding the rpm lock, it's safe to 3812 * temporarily disable the rpm helpers so this doesn't deadlock us. 3813 */ 3814 #ifdef CONFIG_PM 3815 dev->dev->power.disable_depth++; 3816 #endif 3817 if (!amdgpu_device_has_dc_support(adev)) 3818 drm_helper_hpd_irq_event(dev); 3819 else 3820 drm_kms_helper_hotplug_event(dev); 3821 #ifdef CONFIG_PM 3822 dev->dev->power.disable_depth--; 3823 #endif 3824 adev->in_suspend = false; 3825 3826 return 0; 3827 } 3828 3829 /** 3830 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 3831 * 3832 * @adev: amdgpu_device pointer 3833 * 3834 * The list of all the hardware IPs that make up the asic is walked and 3835 * the check_soft_reset callbacks are run. check_soft_reset determines 3836 * if the asic is still hung or not. 3837 * Returns true if any of the IPs are still in a hung state, false if not. 
 */
static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
{
	int i;
	bool asic_hang = false;

	if (amdgpu_sriov_vf(adev))
		return true;

	if (amdgpu_asic_need_full_reset(adev))
		return true;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->check_soft_reset)
			adev->ip_blocks[i].status.hang =
				adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
		if (adev->ip_blocks[i].status.hang) {
			dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
			asic_hang = true;
		}
	}
	return asic_hang;
}

/**
 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
 *
 * @adev: amdgpu_device pointer
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
 * handles any IP specific hardware or software state changes that are
 * necessary for a soft reset to succeed.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
{
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].status.hang &&
		    adev->ip_blocks[i].version->funcs->pre_soft_reset) {
			r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
			if (r)
				return r;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
 *
 * @adev: amdgpu_device pointer
 *
 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
 * reset is necessary to recover.
 * Returns true if a full asic reset is required, false if not.
 */
static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
{
	int i;

	if (amdgpu_asic_need_full_reset(adev))
		return true;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
			if (adev->ip_blocks[i].status.hang) {
				dev_info(adev->dev, "Some block need full reset!\n");
				return true;
			}
		}
	}
	return false;
}

/**
 * amdgpu_device_ip_soft_reset - do a soft reset
 *
 * @adev: amdgpu_device pointer
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * soft_reset callbacks are run if the block is hung. soft_reset handles any
 * IP specific hardware or software state changes that are necessary to soft
 * reset the IP.
 * Returns 0 on success, negative error code on failure.
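 *
 * The pre_soft_reset, soft_reset and post_soft_reset phases are driven
 * back-to-back from amdgpu_device_pre_asic_reset() when no full reset is
 * required.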
 */
static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
{
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].status.hang &&
		    adev->ip_blocks[i].version->funcs->soft_reset) {
			r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
			if (r)
				return r;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
 *
 * @adev: amdgpu_device pointer
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
 * handles any IP specific hardware or software state changes that are
 * necessary after the IP has been soft reset.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
{
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].status.hang &&
		    adev->ip_blocks[i].version->funcs->post_soft_reset)
			r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
		if (r)
			return r;
	}

	return 0;
}

/**
 * amdgpu_device_recover_vram - Recover some VRAM contents
 *
 * @adev: amdgpu_device pointer
 *
 * Restores the contents of VRAM buffers from the shadows in GTT. Used to
 * restore things like GPUVM page tables after a GPU reset where
 * the contents of VRAM might be lost.
 *
 * Returns:
 * 0 on success, negative error code on failure.
 */
static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
{
	struct dma_fence *fence = NULL, *next = NULL;
	struct amdgpu_bo *shadow;
	long r = 1, tmo;

	if (amdgpu_sriov_runtime(adev))
		tmo = msecs_to_jiffies(8000);
	else
		tmo = msecs_to_jiffies(100);

	dev_info(adev->dev, "recover vram bo from shadow start\n");
	mutex_lock(&adev->shadow_list_lock);
	list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {

		/* No need to recover an evicted BO */
		if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
		    shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
		    shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
			continue;

		r = amdgpu_bo_restore_shadow(shadow, &next);
		if (r)
			break;

		if (fence) {
			tmo = dma_fence_wait_timeout(fence, false, tmo);
			dma_fence_put(fence);
			fence = next;
			if (tmo == 0) {
				r = -ETIMEDOUT;
				break;
			} else if (tmo < 0) {
				r = tmo;
				break;
			}
		} else {
			fence = next;
		}
	}
	mutex_unlock(&adev->shadow_list_lock);

	if (fence)
		tmo = dma_fence_wait_timeout(fence, false, tmo);
	dma_fence_put(fence);

	if (r < 0 || tmo <= 0) {
		dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
		return -EIO;
	}

	dev_info(adev->dev, "recover vram bo from shadow done\n");
	return 0;
}


/**
 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
 *
 * @adev: amdgpu_device pointer
 * @from_hypervisor: request from hypervisor
 *
 * Do a VF FLR and reinitialize the ASIC.
 * Returns 0 on success, negative error code on failure.
 */
static int
amdgpu_device_reset_sriov(struct amdgpu_device *adev,
			  bool from_hypervisor)
{
	int r;

	if (from_hypervisor)
		r = amdgpu_virt_request_full_gpu(adev, true);
	else
		r = amdgpu_virt_reset_gpu(adev);
	if (r)
		return r;

	amdgpu_amdkfd_pre_reset(adev);

	/* Resume IP prior to SMC */
	r = amdgpu_device_ip_reinit_early_sriov(adev);
	if (r)
		goto error;

	amdgpu_virt_init_data_exchange(adev);
	/* we need to recover the GART prior to running the SMC/CP/SDMA resume */
	amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));

	r = amdgpu_device_fw_loading(adev);
	if (r)
		return r;

	/* now we are okay to resume SMC/CP/SDMA */
	r = amdgpu_device_ip_reinit_late_sriov(adev);
	if (r)
		goto error;

	amdgpu_irq_gpu_reset_resume_helper(adev);
	r = amdgpu_ib_ring_tests(adev);
	amdgpu_amdkfd_post_reset(adev);

error:
	amdgpu_virt_release_full_gpu(adev, true);
	if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
		amdgpu_inc_vram_lost(adev);
		r = amdgpu_device_recover_vram(adev);
	}

	return r;
}

/**
 * amdgpu_device_has_job_running - check if there is any job in the mirror list
 *
 * @adev: amdgpu_device pointer
 *
 * Check if there is any job in the mirror list of any ring.
 */
bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
{
	int i;
	struct drm_sched_job *job;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		spin_lock(&ring->sched.job_list_lock);
		job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
					       struct drm_sched_job, node);
		spin_unlock(&ring->sched.job_list_lock);
		if (job)
			return true;
	}
	return false;
}

/**
 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
 *
 * @adev: amdgpu_device pointer
 *
 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
 * a hung GPU.
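 *
 * Setting amdgpu_gpu_recovery to 0 disables recovery; -1 (auto) enables it
 * only for the ASIC types listed below. SR-IOV VFs always attempt recovery
 * unless it is explicitly disabled.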
 */
bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
{
	if (!amdgpu_device_ip_check_soft_reset(adev)) {
		dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
		return false;
	}

	if (amdgpu_gpu_recovery == 0)
		goto disabled;

	if (amdgpu_sriov_vf(adev))
		return true;

	if (amdgpu_gpu_recovery == -1) {
		switch (adev->asic_type) {
		case CHIP_BONAIRE:
		case CHIP_HAWAII:
		case CHIP_TOPAZ:
		case CHIP_TONGA:
		case CHIP_FIJI:
		case CHIP_POLARIS10:
		case CHIP_POLARIS11:
		case CHIP_POLARIS12:
		case CHIP_VEGAM:
		case CHIP_VEGA20:
		case CHIP_VEGA10:
		case CHIP_VEGA12:
		case CHIP_RAVEN:
		case CHIP_ARCTURUS:
		case CHIP_RENOIR:
		case CHIP_NAVI10:
		case CHIP_NAVI14:
		case CHIP_NAVI12:
		case CHIP_SIENNA_CICHLID:
			break;
		default:
			goto disabled;
		}
	}

	return true;

disabled:
	dev_info(adev->dev, "GPU recovery disabled.\n");
	return false;
}


static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
					struct amdgpu_job *job,
					bool *need_full_reset_arg)
{
	int i, r = 0;
	bool need_full_reset = *need_full_reset_arg;

	amdgpu_debugfs_wait_dump(adev);

	if (amdgpu_sriov_vf(adev)) {
		/* stop the data exchange thread */
		amdgpu_virt_fini_data_exchange(adev);
	}

	/* block all schedulers and reset given job's ring */
	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
		amdgpu_fence_driver_force_completion(ring);
	}

	if (job)
		drm_sched_increase_karma(&job->base);

	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
	if (!amdgpu_sriov_vf(adev)) {

		if (!need_full_reset)
			need_full_reset = amdgpu_device_ip_need_full_reset(adev);

		if (!need_full_reset) {
			amdgpu_device_ip_pre_soft_reset(adev);
			r = amdgpu_device_ip_soft_reset(adev);
			amdgpu_device_ip_post_soft_reset(adev);
			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
				dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
				need_full_reset = true;
			}
		}

		if (need_full_reset)
			r = amdgpu_device_ip_suspend(adev);

		*need_full_reset_arg = need_full_reset;
	}

	return r;
}

static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
				struct list_head *device_list_handle,
				bool *need_full_reset_arg,
				bool skip_hw_reset)
{
	struct amdgpu_device *tmp_adev = NULL;
	bool need_full_reset = *need_full_reset_arg, vram_lost = false;
	int r = 0;

	/*
	 * ASIC reset has to be done on all XGMI hive nodes ASAP
	 * to allow proper links negotiation in FW (within 1 sec)
	 */
	if (!skip_hw_reset && need_full_reset) {
		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
			/* For XGMI run all resets in parallel to speed up the process */
			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
					r = -EALREADY;
			} else
				r = amdgpu_asic_reset(tmp_adev);

			if (r) {
				dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
					r,
					adev_to_drm(tmp_adev)->unique);
				break;
			}
		}

		/* For XGMI wait for all resets to complete before proceed */
		if (!r) {
			list_for_each_entry(tmp_adev, device_list_handle,
					    gmc.xgmi.head) {
				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
					flush_work(&tmp_adev->xgmi_reset_work);
					r = tmp_adev->asic_reset_res;
					if (r)
						break;
				}
			}
		}
	}

	if (!r && amdgpu_ras_intr_triggered()) {
		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
			if (tmp_adev->mmhub.funcs &&
			    tmp_adev->mmhub.funcs->reset_ras_error_count)
				tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
		}

		amdgpu_ras_intr_cleared();
	}

	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		if (need_full_reset) {
			/* post card */
			if (amdgpu_device_asic_init(tmp_adev))
				dev_warn(tmp_adev->dev, "asic atom init failed!");

			if (!r) {
				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
				r = amdgpu_device_ip_resume_phase1(tmp_adev);
				if (r)
					goto out;

				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
				if (vram_lost) {
					DRM_INFO("VRAM is lost due to GPU reset!\n");
					amdgpu_inc_vram_lost(tmp_adev);
				}

				r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
				if (r)
					goto out;

				r = amdgpu_device_fw_loading(tmp_adev);
				if (r)
					return r;

				r = amdgpu_device_ip_resume_phase2(tmp_adev);
				if (r)
					goto out;

				if (vram_lost)
					amdgpu_device_fill_reset_magic(tmp_adev);

				/*
				 * Re-register this ASIC as tracked now that
				 * the reset has completed successfully.
				 */
				amdgpu_register_gpu_instance(tmp_adev);

				r = amdgpu_device_ip_late_init(tmp_adev);
				if (r)
					goto out;

				amdgpu_fbdev_set_suspend(tmp_adev, 0);

				/*
				 * The GPU enters a bad state once the number
				 * of faulty pages flagged by ECC reaches the
				 * threshold, and RAS recovery is scheduled
				 * next. Check here so recovery is aborted if
				 * the threshold is indeed exceeded, and remind
				 * the user to either retire this GPU or set a
				 * larger bad_page_threshold the next time the
				 * driver is probed.
				 */
				if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
					/* must succeed.
					 */
					amdgpu_ras_resume(tmp_adev);
				} else {
					r = -EINVAL;
					goto out;
				}

				/* Update PSP FW topology after reset */
				if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
					r = amdgpu_xgmi_update_topology(hive, tmp_adev);
			}
		}

out:
		if (!r) {
			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
			r = amdgpu_ib_ring_tests(tmp_adev);
			if (r) {
				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
				r = amdgpu_device_ip_suspend(tmp_adev);
				need_full_reset = true;
				r = -EAGAIN;
				goto end;
			}
		}

		if (!r)
			r = amdgpu_device_recover_vram(tmp_adev);
		else
			tmp_adev->asic_reset_res = r;
	}

end:
	*need_full_reset_arg = need_full_reset;
	return r;
}

static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
				    struct amdgpu_hive_info *hive)
{
	if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
		return false;

	if (hive) {
		down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
	} else {
		down_write(&adev->reset_sem);
	}

	atomic_inc(&adev->gpu_reset_counter);
	switch (amdgpu_asic_reset_method(adev)) {
	case AMD_RESET_METHOD_MODE1:
		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
		break;
	case AMD_RESET_METHOD_MODE2:
		adev->mp1_state = PP_MP1_STATE_RESET;
		break;
	default:
		adev->mp1_state = PP_MP1_STATE_NONE;
		break;
	}

	return true;
}

static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
{
	amdgpu_vf_error_trans_all(adev);
	adev->mp1_state = PP_MP1_STATE_NONE;
	atomic_set(&adev->in_gpu_reset, 0);
	up_write(&adev->reset_sem);
}

static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
{
	struct pci_dev *p = NULL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
					adev->pdev->bus->number, 1);
	if (p) {
		pm_runtime_enable(&(p->dev));
		pm_runtime_resume(&(p->dev));
	}
}

static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
{
	enum amd_reset_method reset_method;
	struct pci_dev *p = NULL;
	u64 expires;

	/*
	 * For now, only BACO and mode1 reset are confirmed
	 * to suffer from the audio issue if the audio device
	 * is not properly suspended.
	 */
	reset_method = amdgpu_asic_reset_method(adev);
	if ((reset_method != AMD_RESET_METHOD_BACO) &&
	    (reset_method != AMD_RESET_METHOD_MODE1))
		return -EINVAL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
					adev->pdev->bus->number, 1);
	if (!p)
		return -ENODEV;

	expires = pm_runtime_autosuspend_expiration(&(p->dev));
	if (!expires)
		/*
		 * If we cannot get the audio device autosuspend delay,
		 * a fixed 4s interval is used. The audio controller's
		 * default autosuspend delay is 3s, so 4s is guaranteed
		 * to cover it.
		 */
		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;

	while (!pm_runtime_status_suspended(&(p->dev))) {
		if (!pm_runtime_suspend(&(p->dev)))
			break;

		if (expires < ktime_get_mono_fast_ns()) {
			dev_warn(adev->dev, "failed to suspend display audio\n");
			/* TODO: abort the succeeding gpu reset?
			 */
			return -ETIMEDOUT;
		}
	}

	pm_runtime_disable(&(p->dev));

	return 0;
}

/**
 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
 *
 * @adev: amdgpu_device pointer
 * @job: which job triggered the hang
 *
 * Attempt to reset the GPU if it has hung (all asics).
 * Attempt to do soft-reset or full-reset and reinitialize the ASIC.
 * Returns 0 for success or an error on failure.
 */

int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
			      struct amdgpu_job *job)
{
	struct list_head device_list, *device_list_handle = NULL;
	bool need_full_reset = false;
	bool job_signaled = false;
	struct amdgpu_hive_info *hive = NULL;
	struct amdgpu_device *tmp_adev = NULL;
	int i, r = 0;
	bool need_emergency_restart = false;
	bool audio_suspended = false;

	/*
	 * Special case: RAS triggered and full reset isn't supported
	 */
	need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);

	/*
	 * Flush RAM to disk so that after reboot
	 * the user can read log and see why the system rebooted.
	 */
	if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
		DRM_WARN("Emergency reboot.");

		ksys_sync_helper();
		emergency_restart();
	}

	dev_info(adev->dev, "GPU %s begin!\n",
		 need_emergency_restart ? "jobs stop":"reset");

	/*
	 * Here we trylock to avoid a chain of resets being triggered either
	 * by jobs on different adevs in an XGMI hive or by jobs on different
	 * schedulers for the same device while this TO handler is running.
	 * We always reset all schedulers for a device and all devices in an
	 * XGMI hive, so that should take care of them too.
	 */
	hive = amdgpu_get_xgmi_hive(adev);
	if (hive) {
		if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
			DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
				 job ? job->base.id : -1, hive->hive_id);
			amdgpu_put_xgmi_hive(hive);
			return 0;
		}
		mutex_lock(&hive->hive_lock);
	}

	/*
	 * Build list of devices to reset.
	 * In case we are in XGMI hive mode, resort the device list
	 * to put adev in the 1st position.
	 */
	INIT_LIST_HEAD(&device_list);
	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (!hive)
			return -ENODEV;
		if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
			list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
		device_list_handle = &hive->device_list;
	} else {
		list_add_tail(&adev->gmc.xgmi.head, &device_list);
		device_list_handle = &device_list;
	}

	/* block all schedulers and reset given job's ring */
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
			dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
				 job ? job->base.id : -1);
			r = 0;
			goto skip_recovery;
		}

		/*
		 * Try to put the audio codec into suspend state
		 * before gpu reset started.
		 *
		 * The power domain of the graphics device is shared
		 * with the AZ power domain. Without this, we may
		 * change the audio hardware from behind the audio
		 * driver's back and trigger audio codec errors.
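		 *
		 * amdgpu_device_suspend_display_audio() returns 0 only when
		 * the audio function (PCI function 1 on the same bus) was
		 * runtime-suspended; audio_suspended is checked later so the
		 * codec is resumed once recovery finishes.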
		 */
		if (!amdgpu_device_suspend_display_audio(tmp_adev))
			audio_suspended = true;

		amdgpu_ras_set_error_query_ready(tmp_adev, false);

		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);

		if (!amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_pre_reset(tmp_adev);

		/*
		 * Mark these ASICs to be reset as untracked first,
		 * and add them back after the reset completes.
		 */
		amdgpu_unregister_gpu_instance(tmp_adev);

		amdgpu_fbdev_set_suspend(tmp_adev, 1);

		/* disable ras on ALL IPs */
		if (!need_emergency_restart &&
		    amdgpu_device_ip_need_full_reset(tmp_adev))
			amdgpu_ras_suspend(tmp_adev);

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, job ? &job->base : NULL);

			if (need_emergency_restart)
				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
		}
	}

	if (need_emergency_restart)
		goto skip_sched_resume;

	/*
	 * Must check guilty signal here since after this point all old
	 * HW fences are force signaled.
	 *
	 * job->base holds a reference to parent fence
	 */
	if (job && job->base.s_fence->parent &&
	    dma_fence_is_signaled(job->base.s_fence->parent)) {
		job_signaled = true;
		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
		goto skip_hw_reset;
	}

retry:	/* Rest of adevs pre asic reset from XGMI hive. */
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		r = amdgpu_device_pre_asic_reset(tmp_adev,
						 (tmp_adev == adev) ? job : NULL,
						 &need_full_reset);
		/* TODO Should we stop ? */
		if (r) {
			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
				r, adev_to_drm(tmp_adev)->unique);
			tmp_adev->asic_reset_res = r;
		}
	}

	/* Actual ASIC resets if needed. */
	/* TODO Implement XGMI hive reset logic for SRIOV */
	if (amdgpu_sriov_vf(adev)) {
		r = amdgpu_device_reset_sriov(adev, job ? false : true);
		if (r)
			adev->asic_reset_res = r;
	} else {
		r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
		if (r && r == -EAGAIN)
			goto retry;
	}

skip_hw_reset:

	/* Post ASIC reset for all devs. */
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			/* No point to resubmit jobs if we didn't HW reset */
			if (!tmp_adev->asic_reset_res && !job_signaled)
				drm_sched_resubmit_jobs(&ring->sched);

			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
		}

		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
		}

		tmp_adev->asic_reset_res = 0;

		if (r) {
			/* bad news, how to tell it to userspace ?
			 */
			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
		}
	}

skip_sched_resume:
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		/* unlock kfd: SRIOV would do it separately */
		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);
		if (audio_suspended)
			amdgpu_device_resume_display_audio(tmp_adev);
		amdgpu_device_unlock_adev(tmp_adev);
	}

skip_recovery:
	if (hive) {
		atomic_set(&hive->in_reset, 0);
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}

	if (r)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
	return r;
}

/**
 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
 */
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
{
	struct pci_dev *pdev;
	enum pci_bus_speed speed_cap, platform_speed_cap;
	enum pcie_link_width platform_link_width;

	if (amdgpu_pcie_gen_cap)
		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;

	if (amdgpu_pcie_lane_cap)
		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;

	/* covers APUs as well */
	if (pci_is_root_bus(adev->pdev->bus)) {
		if (adev->pm.pcie_gen_mask == 0)
			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
		if (adev->pm.pcie_mlw_mask == 0)
			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
		return;
	}

	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
		return;

	pcie_bandwidth_available(adev->pdev, NULL,
				 &platform_speed_cap, &platform_link_width);

	if (adev->pm.pcie_gen_mask == 0) {
		/* asic caps */
		pdev = adev->pdev;
		speed_cap = pcie_get_speed_cap(pdev);
		if (speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
		} else {
			if (speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
		}
		/* platform caps */
		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
		} else {
			if (platform_speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;

		}
	}
	if (adev->pm.pcie_mlw_mask == 0) {
		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
		} else {
			switch (platform_link_width) {
			case PCIE_LNK_X32:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X16:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X12:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X8:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X4:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X2:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X1:
				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
				break;
			default:
				break;
			}
		}
	}
}

int amdgpu_device_baco_enter(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
		return -ENOTSUPP;

	if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

	return amdgpu_dpm_baco_enter(adev);
}

int amdgpu_device_baco_exit(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
		return -ENOTSUPP;

	ret = amdgpu_dpm_baco_exit(adev);
	if (ret)
		return ret;

	if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

	return 0;
}

static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
{
	int i;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring =
			adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		cancel_delayed_work_sync(&ring->sched.work_tdr);
	}
}

/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called when a PCI error is detected.
 *
 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		DRM_WARN("No support for XGMI hive yet...");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Cancel and wait for all TDRs in progress if failing to
		 * set adev->in_gpu_reset in amdgpu_device_lock_adev
		 *
		 * Locking adev->reset_sem will prevent any external access
		 * to GPU during PCI error recovery
		 */
		while (!amdgpu_device_lock_adev(adev, NULL))
			amdgpu_cancel_all_tdr(adev);

		/*
		 * Block any work scheduling as we do for regular GPU reset
		 * for the duration of the recovery
		 */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, NULL);
		}
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{

	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/* This called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, no need to reset slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the pci error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
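 *
 * The flow below restores the saved PCI config space, polls
 * amdgpu_asic_get_config_memsize() until the ASIC responds, then reuses the
 * regular reset path with skip_hw_reset set, since the slot reset already
 * reset the hardware.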
4978 */ 4979 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 4980 { 4981 struct drm_device *dev = pci_get_drvdata(pdev); 4982 struct amdgpu_device *adev = drm_to_adev(dev); 4983 int r, i; 4984 bool need_full_reset = true; 4985 u32 memsize; 4986 struct list_head device_list; 4987 4988 DRM_INFO("PCI error: slot reset callback!!\n"); 4989 4990 INIT_LIST_HEAD(&device_list); 4991 list_add_tail(&adev->gmc.xgmi.head, &device_list); 4992 4993 /* wait for asic to come out of reset */ 4994 msleep(500); 4995 4996 /* Restore PCI confspace */ 4997 amdgpu_device_load_pci_state(pdev); 4998 4999 /* confirm ASIC came out of reset */ 5000 for (i = 0; i < adev->usec_timeout; i++) { 5001 memsize = amdgpu_asic_get_config_memsize(adev); 5002 5003 if (memsize != 0xffffffff) 5004 break; 5005 udelay(1); 5006 } 5007 if (memsize == 0xffffffff) { 5008 r = -ETIME; 5009 goto out; 5010 } 5011 5012 adev->in_pci_err_recovery = true; 5013 r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset); 5014 adev->in_pci_err_recovery = false; 5015 if (r) 5016 goto out; 5017 5018 r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true); 5019 5020 out: 5021 if (!r) { 5022 if (amdgpu_device_cache_pci_state(adev->pdev)) 5023 pci_restore_state(adev->pdev); 5024 5025 DRM_INFO("PCIe error recovery succeeded\n"); 5026 } else { 5027 DRM_ERROR("PCIe error recovery failed, err:%d", r); 5028 amdgpu_device_unlock_adev(adev); 5029 } 5030 5031 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 5032 } 5033 5034 /** 5035 * amdgpu_pci_resume() - resume normal ops after PCI reset 5036 * @pdev: pointer to PCI device 5037 * 5038 * Called when the error recovery driver tells us that its 5039 * OK to resume normal operation. Use completion to allow 5040 * halted scsi ops to resume. 5041 */ 5042 void amdgpu_pci_resume(struct pci_dev *pdev) 5043 { 5044 struct drm_device *dev = pci_get_drvdata(pdev); 5045 struct amdgpu_device *adev = drm_to_adev(dev); 5046 int i; 5047 5048 5049 DRM_INFO("PCI error: resume callback!!\n"); 5050 5051 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5052 struct amdgpu_ring *ring = adev->rings[i]; 5053 5054 if (!ring || !ring->sched.thread) 5055 continue; 5056 5057 5058 drm_sched_resubmit_jobs(&ring->sched); 5059 drm_sched_start(&ring->sched, true); 5060 } 5061 5062 amdgpu_device_unlock_adev(adev); 5063 } 5064 5065 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 5066 { 5067 struct drm_device *dev = pci_get_drvdata(pdev); 5068 struct amdgpu_device *adev = drm_to_adev(dev); 5069 int r; 5070 5071 r = pci_save_state(pdev); 5072 if (!r) { 5073 kfree(adev->pci_state); 5074 5075 adev->pci_state = pci_store_saved_state(pdev); 5076 5077 if (!adev->pci_state) { 5078 DRM_ERROR("Failed to store PCI saved state"); 5079 return false; 5080 } 5081 } else { 5082 DRM_WARN("Failed to save PCI state, err:%d\n", r); 5083 return false; 5084 } 5085 5086 return true; 5087 } 5088 5089 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 5090 { 5091 struct drm_device *dev = pci_get_drvdata(pdev); 5092 struct amdgpu_device *adev = drm_to_adev(dev); 5093 int r; 5094 5095 if (!adev->pci_state) 5096 return false; 5097 5098 r = pci_load_saved_state(pdev, adev->pci_state); 5099 5100 if (!r) { 5101 pci_restore_state(pdev); 5102 } else { 5103 DRM_WARN("Failed to load PCI state, err:%d\n", r); 5104 return false; 5105 } 5106 5107 return true; 5108 } 5109 5110 5111