1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 34 #include <drm/drm_atomic_helper.h> 35 #include <drm/drm_probe_helper.h> 36 #include <drm/amdgpu_drm.h> 37 #include <linux/vgaarb.h> 38 #include <linux/vga_switcheroo.h> 39 #include <linux/efi.h> 40 #include "amdgpu.h" 41 #include "amdgpu_trace.h" 42 #include "amdgpu_i2c.h" 43 #include "atom.h" 44 #include "amdgpu_atombios.h" 45 #include "amdgpu_atomfirmware.h" 46 #include "amd_pcie.h" 47 #ifdef CONFIG_DRM_AMDGPU_SI 48 #include "si.h" 49 #endif 50 #ifdef CONFIG_DRM_AMDGPU_CIK 51 #include "cik.h" 52 #endif 53 #include "vi.h" 54 #include "soc15.h" 55 #include "nv.h" 56 #include "bif/bif_4_1_d.h" 57 #include <linux/pci.h> 58 #include <linux/firmware.h> 59 #include "amdgpu_vf_error.h" 60 61 #include "amdgpu_amdkfd.h" 62 #include "amdgpu_pm.h" 63 64 #include "amdgpu_xgmi.h" 65 #include "amdgpu_ras.h" 66 #include "amdgpu_pmu.h" 67 #include "amdgpu_fru_eeprom.h" 68 69 #include <linux/suspend.h> 70 #include <drm/task_barrier.h> 71 #include <linux/pm_runtime.h> 72 73 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 74 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 75 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 76 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 77 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 78 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 79 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin"); 80 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin"); 81 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin"); 82 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 83 84 #define AMDGPU_RESUME_MS 2000 85 86 const char *amdgpu_asic_name[] = { 87 "TAHITI", 88 "PITCAIRN", 89 "VERDE", 90 "OLAND", 91 "HAINAN", 92 "BONAIRE", 93 "KAVERI", 94 "KABINI", 95 "HAWAII", 96 "MULLINS", 97 "TOPAZ", 98 "TONGA", 99 "FIJI", 100 "CARRIZO", 101 "STONEY", 102 "POLARIS10", 103 "POLARIS11", 104 "POLARIS12", 105 "VEGAM", 106 "VEGA10", 107 "VEGA12", 108 "VEGA20", 109 "RAVEN", 110 "ARCTURUS", 111 "RENOIR", 112 "NAVI10", 113 "NAVI14", 114 "NAVI12", 115 "SIENNA_CICHLID", 116 "NAVY_FLOUNDER", 117 "LAST", 118 }; 119 120 /** 121 * DOC: pcie_replay_count 122 * 123 * The amdgpu driver provides a sysfs API for 
reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
		amdgpu_device_get_pcie_replay_count, NULL);

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * DOC: product_name
 *
 * The amdgpu driver provides a sysfs API for reporting the product name
 * for the device.
 * The file product_name is used for this and returns the product name
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_name(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
}

static DEVICE_ATTR(product_name, S_IRUGO,
		amdgpu_device_get_product_name, NULL);

/**
 * DOC: product_number
 *
 * The amdgpu driver provides a sysfs API for reporting the part number
 * for the device.
 * The file product_number is used for this and returns the part number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
}

static DEVICE_ATTR(product_number, S_IRUGO,
		amdgpu_device_get_product_number, NULL);

/**
 * DOC: serial_number
 *
 * The amdgpu driver provides a sysfs API for reporting the serial number
 * for the device.
 * The file serial_number is used for this and returns the serial number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_serial_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
}

static DEVICE_ATTR(serial_number, S_IRUGO,
		amdgpu_device_get_serial_number, NULL);

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with HG/PX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->flags & AMD_IS_PX)
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise return false.
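 *
 * BACO (Bus Active, Chip Off) support is reported by the ASIC-specific
 * amdgpu_asic_supports_baco() callback; this helper is only a thin wrapper
 * around it.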
234 */ 235 bool amdgpu_device_supports_baco(struct drm_device *dev) 236 { 237 struct amdgpu_device *adev = drm_to_adev(dev); 238 239 return amdgpu_asic_supports_baco(adev); 240 } 241 242 /** 243 * VRAM access helper functions. 244 * 245 * amdgpu_device_vram_access - read/write a buffer in vram 246 * 247 * @adev: amdgpu_device pointer 248 * @pos: offset of the buffer in vram 249 * @buf: virtual address of the buffer in system memory 250 * @size: read/write size, sizeof(@buf) must > @size 251 * @write: true - write to vram, otherwise - read from vram 252 */ 253 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, 254 uint32_t *buf, size_t size, bool write) 255 { 256 unsigned long flags; 257 uint32_t hi = ~0; 258 uint64_t last; 259 260 261 #ifdef CONFIG_64BIT 262 last = min(pos + size, adev->gmc.visible_vram_size); 263 if (last > pos) { 264 void __iomem *addr = adev->mman.aper_base_kaddr + pos; 265 size_t count = last - pos; 266 267 if (write) { 268 memcpy_toio(addr, buf, count); 269 mb(); 270 amdgpu_asic_flush_hdp(adev, NULL); 271 } else { 272 amdgpu_asic_invalidate_hdp(adev, NULL); 273 mb(); 274 memcpy_fromio(buf, addr, count); 275 } 276 277 if (count == size) 278 return; 279 280 pos += count; 281 buf += count / 4; 282 size -= count; 283 } 284 #endif 285 286 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 287 for (last = pos + size; pos < last; pos += 4) { 288 uint32_t tmp = pos >> 31; 289 290 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); 291 if (tmp != hi) { 292 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp); 293 hi = tmp; 294 } 295 if (write) 296 WREG32_NO_KIQ(mmMM_DATA, *buf++); 297 else 298 *buf++ = RREG32_NO_KIQ(mmMM_DATA); 299 } 300 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 301 } 302 303 /* 304 * register access helper functions. 305 */ 306 /** 307 * amdgpu_device_rreg - read a memory mapped IO or indirect register 308 * 309 * @adev: amdgpu_device pointer 310 * @reg: dword aligned register offset 311 * @acc_flags: access flags which require special behavior 312 * 313 * Returns the 32 bit value from the offset specified. 314 */ 315 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, 316 uint32_t reg, uint32_t acc_flags) 317 { 318 uint32_t ret; 319 320 if (adev->in_pci_err_recovery) 321 return 0; 322 323 if ((reg * 4) < adev->rmmio_size) { 324 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 325 amdgpu_sriov_runtime(adev) && 326 down_read_trylock(&adev->reset_sem)) { 327 ret = amdgpu_kiq_rreg(adev, reg); 328 up_read(&adev->reset_sem); 329 } else { 330 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 331 } 332 } else { 333 ret = adev->pcie_rreg(adev, reg * 4); 334 } 335 336 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret); 337 338 return ret; 339 } 340 341 /* 342 * MMIO register read with bytes helper functions 343 * @offset:bytes offset from MMIO start 344 * 345 */ 346 347 /** 348 * amdgpu_mm_rreg8 - read a memory mapped IO register 349 * 350 * @adev: amdgpu_device pointer 351 * @offset: byte aligned register offset 352 * 353 * Returns the 8 bit value from the offset specified. 
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (adev->in_pci_err_recovery)
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 *
 */
/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (adev->in_pci_err_recovery)
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (adev->in_pci_err_recovery)
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_sem)) {
			amdgpu_kiq_wreg(adev, reg, v);
			up_read(&adev->reset_sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/*
 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range
 *
 * This function is invoked only for debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v)
{
	if (adev->in_pci_err_recovery)
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_io_rreg - read an IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 *
 * Returns the 32 bit value from the offset specified.
 */
u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
{
	if (adev->in_pci_err_recovery)
		return 0;

	if ((reg * 4) < adev->rio_mem_size)
		return ioread32(adev->rio_mem + (reg * 4));
	else {
		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
		return ioread32(adev->rio_mem + (mmMM_DATA * 4));
	}
}

/**
 * amdgpu_io_wreg - write to an IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 *
 * Writes the value specified to the offset specified.
474 */ 475 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v) 476 { 477 if (adev->in_pci_err_recovery) 478 return; 479 480 if ((reg * 4) < adev->rio_mem_size) 481 iowrite32(v, adev->rio_mem + (reg * 4)); 482 else { 483 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4)); 484 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4)); 485 } 486 } 487 488 /** 489 * amdgpu_mm_rdoorbell - read a doorbell dword 490 * 491 * @adev: amdgpu_device pointer 492 * @index: doorbell index 493 * 494 * Returns the value in the doorbell aperture at the 495 * requested doorbell index (CIK). 496 */ 497 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) 498 { 499 if (adev->in_pci_err_recovery) 500 return 0; 501 502 if (index < adev->doorbell.num_doorbells) { 503 return readl(adev->doorbell.ptr + index); 504 } else { 505 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 506 return 0; 507 } 508 } 509 510 /** 511 * amdgpu_mm_wdoorbell - write a doorbell dword 512 * 513 * @adev: amdgpu_device pointer 514 * @index: doorbell index 515 * @v: value to write 516 * 517 * Writes @v to the doorbell aperture at the 518 * requested doorbell index (CIK). 519 */ 520 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) 521 { 522 if (adev->in_pci_err_recovery) 523 return; 524 525 if (index < adev->doorbell.num_doorbells) { 526 writel(v, adev->doorbell.ptr + index); 527 } else { 528 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 529 } 530 } 531 532 /** 533 * amdgpu_mm_rdoorbell64 - read a doorbell Qword 534 * 535 * @adev: amdgpu_device pointer 536 * @index: doorbell index 537 * 538 * Returns the value in the doorbell aperture at the 539 * requested doorbell index (VEGA10+). 540 */ 541 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) 542 { 543 if (adev->in_pci_err_recovery) 544 return 0; 545 546 if (index < adev->doorbell.num_doorbells) { 547 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); 548 } else { 549 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 550 return 0; 551 } 552 } 553 554 /** 555 * amdgpu_mm_wdoorbell64 - write a doorbell Qword 556 * 557 * @adev: amdgpu_device pointer 558 * @index: doorbell index 559 * @v: value to write 560 * 561 * Writes @v to the doorbell aperture at the 562 * requested doorbell index (VEGA10+). 
563 */ 564 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) 565 { 566 if (adev->in_pci_err_recovery) 567 return; 568 569 if (index < adev->doorbell.num_doorbells) { 570 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); 571 } else { 572 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 573 } 574 } 575 576 /** 577 * amdgpu_device_indirect_rreg - read an indirect register 578 * 579 * @adev: amdgpu_device pointer 580 * @pcie_index: mmio register offset 581 * @pcie_data: mmio register offset 582 * 583 * Returns the value of indirect register @reg_addr 584 */ 585 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 586 u32 pcie_index, u32 pcie_data, 587 u32 reg_addr) 588 { 589 unsigned long flags; 590 u32 r; 591 void __iomem *pcie_index_offset; 592 void __iomem *pcie_data_offset; 593 594 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 595 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 596 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 597 598 writel(reg_addr, pcie_index_offset); 599 readl(pcie_index_offset); 600 r = readl(pcie_data_offset); 601 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 602 603 return r; 604 } 605 606 /** 607 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 608 * 609 * @adev: amdgpu_device pointer 610 * @pcie_index: mmio register offset 611 * @pcie_data: mmio register offset 612 * 613 * Returns the value of indirect register @reg_addr 614 */ 615 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 616 u32 pcie_index, u32 pcie_data, 617 u32 reg_addr) 618 { 619 unsigned long flags; 620 u64 r; 621 void __iomem *pcie_index_offset; 622 void __iomem *pcie_data_offset; 623 624 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 625 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 626 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 627 628 /* read low 32 bits */ 629 writel(reg_addr, pcie_index_offset); 630 readl(pcie_index_offset); 631 r = readl(pcie_data_offset); 632 /* read high 32 bits */ 633 writel(reg_addr + 4, pcie_index_offset); 634 readl(pcie_index_offset); 635 r |= ((u64)readl(pcie_data_offset) << 32); 636 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 637 638 return r; 639 } 640 641 /** 642 * amdgpu_device_indirect_wreg - write an indirect register address 643 * 644 * @adev: amdgpu_device pointer 645 * @pcie_index: mmio register offset 646 * @pcie_data: mmio register offset 647 * @reg_addr: indirect register offset 648 * @reg_data: indirect register data 649 * 650 */ 651 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 652 u32 pcie_index, u32 pcie_data, 653 u32 reg_addr, u32 reg_data) 654 { 655 unsigned long flags; 656 void __iomem *pcie_index_offset; 657 void __iomem *pcie_data_offset; 658 659 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 660 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 661 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 662 663 writel(reg_addr, pcie_index_offset); 664 readl(pcie_index_offset); 665 writel(reg_data, pcie_data_offset); 666 readl(pcie_data_offset); 667 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 668 } 669 670 /** 671 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 672 * 673 * @adev: amdgpu_device pointer 674 * @pcie_index: mmio register offset 675 * @pcie_data: mmio register offset 676 * @reg_addr: indirect register offset 677 * @reg_data: indirect register data 678 * 679 */ 680 void 
amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
			      u32 pcie_index, u32 pcie_data,
			      u32 reg_addr, u64 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	amdgpu_asic_pre_asic_init(adev);

	return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}

/**
 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
				       &adev->vram_scratch.robj,
				       &adev->vram_scratch.gpu_addr,
				       (void **)&adev->vram_scratch.ptr);
}

/**
 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/*
 * GPU doorbell aperture helper functions.
 */
/**
 * amdgpu_device_doorbell_init - Init doorbell driver information.
913 * 914 * @adev: amdgpu_device pointer 915 * 916 * Init doorbell driver information (CIK) 917 * Returns 0 on success, error on failure. 918 */ 919 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) 920 { 921 922 /* No doorbell on SI hardware generation */ 923 if (adev->asic_type < CHIP_BONAIRE) { 924 adev->doorbell.base = 0; 925 adev->doorbell.size = 0; 926 adev->doorbell.num_doorbells = 0; 927 adev->doorbell.ptr = NULL; 928 return 0; 929 } 930 931 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) 932 return -EINVAL; 933 934 amdgpu_asic_init_doorbell_index(adev); 935 936 /* doorbell bar mapping */ 937 adev->doorbell.base = pci_resource_start(adev->pdev, 2); 938 adev->doorbell.size = pci_resource_len(adev->pdev, 2); 939 940 adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32), 941 adev->doorbell_index.max_assignment+1); 942 if (adev->doorbell.num_doorbells == 0) 943 return -EINVAL; 944 945 /* For Vega, reserve and map two pages on doorbell BAR since SDMA 946 * paging queue doorbell use the second page. The 947 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the 948 * doorbells are in the first page. So with paging queue enabled, 949 * the max num_doorbells should + 1 page (0x400 in dword) 950 */ 951 if (adev->asic_type >= CHIP_VEGA10) 952 adev->doorbell.num_doorbells += 0x400; 953 954 adev->doorbell.ptr = ioremap(adev->doorbell.base, 955 adev->doorbell.num_doorbells * 956 sizeof(u32)); 957 if (adev->doorbell.ptr == NULL) 958 return -ENOMEM; 959 960 return 0; 961 } 962 963 /** 964 * amdgpu_device_doorbell_fini - Tear down doorbell driver information. 965 * 966 * @adev: amdgpu_device pointer 967 * 968 * Tear down doorbell driver information (CIK) 969 */ 970 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) 971 { 972 iounmap(adev->doorbell.ptr); 973 adev->doorbell.ptr = NULL; 974 } 975 976 977 978 /* 979 * amdgpu_device_wb_*() 980 * Writeback is the method by which the GPU updates special pages in memory 981 * with the status of certain GPU events (fences, ring pointers,etc.). 982 */ 983 984 /** 985 * amdgpu_device_wb_fini - Disable Writeback and free memory 986 * 987 * @adev: amdgpu_device pointer 988 * 989 * Disables Writeback and frees the Writeback memory (all asics). 990 * Used at driver shutdown. 991 */ 992 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 993 { 994 if (adev->wb.wb_obj) { 995 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 996 &adev->wb.gpu_addr, 997 (void **)&adev->wb.wb); 998 adev->wb.wb_obj = NULL; 999 } 1000 } 1001 1002 /** 1003 * amdgpu_device_wb_init- Init Writeback driver info and allocate memory 1004 * 1005 * @adev: amdgpu_device pointer 1006 * 1007 * Initializes writeback and allocates writeback memory (all asics). 1008 * Used at driver startup. 1009 * Returns 0 on success or an -error on failure. 
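 *
 * Each writeback slot is 256 bits wide (8 dwords); the indices handed out by
 * amdgpu_device_wb_get() are dword offsets into this memory.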
1010 */ 1011 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1012 { 1013 int r; 1014 1015 if (adev->wb.wb_obj == NULL) { 1016 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1017 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1018 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1019 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1020 (void **)&adev->wb.wb); 1021 if (r) { 1022 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1023 return r; 1024 } 1025 1026 adev->wb.num_wb = AMDGPU_MAX_WB; 1027 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1028 1029 /* clear wb memory */ 1030 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1031 } 1032 1033 return 0; 1034 } 1035 1036 /** 1037 * amdgpu_device_wb_get - Allocate a wb entry 1038 * 1039 * @adev: amdgpu_device pointer 1040 * @wb: wb index 1041 * 1042 * Allocate a wb slot for use by the driver (all asics). 1043 * Returns 0 on success or -EINVAL on failure. 1044 */ 1045 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1046 { 1047 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1048 1049 if (offset < adev->wb.num_wb) { 1050 __set_bit(offset, adev->wb.used); 1051 *wb = offset << 3; /* convert to dw offset */ 1052 return 0; 1053 } else { 1054 return -EINVAL; 1055 } 1056 } 1057 1058 /** 1059 * amdgpu_device_wb_free - Free a wb entry 1060 * 1061 * @adev: amdgpu_device pointer 1062 * @wb: wb index 1063 * 1064 * Free a wb slot allocated for use by the driver (all asics) 1065 */ 1066 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1067 { 1068 wb >>= 3; 1069 if (wb < adev->wb.num_wb) 1070 __clear_bit(wb, adev->wb.used); 1071 } 1072 1073 /** 1074 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1075 * 1076 * @adev: amdgpu_device pointer 1077 * 1078 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1079 * to fail, but if any of the BARs is not accessible after the size we abort 1080 * driver loading by returning -ENODEV. 1081 */ 1082 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1083 { 1084 u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size); 1085 u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1; 1086 struct pci_bus *root; 1087 struct resource *res; 1088 unsigned i; 1089 u16 cmd; 1090 int r; 1091 1092 /* Bypass for VF */ 1093 if (amdgpu_sriov_vf(adev)) 1094 return 0; 1095 1096 /* skip if the bios has already enabled large BAR */ 1097 if (adev->gmc.real_vram_size && 1098 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1099 return 0; 1100 1101 /* Check if the root BUS has 64bit memory resources */ 1102 root = adev->pdev->bus; 1103 while (root->parent) 1104 root = root->parent; 1105 1106 pci_bus_for_each_resource(root, res, i) { 1107 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1108 res->start > 0x100000000ull) 1109 break; 1110 } 1111 1112 /* Trying to resize is pointless without a root hub window above 4GB */ 1113 if (!res) 1114 return 0; 1115 1116 /* Disable memory decoding while we change the BAR addresses and size */ 1117 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1118 pci_write_config_word(adev->pdev, PCI_COMMAND, 1119 cmd & ~PCI_COMMAND_MEMORY); 1120 1121 /* Free the VRAM and doorbell BAR, we most likely need to move both. 
*/ 1122 amdgpu_device_doorbell_fini(adev); 1123 if (adev->asic_type >= CHIP_BONAIRE) 1124 pci_release_resource(adev->pdev, 2); 1125 1126 pci_release_resource(adev->pdev, 0); 1127 1128 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1129 if (r == -ENOSPC) 1130 DRM_INFO("Not enough PCI address space for a large BAR."); 1131 else if (r && r != -ENOTSUPP) 1132 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1133 1134 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1135 1136 /* When the doorbell or fb BAR isn't available we have no chance of 1137 * using the device. 1138 */ 1139 r = amdgpu_device_doorbell_init(adev); 1140 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1141 return -ENODEV; 1142 1143 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1144 1145 return 0; 1146 } 1147 1148 /* 1149 * GPU helpers function. 1150 */ 1151 /** 1152 * amdgpu_device_need_post - check if the hw need post or not 1153 * 1154 * @adev: amdgpu_device pointer 1155 * 1156 * Check if the asic has been initialized (all asics) at driver startup 1157 * or post is needed if hw reset is performed. 1158 * Returns true if need or false if not. 1159 */ 1160 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1161 { 1162 uint32_t reg; 1163 1164 if (amdgpu_sriov_vf(adev)) 1165 return false; 1166 1167 if (amdgpu_passthrough(adev)) { 1168 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1169 * some old smc fw still need driver do vPost otherwise gpu hang, while 1170 * those smc fw version above 22.15 doesn't have this flaw, so we force 1171 * vpost executed for smc version below 22.15 1172 */ 1173 if (adev->asic_type == CHIP_FIJI) { 1174 int err; 1175 uint32_t fw_ver; 1176 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1177 /* force vPost if error occured */ 1178 if (err) 1179 return true; 1180 1181 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1182 if (fw_ver < 0x00160e00) 1183 return true; 1184 } 1185 } 1186 1187 if (adev->has_hw_reset) { 1188 adev->has_hw_reset = false; 1189 return true; 1190 } 1191 1192 /* bios scratch used on CIK+ */ 1193 if (adev->asic_type >= CHIP_BONAIRE) 1194 return amdgpu_atombios_scratch_need_asic_init(adev); 1195 1196 /* check MEM_SIZE for older asics */ 1197 reg = amdgpu_asic_get_config_memsize(adev); 1198 1199 if ((reg != 0) && (reg != 0xffffffff)) 1200 return false; 1201 1202 return true; 1203 } 1204 1205 /* if we get transitioned to only one device, take VGA back */ 1206 /** 1207 * amdgpu_device_vga_set_decode - enable/disable vga decode 1208 * 1209 * @cookie: amdgpu_device pointer 1210 * @state: enable/disable vga decode 1211 * 1212 * Enable/disable vga decode (all asics). 1213 * Returns VGA resource flags. 1214 */ 1215 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state) 1216 { 1217 struct amdgpu_device *adev = cookie; 1218 amdgpu_asic_set_vga_state(adev, state); 1219 if (state) 1220 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1221 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1222 else 1223 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1224 } 1225 1226 /** 1227 * amdgpu_device_check_block_size - validate the vm block size 1228 * 1229 * @adev: amdgpu_device pointer 1230 * 1231 * Validates the vm block size specified via module parameter. 1232 * The vm block size defines number of bits in page table versus page directory, 1233 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1234 * page table and the remaining bits are in the page directory. 
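 * For example, the minimum block size of 9 bits gives 512 entries per page
 * table, i.e. each page table then covers 2MB of address space with 4KB pages.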
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	amdgpu_gmc_tmz_set(adev);

	if (amdgpu_num_kcq == -1) {
		amdgpu_num_kcq = 8;
	} else if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) {
		amdgpu_num_kcq = 8;
		dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid parameter provided by user\n");
	}

	amdgpu_gmc_noretry_set(adev);

	return 0;
}

/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(dev->pdev, PCI_D0);
		amdgpu_device_load_pci_state(dev->pdev);
		r = pci_enable_device(dev->pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
		drm_kms_helper_poll_enable(dev);
	} else {
		pr_info("switched off\n");
		drm_kms_helper_poll_disable(dev);
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(dev->pdev);
		/* Shut down the device */
		pci_disable_device(dev->pdev);
		pci_set_power_state(dev->pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u32 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if equal or greater
 * return 1 if smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
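 *
 * For example (illustrative values), booting with
 * amdgpu.virtual_display=0000:23:00.0,2 enables two virtual CRTCs on the
 * device at PCI address 0000:23:00.0; the string "all" matches every device.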
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		struct drm_device *ddev = adev_to_drm(adev);
		const char *pci_address_name = pci_name(ddev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	char fw_name[40];
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	if (adev->mman.discovery_bin) {
		amdgpu_discovery_get_gfx_info(adev);

		/*
		 * FIXME: The bounding box is still needed by Navi12, so
		 * temporarily read it from gpu_info firmware. Should be dropped
		 * when DAL no longer needs it.
1754 */ 1755 if (adev->asic_type != CHIP_NAVI12) 1756 return 0; 1757 } 1758 1759 switch (adev->asic_type) { 1760 #ifdef CONFIG_DRM_AMDGPU_SI 1761 case CHIP_VERDE: 1762 case CHIP_TAHITI: 1763 case CHIP_PITCAIRN: 1764 case CHIP_OLAND: 1765 case CHIP_HAINAN: 1766 #endif 1767 #ifdef CONFIG_DRM_AMDGPU_CIK 1768 case CHIP_BONAIRE: 1769 case CHIP_HAWAII: 1770 case CHIP_KAVERI: 1771 case CHIP_KABINI: 1772 case CHIP_MULLINS: 1773 #endif 1774 case CHIP_TOPAZ: 1775 case CHIP_TONGA: 1776 case CHIP_FIJI: 1777 case CHIP_POLARIS10: 1778 case CHIP_POLARIS11: 1779 case CHIP_POLARIS12: 1780 case CHIP_VEGAM: 1781 case CHIP_CARRIZO: 1782 case CHIP_STONEY: 1783 case CHIP_VEGA20: 1784 case CHIP_SIENNA_CICHLID: 1785 case CHIP_NAVY_FLOUNDER: 1786 default: 1787 return 0; 1788 case CHIP_VEGA10: 1789 chip_name = "vega10"; 1790 break; 1791 case CHIP_VEGA12: 1792 chip_name = "vega12"; 1793 break; 1794 case CHIP_RAVEN: 1795 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1796 chip_name = "raven2"; 1797 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1798 chip_name = "picasso"; 1799 else 1800 chip_name = "raven"; 1801 break; 1802 case CHIP_ARCTURUS: 1803 chip_name = "arcturus"; 1804 break; 1805 case CHIP_RENOIR: 1806 chip_name = "renoir"; 1807 break; 1808 case CHIP_NAVI10: 1809 chip_name = "navi10"; 1810 break; 1811 case CHIP_NAVI14: 1812 chip_name = "navi14"; 1813 break; 1814 case CHIP_NAVI12: 1815 chip_name = "navi12"; 1816 break; 1817 } 1818 1819 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 1820 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); 1821 if (err) { 1822 dev_err(adev->dev, 1823 "Failed to load gpu_info firmware \"%s\"\n", 1824 fw_name); 1825 goto out; 1826 } 1827 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); 1828 if (err) { 1829 dev_err(adev->dev, 1830 "Failed to validate gpu_info firmware \"%s\"\n", 1831 fw_name); 1832 goto out; 1833 } 1834 1835 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 1836 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 1837 1838 switch (hdr->version_major) { 1839 case 1: 1840 { 1841 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 1842 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 1843 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1844 1845 /* 1846 * Should be droped when DAL no longer needs it. 
		 */
		if (adev->asic_type == CHIP_NAVI12)
			goto parse_soc_bounding_box;

		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
		adev->gfx.config.max_texture_channel_caches =
			le32_to_cpu(gpu_info_fw->gc_num_tccs);
		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
		adev->gfx.config.double_offchip_lds_buf =
			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
		adev->gfx.cu_info.max_waves_per_simd =
			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
		adev->gfx.cu_info.max_scratch_slots_per_cu =
			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
		if (hdr->version_minor >= 1) {
			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
					le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->gfx.config.num_sc_per_sh =
				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
			adev->gfx.config.num_packer_per_sc =
				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
		}

parse_soc_bounding_box:
		/*
		 * soc bounding box info is not integrated in discovery table,
		 * we always need to parse it from gpu info firmware if needed.
		 */
		if (hdr->version_minor == 2) {
			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
					le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
		}
		break;
	}
	default:
		dev_err(adev->dev,
			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
		err = -EINVAL;
		goto out;
	}
out:
	return err;
}

/**
 * amdgpu_device_ip_early_init - run early init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Early initialization pass for hardware IPs. The hardware IPs that make
 * up each asic are discovered and each IP's early_init callback is run. This
 * is the first stage in initializing the asic.
 * Returns 0 on success, negative error code on failure.
1911 */ 1912 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 1913 { 1914 int i, r; 1915 1916 amdgpu_device_enable_virtual_display(adev); 1917 1918 if (amdgpu_sriov_vf(adev)) { 1919 r = amdgpu_virt_request_full_gpu(adev, true); 1920 if (r) 1921 return r; 1922 } 1923 1924 switch (adev->asic_type) { 1925 #ifdef CONFIG_DRM_AMDGPU_SI 1926 case CHIP_VERDE: 1927 case CHIP_TAHITI: 1928 case CHIP_PITCAIRN: 1929 case CHIP_OLAND: 1930 case CHIP_HAINAN: 1931 adev->family = AMDGPU_FAMILY_SI; 1932 r = si_set_ip_blocks(adev); 1933 if (r) 1934 return r; 1935 break; 1936 #endif 1937 #ifdef CONFIG_DRM_AMDGPU_CIK 1938 case CHIP_BONAIRE: 1939 case CHIP_HAWAII: 1940 case CHIP_KAVERI: 1941 case CHIP_KABINI: 1942 case CHIP_MULLINS: 1943 if (adev->flags & AMD_IS_APU) 1944 adev->family = AMDGPU_FAMILY_KV; 1945 else 1946 adev->family = AMDGPU_FAMILY_CI; 1947 1948 r = cik_set_ip_blocks(adev); 1949 if (r) 1950 return r; 1951 break; 1952 #endif 1953 case CHIP_TOPAZ: 1954 case CHIP_TONGA: 1955 case CHIP_FIJI: 1956 case CHIP_POLARIS10: 1957 case CHIP_POLARIS11: 1958 case CHIP_POLARIS12: 1959 case CHIP_VEGAM: 1960 case CHIP_CARRIZO: 1961 case CHIP_STONEY: 1962 if (adev->flags & AMD_IS_APU) 1963 adev->family = AMDGPU_FAMILY_CZ; 1964 else 1965 adev->family = AMDGPU_FAMILY_VI; 1966 1967 r = vi_set_ip_blocks(adev); 1968 if (r) 1969 return r; 1970 break; 1971 case CHIP_VEGA10: 1972 case CHIP_VEGA12: 1973 case CHIP_VEGA20: 1974 case CHIP_RAVEN: 1975 case CHIP_ARCTURUS: 1976 case CHIP_RENOIR: 1977 if (adev->flags & AMD_IS_APU) 1978 adev->family = AMDGPU_FAMILY_RV; 1979 else 1980 adev->family = AMDGPU_FAMILY_AI; 1981 1982 r = soc15_set_ip_blocks(adev); 1983 if (r) 1984 return r; 1985 break; 1986 case CHIP_NAVI10: 1987 case CHIP_NAVI14: 1988 case CHIP_NAVI12: 1989 case CHIP_SIENNA_CICHLID: 1990 case CHIP_NAVY_FLOUNDER: 1991 adev->family = AMDGPU_FAMILY_NV; 1992 1993 r = nv_set_ip_blocks(adev); 1994 if (r) 1995 return r; 1996 break; 1997 default: 1998 /* FIXME: not supported yet */ 1999 return -EINVAL; 2000 } 2001 2002 amdgpu_amdkfd_device_probe(adev); 2003 2004 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2005 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2006 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2007 2008 for (i = 0; i < adev->num_ip_blocks; i++) { 2009 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2010 DRM_ERROR("disabled ip block: %d <%s>\n", 2011 i, adev->ip_blocks[i].version->funcs->name); 2012 adev->ip_blocks[i].status.valid = false; 2013 } else { 2014 if (adev->ip_blocks[i].version->funcs->early_init) { 2015 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2016 if (r == -ENOENT) { 2017 adev->ip_blocks[i].status.valid = false; 2018 } else if (r) { 2019 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2020 adev->ip_blocks[i].version->funcs->name, r); 2021 return r; 2022 } else { 2023 adev->ip_blocks[i].status.valid = true; 2024 } 2025 } else { 2026 adev->ip_blocks[i].status.valid = true; 2027 } 2028 } 2029 /* get the vbios after the asic_funcs are set up */ 2030 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2031 r = amdgpu_device_parse_gpu_info_fw(adev); 2032 if (r) 2033 return r; 2034 2035 /* Read BIOS */ 2036 if (!amdgpu_get_bios(adev)) 2037 return -EINVAL; 2038 2039 r = amdgpu_atombios_init(adev); 2040 if (r) { 2041 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2042 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2043 return r; 2044 } 2045 } 2046 } 2047 2048 adev->cg_flags &= amdgpu_cg_mask; 2049 
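/* powergating features are filtered the same way via the amdgpu_pg_mask module parameter */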
adev->pg_flags &= amdgpu_pg_mask; 2050 2051 return 0; 2052 } 2053 2054 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2055 { 2056 int i, r; 2057 2058 for (i = 0; i < adev->num_ip_blocks; i++) { 2059 if (!adev->ip_blocks[i].status.sw) 2060 continue; 2061 if (adev->ip_blocks[i].status.hw) 2062 continue; 2063 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2064 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2065 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2066 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2067 if (r) { 2068 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2069 adev->ip_blocks[i].version->funcs->name, r); 2070 return r; 2071 } 2072 adev->ip_blocks[i].status.hw = true; 2073 } 2074 } 2075 2076 return 0; 2077 } 2078 2079 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2080 { 2081 int i, r; 2082 2083 for (i = 0; i < adev->num_ip_blocks; i++) { 2084 if (!adev->ip_blocks[i].status.sw) 2085 continue; 2086 if (adev->ip_blocks[i].status.hw) 2087 continue; 2088 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2089 if (r) { 2090 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2091 adev->ip_blocks[i].version->funcs->name, r); 2092 return r; 2093 } 2094 adev->ip_blocks[i].status.hw = true; 2095 } 2096 2097 return 0; 2098 } 2099 2100 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2101 { 2102 int r = 0; 2103 int i; 2104 uint32_t smu_version; 2105 2106 if (adev->asic_type >= CHIP_VEGA10) { 2107 for (i = 0; i < adev->num_ip_blocks; i++) { 2108 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2109 continue; 2110 2111 /* no need to do the fw loading again if already done*/ 2112 if (adev->ip_blocks[i].status.hw == true) 2113 break; 2114 2115 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2116 r = adev->ip_blocks[i].version->funcs->resume(adev); 2117 if (r) { 2118 DRM_ERROR("resume of IP block <%s> failed %d\n", 2119 adev->ip_blocks[i].version->funcs->name, r); 2120 return r; 2121 } 2122 } else { 2123 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2124 if (r) { 2125 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2126 adev->ip_blocks[i].version->funcs->name, r); 2127 return r; 2128 } 2129 } 2130 2131 adev->ip_blocks[i].status.hw = true; 2132 break; 2133 } 2134 } 2135 2136 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2137 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2138 2139 return r; 2140 } 2141 2142 /** 2143 * amdgpu_device_ip_init - run init for hardware IPs 2144 * 2145 * @adev: amdgpu_device pointer 2146 * 2147 * Main initialization pass for hardware IPs. The list of all the hardware 2148 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2149 * are run. sw_init initializes the software state associated with each IP 2150 * and hw_init initializes the hardware associated with each IP. 2151 * Returns 0 on success, negative error code on failure. 
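 * GMC hw_init runs early within this pass so that GPU memory (the vram
 * scratch buffer, the write-back pages and, for SR-IOV/MCBP, the static
 * CSA) can be allocated before the remaining blocks are brought up.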
2152 */ 2153 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2154 { 2155 int i, r; 2156 2157 r = amdgpu_ras_init(adev); 2158 if (r) 2159 return r; 2160 2161 for (i = 0; i < adev->num_ip_blocks; i++) { 2162 if (!adev->ip_blocks[i].status.valid) 2163 continue; 2164 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2165 if (r) { 2166 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2167 adev->ip_blocks[i].version->funcs->name, r); 2168 goto init_failed; 2169 } 2170 adev->ip_blocks[i].status.sw = true; 2171 2172 /* need to do gmc hw init early so we can allocate gpu mem */ 2173 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2174 r = amdgpu_device_vram_scratch_init(adev); 2175 if (r) { 2176 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2177 goto init_failed; 2178 } 2179 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2180 if (r) { 2181 DRM_ERROR("hw_init %d failed %d\n", i, r); 2182 goto init_failed; 2183 } 2184 r = amdgpu_device_wb_init(adev); 2185 if (r) { 2186 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2187 goto init_failed; 2188 } 2189 adev->ip_blocks[i].status.hw = true; 2190 2191 /* right after GMC hw init, we create CSA */ 2192 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2193 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2194 AMDGPU_GEM_DOMAIN_VRAM, 2195 AMDGPU_CSA_SIZE); 2196 if (r) { 2197 DRM_ERROR("allocate CSA failed %d\n", r); 2198 goto init_failed; 2199 } 2200 } 2201 } 2202 } 2203 2204 if (amdgpu_sriov_vf(adev)) 2205 amdgpu_virt_init_data_exchange(adev); 2206 2207 r = amdgpu_ib_pool_init(adev); 2208 if (r) { 2209 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2210 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2211 goto init_failed; 2212 } 2213 2214 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2215 if (r) 2216 goto init_failed; 2217 2218 r = amdgpu_device_ip_hw_init_phase1(adev); 2219 if (r) 2220 goto init_failed; 2221 2222 r = amdgpu_device_fw_loading(adev); 2223 if (r) 2224 goto init_failed; 2225 2226 r = amdgpu_device_ip_hw_init_phase2(adev); 2227 if (r) 2228 goto init_failed; 2229 2230 /* 2231 * retired pages will be loaded from eeprom and reserved here, 2232 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2233 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2234 * for I2C communication which only true at this point. 2235 * 2236 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2237 * failure from bad gpu situation and stop amdgpu init process 2238 * accordingly. For other failed cases, it will still release all 2239 * the resource and print error message, rather than returning one 2240 * negative value to upper level. 2241 * 2242 * Note: theoretically, this should be called before all vram allocations 2243 * to protect retired page from abusing 2244 */ 2245 r = amdgpu_ras_recovery_init(adev); 2246 if (r) 2247 goto init_failed; 2248 2249 if (adev->gmc.xgmi.num_physical_nodes > 1) 2250 amdgpu_xgmi_add_device(adev); 2251 amdgpu_amdkfd_device_init(adev); 2252 2253 amdgpu_fru_get_product_info(adev); 2254 2255 init_failed: 2256 if (amdgpu_sriov_vf(adev)) 2257 amdgpu_virt_release_full_gpu(adev, true); 2258 2259 return r; 2260 } 2261 2262 /** 2263 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2264 * 2265 * @adev: amdgpu_device pointer 2266 * 2267 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2268 * this function before a GPU reset. 
If the value is retained after a
2269 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2270 */
2271 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2272 {
2273 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2274 }
2275
2276 /**
2277 * amdgpu_device_check_vram_lost - check if vram is valid
2278 *
2279 * @adev: amdgpu_device pointer
2280 *
2281 * Checks the reset magic value written to the gart pointer in VRAM.
2282 * The driver calls this after a GPU reset to see if the contents of
2283 * VRAM are lost or not.
2284 * Returns true if vram is lost, false if not.
2285 */
2286 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2287 {
2288 if (memcmp(adev->gart.ptr, adev->reset_magic,
2289 AMDGPU_RESET_MAGIC_NUM))
2290 return true;
2291
2292 if (!amdgpu_in_reset(adev))
2293 return false;
2294
2295 /*
2296 * For all ASICs with baco/mode1 reset, the VRAM is
2297 * always assumed to be lost.
2298 */
2299 switch (amdgpu_asic_reset_method(adev)) {
2300 case AMD_RESET_METHOD_BACO:
2301 case AMD_RESET_METHOD_MODE1:
2302 return true;
2303 default:
2304 return false;
2305 }
2306 }
2307
2308 /**
2309 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2310 *
2311 * @adev: amdgpu_device pointer
2312 * @state: clockgating state (gate or ungate)
2313 *
2314 * The list of all the hardware IPs that make up the asic is walked and the
2315 * set_clockgating_state callbacks are run.
2316 * During late init this pass enables clockgating for hardware IPs; during
2317 * fini or suspend it disables clockgating for them.
2318 * Returns 0 on success, negative error code on failure.
2319 */
2320
2321 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2322 enum amd_clockgating_state state)
2323 {
2324 int i, j, r;
2325
2326 if (amdgpu_emu_mode == 1)
2327 return 0;
2328
2329 for (j = 0; j < adev->num_ip_blocks; j++) {
2330 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2331 if (!adev->ip_blocks[i].status.late_initialized)
2332 continue;
2333 /* skip CG for VCE/UVD, it's handled specially */
2334 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2335 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2336 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2337 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2338 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2339 /* enable clockgating to save power */
2340 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2341 state);
2342 if (r) {
2343 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2344 adev->ip_blocks[i].version->funcs->name, r);
2345 return r;
2346 }
2347 }
2348 }
2349
2350 return 0;
2351 }
2352
2353 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2354 {
2355 int i, j, r;
2356
2357 if (amdgpu_emu_mode == 1)
2358 return 0;
2359
2360 for (j = 0; j < adev->num_ip_blocks; j++) {
2361 i = state == AMD_PG_STATE_GATE ?
j : adev->num_ip_blocks - j - 1; 2362 if (!adev->ip_blocks[i].status.late_initialized) 2363 continue; 2364 /* skip CG for VCE/UVD, it's handled specially */ 2365 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2366 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2367 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2368 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2369 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2370 /* enable powergating to save power */ 2371 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2372 state); 2373 if (r) { 2374 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2375 adev->ip_blocks[i].version->funcs->name, r); 2376 return r; 2377 } 2378 } 2379 } 2380 return 0; 2381 } 2382 2383 static int amdgpu_device_enable_mgpu_fan_boost(void) 2384 { 2385 struct amdgpu_gpu_instance *gpu_ins; 2386 struct amdgpu_device *adev; 2387 int i, ret = 0; 2388 2389 mutex_lock(&mgpu_info.mutex); 2390 2391 /* 2392 * MGPU fan boost feature should be enabled 2393 * only when there are two or more dGPUs in 2394 * the system 2395 */ 2396 if (mgpu_info.num_dgpu < 2) 2397 goto out; 2398 2399 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2400 gpu_ins = &(mgpu_info.gpu_ins[i]); 2401 adev = gpu_ins->adev; 2402 if (!(adev->flags & AMD_IS_APU) && 2403 !gpu_ins->mgpu_fan_enabled) { 2404 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2405 if (ret) 2406 break; 2407 2408 gpu_ins->mgpu_fan_enabled = 1; 2409 } 2410 } 2411 2412 out: 2413 mutex_unlock(&mgpu_info.mutex); 2414 2415 return ret; 2416 } 2417 2418 /** 2419 * amdgpu_device_ip_late_init - run late init for hardware IPs 2420 * 2421 * @adev: amdgpu_device pointer 2422 * 2423 * Late initialization pass for hardware IPs. The list of all the hardware 2424 * IPs that make up the asic is walked and the late_init callbacks are run. 2425 * late_init covers any special initialization that an IP requires 2426 * after all of the have been initialized or something that needs to happen 2427 * late in the init process. 2428 * Returns 0 on success, negative error code on failure. 2429 */ 2430 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2431 { 2432 struct amdgpu_gpu_instance *gpu_instance; 2433 int i = 0, r; 2434 2435 for (i = 0; i < adev->num_ip_blocks; i++) { 2436 if (!adev->ip_blocks[i].status.hw) 2437 continue; 2438 if (adev->ip_blocks[i].version->funcs->late_init) { 2439 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2440 if (r) { 2441 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2442 adev->ip_blocks[i].version->funcs->name, r); 2443 return r; 2444 } 2445 } 2446 adev->ip_blocks[i].status.late_initialized = true; 2447 } 2448 2449 amdgpu_ras_set_error_query_ready(adev, true); 2450 2451 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2452 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2453 2454 amdgpu_device_fill_reset_magic(adev); 2455 2456 r = amdgpu_device_enable_mgpu_fan_boost(); 2457 if (r) 2458 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2459 2460 2461 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2462 mutex_lock(&mgpu_info.mutex); 2463 2464 /* 2465 * Reset device p-state to low as this was booted with high. 2466 * 2467 * This should be performed only after all devices from the same 2468 * hive get initialized. 2469 * 2470 * However, it's unknown how many device in the hive in advance. 2471 * As this is counted one by one during devices initializations. 
2472 * 2473 * So, we wait for all XGMI interlinked devices initialized. 2474 * This may bring some delays as those devices may come from 2475 * different hives. But that should be OK. 2476 */ 2477 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2478 for (i = 0; i < mgpu_info.num_gpu; i++) { 2479 gpu_instance = &(mgpu_info.gpu_ins[i]); 2480 if (gpu_instance->adev->flags & AMD_IS_APU) 2481 continue; 2482 2483 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2484 AMDGPU_XGMI_PSTATE_MIN); 2485 if (r) { 2486 DRM_ERROR("pstate setting failed (%d).\n", r); 2487 break; 2488 } 2489 } 2490 } 2491 2492 mutex_unlock(&mgpu_info.mutex); 2493 } 2494 2495 return 0; 2496 } 2497 2498 /** 2499 * amdgpu_device_ip_fini - run fini for hardware IPs 2500 * 2501 * @adev: amdgpu_device pointer 2502 * 2503 * Main teardown pass for hardware IPs. The list of all the hardware 2504 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2505 * are run. hw_fini tears down the hardware associated with each IP 2506 * and sw_fini tears down any software state associated with each IP. 2507 * Returns 0 on success, negative error code on failure. 2508 */ 2509 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2510 { 2511 int i, r; 2512 2513 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2514 amdgpu_virt_release_ras_err_handler_data(adev); 2515 2516 amdgpu_ras_pre_fini(adev); 2517 2518 if (adev->gmc.xgmi.num_physical_nodes > 1) 2519 amdgpu_xgmi_remove_device(adev); 2520 2521 amdgpu_amdkfd_device_fini(adev); 2522 2523 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2524 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2525 2526 /* need to disable SMC first */ 2527 for (i = 0; i < adev->num_ip_blocks; i++) { 2528 if (!adev->ip_blocks[i].status.hw) 2529 continue; 2530 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2531 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2532 /* XXX handle errors */ 2533 if (r) { 2534 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2535 adev->ip_blocks[i].version->funcs->name, r); 2536 } 2537 adev->ip_blocks[i].status.hw = false; 2538 break; 2539 } 2540 } 2541 2542 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2543 if (!adev->ip_blocks[i].status.hw) 2544 continue; 2545 2546 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2547 /* XXX handle errors */ 2548 if (r) { 2549 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2550 adev->ip_blocks[i].version->funcs->name, r); 2551 } 2552 2553 adev->ip_blocks[i].status.hw = false; 2554 } 2555 2556 2557 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2558 if (!adev->ip_blocks[i].status.sw) 2559 continue; 2560 2561 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2562 amdgpu_ucode_free_bo(adev); 2563 amdgpu_free_static_csa(&adev->virt.csa_obj); 2564 amdgpu_device_wb_fini(adev); 2565 amdgpu_device_vram_scratch_fini(adev); 2566 amdgpu_ib_pool_fini(adev); 2567 } 2568 2569 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2570 /* XXX handle errors */ 2571 if (r) { 2572 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2573 adev->ip_blocks[i].version->funcs->name, r); 2574 } 2575 adev->ip_blocks[i].status.sw = false; 2576 adev->ip_blocks[i].status.valid = false; 2577 } 2578 2579 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2580 if (!adev->ip_blocks[i].status.late_initialized) 2581 continue; 2582 if (adev->ip_blocks[i].version->funcs->late_fini) 2583 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2584 
adev->ip_blocks[i].status.late_initialized = false; 2585 } 2586 2587 amdgpu_ras_fini(adev); 2588 2589 if (amdgpu_sriov_vf(adev)) 2590 if (amdgpu_virt_release_full_gpu(adev, false)) 2591 DRM_ERROR("failed to release exclusive mode on fini\n"); 2592 2593 return 0; 2594 } 2595 2596 /** 2597 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2598 * 2599 * @work: work_struct. 2600 */ 2601 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2602 { 2603 struct amdgpu_device *adev = 2604 container_of(work, struct amdgpu_device, delayed_init_work.work); 2605 int r; 2606 2607 r = amdgpu_ib_ring_tests(adev); 2608 if (r) 2609 DRM_ERROR("ib ring test failed (%d).\n", r); 2610 } 2611 2612 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2613 { 2614 struct amdgpu_device *adev = 2615 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2616 2617 mutex_lock(&adev->gfx.gfx_off_mutex); 2618 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { 2619 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2620 adev->gfx.gfx_off_state = true; 2621 } 2622 mutex_unlock(&adev->gfx.gfx_off_mutex); 2623 } 2624 2625 /** 2626 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2627 * 2628 * @adev: amdgpu_device pointer 2629 * 2630 * Main suspend function for hardware IPs. The list of all the hardware 2631 * IPs that make up the asic is walked, clockgating is disabled and the 2632 * suspend callbacks are run. suspend puts the hardware and software state 2633 * in each IP into a state suitable for suspend. 2634 * Returns 0 on success, negative error code on failure. 2635 */ 2636 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2637 { 2638 int i, r; 2639 2640 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2641 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2642 2643 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2644 if (!adev->ip_blocks[i].status.valid) 2645 continue; 2646 2647 /* displays are handled separately */ 2648 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2649 continue; 2650 2651 /* XXX handle errors */ 2652 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2653 /* XXX handle errors */ 2654 if (r) { 2655 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2656 adev->ip_blocks[i].version->funcs->name, r); 2657 return r; 2658 } 2659 2660 adev->ip_blocks[i].status.hw = false; 2661 } 2662 2663 return 0; 2664 } 2665 2666 /** 2667 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2668 * 2669 * @adev: amdgpu_device pointer 2670 * 2671 * Main suspend function for hardware IPs. The list of all the hardware 2672 * IPs that make up the asic is walked, clockgating is disabled and the 2673 * suspend callbacks are run. suspend puts the hardware and software state 2674 * in each IP into a state suitable for suspend. 2675 * Returns 0 on success, negative error code on failure. 
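 * Display (DCE) blocks are skipped here since they were already suspended
 * in phase 1.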
2676 */ 2677 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2678 { 2679 int i, r; 2680 2681 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2682 if (!adev->ip_blocks[i].status.valid) 2683 continue; 2684 /* displays are handled in phase1 */ 2685 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2686 continue; 2687 /* PSP lost connection when err_event_athub occurs */ 2688 if (amdgpu_ras_intr_triggered() && 2689 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2690 adev->ip_blocks[i].status.hw = false; 2691 continue; 2692 } 2693 /* XXX handle errors */ 2694 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2695 /* XXX handle errors */ 2696 if (r) { 2697 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2698 adev->ip_blocks[i].version->funcs->name, r); 2699 } 2700 adev->ip_blocks[i].status.hw = false; 2701 /* handle putting the SMC in the appropriate state */ 2702 if(!amdgpu_sriov_vf(adev)){ 2703 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2704 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 2705 if (r) { 2706 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 2707 adev->mp1_state, r); 2708 return r; 2709 } 2710 } 2711 } 2712 adev->ip_blocks[i].status.hw = false; 2713 } 2714 2715 return 0; 2716 } 2717 2718 /** 2719 * amdgpu_device_ip_suspend - run suspend for hardware IPs 2720 * 2721 * @adev: amdgpu_device pointer 2722 * 2723 * Main suspend function for hardware IPs. The list of all the hardware 2724 * IPs that make up the asic is walked, clockgating is disabled and the 2725 * suspend callbacks are run. suspend puts the hardware and software state 2726 * in each IP into a state suitable for suspend. 2727 * Returns 0 on success, negative error code on failure. 2728 */ 2729 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 2730 { 2731 int r; 2732 2733 if (amdgpu_sriov_vf(adev)) 2734 amdgpu_virt_request_full_gpu(adev, false); 2735 2736 r = amdgpu_device_ip_suspend_phase1(adev); 2737 if (r) 2738 return r; 2739 r = amdgpu_device_ip_suspend_phase2(adev); 2740 2741 if (amdgpu_sriov_vf(adev)) 2742 amdgpu_virt_release_full_gpu(adev, false); 2743 2744 return r; 2745 } 2746 2747 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 2748 { 2749 int i, r; 2750 2751 static enum amd_ip_block_type ip_order[] = { 2752 AMD_IP_BLOCK_TYPE_GMC, 2753 AMD_IP_BLOCK_TYPE_COMMON, 2754 AMD_IP_BLOCK_TYPE_PSP, 2755 AMD_IP_BLOCK_TYPE_IH, 2756 }; 2757 2758 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2759 int j; 2760 struct amdgpu_ip_block *block; 2761 2762 block = &adev->ip_blocks[i]; 2763 block->status.hw = false; 2764 2765 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 2766 2767 if (block->version->type != ip_order[j] || 2768 !block->status.valid) 2769 continue; 2770 2771 r = block->version->funcs->hw_init(adev); 2772 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2773 if (r) 2774 return r; 2775 block->status.hw = true; 2776 } 2777 } 2778 2779 return 0; 2780 } 2781 2782 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 2783 { 2784 int i, r; 2785 2786 static enum amd_ip_block_type ip_order[] = { 2787 AMD_IP_BLOCK_TYPE_SMC, 2788 AMD_IP_BLOCK_TYPE_DCE, 2789 AMD_IP_BLOCK_TYPE_GFX, 2790 AMD_IP_BLOCK_TYPE_SDMA, 2791 AMD_IP_BLOCK_TYPE_UVD, 2792 AMD_IP_BLOCK_TYPE_VCE, 2793 AMD_IP_BLOCK_TYPE_VCN 2794 }; 2795 2796 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2797 int j; 2798 struct amdgpu_ip_block *block; 2799 2800 for (j = 0; j < adev->num_ip_blocks; j++) { 
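/* look up the IP block instance that matches this entry of the fixed re-init order */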
2801 block = &adev->ip_blocks[j]; 2802 2803 if (block->version->type != ip_order[i] || 2804 !block->status.valid || 2805 block->status.hw) 2806 continue; 2807 2808 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 2809 r = block->version->funcs->resume(adev); 2810 else 2811 r = block->version->funcs->hw_init(adev); 2812 2813 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2814 if (r) 2815 return r; 2816 block->status.hw = true; 2817 } 2818 } 2819 2820 return 0; 2821 } 2822 2823 /** 2824 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 2825 * 2826 * @adev: amdgpu_device pointer 2827 * 2828 * First resume function for hardware IPs. The list of all the hardware 2829 * IPs that make up the asic is walked and the resume callbacks are run for 2830 * COMMON, GMC, and IH. resume puts the hardware into a functional state 2831 * after a suspend and updates the software state as necessary. This 2832 * function is also used for restoring the GPU after a GPU reset. 2833 * Returns 0 on success, negative error code on failure. 2834 */ 2835 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 2836 { 2837 int i, r; 2838 2839 for (i = 0; i < adev->num_ip_blocks; i++) { 2840 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2841 continue; 2842 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2843 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2844 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2845 2846 r = adev->ip_blocks[i].version->funcs->resume(adev); 2847 if (r) { 2848 DRM_ERROR("resume of IP block <%s> failed %d\n", 2849 adev->ip_blocks[i].version->funcs->name, r); 2850 return r; 2851 } 2852 adev->ip_blocks[i].status.hw = true; 2853 } 2854 } 2855 2856 return 0; 2857 } 2858 2859 /** 2860 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 2861 * 2862 * @adev: amdgpu_device pointer 2863 * 2864 * First resume function for hardware IPs. The list of all the hardware 2865 * IPs that make up the asic is walked and the resume callbacks are run for 2866 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 2867 * functional state after a suspend and updates the software state as 2868 * necessary. This function is also used for restoring the GPU after a GPU 2869 * reset. 2870 * Returns 0 on success, negative error code on failure. 2871 */ 2872 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 2873 { 2874 int i, r; 2875 2876 for (i = 0; i < adev->num_ip_blocks; i++) { 2877 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2878 continue; 2879 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2880 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2881 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 2882 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 2883 continue; 2884 r = adev->ip_blocks[i].version->funcs->resume(adev); 2885 if (r) { 2886 DRM_ERROR("resume of IP block <%s> failed %d\n", 2887 adev->ip_blocks[i].version->funcs->name, r); 2888 return r; 2889 } 2890 adev->ip_blocks[i].status.hw = true; 2891 } 2892 2893 return 0; 2894 } 2895 2896 /** 2897 * amdgpu_device_ip_resume - run resume for hardware IPs 2898 * 2899 * @adev: amdgpu_device pointer 2900 * 2901 * Main resume function for hardware IPs. 
The hardware IPs
2902 * are split into two resume functions because they are
2903 * also used in recovering from a GPU reset and some additional
2904 * steps need to be taken between them. In this case (S3/S4) they are
2905 * run sequentially.
2906 * Returns 0 on success, negative error code on failure.
2907 */
2908 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
2909 {
2910 int r;
2911
2912 r = amdgpu_device_ip_resume_phase1(adev);
2913 if (r)
2914 return r;
2915
2916 r = amdgpu_device_fw_loading(adev);
2917 if (r)
2918 return r;
2919
2920 r = amdgpu_device_ip_resume_phase2(adev);
2921
2922 return r;
2923 }
2924
2925 /**
2926 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2927 *
2928 * @adev: amdgpu_device pointer
2929 *
2930 * Query the VBIOS data tables to determine if the board supports SR-IOV.
2931 */
2932 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
2933 {
2934 if (amdgpu_sriov_vf(adev)) {
2935 if (adev->is_atom_fw) {
2936 if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2937 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2938 } else {
2939 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2940 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2941 }
2942
2943 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2944 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
2945 }
2946 }
2947
2948 /**
2949 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2950 *
2951 * @asic_type: AMD asic type
2952 *
2953 * Check if there is DC (new modesetting infrastructure) support for an asic.
2954 * Returns true if DC has support, false if not.
2955 */
2956 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
2957 {
2958 switch (asic_type) {
2959 #if defined(CONFIG_DRM_AMD_DC)
2960 #if defined(CONFIG_DRM_AMD_DC_SI)
2961 case CHIP_TAHITI:
2962 case CHIP_PITCAIRN:
2963 case CHIP_VERDE:
2964 case CHIP_OLAND:
2965 #endif
2966 case CHIP_BONAIRE:
2967 case CHIP_KAVERI:
2968 case CHIP_KABINI:
2969 case CHIP_MULLINS:
2970 /*
2971 * We have systems in the wild with these ASICs that require
2972 * LVDS and VGA support which is not supported with DC.
2973 *
2974 * Fall back to the non-DC driver here by default so as not to
2975 * cause regressions.
2976 */ 2977 return amdgpu_dc > 0; 2978 case CHIP_HAWAII: 2979 case CHIP_CARRIZO: 2980 case CHIP_STONEY: 2981 case CHIP_POLARIS10: 2982 case CHIP_POLARIS11: 2983 case CHIP_POLARIS12: 2984 case CHIP_VEGAM: 2985 case CHIP_TONGA: 2986 case CHIP_FIJI: 2987 case CHIP_VEGA10: 2988 case CHIP_VEGA12: 2989 case CHIP_VEGA20: 2990 #if defined(CONFIG_DRM_AMD_DC_DCN) 2991 case CHIP_RAVEN: 2992 case CHIP_NAVI10: 2993 case CHIP_NAVI14: 2994 case CHIP_NAVI12: 2995 case CHIP_RENOIR: 2996 #endif 2997 #if defined(CONFIG_DRM_AMD_DC_DCN3_0) 2998 case CHIP_SIENNA_CICHLID: 2999 case CHIP_NAVY_FLOUNDER: 3000 #endif 3001 return amdgpu_dc != 0; 3002 #endif 3003 default: 3004 if (amdgpu_dc > 0) 3005 DRM_INFO("Display Core has been requested via kernel parameter " 3006 "but isn't supported by ASIC, ignoring\n"); 3007 return false; 3008 } 3009 } 3010 3011 /** 3012 * amdgpu_device_has_dc_support - check if dc is supported 3013 * 3014 * @adev: amdgpu_device_pointer 3015 * 3016 * Returns true for supported, false for not supported 3017 */ 3018 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3019 { 3020 if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display) 3021 return false; 3022 3023 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3024 } 3025 3026 3027 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3028 { 3029 struct amdgpu_device *adev = 3030 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3031 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3032 3033 /* It's a bug to not have a hive within this function */ 3034 if (WARN_ON(!hive)) 3035 return; 3036 3037 /* 3038 * Use task barrier to synchronize all xgmi reset works across the 3039 * hive. task_barrier_enter and task_barrier_exit will block 3040 * until all the threads running the xgmi reset works reach 3041 * those points. task_barrier_full will do both blocks. 3042 */ 3043 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3044 3045 task_barrier_enter(&hive->tb); 3046 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3047 3048 if (adev->asic_reset_res) 3049 goto fail; 3050 3051 task_barrier_exit(&hive->tb); 3052 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3053 3054 if (adev->asic_reset_res) 3055 goto fail; 3056 3057 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count) 3058 adev->mmhub.funcs->reset_ras_error_count(adev); 3059 } else { 3060 3061 task_barrier_full(&hive->tb); 3062 adev->asic_reset_res = amdgpu_asic_reset(adev); 3063 } 3064 3065 fail: 3066 if (adev->asic_reset_res) 3067 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3068 adev->asic_reset_res, adev_to_drm(adev)->unique); 3069 amdgpu_put_xgmi_hive(hive); 3070 } 3071 3072 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3073 { 3074 char *input = amdgpu_lockup_timeout; 3075 char *timeout_setting = NULL; 3076 int index = 0; 3077 long timeout; 3078 int ret = 0; 3079 3080 /* 3081 * By default timeout for non compute jobs is 10000. 3082 * And there is no timeout enforced on compute jobs. 3083 * In SR-IOV or passthrough mode, timeout for compute 3084 * jobs are 60000 by default. 
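 * All values are taken from the lockup_timeout module parameter in the
 * order gfx,compute,sdma,video and are given in milliseconds; an entry of
 * 0 keeps the default and a negative entry disables the timeout
 * (MAX_SCHEDULE_TIMEOUT).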
3085 */ 3086 adev->gfx_timeout = msecs_to_jiffies(10000); 3087 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3088 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3089 adev->compute_timeout = msecs_to_jiffies(60000); 3090 else 3091 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; 3092 3093 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3094 while ((timeout_setting = strsep(&input, ",")) && 3095 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3096 ret = kstrtol(timeout_setting, 0, &timeout); 3097 if (ret) 3098 return ret; 3099 3100 if (timeout == 0) { 3101 index++; 3102 continue; 3103 } else if (timeout < 0) { 3104 timeout = MAX_SCHEDULE_TIMEOUT; 3105 } else { 3106 timeout = msecs_to_jiffies(timeout); 3107 } 3108 3109 switch (index++) { 3110 case 0: 3111 adev->gfx_timeout = timeout; 3112 break; 3113 case 1: 3114 adev->compute_timeout = timeout; 3115 break; 3116 case 2: 3117 adev->sdma_timeout = timeout; 3118 break; 3119 case 3: 3120 adev->video_timeout = timeout; 3121 break; 3122 default: 3123 break; 3124 } 3125 } 3126 /* 3127 * There is only one value specified and 3128 * it should apply to all non-compute jobs. 3129 */ 3130 if (index == 1) { 3131 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3132 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3133 adev->compute_timeout = adev->gfx_timeout; 3134 } 3135 } 3136 3137 return ret; 3138 } 3139 3140 static const struct attribute *amdgpu_dev_attributes[] = { 3141 &dev_attr_product_name.attr, 3142 &dev_attr_product_number.attr, 3143 &dev_attr_serial_number.attr, 3144 &dev_attr_pcie_replay_count.attr, 3145 NULL 3146 }; 3147 3148 3149 /** 3150 * amdgpu_device_init - initialize the driver 3151 * 3152 * @adev: amdgpu_device pointer 3153 * @flags: driver flags 3154 * 3155 * Initializes the driver info and hw (all asics). 3156 * Returns 0 for success or an error on failure. 3157 * Called at driver startup. 
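 * On failure the MMIO mapping is unmapped again and a negative error code
 * is returned to the caller.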
3158 */ 3159 int amdgpu_device_init(struct amdgpu_device *adev, 3160 uint32_t flags) 3161 { 3162 struct drm_device *ddev = adev_to_drm(adev); 3163 struct pci_dev *pdev = adev->pdev; 3164 int r, i; 3165 bool boco = false; 3166 u32 max_MBps; 3167 3168 adev->shutdown = false; 3169 adev->flags = flags; 3170 3171 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3172 adev->asic_type = amdgpu_force_asic_type; 3173 else 3174 adev->asic_type = flags & AMD_ASIC_MASK; 3175 3176 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3177 if (amdgpu_emu_mode == 1) 3178 adev->usec_timeout *= 10; 3179 adev->gmc.gart_size = 512 * 1024 * 1024; 3180 adev->accel_working = false; 3181 adev->num_rings = 0; 3182 adev->mman.buffer_funcs = NULL; 3183 adev->mman.buffer_funcs_ring = NULL; 3184 adev->vm_manager.vm_pte_funcs = NULL; 3185 adev->vm_manager.vm_pte_num_scheds = 0; 3186 adev->gmc.gmc_funcs = NULL; 3187 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3188 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3189 3190 adev->smc_rreg = &amdgpu_invalid_rreg; 3191 adev->smc_wreg = &amdgpu_invalid_wreg; 3192 adev->pcie_rreg = &amdgpu_invalid_rreg; 3193 adev->pcie_wreg = &amdgpu_invalid_wreg; 3194 adev->pciep_rreg = &amdgpu_invalid_rreg; 3195 adev->pciep_wreg = &amdgpu_invalid_wreg; 3196 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3197 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3198 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3199 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3200 adev->didt_rreg = &amdgpu_invalid_rreg; 3201 adev->didt_wreg = &amdgpu_invalid_wreg; 3202 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3203 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3204 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3205 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3206 3207 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3208 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3209 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3210 3211 /* mutex initialization are all done here so we 3212 * can recall function without having locking issues */ 3213 atomic_set(&adev->irq.ih.lock, 0); 3214 mutex_init(&adev->firmware.mutex); 3215 mutex_init(&adev->pm.mutex); 3216 mutex_init(&adev->gfx.gpu_clock_mutex); 3217 mutex_init(&adev->srbm_mutex); 3218 mutex_init(&adev->gfx.pipe_reserve_mutex); 3219 mutex_init(&adev->gfx.gfx_off_mutex); 3220 mutex_init(&adev->grbm_idx_mutex); 3221 mutex_init(&adev->mn_lock); 3222 mutex_init(&adev->virt.vf_errors.lock); 3223 hash_init(adev->mn_hash); 3224 atomic_set(&adev->in_gpu_reset, 0); 3225 init_rwsem(&adev->reset_sem); 3226 mutex_init(&adev->psp.mutex); 3227 mutex_init(&adev->notifier_lock); 3228 3229 r = amdgpu_device_check_arguments(adev); 3230 if (r) 3231 return r; 3232 3233 spin_lock_init(&adev->mmio_idx_lock); 3234 spin_lock_init(&adev->smc_idx_lock); 3235 spin_lock_init(&adev->pcie_idx_lock); 3236 spin_lock_init(&adev->uvd_ctx_idx_lock); 3237 spin_lock_init(&adev->didt_idx_lock); 3238 spin_lock_init(&adev->gc_cac_idx_lock); 3239 spin_lock_init(&adev->se_cac_idx_lock); 3240 spin_lock_init(&adev->audio_endpt_idx_lock); 3241 spin_lock_init(&adev->mm_stats.lock); 3242 3243 INIT_LIST_HEAD(&adev->shadow_list); 3244 mutex_init(&adev->shadow_list_lock); 3245 3246 INIT_DELAYED_WORK(&adev->delayed_init_work, 3247 amdgpu_device_delayed_init_work_handler); 3248 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3249 amdgpu_device_delay_enable_gfx_off); 3250 3251 
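/* xgmi_reset_work is used to synchronize ASIC resets across all devices of an XGMI hive */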
INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3252 3253 adev->gfx.gfx_off_req_count = 1; 3254 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3255 3256 atomic_set(&adev->throttling_logging_enabled, 1); 3257 /* 3258 * If throttling continues, logging will be performed every minute 3259 * to avoid log flooding. "-1" is subtracted since the thermal 3260 * throttling interrupt comes every second. Thus, the total logging 3261 * interval is 59 seconds(retelimited printk interval) + 1(waiting 3262 * for throttling interrupt) = 60 seconds. 3263 */ 3264 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3265 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3266 3267 /* Registers mapping */ 3268 /* TODO: block userspace mapping of io register */ 3269 if (adev->asic_type >= CHIP_BONAIRE) { 3270 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3271 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3272 } else { 3273 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3274 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3275 } 3276 3277 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3278 if (adev->rmmio == NULL) { 3279 return -ENOMEM; 3280 } 3281 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3282 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3283 3284 /* io port mapping */ 3285 for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { 3286 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) { 3287 adev->rio_mem_size = pci_resource_len(adev->pdev, i); 3288 adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size); 3289 break; 3290 } 3291 } 3292 if (adev->rio_mem == NULL) 3293 DRM_INFO("PCI I/O BAR is not found.\n"); 3294 3295 /* enable PCIE atomic ops */ 3296 r = pci_enable_atomic_ops_to_root(adev->pdev, 3297 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3298 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3299 if (r) { 3300 adev->have_atomics_support = false; 3301 DRM_INFO("PCIE atomic ops is not supported\n"); 3302 } else { 3303 adev->have_atomics_support = true; 3304 } 3305 3306 amdgpu_device_get_pcie_info(adev); 3307 3308 if (amdgpu_mcbp) 3309 DRM_INFO("MCBP is enabled\n"); 3310 3311 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) 3312 adev->enable_mes = true; 3313 3314 /* detect hw virtualization here */ 3315 amdgpu_detect_virtualization(adev); 3316 3317 r = amdgpu_device_get_job_timeout_settings(adev); 3318 if (r) { 3319 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3320 goto failed_unmap; 3321 } 3322 3323 /* early init functions */ 3324 r = amdgpu_device_ip_early_init(adev); 3325 if (r) 3326 goto failed_unmap; 3327 3328 /* doorbell bar mapping and doorbell index init*/ 3329 amdgpu_device_doorbell_init(adev); 3330 3331 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3332 /* this will fail for cards that aren't VGA class devices, just 3333 * ignore it */ 3334 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode); 3335 3336 if (amdgpu_device_supports_boco(ddev)) 3337 boco = true; 3338 if (amdgpu_has_atpx() && 3339 (amdgpu_is_atpx_hybrid() || 3340 amdgpu_has_atpx_dgpu_power_cntl()) && 3341 !pci_is_thunderbolt_attached(adev->pdev)) 3342 vga_switcheroo_register_client(adev->pdev, 3343 &amdgpu_switcheroo_ops, boco); 3344 if (boco) 3345 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3346 3347 if (amdgpu_emu_mode == 1) { 3348 /* post the asic on emulation mode */ 3349 emu_soc_asic_init(adev); 3350 goto 
fence_driver_init; 3351 } 3352 3353 /* detect if we are with an SRIOV vbios */ 3354 amdgpu_device_detect_sriov_bios(adev); 3355 3356 /* check if we need to reset the asic 3357 * E.g., driver was not cleanly unloaded previously, etc. 3358 */ 3359 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3360 r = amdgpu_asic_reset(adev); 3361 if (r) { 3362 dev_err(adev->dev, "asic reset on init failed\n"); 3363 goto failed; 3364 } 3365 } 3366 3367 pci_enable_pcie_error_reporting(adev->ddev.pdev); 3368 3369 /* Post card if necessary */ 3370 if (amdgpu_device_need_post(adev)) { 3371 if (!adev->bios) { 3372 dev_err(adev->dev, "no vBIOS found\n"); 3373 r = -EINVAL; 3374 goto failed; 3375 } 3376 DRM_INFO("GPU posting now...\n"); 3377 r = amdgpu_device_asic_init(adev); 3378 if (r) { 3379 dev_err(adev->dev, "gpu post error!\n"); 3380 goto failed; 3381 } 3382 } 3383 3384 if (adev->is_atom_fw) { 3385 /* Initialize clocks */ 3386 r = amdgpu_atomfirmware_get_clock_info(adev); 3387 if (r) { 3388 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3389 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3390 goto failed; 3391 } 3392 } else { 3393 /* Initialize clocks */ 3394 r = amdgpu_atombios_get_clock_info(adev); 3395 if (r) { 3396 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3397 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3398 goto failed; 3399 } 3400 /* init i2c buses */ 3401 if (!amdgpu_device_has_dc_support(adev)) 3402 amdgpu_atombios_i2c_init(adev); 3403 } 3404 3405 fence_driver_init: 3406 /* Fence driver */ 3407 r = amdgpu_fence_driver_init(adev); 3408 if (r) { 3409 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n"); 3410 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3411 goto failed; 3412 } 3413 3414 /* init the mode config */ 3415 drm_mode_config_init(adev_to_drm(adev)); 3416 3417 r = amdgpu_device_ip_init(adev); 3418 if (r) { 3419 /* failed in exclusive mode due to timeout */ 3420 if (amdgpu_sriov_vf(adev) && 3421 !amdgpu_sriov_runtime(adev) && 3422 amdgpu_virt_mmio_blocked(adev) && 3423 !amdgpu_virt_wait_reset(adev)) { 3424 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3425 /* Don't send request since VF is inactive. */ 3426 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3427 adev->virt.ops = NULL; 3428 r = -EAGAIN; 3429 goto failed; 3430 } 3431 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3432 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3433 goto failed; 3434 } 3435 3436 dev_info(adev->dev, 3437 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3438 adev->gfx.config.max_shader_engines, 3439 adev->gfx.config.max_sh_per_se, 3440 adev->gfx.config.max_cu_per_sh, 3441 adev->gfx.cu_info.number); 3442 3443 adev->accel_working = true; 3444 3445 amdgpu_vm_check_compute_bug(adev); 3446 3447 /* Initialize the buffer migration limit. */ 3448 if (amdgpu_moverate >= 0) 3449 max_MBps = amdgpu_moverate; 3450 else 3451 max_MBps = 8; /* Allow 8 MB/s. */ 3452 /* Get a log2 for easy divisions. 
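 * log2_max_MBps is later used by the command submission path when
 * throttling buffer migrations.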
*/ 3453 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3454 3455 amdgpu_fbdev_init(adev); 3456 3457 r = amdgpu_pm_sysfs_init(adev); 3458 if (r) { 3459 adev->pm_sysfs_en = false; 3460 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3461 } else 3462 adev->pm_sysfs_en = true; 3463 3464 r = amdgpu_ucode_sysfs_init(adev); 3465 if (r) { 3466 adev->ucode_sysfs_en = false; 3467 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3468 } else 3469 adev->ucode_sysfs_en = true; 3470 3471 if ((amdgpu_testing & 1)) { 3472 if (adev->accel_working) 3473 amdgpu_test_moves(adev); 3474 else 3475 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n"); 3476 } 3477 if (amdgpu_benchmarking) { 3478 if (adev->accel_working) 3479 amdgpu_benchmark(adev, amdgpu_benchmarking); 3480 else 3481 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); 3482 } 3483 3484 /* 3485 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3486 * Otherwise the mgpu fan boost feature will be skipped due to the 3487 * gpu instance is counted less. 3488 */ 3489 amdgpu_register_gpu_instance(adev); 3490 3491 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3492 * explicit gating rather than handling it automatically. 3493 */ 3494 r = amdgpu_device_ip_late_init(adev); 3495 if (r) { 3496 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3497 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3498 goto failed; 3499 } 3500 3501 /* must succeed. */ 3502 amdgpu_ras_resume(adev); 3503 3504 queue_delayed_work(system_wq, &adev->delayed_init_work, 3505 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3506 3507 if (amdgpu_sriov_vf(adev)) 3508 flush_delayed_work(&adev->delayed_init_work); 3509 3510 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3511 if (r) 3512 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3513 3514 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3515 r = amdgpu_pmu_init(adev); 3516 if (r) 3517 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3518 3519 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3520 if (amdgpu_device_cache_pci_state(adev->pdev)) 3521 pci_restore_state(pdev); 3522 3523 return 0; 3524 3525 failed: 3526 amdgpu_vf_error_trans_all(adev); 3527 if (boco) 3528 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3529 3530 failed_unmap: 3531 iounmap(adev->rmmio); 3532 adev->rmmio = NULL; 3533 3534 return r; 3535 } 3536 3537 /** 3538 * amdgpu_device_fini - tear down the driver 3539 * 3540 * @adev: amdgpu_device pointer 3541 * 3542 * Tear down the driver info (all asics). 3543 * Called at driver shutdown. 
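 * Tear-down mirrors amdgpu_device_init(): interrupts are disabled, the IP
 * blocks are finalized, the gpu_info firmware is released and the MMIO and
 * doorbell mappings are unmapped.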
3544 */ 3545 void amdgpu_device_fini(struct amdgpu_device *adev) 3546 { 3547 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3548 flush_delayed_work(&adev->delayed_init_work); 3549 adev->shutdown = true; 3550 3551 kfree(adev->pci_state); 3552 3553 /* make sure IB test finished before entering exclusive mode 3554 * to avoid preemption on IB test 3555 * */ 3556 if (amdgpu_sriov_vf(adev)) { 3557 amdgpu_virt_request_full_gpu(adev, false); 3558 amdgpu_virt_fini_data_exchange(adev); 3559 } 3560 3561 /* disable all interrupts */ 3562 amdgpu_irq_disable_all(adev); 3563 if (adev->mode_info.mode_config_initialized){ 3564 if (!amdgpu_device_has_dc_support(adev)) 3565 drm_helper_force_disable_all(adev_to_drm(adev)); 3566 else 3567 drm_atomic_helper_shutdown(adev_to_drm(adev)); 3568 } 3569 amdgpu_fence_driver_fini(adev); 3570 if (adev->pm_sysfs_en) 3571 amdgpu_pm_sysfs_fini(adev); 3572 amdgpu_fbdev_fini(adev); 3573 amdgpu_device_ip_fini(adev); 3574 release_firmware(adev->firmware.gpu_info_fw); 3575 adev->firmware.gpu_info_fw = NULL; 3576 adev->accel_working = false; 3577 /* free i2c buses */ 3578 if (!amdgpu_device_has_dc_support(adev)) 3579 amdgpu_i2c_fini(adev); 3580 3581 if (amdgpu_emu_mode != 1) 3582 amdgpu_atombios_fini(adev); 3583 3584 kfree(adev->bios); 3585 adev->bios = NULL; 3586 if (amdgpu_has_atpx() && 3587 (amdgpu_is_atpx_hybrid() || 3588 amdgpu_has_atpx_dgpu_power_cntl()) && 3589 !pci_is_thunderbolt_attached(adev->pdev)) 3590 vga_switcheroo_unregister_client(adev->pdev); 3591 if (amdgpu_device_supports_boco(adev_to_drm(adev))) 3592 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3593 vga_client_register(adev->pdev, NULL, NULL, NULL); 3594 if (adev->rio_mem) 3595 pci_iounmap(adev->pdev, adev->rio_mem); 3596 adev->rio_mem = NULL; 3597 iounmap(adev->rmmio); 3598 adev->rmmio = NULL; 3599 amdgpu_device_doorbell_fini(adev); 3600 3601 if (adev->ucode_sysfs_en) 3602 amdgpu_ucode_sysfs_fini(adev); 3603 3604 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 3605 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3606 amdgpu_pmu_fini(adev); 3607 if (adev->mman.discovery_bin) 3608 amdgpu_discovery_fini(adev); 3609 } 3610 3611 3612 /* 3613 * Suspend & resume. 3614 */ 3615 /** 3616 * amdgpu_device_suspend - initiate device suspend 3617 * 3618 * @dev: drm dev pointer 3619 * @fbcon : notify the fbdev of suspend 3620 * 3621 * Puts the hw in the suspend state (all asics). 3622 * Returns 0 for success or an error on failure. 3623 * Called at driver suspend. 
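 * VRAM is evicted twice: once before the IP blocks are suspended and a
 * second time afterwards so that the GART page table can be evicted using
 * the CPU.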
3624 */ 3625 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 3626 { 3627 struct amdgpu_device *adev; 3628 struct drm_crtc *crtc; 3629 struct drm_connector *connector; 3630 struct drm_connector_list_iter iter; 3631 int r; 3632 3633 adev = drm_to_adev(dev); 3634 3635 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3636 return 0; 3637 3638 adev->in_suspend = true; 3639 drm_kms_helper_poll_disable(dev); 3640 3641 if (fbcon) 3642 amdgpu_fbdev_set_suspend(adev, 1); 3643 3644 cancel_delayed_work_sync(&adev->delayed_init_work); 3645 3646 if (!amdgpu_device_has_dc_support(adev)) { 3647 /* turn off display hw */ 3648 drm_modeset_lock_all(dev); 3649 drm_connector_list_iter_begin(dev, &iter); 3650 drm_for_each_connector_iter(connector, &iter) 3651 drm_helper_connector_dpms(connector, 3652 DRM_MODE_DPMS_OFF); 3653 drm_connector_list_iter_end(&iter); 3654 drm_modeset_unlock_all(dev); 3655 /* unpin the front buffers and cursors */ 3656 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3657 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3658 struct drm_framebuffer *fb = crtc->primary->fb; 3659 struct amdgpu_bo *robj; 3660 3661 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3662 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3663 r = amdgpu_bo_reserve(aobj, true); 3664 if (r == 0) { 3665 amdgpu_bo_unpin(aobj); 3666 amdgpu_bo_unreserve(aobj); 3667 } 3668 } 3669 3670 if (fb == NULL || fb->obj[0] == NULL) { 3671 continue; 3672 } 3673 robj = gem_to_amdgpu_bo(fb->obj[0]); 3674 /* don't unpin kernel fb objects */ 3675 if (!amdgpu_fbdev_robj_is_fb(adev, robj)) { 3676 r = amdgpu_bo_reserve(robj, true); 3677 if (r == 0) { 3678 amdgpu_bo_unpin(robj); 3679 amdgpu_bo_unreserve(robj); 3680 } 3681 } 3682 } 3683 } 3684 3685 amdgpu_ras_suspend(adev); 3686 3687 r = amdgpu_device_ip_suspend_phase1(adev); 3688 3689 amdgpu_amdkfd_suspend(adev, !fbcon); 3690 3691 /* evict vram memory */ 3692 amdgpu_bo_evict_vram(adev); 3693 3694 amdgpu_fence_driver_suspend(adev); 3695 3696 r = amdgpu_device_ip_suspend_phase2(adev); 3697 3698 /* evict remaining vram memory 3699 * This second call to evict vram is to evict the gart page table 3700 * using the CPU. 3701 */ 3702 amdgpu_bo_evict_vram(adev); 3703 3704 return 0; 3705 } 3706 3707 /** 3708 * amdgpu_device_resume - initiate device resume 3709 * 3710 * @dev: drm dev pointer 3711 * @fbcon : notify the fbdev of resume 3712 * 3713 * Bring the hw back to operating state (all asics). 3714 * Returns 0 for success or an error on failure. 3715 * Called at driver resume. 
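 * The card is re-posted via the vBIOS first if necessary, then the IP
 * blocks are resumed and late init (clock- and powergating) is re-applied.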
3716 */ 3717 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 3718 { 3719 struct drm_connector *connector; 3720 struct drm_connector_list_iter iter; 3721 struct amdgpu_device *adev = drm_to_adev(dev); 3722 struct drm_crtc *crtc; 3723 int r = 0; 3724 3725 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3726 return 0; 3727 3728 /* post card */ 3729 if (amdgpu_device_need_post(adev)) { 3730 r = amdgpu_device_asic_init(adev); 3731 if (r) 3732 dev_err(adev->dev, "amdgpu asic init failed\n"); 3733 } 3734 3735 r = amdgpu_device_ip_resume(adev); 3736 if (r) { 3737 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 3738 return r; 3739 } 3740 amdgpu_fence_driver_resume(adev); 3741 3742 3743 r = amdgpu_device_ip_late_init(adev); 3744 if (r) 3745 return r; 3746 3747 queue_delayed_work(system_wq, &adev->delayed_init_work, 3748 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3749 3750 if (!amdgpu_device_has_dc_support(adev)) { 3751 /* pin cursors */ 3752 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3753 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3754 3755 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3756 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3757 r = amdgpu_bo_reserve(aobj, true); 3758 if (r == 0) { 3759 r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM); 3760 if (r != 0) 3761 dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r); 3762 amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj); 3763 amdgpu_bo_unreserve(aobj); 3764 } 3765 } 3766 } 3767 } 3768 r = amdgpu_amdkfd_resume(adev, !fbcon); 3769 if (r) 3770 return r; 3771 3772 /* Make sure IB tests flushed */ 3773 flush_delayed_work(&adev->delayed_init_work); 3774 3775 /* blat the mode back in */ 3776 if (fbcon) { 3777 if (!amdgpu_device_has_dc_support(adev)) { 3778 /* pre DCE11 */ 3779 drm_helper_resume_force_mode(dev); 3780 3781 /* turn on display hw */ 3782 drm_modeset_lock_all(dev); 3783 3784 drm_connector_list_iter_begin(dev, &iter); 3785 drm_for_each_connector_iter(connector, &iter) 3786 drm_helper_connector_dpms(connector, 3787 DRM_MODE_DPMS_ON); 3788 drm_connector_list_iter_end(&iter); 3789 3790 drm_modeset_unlock_all(dev); 3791 } 3792 amdgpu_fbdev_set_suspend(adev, 0); 3793 } 3794 3795 drm_kms_helper_poll_enable(dev); 3796 3797 amdgpu_ras_resume(adev); 3798 3799 /* 3800 * Most of the connector probing functions try to acquire runtime pm 3801 * refs to ensure that the GPU is powered on when connector polling is 3802 * performed. Since we're calling this from a runtime PM callback, 3803 * trying to acquire rpm refs will cause us to deadlock. 3804 * 3805 * Since we're guaranteed to be holding the rpm lock, it's safe to 3806 * temporarily disable the rpm helpers so this doesn't deadlock us. 3807 */ 3808 #ifdef CONFIG_PM 3809 dev->dev->power.disable_depth++; 3810 #endif 3811 if (!amdgpu_device_has_dc_support(adev)) 3812 drm_helper_hpd_irq_event(dev); 3813 else 3814 drm_kms_helper_hotplug_event(dev); 3815 #ifdef CONFIG_PM 3816 dev->dev->power.disable_depth--; 3817 #endif 3818 adev->in_suspend = false; 3819 3820 return 0; 3821 } 3822 3823 /** 3824 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 3825 * 3826 * @adev: amdgpu_device pointer 3827 * 3828 * The list of all the hardware IPs that make up the asic is walked and 3829 * the check_soft_reset callbacks are run. check_soft_reset determines 3830 * if the asic is still hung or not. 3831 * Returns true if any of the IPs are still in a hung state, false if not. 
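 * SR-IOV VFs and ASICs that request a full reset always report a hang here
 * so that the full reset path is taken.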
3832 */ 3833 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 3834 { 3835 int i; 3836 bool asic_hang = false; 3837 3838 if (amdgpu_sriov_vf(adev)) 3839 return true; 3840 3841 if (amdgpu_asic_need_full_reset(adev)) 3842 return true; 3843 3844 for (i = 0; i < adev->num_ip_blocks; i++) { 3845 if (!adev->ip_blocks[i].status.valid) 3846 continue; 3847 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 3848 adev->ip_blocks[i].status.hang = 3849 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 3850 if (adev->ip_blocks[i].status.hang) { 3851 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 3852 asic_hang = true; 3853 } 3854 } 3855 return asic_hang; 3856 } 3857 3858 /** 3859 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 3860 * 3861 * @adev: amdgpu_device pointer 3862 * 3863 * The list of all the hardware IPs that make up the asic is walked and the 3864 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 3865 * handles any IP specific hardware or software state changes that are 3866 * necessary for a soft reset to succeed. 3867 * Returns 0 on success, negative error code on failure. 3868 */ 3869 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 3870 { 3871 int i, r = 0; 3872 3873 for (i = 0; i < adev->num_ip_blocks; i++) { 3874 if (!adev->ip_blocks[i].status.valid) 3875 continue; 3876 if (adev->ip_blocks[i].status.hang && 3877 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 3878 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 3879 if (r) 3880 return r; 3881 } 3882 } 3883 3884 return 0; 3885 } 3886 3887 /** 3888 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 3889 * 3890 * @adev: amdgpu_device pointer 3891 * 3892 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 3893 * reset is necessary to recover. 3894 * Returns true if a full asic reset is required, false if not. 3895 */ 3896 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 3897 { 3898 int i; 3899 3900 if (amdgpu_asic_need_full_reset(adev)) 3901 return true; 3902 3903 for (i = 0; i < adev->num_ip_blocks; i++) { 3904 if (!adev->ip_blocks[i].status.valid) 3905 continue; 3906 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 3907 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 3908 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 3909 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 3910 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3911 if (adev->ip_blocks[i].status.hang) { 3912 dev_info(adev->dev, "Some block need full reset!\n"); 3913 return true; 3914 } 3915 } 3916 } 3917 return false; 3918 } 3919 3920 /** 3921 * amdgpu_device_ip_soft_reset - do a soft reset 3922 * 3923 * @adev: amdgpu_device pointer 3924 * 3925 * The list of all the hardware IPs that make up the asic is walked and the 3926 * soft_reset callbacks are run if the block is hung. soft_reset handles any 3927 * IP specific hardware or software state changes that are necessary to soft 3928 * reset the IP. 3929 * Returns 0 on success, negative error code on failure. 
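 *
 * During recovery these hooks are driven in order by
 * amdgpu_device_pre_asic_reset() below, roughly:
 *
 *   amdgpu_device_ip_pre_soft_reset(adev);
 *   r = amdgpu_device_ip_soft_reset(adev);
 *   amdgpu_device_ip_post_soft_reset(adev);
 *   if (r || amdgpu_device_ip_check_soft_reset(adev))
 *           need_full_reset = true;
 *
 * i.e. a failed or ineffective soft reset escalates to a full ASIC reset.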
3930 */ 3931 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 3932 { 3933 int i, r = 0; 3934 3935 for (i = 0; i < adev->num_ip_blocks; i++) { 3936 if (!adev->ip_blocks[i].status.valid) 3937 continue; 3938 if (adev->ip_blocks[i].status.hang && 3939 adev->ip_blocks[i].version->funcs->soft_reset) { 3940 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 3941 if (r) 3942 return r; 3943 } 3944 } 3945 3946 return 0; 3947 } 3948 3949 /** 3950 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 3951 * 3952 * @adev: amdgpu_device pointer 3953 * 3954 * The list of all the hardware IPs that make up the asic is walked and the 3955 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 3956 * handles any IP specific hardware or software state changes that are 3957 * necessary after the IP has been soft reset. 3958 * Returns 0 on success, negative error code on failure. 3959 */ 3960 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 3961 { 3962 int i, r = 0; 3963 3964 for (i = 0; i < adev->num_ip_blocks; i++) { 3965 if (!adev->ip_blocks[i].status.valid) 3966 continue; 3967 if (adev->ip_blocks[i].status.hang && 3968 adev->ip_blocks[i].version->funcs->post_soft_reset) 3969 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 3970 if (r) 3971 return r; 3972 } 3973 3974 return 0; 3975 } 3976 3977 /** 3978 * amdgpu_device_recover_vram - Recover some VRAM contents 3979 * 3980 * @adev: amdgpu_device pointer 3981 * 3982 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 3983 * restore things like GPUVM page tables after a GPU reset where 3984 * the contents of VRAM might be lost. 3985 * 3986 * Returns: 3987 * 0 on success, negative error code on failure. 3988 */ 3989 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 3990 { 3991 struct dma_fence *fence = NULL, *next = NULL; 3992 struct amdgpu_bo *shadow; 3993 long r = 1, tmo; 3994 3995 if (amdgpu_sriov_runtime(adev)) 3996 tmo = msecs_to_jiffies(8000); 3997 else 3998 tmo = msecs_to_jiffies(100); 3999 4000 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4001 mutex_lock(&adev->shadow_list_lock); 4002 list_for_each_entry(shadow, &adev->shadow_list, shadow_list) { 4003 4004 /* No need to recover an evicted BO */ 4005 if (shadow->tbo.mem.mem_type != TTM_PL_TT || 4006 shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET || 4007 shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM) 4008 continue; 4009 4010 r = amdgpu_bo_restore_shadow(shadow, &next); 4011 if (r) 4012 break; 4013 4014 if (fence) { 4015 tmo = dma_fence_wait_timeout(fence, false, tmo); 4016 dma_fence_put(fence); 4017 fence = next; 4018 if (tmo == 0) { 4019 r = -ETIMEDOUT; 4020 break; 4021 } else if (tmo < 0) { 4022 r = tmo; 4023 break; 4024 } 4025 } else { 4026 fence = next; 4027 } 4028 } 4029 mutex_unlock(&adev->shadow_list_lock); 4030 4031 if (fence) 4032 tmo = dma_fence_wait_timeout(fence, false, tmo); 4033 dma_fence_put(fence); 4034 4035 if (r < 0 || tmo <= 0) { 4036 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4037 return -EIO; 4038 } 4039 4040 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4041 return 0; 4042 } 4043 4044 4045 /** 4046 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4047 * 4048 * @adev: amdgpu device pointer 4049 * @from_hypervisor: request from hypervisor 4050 * 4051 * do VF FLR and reinitialize Asic 4052 * return 0 means succeeded otherwise failed 4053 */ 4054 static int 
amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4055 bool from_hypervisor) 4056 { 4057 int r; 4058 4059 if (from_hypervisor) 4060 r = amdgpu_virt_request_full_gpu(adev, true); 4061 else 4062 r = amdgpu_virt_reset_gpu(adev); 4063 if (r) 4064 return r; 4065 4066 amdgpu_amdkfd_pre_reset(adev); 4067 4068 /* Resume IP prior to SMC */ 4069 r = amdgpu_device_ip_reinit_early_sriov(adev); 4070 if (r) 4071 goto error; 4072 4073 amdgpu_virt_init_data_exchange(adev); 4074 /* we need recover gart prior to run SMC/CP/SDMA resume */ 4075 amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT)); 4076 4077 r = amdgpu_device_fw_loading(adev); 4078 if (r) 4079 return r; 4080 4081 /* now we are okay to resume SMC/CP/SDMA */ 4082 r = amdgpu_device_ip_reinit_late_sriov(adev); 4083 if (r) 4084 goto error; 4085 4086 amdgpu_irq_gpu_reset_resume_helper(adev); 4087 r = amdgpu_ib_ring_tests(adev); 4088 amdgpu_amdkfd_post_reset(adev); 4089 4090 error: 4091 amdgpu_virt_release_full_gpu(adev, true); 4092 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4093 amdgpu_inc_vram_lost(adev); 4094 r = amdgpu_device_recover_vram(adev); 4095 } 4096 4097 return r; 4098 } 4099 4100 /** 4101 * amdgpu_device_has_job_running - check if there is any job in mirror list 4102 * 4103 * @adev: amdgpu device pointer 4104 * 4105 * check if there is any job in mirror list 4106 */ 4107 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4108 { 4109 int i; 4110 struct drm_sched_job *job; 4111 4112 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4113 struct amdgpu_ring *ring = adev->rings[i]; 4114 4115 if (!ring || !ring->sched.thread) 4116 continue; 4117 4118 spin_lock(&ring->sched.job_list_lock); 4119 job = list_first_entry_or_null(&ring->sched.ring_mirror_list, 4120 struct drm_sched_job, node); 4121 spin_unlock(&ring->sched.job_list_lock); 4122 if (job) 4123 return true; 4124 } 4125 return false; 4126 } 4127 4128 /** 4129 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4130 * 4131 * @adev: amdgpu device pointer 4132 * 4133 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4134 * a hung GPU. 
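 *
 * Recovery policy, as implemented below: amdgpu_gpu_recovery == 0 disables
 * recovery entirely, -1 (the default for the gpu_recovery module parameter)
 * enables it only for the ASICs listed in the switch statement, and any
 * other value requests recovery unconditionally; SR-IOV VFs always attempt
 * recovery. For example, loading the driver with
 *
 *   modprobe amdgpu gpu_recovery=1
 *
 * requests recovery even on ASICs not covered by the default list.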
4135 */ 4136 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4137 { 4138 if (!amdgpu_device_ip_check_soft_reset(adev)) { 4139 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n"); 4140 return false; 4141 } 4142 4143 if (amdgpu_gpu_recovery == 0) 4144 goto disabled; 4145 4146 if (amdgpu_sriov_vf(adev)) 4147 return true; 4148 4149 if (amdgpu_gpu_recovery == -1) { 4150 switch (adev->asic_type) { 4151 case CHIP_BONAIRE: 4152 case CHIP_HAWAII: 4153 case CHIP_TOPAZ: 4154 case CHIP_TONGA: 4155 case CHIP_FIJI: 4156 case CHIP_POLARIS10: 4157 case CHIP_POLARIS11: 4158 case CHIP_POLARIS12: 4159 case CHIP_VEGAM: 4160 case CHIP_VEGA20: 4161 case CHIP_VEGA10: 4162 case CHIP_VEGA12: 4163 case CHIP_RAVEN: 4164 case CHIP_ARCTURUS: 4165 case CHIP_RENOIR: 4166 case CHIP_NAVI10: 4167 case CHIP_NAVI14: 4168 case CHIP_NAVI12: 4169 case CHIP_SIENNA_CICHLID: 4170 break; 4171 default: 4172 goto disabled; 4173 } 4174 } 4175 4176 return true; 4177 4178 disabled: 4179 dev_info(adev->dev, "GPU recovery disabled.\n"); 4180 return false; 4181 } 4182 4183 4184 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4185 struct amdgpu_job *job, 4186 bool *need_full_reset_arg) 4187 { 4188 int i, r = 0; 4189 bool need_full_reset = *need_full_reset_arg; 4190 4191 amdgpu_debugfs_wait_dump(adev); 4192 4193 if (amdgpu_sriov_vf(adev)) { 4194 /* stop the data exchange thread */ 4195 amdgpu_virt_fini_data_exchange(adev); 4196 } 4197 4198 /* block all schedulers and reset given job's ring */ 4199 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4200 struct amdgpu_ring *ring = adev->rings[i]; 4201 4202 if (!ring || !ring->sched.thread) 4203 continue; 4204 4205 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4206 amdgpu_fence_driver_force_completion(ring); 4207 } 4208 4209 if(job) 4210 drm_sched_increase_karma(&job->base); 4211 4212 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4213 if (!amdgpu_sriov_vf(adev)) { 4214 4215 if (!need_full_reset) 4216 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4217 4218 if (!need_full_reset) { 4219 amdgpu_device_ip_pre_soft_reset(adev); 4220 r = amdgpu_device_ip_soft_reset(adev); 4221 amdgpu_device_ip_post_soft_reset(adev); 4222 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4223 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4224 need_full_reset = true; 4225 } 4226 } 4227 4228 if (need_full_reset) 4229 r = amdgpu_device_ip_suspend(adev); 4230 4231 *need_full_reset_arg = need_full_reset; 4232 } 4233 4234 return r; 4235 } 4236 4237 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, 4238 struct list_head *device_list_handle, 4239 bool *need_full_reset_arg, 4240 bool skip_hw_reset) 4241 { 4242 struct amdgpu_device *tmp_adev = NULL; 4243 bool need_full_reset = *need_full_reset_arg, vram_lost = false; 4244 int r = 0; 4245 4246 /* 4247 * ASIC reset has to be done on all HGMI hive nodes ASAP 4248 * to allow proper links negotiation in FW (within 1 sec) 4249 */ 4250 if (!skip_hw_reset && need_full_reset) { 4251 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4252 /* For XGMI run all resets in parallel to speed up the process */ 4253 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4254 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4255 r = -EALREADY; 4256 } else 4257 r = amdgpu_asic_reset(tmp_adev); 4258 4259 if (r) { 4260 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4261 r, 
adev_to_drm(tmp_adev)->unique); 4262 break; 4263 } 4264 } 4265 4266 /* For XGMI wait for all resets to complete before proceed */ 4267 if (!r) { 4268 list_for_each_entry(tmp_adev, device_list_handle, 4269 gmc.xgmi.head) { 4270 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4271 flush_work(&tmp_adev->xgmi_reset_work); 4272 r = tmp_adev->asic_reset_res; 4273 if (r) 4274 break; 4275 } 4276 } 4277 } 4278 } 4279 4280 if (!r && amdgpu_ras_intr_triggered()) { 4281 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4282 if (tmp_adev->mmhub.funcs && 4283 tmp_adev->mmhub.funcs->reset_ras_error_count) 4284 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev); 4285 } 4286 4287 amdgpu_ras_intr_cleared(); 4288 } 4289 4290 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4291 if (need_full_reset) { 4292 /* post card */ 4293 if (amdgpu_device_asic_init(tmp_adev)) 4294 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4295 4296 if (!r) { 4297 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4298 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4299 if (r) 4300 goto out; 4301 4302 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4303 if (vram_lost) { 4304 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4305 amdgpu_inc_vram_lost(tmp_adev); 4306 } 4307 4308 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT)); 4309 if (r) 4310 goto out; 4311 4312 r = amdgpu_device_fw_loading(tmp_adev); 4313 if (r) 4314 return r; 4315 4316 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4317 if (r) 4318 goto out; 4319 4320 if (vram_lost) 4321 amdgpu_device_fill_reset_magic(tmp_adev); 4322 4323 /* 4324 * Add this ASIC as tracked as reset was already 4325 * complete successfully. 4326 */ 4327 amdgpu_register_gpu_instance(tmp_adev); 4328 4329 r = amdgpu_device_ip_late_init(tmp_adev); 4330 if (r) 4331 goto out; 4332 4333 amdgpu_fbdev_set_suspend(tmp_adev, 0); 4334 4335 /* 4336 * The GPU enters bad state once faulty pages 4337 * by ECC has reached the threshold, and ras 4338 * recovery is scheduled next. So add one check 4339 * here to break recovery if it indeed exceeds 4340 * bad page threshold, and remind user to 4341 * retire this GPU or setting one bigger 4342 * bad_page_threshold value to fix this once 4343 * probing driver again. 4344 */ 4345 if (!amdgpu_ras_check_err_threshold(tmp_adev)) { 4346 /* must succeed. 
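				 * RAS is resumed here only after the
				 * bad-page threshold check above confirmed
				 * the device is still serviceable; otherwise
				 * recovery is aborted with -EINVAL below.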
*/ 4347 amdgpu_ras_resume(tmp_adev); 4348 } else { 4349 r = -EINVAL; 4350 goto out; 4351 } 4352 4353 /* Update PSP FW topology after reset */ 4354 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4355 r = amdgpu_xgmi_update_topology(hive, tmp_adev); 4356 } 4357 } 4358 4359 out: 4360 if (!r) { 4361 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 4362 r = amdgpu_ib_ring_tests(tmp_adev); 4363 if (r) { 4364 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 4365 r = amdgpu_device_ip_suspend(tmp_adev); 4366 need_full_reset = true; 4367 r = -EAGAIN; 4368 goto end; 4369 } 4370 } 4371 4372 if (!r) 4373 r = amdgpu_device_recover_vram(tmp_adev); 4374 else 4375 tmp_adev->asic_reset_res = r; 4376 } 4377 4378 end: 4379 *need_full_reset_arg = need_full_reset; 4380 return r; 4381 } 4382 4383 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, 4384 struct amdgpu_hive_info *hive) 4385 { 4386 if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0) 4387 return false; 4388 4389 if (hive) { 4390 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock); 4391 } else { 4392 down_write(&adev->reset_sem); 4393 } 4394 4395 atomic_inc(&adev->gpu_reset_counter); 4396 switch (amdgpu_asic_reset_method(adev)) { 4397 case AMD_RESET_METHOD_MODE1: 4398 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 4399 break; 4400 case AMD_RESET_METHOD_MODE2: 4401 adev->mp1_state = PP_MP1_STATE_RESET; 4402 break; 4403 default: 4404 adev->mp1_state = PP_MP1_STATE_NONE; 4405 break; 4406 } 4407 4408 return true; 4409 } 4410 4411 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) 4412 { 4413 amdgpu_vf_error_trans_all(adev); 4414 adev->mp1_state = PP_MP1_STATE_NONE; 4415 atomic_set(&adev->in_gpu_reset, 0); 4416 up_write(&adev->reset_sem); 4417 } 4418 4419 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 4420 { 4421 struct pci_dev *p = NULL; 4422 4423 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4424 adev->pdev->bus->number, 1); 4425 if (p) { 4426 pm_runtime_enable(&(p->dev)); 4427 pm_runtime_resume(&(p->dev)); 4428 } 4429 } 4430 4431 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 4432 { 4433 enum amd_reset_method reset_method; 4434 struct pci_dev *p = NULL; 4435 u64 expires; 4436 4437 /* 4438 * For now, only BACO and mode1 reset are confirmed 4439 * to suffer the audio issue without proper suspended. 4440 */ 4441 reset_method = amdgpu_asic_reset_method(adev); 4442 if ((reset_method != AMD_RESET_METHOD_BACO) && 4443 (reset_method != AMD_RESET_METHOD_MODE1)) 4444 return -EINVAL; 4445 4446 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4447 adev->pdev->bus->number, 1); 4448 if (!p) 4449 return -ENODEV; 4450 4451 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 4452 if (!expires) 4453 /* 4454 * If we cannot get the audio device autosuspend delay, 4455 * a fixed 4S interval will be used. Considering 3S is 4456 * the audio controller default autosuspend delay setting. 4457 * 4S used here is guaranteed to cover that. 4458 */ 4459 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 4460 4461 while (!pm_runtime_status_suspended(&(p->dev))) { 4462 if (!pm_runtime_suspend(&(p->dev))) 4463 break; 4464 4465 if (expires < ktime_get_mono_fast_ns()) { 4466 dev_warn(adev->dev, "failed to suspend display audio\n"); 4467 /* TODO: abort the succeeding gpu reset? 
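		 * For now the timeout is only reported: the caller treats a
		 * non-zero return as "audio not suspended", skips the later
		 * pm_runtime re-enable, and proceeds with the reset anyway.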
*/ 4468 return -ETIMEDOUT; 4469 } 4470 } 4471 4472 pm_runtime_disable(&(p->dev)); 4473 4474 return 0; 4475 } 4476 4477 /** 4478 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 4479 * 4480 * @adev: amdgpu device pointer 4481 * @job: which job trigger hang 4482 * 4483 * Attempt to reset the GPU if it has hung (all asics). 4484 * Attempt to do soft-reset or full-reset and reinitialize Asic 4485 * Returns 0 for success or an error on failure. 4486 */ 4487 4488 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 4489 struct amdgpu_job *job) 4490 { 4491 struct list_head device_list, *device_list_handle = NULL; 4492 bool need_full_reset = false; 4493 bool job_signaled = false; 4494 struct amdgpu_hive_info *hive = NULL; 4495 struct amdgpu_device *tmp_adev = NULL; 4496 int i, r = 0; 4497 bool need_emergency_restart = false; 4498 bool audio_suspended = false; 4499 4500 /** 4501 * Special case: RAS triggered and full reset isn't supported 4502 */ 4503 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 4504 4505 /* 4506 * Flush RAM to disk so that after reboot 4507 * the user can read log and see why the system rebooted. 4508 */ 4509 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 4510 DRM_WARN("Emergency reboot."); 4511 4512 ksys_sync_helper(); 4513 emergency_restart(); 4514 } 4515 4516 dev_info(adev->dev, "GPU %s begin!\n", 4517 need_emergency_restart ? "jobs stop":"reset"); 4518 4519 /* 4520 * Here we trylock to avoid chain of resets executing from 4521 * either trigger by jobs on different adevs in XGMI hive or jobs on 4522 * different schedulers for same device while this TO handler is running. 4523 * We always reset all schedulers for device and all devices for XGMI 4524 * hive so that should take care of them too. 4525 */ 4526 hive = amdgpu_get_xgmi_hive(adev); 4527 if (hive) { 4528 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) { 4529 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", 4530 job ? job->base.id : -1, hive->hive_id); 4531 amdgpu_put_xgmi_hive(hive); 4532 return 0; 4533 } 4534 mutex_lock(&hive->hive_lock); 4535 } 4536 4537 /* 4538 * Build list of devices to reset. 4539 * In case we are in XGMI hive mode, resort the device list 4540 * to put adev in the 1st position. 4541 */ 4542 INIT_LIST_HEAD(&device_list); 4543 if (adev->gmc.xgmi.num_physical_nodes > 1) { 4544 if (!hive) 4545 return -ENODEV; 4546 if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list)) 4547 list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list); 4548 device_list_handle = &hive->device_list; 4549 } else { 4550 list_add_tail(&adev->gmc.xgmi.head, &device_list); 4551 device_list_handle = &device_list; 4552 } 4553 4554 /* block all schedulers and reset given job's ring */ 4555 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4556 if (!amdgpu_device_lock_adev(tmp_adev, hive)) { 4557 dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress", 4558 job ? job->base.id : -1); 4559 r = 0; 4560 goto skip_recovery; 4561 } 4562 4563 /* 4564 * Try to put the audio codec into suspend state 4565 * before gpu reset started. 4566 * 4567 * Due to the power domain of the graphics device 4568 * is shared with AZ power domain. Without this, 4569 * we may change the audio hardware from behind 4570 * the audio driver's back. That will trigger 4571 * some audio codec errors. 
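		 *
		 * (The codec lives on PCI function 1 of the GPU; see the
		 * pci_get_domain_bus_and_slot(..., 1) lookups in the audio
		 * suspend/resume helpers above. It can therefore be
		 * runtime-suspended here without touching the graphics
		 * function itself.)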
4572 */ 4573 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 4574 audio_suspended = true; 4575 4576 amdgpu_ras_set_error_query_ready(tmp_adev, false); 4577 4578 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 4579 4580 if (!amdgpu_sriov_vf(tmp_adev)) 4581 amdgpu_amdkfd_pre_reset(tmp_adev); 4582 4583 /* 4584 * Mark these ASICs to be reseted as untracked first 4585 * And add them back after reset completed 4586 */ 4587 amdgpu_unregister_gpu_instance(tmp_adev); 4588 4589 amdgpu_fbdev_set_suspend(tmp_adev, 1); 4590 4591 /* disable ras on ALL IPs */ 4592 if (!need_emergency_restart && 4593 amdgpu_device_ip_need_full_reset(tmp_adev)) 4594 amdgpu_ras_suspend(tmp_adev); 4595 4596 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4597 struct amdgpu_ring *ring = tmp_adev->rings[i]; 4598 4599 if (!ring || !ring->sched.thread) 4600 continue; 4601 4602 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 4603 4604 if (need_emergency_restart) 4605 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 4606 } 4607 } 4608 4609 if (need_emergency_restart) 4610 goto skip_sched_resume; 4611 4612 /* 4613 * Must check guilty signal here since after this point all old 4614 * HW fences are force signaled. 4615 * 4616 * job->base holds a reference to parent fence 4617 */ 4618 if (job && job->base.s_fence->parent && 4619 dma_fence_is_signaled(job->base.s_fence->parent)) { 4620 job_signaled = true; 4621 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 4622 goto skip_hw_reset; 4623 } 4624 4625 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 4626 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4627 r = amdgpu_device_pre_asic_reset(tmp_adev, 4628 (tmp_adev == adev) ? job : NULL, 4629 &need_full_reset); 4630 /*TODO Should we stop ?*/ 4631 if (r) { 4632 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 4633 r, adev_to_drm(tmp_adev)->unique); 4634 tmp_adev->asic_reset_res = r; 4635 } 4636 } 4637 4638 /* Actual ASIC resets if needed.*/ 4639 /* TODO Implement XGMI hive reset logic for SRIOV */ 4640 if (amdgpu_sriov_vf(adev)) { 4641 r = amdgpu_device_reset_sriov(adev, job ? false : true); 4642 if (r) 4643 adev->asic_reset_res = r; 4644 } else { 4645 r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false); 4646 if (r && r == -EAGAIN) 4647 goto retry; 4648 } 4649 4650 skip_hw_reset: 4651 4652 /* Post ASIC reset for all devs .*/ 4653 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4654 4655 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4656 struct amdgpu_ring *ring = tmp_adev->rings[i]; 4657 4658 if (!ring || !ring->sched.thread) 4659 continue; 4660 4661 /* No point to resubmit jobs if we didn't HW reset*/ 4662 if (!tmp_adev->asic_reset_res && !job_signaled) 4663 drm_sched_resubmit_jobs(&ring->sched); 4664 4665 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); 4666 } 4667 4668 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) { 4669 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 4670 } 4671 4672 tmp_adev->asic_reset_res = 0; 4673 4674 if (r) { 4675 /* bad news, how to tell it to userspace ? 
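			 * For now the failure is only logged and recorded
			 * through amdgpu_vf_error_put() below; userspace is
			 * not notified directly.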
*/ 4676 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 4677 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 4678 } else { 4679 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 4680 } 4681 } 4682 4683 skip_sched_resume: 4684 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4685 /*unlock kfd: SRIOV would do it separately */ 4686 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 4687 amdgpu_amdkfd_post_reset(tmp_adev); 4688 if (audio_suspended) 4689 amdgpu_device_resume_display_audio(tmp_adev); 4690 amdgpu_device_unlock_adev(tmp_adev); 4691 } 4692 4693 skip_recovery: 4694 if (hive) { 4695 atomic_set(&hive->in_reset, 0); 4696 mutex_unlock(&hive->hive_lock); 4697 amdgpu_put_xgmi_hive(hive); 4698 } 4699 4700 if (r) 4701 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 4702 return r; 4703 } 4704 4705 /** 4706 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 4707 * 4708 * @adev: amdgpu_device pointer 4709 * 4710 * Fetchs and stores in the driver the PCIE capabilities (gen speed 4711 * and lanes) of the slot the device is in. Handles APUs and 4712 * virtualized environments where PCIE config space may not be available. 4713 */ 4714 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 4715 { 4716 struct pci_dev *pdev; 4717 enum pci_bus_speed speed_cap, platform_speed_cap; 4718 enum pcie_link_width platform_link_width; 4719 4720 if (amdgpu_pcie_gen_cap) 4721 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 4722 4723 if (amdgpu_pcie_lane_cap) 4724 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 4725 4726 /* covers APUs as well */ 4727 if (pci_is_root_bus(adev->pdev->bus)) { 4728 if (adev->pm.pcie_gen_mask == 0) 4729 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 4730 if (adev->pm.pcie_mlw_mask == 0) 4731 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 4732 return; 4733 } 4734 4735 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 4736 return; 4737 4738 pcie_bandwidth_available(adev->pdev, NULL, 4739 &platform_speed_cap, &platform_link_width); 4740 4741 if (adev->pm.pcie_gen_mask == 0) { 4742 /* asic caps */ 4743 pdev = adev->pdev; 4744 speed_cap = pcie_get_speed_cap(pdev); 4745 if (speed_cap == PCI_SPEED_UNKNOWN) { 4746 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4747 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4748 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 4749 } else { 4750 if (speed_cap == PCIE_SPEED_16_0GT) 4751 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4752 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4753 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 4754 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 4755 else if (speed_cap == PCIE_SPEED_8_0GT) 4756 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4757 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4758 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 4759 else if (speed_cap == PCIE_SPEED_5_0GT) 4760 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4761 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 4762 else 4763 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 4764 } 4765 /* platform caps */ 4766 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 4767 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4768 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 4769 } else { 4770 if (platform_speed_cap == PCIE_SPEED_16_0GT) 4771 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 
4772 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4773 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 4774 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 4775 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 4776 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4777 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4778 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 4779 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 4780 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4781 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 4782 else 4783 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 4784 4785 } 4786 } 4787 if (adev->pm.pcie_mlw_mask == 0) { 4788 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 4789 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 4790 } else { 4791 switch (platform_link_width) { 4792 case PCIE_LNK_X32: 4793 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 4794 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 4795 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 4796 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4797 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4798 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4799 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4800 break; 4801 case PCIE_LNK_X16: 4802 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 4803 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 4804 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4805 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4806 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4807 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4808 break; 4809 case PCIE_LNK_X12: 4810 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 4811 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4812 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4813 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4814 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4815 break; 4816 case PCIE_LNK_X8: 4817 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4818 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4819 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4820 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4821 break; 4822 case PCIE_LNK_X4: 4823 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4824 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4825 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4826 break; 4827 case PCIE_LNK_X2: 4828 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4829 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4830 break; 4831 case PCIE_LNK_X1: 4832 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 4833 break; 4834 default: 4835 break; 4836 } 4837 } 4838 } 4839 } 4840 4841 int amdgpu_device_baco_enter(struct drm_device *dev) 4842 { 4843 struct amdgpu_device *adev = drm_to_adev(dev); 4844 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 4845 4846 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 4847 return -ENOTSUPP; 4848 4849 if (ras && ras->supported) 4850 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 4851 4852 return amdgpu_dpm_baco_enter(adev); 4853 } 4854 4855 int amdgpu_device_baco_exit(struct drm_device *dev) 4856 { 4857 struct amdgpu_device *adev = drm_to_adev(dev); 4858 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 4859 int ret = 0; 4860 4861 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 4862 return -ENOTSUPP; 4863 4864 ret = amdgpu_dpm_baco_exit(adev); 4865 if (ret) 4866 return ret; 4867 4868 if (ras && ras->supported) 4869 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 4870 4871 return 0; 4872 } 4873 4874 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev) 4875 { 4876 int i; 4877 4878 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4879 struct amdgpu_ring *ring = adev->rings[i]; 4880 4881 if (!ring || !ring->sched.thread) 4882 continue; 4883 4884 
cancel_delayed_work_sync(&ring->sched.work_tdr); 4885 } 4886 } 4887 4888 /** 4889 * amdgpu_pci_error_detected - Called when a PCI error is detected. 4890 * @pdev: PCI device struct 4891 * @state: PCI channel state 4892 * 4893 * Description: Called when a PCI error is detected. 4894 * 4895 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 4896 */ 4897 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 4898 { 4899 struct drm_device *dev = pci_get_drvdata(pdev); 4900 struct amdgpu_device *adev = drm_to_adev(dev); 4901 int i; 4902 4903 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 4904 4905 if (adev->gmc.xgmi.num_physical_nodes > 1) { 4906 DRM_WARN("No support for XGMI hive yet..."); 4907 return PCI_ERS_RESULT_DISCONNECT; 4908 } 4909 4910 switch (state) { 4911 case pci_channel_io_normal: 4912 return PCI_ERS_RESULT_CAN_RECOVER; 4913 /* Fatal error, prepare for slot reset */ 4914 case pci_channel_io_frozen: 4915 /* 4916 * Cancel and wait for all TDRs in progress if failing to 4917 * set adev->in_gpu_reset in amdgpu_device_lock_adev 4918 * 4919 * Locking adev->reset_sem will prevent any external access 4920 * to GPU during PCI error recovery 4921 */ 4922 while (!amdgpu_device_lock_adev(adev, NULL)) 4923 amdgpu_cancel_all_tdr(adev); 4924 4925 /* 4926 * Block any work scheduling as we do for regular GPU reset 4927 * for the duration of the recovery 4928 */ 4929 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4930 struct amdgpu_ring *ring = adev->rings[i]; 4931 4932 if (!ring || !ring->sched.thread) 4933 continue; 4934 4935 drm_sched_stop(&ring->sched, NULL); 4936 } 4937 return PCI_ERS_RESULT_NEED_RESET; 4938 case pci_channel_io_perm_failure: 4939 /* Permanent error, prepare for device removal */ 4940 return PCI_ERS_RESULT_DISCONNECT; 4941 } 4942 4943 return PCI_ERS_RESULT_NEED_RESET; 4944 } 4945 4946 /** 4947 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 4948 * @pdev: pointer to PCI device 4949 */ 4950 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 4951 { 4952 4953 DRM_INFO("PCI error: mmio enabled callback!!\n"); 4954 4955 /* TODO - dump whatever for debugging purposes */ 4956 4957 /* This called only if amdgpu_pci_error_detected returns 4958 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 4959 * works, no need to reset slot. 4960 */ 4961 4962 return PCI_ERS_RESULT_RECOVERED; 4963 } 4964 4965 /** 4966 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 4967 * @pdev: PCI device struct 4968 * 4969 * Description: This routine is called by the pci error recovery 4970 * code after the PCI slot has been reset, just before we 4971 * should resume normal operations. 
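 *
 * These error callbacks (error_detected / mmio_enabled / slot_reset /
 * resume) are invoked by the PCI error recovery core. Illustrative sketch
 * of how they are hooked up from the PCI driver side; the actual table
 * lives in the driver registration code, not in this file:
 *
 *   static const struct pci_error_handlers example_pci_err_handler = {
 *           .error_detected = amdgpu_pci_error_detected,
 *           .mmio_enabled   = amdgpu_pci_mmio_enabled,
 *           .slot_reset     = amdgpu_pci_slot_reset,
 *           .resume         = amdgpu_pci_resume,
 *   };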
4972 */ 4973 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 4974 { 4975 struct drm_device *dev = pci_get_drvdata(pdev); 4976 struct amdgpu_device *adev = drm_to_adev(dev); 4977 int r, i; 4978 bool need_full_reset = true; 4979 u32 memsize; 4980 struct list_head device_list; 4981 4982 DRM_INFO("PCI error: slot reset callback!!\n"); 4983 4984 INIT_LIST_HEAD(&device_list); 4985 list_add_tail(&adev->gmc.xgmi.head, &device_list); 4986 4987 /* wait for asic to come out of reset */ 4988 msleep(500); 4989 4990 /* Restore PCI confspace */ 4991 amdgpu_device_load_pci_state(pdev); 4992 4993 /* confirm ASIC came out of reset */ 4994 for (i = 0; i < adev->usec_timeout; i++) { 4995 memsize = amdgpu_asic_get_config_memsize(adev); 4996 4997 if (memsize != 0xffffffff) 4998 break; 4999 udelay(1); 5000 } 5001 if (memsize == 0xffffffff) { 5002 r = -ETIME; 5003 goto out; 5004 } 5005 5006 adev->in_pci_err_recovery = true; 5007 r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset); 5008 adev->in_pci_err_recovery = false; 5009 if (r) 5010 goto out; 5011 5012 r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true); 5013 5014 out: 5015 if (!r) { 5016 if (amdgpu_device_cache_pci_state(adev->pdev)) 5017 pci_restore_state(adev->pdev); 5018 5019 DRM_INFO("PCIe error recovery succeeded\n"); 5020 } else { 5021 DRM_ERROR("PCIe error recovery failed, err:%d", r); 5022 amdgpu_device_unlock_adev(adev); 5023 } 5024 5025 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 5026 } 5027 5028 /** 5029 * amdgpu_pci_resume() - resume normal ops after PCI reset 5030 * @pdev: pointer to PCI device 5031 * 5032 * Called when the error recovery driver tells us that its 5033 * OK to resume normal operation. Use completion to allow 5034 * halted scsi ops to resume. 5035 */ 5036 void amdgpu_pci_resume(struct pci_dev *pdev) 5037 { 5038 struct drm_device *dev = pci_get_drvdata(pdev); 5039 struct amdgpu_device *adev = drm_to_adev(dev); 5040 int i; 5041 5042 5043 DRM_INFO("PCI error: resume callback!!\n"); 5044 5045 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5046 struct amdgpu_ring *ring = adev->rings[i]; 5047 5048 if (!ring || !ring->sched.thread) 5049 continue; 5050 5051 5052 drm_sched_resubmit_jobs(&ring->sched); 5053 drm_sched_start(&ring->sched, true); 5054 } 5055 5056 amdgpu_device_unlock_adev(adev); 5057 } 5058 5059 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 5060 { 5061 struct drm_device *dev = pci_get_drvdata(pdev); 5062 struct amdgpu_device *adev = drm_to_adev(dev); 5063 int r; 5064 5065 r = pci_save_state(pdev); 5066 if (!r) { 5067 kfree(adev->pci_state); 5068 5069 adev->pci_state = pci_store_saved_state(pdev); 5070 5071 if (!adev->pci_state) { 5072 DRM_ERROR("Failed to store PCI saved state"); 5073 return false; 5074 } 5075 } else { 5076 DRM_WARN("Failed to save PCI state, err:%d\n", r); 5077 return false; 5078 } 5079 5080 return true; 5081 } 5082 5083 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 5084 { 5085 struct drm_device *dev = pci_get_drvdata(pdev); 5086 struct amdgpu_device *adev = drm_to_adev(dev); 5087 int r; 5088 5089 if (!adev->pci_state) 5090 return false; 5091 5092 r = pci_load_saved_state(pdev, adev->pci_state); 5093 5094 if (!r) { 5095 pci_restore_state(pdev); 5096 } else { 5097 DRM_WARN("Failed to load PCI state, err:%d\n", r); 5098 return false; 5099 } 5100 5101 return true; 5102 } 5103 5104 5105
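/*
 * Usage note (summary only, no additional code paths):
 * amdgpu_device_cache_pci_state() snapshots the device's PCI config space
 * into adev->pci_state, and amdgpu_device_load_pci_state() re-applies that
 * snapshot via pci_restore_state(). The pair is used by the PCI error
 * recovery path above (see amdgpu_pci_slot_reset()), where the saved state
 * is restored after the slot reset and re-cached once recovery succeeds.
 */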