/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/pci.h>
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"NAVI10",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"LAST",
};

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
		amdgpu_device_get_pcie_replay_count, NULL);

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * DOC: product_name
 *
 * The amdgpu driver provides a sysfs API for reporting the product name
 * for the device.
 * The file product_name is used for this and returns the product name
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_name(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
}

static DEVICE_ATTR(product_name, S_IRUGO,
		amdgpu_device_get_product_name, NULL);

/**
 * DOC: product_number
 *
 * The amdgpu driver provides a sysfs API for reporting the part number
 * for the device.
 * The file product_number is used for this and returns the part number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
}

static DEVICE_ATTR(product_number, S_IRUGO,
		amdgpu_device_get_product_number, NULL);

/**
 * DOC: serial_number
 *
 * The amdgpu driver provides a sysfs API for reporting the serial number
 * for the device.
 * The file serial_number is used for this and returns the serial number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_serial_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
}

static DEVICE_ATTR(serial_number, S_IRUGO,
		amdgpu_device_get_serial_number, NULL);

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with HG/PX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->flags & AMD_IS_PX)
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise return false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * VRAM access helper functions.
 *
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       uint32_t *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0;
	uint64_t last;


#ifdef CONFIG_64BIT
	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		void __iomem *addr = adev->mman.aper_base_kaddr + pos;
		size_t count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_asic_flush_hdp(adev, NULL);
		} else {
			amdgpu_asic_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

		if (count == size)
			return;

		pos += count;
		buf += count / 4;
		size -= count;
	}
#endif

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		uint32_t tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *buf++);
		else
			*buf++ = RREG32_NO_KIQ(mmMM_DATA);
	}
	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
}

/*
 * register access helper functions.
 */
/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (adev->in_pci_err_recovery)
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

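/*
 * Illustrative sketch (not part of the driver): callers normally reach
 * amdgpu_device_rreg()/amdgpu_device_wreg() through the RREG32/WREG32
 * wrappers, e.g. a read-modify-write of a hypothetical register FOO_CNTL:
 *
 *	u32 tmp = RREG32(mmFOO_CNTL);		// mmFOO_CNTL is a made-up offset
 *	tmp |= FOO_CNTL__ENABLE_MASK;		// made-up mask, for illustration only
 *	WREG32(mmFOO_CNTL, tmp);
 *
 * Real offsets and masks come from the generated register headers for the
 * ASIC in question.
 */
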
/*
 * MMIO register read with bytes helper functions
 * @offset:bytes offset from MMIO start
 *
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (adev->in_pci_err_recovery)
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset:bytes offset from MMIO start
 * @value: the value to be written to the register
 *
 */
/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (adev->in_pci_err_recovery)
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (adev->in_pci_err_recovery)
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_sem)) {
			amdgpu_kiq_wreg(adev, reg, v);
			up_read(&adev->reset_sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/*
 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range
 *
 * this function is invoked only for debugfs register access
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v)
{
	if (adev->in_pci_err_recovery)
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_io_rreg - read an IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 *
 * Returns the 32 bit value from the offset specified.
 */
u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
{
	if (adev->in_pci_err_recovery)
		return 0;

	if ((reg * 4) < adev->rio_mem_size)
		return ioread32(adev->rio_mem + (reg * 4));
	else {
		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
		return ioread32(adev->rio_mem + (mmMM_DATA * 4));
	}
}

/**
 * amdgpu_io_wreg - write to an IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
{
	if (adev->in_pci_err_recovery)
		return;

	if ((reg * 4) < adev->rio_mem_size)
		iowrite32(v, adev->rio_mem + (reg * 4));
	else {
		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
		iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
	}
}

/**
 * amdgpu_mm_rdoorbell - read a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (CIK).
 */
u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
{
	if (adev->in_pci_err_recovery)
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return readl(adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell - write a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (CIK).
 */
void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
{
	if (adev->in_pci_err_recovery)
		return;

	if (index < adev->doorbell.num_doorbells) {
		writel(v, adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
{
	if (adev->in_pci_err_recovery)
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
{
	if (adev->in_pci_err_recovery)
		return;

	if (index < adev->doorbell.num_doorbells) {
		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 pcie_index, u32 pcie_data,
				u32 reg_addr)
{
	unsigned long flags;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 pcie_index, u32 pcie_data,
				  u32 reg_addr)
{
	unsigned long flags;
	u64 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 pcie_index, u32 pcie_data,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

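/*
 * Illustrative sketch (not part of the driver): an ASIC-specific pcie_rreg
 * callback would typically wrap the indirect helpers above with its own
 * index/data register pair, roughly along these lines:
 *
 *	static u32 foo_pcie_rreg(struct amdgpu_device *adev, u32 reg)
 *	{
 *		// mmPCIE_INDEX2/mmPCIE_DATA2 stand in for the real offsets
 *		return amdgpu_device_indirect_rreg(adev, mmPCIE_INDEX2,
 *						   mmPCIE_DATA2, reg);
 *	}
 *
 * The actual index/data register offsets differ per ASIC generation.
 */
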
/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 pcie_index, u32 pcie_data,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 *
 * Dummy register read function.  Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function.  Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 *
 * Dummy register read function.  Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function.  Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function.  Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function.  Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	amdgpu_asic_pre_asic_init(adev);

	return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}

/**
 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
				       &adev->vram_scratch.robj,
				       &adev->vram_scratch.gpu_addr,
				       (void **)&adev->vram_scratch.ptr);
}

/**
 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

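/*
 * Illustrative sketch (not part of the driver): golden register lists passed
 * to amdgpu_device_program_register_sequence() are flat triplets of
 * { register offset, AND mask, OR mask }, e.g.:
 *
 *	static const u32 golden_settings_example[] = {
 *		// offset,   and_mask,   or_mask   (made-up values)
 *		0x00002030, 0xffffffff, 0x00000042,
 *		0x00002040, 0x0000ff00, 0x00001200,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, golden_settings_example,
 *						ARRAY_SIZE(golden_settings_example));
 *
 * Real golden settings live in the per-ASIC IP files.
 */
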
/*
 * GPU doorbell aperture helper functions.
 */
/**
 * amdgpu_device_doorbell_init - Init doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Init doorbell driver information (CIK)
 * Returns 0 on success, error on failure.
 */
static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
{

	/* No doorbell on SI hardware generation */
	if (adev->asic_type < CHIP_BONAIRE) {
		adev->doorbell.base = 0;
		adev->doorbell.size = 0;
		adev->doorbell.num_doorbells = 0;
		adev->doorbell.ptr = NULL;
		return 0;
	}

	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
		return -EINVAL;

	amdgpu_asic_init_doorbell_index(adev);

	/* doorbell bar mapping */
	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
	adev->doorbell.size = pci_resource_len(adev->pdev, 2);

	adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
					     adev->doorbell_index.max_assignment+1);
	if (adev->doorbell.num_doorbells == 0)
		return -EINVAL;

	/* For Vega, reserve and map two pages on doorbell BAR since SDMA
	 * paging queue doorbell use the second page. The
	 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
	 * doorbells are in the first page. So with paging queue enabled,
	 * max num_doorbells is increased by one page (0x400 dwords).
	 */
	if (adev->asic_type >= CHIP_VEGA10)
		adev->doorbell.num_doorbells += 0x400;

	adev->doorbell.ptr = ioremap(adev->doorbell.base,
				     adev->doorbell.num_doorbells *
				     sizeof(u32));
	if (adev->doorbell.ptr == NULL)
		return -ENOMEM;

	return 0;
}

/**
 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down doorbell driver information (CIK)
 */
static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
{
	iounmap(adev->doorbell.ptr);
	adev->doorbell.ptr = NULL;
}



/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

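/*
 * Illustrative sketch (not part of the driver): ring code typically grabs a
 * writeback slot with the allocation helpers that follow and reads the value
 * the GPU wrote there, roughly:
 *
 *	u32 wb;
 *
 *	if (!amdgpu_device_wb_get(adev, &wb)) {
 *		u64 gpu_addr = adev->wb.gpu_addr + (wb * 4);	// dword offset -> bytes
 *		u32 val = le32_to_cpu(adev->wb.wb[wb]);		// CPU-side read
 *		amdgpu_device_wb_free(adev, wb);
 *	}
 *
 * gpu_addr is what gets handed to the engine so it knows where to write.
 */
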
/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
	u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
	struct pci_bus *root;
	struct resource *res;
	unsigned i;
	u16 cmd;
	int r;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_device_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_device_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup,
 * or if a post is needed because a hw reset was performed.
 * Returns true if post is needed or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old smc fw still need the driver to do a vPost, otherwise the gpu hangs,
		 * while smc fw versions above 22.15 don't have this flaw, so we force
		 * vPost to be executed for smc versions below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;
			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if an error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @cookie: amdgpu_device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
{
	struct amdgpu_device *adev = cookie;
	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines number of bits in page table versus page directory,
 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
 * page table and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("SMU memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater than or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater than or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	amdgpu_gmc_tmz_set(adev);

	if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) {
		amdgpu_num_kcq = 8;
		dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid parameter provided by user\n");
	}

	amdgpu_gmc_noretry_set(adev);

	return 0;
}

/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes the
 * asic before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(dev->pdev, PCI_D0);
		amdgpu_device_load_pci_state(dev->pdev);
		r = pci_enable_device(dev->pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
		drm_kms_helper_poll_enable(dev);
	} else {
		pr_info("switched off\n");
		drm_kms_helper_poll_disable(dev);
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(dev->pdev);
		/* Shut down the device */
		pci_disable_device(dev->pdev);
		pci_set_power_state(dev->pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Checks whether the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u32 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if equal or greater
 * return 1 if smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

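/*
 * Illustrative sketch (not part of the driver): ASIC setup code registers its
 * IP blocks with amdgpu_device_ip_block_add() and can gate features on a
 * minimum IP version with amdgpu_device_ip_block_version_cmp(), e.g.:
 *
 *	amdgpu_device_ip_block_add(adev, &gmc_v8_0_ip_block);
 *
 *	if (amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GMC,
 *					       8, 0) == 0)
 *		; // GMC 8.0 or newer is present on this asic
 *
 * gmc_v8_0_ip_block is used here purely as an example of an existing IP
 * block descriptor; the real add order lives in the per-ASIC files.
 */
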
/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display.  This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		struct drm_device *ddev = adev_to_drm(adev);
		const char *pci_address_name = pci_name(ddev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	char fw_name[40];
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	if (adev->mman.discovery_bin) {
		amdgpu_discovery_get_gfx_info(adev);

		/*
		 * FIXME: The bounding box is still needed by Navi12, so
		 * temporarily read it from gpu_info firmware. Should be dropped
		 * when DAL no longer needs it.
		 */
		if (adev->asic_type != CHIP_NAVI12)
			return 0;
	}

	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_VERDE:
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_OLAND:
	case CHIP_HAINAN:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TOPAZ:
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
	case CHIP_VEGA20:
	case CHIP_SIENNA_CICHLID:
	case CHIP_NAVY_FLOUNDER:
	default:
		return 0;
	case CHIP_VEGA10:
		chip_name = "vega10";
		break;
	case CHIP_VEGA12:
		chip_name = "vega12";
		break;
	case CHIP_RAVEN:
		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
			chip_name = "raven2";
		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
			chip_name = "picasso";
		else
			chip_name = "raven";
		break;
	case CHIP_ARCTURUS:
		chip_name = "arcturus";
		break;
	case CHIP_RENOIR:
		chip_name = "renoir";
		break;
	case CHIP_NAVI10:
		chip_name = "navi10";
		break;
	case CHIP_NAVI14:
		chip_name = "navi14";
		break;
	case CHIP_NAVI12:
		chip_name = "navi12";
		break;
	}

	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
	err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
	if (err) {
		dev_err(adev->dev,
			"Failed to load gpu_info firmware \"%s\"\n",
			fw_name);
		goto out;
	}
	err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
	if (err) {
		dev_err(adev->dev,
			"Failed to validate gpu_info firmware \"%s\"\n",
			fw_name);
		goto out;
	}

	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);

	switch (hdr->version_major) {
	case 1:
	{
		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
								le32_to_cpu(hdr->header.ucode_array_offset_bytes));

		/*
		 * Should be dropped when DAL no longer needs it.
		 */
		if (adev->asic_type == CHIP_NAVI12)
			goto parse_soc_bounding_box;

		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
		adev->gfx.config.max_texture_channel_caches =
			le32_to_cpu(gpu_info_fw->gc_num_tccs);
		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
		adev->gfx.config.double_offchip_lds_buf =
			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
		adev->gfx.cu_info.max_waves_per_simd =
			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
		adev->gfx.cu_info.max_scratch_slots_per_cu =
			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
		if (hdr->version_minor >= 1) {
			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->gfx.config.num_sc_per_sh =
				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
			adev->gfx.config.num_packer_per_sc =
				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
		}

parse_soc_bounding_box:
		/*
		 * soc bounding box info is not integrated in the discovery table,
		 * we always need to parse it from gpu info firmware if needed.
		 */
		if (hdr->version_minor == 2) {
			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
		}
		break;
	}
	default:
		dev_err(adev->dev,
			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
		err = -EINVAL;
		goto out;
	}
out:
	return err;
}

/**
 * amdgpu_device_ip_early_init - run early init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Early initialization pass for hardware IPs. The hardware IPs that make
 * up each asic are discovered and each IP's early_init callback is run.  This
 * is the first stage in initializing the asic.
 * Returns 0 on success, negative error code on failure.
1909 */ 1910 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 1911 { 1912 int i, r; 1913 1914 amdgpu_device_enable_virtual_display(adev); 1915 1916 if (amdgpu_sriov_vf(adev)) { 1917 r = amdgpu_virt_request_full_gpu(adev, true); 1918 if (r) 1919 return r; 1920 } 1921 1922 switch (adev->asic_type) { 1923 #ifdef CONFIG_DRM_AMDGPU_SI 1924 case CHIP_VERDE: 1925 case CHIP_TAHITI: 1926 case CHIP_PITCAIRN: 1927 case CHIP_OLAND: 1928 case CHIP_HAINAN: 1929 adev->family = AMDGPU_FAMILY_SI; 1930 r = si_set_ip_blocks(adev); 1931 if (r) 1932 return r; 1933 break; 1934 #endif 1935 #ifdef CONFIG_DRM_AMDGPU_CIK 1936 case CHIP_BONAIRE: 1937 case CHIP_HAWAII: 1938 case CHIP_KAVERI: 1939 case CHIP_KABINI: 1940 case CHIP_MULLINS: 1941 if (adev->flags & AMD_IS_APU) 1942 adev->family = AMDGPU_FAMILY_KV; 1943 else 1944 adev->family = AMDGPU_FAMILY_CI; 1945 1946 r = cik_set_ip_blocks(adev); 1947 if (r) 1948 return r; 1949 break; 1950 #endif 1951 case CHIP_TOPAZ: 1952 case CHIP_TONGA: 1953 case CHIP_FIJI: 1954 case CHIP_POLARIS10: 1955 case CHIP_POLARIS11: 1956 case CHIP_POLARIS12: 1957 case CHIP_VEGAM: 1958 case CHIP_CARRIZO: 1959 case CHIP_STONEY: 1960 if (adev->flags & AMD_IS_APU) 1961 adev->family = AMDGPU_FAMILY_CZ; 1962 else 1963 adev->family = AMDGPU_FAMILY_VI; 1964 1965 r = vi_set_ip_blocks(adev); 1966 if (r) 1967 return r; 1968 break; 1969 case CHIP_VEGA10: 1970 case CHIP_VEGA12: 1971 case CHIP_VEGA20: 1972 case CHIP_RAVEN: 1973 case CHIP_ARCTURUS: 1974 case CHIP_RENOIR: 1975 if (adev->flags & AMD_IS_APU) 1976 adev->family = AMDGPU_FAMILY_RV; 1977 else 1978 adev->family = AMDGPU_FAMILY_AI; 1979 1980 r = soc15_set_ip_blocks(adev); 1981 if (r) 1982 return r; 1983 break; 1984 case CHIP_NAVI10: 1985 case CHIP_NAVI14: 1986 case CHIP_NAVI12: 1987 case CHIP_SIENNA_CICHLID: 1988 case CHIP_NAVY_FLOUNDER: 1989 adev->family = AMDGPU_FAMILY_NV; 1990 1991 r = nv_set_ip_blocks(adev); 1992 if (r) 1993 return r; 1994 break; 1995 default: 1996 /* FIXME: not supported yet */ 1997 return -EINVAL; 1998 } 1999 2000 amdgpu_amdkfd_device_probe(adev); 2001 2002 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2003 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2004 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2005 2006 for (i = 0; i < adev->num_ip_blocks; i++) { 2007 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2008 DRM_ERROR("disabled ip block: %d <%s>\n", 2009 i, adev->ip_blocks[i].version->funcs->name); 2010 adev->ip_blocks[i].status.valid = false; 2011 } else { 2012 if (adev->ip_blocks[i].version->funcs->early_init) { 2013 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2014 if (r == -ENOENT) { 2015 adev->ip_blocks[i].status.valid = false; 2016 } else if (r) { 2017 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2018 adev->ip_blocks[i].version->funcs->name, r); 2019 return r; 2020 } else { 2021 adev->ip_blocks[i].status.valid = true; 2022 } 2023 } else { 2024 adev->ip_blocks[i].status.valid = true; 2025 } 2026 } 2027 /* get the vbios after the asic_funcs are set up */ 2028 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2029 r = amdgpu_device_parse_gpu_info_fw(adev); 2030 if (r) 2031 return r; 2032 2033 /* Read BIOS */ 2034 if (!amdgpu_get_bios(adev)) 2035 return -EINVAL; 2036 2037 r = amdgpu_atombios_init(adev); 2038 if (r) { 2039 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2040 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2041 return r; 2042 } 2043 } 2044 } 2045 2046 adev->cg_flags &= amdgpu_cg_mask; 2047 
adev->pg_flags &= amdgpu_pg_mask; 2048 2049 return 0; 2050 } 2051 2052 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2053 { 2054 int i, r; 2055 2056 for (i = 0; i < adev->num_ip_blocks; i++) { 2057 if (!adev->ip_blocks[i].status.sw) 2058 continue; 2059 if (adev->ip_blocks[i].status.hw) 2060 continue; 2061 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2062 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2063 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2064 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2065 if (r) { 2066 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2067 adev->ip_blocks[i].version->funcs->name, r); 2068 return r; 2069 } 2070 adev->ip_blocks[i].status.hw = true; 2071 } 2072 } 2073 2074 return 0; 2075 } 2076 2077 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2078 { 2079 int i, r; 2080 2081 for (i = 0; i < adev->num_ip_blocks; i++) { 2082 if (!adev->ip_blocks[i].status.sw) 2083 continue; 2084 if (adev->ip_blocks[i].status.hw) 2085 continue; 2086 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2087 if (r) { 2088 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2089 adev->ip_blocks[i].version->funcs->name, r); 2090 return r; 2091 } 2092 adev->ip_blocks[i].status.hw = true; 2093 } 2094 2095 return 0; 2096 } 2097 2098 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2099 { 2100 int r = 0; 2101 int i; 2102 uint32_t smu_version; 2103 2104 if (adev->asic_type >= CHIP_VEGA10) { 2105 for (i = 0; i < adev->num_ip_blocks; i++) { 2106 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2107 continue; 2108 2109 /* no need to do the fw loading again if already done*/ 2110 if (adev->ip_blocks[i].status.hw == true) 2111 break; 2112 2113 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2114 r = adev->ip_blocks[i].version->funcs->resume(adev); 2115 if (r) { 2116 DRM_ERROR("resume of IP block <%s> failed %d\n", 2117 adev->ip_blocks[i].version->funcs->name, r); 2118 return r; 2119 } 2120 } else { 2121 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2122 if (r) { 2123 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2124 adev->ip_blocks[i].version->funcs->name, r); 2125 return r; 2126 } 2127 } 2128 2129 adev->ip_blocks[i].status.hw = true; 2130 break; 2131 } 2132 } 2133 2134 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2135 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2136 2137 return r; 2138 } 2139 2140 /** 2141 * amdgpu_device_ip_init - run init for hardware IPs 2142 * 2143 * @adev: amdgpu_device pointer 2144 * 2145 * Main initialization pass for hardware IPs. The list of all the hardware 2146 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2147 * are run. sw_init initializes the software state associated with each IP 2148 * and hw_init initializes the hardware associated with each IP. 2149 * Returns 0 on success, negative error code on failure. 
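 * The GMC block is brought up first so that GPU memory is usable for the
 * rest of init: VRAM scratch, writeback and (for MCBP/SR-IOV) the CSA are
 * allocated right after its hw_init. The remaining blocks are then
 * initialized in two hardware phases with firmware loading in between,
 * followed by RAS recovery, XGMI and KFD setup.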
2150 */ 2151 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2152 { 2153 int i, r; 2154 2155 r = amdgpu_ras_init(adev); 2156 if (r) 2157 return r; 2158 2159 for (i = 0; i < adev->num_ip_blocks; i++) { 2160 if (!adev->ip_blocks[i].status.valid) 2161 continue; 2162 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2163 if (r) { 2164 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2165 adev->ip_blocks[i].version->funcs->name, r); 2166 goto init_failed; 2167 } 2168 adev->ip_blocks[i].status.sw = true; 2169 2170 /* need to do gmc hw init early so we can allocate gpu mem */ 2171 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2172 r = amdgpu_device_vram_scratch_init(adev); 2173 if (r) { 2174 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2175 goto init_failed; 2176 } 2177 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2178 if (r) { 2179 DRM_ERROR("hw_init %d failed %d\n", i, r); 2180 goto init_failed; 2181 } 2182 r = amdgpu_device_wb_init(adev); 2183 if (r) { 2184 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2185 goto init_failed; 2186 } 2187 adev->ip_blocks[i].status.hw = true; 2188 2189 /* right after GMC hw init, we create CSA */ 2190 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2191 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2192 AMDGPU_GEM_DOMAIN_VRAM, 2193 AMDGPU_CSA_SIZE); 2194 if (r) { 2195 DRM_ERROR("allocate CSA failed %d\n", r); 2196 goto init_failed; 2197 } 2198 } 2199 } 2200 } 2201 2202 if (amdgpu_sriov_vf(adev)) 2203 amdgpu_virt_init_data_exchange(adev); 2204 2205 r = amdgpu_ib_pool_init(adev); 2206 if (r) { 2207 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2208 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2209 goto init_failed; 2210 } 2211 2212 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2213 if (r) 2214 goto init_failed; 2215 2216 r = amdgpu_device_ip_hw_init_phase1(adev); 2217 if (r) 2218 goto init_failed; 2219 2220 r = amdgpu_device_fw_loading(adev); 2221 if (r) 2222 goto init_failed; 2223 2224 r = amdgpu_device_ip_hw_init_phase2(adev); 2225 if (r) 2226 goto init_failed; 2227 2228 /* 2229 * retired pages will be loaded from eeprom and reserved here, 2230 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2231 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2232 * for I2C communication which only true at this point. 2233 * 2234 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2235 * failure from bad gpu situation and stop amdgpu init process 2236 * accordingly. For other failed cases, it will still release all 2237 * the resource and print error message, rather than returning one 2238 * negative value to upper level. 2239 * 2240 * Note: theoretically, this should be called before all vram allocations 2241 * to protect retired page from abusing 2242 */ 2243 r = amdgpu_ras_recovery_init(adev); 2244 if (r) 2245 goto init_failed; 2246 2247 if (adev->gmc.xgmi.num_physical_nodes > 1) 2248 amdgpu_xgmi_add_device(adev); 2249 amdgpu_amdkfd_device_init(adev); 2250 2251 amdgpu_fru_get_product_info(adev); 2252 2253 init_failed: 2254 if (amdgpu_sriov_vf(adev)) 2255 amdgpu_virt_release_full_gpu(adev, true); 2256 2257 return r; 2258 } 2259 2260 /** 2261 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2262 * 2263 * @adev: amdgpu_device pointer 2264 * 2265 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2266 * this function before a GPU reset. 
If the value is retained after a 2267 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 2268 */ 2269 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2270 { 2271 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2272 } 2273 2274 /** 2275 * amdgpu_device_check_vram_lost - check if vram is valid 2276 * 2277 * @adev: amdgpu_device pointer 2278 * 2279 * Checks the reset magic value written to the gart pointer in VRAM. 2280 * The driver calls this after a GPU reset to see if the contents of 2281 * VRAM is lost or not. 2282 * returns true if vram is lost, false if not. 2283 */ 2284 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2285 { 2286 if (memcmp(adev->gart.ptr, adev->reset_magic, 2287 AMDGPU_RESET_MAGIC_NUM)) 2288 return true; 2289 2290 if (!amdgpu_in_reset(adev)) 2291 return false; 2292 2293 /* 2294 * For all ASICs with baco/mode1 reset, the VRAM is 2295 * always assumed to be lost. 2296 */ 2297 switch (amdgpu_asic_reset_method(adev)) { 2298 case AMD_RESET_METHOD_BACO: 2299 case AMD_RESET_METHOD_MODE1: 2300 return true; 2301 default: 2302 return false; 2303 } 2304 } 2305 2306 /** 2307 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2308 * 2309 * @adev: amdgpu_device pointer 2310 * @state: clockgating state (gate or ungate) 2311 * 2312 * The list of all the hardware IPs that make up the asic is walked and the 2313 * set_clockgating_state callbacks are run. 2314 * The late initialization pass enables clockgating for hardware IPs. 2315 * The fini or suspend pass disables clockgating for hardware IPs. 2316 * Returns 0 on success, negative error code on failure. 2317 */ 2318 2319 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2320 enum amd_clockgating_state state) 2321 { 2322 int i, j, r; 2323 2324 if (amdgpu_emu_mode == 1) 2325 return 0; 2326 2327 for (j = 0; j < adev->num_ip_blocks; j++) { 2328 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2329 if (!adev->ip_blocks[i].status.late_initialized) 2330 continue; 2331 /* skip CG for VCE/UVD, it's handled specially */ 2332 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2333 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2334 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2335 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2336 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2337 /* enable clockgating to save power */ 2338 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2339 state); 2340 if (r) { 2341 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2342 adev->ip_blocks[i].version->funcs->name, r); 2343 return r; 2344 } 2345 } 2346 } 2347 2348 return 0; 2349 } 2350 2351 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state) 2352 { 2353 int i, j, r; 2354 2355 if (amdgpu_emu_mode == 1) 2356 return 0; 2357 2358 for (j = 0; j < adev->num_ip_blocks; j++) { 2359 i = state == AMD_PG_STATE_GATE ?
j : adev->num_ip_blocks - j - 1; 2360 if (!adev->ip_blocks[i].status.late_initialized) 2361 continue; 2362 /* skip CG for VCE/UVD, it's handled specially */ 2363 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2364 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2365 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2366 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2367 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2368 /* enable powergating to save power */ 2369 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2370 state); 2371 if (r) { 2372 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2373 adev->ip_blocks[i].version->funcs->name, r); 2374 return r; 2375 } 2376 } 2377 } 2378 return 0; 2379 } 2380 2381 static int amdgpu_device_enable_mgpu_fan_boost(void) 2382 { 2383 struct amdgpu_gpu_instance *gpu_ins; 2384 struct amdgpu_device *adev; 2385 int i, ret = 0; 2386 2387 mutex_lock(&mgpu_info.mutex); 2388 2389 /* 2390 * MGPU fan boost feature should be enabled 2391 * only when there are two or more dGPUs in 2392 * the system 2393 */ 2394 if (mgpu_info.num_dgpu < 2) 2395 goto out; 2396 2397 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2398 gpu_ins = &(mgpu_info.gpu_ins[i]); 2399 adev = gpu_ins->adev; 2400 if (!(adev->flags & AMD_IS_APU) && 2401 !gpu_ins->mgpu_fan_enabled) { 2402 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2403 if (ret) 2404 break; 2405 2406 gpu_ins->mgpu_fan_enabled = 1; 2407 } 2408 } 2409 2410 out: 2411 mutex_unlock(&mgpu_info.mutex); 2412 2413 return ret; 2414 } 2415 2416 /** 2417 * amdgpu_device_ip_late_init - run late init for hardware IPs 2418 * 2419 * @adev: amdgpu_device pointer 2420 * 2421 * Late initialization pass for hardware IPs. The list of all the hardware 2422 * IPs that make up the asic is walked and the late_init callbacks are run. 2423 * late_init covers any special initialization that an IP requires 2424 * after all of the have been initialized or something that needs to happen 2425 * late in the init process. 2426 * Returns 0 on success, negative error code on failure. 2427 */ 2428 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2429 { 2430 struct amdgpu_gpu_instance *gpu_instance; 2431 int i = 0, r; 2432 2433 for (i = 0; i < adev->num_ip_blocks; i++) { 2434 if (!adev->ip_blocks[i].status.hw) 2435 continue; 2436 if (adev->ip_blocks[i].version->funcs->late_init) { 2437 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2438 if (r) { 2439 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2440 adev->ip_blocks[i].version->funcs->name, r); 2441 return r; 2442 } 2443 } 2444 adev->ip_blocks[i].status.late_initialized = true; 2445 } 2446 2447 amdgpu_ras_set_error_query_ready(adev, true); 2448 2449 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2450 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2451 2452 amdgpu_device_fill_reset_magic(adev); 2453 2454 r = amdgpu_device_enable_mgpu_fan_boost(); 2455 if (r) 2456 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2457 2458 2459 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2460 mutex_lock(&mgpu_info.mutex); 2461 2462 /* 2463 * Reset device p-state to low as this was booted with high. 2464 * 2465 * This should be performed only after all devices from the same 2466 * hive get initialized. 2467 * 2468 * However, it's unknown how many device in the hive in advance. 2469 * As this is counted one by one during devices initializations. 
2470 * 2471 * So, we wait for all XGMI interlinked devices initialized. 2472 * This may bring some delays as those devices may come from 2473 * different hives. But that should be OK. 2474 */ 2475 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2476 for (i = 0; i < mgpu_info.num_gpu; i++) { 2477 gpu_instance = &(mgpu_info.gpu_ins[i]); 2478 if (gpu_instance->adev->flags & AMD_IS_APU) 2479 continue; 2480 2481 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2482 AMDGPU_XGMI_PSTATE_MIN); 2483 if (r) { 2484 DRM_ERROR("pstate setting failed (%d).\n", r); 2485 break; 2486 } 2487 } 2488 } 2489 2490 mutex_unlock(&mgpu_info.mutex); 2491 } 2492 2493 return 0; 2494 } 2495 2496 /** 2497 * amdgpu_device_ip_fini - run fini for hardware IPs 2498 * 2499 * @adev: amdgpu_device pointer 2500 * 2501 * Main teardown pass for hardware IPs. The list of all the hardware 2502 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2503 * are run. hw_fini tears down the hardware associated with each IP 2504 * and sw_fini tears down any software state associated with each IP. 2505 * Returns 0 on success, negative error code on failure. 2506 */ 2507 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2508 { 2509 int i, r; 2510 2511 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2512 amdgpu_virt_release_ras_err_handler_data(adev); 2513 2514 amdgpu_ras_pre_fini(adev); 2515 2516 if (adev->gmc.xgmi.num_physical_nodes > 1) 2517 amdgpu_xgmi_remove_device(adev); 2518 2519 amdgpu_amdkfd_device_fini(adev); 2520 2521 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2522 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2523 2524 /* need to disable SMC first */ 2525 for (i = 0; i < adev->num_ip_blocks; i++) { 2526 if (!adev->ip_blocks[i].status.hw) 2527 continue; 2528 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2529 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2530 /* XXX handle errors */ 2531 if (r) { 2532 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2533 adev->ip_blocks[i].version->funcs->name, r); 2534 } 2535 adev->ip_blocks[i].status.hw = false; 2536 break; 2537 } 2538 } 2539 2540 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2541 if (!adev->ip_blocks[i].status.hw) 2542 continue; 2543 2544 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2545 /* XXX handle errors */ 2546 if (r) { 2547 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2548 adev->ip_blocks[i].version->funcs->name, r); 2549 } 2550 2551 adev->ip_blocks[i].status.hw = false; 2552 } 2553 2554 2555 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2556 if (!adev->ip_blocks[i].status.sw) 2557 continue; 2558 2559 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2560 amdgpu_ucode_free_bo(adev); 2561 amdgpu_free_static_csa(&adev->virt.csa_obj); 2562 amdgpu_device_wb_fini(adev); 2563 amdgpu_device_vram_scratch_fini(adev); 2564 amdgpu_ib_pool_fini(adev); 2565 } 2566 2567 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2568 /* XXX handle errors */ 2569 if (r) { 2570 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2571 adev->ip_blocks[i].version->funcs->name, r); 2572 } 2573 adev->ip_blocks[i].status.sw = false; 2574 adev->ip_blocks[i].status.valid = false; 2575 } 2576 2577 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2578 if (!adev->ip_blocks[i].status.late_initialized) 2579 continue; 2580 if (adev->ip_blocks[i].version->funcs->late_fini) 2581 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2582 
adev->ip_blocks[i].status.late_initialized = false; 2583 } 2584 2585 amdgpu_ras_fini(adev); 2586 2587 if (amdgpu_sriov_vf(adev)) 2588 if (amdgpu_virt_release_full_gpu(adev, false)) 2589 DRM_ERROR("failed to release exclusive mode on fini\n"); 2590 2591 return 0; 2592 } 2593 2594 /** 2595 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2596 * 2597 * @work: work_struct. 2598 */ 2599 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2600 { 2601 struct amdgpu_device *adev = 2602 container_of(work, struct amdgpu_device, delayed_init_work.work); 2603 int r; 2604 2605 r = amdgpu_ib_ring_tests(adev); 2606 if (r) 2607 DRM_ERROR("ib ring test failed (%d).\n", r); 2608 } 2609 2610 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2611 { 2612 struct amdgpu_device *adev = 2613 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2614 2615 mutex_lock(&adev->gfx.gfx_off_mutex); 2616 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { 2617 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2618 adev->gfx.gfx_off_state = true; 2619 } 2620 mutex_unlock(&adev->gfx.gfx_off_mutex); 2621 } 2622 2623 /** 2624 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2625 * 2626 * @adev: amdgpu_device pointer 2627 * 2628 * Main suspend function for hardware IPs. The list of all the hardware 2629 * IPs that make up the asic is walked, clockgating is disabled and the 2630 * suspend callbacks are run. suspend puts the hardware and software state 2631 * in each IP into a state suitable for suspend. 2632 * Returns 0 on success, negative error code on failure. 2633 */ 2634 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2635 { 2636 int i, r; 2637 2638 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2639 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2640 2641 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2642 if (!adev->ip_blocks[i].status.valid) 2643 continue; 2644 2645 /* displays are handled separately */ 2646 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2647 continue; 2648 2649 /* XXX handle errors */ 2650 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2651 /* XXX handle errors */ 2652 if (r) { 2653 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2654 adev->ip_blocks[i].version->funcs->name, r); 2655 return r; 2656 } 2657 2658 adev->ip_blocks[i].status.hw = false; 2659 } 2660 2661 return 0; 2662 } 2663 2664 /** 2665 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2666 * 2667 * @adev: amdgpu_device pointer 2668 * 2669 * Main suspend function for hardware IPs. The list of all the hardware 2670 * IPs that make up the asic is walked, clockgating is disabled and the 2671 * suspend callbacks are run. suspend puts the hardware and software state 2672 * in each IP into a state suitable for suspend. 2673 * Returns 0 on success, negative error code on failure. 
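 * Phase 2 covers every block except the display blocks handled in phase 1.
 * PSP is skipped when a RAS err_event_athub interrupt has been triggered,
 * and on bare metal the SMC is moved to the requested mp1 state on the way
 * down.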
2674 */ 2675 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2676 { 2677 int i, r; 2678 2679 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2680 if (!adev->ip_blocks[i].status.valid) 2681 continue; 2682 /* displays are handled in phase1 */ 2683 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2684 continue; 2685 /* PSP lost connection when err_event_athub occurs */ 2686 if (amdgpu_ras_intr_triggered() && 2687 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2688 adev->ip_blocks[i].status.hw = false; 2689 continue; 2690 } 2691 /* XXX handle errors */ 2692 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2693 /* XXX handle errors */ 2694 if (r) { 2695 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2696 adev->ip_blocks[i].version->funcs->name, r); 2697 } 2698 adev->ip_blocks[i].status.hw = false; 2699 /* handle putting the SMC in the appropriate state */ 2700 if(!amdgpu_sriov_vf(adev)){ 2701 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2702 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 2703 if (r) { 2704 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 2705 adev->mp1_state, r); 2706 return r; 2707 } 2708 } 2709 } 2710 adev->ip_blocks[i].status.hw = false; 2711 } 2712 2713 return 0; 2714 } 2715 2716 /** 2717 * amdgpu_device_ip_suspend - run suspend for hardware IPs 2718 * 2719 * @adev: amdgpu_device pointer 2720 * 2721 * Main suspend function for hardware IPs. The list of all the hardware 2722 * IPs that make up the asic is walked, clockgating is disabled and the 2723 * suspend callbacks are run. suspend puts the hardware and software state 2724 * in each IP into a state suitable for suspend. 2725 * Returns 0 on success, negative error code on failure. 2726 */ 2727 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 2728 { 2729 int r; 2730 2731 if (amdgpu_sriov_vf(adev)) 2732 amdgpu_virt_request_full_gpu(adev, false); 2733 2734 r = amdgpu_device_ip_suspend_phase1(adev); 2735 if (r) 2736 return r; 2737 r = amdgpu_device_ip_suspend_phase2(adev); 2738 2739 if (amdgpu_sriov_vf(adev)) 2740 amdgpu_virt_release_full_gpu(adev, false); 2741 2742 return r; 2743 } 2744 2745 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 2746 { 2747 int i, r; 2748 2749 static enum amd_ip_block_type ip_order[] = { 2750 AMD_IP_BLOCK_TYPE_GMC, 2751 AMD_IP_BLOCK_TYPE_COMMON, 2752 AMD_IP_BLOCK_TYPE_PSP, 2753 AMD_IP_BLOCK_TYPE_IH, 2754 }; 2755 2756 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2757 int j; 2758 struct amdgpu_ip_block *block; 2759 2760 block = &adev->ip_blocks[i]; 2761 block->status.hw = false; 2762 2763 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 2764 2765 if (block->version->type != ip_order[j] || 2766 !block->status.valid) 2767 continue; 2768 2769 r = block->version->funcs->hw_init(adev); 2770 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2771 if (r) 2772 return r; 2773 block->status.hw = true; 2774 } 2775 } 2776 2777 return 0; 2778 } 2779 2780 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 2781 { 2782 int i, r; 2783 2784 static enum amd_ip_block_type ip_order[] = { 2785 AMD_IP_BLOCK_TYPE_SMC, 2786 AMD_IP_BLOCK_TYPE_DCE, 2787 AMD_IP_BLOCK_TYPE_GFX, 2788 AMD_IP_BLOCK_TYPE_SDMA, 2789 AMD_IP_BLOCK_TYPE_UVD, 2790 AMD_IP_BLOCK_TYPE_VCE, 2791 AMD_IP_BLOCK_TYPE_VCN 2792 }; 2793 2794 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2795 int j; 2796 struct amdgpu_ip_block *block; 2797 2798 for (j = 0; j < adev->num_ip_blocks; j++) { 
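/* find the block implementing this stage of the fixed re-init order;
 * SMC is resumed rather than re-initialized, everything else gets hw_init */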
2799 block = &adev->ip_blocks[j]; 2800 2801 if (block->version->type != ip_order[i] || 2802 !block->status.valid || 2803 block->status.hw) 2804 continue; 2805 2806 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 2807 r = block->version->funcs->resume(adev); 2808 else 2809 r = block->version->funcs->hw_init(adev); 2810 2811 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2812 if (r) 2813 return r; 2814 block->status.hw = true; 2815 } 2816 } 2817 2818 return 0; 2819 } 2820 2821 /** 2822 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 2823 * 2824 * @adev: amdgpu_device pointer 2825 * 2826 * First resume function for hardware IPs. The list of all the hardware 2827 * IPs that make up the asic is walked and the resume callbacks are run for 2828 * COMMON, GMC, and IH. resume puts the hardware into a functional state 2829 * after a suspend and updates the software state as necessary. This 2830 * function is also used for restoring the GPU after a GPU reset. 2831 * Returns 0 on success, negative error code on failure. 2832 */ 2833 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 2834 { 2835 int i, r; 2836 2837 for (i = 0; i < adev->num_ip_blocks; i++) { 2838 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2839 continue; 2840 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2841 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2842 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2843 2844 r = adev->ip_blocks[i].version->funcs->resume(adev); 2845 if (r) { 2846 DRM_ERROR("resume of IP block <%s> failed %d\n", 2847 adev->ip_blocks[i].version->funcs->name, r); 2848 return r; 2849 } 2850 adev->ip_blocks[i].status.hw = true; 2851 } 2852 } 2853 2854 return 0; 2855 } 2856 2857 /** 2858 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 2859 * 2860 * @adev: amdgpu_device pointer 2861 * 2862 * First resume function for hardware IPs. The list of all the hardware 2863 * IPs that make up the asic is walked and the resume callbacks are run for 2864 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 2865 * functional state after a suspend and updates the software state as 2866 * necessary. This function is also used for restoring the GPU after a GPU 2867 * reset. 2868 * Returns 0 on success, negative error code on failure. 2869 */ 2870 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 2871 { 2872 int i, r; 2873 2874 for (i = 0; i < adev->num_ip_blocks; i++) { 2875 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2876 continue; 2877 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2878 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2879 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 2880 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 2881 continue; 2882 r = adev->ip_blocks[i].version->funcs->resume(adev); 2883 if (r) { 2884 DRM_ERROR("resume of IP block <%s> failed %d\n", 2885 adev->ip_blocks[i].version->funcs->name, r); 2886 return r; 2887 } 2888 adev->ip_blocks[i].status.hw = true; 2889 } 2890 2891 return 0; 2892 } 2893 2894 /** 2895 * amdgpu_device_ip_resume - run resume for hardware IPs 2896 * 2897 * @adev: amdgpu_device pointer 2898 * 2899 * Main resume function for hardware IPs. 
The hardware IPs 2900 * are split into two resume functions because they are 2901 * also used in recovering from a GPU reset, and some additional 2902 * steps need to be taken between them. In this case (S3/S4) they are 2903 * run sequentially. 2904 * Returns 0 on success, negative error code on failure. 2905 */ 2906 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 2907 { 2908 int r; 2909 2910 r = amdgpu_device_ip_resume_phase1(adev); 2911 if (r) 2912 return r; 2913 2914 r = amdgpu_device_fw_loading(adev); 2915 if (r) 2916 return r; 2917 2918 r = amdgpu_device_ip_resume_phase2(adev); 2919 2920 return r; 2921 } 2922 2923 /** 2924 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 2925 * 2926 * @adev: amdgpu_device pointer 2927 * 2928 * Query the VBIOS data tables to determine if the board supports SR-IOV. 2929 */ 2930 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 2931 { 2932 if (amdgpu_sriov_vf(adev)) { 2933 if (adev->is_atom_fw) { 2934 if (amdgpu_atomfirmware_gpu_supports_virtualization(adev)) 2935 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 2936 } else { 2937 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 2938 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 2939 } 2940 2941 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 2942 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 2943 } 2944 } 2945 2946 /** 2947 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 2948 * 2949 * @asic_type: AMD asic type 2950 * 2951 * Check if there is DC (new modesetting infrastructure) support for an asic. 2952 * returns true if DC has support, false if not. 2953 */ 2954 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 2955 { 2956 switch (asic_type) { 2957 #if defined(CONFIG_DRM_AMD_DC) 2958 #if defined(CONFIG_DRM_AMD_DC_SI) 2959 case CHIP_TAHITI: 2960 case CHIP_PITCAIRN: 2961 case CHIP_VERDE: 2962 case CHIP_OLAND: 2963 #endif 2964 case CHIP_BONAIRE: 2965 case CHIP_KAVERI: 2966 case CHIP_KABINI: 2967 case CHIP_MULLINS: 2968 /* 2969 * We have systems in the wild with these ASICs that require 2970 * LVDS and VGA support which is not supported with DC. 2971 * 2972 * Fallback to the non-DC driver here by default so as not to 2973 * cause regressions.
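 *
 * DC is only used on these ASICs when it is explicitly requested via the
 * amdgpu.dc module parameter (a value greater than 0).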
2974 */ 2975 return amdgpu_dc > 0; 2976 case CHIP_HAWAII: 2977 case CHIP_CARRIZO: 2978 case CHIP_STONEY: 2979 case CHIP_POLARIS10: 2980 case CHIP_POLARIS11: 2981 case CHIP_POLARIS12: 2982 case CHIP_VEGAM: 2983 case CHIP_TONGA: 2984 case CHIP_FIJI: 2985 case CHIP_VEGA10: 2986 case CHIP_VEGA12: 2987 case CHIP_VEGA20: 2988 #if defined(CONFIG_DRM_AMD_DC_DCN) 2989 case CHIP_RAVEN: 2990 case CHIP_NAVI10: 2991 case CHIP_NAVI14: 2992 case CHIP_NAVI12: 2993 case CHIP_RENOIR: 2994 #endif 2995 #if defined(CONFIG_DRM_AMD_DC_DCN3_0) 2996 case CHIP_SIENNA_CICHLID: 2997 case CHIP_NAVY_FLOUNDER: 2998 #endif 2999 return amdgpu_dc != 0; 3000 #endif 3001 default: 3002 if (amdgpu_dc > 0) 3003 DRM_INFO("Display Core has been requested via kernel parameter " 3004 "but isn't supported by ASIC, ignoring\n"); 3005 return false; 3006 } 3007 } 3008 3009 /** 3010 * amdgpu_device_has_dc_support - check if dc is supported 3011 * 3012 * @adev: amdgpu_device_pointer 3013 * 3014 * Returns true for supported, false for not supported 3015 */ 3016 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3017 { 3018 if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display) 3019 return false; 3020 3021 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3022 } 3023 3024 3025 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3026 { 3027 struct amdgpu_device *adev = 3028 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3029 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3030 3031 /* It's a bug to not have a hive within this function */ 3032 if (WARN_ON(!hive)) 3033 return; 3034 3035 /* 3036 * Use task barrier to synchronize all xgmi reset works across the 3037 * hive. task_barrier_enter and task_barrier_exit will block 3038 * until all the threads running the xgmi reset works reach 3039 * those points. task_barrier_full will do both blocks. 3040 */ 3041 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3042 3043 task_barrier_enter(&hive->tb); 3044 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3045 3046 if (adev->asic_reset_res) 3047 goto fail; 3048 3049 task_barrier_exit(&hive->tb); 3050 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3051 3052 if (adev->asic_reset_res) 3053 goto fail; 3054 3055 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count) 3056 adev->mmhub.funcs->reset_ras_error_count(adev); 3057 } else { 3058 3059 task_barrier_full(&hive->tb); 3060 adev->asic_reset_res = amdgpu_asic_reset(adev); 3061 } 3062 3063 fail: 3064 if (adev->asic_reset_res) 3065 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3066 adev->asic_reset_res, adev_to_drm(adev)->unique); 3067 amdgpu_put_xgmi_hive(hive); 3068 } 3069 3070 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3071 { 3072 char *input = amdgpu_lockup_timeout; 3073 char *timeout_setting = NULL; 3074 int index = 0; 3075 long timeout; 3076 int ret = 0; 3077 3078 /* 3079 * By default timeout for non compute jobs is 10000. 3080 * And there is no timeout enforced on compute jobs. 3081 * In SR-IOV or passthrough mode, timeout for compute 3082 * jobs are 60000 by default. 
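 *
 * The defaults can be overridden with the lockup_timeout module parameter,
 * a comma separated list in milliseconds, e.g.
 * amdgpu.lockup_timeout=10000,20000,15000,10000 sets the GFX, compute, SDMA
 * and video timeouts in that order. 0 keeps the default, a negative value
 * means no timeout, and a single value applies to all non-compute jobs (and
 * to compute as well under SR-IOV/passthrough).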
3083 */ 3084 adev->gfx_timeout = msecs_to_jiffies(10000); 3085 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3086 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3087 adev->compute_timeout = msecs_to_jiffies(60000); 3088 else 3089 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; 3090 3091 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3092 while ((timeout_setting = strsep(&input, ",")) && 3093 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3094 ret = kstrtol(timeout_setting, 0, &timeout); 3095 if (ret) 3096 return ret; 3097 3098 if (timeout == 0) { 3099 index++; 3100 continue; 3101 } else if (timeout < 0) { 3102 timeout = MAX_SCHEDULE_TIMEOUT; 3103 } else { 3104 timeout = msecs_to_jiffies(timeout); 3105 } 3106 3107 switch (index++) { 3108 case 0: 3109 adev->gfx_timeout = timeout; 3110 break; 3111 case 1: 3112 adev->compute_timeout = timeout; 3113 break; 3114 case 2: 3115 adev->sdma_timeout = timeout; 3116 break; 3117 case 3: 3118 adev->video_timeout = timeout; 3119 break; 3120 default: 3121 break; 3122 } 3123 } 3124 /* 3125 * There is only one value specified and 3126 * it should apply to all non-compute jobs. 3127 */ 3128 if (index == 1) { 3129 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3130 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3131 adev->compute_timeout = adev->gfx_timeout; 3132 } 3133 } 3134 3135 return ret; 3136 } 3137 3138 static const struct attribute *amdgpu_dev_attributes[] = { 3139 &dev_attr_product_name.attr, 3140 &dev_attr_product_number.attr, 3141 &dev_attr_serial_number.attr, 3142 &dev_attr_pcie_replay_count.attr, 3143 NULL 3144 }; 3145 3146 3147 /** 3148 * amdgpu_device_init - initialize the driver 3149 * 3150 * @adev: amdgpu_device pointer 3151 * @flags: driver flags 3152 * 3153 * Initializes the driver info and hw (all asics). 3154 * Returns 0 for success or an error on failure. 3155 * Called at driver startup. 
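 * Along with mapping the MMIO registers and doorbells, this runs
 * early/sw/hw init for all IP blocks, posts the vBIOS when required, brings
 * up the fence driver and mode config, runs late init, schedules the
 * delayed work that performs the IB ring tests, and exposes the
 * product/serial sysfs attributes.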
3156 */ 3157 int amdgpu_device_init(struct amdgpu_device *adev, 3158 uint32_t flags) 3159 { 3160 struct drm_device *ddev = adev_to_drm(adev); 3161 struct pci_dev *pdev = adev->pdev; 3162 int r, i; 3163 bool boco = false; 3164 u32 max_MBps; 3165 3166 adev->shutdown = false; 3167 adev->flags = flags; 3168 3169 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3170 adev->asic_type = amdgpu_force_asic_type; 3171 else 3172 adev->asic_type = flags & AMD_ASIC_MASK; 3173 3174 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3175 if (amdgpu_emu_mode == 1) 3176 adev->usec_timeout *= 10; 3177 adev->gmc.gart_size = 512 * 1024 * 1024; 3178 adev->accel_working = false; 3179 adev->num_rings = 0; 3180 adev->mman.buffer_funcs = NULL; 3181 adev->mman.buffer_funcs_ring = NULL; 3182 adev->vm_manager.vm_pte_funcs = NULL; 3183 adev->vm_manager.vm_pte_num_scheds = 0; 3184 adev->gmc.gmc_funcs = NULL; 3185 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3186 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3187 3188 adev->smc_rreg = &amdgpu_invalid_rreg; 3189 adev->smc_wreg = &amdgpu_invalid_wreg; 3190 adev->pcie_rreg = &amdgpu_invalid_rreg; 3191 adev->pcie_wreg = &amdgpu_invalid_wreg; 3192 adev->pciep_rreg = &amdgpu_invalid_rreg; 3193 adev->pciep_wreg = &amdgpu_invalid_wreg; 3194 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3195 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3196 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3197 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3198 adev->didt_rreg = &amdgpu_invalid_rreg; 3199 adev->didt_wreg = &amdgpu_invalid_wreg; 3200 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3201 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3202 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3203 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3204 3205 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3206 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3207 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3208 3209 /* mutex initialization are all done here so we 3210 * can recall function without having locking issues */ 3211 atomic_set(&adev->irq.ih.lock, 0); 3212 mutex_init(&adev->firmware.mutex); 3213 mutex_init(&adev->pm.mutex); 3214 mutex_init(&adev->gfx.gpu_clock_mutex); 3215 mutex_init(&adev->srbm_mutex); 3216 mutex_init(&adev->gfx.pipe_reserve_mutex); 3217 mutex_init(&adev->gfx.gfx_off_mutex); 3218 mutex_init(&adev->grbm_idx_mutex); 3219 mutex_init(&adev->mn_lock); 3220 mutex_init(&adev->virt.vf_errors.lock); 3221 hash_init(adev->mn_hash); 3222 atomic_set(&adev->in_gpu_reset, 0); 3223 init_rwsem(&adev->reset_sem); 3224 mutex_init(&adev->psp.mutex); 3225 mutex_init(&adev->notifier_lock); 3226 3227 r = amdgpu_device_check_arguments(adev); 3228 if (r) 3229 return r; 3230 3231 spin_lock_init(&adev->mmio_idx_lock); 3232 spin_lock_init(&adev->smc_idx_lock); 3233 spin_lock_init(&adev->pcie_idx_lock); 3234 spin_lock_init(&adev->uvd_ctx_idx_lock); 3235 spin_lock_init(&adev->didt_idx_lock); 3236 spin_lock_init(&adev->gc_cac_idx_lock); 3237 spin_lock_init(&adev->se_cac_idx_lock); 3238 spin_lock_init(&adev->audio_endpt_idx_lock); 3239 spin_lock_init(&adev->mm_stats.lock); 3240 3241 INIT_LIST_HEAD(&adev->shadow_list); 3242 mutex_init(&adev->shadow_list_lock); 3243 3244 INIT_DELAYED_WORK(&adev->delayed_init_work, 3245 amdgpu_device_delayed_init_work_handler); 3246 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3247 amdgpu_device_delay_enable_gfx_off); 3248 3249 
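/* xgmi_reset_work lets each device in an XGMI hive run its part of a
 * hive-wide reset; the handler uses a task barrier to keep the reset,
 * including BACO enter/exit, synchronized across all nodes in the hive. */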
INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3250 3251 adev->gfx.gfx_off_req_count = 1; 3252 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3253 3254 atomic_set(&adev->throttling_logging_enabled, 1); 3255 /* 3256 * If throttling continues, logging will be performed every minute 3257 * to avoid log flooding. "-1" is subtracted since the thermal 3258 * throttling interrupt comes every second. Thus, the total logging 3259 * interval is 59 seconds(retelimited printk interval) + 1(waiting 3260 * for throttling interrupt) = 60 seconds. 3261 */ 3262 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3263 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3264 3265 /* Registers mapping */ 3266 /* TODO: block userspace mapping of io register */ 3267 if (adev->asic_type >= CHIP_BONAIRE) { 3268 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3269 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3270 } else { 3271 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3272 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3273 } 3274 3275 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3276 if (adev->rmmio == NULL) { 3277 return -ENOMEM; 3278 } 3279 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3280 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3281 3282 /* io port mapping */ 3283 for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { 3284 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) { 3285 adev->rio_mem_size = pci_resource_len(adev->pdev, i); 3286 adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size); 3287 break; 3288 } 3289 } 3290 if (adev->rio_mem == NULL) 3291 DRM_INFO("PCI I/O BAR is not found.\n"); 3292 3293 /* enable PCIE atomic ops */ 3294 r = pci_enable_atomic_ops_to_root(adev->pdev, 3295 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3296 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3297 if (r) { 3298 adev->have_atomics_support = false; 3299 DRM_INFO("PCIE atomic ops is not supported\n"); 3300 } else { 3301 adev->have_atomics_support = true; 3302 } 3303 3304 amdgpu_device_get_pcie_info(adev); 3305 3306 if (amdgpu_mcbp) 3307 DRM_INFO("MCBP is enabled\n"); 3308 3309 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) 3310 adev->enable_mes = true; 3311 3312 /* detect hw virtualization here */ 3313 amdgpu_detect_virtualization(adev); 3314 3315 r = amdgpu_device_get_job_timeout_settings(adev); 3316 if (r) { 3317 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3318 goto failed_unmap; 3319 } 3320 3321 /* early init functions */ 3322 r = amdgpu_device_ip_early_init(adev); 3323 if (r) 3324 goto failed_unmap; 3325 3326 /* doorbell bar mapping and doorbell index init*/ 3327 amdgpu_device_doorbell_init(adev); 3328 3329 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3330 /* this will fail for cards that aren't VGA class devices, just 3331 * ignore it */ 3332 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode); 3333 3334 if (amdgpu_device_supports_boco(ddev)) 3335 boco = true; 3336 if (amdgpu_has_atpx() && 3337 (amdgpu_is_atpx_hybrid() || 3338 amdgpu_has_atpx_dgpu_power_cntl()) && 3339 !pci_is_thunderbolt_attached(adev->pdev)) 3340 vga_switcheroo_register_client(adev->pdev, 3341 &amdgpu_switcheroo_ops, boco); 3342 if (boco) 3343 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3344 3345 if (amdgpu_emu_mode == 1) { 3346 /* post the asic on emulation mode */ 3347 emu_soc_asic_init(adev); 3348 goto 
fence_driver_init; 3349 } 3350 3351 /* detect if we are with an SRIOV vbios */ 3352 amdgpu_device_detect_sriov_bios(adev); 3353 3354 /* check if we need to reset the asic 3355 * E.g., driver was not cleanly unloaded previously, etc. 3356 */ 3357 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3358 r = amdgpu_asic_reset(adev); 3359 if (r) { 3360 dev_err(adev->dev, "asic reset on init failed\n"); 3361 goto failed; 3362 } 3363 } 3364 3365 pci_enable_pcie_error_reporting(adev->ddev.pdev); 3366 3367 /* Post card if necessary */ 3368 if (amdgpu_device_need_post(adev)) { 3369 if (!adev->bios) { 3370 dev_err(adev->dev, "no vBIOS found\n"); 3371 r = -EINVAL; 3372 goto failed; 3373 } 3374 DRM_INFO("GPU posting now...\n"); 3375 r = amdgpu_device_asic_init(adev); 3376 if (r) { 3377 dev_err(adev->dev, "gpu post error!\n"); 3378 goto failed; 3379 } 3380 } 3381 3382 if (adev->is_atom_fw) { 3383 /* Initialize clocks */ 3384 r = amdgpu_atomfirmware_get_clock_info(adev); 3385 if (r) { 3386 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3387 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3388 goto failed; 3389 } 3390 } else { 3391 /* Initialize clocks */ 3392 r = amdgpu_atombios_get_clock_info(adev); 3393 if (r) { 3394 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3395 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3396 goto failed; 3397 } 3398 /* init i2c buses */ 3399 if (!amdgpu_device_has_dc_support(adev)) 3400 amdgpu_atombios_i2c_init(adev); 3401 } 3402 3403 fence_driver_init: 3404 /* Fence driver */ 3405 r = amdgpu_fence_driver_init(adev); 3406 if (r) { 3407 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n"); 3408 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3409 goto failed; 3410 } 3411 3412 /* init the mode config */ 3413 drm_mode_config_init(adev_to_drm(adev)); 3414 3415 r = amdgpu_device_ip_init(adev); 3416 if (r) { 3417 /* failed in exclusive mode due to timeout */ 3418 if (amdgpu_sriov_vf(adev) && 3419 !amdgpu_sriov_runtime(adev) && 3420 amdgpu_virt_mmio_blocked(adev) && 3421 !amdgpu_virt_wait_reset(adev)) { 3422 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3423 /* Don't send request since VF is inactive. */ 3424 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3425 adev->virt.ops = NULL; 3426 r = -EAGAIN; 3427 goto failed; 3428 } 3429 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3430 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3431 goto failed; 3432 } 3433 3434 dev_info(adev->dev, 3435 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3436 adev->gfx.config.max_shader_engines, 3437 adev->gfx.config.max_sh_per_se, 3438 adev->gfx.config.max_cu_per_sh, 3439 adev->gfx.cu_info.number); 3440 3441 adev->accel_working = true; 3442 3443 amdgpu_vm_check_compute_bug(adev); 3444 3445 /* Initialize the buffer migration limit. */ 3446 if (amdgpu_moverate >= 0) 3447 max_MBps = amdgpu_moverate; 3448 else 3449 max_MBps = 8; /* Allow 8 MB/s. */ 3450 /* Get a log2 for easy divisions. 
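 * max_MBps is clamped to at least 1 so that ilog2() stays well-defined even
 * when the moverate parameter requests 0 MB/s.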
*/ 3451 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3452 3453 amdgpu_fbdev_init(adev); 3454 3455 r = amdgpu_pm_sysfs_init(adev); 3456 if (r) { 3457 adev->pm_sysfs_en = false; 3458 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3459 } else 3460 adev->pm_sysfs_en = true; 3461 3462 r = amdgpu_ucode_sysfs_init(adev); 3463 if (r) { 3464 adev->ucode_sysfs_en = false; 3465 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3466 } else 3467 adev->ucode_sysfs_en = true; 3468 3469 if ((amdgpu_testing & 1)) { 3470 if (adev->accel_working) 3471 amdgpu_test_moves(adev); 3472 else 3473 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n"); 3474 } 3475 if (amdgpu_benchmarking) { 3476 if (adev->accel_working) 3477 amdgpu_benchmark(adev, amdgpu_benchmarking); 3478 else 3479 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); 3480 } 3481 3482 /* 3483 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3484 * Otherwise the mgpu fan boost feature will be skipped due to the 3485 * gpu instance is counted less. 3486 */ 3487 amdgpu_register_gpu_instance(adev); 3488 3489 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3490 * explicit gating rather than handling it automatically. 3491 */ 3492 r = amdgpu_device_ip_late_init(adev); 3493 if (r) { 3494 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3495 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3496 goto failed; 3497 } 3498 3499 /* must succeed. */ 3500 amdgpu_ras_resume(adev); 3501 3502 queue_delayed_work(system_wq, &adev->delayed_init_work, 3503 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3504 3505 if (amdgpu_sriov_vf(adev)) 3506 flush_delayed_work(&adev->delayed_init_work); 3507 3508 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3509 if (r) 3510 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3511 3512 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3513 r = amdgpu_pmu_init(adev); 3514 if (r) 3515 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3516 3517 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3518 if (amdgpu_device_cache_pci_state(adev->pdev)) 3519 pci_restore_state(pdev); 3520 3521 return 0; 3522 3523 failed: 3524 amdgpu_vf_error_trans_all(adev); 3525 if (boco) 3526 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3527 3528 failed_unmap: 3529 iounmap(adev->rmmio); 3530 adev->rmmio = NULL; 3531 3532 return r; 3533 } 3534 3535 /** 3536 * amdgpu_device_fini - tear down the driver 3537 * 3538 * @adev: amdgpu_device pointer 3539 * 3540 * Tear down the driver info (all asics). 3541 * Called at driver shutdown. 
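 * Interrupts are disabled, the displays are shut down, and the IP blocks,
 * fence driver, sysfs entries and MMIO/doorbell mappings are torn down in
 * roughly the reverse order of amdgpu_device_init().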
3542 */ 3543 void amdgpu_device_fini(struct amdgpu_device *adev) 3544 { 3545 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3546 flush_delayed_work(&adev->delayed_init_work); 3547 adev->shutdown = true; 3548 3549 kfree(adev->pci_state); 3550 3551 /* make sure IB test finished before entering exclusive mode 3552 * to avoid preemption on IB test 3553 * */ 3554 if (amdgpu_sriov_vf(adev)) { 3555 amdgpu_virt_request_full_gpu(adev, false); 3556 amdgpu_virt_fini_data_exchange(adev); 3557 } 3558 3559 /* disable all interrupts */ 3560 amdgpu_irq_disable_all(adev); 3561 if (adev->mode_info.mode_config_initialized){ 3562 if (!amdgpu_device_has_dc_support(adev)) 3563 drm_helper_force_disable_all(adev_to_drm(adev)); 3564 else 3565 drm_atomic_helper_shutdown(adev_to_drm(adev)); 3566 } 3567 amdgpu_fence_driver_fini(adev); 3568 if (adev->pm_sysfs_en) 3569 amdgpu_pm_sysfs_fini(adev); 3570 amdgpu_fbdev_fini(adev); 3571 amdgpu_device_ip_fini(adev); 3572 release_firmware(adev->firmware.gpu_info_fw); 3573 adev->firmware.gpu_info_fw = NULL; 3574 adev->accel_working = false; 3575 /* free i2c buses */ 3576 if (!amdgpu_device_has_dc_support(adev)) 3577 amdgpu_i2c_fini(adev); 3578 3579 if (amdgpu_emu_mode != 1) 3580 amdgpu_atombios_fini(adev); 3581 3582 kfree(adev->bios); 3583 adev->bios = NULL; 3584 if (amdgpu_has_atpx() && 3585 (amdgpu_is_atpx_hybrid() || 3586 amdgpu_has_atpx_dgpu_power_cntl()) && 3587 !pci_is_thunderbolt_attached(adev->pdev)) 3588 vga_switcheroo_unregister_client(adev->pdev); 3589 if (amdgpu_device_supports_boco(adev_to_drm(adev))) 3590 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3591 vga_client_register(adev->pdev, NULL, NULL, NULL); 3592 if (adev->rio_mem) 3593 pci_iounmap(adev->pdev, adev->rio_mem); 3594 adev->rio_mem = NULL; 3595 iounmap(adev->rmmio); 3596 adev->rmmio = NULL; 3597 amdgpu_device_doorbell_fini(adev); 3598 3599 if (adev->ucode_sysfs_en) 3600 amdgpu_ucode_sysfs_fini(adev); 3601 3602 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 3603 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3604 amdgpu_pmu_fini(adev); 3605 if (adev->mman.discovery_bin) 3606 amdgpu_discovery_fini(adev); 3607 } 3608 3609 3610 /* 3611 * Suspend & resume. 3612 */ 3613 /** 3614 * amdgpu_device_suspend - initiate device suspend 3615 * 3616 * @dev: drm dev pointer 3617 * @fbcon : notify the fbdev of suspend 3618 * 3619 * Puts the hw in the suspend state (all asics). 3620 * Returns 0 for success or an error on failure. 3621 * Called at driver suspend. 
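 * Display hardware is turned off and framebuffers/cursors are unpinned
 * before the IP blocks are suspended in two phases, with KFD suspend, VRAM
 * eviction and fence driver suspend in between; a second eviction at the
 * end pushes the GART page table out of VRAM using the CPU.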
3622 */ 3623 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 3624 { 3625 struct amdgpu_device *adev; 3626 struct drm_crtc *crtc; 3627 struct drm_connector *connector; 3628 struct drm_connector_list_iter iter; 3629 int r; 3630 3631 adev = drm_to_adev(dev); 3632 3633 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3634 return 0; 3635 3636 adev->in_suspend = true; 3637 drm_kms_helper_poll_disable(dev); 3638 3639 if (fbcon) 3640 amdgpu_fbdev_set_suspend(adev, 1); 3641 3642 cancel_delayed_work_sync(&adev->delayed_init_work); 3643 3644 if (!amdgpu_device_has_dc_support(adev)) { 3645 /* turn off display hw */ 3646 drm_modeset_lock_all(dev); 3647 drm_connector_list_iter_begin(dev, &iter); 3648 drm_for_each_connector_iter(connector, &iter) 3649 drm_helper_connector_dpms(connector, 3650 DRM_MODE_DPMS_OFF); 3651 drm_connector_list_iter_end(&iter); 3652 drm_modeset_unlock_all(dev); 3653 /* unpin the front buffers and cursors */ 3654 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3655 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3656 struct drm_framebuffer *fb = crtc->primary->fb; 3657 struct amdgpu_bo *robj; 3658 3659 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3660 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3661 r = amdgpu_bo_reserve(aobj, true); 3662 if (r == 0) { 3663 amdgpu_bo_unpin(aobj); 3664 amdgpu_bo_unreserve(aobj); 3665 } 3666 } 3667 3668 if (fb == NULL || fb->obj[0] == NULL) { 3669 continue; 3670 } 3671 robj = gem_to_amdgpu_bo(fb->obj[0]); 3672 /* don't unpin kernel fb objects */ 3673 if (!amdgpu_fbdev_robj_is_fb(adev, robj)) { 3674 r = amdgpu_bo_reserve(robj, true); 3675 if (r == 0) { 3676 amdgpu_bo_unpin(robj); 3677 amdgpu_bo_unreserve(robj); 3678 } 3679 } 3680 } 3681 } 3682 3683 amdgpu_ras_suspend(adev); 3684 3685 r = amdgpu_device_ip_suspend_phase1(adev); 3686 3687 amdgpu_amdkfd_suspend(adev, !fbcon); 3688 3689 /* evict vram memory */ 3690 amdgpu_bo_evict_vram(adev); 3691 3692 amdgpu_fence_driver_suspend(adev); 3693 3694 r = amdgpu_device_ip_suspend_phase2(adev); 3695 3696 /* evict remaining vram memory 3697 * This second call to evict vram is to evict the gart page table 3698 * using the CPU. 3699 */ 3700 amdgpu_bo_evict_vram(adev); 3701 3702 return 0; 3703 } 3704 3705 /** 3706 * amdgpu_device_resume - initiate device resume 3707 * 3708 * @dev: drm dev pointer 3709 * @fbcon : notify the fbdev of resume 3710 * 3711 * Bring the hw back to operating state (all asics). 3712 * Returns 0 for success or an error on failure. 3713 * Called at driver resume. 
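 * The asic is re-posted if required, the IP blocks are resumed and late
 * init is rerun, cursors are pinned again, KFD is resumed, and the previous
 * mode configuration and hotplug state are restored once the IB tests have
 * been flushed.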
3714 */ 3715 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 3716 { 3717 struct drm_connector *connector; 3718 struct drm_connector_list_iter iter; 3719 struct amdgpu_device *adev = drm_to_adev(dev); 3720 struct drm_crtc *crtc; 3721 int r = 0; 3722 3723 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3724 return 0; 3725 3726 /* post card */ 3727 if (amdgpu_device_need_post(adev)) { 3728 r = amdgpu_device_asic_init(adev); 3729 if (r) 3730 dev_err(adev->dev, "amdgpu asic init failed\n"); 3731 } 3732 3733 r = amdgpu_device_ip_resume(adev); 3734 if (r) { 3735 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 3736 return r; 3737 } 3738 amdgpu_fence_driver_resume(adev); 3739 3740 3741 r = amdgpu_device_ip_late_init(adev); 3742 if (r) 3743 return r; 3744 3745 queue_delayed_work(system_wq, &adev->delayed_init_work, 3746 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3747 3748 if (!amdgpu_device_has_dc_support(adev)) { 3749 /* pin cursors */ 3750 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3751 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3752 3753 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3754 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3755 r = amdgpu_bo_reserve(aobj, true); 3756 if (r == 0) { 3757 r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM); 3758 if (r != 0) 3759 dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r); 3760 amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj); 3761 amdgpu_bo_unreserve(aobj); 3762 } 3763 } 3764 } 3765 } 3766 r = amdgpu_amdkfd_resume(adev, !fbcon); 3767 if (r) 3768 return r; 3769 3770 /* Make sure IB tests flushed */ 3771 flush_delayed_work(&adev->delayed_init_work); 3772 3773 /* blat the mode back in */ 3774 if (fbcon) { 3775 if (!amdgpu_device_has_dc_support(adev)) { 3776 /* pre DCE11 */ 3777 drm_helper_resume_force_mode(dev); 3778 3779 /* turn on display hw */ 3780 drm_modeset_lock_all(dev); 3781 3782 drm_connector_list_iter_begin(dev, &iter); 3783 drm_for_each_connector_iter(connector, &iter) 3784 drm_helper_connector_dpms(connector, 3785 DRM_MODE_DPMS_ON); 3786 drm_connector_list_iter_end(&iter); 3787 3788 drm_modeset_unlock_all(dev); 3789 } 3790 amdgpu_fbdev_set_suspend(adev, 0); 3791 } 3792 3793 drm_kms_helper_poll_enable(dev); 3794 3795 amdgpu_ras_resume(adev); 3796 3797 /* 3798 * Most of the connector probing functions try to acquire runtime pm 3799 * refs to ensure that the GPU is powered on when connector polling is 3800 * performed. Since we're calling this from a runtime PM callback, 3801 * trying to acquire rpm refs will cause us to deadlock. 3802 * 3803 * Since we're guaranteed to be holding the rpm lock, it's safe to 3804 * temporarily disable the rpm helpers so this doesn't deadlock us. 3805 */ 3806 #ifdef CONFIG_PM 3807 dev->dev->power.disable_depth++; 3808 #endif 3809 if (!amdgpu_device_has_dc_support(adev)) 3810 drm_helper_hpd_irq_event(dev); 3811 else 3812 drm_kms_helper_hotplug_event(dev); 3813 #ifdef CONFIG_PM 3814 dev->dev->power.disable_depth--; 3815 #endif 3816 adev->in_suspend = false; 3817 3818 return 0; 3819 } 3820 3821 /** 3822 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 3823 * 3824 * @adev: amdgpu_device pointer 3825 * 3826 * The list of all the hardware IPs that make up the asic is walked and 3827 * the check_soft_reset callbacks are run. check_soft_reset determines 3828 * if the asic is still hung or not. 3829 * Returns true if any of the IPs are still in a hung state, false if not. 
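 * SR-IOV VFs and asics that report needing a full reset are always treated
 * as hung, so recovery falls through to a full GPU reset.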
3830 */ 3831 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 3832 { 3833 int i; 3834 bool asic_hang = false; 3835 3836 if (amdgpu_sriov_vf(adev)) 3837 return true; 3838 3839 if (amdgpu_asic_need_full_reset(adev)) 3840 return true; 3841 3842 for (i = 0; i < adev->num_ip_blocks; i++) { 3843 if (!adev->ip_blocks[i].status.valid) 3844 continue; 3845 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 3846 adev->ip_blocks[i].status.hang = 3847 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 3848 if (adev->ip_blocks[i].status.hang) { 3849 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 3850 asic_hang = true; 3851 } 3852 } 3853 return asic_hang; 3854 } 3855 3856 /** 3857 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 3858 * 3859 * @adev: amdgpu_device pointer 3860 * 3861 * The list of all the hardware IPs that make up the asic is walked and the 3862 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 3863 * handles any IP specific hardware or software state changes that are 3864 * necessary for a soft reset to succeed. 3865 * Returns 0 on success, negative error code on failure. 3866 */ 3867 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 3868 { 3869 int i, r = 0; 3870 3871 for (i = 0; i < adev->num_ip_blocks; i++) { 3872 if (!adev->ip_blocks[i].status.valid) 3873 continue; 3874 if (adev->ip_blocks[i].status.hang && 3875 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 3876 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 3877 if (r) 3878 return r; 3879 } 3880 } 3881 3882 return 0; 3883 } 3884 3885 /** 3886 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 3887 * 3888 * @adev: amdgpu_device pointer 3889 * 3890 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 3891 * reset is necessary to recover. 3892 * Returns true if a full asic reset is required, false if not. 3893 */ 3894 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 3895 { 3896 int i; 3897 3898 if (amdgpu_asic_need_full_reset(adev)) 3899 return true; 3900 3901 for (i = 0; i < adev->num_ip_blocks; i++) { 3902 if (!adev->ip_blocks[i].status.valid) 3903 continue; 3904 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 3905 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 3906 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 3907 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 3908 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3909 if (adev->ip_blocks[i].status.hang) { 3910 dev_info(adev->dev, "Some block need full reset!\n"); 3911 return true; 3912 } 3913 } 3914 } 3915 return false; 3916 } 3917 3918 /** 3919 * amdgpu_device_ip_soft_reset - do a soft reset 3920 * 3921 * @adev: amdgpu_device pointer 3922 * 3923 * The list of all the hardware IPs that make up the asic is walked and the 3924 * soft_reset callbacks are run if the block is hung. soft_reset handles any 3925 * IP specific hardware or software state changes that are necessary to soft 3926 * reset the IP. 3927 * Returns 0 on success, negative error code on failure. 
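 *
 * This path is only taken when a full ASIC reset has not already been deemed
 * necessary (see amdgpu_device_pre_asic_reset()); if the soft reset fails or
 * the ASIC is still hung afterwards, the recovery code falls back to a full
 * reset.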
3928 */ 3929 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 3930 { 3931 int i, r = 0; 3932 3933 for (i = 0; i < adev->num_ip_blocks; i++) { 3934 if (!adev->ip_blocks[i].status.valid) 3935 continue; 3936 if (adev->ip_blocks[i].status.hang && 3937 adev->ip_blocks[i].version->funcs->soft_reset) { 3938 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 3939 if (r) 3940 return r; 3941 } 3942 } 3943 3944 return 0; 3945 } 3946 3947 /** 3948 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 3949 * 3950 * @adev: amdgpu_device pointer 3951 * 3952 * The list of all the hardware IPs that make up the asic is walked and the 3953 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 3954 * handles any IP specific hardware or software state changes that are 3955 * necessary after the IP has been soft reset. 3956 * Returns 0 on success, negative error code on failure. 3957 */ 3958 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 3959 { 3960 int i, r = 0; 3961 3962 for (i = 0; i < adev->num_ip_blocks; i++) { 3963 if (!adev->ip_blocks[i].status.valid) 3964 continue; 3965 if (adev->ip_blocks[i].status.hang && 3966 adev->ip_blocks[i].version->funcs->post_soft_reset) 3967 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 3968 if (r) 3969 return r; 3970 } 3971 3972 return 0; 3973 } 3974 3975 /** 3976 * amdgpu_device_recover_vram - Recover some VRAM contents 3977 * 3978 * @adev: amdgpu_device pointer 3979 * 3980 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 3981 * restore things like GPUVM page tables after a GPU reset where 3982 * the contents of VRAM might be lost. 3983 * 3984 * Returns: 3985 * 0 on success, negative error code on failure. 3986 */ 3987 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 3988 { 3989 struct dma_fence *fence = NULL, *next = NULL; 3990 struct amdgpu_bo *shadow; 3991 long r = 1, tmo; 3992 3993 if (amdgpu_sriov_runtime(adev)) 3994 tmo = msecs_to_jiffies(8000); 3995 else 3996 tmo = msecs_to_jiffies(100); 3997 3998 dev_info(adev->dev, "recover vram bo from shadow start\n"); 3999 mutex_lock(&adev->shadow_list_lock); 4000 list_for_each_entry(shadow, &adev->shadow_list, shadow_list) { 4001 4002 /* No need to recover an evicted BO */ 4003 if (shadow->tbo.mem.mem_type != TTM_PL_TT || 4004 shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET || 4005 shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM) 4006 continue; 4007 4008 r = amdgpu_bo_restore_shadow(shadow, &next); 4009 if (r) 4010 break; 4011 4012 if (fence) { 4013 tmo = dma_fence_wait_timeout(fence, false, tmo); 4014 dma_fence_put(fence); 4015 fence = next; 4016 if (tmo == 0) { 4017 r = -ETIMEDOUT; 4018 break; 4019 } else if (tmo < 0) { 4020 r = tmo; 4021 break; 4022 } 4023 } else { 4024 fence = next; 4025 } 4026 } 4027 mutex_unlock(&adev->shadow_list_lock); 4028 4029 if (fence) 4030 tmo = dma_fence_wait_timeout(fence, false, tmo); 4031 dma_fence_put(fence); 4032 4033 if (r < 0 || tmo <= 0) { 4034 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4035 return -EIO; 4036 } 4037 4038 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4039 return 0; 4040 } 4041 4042 4043 /** 4044 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4045 * 4046 * @adev: amdgpu device pointer 4047 * @from_hypervisor: request from hypervisor 4048 * 4049 * do VF FLR and reinitialize Asic 4050 * return 0 means succeeded otherwise failed 4051 */ 4052 static int 
amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4053 bool from_hypervisor) 4054 { 4055 int r; 4056 4057 if (from_hypervisor) 4058 r = amdgpu_virt_request_full_gpu(adev, true); 4059 else 4060 r = amdgpu_virt_reset_gpu(adev); 4061 if (r) 4062 return r; 4063 4064 amdgpu_amdkfd_pre_reset(adev); 4065 4066 /* Resume IP prior to SMC */ 4067 r = amdgpu_device_ip_reinit_early_sriov(adev); 4068 if (r) 4069 goto error; 4070 4071 amdgpu_virt_init_data_exchange(adev); 4072 /* we need recover gart prior to run SMC/CP/SDMA resume */ 4073 amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT)); 4074 4075 r = amdgpu_device_fw_loading(adev); 4076 if (r) 4077 return r; 4078 4079 /* now we are okay to resume SMC/CP/SDMA */ 4080 r = amdgpu_device_ip_reinit_late_sriov(adev); 4081 if (r) 4082 goto error; 4083 4084 amdgpu_irq_gpu_reset_resume_helper(adev); 4085 r = amdgpu_ib_ring_tests(adev); 4086 amdgpu_amdkfd_post_reset(adev); 4087 4088 error: 4089 amdgpu_virt_release_full_gpu(adev, true); 4090 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4091 amdgpu_inc_vram_lost(adev); 4092 r = amdgpu_device_recover_vram(adev); 4093 } 4094 4095 return r; 4096 } 4097 4098 /** 4099 * amdgpu_device_has_job_running - check if there is any job in mirror list 4100 * 4101 * @adev: amdgpu device pointer 4102 * 4103 * check if there is any job in mirror list 4104 */ 4105 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4106 { 4107 int i; 4108 struct drm_sched_job *job; 4109 4110 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4111 struct amdgpu_ring *ring = adev->rings[i]; 4112 4113 if (!ring || !ring->sched.thread) 4114 continue; 4115 4116 spin_lock(&ring->sched.job_list_lock); 4117 job = list_first_entry_or_null(&ring->sched.ring_mirror_list, 4118 struct drm_sched_job, node); 4119 spin_unlock(&ring->sched.job_list_lock); 4120 if (job) 4121 return true; 4122 } 4123 return false; 4124 } 4125 4126 /** 4127 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4128 * 4129 * @adev: amdgpu device pointer 4130 * 4131 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4132 * a hung GPU. 
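 *
 * Recovery policy follows the amdgpu_gpu_recovery module parameter: 0
 * disables recovery, -1 (auto) enables it only for the ASICs listed in the
 * switch below, and any other value enables it unconditionally. SR-IOV VFs
 * always attempt recovery.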
4133 */ 4134 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4135 { 4136 if (!amdgpu_device_ip_check_soft_reset(adev)) { 4137 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n"); 4138 return false; 4139 } 4140 4141 if (amdgpu_gpu_recovery == 0) 4142 goto disabled; 4143 4144 if (amdgpu_sriov_vf(adev)) 4145 return true; 4146 4147 if (amdgpu_gpu_recovery == -1) { 4148 switch (adev->asic_type) { 4149 case CHIP_BONAIRE: 4150 case CHIP_HAWAII: 4151 case CHIP_TOPAZ: 4152 case CHIP_TONGA: 4153 case CHIP_FIJI: 4154 case CHIP_POLARIS10: 4155 case CHIP_POLARIS11: 4156 case CHIP_POLARIS12: 4157 case CHIP_VEGAM: 4158 case CHIP_VEGA20: 4159 case CHIP_VEGA10: 4160 case CHIP_VEGA12: 4161 case CHIP_RAVEN: 4162 case CHIP_ARCTURUS: 4163 case CHIP_RENOIR: 4164 case CHIP_NAVI10: 4165 case CHIP_NAVI14: 4166 case CHIP_NAVI12: 4167 case CHIP_SIENNA_CICHLID: 4168 break; 4169 default: 4170 goto disabled; 4171 } 4172 } 4173 4174 return true; 4175 4176 disabled: 4177 dev_info(adev->dev, "GPU recovery disabled.\n"); 4178 return false; 4179 } 4180 4181 4182 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4183 struct amdgpu_job *job, 4184 bool *need_full_reset_arg) 4185 { 4186 int i, r = 0; 4187 bool need_full_reset = *need_full_reset_arg; 4188 4189 amdgpu_debugfs_wait_dump(adev); 4190 4191 if (amdgpu_sriov_vf(adev)) { 4192 /* stop the data exchange thread */ 4193 amdgpu_virt_fini_data_exchange(adev); 4194 } 4195 4196 /* block all schedulers and reset given job's ring */ 4197 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4198 struct amdgpu_ring *ring = adev->rings[i]; 4199 4200 if (!ring || !ring->sched.thread) 4201 continue; 4202 4203 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4204 amdgpu_fence_driver_force_completion(ring); 4205 } 4206 4207 if(job) 4208 drm_sched_increase_karma(&job->base); 4209 4210 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4211 if (!amdgpu_sriov_vf(adev)) { 4212 4213 if (!need_full_reset) 4214 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4215 4216 if (!need_full_reset) { 4217 amdgpu_device_ip_pre_soft_reset(adev); 4218 r = amdgpu_device_ip_soft_reset(adev); 4219 amdgpu_device_ip_post_soft_reset(adev); 4220 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4221 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4222 need_full_reset = true; 4223 } 4224 } 4225 4226 if (need_full_reset) 4227 r = amdgpu_device_ip_suspend(adev); 4228 4229 *need_full_reset_arg = need_full_reset; 4230 } 4231 4232 return r; 4233 } 4234 4235 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, 4236 struct list_head *device_list_handle, 4237 bool *need_full_reset_arg, 4238 bool skip_hw_reset) 4239 { 4240 struct amdgpu_device *tmp_adev = NULL; 4241 bool need_full_reset = *need_full_reset_arg, vram_lost = false; 4242 int r = 0; 4243 4244 /* 4245 * ASIC reset has to be done on all HGMI hive nodes ASAP 4246 * to allow proper links negotiation in FW (within 1 sec) 4247 */ 4248 if (!skip_hw_reset && need_full_reset) { 4249 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4250 /* For XGMI run all resets in parallel to speed up the process */ 4251 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4252 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4253 r = -EALREADY; 4254 } else 4255 r = amdgpu_asic_reset(tmp_adev); 4256 4257 if (r) { 4258 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4259 r, 
adev_to_drm(tmp_adev)->unique); 4260 break; 4261 } 4262 } 4263 4264 /* For XGMI wait for all resets to complete before proceeding */ 4265 if (!r) { 4266 list_for_each_entry(tmp_adev, device_list_handle, 4267 gmc.xgmi.head) { 4268 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4269 flush_work(&tmp_adev->xgmi_reset_work); 4270 r = tmp_adev->asic_reset_res; 4271 if (r) 4272 break; 4273 } 4274 } 4275 } 4276 } 4277 4278 if (!r && amdgpu_ras_intr_triggered()) { 4279 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4280 if (tmp_adev->mmhub.funcs && 4281 tmp_adev->mmhub.funcs->reset_ras_error_count) 4282 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev); 4283 } 4284 4285 amdgpu_ras_intr_cleared(); 4286 } 4287 4288 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4289 if (need_full_reset) { 4290 /* post card */ 4291 if (amdgpu_device_asic_init(tmp_adev)) 4292 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4293 4294 if (!r) { 4295 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4296 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4297 if (r) 4298 goto out; 4299 4300 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4301 if (vram_lost) { 4302 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4303 amdgpu_inc_vram_lost(tmp_adev); 4304 } 4305 4306 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT)); 4307 if (r) 4308 goto out; 4309 4310 r = amdgpu_device_fw_loading(tmp_adev); 4311 if (r) 4312 return r; 4313 4314 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4315 if (r) 4316 goto out; 4317 4318 if (vram_lost) 4319 amdgpu_device_fill_reset_magic(tmp_adev); 4320 4321 /* 4322 * Add this ASIC back to the tracked list since 4323 * the reset completed successfully. 4324 */ 4325 amdgpu_register_gpu_instance(tmp_adev); 4326 4327 r = amdgpu_device_ip_late_init(tmp_adev); 4328 if (r) 4329 goto out; 4330 4331 amdgpu_fbdev_set_suspend(tmp_adev, 0); 4332 4333 /* 4334 * The GPU enters a bad state once the number of 4335 * faulty pages retired by ECC reaches the threshold, 4336 * and RAS recovery is scheduled next. So add one 4337 * check here to break recovery if the bad page 4338 * threshold has indeed been exceeded, and remind the 4339 * user to retire this GPU or set a bigger 4340 * bad_page_threshold value to fix this when probing 4341 * the driver again. 4342 */ 4343 if (!amdgpu_ras_check_err_threshold(tmp_adev)) { 4344 /* must succeed.
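 * amdgpu_ras_resume() is the counterpart to the amdgpu_ras_suspend()
 * call made earlier in the recovery path.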
*/ 4345 amdgpu_ras_resume(tmp_adev); 4346 } else { 4347 r = -EINVAL; 4348 goto out; 4349 } 4350 4351 /* Update PSP FW topology after reset */ 4352 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4353 r = amdgpu_xgmi_update_topology(hive, tmp_adev); 4354 } 4355 } 4356 4357 out: 4358 if (!r) { 4359 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 4360 r = amdgpu_ib_ring_tests(tmp_adev); 4361 if (r) { 4362 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 4363 r = amdgpu_device_ip_suspend(tmp_adev); 4364 need_full_reset = true; 4365 r = -EAGAIN; 4366 goto end; 4367 } 4368 } 4369 4370 if (!r) 4371 r = amdgpu_device_recover_vram(tmp_adev); 4372 else 4373 tmp_adev->asic_reset_res = r; 4374 } 4375 4376 end: 4377 *need_full_reset_arg = need_full_reset; 4378 return r; 4379 } 4380 4381 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, 4382 struct amdgpu_hive_info *hive) 4383 { 4384 if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0) 4385 return false; 4386 4387 if (hive) { 4388 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock); 4389 } else { 4390 down_write(&adev->reset_sem); 4391 } 4392 4393 atomic_inc(&adev->gpu_reset_counter); 4394 switch (amdgpu_asic_reset_method(adev)) { 4395 case AMD_RESET_METHOD_MODE1: 4396 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 4397 break; 4398 case AMD_RESET_METHOD_MODE2: 4399 adev->mp1_state = PP_MP1_STATE_RESET; 4400 break; 4401 default: 4402 adev->mp1_state = PP_MP1_STATE_NONE; 4403 break; 4404 } 4405 4406 return true; 4407 } 4408 4409 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) 4410 { 4411 amdgpu_vf_error_trans_all(adev); 4412 adev->mp1_state = PP_MP1_STATE_NONE; 4413 atomic_set(&adev->in_gpu_reset, 0); 4414 up_write(&adev->reset_sem); 4415 } 4416 4417 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 4418 { 4419 struct pci_dev *p = NULL; 4420 4421 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4422 adev->pdev->bus->number, 1); 4423 if (p) { 4424 pm_runtime_enable(&(p->dev)); 4425 pm_runtime_resume(&(p->dev)); 4426 } 4427 } 4428 4429 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 4430 { 4431 enum amd_reset_method reset_method; 4432 struct pci_dev *p = NULL; 4433 u64 expires; 4434 4435 /* 4436 * For now, only BACO and mode1 reset are confirmed to suffer 4437 * from the audio issue if the audio device is not properly suspended. 4438 */ 4439 reset_method = amdgpu_asic_reset_method(adev); 4440 if ((reset_method != AMD_RESET_METHOD_BACO) && 4441 (reset_method != AMD_RESET_METHOD_MODE1)) 4442 return -EINVAL; 4443 4444 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4445 adev->pdev->bus->number, 1); 4446 if (!p) 4447 return -ENODEV; 4448 4449 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 4450 if (!expires) 4451 /* 4452 * If we cannot get the audio device autosuspend delay, 4453 * a fixed 4s interval is used. The audio controller's 4454 * default autosuspend delay is 3s, so the 4s used here 4455 * is guaranteed to cover it. 4456 */ 4457 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 4458 4459 while (!pm_runtime_status_suspended(&(p->dev))) { 4460 if (!pm_runtime_suspend(&(p->dev))) 4461 break; 4462 4463 if (expires < ktime_get_mono_fast_ns()) { 4464 dev_warn(adev->dev, "failed to suspend display audio\n"); 4465 /* TODO: abort the subsequent gpu reset?
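 * For now the reset proceeds anyway: returning -ETIMEDOUT just
 * means the codec is left un-suspended (audio_suspended stays
 * false in amdgpu_device_gpu_recover()).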
*/ 4466 return -ETIMEDOUT; 4467 } 4468 } 4469 4470 pm_runtime_disable(&(p->dev)); 4471 4472 return 0; 4473 } 4474 4475 /** 4476 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 4477 * 4478 * @adev: amdgpu device pointer 4479 * @job: which job triggered the hang 4480 * 4481 * Attempt to reset the GPU if it has hung (all asics). 4482 * Attempt to do a soft reset or full reset and reinitialize the ASIC. 4483 * Returns 0 for success or an error on failure. 4484 */ 4485 4486 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 4487 struct amdgpu_job *job) 4488 { 4489 struct list_head device_list, *device_list_handle = NULL; 4490 bool need_full_reset = false; 4491 bool job_signaled = false; 4492 struct amdgpu_hive_info *hive = NULL; 4493 struct amdgpu_device *tmp_adev = NULL; 4494 int i, r = 0; 4495 bool need_emergency_restart = false; 4496 bool audio_suspended = false; 4497 4498 /* 4499 * Special case: RAS triggered and full reset isn't supported 4500 */ 4501 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 4502 4503 /* 4504 * Flush RAM to disk so that after reboot 4505 * the user can read the log and see why the system rebooted. 4506 */ 4507 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 4508 DRM_WARN("Emergency reboot."); 4509 4510 ksys_sync_helper(); 4511 emergency_restart(); 4512 } 4513 4514 dev_info(adev->dev, "GPU %s begin!\n", 4515 need_emergency_restart ? "jobs stop":"reset"); 4516 4517 /* 4518 * Here we trylock to avoid a chain of resets executing from 4519 * either jobs triggered on different adevs in an XGMI hive or jobs on 4520 * different schedulers for the same device while this TO handler is running. 4521 * We always reset all schedulers for a device and all devices for the XGMI 4522 * hive, so that should take care of them too. 4523 */ 4524 hive = amdgpu_get_xgmi_hive(adev); 4525 if (hive) { 4526 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) { 4527 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", 4528 job ? job->base.id : -1, hive->hive_id); 4529 amdgpu_put_xgmi_hive(hive); 4530 return 0; 4531 } 4532 mutex_lock(&hive->hive_lock); 4533 } 4534 4535 /* 4536 * Build the list of devices to reset. 4537 * In case we are in XGMI hive mode, resort the device list 4538 * to put adev in the 1st position. 4539 */ 4540 INIT_LIST_HEAD(&device_list); 4541 if (adev->gmc.xgmi.num_physical_nodes > 1) { 4542 if (!hive) 4543 return -ENODEV; 4544 if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list)) 4545 list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list); 4546 device_list_handle = &hive->device_list; 4547 } else { 4548 list_add_tail(&adev->gmc.xgmi.head, &device_list); 4549 device_list_handle = &device_list; 4550 } 4551 4552 /* block all schedulers and reset given job's ring */ 4553 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4554 if (!amdgpu_device_lock_adev(tmp_adev, hive)) { 4555 dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress", 4556 job ? job->base.id : -1); 4557 r = 0; 4558 goto skip_recovery; 4559 } 4560 4561 /* 4562 * Try to put the audio codec into suspend state 4563 * before the gpu reset starts. 4564 * 4565 * The power domain of the graphics device is shared 4566 * with the AZ (audio) power domain. Without this, 4567 * we may change the audio hardware from behind 4568 * the audio driver's back. That will trigger 4569 * some audio codec errors.
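 *
 * The audio codec is runtime-resumed again via
 * amdgpu_device_resume_display_audio() once recovery has finished.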
4570 */ 4571 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 4572 audio_suspended = true; 4573 4574 amdgpu_ras_set_error_query_ready(tmp_adev, false); 4575 4576 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 4577 4578 if (!amdgpu_sriov_vf(tmp_adev)) 4579 amdgpu_amdkfd_pre_reset(tmp_adev); 4580 4581 /* 4582 * Mark the ASICs to be reset as untracked first, 4583 * and add them back after the reset completes. 4584 */ 4585 amdgpu_unregister_gpu_instance(tmp_adev); 4586 4587 amdgpu_fbdev_set_suspend(tmp_adev, 1); 4588 4589 /* disable ras on ALL IPs */ 4590 if (!need_emergency_restart && 4591 amdgpu_device_ip_need_full_reset(tmp_adev)) 4592 amdgpu_ras_suspend(tmp_adev); 4593 4594 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4595 struct amdgpu_ring *ring = tmp_adev->rings[i]; 4596 4597 if (!ring || !ring->sched.thread) 4598 continue; 4599 4600 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 4601 4602 if (need_emergency_restart) 4603 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 4604 } 4605 } 4606 4607 if (need_emergency_restart) 4608 goto skip_sched_resume; 4609 4610 /* 4611 * Must check the guilty signal here since after this point all old 4612 * HW fences are force signaled. 4613 * 4614 * job->base holds a reference to the parent fence 4615 */ 4616 if (job && job->base.s_fence->parent && 4617 dma_fence_is_signaled(job->base.s_fence->parent)) { 4618 job_signaled = true; 4619 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 4620 goto skip_hw_reset; 4621 } 4622 4623 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 4624 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4625 r = amdgpu_device_pre_asic_reset(tmp_adev, 4626 NULL, 4627 &need_full_reset); 4628 /* TODO: should we stop? */ 4629 if (r) { 4630 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 4631 r, adev_to_drm(tmp_adev)->unique); 4632 tmp_adev->asic_reset_res = r; 4633 } 4634 } 4635 4636 /* Actual ASIC resets if needed. */ 4637 /* TODO: implement XGMI hive reset logic for SRIOV */ 4638 if (amdgpu_sriov_vf(adev)) { 4639 r = amdgpu_device_reset_sriov(adev, job ? false : true); 4640 if (r) 4641 adev->asic_reset_res = r; 4642 } else { 4643 r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false); 4644 if (r && r == -EAGAIN) 4645 goto retry; 4646 } 4647 4648 skip_hw_reset: 4649 4650 /* Post ASIC reset for all devs. */ 4651 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4652 4653 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4654 struct amdgpu_ring *ring = tmp_adev->rings[i]; 4655 4656 if (!ring || !ring->sched.thread) 4657 continue; 4658 4659 /* No point in resubmitting jobs if we didn't HW reset */ 4660 if (!tmp_adev->asic_reset_res && !job_signaled) 4661 drm_sched_resubmit_jobs(&ring->sched); 4662 4663 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); 4664 } 4665 4666 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) { 4667 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 4668 } 4669 4670 tmp_adev->asic_reset_res = 0; 4671 4672 if (r) { 4673 /* bad news, how do we tell it to userspace?
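 * For now the failure is only logged and recorded via
 * amdgpu_vf_error_put().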
*/ 4674 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 4675 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 4676 } else { 4677 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 4678 } 4679 } 4680 4681 skip_sched_resume: 4682 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4683 /* unlock kfd: SRIOV would do it separately */ 4684 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 4685 amdgpu_amdkfd_post_reset(tmp_adev); 4686 if (audio_suspended) 4687 amdgpu_device_resume_display_audio(tmp_adev); 4688 amdgpu_device_unlock_adev(tmp_adev); 4689 } 4690 4691 skip_recovery: 4692 if (hive) { 4693 atomic_set(&hive->in_reset, 0); 4694 mutex_unlock(&hive->hive_lock); 4695 amdgpu_put_xgmi_hive(hive); 4696 } 4697 4698 if (r) 4699 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 4700 return r; 4701 } 4702 4703 /** 4704 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot 4705 * 4706 * @adev: amdgpu_device pointer 4707 * 4708 * Fetches and stores in the driver the PCIe capabilities (gen speed 4709 * and lanes) of the slot the device is in. Handles APUs and 4710 * virtualized environments where PCIe config space may not be available. 4711 */ 4712 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 4713 { 4714 struct pci_dev *pdev; 4715 enum pci_bus_speed speed_cap, platform_speed_cap; 4716 enum pcie_link_width platform_link_width; 4717 4718 if (amdgpu_pcie_gen_cap) 4719 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 4720 4721 if (amdgpu_pcie_lane_cap) 4722 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 4723 4724 /* covers APUs as well */ 4725 if (pci_is_root_bus(adev->pdev->bus)) { 4726 if (adev->pm.pcie_gen_mask == 0) 4727 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 4728 if (adev->pm.pcie_mlw_mask == 0) 4729 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 4730 return; 4731 } 4732 4733 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 4734 return; 4735 4736 pcie_bandwidth_available(adev->pdev, NULL, 4737 &platform_speed_cap, &platform_link_width); 4738 4739 if (adev->pm.pcie_gen_mask == 0) { 4740 /* asic caps */ 4741 pdev = adev->pdev; 4742 speed_cap = pcie_get_speed_cap(pdev); 4743 if (speed_cap == PCI_SPEED_UNKNOWN) { 4744 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4745 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4746 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 4747 } else { 4748 if (speed_cap == PCIE_SPEED_16_0GT) 4749 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4750 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4751 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 4752 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 4753 else if (speed_cap == PCIE_SPEED_8_0GT) 4754 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4755 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4756 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 4757 else if (speed_cap == PCIE_SPEED_5_0GT) 4758 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4759 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 4760 else 4761 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 4762 } 4763 /* platform caps */ 4764 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 4765 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4766 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 4767 } else { 4768 if (platform_speed_cap == PCIE_SPEED_16_0GT) 4769 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4770 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4771 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 4772 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 4773 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 4774 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4775 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4776 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 4777 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 4778 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4779 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 4780 else 4781 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 4782 4783 } 4784 } 4785 if (adev->pm.pcie_mlw_mask == 0) { 4786 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 4787 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 4788 } else { 4789 switch (platform_link_width) { 4790 case PCIE_LNK_X32: 4791 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 4792 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 4793 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 4794 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4795 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4796 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4797 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4798 break; 4799 case PCIE_LNK_X16: 4800 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 4801 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 4802 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4803 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4804 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4805 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4806 break; 4807 case PCIE_LNK_X12: 4808 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 4809 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4810 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4811 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4812 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4813 break; 4814 case PCIE_LNK_X8: 4815 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4816 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4817 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4818 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4819 break; 4820 case PCIE_LNK_X4: 4821 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4822 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4823 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4824 break; 4825 case PCIE_LNK_X2: 4826 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4827 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4828 break; 4829 case PCIE_LNK_X1: 4830 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 4831 break; 4832 default: 4833 break; 4834 } 4835 } 4836 } 4837 } 4838 4839 int amdgpu_device_baco_enter(struct drm_device *dev) 4840 { 4841 struct amdgpu_device *adev = drm_to_adev(dev); 4842 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 4843 4844 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 4845 return -ENOTSUPP; 4846 4847 if (ras && ras->supported) 4848 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 4849 4850 return amdgpu_dpm_baco_enter(adev); 4851 } 4852 4853 int amdgpu_device_baco_exit(struct drm_device *dev) 4854 { 4855 struct amdgpu_device *adev = drm_to_adev(dev); 4856 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 4857 int ret = 0; 4858 4859 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 4860 return -ENOTSUPP; 4861 4862 ret = amdgpu_dpm_baco_exit(adev); 4863 if (ret) 4864 return ret; 4865 4866 if (ras && ras->supported) 4867 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 4868 4869 return 0; 4870 } 4871 4872 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev) 4873 { 4874 int i; 4875 4876 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4877 struct amdgpu_ring *ring = adev->rings[i]; 4878 4879 if (!ring || !ring->sched.thread) 4880 continue; 4881 4882 
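		/*
		 * Cancel the scheduler's pending timeout (TDR) work for this
		 * ring and wait for a running handler to finish.
		 */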
cancel_delayed_work_sync(&ring->sched.work_tdr); 4883 } 4884 } 4885 4886 /** 4887 * amdgpu_pci_error_detected - Called when a PCI error is detected. 4888 * @pdev: PCI device struct 4889 * @state: PCI channel state 4890 * 4891 * Description: Called when a PCI error is detected. 4892 * 4893 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 4894 */ 4895 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 4896 { 4897 struct drm_device *dev = pci_get_drvdata(pdev); 4898 struct amdgpu_device *adev = drm_to_adev(dev); 4899 int i; 4900 4901 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 4902 4903 if (adev->gmc.xgmi.num_physical_nodes > 1) { 4904 DRM_WARN("No support for XGMI hive yet..."); 4905 return PCI_ERS_RESULT_DISCONNECT; 4906 } 4907 4908 switch (state) { 4909 case pci_channel_io_normal: 4910 return PCI_ERS_RESULT_CAN_RECOVER; 4911 /* Fatal error, prepare for slot reset */ 4912 case pci_channel_io_frozen: 4913 /* 4914 * Cancel and wait for all TDRs in progress if failing to 4915 * set adev->in_gpu_reset in amdgpu_device_lock_adev 4916 * 4917 * Locking adev->reset_sem will prevent any external access 4918 * to GPU during PCI error recovery 4919 */ 4920 while (!amdgpu_device_lock_adev(adev, NULL)) 4921 amdgpu_cancel_all_tdr(adev); 4922 4923 /* 4924 * Block any work scheduling as we do for regular GPU reset 4925 * for the duration of the recovery 4926 */ 4927 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4928 struct amdgpu_ring *ring = adev->rings[i]; 4929 4930 if (!ring || !ring->sched.thread) 4931 continue; 4932 4933 drm_sched_stop(&ring->sched, NULL); 4934 } 4935 return PCI_ERS_RESULT_NEED_RESET; 4936 case pci_channel_io_perm_failure: 4937 /* Permanent error, prepare for device removal */ 4938 return PCI_ERS_RESULT_DISCONNECT; 4939 } 4940 4941 return PCI_ERS_RESULT_NEED_RESET; 4942 } 4943 4944 /** 4945 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 4946 * @pdev: pointer to PCI device 4947 */ 4948 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 4949 { 4950 4951 DRM_INFO("PCI error: mmio enabled callback!!\n"); 4952 4953 /* TODO - dump whatever for debugging purposes */ 4954 4955 /* This called only if amdgpu_pci_error_detected returns 4956 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 4957 * works, no need to reset slot. 4958 */ 4959 4960 return PCI_ERS_RESULT_RECOVERED; 4961 } 4962 4963 /** 4964 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 4965 * @pdev: PCI device struct 4966 * 4967 * Description: This routine is called by the pci error recovery 4968 * code after the PCI slot has been reset, just before we 4969 * should resume normal operations. 
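 *
 * Returns PCI_ERS_RESULT_RECOVERED if the ASIC came back and re-init
 * succeeded (the PCI core will then invoke amdgpu_pci_resume()), or
 * PCI_ERS_RESULT_DISCONNECT on failure.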
4970 */ 4971 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 4972 { 4973 struct drm_device *dev = pci_get_drvdata(pdev); 4974 struct amdgpu_device *adev = drm_to_adev(dev); 4975 int r, i; 4976 bool need_full_reset = true; 4977 u32 memsize; 4978 struct list_head device_list; 4979 4980 DRM_INFO("PCI error: slot reset callback!!\n"); 4981 4982 INIT_LIST_HEAD(&device_list); 4983 list_add_tail(&adev->gmc.xgmi.head, &device_list); 4984 4985 /* wait for asic to come out of reset */ 4986 msleep(500); 4987 4988 /* Restore PCI confspace */ 4989 amdgpu_device_load_pci_state(pdev); 4990 4991 /* confirm ASIC came out of reset */ 4992 for (i = 0; i < adev->usec_timeout; i++) { 4993 memsize = amdgpu_asic_get_config_memsize(adev); 4994 4995 if (memsize != 0xffffffff) 4996 break; 4997 udelay(1); 4998 } 4999 if (memsize == 0xffffffff) { 5000 r = -ETIME; 5001 goto out; 5002 } 5003 5004 adev->in_pci_err_recovery = true; 5005 r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset); 5006 adev->in_pci_err_recovery = false; 5007 if (r) 5008 goto out; 5009 5010 r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true); 5011 5012 out: 5013 if (!r) { 5014 if (amdgpu_device_cache_pci_state(adev->pdev)) 5015 pci_restore_state(adev->pdev); 5016 5017 DRM_INFO("PCIe error recovery succeeded\n"); 5018 } else { 5019 DRM_ERROR("PCIe error recovery failed, err:%d", r); 5020 amdgpu_device_unlock_adev(adev); 5021 } 5022 5023 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 5024 } 5025 5026 /** 5027 * amdgpu_pci_resume() - resume normal ops after PCI reset 5028 * @pdev: pointer to PCI device 5029 * 5030 * Called when the error recovery driver tells us that its 5031 * OK to resume normal operation. Use completion to allow 5032 * halted scsi ops to resume. 5033 */ 5034 void amdgpu_pci_resume(struct pci_dev *pdev) 5035 { 5036 struct drm_device *dev = pci_get_drvdata(pdev); 5037 struct amdgpu_device *adev = drm_to_adev(dev); 5038 int i; 5039 5040 5041 DRM_INFO("PCI error: resume callback!!\n"); 5042 5043 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5044 struct amdgpu_ring *ring = adev->rings[i]; 5045 5046 if (!ring || !ring->sched.thread) 5047 continue; 5048 5049 5050 drm_sched_resubmit_jobs(&ring->sched); 5051 drm_sched_start(&ring->sched, true); 5052 } 5053 5054 amdgpu_device_unlock_adev(adev); 5055 } 5056 5057 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 5058 { 5059 struct drm_device *dev = pci_get_drvdata(pdev); 5060 struct amdgpu_device *adev = drm_to_adev(dev); 5061 int r; 5062 5063 r = pci_save_state(pdev); 5064 if (!r) { 5065 kfree(adev->pci_state); 5066 5067 adev->pci_state = pci_store_saved_state(pdev); 5068 5069 if (!adev->pci_state) { 5070 DRM_ERROR("Failed to store PCI saved state"); 5071 return false; 5072 } 5073 } else { 5074 DRM_WARN("Failed to save PCI state, err:%d\n", r); 5075 return false; 5076 } 5077 5078 return true; 5079 } 5080 5081 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 5082 { 5083 struct drm_device *dev = pci_get_drvdata(pdev); 5084 struct amdgpu_device *adev = drm_to_adev(dev); 5085 int r; 5086 5087 if (!adev->pci_state) 5088 return false; 5089 5090 r = pci_load_saved_state(pdev, adev->pci_state); 5091 5092 if (!r) { 5093 pci_restore_state(pdev); 5094 } else { 5095 DRM_WARN("Failed to load PCI state, err:%d\n", r); 5096 return false; 5097 } 5098 5099 return true; 5100 } 5101 5102 5103