1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 34 #include <drm/drm_atomic_helper.h> 35 #include <drm/drm_probe_helper.h> 36 #include <drm/amdgpu_drm.h> 37 #include <linux/vgaarb.h> 38 #include <linux/vga_switcheroo.h> 39 #include <linux/efi.h> 40 #include "amdgpu.h" 41 #include "amdgpu_trace.h" 42 #include "amdgpu_i2c.h" 43 #include "atom.h" 44 #include "amdgpu_atombios.h" 45 #include "amdgpu_atomfirmware.h" 46 #include "amd_pcie.h" 47 #ifdef CONFIG_DRM_AMDGPU_SI 48 #include "si.h" 49 #endif 50 #ifdef CONFIG_DRM_AMDGPU_CIK 51 #include "cik.h" 52 #endif 53 #include "vi.h" 54 #include "soc15.h" 55 #include "nv.h" 56 #include "bif/bif_4_1_d.h" 57 #include <linux/pci.h> 58 #include <linux/firmware.h> 59 #include "amdgpu_vf_error.h" 60 61 #include "amdgpu_amdkfd.h" 62 #include "amdgpu_pm.h" 63 64 #include "amdgpu_xgmi.h" 65 #include "amdgpu_ras.h" 66 #include "amdgpu_pmu.h" 67 #include "amdgpu_fru_eeprom.h" 68 69 #include <linux/suspend.h> 70 #include <drm/task_barrier.h> 71 #include <linux/pm_runtime.h> 72 73 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 74 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 75 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 76 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 77 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 78 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 79 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin"); 80 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin"); 81 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin"); 82 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 83 MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin"); 84 85 #define AMDGPU_RESUME_MS 2000 86 87 const char *amdgpu_asic_name[] = { 88 "TAHITI", 89 "PITCAIRN", 90 "VERDE", 91 "OLAND", 92 "HAINAN", 93 "BONAIRE", 94 "KAVERI", 95 "KABINI", 96 "HAWAII", 97 "MULLINS", 98 "TOPAZ", 99 "TONGA", 100 "FIJI", 101 "CARRIZO", 102 "STONEY", 103 "POLARIS10", 104 "POLARIS11", 105 "POLARIS12", 106 "VEGAM", 107 "VEGA10", 108 "VEGA12", 109 "VEGA20", 110 "RAVEN", 111 "ARCTURUS", 112 "RENOIR", 113 "NAVI10", 114 "NAVI14", 115 "NAVI12", 116 "SIENNA_CICHLID", 117 "NAVY_FLOUNDER", 118 "VANGOGH", 119 "DIMGREY_CAVEFISH", 120 "LAST", 121 }; 122 
123 /** 124 * DOC: pcie_replay_count 125 * 126 * The amdgpu driver provides a sysfs API for reporting the total number 127 * of PCIe replays (NAKs) 128 * The file pcie_replay_count is used for this and returns the total 129 * number of replays as a sum of the NAKs generated and NAKs received 130 */ 131 132 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 133 struct device_attribute *attr, char *buf) 134 { 135 struct drm_device *ddev = dev_get_drvdata(dev); 136 struct amdgpu_device *adev = drm_to_adev(ddev); 137 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 138 139 return snprintf(buf, PAGE_SIZE, "%llu\n", cnt); 140 } 141 142 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 143 amdgpu_device_get_pcie_replay_count, NULL); 144 145 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 146 147 /** 148 * DOC: product_name 149 * 150 * The amdgpu driver provides a sysfs API for reporting the product name 151 * for the device 152 * The file serial_number is used for this and returns the product name 153 * as returned from the FRU. 154 * NOTE: This is only available for certain server cards 155 */ 156 157 static ssize_t amdgpu_device_get_product_name(struct device *dev, 158 struct device_attribute *attr, char *buf) 159 { 160 struct drm_device *ddev = dev_get_drvdata(dev); 161 struct amdgpu_device *adev = drm_to_adev(ddev); 162 163 return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name); 164 } 165 166 static DEVICE_ATTR(product_name, S_IRUGO, 167 amdgpu_device_get_product_name, NULL); 168 169 /** 170 * DOC: product_number 171 * 172 * The amdgpu driver provides a sysfs API for reporting the part number 173 * for the device 174 * The file serial_number is used for this and returns the part number 175 * as returned from the FRU. 176 * NOTE: This is only available for certain server cards 177 */ 178 179 static ssize_t amdgpu_device_get_product_number(struct device *dev, 180 struct device_attribute *attr, char *buf) 181 { 182 struct drm_device *ddev = dev_get_drvdata(dev); 183 struct amdgpu_device *adev = drm_to_adev(ddev); 184 185 return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number); 186 } 187 188 static DEVICE_ATTR(product_number, S_IRUGO, 189 amdgpu_device_get_product_number, NULL); 190 191 /** 192 * DOC: serial_number 193 * 194 * The amdgpu driver provides a sysfs API for reporting the serial number 195 * for the device 196 * The file serial_number is used for this and returns the serial number 197 * as returned from the FRU. 198 * NOTE: This is only available for certain server cards 199 */ 200 201 static ssize_t amdgpu_device_get_serial_number(struct device *dev, 202 struct device_attribute *attr, char *buf) 203 { 204 struct drm_device *ddev = dev_get_drvdata(dev); 205 struct amdgpu_device *adev = drm_to_adev(ddev); 206 207 return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial); 208 } 209 210 static DEVICE_ATTR(serial_number, S_IRUGO, 211 amdgpu_device_get_serial_number, NULL); 212 213 /** 214 * amdgpu_device_supports_atpx - Is the device a dGPU with HG/PX power control 215 * 216 * @dev: drm_device pointer 217 * 218 * Returns true if the device is a dGPU with HG/PX power control, 219 * otherwise return false. 
220 */ 221 bool amdgpu_device_supports_atpx(struct drm_device *dev) 222 { 223 struct amdgpu_device *adev = drm_to_adev(dev); 224 225 if (adev->flags & AMD_IS_PX) 226 return true; 227 return false; 228 } 229 230 /** 231 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 232 * 233 * @dev: drm_device pointer 234 * 235 * Returns true if the device is a dGPU with HG/PX power control, 236 * otherwise return false. 237 */ 238 bool amdgpu_device_supports_boco(struct drm_device *dev) 239 { 240 struct amdgpu_device *adev = drm_to_adev(dev); 241 242 if (adev->has_pr3) 243 return true; 244 return false; 245 } 246 247 /** 248 * amdgpu_device_supports_baco - Does the device support BACO 249 * 250 * @dev: drm_device pointer 251 * 252 * Returns true if the device supporte BACO, 253 * otherwise return false. 254 */ 255 bool amdgpu_device_supports_baco(struct drm_device *dev) 256 { 257 struct amdgpu_device *adev = drm_to_adev(dev); 258 259 return amdgpu_asic_supports_baco(adev); 260 } 261 262 /* 263 * VRAM access helper functions 264 */ 265 266 /** 267 * amdgpu_device_vram_access - read/write a buffer in vram 268 * 269 * @adev: amdgpu_device pointer 270 * @pos: offset of the buffer in vram 271 * @buf: virtual address of the buffer in system memory 272 * @size: read/write size, sizeof(@buf) must > @size 273 * @write: true - write to vram, otherwise - read from vram 274 */ 275 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, 276 uint32_t *buf, size_t size, bool write) 277 { 278 unsigned long flags; 279 uint32_t hi = ~0; 280 uint64_t last; 281 282 283 #ifdef CONFIG_64BIT 284 last = min(pos + size, adev->gmc.visible_vram_size); 285 if (last > pos) { 286 void __iomem *addr = adev->mman.aper_base_kaddr + pos; 287 size_t count = last - pos; 288 289 if (write) { 290 memcpy_toio(addr, buf, count); 291 mb(); 292 amdgpu_asic_flush_hdp(adev, NULL); 293 } else { 294 amdgpu_asic_invalidate_hdp(adev, NULL); 295 mb(); 296 memcpy_fromio(buf, addr, count); 297 } 298 299 if (count == size) 300 return; 301 302 pos += count; 303 buf += count / 4; 304 size -= count; 305 } 306 #endif 307 308 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 309 for (last = pos + size; pos < last; pos += 4) { 310 uint32_t tmp = pos >> 31; 311 312 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); 313 if (tmp != hi) { 314 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp); 315 hi = tmp; 316 } 317 if (write) 318 WREG32_NO_KIQ(mmMM_DATA, *buf++); 319 else 320 *buf++ = RREG32_NO_KIQ(mmMM_DATA); 321 } 322 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 323 } 324 325 /* 326 * register access helper functions. 327 */ 328 /** 329 * amdgpu_device_rreg - read a memory mapped IO or indirect register 330 * 331 * @adev: amdgpu_device pointer 332 * @reg: dword aligned register offset 333 * @acc_flags: access flags which require special behavior 334 * 335 * Returns the 32 bit value from the offset specified. 
336 */ 337 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, 338 uint32_t reg, uint32_t acc_flags) 339 { 340 uint32_t ret; 341 342 if (adev->in_pci_err_recovery) 343 return 0; 344 345 if ((reg * 4) < adev->rmmio_size) { 346 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 347 amdgpu_sriov_runtime(adev) && 348 down_read_trylock(&adev->reset_sem)) { 349 ret = amdgpu_kiq_rreg(adev, reg); 350 up_read(&adev->reset_sem); 351 } else { 352 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 353 } 354 } else { 355 ret = adev->pcie_rreg(adev, reg * 4); 356 } 357 358 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret); 359 360 return ret; 361 } 362 363 /* 364 * MMIO register read with bytes helper functions 365 * @offset:bytes offset from MMIO start 366 * 367 */ 368 369 /** 370 * amdgpu_mm_rreg8 - read a memory mapped IO register 371 * 372 * @adev: amdgpu_device pointer 373 * @offset: byte aligned register offset 374 * 375 * Returns the 8 bit value from the offset specified. 376 */ 377 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) 378 { 379 if (adev->in_pci_err_recovery) 380 return 0; 381 382 if (offset < adev->rmmio_size) 383 return (readb(adev->rmmio + offset)); 384 BUG(); 385 } 386 387 /* 388 * MMIO register write with bytes helper functions 389 * @offset:bytes offset from MMIO start 390 * @value: the value want to be written to the register 391 * 392 */ 393 /** 394 * amdgpu_mm_wreg8 - read a memory mapped IO register 395 * 396 * @adev: amdgpu_device pointer 397 * @offset: byte aligned register offset 398 * @value: 8 bit value to write 399 * 400 * Writes the value specified to the offset specified. 401 */ 402 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 403 { 404 if (adev->in_pci_err_recovery) 405 return; 406 407 if (offset < adev->rmmio_size) 408 writeb(value, adev->rmmio + offset); 409 else 410 BUG(); 411 } 412 413 /** 414 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 415 * 416 * @adev: amdgpu_device pointer 417 * @reg: dword aligned register offset 418 * @v: 32 bit value to write to the register 419 * @acc_flags: access flags which require special behavior 420 * 421 * Writes the value specified to the offset specified. 
422 */ 423 void amdgpu_device_wreg(struct amdgpu_device *adev, 424 uint32_t reg, uint32_t v, 425 uint32_t acc_flags) 426 { 427 if (adev->in_pci_err_recovery) 428 return; 429 430 if ((reg * 4) < adev->rmmio_size) { 431 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 432 amdgpu_sriov_runtime(adev) && 433 down_read_trylock(&adev->reset_sem)) { 434 amdgpu_kiq_wreg(adev, reg, v); 435 up_read(&adev->reset_sem); 436 } else { 437 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 438 } 439 } else { 440 adev->pcie_wreg(adev, reg * 4, v); 441 } 442 443 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 444 } 445 446 /* 447 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range 448 * 449 * this function is invoked only the debugfs register access 450 * */ 451 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 452 uint32_t reg, uint32_t v) 453 { 454 if (adev->in_pci_err_recovery) 455 return; 456 457 if (amdgpu_sriov_fullaccess(adev) && 458 adev->gfx.rlc.funcs && 459 adev->gfx.rlc.funcs->is_rlcg_access_range) { 460 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 461 return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v); 462 } else { 463 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 464 } 465 } 466 467 /** 468 * amdgpu_io_rreg - read an IO register 469 * 470 * @adev: amdgpu_device pointer 471 * @reg: dword aligned register offset 472 * 473 * Returns the 32 bit value from the offset specified. 474 */ 475 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg) 476 { 477 if (adev->in_pci_err_recovery) 478 return 0; 479 480 if ((reg * 4) < adev->rio_mem_size) 481 return ioread32(adev->rio_mem + (reg * 4)); 482 else { 483 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4)); 484 return ioread32(adev->rio_mem + (mmMM_DATA * 4)); 485 } 486 } 487 488 /** 489 * amdgpu_io_wreg - write to an IO register 490 * 491 * @adev: amdgpu_device pointer 492 * @reg: dword aligned register offset 493 * @v: 32 bit value to write to the register 494 * 495 * Writes the value specified to the offset specified. 496 */ 497 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v) 498 { 499 if (adev->in_pci_err_recovery) 500 return; 501 502 if ((reg * 4) < adev->rio_mem_size) 503 iowrite32(v, adev->rio_mem + (reg * 4)); 504 else { 505 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4)); 506 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4)); 507 } 508 } 509 510 /** 511 * amdgpu_mm_rdoorbell - read a doorbell dword 512 * 513 * @adev: amdgpu_device pointer 514 * @index: doorbell index 515 * 516 * Returns the value in the doorbell aperture at the 517 * requested doorbell index (CIK). 518 */ 519 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) 520 { 521 if (adev->in_pci_err_recovery) 522 return 0; 523 524 if (index < adev->doorbell.num_doorbells) { 525 return readl(adev->doorbell.ptr + index); 526 } else { 527 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 528 return 0; 529 } 530 } 531 532 /** 533 * amdgpu_mm_wdoorbell - write a doorbell dword 534 * 535 * @adev: amdgpu_device pointer 536 * @index: doorbell index 537 * @v: value to write 538 * 539 * Writes @v to the doorbell aperture at the 540 * requested doorbell index (CIK). 
541 */ 542 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) 543 { 544 if (adev->in_pci_err_recovery) 545 return; 546 547 if (index < adev->doorbell.num_doorbells) { 548 writel(v, adev->doorbell.ptr + index); 549 } else { 550 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 551 } 552 } 553 554 /** 555 * amdgpu_mm_rdoorbell64 - read a doorbell Qword 556 * 557 * @adev: amdgpu_device pointer 558 * @index: doorbell index 559 * 560 * Returns the value in the doorbell aperture at the 561 * requested doorbell index (VEGA10+). 562 */ 563 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) 564 { 565 if (adev->in_pci_err_recovery) 566 return 0; 567 568 if (index < adev->doorbell.num_doorbells) { 569 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); 570 } else { 571 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 572 return 0; 573 } 574 } 575 576 /** 577 * amdgpu_mm_wdoorbell64 - write a doorbell Qword 578 * 579 * @adev: amdgpu_device pointer 580 * @index: doorbell index 581 * @v: value to write 582 * 583 * Writes @v to the doorbell aperture at the 584 * requested doorbell index (VEGA10+). 585 */ 586 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) 587 { 588 if (adev->in_pci_err_recovery) 589 return; 590 591 if (index < adev->doorbell.num_doorbells) { 592 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); 593 } else { 594 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 595 } 596 } 597 598 /** 599 * amdgpu_device_indirect_rreg - read an indirect register 600 * 601 * @adev: amdgpu_device pointer 602 * @pcie_index: mmio register offset 603 * @pcie_data: mmio register offset 604 * @reg_addr: indirect register address to read from 605 * 606 * Returns the value of indirect register @reg_addr 607 */ 608 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 609 u32 pcie_index, u32 pcie_data, 610 u32 reg_addr) 611 { 612 unsigned long flags; 613 u32 r; 614 void __iomem *pcie_index_offset; 615 void __iomem *pcie_data_offset; 616 617 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 618 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 619 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 620 621 writel(reg_addr, pcie_index_offset); 622 readl(pcie_index_offset); 623 r = readl(pcie_data_offset); 624 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 625 626 return r; 627 } 628 629 /** 630 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 631 * 632 * @adev: amdgpu_device pointer 633 * @pcie_index: mmio register offset 634 * @pcie_data: mmio register offset 635 * @reg_addr: indirect register address to read from 636 * 637 * Returns the value of indirect register @reg_addr 638 */ 639 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 640 u32 pcie_index, u32 pcie_data, 641 u32 reg_addr) 642 { 643 unsigned long flags; 644 u64 r; 645 void __iomem *pcie_index_offset; 646 void __iomem *pcie_data_offset; 647 648 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 649 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 650 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 651 652 /* read low 32 bits */ 653 writel(reg_addr, pcie_index_offset); 654 readl(pcie_index_offset); 655 r = readl(pcie_data_offset); 656 /* read high 32 bits */ 657 writel(reg_addr + 4, pcie_index_offset); 658 readl(pcie_index_offset); 659 r |= ((u64)readl(pcie_data_offset) << 32); 660 spin_unlock_irqrestore(&adev->pcie_idx_lock, 
flags); 661 662 return r; 663 } 664 665 /** 666 * amdgpu_device_indirect_wreg - write an indirect register address 667 * 668 * @adev: amdgpu_device pointer 669 * @pcie_index: mmio register offset 670 * @pcie_data: mmio register offset 671 * @reg_addr: indirect register offset 672 * @reg_data: indirect register data 673 * 674 */ 675 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 676 u32 pcie_index, u32 pcie_data, 677 u32 reg_addr, u32 reg_data) 678 { 679 unsigned long flags; 680 void __iomem *pcie_index_offset; 681 void __iomem *pcie_data_offset; 682 683 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 684 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 685 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 686 687 writel(reg_addr, pcie_index_offset); 688 readl(pcie_index_offset); 689 writel(reg_data, pcie_data_offset); 690 readl(pcie_data_offset); 691 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 692 } 693 694 /** 695 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 696 * 697 * @adev: amdgpu_device pointer 698 * @pcie_index: mmio register offset 699 * @pcie_data: mmio register offset 700 * @reg_addr: indirect register offset 701 * @reg_data: indirect register data 702 * 703 */ 704 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 705 u32 pcie_index, u32 pcie_data, 706 u32 reg_addr, u64 reg_data) 707 { 708 unsigned long flags; 709 void __iomem *pcie_index_offset; 710 void __iomem *pcie_data_offset; 711 712 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 713 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 714 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 715 716 /* write low 32 bits */ 717 writel(reg_addr, pcie_index_offset); 718 readl(pcie_index_offset); 719 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 720 readl(pcie_data_offset); 721 /* write high 32 bits */ 722 writel(reg_addr + 4, pcie_index_offset); 723 readl(pcie_index_offset); 724 writel((u32)(reg_data >> 32), pcie_data_offset); 725 readl(pcie_data_offset); 726 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 727 } 728 729 /** 730 * amdgpu_invalid_rreg - dummy reg read function 731 * 732 * @adev: amdgpu_device pointer 733 * @reg: offset of register 734 * 735 * Dummy register read function. Used for register blocks 736 * that certain asics don't have (all asics). 737 * Returns the value in the register. 738 */ 739 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 740 { 741 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 742 BUG(); 743 return 0; 744 } 745 746 /** 747 * amdgpu_invalid_wreg - dummy reg write function 748 * 749 * @adev: amdgpu_device pointer 750 * @reg: offset of register 751 * @v: value to write to the register 752 * 753 * Dummy register read function. Used for register blocks 754 * that certain asics don't have (all asics). 755 */ 756 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 757 { 758 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 759 reg, v); 760 BUG(); 761 } 762 763 /** 764 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 765 * 766 * @adev: amdgpu_device pointer 767 * @reg: offset of register 768 * 769 * Dummy register read function. Used for register blocks 770 * that certain asics don't have (all asics). 771 * Returns the value in the register. 
772 */ 773 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 774 { 775 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 776 BUG(); 777 return 0; 778 } 779 780 /** 781 * amdgpu_invalid_wreg64 - dummy reg write function 782 * 783 * @adev: amdgpu_device pointer 784 * @reg: offset of register 785 * @v: value to write to the register 786 * 787 * Dummy register read function. Used for register blocks 788 * that certain asics don't have (all asics). 789 */ 790 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 791 { 792 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 793 reg, v); 794 BUG(); 795 } 796 797 /** 798 * amdgpu_block_invalid_rreg - dummy reg read function 799 * 800 * @adev: amdgpu_device pointer 801 * @block: offset of instance 802 * @reg: offset of register 803 * 804 * Dummy register read function. Used for register blocks 805 * that certain asics don't have (all asics). 806 * Returns the value in the register. 807 */ 808 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 809 uint32_t block, uint32_t reg) 810 { 811 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 812 reg, block); 813 BUG(); 814 return 0; 815 } 816 817 /** 818 * amdgpu_block_invalid_wreg - dummy reg write function 819 * 820 * @adev: amdgpu_device pointer 821 * @block: offset of instance 822 * @reg: offset of register 823 * @v: value to write to the register 824 * 825 * Dummy register read function. Used for register blocks 826 * that certain asics don't have (all asics). 827 */ 828 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 829 uint32_t block, 830 uint32_t reg, uint32_t v) 831 { 832 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 833 reg, block, v); 834 BUG(); 835 } 836 837 /** 838 * amdgpu_device_asic_init - Wrapper for atom asic_init 839 * 840 * @adev: amdgpu_device pointer 841 * 842 * Does any asic specific work and then calls atom asic init. 843 */ 844 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 845 { 846 amdgpu_asic_pre_asic_init(adev); 847 848 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 849 } 850 851 /** 852 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page 853 * 854 * @adev: amdgpu_device pointer 855 * 856 * Allocates a scratch page of VRAM for use by various things in the 857 * driver. 858 */ 859 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev) 860 { 861 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, 862 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM, 863 &adev->vram_scratch.robj, 864 &adev->vram_scratch.gpu_addr, 865 (void **)&adev->vram_scratch.ptr); 866 } 867 868 /** 869 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page 870 * 871 * @adev: amdgpu_device pointer 872 * 873 * Frees the VRAM scratch page. 874 */ 875 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev) 876 { 877 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL); 878 } 879 880 /** 881 * amdgpu_device_program_register_sequence - program an array of registers. 882 * 883 * @adev: amdgpu_device pointer 884 * @registers: pointer to the register array 885 * @array_size: size of the register array 886 * 887 * Programs an array or registers with and and or masks. 888 * This is a helper for setting golden registers. 
889 */ 890 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 891 const u32 *registers, 892 const u32 array_size) 893 { 894 u32 tmp, reg, and_mask, or_mask; 895 int i; 896 897 if (array_size % 3) 898 return; 899 900 for (i = 0; i < array_size; i +=3) { 901 reg = registers[i + 0]; 902 and_mask = registers[i + 1]; 903 or_mask = registers[i + 2]; 904 905 if (and_mask == 0xffffffff) { 906 tmp = or_mask; 907 } else { 908 tmp = RREG32(reg); 909 tmp &= ~and_mask; 910 if (adev->family >= AMDGPU_FAMILY_AI) 911 tmp |= (or_mask & and_mask); 912 else 913 tmp |= or_mask; 914 } 915 WREG32(reg, tmp); 916 } 917 } 918 919 /** 920 * amdgpu_device_pci_config_reset - reset the GPU 921 * 922 * @adev: amdgpu_device pointer 923 * 924 * Resets the GPU using the pci config reset sequence. 925 * Only applicable to asics prior to vega10. 926 */ 927 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 928 { 929 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 930 } 931 932 /* 933 * GPU doorbell aperture helpers function. 934 */ 935 /** 936 * amdgpu_device_doorbell_init - Init doorbell driver information. 937 * 938 * @adev: amdgpu_device pointer 939 * 940 * Init doorbell driver information (CIK) 941 * Returns 0 on success, error on failure. 942 */ 943 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) 944 { 945 946 /* No doorbell on SI hardware generation */ 947 if (adev->asic_type < CHIP_BONAIRE) { 948 adev->doorbell.base = 0; 949 adev->doorbell.size = 0; 950 adev->doorbell.num_doorbells = 0; 951 adev->doorbell.ptr = NULL; 952 return 0; 953 } 954 955 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) 956 return -EINVAL; 957 958 amdgpu_asic_init_doorbell_index(adev); 959 960 /* doorbell bar mapping */ 961 adev->doorbell.base = pci_resource_start(adev->pdev, 2); 962 adev->doorbell.size = pci_resource_len(adev->pdev, 2); 963 964 adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32), 965 adev->doorbell_index.max_assignment+1); 966 if (adev->doorbell.num_doorbells == 0) 967 return -EINVAL; 968 969 /* For Vega, reserve and map two pages on doorbell BAR since SDMA 970 * paging queue doorbell use the second page. The 971 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the 972 * doorbells are in the first page. So with paging queue enabled, 973 * the max num_doorbells should + 1 page (0x400 in dword) 974 */ 975 if (adev->asic_type >= CHIP_VEGA10) 976 adev->doorbell.num_doorbells += 0x400; 977 978 adev->doorbell.ptr = ioremap(adev->doorbell.base, 979 adev->doorbell.num_doorbells * 980 sizeof(u32)); 981 if (adev->doorbell.ptr == NULL) 982 return -ENOMEM; 983 984 return 0; 985 } 986 987 /** 988 * amdgpu_device_doorbell_fini - Tear down doorbell driver information. 989 * 990 * @adev: amdgpu_device pointer 991 * 992 * Tear down doorbell driver information (CIK) 993 */ 994 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) 995 { 996 iounmap(adev->doorbell.ptr); 997 adev->doorbell.ptr = NULL; 998 } 999 1000 1001 1002 /* 1003 * amdgpu_device_wb_*() 1004 * Writeback is the method by which the GPU updates special pages in memory 1005 * with the status of certain GPU events (fences, ring pointers,etc.). 1006 */ 1007 1008 /** 1009 * amdgpu_device_wb_fini - Disable Writeback and free memory 1010 * 1011 * @adev: amdgpu_device pointer 1012 * 1013 * Disables Writeback and frees the Writeback memory (all asics). 1014 * Used at driver shutdown. 
1015 */ 1016 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1017 { 1018 if (adev->wb.wb_obj) { 1019 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1020 &adev->wb.gpu_addr, 1021 (void **)&adev->wb.wb); 1022 adev->wb.wb_obj = NULL; 1023 } 1024 } 1025 1026 /** 1027 * amdgpu_device_wb_init- Init Writeback driver info and allocate memory 1028 * 1029 * @adev: amdgpu_device pointer 1030 * 1031 * Initializes writeback and allocates writeback memory (all asics). 1032 * Used at driver startup. 1033 * Returns 0 on success or an -error on failure. 1034 */ 1035 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1036 { 1037 int r; 1038 1039 if (adev->wb.wb_obj == NULL) { 1040 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1041 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1042 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1043 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1044 (void **)&adev->wb.wb); 1045 if (r) { 1046 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1047 return r; 1048 } 1049 1050 adev->wb.num_wb = AMDGPU_MAX_WB; 1051 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1052 1053 /* clear wb memory */ 1054 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1055 } 1056 1057 return 0; 1058 } 1059 1060 /** 1061 * amdgpu_device_wb_get - Allocate a wb entry 1062 * 1063 * @adev: amdgpu_device pointer 1064 * @wb: wb index 1065 * 1066 * Allocate a wb slot for use by the driver (all asics). 1067 * Returns 0 on success or -EINVAL on failure. 1068 */ 1069 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1070 { 1071 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1072 1073 if (offset < adev->wb.num_wb) { 1074 __set_bit(offset, adev->wb.used); 1075 *wb = offset << 3; /* convert to dw offset */ 1076 return 0; 1077 } else { 1078 return -EINVAL; 1079 } 1080 } 1081 1082 /** 1083 * amdgpu_device_wb_free - Free a wb entry 1084 * 1085 * @adev: amdgpu_device pointer 1086 * @wb: wb index 1087 * 1088 * Free a wb slot allocated for use by the driver (all asics) 1089 */ 1090 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1091 { 1092 wb >>= 3; 1093 if (wb < adev->wb.num_wb) 1094 __clear_bit(wb, adev->wb.used); 1095 } 1096 1097 /** 1098 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1099 * 1100 * @adev: amdgpu_device pointer 1101 * 1102 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1103 * to fail, but if any of the BARs is not accessible after the size we abort 1104 * driver loading by returning -ENODEV. 
1105 */ 1106 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1107 { 1108 u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size); 1109 u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1; 1110 struct pci_bus *root; 1111 struct resource *res; 1112 unsigned i; 1113 u16 cmd; 1114 int r; 1115 1116 /* Bypass for VF */ 1117 if (amdgpu_sriov_vf(adev)) 1118 return 0; 1119 1120 /* skip if the bios has already enabled large BAR */ 1121 if (adev->gmc.real_vram_size && 1122 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1123 return 0; 1124 1125 /* Check if the root BUS has 64bit memory resources */ 1126 root = adev->pdev->bus; 1127 while (root->parent) 1128 root = root->parent; 1129 1130 pci_bus_for_each_resource(root, res, i) { 1131 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1132 res->start > 0x100000000ull) 1133 break; 1134 } 1135 1136 /* Trying to resize is pointless without a root hub window above 4GB */ 1137 if (!res) 1138 return 0; 1139 1140 /* Disable memory decoding while we change the BAR addresses and size */ 1141 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1142 pci_write_config_word(adev->pdev, PCI_COMMAND, 1143 cmd & ~PCI_COMMAND_MEMORY); 1144 1145 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1146 amdgpu_device_doorbell_fini(adev); 1147 if (adev->asic_type >= CHIP_BONAIRE) 1148 pci_release_resource(adev->pdev, 2); 1149 1150 pci_release_resource(adev->pdev, 0); 1151 1152 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1153 if (r == -ENOSPC) 1154 DRM_INFO("Not enough PCI address space for a large BAR."); 1155 else if (r && r != -ENOTSUPP) 1156 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1157 1158 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1159 1160 /* When the doorbell or fb BAR isn't available we have no chance of 1161 * using the device. 1162 */ 1163 r = amdgpu_device_doorbell_init(adev); 1164 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1165 return -ENODEV; 1166 1167 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1168 1169 return 0; 1170 } 1171 1172 /* 1173 * GPU helpers function. 1174 */ 1175 /** 1176 * amdgpu_device_need_post - check if the hw need post or not 1177 * 1178 * @adev: amdgpu_device pointer 1179 * 1180 * Check if the asic has been initialized (all asics) at driver startup 1181 * or post is needed if hw reset is performed. 1182 * Returns true if need or false if not. 
1183 */ 1184 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1185 { 1186 uint32_t reg; 1187 1188 if (amdgpu_sriov_vf(adev)) 1189 return false; 1190 1191 if (amdgpu_passthrough(adev)) { 1192 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1193 * some old smc fw still need driver do vPost otherwise gpu hang, while 1194 * those smc fw version above 22.15 doesn't have this flaw, so we force 1195 * vpost executed for smc version below 22.15 1196 */ 1197 if (adev->asic_type == CHIP_FIJI) { 1198 int err; 1199 uint32_t fw_ver; 1200 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1201 /* force vPost if error occured */ 1202 if (err) 1203 return true; 1204 1205 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1206 if (fw_ver < 0x00160e00) 1207 return true; 1208 } 1209 } 1210 1211 if (adev->has_hw_reset) { 1212 adev->has_hw_reset = false; 1213 return true; 1214 } 1215 1216 /* bios scratch used on CIK+ */ 1217 if (adev->asic_type >= CHIP_BONAIRE) 1218 return amdgpu_atombios_scratch_need_asic_init(adev); 1219 1220 /* check MEM_SIZE for older asics */ 1221 reg = amdgpu_asic_get_config_memsize(adev); 1222 1223 if ((reg != 0) && (reg != 0xffffffff)) 1224 return false; 1225 1226 return true; 1227 } 1228 1229 /* if we get transitioned to only one device, take VGA back */ 1230 /** 1231 * amdgpu_device_vga_set_decode - enable/disable vga decode 1232 * 1233 * @cookie: amdgpu_device pointer 1234 * @state: enable/disable vga decode 1235 * 1236 * Enable/disable vga decode (all asics). 1237 * Returns VGA resource flags. 1238 */ 1239 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state) 1240 { 1241 struct amdgpu_device *adev = cookie; 1242 amdgpu_asic_set_vga_state(adev, state); 1243 if (state) 1244 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1245 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1246 else 1247 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1248 } 1249 1250 /** 1251 * amdgpu_device_check_block_size - validate the vm block size 1252 * 1253 * @adev: amdgpu_device pointer 1254 * 1255 * Validates the vm block size specified via module parameter. 1256 * The vm block size defines number of bits in page table versus page directory, 1257 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1258 * page table and the remaining bits are in the page directory. 1259 */ 1260 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1261 { 1262 /* defines number of bits in page table versus page directory, 1263 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1264 * page table and the remaining bits are in the page directory */ 1265 if (amdgpu_vm_block_size == -1) 1266 return; 1267 1268 if (amdgpu_vm_block_size < 9) { 1269 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1270 amdgpu_vm_block_size); 1271 amdgpu_vm_block_size = -1; 1272 } 1273 } 1274 1275 /** 1276 * amdgpu_device_check_vm_size - validate the vm size 1277 * 1278 * @adev: amdgpu_device pointer 1279 * 1280 * Validates the vm size in GB specified via module parameter. 1281 * The VM size is the size of the GPU virtual memory space in GB. 
1282 */ 1283 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1284 { 1285 /* no need to check the default value */ 1286 if (amdgpu_vm_size == -1) 1287 return; 1288 1289 if (amdgpu_vm_size < 1) { 1290 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1291 amdgpu_vm_size); 1292 amdgpu_vm_size = -1; 1293 } 1294 } 1295 1296 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1297 { 1298 struct sysinfo si; 1299 bool is_os_64 = (sizeof(void *) == 8); 1300 uint64_t total_memory; 1301 uint64_t dram_size_seven_GB = 0x1B8000000; 1302 uint64_t dram_size_three_GB = 0xB8000000; 1303 1304 if (amdgpu_smu_memory_pool_size == 0) 1305 return; 1306 1307 if (!is_os_64) { 1308 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1309 goto def_value; 1310 } 1311 si_meminfo(&si); 1312 total_memory = (uint64_t)si.totalram * si.mem_unit; 1313 1314 if ((amdgpu_smu_memory_pool_size == 1) || 1315 (amdgpu_smu_memory_pool_size == 2)) { 1316 if (total_memory < dram_size_three_GB) 1317 goto def_value1; 1318 } else if ((amdgpu_smu_memory_pool_size == 4) || 1319 (amdgpu_smu_memory_pool_size == 8)) { 1320 if (total_memory < dram_size_seven_GB) 1321 goto def_value1; 1322 } else { 1323 DRM_WARN("Smu memory pool size not supported\n"); 1324 goto def_value; 1325 } 1326 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1327 1328 return; 1329 1330 def_value1: 1331 DRM_WARN("No enough system memory\n"); 1332 def_value: 1333 adev->pm.smu_prv_buffer_size = 0; 1334 } 1335 1336 /** 1337 * amdgpu_device_check_arguments - validate module params 1338 * 1339 * @adev: amdgpu_device pointer 1340 * 1341 * Validates certain module parameters and updates 1342 * the associated values used by the driver (all asics). 1343 */ 1344 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1345 { 1346 if (amdgpu_sched_jobs < 4) { 1347 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1348 amdgpu_sched_jobs); 1349 amdgpu_sched_jobs = 4; 1350 } else if (!is_power_of_2(amdgpu_sched_jobs)){ 1351 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1352 amdgpu_sched_jobs); 1353 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1354 } 1355 1356 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1357 /* gart size must be greater or equal to 32M */ 1358 dev_warn(adev->dev, "gart size (%d) too small\n", 1359 amdgpu_gart_size); 1360 amdgpu_gart_size = -1; 1361 } 1362 1363 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1364 /* gtt size must be greater or equal to 32M */ 1365 dev_warn(adev->dev, "gtt size (%d) too small\n", 1366 amdgpu_gtt_size); 1367 amdgpu_gtt_size = -1; 1368 } 1369 1370 /* valid range is between 4 and 9 inclusive */ 1371 if (amdgpu_vm_fragment_size != -1 && 1372 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1373 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1374 amdgpu_vm_fragment_size = -1; 1375 } 1376 1377 if (amdgpu_sched_hw_submission < 2) { 1378 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1379 amdgpu_sched_hw_submission); 1380 amdgpu_sched_hw_submission = 2; 1381 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1382 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1383 amdgpu_sched_hw_submission); 1384 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1385 } 1386 1387 amdgpu_device_check_smu_prv_buffer_size(adev); 1388 1389 amdgpu_device_check_vm_size(adev); 1390 1391 
amdgpu_device_check_block_size(adev); 1392 1393 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1394 1395 amdgpu_gmc_tmz_set(adev); 1396 1397 amdgpu_gmc_noretry_set(adev); 1398 1399 return 0; 1400 } 1401 1402 /** 1403 * amdgpu_switcheroo_set_state - set switcheroo state 1404 * 1405 * @pdev: pci dev pointer 1406 * @state: vga_switcheroo state 1407 * 1408 * Callback for the switcheroo driver. Suspends or resumes the 1409 * the asics before or after it is powered up using ACPI methods. 1410 */ 1411 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1412 enum vga_switcheroo_state state) 1413 { 1414 struct drm_device *dev = pci_get_drvdata(pdev); 1415 int r; 1416 1417 if (amdgpu_device_supports_atpx(dev) && state == VGA_SWITCHEROO_OFF) 1418 return; 1419 1420 if (state == VGA_SWITCHEROO_ON) { 1421 pr_info("switched on\n"); 1422 /* don't suspend or resume card normally */ 1423 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1424 1425 pci_set_power_state(dev->pdev, PCI_D0); 1426 amdgpu_device_load_pci_state(dev->pdev); 1427 r = pci_enable_device(dev->pdev); 1428 if (r) 1429 DRM_WARN("pci_enable_device failed (%d)\n", r); 1430 amdgpu_device_resume(dev, true); 1431 1432 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1433 drm_kms_helper_poll_enable(dev); 1434 } else { 1435 pr_info("switched off\n"); 1436 drm_kms_helper_poll_disable(dev); 1437 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1438 amdgpu_device_suspend(dev, true); 1439 amdgpu_device_cache_pci_state(dev->pdev); 1440 /* Shut down the device */ 1441 pci_disable_device(dev->pdev); 1442 pci_set_power_state(dev->pdev, PCI_D3cold); 1443 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1444 } 1445 } 1446 1447 /** 1448 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1449 * 1450 * @pdev: pci dev pointer 1451 * 1452 * Callback for the switcheroo driver. Check of the switcheroo 1453 * state can be changed. 1454 * Returns true if the state can be changed, false if not. 1455 */ 1456 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1457 { 1458 struct drm_device *dev = pci_get_drvdata(pdev); 1459 1460 /* 1461 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1462 * locking inversion with the driver load path. And the access here is 1463 * completely racy anyway. So don't bother with locking for now. 1464 */ 1465 return atomic_read(&dev->open_count) == 0; 1466 } 1467 1468 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1469 .set_gpu_state = amdgpu_switcheroo_set_state, 1470 .reprobe = NULL, 1471 .can_switch = amdgpu_switcheroo_can_switch, 1472 }; 1473 1474 /** 1475 * amdgpu_device_ip_set_clockgating_state - set the CG state 1476 * 1477 * @dev: amdgpu_device pointer 1478 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1479 * @state: clockgating state (gate or ungate) 1480 * 1481 * Sets the requested clockgating state for all instances of 1482 * the hardware IP specified. 1483 * Returns the error code from the last instance. 
1484 */ 1485 int amdgpu_device_ip_set_clockgating_state(void *dev, 1486 enum amd_ip_block_type block_type, 1487 enum amd_clockgating_state state) 1488 { 1489 struct amdgpu_device *adev = dev; 1490 int i, r = 0; 1491 1492 for (i = 0; i < adev->num_ip_blocks; i++) { 1493 if (!adev->ip_blocks[i].status.valid) 1494 continue; 1495 if (adev->ip_blocks[i].version->type != block_type) 1496 continue; 1497 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1498 continue; 1499 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1500 (void *)adev, state); 1501 if (r) 1502 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1503 adev->ip_blocks[i].version->funcs->name, r); 1504 } 1505 return r; 1506 } 1507 1508 /** 1509 * amdgpu_device_ip_set_powergating_state - set the PG state 1510 * 1511 * @dev: amdgpu_device pointer 1512 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1513 * @state: powergating state (gate or ungate) 1514 * 1515 * Sets the requested powergating state for all instances of 1516 * the hardware IP specified. 1517 * Returns the error code from the last instance. 1518 */ 1519 int amdgpu_device_ip_set_powergating_state(void *dev, 1520 enum amd_ip_block_type block_type, 1521 enum amd_powergating_state state) 1522 { 1523 struct amdgpu_device *adev = dev; 1524 int i, r = 0; 1525 1526 for (i = 0; i < adev->num_ip_blocks; i++) { 1527 if (!adev->ip_blocks[i].status.valid) 1528 continue; 1529 if (adev->ip_blocks[i].version->type != block_type) 1530 continue; 1531 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 1532 continue; 1533 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 1534 (void *)adev, state); 1535 if (r) 1536 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 1537 adev->ip_blocks[i].version->funcs->name, r); 1538 } 1539 return r; 1540 } 1541 1542 /** 1543 * amdgpu_device_ip_get_clockgating_state - get the CG state 1544 * 1545 * @adev: amdgpu_device pointer 1546 * @flags: clockgating feature flags 1547 * 1548 * Walks the list of IPs on the device and updates the clockgating 1549 * flags for each IP. 1550 * Updates @flags with the feature flags for each hardware IP where 1551 * clockgating is enabled. 1552 */ 1553 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 1554 u32 *flags) 1555 { 1556 int i; 1557 1558 for (i = 0; i < adev->num_ip_blocks; i++) { 1559 if (!adev->ip_blocks[i].status.valid) 1560 continue; 1561 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 1562 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 1563 } 1564 } 1565 1566 /** 1567 * amdgpu_device_ip_wait_for_idle - wait for idle 1568 * 1569 * @adev: amdgpu_device pointer 1570 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1571 * 1572 * Waits for the request hardware IP to be idle. 1573 * Returns 0 for success or a negative error code on failure. 
1574 */ 1575 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 1576 enum amd_ip_block_type block_type) 1577 { 1578 int i, r; 1579 1580 for (i = 0; i < adev->num_ip_blocks; i++) { 1581 if (!adev->ip_blocks[i].status.valid) 1582 continue; 1583 if (adev->ip_blocks[i].version->type == block_type) { 1584 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 1585 if (r) 1586 return r; 1587 break; 1588 } 1589 } 1590 return 0; 1591 1592 } 1593 1594 /** 1595 * amdgpu_device_ip_is_idle - is the hardware IP idle 1596 * 1597 * @adev: amdgpu_device pointer 1598 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1599 * 1600 * Check if the hardware IP is idle or not. 1601 * Returns true if it the IP is idle, false if not. 1602 */ 1603 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 1604 enum amd_ip_block_type block_type) 1605 { 1606 int i; 1607 1608 for (i = 0; i < adev->num_ip_blocks; i++) { 1609 if (!adev->ip_blocks[i].status.valid) 1610 continue; 1611 if (adev->ip_blocks[i].version->type == block_type) 1612 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 1613 } 1614 return true; 1615 1616 } 1617 1618 /** 1619 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 1620 * 1621 * @adev: amdgpu_device pointer 1622 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 1623 * 1624 * Returns a pointer to the hardware IP block structure 1625 * if it exists for the asic, otherwise NULL. 1626 */ 1627 struct amdgpu_ip_block * 1628 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 1629 enum amd_ip_block_type type) 1630 { 1631 int i; 1632 1633 for (i = 0; i < adev->num_ip_blocks; i++) 1634 if (adev->ip_blocks[i].version->type == type) 1635 return &adev->ip_blocks[i]; 1636 1637 return NULL; 1638 } 1639 1640 /** 1641 * amdgpu_device_ip_block_version_cmp 1642 * 1643 * @adev: amdgpu_device pointer 1644 * @type: enum amd_ip_block_type 1645 * @major: major version 1646 * @minor: minor version 1647 * 1648 * return 0 if equal or greater 1649 * return 1 if smaller or the ip_block doesn't exist 1650 */ 1651 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 1652 enum amd_ip_block_type type, 1653 u32 major, u32 minor) 1654 { 1655 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 1656 1657 if (ip_block && ((ip_block->version->major > major) || 1658 ((ip_block->version->major == major) && 1659 (ip_block->version->minor >= minor)))) 1660 return 0; 1661 1662 return 1; 1663 } 1664 1665 /** 1666 * amdgpu_device_ip_block_add 1667 * 1668 * @adev: amdgpu_device pointer 1669 * @ip_block_version: pointer to the IP to add 1670 * 1671 * Adds the IP block driver information to the collection of IPs 1672 * on the asic. 1673 */ 1674 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 1675 const struct amdgpu_ip_block_version *ip_block_version) 1676 { 1677 if (!ip_block_version) 1678 return -EINVAL; 1679 1680 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 1681 ip_block_version->funcs->name); 1682 1683 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 1684 1685 return 0; 1686 } 1687 1688 /** 1689 * amdgpu_device_enable_virtual_display - enable virtual display feature 1690 * 1691 * @adev: amdgpu_device pointer 1692 * 1693 * Enabled the virtual display feature if the user has enabled it via 1694 * the module parameter virtual_display. This feature provides a virtual 1695 * display hardware on headless boards or in virtualized environments. 
1696 * This function parses and validates the configuration string specified by 1697 * the user and configues the virtual display configuration (number of 1698 * virtual connectors, crtcs, etc.) specified. 1699 */ 1700 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 1701 { 1702 adev->enable_virtual_display = false; 1703 1704 if (amdgpu_virtual_display) { 1705 struct drm_device *ddev = adev_to_drm(adev); 1706 const char *pci_address_name = pci_name(ddev->pdev); 1707 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 1708 1709 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 1710 pciaddstr_tmp = pciaddstr; 1711 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 1712 pciaddname = strsep(&pciaddname_tmp, ","); 1713 if (!strcmp("all", pciaddname) 1714 || !strcmp(pci_address_name, pciaddname)) { 1715 long num_crtc; 1716 int res = -1; 1717 1718 adev->enable_virtual_display = true; 1719 1720 if (pciaddname_tmp) 1721 res = kstrtol(pciaddname_tmp, 10, 1722 &num_crtc); 1723 1724 if (!res) { 1725 if (num_crtc < 1) 1726 num_crtc = 1; 1727 if (num_crtc > 6) 1728 num_crtc = 6; 1729 adev->mode_info.num_crtc = num_crtc; 1730 } else { 1731 adev->mode_info.num_crtc = 1; 1732 } 1733 break; 1734 } 1735 } 1736 1737 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 1738 amdgpu_virtual_display, pci_address_name, 1739 adev->enable_virtual_display, adev->mode_info.num_crtc); 1740 1741 kfree(pciaddstr); 1742 } 1743 } 1744 1745 /** 1746 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 1747 * 1748 * @adev: amdgpu_device pointer 1749 * 1750 * Parses the asic configuration parameters specified in the gpu info 1751 * firmware and makes them availale to the driver for use in configuring 1752 * the asic. 1753 * Returns 0 on success, -EINVAL on failure. 1754 */ 1755 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 1756 { 1757 const char *chip_name; 1758 char fw_name[40]; 1759 int err; 1760 const struct gpu_info_firmware_header_v1_0 *hdr; 1761 1762 adev->firmware.gpu_info_fw = NULL; 1763 1764 if (adev->mman.discovery_bin) { 1765 amdgpu_discovery_get_gfx_info(adev); 1766 1767 /* 1768 * FIXME: The bounding box is still needed by Navi12, so 1769 * temporarily read it from gpu_info firmware. Should be droped 1770 * when DAL no longer needs it. 
1771 */ 1772 if (adev->asic_type != CHIP_NAVI12) 1773 return 0; 1774 } 1775 1776 switch (adev->asic_type) { 1777 #ifdef CONFIG_DRM_AMDGPU_SI 1778 case CHIP_VERDE: 1779 case CHIP_TAHITI: 1780 case CHIP_PITCAIRN: 1781 case CHIP_OLAND: 1782 case CHIP_HAINAN: 1783 #endif 1784 #ifdef CONFIG_DRM_AMDGPU_CIK 1785 case CHIP_BONAIRE: 1786 case CHIP_HAWAII: 1787 case CHIP_KAVERI: 1788 case CHIP_KABINI: 1789 case CHIP_MULLINS: 1790 #endif 1791 case CHIP_TOPAZ: 1792 case CHIP_TONGA: 1793 case CHIP_FIJI: 1794 case CHIP_POLARIS10: 1795 case CHIP_POLARIS11: 1796 case CHIP_POLARIS12: 1797 case CHIP_VEGAM: 1798 case CHIP_CARRIZO: 1799 case CHIP_STONEY: 1800 case CHIP_VEGA20: 1801 case CHIP_SIENNA_CICHLID: 1802 case CHIP_NAVY_FLOUNDER: 1803 case CHIP_DIMGREY_CAVEFISH: 1804 default: 1805 return 0; 1806 case CHIP_VEGA10: 1807 chip_name = "vega10"; 1808 break; 1809 case CHIP_VEGA12: 1810 chip_name = "vega12"; 1811 break; 1812 case CHIP_RAVEN: 1813 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1814 chip_name = "raven2"; 1815 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1816 chip_name = "picasso"; 1817 else 1818 chip_name = "raven"; 1819 break; 1820 case CHIP_ARCTURUS: 1821 chip_name = "arcturus"; 1822 break; 1823 case CHIP_RENOIR: 1824 if (adev->apu_flags & AMD_APU_IS_RENOIR) 1825 chip_name = "renoir"; 1826 else 1827 chip_name = "green_sardine"; 1828 break; 1829 case CHIP_NAVI10: 1830 chip_name = "navi10"; 1831 break; 1832 case CHIP_NAVI14: 1833 chip_name = "navi14"; 1834 break; 1835 case CHIP_NAVI12: 1836 chip_name = "navi12"; 1837 break; 1838 case CHIP_VANGOGH: 1839 chip_name = "vangogh"; 1840 break; 1841 } 1842 1843 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 1844 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); 1845 if (err) { 1846 dev_err(adev->dev, 1847 "Failed to load gpu_info firmware \"%s\"\n", 1848 fw_name); 1849 goto out; 1850 } 1851 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); 1852 if (err) { 1853 dev_err(adev->dev, 1854 "Failed to validate gpu_info firmware \"%s\"\n", 1855 fw_name); 1856 goto out; 1857 } 1858 1859 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 1860 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 1861 1862 switch (hdr->version_major) { 1863 case 1: 1864 { 1865 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 1866 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 1867 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1868 1869 /* 1870 * Should be droped when DAL no longer needs it. 
1871 */ 1872 if (adev->asic_type == CHIP_NAVI12) 1873 goto parse_soc_bounding_box; 1874 1875 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 1876 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 1877 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 1878 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 1879 adev->gfx.config.max_texture_channel_caches = 1880 le32_to_cpu(gpu_info_fw->gc_num_tccs); 1881 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 1882 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 1883 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 1884 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 1885 adev->gfx.config.double_offchip_lds_buf = 1886 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 1887 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 1888 adev->gfx.cu_info.max_waves_per_simd = 1889 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 1890 adev->gfx.cu_info.max_scratch_slots_per_cu = 1891 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 1892 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 1893 if (hdr->version_minor >= 1) { 1894 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 1895 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 1896 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1897 adev->gfx.config.num_sc_per_sh = 1898 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 1899 adev->gfx.config.num_packer_per_sc = 1900 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 1901 } 1902 1903 parse_soc_bounding_box: 1904 /* 1905 * soc bounding box info is not integrated in disocovery table, 1906 * we always need to parse it from gpu info firmware if needed. 1907 */ 1908 if (hdr->version_minor == 2) { 1909 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 1910 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 1911 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1912 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 1913 } 1914 break; 1915 } 1916 default: 1917 dev_err(adev->dev, 1918 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 1919 err = -EINVAL; 1920 goto out; 1921 } 1922 out: 1923 return err; 1924 } 1925 1926 /** 1927 * amdgpu_device_ip_early_init - run early init for hardware IPs 1928 * 1929 * @adev: amdgpu_device pointer 1930 * 1931 * Early initialization pass for hardware IPs. The hardware IPs that make 1932 * up each asic are discovered each IP's early_init callback is run. This 1933 * is the first stage in initializing the asic. 1934 * Returns 0 on success, negative error code on failure. 
1935 */ 1936 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 1937 { 1938 int i, r; 1939 1940 amdgpu_device_enable_virtual_display(adev); 1941 1942 if (amdgpu_sriov_vf(adev)) { 1943 r = amdgpu_virt_request_full_gpu(adev, true); 1944 if (r) 1945 return r; 1946 } 1947 1948 switch (adev->asic_type) { 1949 #ifdef CONFIG_DRM_AMDGPU_SI 1950 case CHIP_VERDE: 1951 case CHIP_TAHITI: 1952 case CHIP_PITCAIRN: 1953 case CHIP_OLAND: 1954 case CHIP_HAINAN: 1955 adev->family = AMDGPU_FAMILY_SI; 1956 r = si_set_ip_blocks(adev); 1957 if (r) 1958 return r; 1959 break; 1960 #endif 1961 #ifdef CONFIG_DRM_AMDGPU_CIK 1962 case CHIP_BONAIRE: 1963 case CHIP_HAWAII: 1964 case CHIP_KAVERI: 1965 case CHIP_KABINI: 1966 case CHIP_MULLINS: 1967 if (adev->flags & AMD_IS_APU) 1968 adev->family = AMDGPU_FAMILY_KV; 1969 else 1970 adev->family = AMDGPU_FAMILY_CI; 1971 1972 r = cik_set_ip_blocks(adev); 1973 if (r) 1974 return r; 1975 break; 1976 #endif 1977 case CHIP_TOPAZ: 1978 case CHIP_TONGA: 1979 case CHIP_FIJI: 1980 case CHIP_POLARIS10: 1981 case CHIP_POLARIS11: 1982 case CHIP_POLARIS12: 1983 case CHIP_VEGAM: 1984 case CHIP_CARRIZO: 1985 case CHIP_STONEY: 1986 if (adev->flags & AMD_IS_APU) 1987 adev->family = AMDGPU_FAMILY_CZ; 1988 else 1989 adev->family = AMDGPU_FAMILY_VI; 1990 1991 r = vi_set_ip_blocks(adev); 1992 if (r) 1993 return r; 1994 break; 1995 case CHIP_VEGA10: 1996 case CHIP_VEGA12: 1997 case CHIP_VEGA20: 1998 case CHIP_RAVEN: 1999 case CHIP_ARCTURUS: 2000 case CHIP_RENOIR: 2001 if (adev->flags & AMD_IS_APU) 2002 adev->family = AMDGPU_FAMILY_RV; 2003 else 2004 adev->family = AMDGPU_FAMILY_AI; 2005 2006 r = soc15_set_ip_blocks(adev); 2007 if (r) 2008 return r; 2009 break; 2010 case CHIP_NAVI10: 2011 case CHIP_NAVI14: 2012 case CHIP_NAVI12: 2013 case CHIP_SIENNA_CICHLID: 2014 case CHIP_NAVY_FLOUNDER: 2015 case CHIP_DIMGREY_CAVEFISH: 2016 case CHIP_VANGOGH: 2017 if (adev->asic_type == CHIP_VANGOGH) 2018 adev->family = AMDGPU_FAMILY_VGH; 2019 else 2020 adev->family = AMDGPU_FAMILY_NV; 2021 2022 r = nv_set_ip_blocks(adev); 2023 if (r) 2024 return r; 2025 break; 2026 default: 2027 /* FIXME: not supported yet */ 2028 return -EINVAL; 2029 } 2030 2031 amdgpu_amdkfd_device_probe(adev); 2032 2033 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2034 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2035 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2036 2037 for (i = 0; i < adev->num_ip_blocks; i++) { 2038 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2039 DRM_ERROR("disabled ip block: %d <%s>\n", 2040 i, adev->ip_blocks[i].version->funcs->name); 2041 adev->ip_blocks[i].status.valid = false; 2042 } else { 2043 if (adev->ip_blocks[i].version->funcs->early_init) { 2044 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2045 if (r == -ENOENT) { 2046 adev->ip_blocks[i].status.valid = false; 2047 } else if (r) { 2048 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2049 adev->ip_blocks[i].version->funcs->name, r); 2050 return r; 2051 } else { 2052 adev->ip_blocks[i].status.valid = true; 2053 } 2054 } else { 2055 adev->ip_blocks[i].status.valid = true; 2056 } 2057 } 2058 /* get the vbios after the asic_funcs are set up */ 2059 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2060 r = amdgpu_device_parse_gpu_info_fw(adev); 2061 if (r) 2062 return r; 2063 2064 /* Read BIOS */ 2065 if (!amdgpu_get_bios(adev)) 2066 return -EINVAL; 2067 2068 r = amdgpu_atombios_init(adev); 2069 if (r) { 2070 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2071 
amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2072 return r; 2073 } 2074 } 2075 } 2076 2077 adev->cg_flags &= amdgpu_cg_mask; 2078 adev->pg_flags &= amdgpu_pg_mask; 2079 2080 return 0; 2081 } 2082 2083 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2084 { 2085 int i, r; 2086 2087 for (i = 0; i < adev->num_ip_blocks; i++) { 2088 if (!adev->ip_blocks[i].status.sw) 2089 continue; 2090 if (adev->ip_blocks[i].status.hw) 2091 continue; 2092 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2093 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2094 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2095 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2096 if (r) { 2097 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2098 adev->ip_blocks[i].version->funcs->name, r); 2099 return r; 2100 } 2101 adev->ip_blocks[i].status.hw = true; 2102 } 2103 } 2104 2105 return 0; 2106 } 2107 2108 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2109 { 2110 int i, r; 2111 2112 for (i = 0; i < adev->num_ip_blocks; i++) { 2113 if (!adev->ip_blocks[i].status.sw) 2114 continue; 2115 if (adev->ip_blocks[i].status.hw) 2116 continue; 2117 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2118 if (r) { 2119 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2120 adev->ip_blocks[i].version->funcs->name, r); 2121 return r; 2122 } 2123 adev->ip_blocks[i].status.hw = true; 2124 } 2125 2126 return 0; 2127 } 2128 2129 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2130 { 2131 int r = 0; 2132 int i; 2133 uint32_t smu_version; 2134 2135 if (adev->asic_type >= CHIP_VEGA10) { 2136 for (i = 0; i < adev->num_ip_blocks; i++) { 2137 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2138 continue; 2139 2140 /* no need to do the fw loading again if already done*/ 2141 if (adev->ip_blocks[i].status.hw == true) 2142 break; 2143 2144 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2145 r = adev->ip_blocks[i].version->funcs->resume(adev); 2146 if (r) { 2147 DRM_ERROR("resume of IP block <%s> failed %d\n", 2148 adev->ip_blocks[i].version->funcs->name, r); 2149 return r; 2150 } 2151 } else { 2152 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2153 if (r) { 2154 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2155 adev->ip_blocks[i].version->funcs->name, r); 2156 return r; 2157 } 2158 } 2159 2160 adev->ip_blocks[i].status.hw = true; 2161 break; 2162 } 2163 } 2164 2165 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2166 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2167 2168 return r; 2169 } 2170 2171 /** 2172 * amdgpu_device_ip_init - run init for hardware IPs 2173 * 2174 * @adev: amdgpu_device pointer 2175 * 2176 * Main initialization pass for hardware IPs. The list of all the hardware 2177 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2178 * are run. sw_init initializes the software state associated with each IP 2179 * and hw_init initializes the hardware associated with each IP. 2180 * Returns 0 on success, negative error code on failure. 
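 *
 * In outline, the ordering implemented below is: sw_init for every valid
 * block (the GMC block additionally gets hw_init early so VRAM scratch,
 * writeback and, when needed, the static CSA can be allocated), then
 * amdgpu_ib_pool_init() and amdgpu_ucode_create_bo(), then hw_init phase 1
 * (COMMON, IH, and PSP for SR-IOV VFs), then amdgpu_device_fw_loading(),
 * then hw_init phase 2 for the remaining blocks, and finally
 * amdgpu_ras_recovery_init() plus XGMI and KFD device init.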
2181 */ 2182 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2183 { 2184 int i, r; 2185 2186 r = amdgpu_ras_init(adev); 2187 if (r) 2188 return r; 2189 2190 for (i = 0; i < adev->num_ip_blocks; i++) { 2191 if (!adev->ip_blocks[i].status.valid) 2192 continue; 2193 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2194 if (r) { 2195 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2196 adev->ip_blocks[i].version->funcs->name, r); 2197 goto init_failed; 2198 } 2199 adev->ip_blocks[i].status.sw = true; 2200 2201 /* need to do gmc hw init early so we can allocate gpu mem */ 2202 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2203 r = amdgpu_device_vram_scratch_init(adev); 2204 if (r) { 2205 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2206 goto init_failed; 2207 } 2208 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2209 if (r) { 2210 DRM_ERROR("hw_init %d failed %d\n", i, r); 2211 goto init_failed; 2212 } 2213 r = amdgpu_device_wb_init(adev); 2214 if (r) { 2215 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2216 goto init_failed; 2217 } 2218 adev->ip_blocks[i].status.hw = true; 2219 2220 /* right after GMC hw init, we create CSA */ 2221 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2222 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2223 AMDGPU_GEM_DOMAIN_VRAM, 2224 AMDGPU_CSA_SIZE); 2225 if (r) { 2226 DRM_ERROR("allocate CSA failed %d\n", r); 2227 goto init_failed; 2228 } 2229 } 2230 } 2231 } 2232 2233 if (amdgpu_sriov_vf(adev)) 2234 amdgpu_virt_init_data_exchange(adev); 2235 2236 r = amdgpu_ib_pool_init(adev); 2237 if (r) { 2238 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2239 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2240 goto init_failed; 2241 } 2242 2243 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2244 if (r) 2245 goto init_failed; 2246 2247 r = amdgpu_device_ip_hw_init_phase1(adev); 2248 if (r) 2249 goto init_failed; 2250 2251 r = amdgpu_device_fw_loading(adev); 2252 if (r) 2253 goto init_failed; 2254 2255 r = amdgpu_device_ip_hw_init_phase2(adev); 2256 if (r) 2257 goto init_failed; 2258 2259 /* 2260 * retired pages will be loaded from eeprom and reserved here, 2261 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2262 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2263 * for I2C communication which only true at this point. 2264 * 2265 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2266 * failure from bad gpu situation and stop amdgpu init process 2267 * accordingly. For other failed cases, it will still release all 2268 * the resource and print error message, rather than returning one 2269 * negative value to upper level. 2270 * 2271 * Note: theoretically, this should be called before all vram allocations 2272 * to protect retired page from abusing 2273 */ 2274 r = amdgpu_ras_recovery_init(adev); 2275 if (r) 2276 goto init_failed; 2277 2278 if (adev->gmc.xgmi.num_physical_nodes > 1) 2279 amdgpu_xgmi_add_device(adev); 2280 amdgpu_amdkfd_device_init(adev); 2281 2282 amdgpu_fru_get_product_info(adev); 2283 2284 init_failed: 2285 if (amdgpu_sriov_vf(adev)) 2286 amdgpu_virt_release_full_gpu(adev, true); 2287 2288 return r; 2289 } 2290 2291 /** 2292 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2293 * 2294 * @adev: amdgpu_device pointer 2295 * 2296 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2297 * this function before a GPU reset. 
If the value is retained after a 2298 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 2299 */ 2300 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2301 { 2302 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2303 } 2304 2305 /** 2306 * amdgpu_device_check_vram_lost - check if vram is valid 2307 * 2308 * @adev: amdgpu_device pointer 2309 * 2310 * Checks the reset magic value written to the gart pointer in VRAM. 2311 * The driver calls this after a GPU reset to see if the contents of 2312 * VRAM are lost or not. 2313 * Returns true if vram is lost, false if not. 2314 */ 2315 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2316 { 2317 if (memcmp(adev->gart.ptr, adev->reset_magic, 2318 AMDGPU_RESET_MAGIC_NUM)) 2319 return true; 2320 2321 if (!amdgpu_in_reset(adev)) 2322 return false; 2323 2324 /* 2325 * For all ASICs with baco/mode1 reset, the VRAM is 2326 * always assumed to be lost. 2327 */ 2328 switch (amdgpu_asic_reset_method(adev)) { 2329 case AMD_RESET_METHOD_BACO: 2330 case AMD_RESET_METHOD_MODE1: 2331 return true; 2332 default: 2333 return false; 2334 } 2335 } 2336 2337 /** 2338 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2339 * 2340 * @adev: amdgpu_device pointer 2341 * @state: clockgating state (gate or ungate) 2342 * 2343 * The list of all the hardware IPs that make up the asic is walked and the 2344 * set_clockgating_state callbacks are run. 2345 * The late initialization pass enables clockgating for hardware IPs, 2346 * while the fini or suspend pass disables it. 2347 * Returns 0 on success, negative error code on failure. 2348 */ 2349 2350 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2351 enum amd_clockgating_state state) 2352 { 2353 int i, j, r; 2354 2355 if (amdgpu_emu_mode == 1) 2356 return 0; 2357 2358 for (j = 0; j < adev->num_ip_blocks; j++) { 2359 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2360 if (!adev->ip_blocks[i].status.late_initialized) 2361 continue; 2362 /* skip CG for VCE/UVD, it's handled specially */ 2363 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2364 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2365 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2366 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2367 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2368 /* enable clockgating to save power */ 2369 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2370 state); 2371 if (r) { 2372 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2373 adev->ip_blocks[i].version->funcs->name, r); 2374 return r; 2375 } 2376 } 2377 } 2378 2379 return 0; 2380 } 2381 2382 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state) 2383 { 2384 int i, j, r; 2385 2386 if (amdgpu_emu_mode == 1) 2387 return 0; 2388 2389 for (j = 0; j < adev->num_ip_blocks; j++) { 2390 i = state == AMD_PG_STATE_GATE ?
j : adev->num_ip_blocks - j - 1; 2391 if (!adev->ip_blocks[i].status.late_initialized) 2392 continue; 2393 /* skip CG for VCE/UVD, it's handled specially */ 2394 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2395 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2396 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2397 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2398 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2399 /* enable powergating to save power */ 2400 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2401 state); 2402 if (r) { 2403 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2404 adev->ip_blocks[i].version->funcs->name, r); 2405 return r; 2406 } 2407 } 2408 } 2409 return 0; 2410 } 2411 2412 static int amdgpu_device_enable_mgpu_fan_boost(void) 2413 { 2414 struct amdgpu_gpu_instance *gpu_ins; 2415 struct amdgpu_device *adev; 2416 int i, ret = 0; 2417 2418 mutex_lock(&mgpu_info.mutex); 2419 2420 /* 2421 * MGPU fan boost feature should be enabled 2422 * only when there are two or more dGPUs in 2423 * the system 2424 */ 2425 if (mgpu_info.num_dgpu < 2) 2426 goto out; 2427 2428 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2429 gpu_ins = &(mgpu_info.gpu_ins[i]); 2430 adev = gpu_ins->adev; 2431 if (!(adev->flags & AMD_IS_APU) && 2432 !gpu_ins->mgpu_fan_enabled) { 2433 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2434 if (ret) 2435 break; 2436 2437 gpu_ins->mgpu_fan_enabled = 1; 2438 } 2439 } 2440 2441 out: 2442 mutex_unlock(&mgpu_info.mutex); 2443 2444 return ret; 2445 } 2446 2447 /** 2448 * amdgpu_device_ip_late_init - run late init for hardware IPs 2449 * 2450 * @adev: amdgpu_device pointer 2451 * 2452 * Late initialization pass for hardware IPs. The list of all the hardware 2453 * IPs that make up the asic is walked and the late_init callbacks are run. 2454 * late_init covers any special initialization that an IP requires 2455 * after all of the have been initialized or something that needs to happen 2456 * late in the init process. 2457 * Returns 0 on success, negative error code on failure. 2458 */ 2459 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2460 { 2461 struct amdgpu_gpu_instance *gpu_instance; 2462 int i = 0, r; 2463 2464 for (i = 0; i < adev->num_ip_blocks; i++) { 2465 if (!adev->ip_blocks[i].status.hw) 2466 continue; 2467 if (adev->ip_blocks[i].version->funcs->late_init) { 2468 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2469 if (r) { 2470 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2471 adev->ip_blocks[i].version->funcs->name, r); 2472 return r; 2473 } 2474 } 2475 adev->ip_blocks[i].status.late_initialized = true; 2476 } 2477 2478 amdgpu_ras_set_error_query_ready(adev, true); 2479 2480 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2481 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2482 2483 amdgpu_device_fill_reset_magic(adev); 2484 2485 r = amdgpu_device_enable_mgpu_fan_boost(); 2486 if (r) 2487 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2488 2489 2490 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2491 mutex_lock(&mgpu_info.mutex); 2492 2493 /* 2494 * Reset device p-state to low as this was booted with high. 2495 * 2496 * This should be performed only after all devices from the same 2497 * hive get initialized. 2498 * 2499 * However, it's unknown how many device in the hive in advance. 2500 * As this is counted one by one during devices initializations. 
2501 * 2502 * So, we wait for all XGMI interlinked devices initialized. 2503 * This may bring some delays as those devices may come from 2504 * different hives. But that should be OK. 2505 */ 2506 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2507 for (i = 0; i < mgpu_info.num_gpu; i++) { 2508 gpu_instance = &(mgpu_info.gpu_ins[i]); 2509 if (gpu_instance->adev->flags & AMD_IS_APU) 2510 continue; 2511 2512 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2513 AMDGPU_XGMI_PSTATE_MIN); 2514 if (r) { 2515 DRM_ERROR("pstate setting failed (%d).\n", r); 2516 break; 2517 } 2518 } 2519 } 2520 2521 mutex_unlock(&mgpu_info.mutex); 2522 } 2523 2524 return 0; 2525 } 2526 2527 /** 2528 * amdgpu_device_ip_fini - run fini for hardware IPs 2529 * 2530 * @adev: amdgpu_device pointer 2531 * 2532 * Main teardown pass for hardware IPs. The list of all the hardware 2533 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2534 * are run. hw_fini tears down the hardware associated with each IP 2535 * and sw_fini tears down any software state associated with each IP. 2536 * Returns 0 on success, negative error code on failure. 2537 */ 2538 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2539 { 2540 int i, r; 2541 2542 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2543 amdgpu_virt_release_ras_err_handler_data(adev); 2544 2545 amdgpu_ras_pre_fini(adev); 2546 2547 if (adev->gmc.xgmi.num_physical_nodes > 1) 2548 amdgpu_xgmi_remove_device(adev); 2549 2550 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2551 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2552 2553 amdgpu_amdkfd_device_fini(adev); 2554 2555 /* need to disable SMC first */ 2556 for (i = 0; i < adev->num_ip_blocks; i++) { 2557 if (!adev->ip_blocks[i].status.hw) 2558 continue; 2559 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2560 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2561 /* XXX handle errors */ 2562 if (r) { 2563 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2564 adev->ip_blocks[i].version->funcs->name, r); 2565 } 2566 adev->ip_blocks[i].status.hw = false; 2567 break; 2568 } 2569 } 2570 2571 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2572 if (!adev->ip_blocks[i].status.hw) 2573 continue; 2574 2575 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2576 /* XXX handle errors */ 2577 if (r) { 2578 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2579 adev->ip_blocks[i].version->funcs->name, r); 2580 } 2581 2582 adev->ip_blocks[i].status.hw = false; 2583 } 2584 2585 2586 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2587 if (!adev->ip_blocks[i].status.sw) 2588 continue; 2589 2590 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2591 amdgpu_ucode_free_bo(adev); 2592 amdgpu_free_static_csa(&adev->virt.csa_obj); 2593 amdgpu_device_wb_fini(adev); 2594 amdgpu_device_vram_scratch_fini(adev); 2595 amdgpu_ib_pool_fini(adev); 2596 } 2597 2598 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2599 /* XXX handle errors */ 2600 if (r) { 2601 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2602 adev->ip_blocks[i].version->funcs->name, r); 2603 } 2604 adev->ip_blocks[i].status.sw = false; 2605 adev->ip_blocks[i].status.valid = false; 2606 } 2607 2608 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2609 if (!adev->ip_blocks[i].status.late_initialized) 2610 continue; 2611 if (adev->ip_blocks[i].version->funcs->late_fini) 2612 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2613 
adev->ip_blocks[i].status.late_initialized = false; 2614 } 2615 2616 amdgpu_ras_fini(adev); 2617 2618 if (amdgpu_sriov_vf(adev)) 2619 if (amdgpu_virt_release_full_gpu(adev, false)) 2620 DRM_ERROR("failed to release exclusive mode on fini\n"); 2621 2622 return 0; 2623 } 2624 2625 /** 2626 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2627 * 2628 * @work: work_struct. 2629 */ 2630 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2631 { 2632 struct amdgpu_device *adev = 2633 container_of(work, struct amdgpu_device, delayed_init_work.work); 2634 int r; 2635 2636 r = amdgpu_ib_ring_tests(adev); 2637 if (r) 2638 DRM_ERROR("ib ring test failed (%d).\n", r); 2639 } 2640 2641 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2642 { 2643 struct amdgpu_device *adev = 2644 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2645 2646 mutex_lock(&adev->gfx.gfx_off_mutex); 2647 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { 2648 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2649 adev->gfx.gfx_off_state = true; 2650 } 2651 mutex_unlock(&adev->gfx.gfx_off_mutex); 2652 } 2653 2654 /** 2655 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2656 * 2657 * @adev: amdgpu_device pointer 2658 * 2659 * Main suspend function for hardware IPs. The list of all the hardware 2660 * IPs that make up the asic is walked, clockgating is disabled and the 2661 * suspend callbacks are run. suspend puts the hardware and software state 2662 * in each IP into a state suitable for suspend. 2663 * Returns 0 on success, negative error code on failure. 2664 */ 2665 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2666 { 2667 int i, r; 2668 2669 if (!amdgpu_acpi_is_s0ix_supported(adev) || amdgpu_in_reset(adev)) { 2670 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2671 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2672 } 2673 2674 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2675 if (!adev->ip_blocks[i].status.valid) 2676 continue; 2677 2678 /* displays are handled separately */ 2679 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2680 continue; 2681 2682 /* XXX handle errors */ 2683 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2684 /* XXX handle errors */ 2685 if (r) { 2686 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2687 adev->ip_blocks[i].version->funcs->name, r); 2688 return r; 2689 } 2690 2691 adev->ip_blocks[i].status.hw = false; 2692 } 2693 2694 return 0; 2695 } 2696 2697 /** 2698 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2699 * 2700 * @adev: amdgpu_device pointer 2701 * 2702 * Main suspend function for hardware IPs. The list of all the hardware 2703 * IPs that make up the asic is walked, clockgating is disabled and the 2704 * suspend callbacks are run. suspend puts the hardware and software state 2705 * in each IP into a state suitable for suspend. 2706 * Returns 0 on success, negative error code on failure. 
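 *
 * Unlike phase 1 above, which only suspends the DCE (display) blocks, this
 * phase walks the remaining blocks in reverse order, skips PSP when a RAS
 * err_event_athub interrupt has been triggered, and on bare metal also asks
 * the SMC to enter the selected mp1 state.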
2707 */ 2708 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2709 { 2710 int i, r; 2711 2712 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2713 if (!adev->ip_blocks[i].status.valid) 2714 continue; 2715 /* displays are handled in phase1 */ 2716 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2717 continue; 2718 /* PSP lost connection when err_event_athub occurs */ 2719 if (amdgpu_ras_intr_triggered() && 2720 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2721 adev->ip_blocks[i].status.hw = false; 2722 continue; 2723 } 2724 /* XXX handle errors */ 2725 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2726 /* XXX handle errors */ 2727 if (r) { 2728 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2729 adev->ip_blocks[i].version->funcs->name, r); 2730 } 2731 adev->ip_blocks[i].status.hw = false; 2732 /* handle putting the SMC in the appropriate state */ 2733 if(!amdgpu_sriov_vf(adev)){ 2734 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2735 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 2736 if (r) { 2737 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 2738 adev->mp1_state, r); 2739 return r; 2740 } 2741 } 2742 } 2743 adev->ip_blocks[i].status.hw = false; 2744 } 2745 2746 return 0; 2747 } 2748 2749 /** 2750 * amdgpu_device_ip_suspend - run suspend for hardware IPs 2751 * 2752 * @adev: amdgpu_device pointer 2753 * 2754 * Main suspend function for hardware IPs. The list of all the hardware 2755 * IPs that make up the asic is walked, clockgating is disabled and the 2756 * suspend callbacks are run. suspend puts the hardware and software state 2757 * in each IP into a state suitable for suspend. 2758 * Returns 0 on success, negative error code on failure. 2759 */ 2760 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 2761 { 2762 int r; 2763 2764 if (amdgpu_sriov_vf(adev)) 2765 amdgpu_virt_request_full_gpu(adev, false); 2766 2767 r = amdgpu_device_ip_suspend_phase1(adev); 2768 if (r) 2769 return r; 2770 r = amdgpu_device_ip_suspend_phase2(adev); 2771 2772 if (amdgpu_sriov_vf(adev)) 2773 amdgpu_virt_release_full_gpu(adev, false); 2774 2775 return r; 2776 } 2777 2778 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 2779 { 2780 int i, r; 2781 2782 static enum amd_ip_block_type ip_order[] = { 2783 AMD_IP_BLOCK_TYPE_GMC, 2784 AMD_IP_BLOCK_TYPE_COMMON, 2785 AMD_IP_BLOCK_TYPE_PSP, 2786 AMD_IP_BLOCK_TYPE_IH, 2787 }; 2788 2789 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2790 int j; 2791 struct amdgpu_ip_block *block; 2792 2793 block = &adev->ip_blocks[i]; 2794 block->status.hw = false; 2795 2796 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 2797 2798 if (block->version->type != ip_order[j] || 2799 !block->status.valid) 2800 continue; 2801 2802 r = block->version->funcs->hw_init(adev); 2803 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2804 if (r) 2805 return r; 2806 block->status.hw = true; 2807 } 2808 } 2809 2810 return 0; 2811 } 2812 2813 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 2814 { 2815 int i, r; 2816 2817 static enum amd_ip_block_type ip_order[] = { 2818 AMD_IP_BLOCK_TYPE_SMC, 2819 AMD_IP_BLOCK_TYPE_DCE, 2820 AMD_IP_BLOCK_TYPE_GFX, 2821 AMD_IP_BLOCK_TYPE_SDMA, 2822 AMD_IP_BLOCK_TYPE_UVD, 2823 AMD_IP_BLOCK_TYPE_VCE, 2824 AMD_IP_BLOCK_TYPE_VCN 2825 }; 2826 2827 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2828 int j; 2829 struct amdgpu_ip_block *block; 2830 2831 for (j = 0; j < adev->num_ip_blocks; j++) { 
2832 block = &adev->ip_blocks[j]; 2833 2834 if (block->version->type != ip_order[i] || 2835 !block->status.valid || 2836 block->status.hw) 2837 continue; 2838 2839 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 2840 r = block->version->funcs->resume(adev); 2841 else 2842 r = block->version->funcs->hw_init(adev); 2843 2844 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2845 if (r) 2846 return r; 2847 block->status.hw = true; 2848 } 2849 } 2850 2851 return 0; 2852 } 2853 2854 /** 2855 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 2856 * 2857 * @adev: amdgpu_device pointer 2858 * 2859 * First resume function for hardware IPs. The list of all the hardware 2860 * IPs that make up the asic is walked and the resume callbacks are run for 2861 * COMMON, GMC, and IH. resume puts the hardware into a functional state 2862 * after a suspend and updates the software state as necessary. This 2863 * function is also used for restoring the GPU after a GPU reset. 2864 * Returns 0 on success, negative error code on failure. 2865 */ 2866 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 2867 { 2868 int i, r; 2869 2870 for (i = 0; i < adev->num_ip_blocks; i++) { 2871 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2872 continue; 2873 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2874 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2875 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2876 2877 r = adev->ip_blocks[i].version->funcs->resume(adev); 2878 if (r) { 2879 DRM_ERROR("resume of IP block <%s> failed %d\n", 2880 adev->ip_blocks[i].version->funcs->name, r); 2881 return r; 2882 } 2883 adev->ip_blocks[i].status.hw = true; 2884 } 2885 } 2886 2887 return 0; 2888 } 2889 2890 /** 2891 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 2892 * 2893 * @adev: amdgpu_device pointer 2894 * 2895 * First resume function for hardware IPs. The list of all the hardware 2896 * IPs that make up the asic is walked and the resume callbacks are run for 2897 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 2898 * functional state after a suspend and updates the software state as 2899 * necessary. This function is also used for restoring the GPU after a GPU 2900 * reset. 2901 * Returns 0 on success, negative error code on failure. 2902 */ 2903 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 2904 { 2905 int i, r; 2906 2907 for (i = 0; i < adev->num_ip_blocks; i++) { 2908 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2909 continue; 2910 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2911 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2912 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 2913 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 2914 continue; 2915 r = adev->ip_blocks[i].version->funcs->resume(adev); 2916 if (r) { 2917 DRM_ERROR("resume of IP block <%s> failed %d\n", 2918 adev->ip_blocks[i].version->funcs->name, r); 2919 return r; 2920 } 2921 adev->ip_blocks[i].status.hw = true; 2922 } 2923 2924 return 0; 2925 } 2926 2927 /** 2928 * amdgpu_device_ip_resume - run resume for hardware IPs 2929 * 2930 * @adev: amdgpu_device pointer 2931 * 2932 * Main resume function for hardware IPs. 
The hardware IPs 2933 * are split into two resume functions because they are 2934 * also used in recovering from a GPU reset and some additional 2935 * steps need to be taken between them. In this case (S3/S4) they are 2936 * run sequentially. 2937 * Returns 0 on success, negative error code on failure. 2938 */ 2939 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 2940 { 2941 int r; 2942 2943 r = amdgpu_device_ip_resume_phase1(adev); 2944 if (r) 2945 return r; 2946 2947 r = amdgpu_device_fw_loading(adev); 2948 if (r) 2949 return r; 2950 2951 r = amdgpu_device_ip_resume_phase2(adev); 2952 2953 return r; 2954 } 2955 2956 /** 2957 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 2958 * 2959 * @adev: amdgpu_device pointer 2960 * 2961 * Query the VBIOS data tables to determine if the board supports SR-IOV. 2962 */ 2963 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 2964 { 2965 if (amdgpu_sriov_vf(adev)) { 2966 if (adev->is_atom_fw) { 2967 if (amdgpu_atomfirmware_gpu_supports_virtualization(adev)) 2968 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 2969 } else { 2970 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 2971 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 2972 } 2973 2974 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 2975 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 2976 } 2977 } 2978 2979 /** 2980 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 2981 * 2982 * @asic_type: AMD asic type 2983 * 2984 * Check if there is DC (new modesetting infrastructure) support for an asic. 2985 * Returns true if DC has support, false if not. 2986 */ 2987 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 2988 { 2989 switch (asic_type) { 2990 #if defined(CONFIG_DRM_AMD_DC) 2991 #if defined(CONFIG_DRM_AMD_DC_SI) 2992 case CHIP_TAHITI: 2993 case CHIP_PITCAIRN: 2994 case CHIP_VERDE: 2995 case CHIP_OLAND: 2996 #endif 2997 case CHIP_BONAIRE: 2998 case CHIP_KAVERI: 2999 case CHIP_KABINI: 3000 case CHIP_MULLINS: 3001 /* 3002 * We have systems in the wild with these ASICs that require 3003 * LVDS and VGA support which is not supported with DC. 3004 * 3005 * Fall back to the non-DC driver here by default so as not to 3006 * cause regressions.
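 *
 * In practice the returns below mean these older chips only get DC when
 * the user explicitly opts in (amdgpu_dc > 0, e.g. loading with the dc=1
 * module option), while the newer ASICs further down use DC unless it is
 * explicitly disabled with dc=0 (amdgpu_dc != 0).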
3007 */ 3008 return amdgpu_dc > 0; 3009 case CHIP_HAWAII: 3010 case CHIP_CARRIZO: 3011 case CHIP_STONEY: 3012 case CHIP_POLARIS10: 3013 case CHIP_POLARIS11: 3014 case CHIP_POLARIS12: 3015 case CHIP_VEGAM: 3016 case CHIP_TONGA: 3017 case CHIP_FIJI: 3018 case CHIP_VEGA10: 3019 case CHIP_VEGA12: 3020 case CHIP_VEGA20: 3021 #if defined(CONFIG_DRM_AMD_DC_DCN) 3022 case CHIP_RAVEN: 3023 case CHIP_NAVI10: 3024 case CHIP_NAVI14: 3025 case CHIP_NAVI12: 3026 case CHIP_RENOIR: 3027 case CHIP_SIENNA_CICHLID: 3028 case CHIP_NAVY_FLOUNDER: 3029 case CHIP_DIMGREY_CAVEFISH: 3030 case CHIP_VANGOGH: 3031 #endif 3032 return amdgpu_dc != 0; 3033 #endif 3034 default: 3035 if (amdgpu_dc > 0) 3036 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3037 "but isn't supported by ASIC, ignoring\n"); 3038 return false; 3039 } 3040 } 3041 3042 /** 3043 * amdgpu_device_has_dc_support - check if dc is supported 3044 * 3045 * @adev: amdgpu_device pointer 3046 * 3047 * Returns true for supported, false for not supported 3048 */ 3049 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3050 { 3051 if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display) 3052 return false; 3053 3054 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3055 } 3056 3057 3058 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3059 { 3060 struct amdgpu_device *adev = 3061 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3062 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3063 3064 /* It's a bug to not have a hive within this function */ 3065 if (WARN_ON(!hive)) 3066 return; 3067 3068 /* 3069 * Use task barrier to synchronize all xgmi reset works across the 3070 * hive. task_barrier_enter and task_barrier_exit will block 3071 * until all the threads running the xgmi reset works reach 3072 * those points. task_barrier_full will do both blocks. 3073 */ 3074 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3075 3076 task_barrier_enter(&hive->tb); 3077 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3078 3079 if (adev->asic_reset_res) 3080 goto fail; 3081 3082 task_barrier_exit(&hive->tb); 3083 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3084 3085 if (adev->asic_reset_res) 3086 goto fail; 3087 3088 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count) 3089 adev->mmhub.funcs->reset_ras_error_count(adev); 3090 } else { 3091 3092 task_barrier_full(&hive->tb); 3093 adev->asic_reset_res = amdgpu_asic_reset(adev); 3094 } 3095 3096 fail: 3097 if (adev->asic_reset_res) 3098 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3099 adev->asic_reset_res, adev_to_drm(adev)->unique); 3100 amdgpu_put_xgmi_hive(hive); 3101 } 3102 3103 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3104 { 3105 char *input = amdgpu_lockup_timeout; 3106 char *timeout_setting = NULL; 3107 int index = 0; 3108 long timeout; 3109 int ret = 0; 3110 3111 /* 3112 * By default timeout for non compute jobs is 10000. 3113 * And there is no timeout enforced on compute jobs. 3114 * In SR-IOV or passthrough mode, timeout for compute 3115 * jobs are 60000 by default. 
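 *
 * The parser below accepts up to four comma separated values in the order
 * gfx,compute,sdma,video (in milliseconds; 0 keeps the default and a
 * negative value disables the timeout), so a hypothetical
 * amdgpu.lockup_timeout=10000,60000,10000,10000 sets all four explicitly.
 * A single value applies to every non-compute queue, and also to compute
 * under SR-IOV or passthrough.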
3116 */ 3117 adev->gfx_timeout = msecs_to_jiffies(10000); 3118 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3119 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3120 adev->compute_timeout = msecs_to_jiffies(60000); 3121 else 3122 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; 3123 3124 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3125 while ((timeout_setting = strsep(&input, ",")) && 3126 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3127 ret = kstrtol(timeout_setting, 0, &timeout); 3128 if (ret) 3129 return ret; 3130 3131 if (timeout == 0) { 3132 index++; 3133 continue; 3134 } else if (timeout < 0) { 3135 timeout = MAX_SCHEDULE_TIMEOUT; 3136 } else { 3137 timeout = msecs_to_jiffies(timeout); 3138 } 3139 3140 switch (index++) { 3141 case 0: 3142 adev->gfx_timeout = timeout; 3143 break; 3144 case 1: 3145 adev->compute_timeout = timeout; 3146 break; 3147 case 2: 3148 adev->sdma_timeout = timeout; 3149 break; 3150 case 3: 3151 adev->video_timeout = timeout; 3152 break; 3153 default: 3154 break; 3155 } 3156 } 3157 /* 3158 * There is only one value specified and 3159 * it should apply to all non-compute jobs. 3160 */ 3161 if (index == 1) { 3162 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3163 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3164 adev->compute_timeout = adev->gfx_timeout; 3165 } 3166 } 3167 3168 return ret; 3169 } 3170 3171 static const struct attribute *amdgpu_dev_attributes[] = { 3172 &dev_attr_product_name.attr, 3173 &dev_attr_product_number.attr, 3174 &dev_attr_serial_number.attr, 3175 &dev_attr_pcie_replay_count.attr, 3176 NULL 3177 }; 3178 3179 3180 /** 3181 * amdgpu_device_init - initialize the driver 3182 * 3183 * @adev: amdgpu_device pointer 3184 * @flags: driver flags 3185 * 3186 * Initializes the driver info and hw (all asics). 3187 * Returns 0 for success or an error on failure. 3188 * Called at driver startup. 
3189 */ 3190 int amdgpu_device_init(struct amdgpu_device *adev, 3191 uint32_t flags) 3192 { 3193 struct drm_device *ddev = adev_to_drm(adev); 3194 struct pci_dev *pdev = adev->pdev; 3195 int r, i; 3196 bool atpx = false; 3197 u32 max_MBps; 3198 3199 adev->shutdown = false; 3200 adev->flags = flags; 3201 3202 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3203 adev->asic_type = amdgpu_force_asic_type; 3204 else 3205 adev->asic_type = flags & AMD_ASIC_MASK; 3206 3207 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3208 if (amdgpu_emu_mode == 1) 3209 adev->usec_timeout *= 10; 3210 adev->gmc.gart_size = 512 * 1024 * 1024; 3211 adev->accel_working = false; 3212 adev->num_rings = 0; 3213 adev->mman.buffer_funcs = NULL; 3214 adev->mman.buffer_funcs_ring = NULL; 3215 adev->vm_manager.vm_pte_funcs = NULL; 3216 adev->vm_manager.vm_pte_num_scheds = 0; 3217 adev->gmc.gmc_funcs = NULL; 3218 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3219 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3220 3221 adev->smc_rreg = &amdgpu_invalid_rreg; 3222 adev->smc_wreg = &amdgpu_invalid_wreg; 3223 adev->pcie_rreg = &amdgpu_invalid_rreg; 3224 adev->pcie_wreg = &amdgpu_invalid_wreg; 3225 adev->pciep_rreg = &amdgpu_invalid_rreg; 3226 adev->pciep_wreg = &amdgpu_invalid_wreg; 3227 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3228 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3229 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3230 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3231 adev->didt_rreg = &amdgpu_invalid_rreg; 3232 adev->didt_wreg = &amdgpu_invalid_wreg; 3233 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3234 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3235 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3236 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3237 3238 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3239 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3240 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3241 3242 /* mutex initialization are all done here so we 3243 * can recall function without having locking issues */ 3244 atomic_set(&adev->irq.ih.lock, 0); 3245 mutex_init(&adev->firmware.mutex); 3246 mutex_init(&adev->pm.mutex); 3247 mutex_init(&adev->gfx.gpu_clock_mutex); 3248 mutex_init(&adev->srbm_mutex); 3249 mutex_init(&adev->gfx.pipe_reserve_mutex); 3250 mutex_init(&adev->gfx.gfx_off_mutex); 3251 mutex_init(&adev->grbm_idx_mutex); 3252 mutex_init(&adev->mn_lock); 3253 mutex_init(&adev->virt.vf_errors.lock); 3254 hash_init(adev->mn_hash); 3255 atomic_set(&adev->in_gpu_reset, 0); 3256 init_rwsem(&adev->reset_sem); 3257 mutex_init(&adev->psp.mutex); 3258 mutex_init(&adev->notifier_lock); 3259 3260 r = amdgpu_device_check_arguments(adev); 3261 if (r) 3262 return r; 3263 3264 spin_lock_init(&adev->mmio_idx_lock); 3265 spin_lock_init(&adev->smc_idx_lock); 3266 spin_lock_init(&adev->pcie_idx_lock); 3267 spin_lock_init(&adev->uvd_ctx_idx_lock); 3268 spin_lock_init(&adev->didt_idx_lock); 3269 spin_lock_init(&adev->gc_cac_idx_lock); 3270 spin_lock_init(&adev->se_cac_idx_lock); 3271 spin_lock_init(&adev->audio_endpt_idx_lock); 3272 spin_lock_init(&adev->mm_stats.lock); 3273 3274 INIT_LIST_HEAD(&adev->shadow_list); 3275 mutex_init(&adev->shadow_list_lock); 3276 3277 INIT_DELAYED_WORK(&adev->delayed_init_work, 3278 amdgpu_device_delayed_init_work_handler); 3279 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3280 amdgpu_device_delay_enable_gfx_off); 3281 3282 
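/*
 * Note on the two delayed work items initialized above: delayed_init_work
 * runs amdgpu_ib_ring_tests() roughly AMDGPU_RESUME_MS after init/resume
 * (see the queue_delayed_work() calls later in this function and in
 * amdgpu_device_resume()), while gfx_off_delay_work tries to enable GFXOFF
 * once gfx_off_req_count has dropped back to zero.
 */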
INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3283 3284 adev->gfx.gfx_off_req_count = 1; 3285 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3286 3287 atomic_set(&adev->throttling_logging_enabled, 1); 3288 /* 3289 * If throttling continues, logging will be performed every minute 3290 * to avoid log flooding. "-1" is subtracted since the thermal 3291 * throttling interrupt comes every second. Thus, the total logging 3292 * interval is 59 seconds(retelimited printk interval) + 1(waiting 3293 * for throttling interrupt) = 60 seconds. 3294 */ 3295 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3296 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3297 3298 /* Registers mapping */ 3299 /* TODO: block userspace mapping of io register */ 3300 if (adev->asic_type >= CHIP_BONAIRE) { 3301 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3302 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3303 } else { 3304 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3305 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3306 } 3307 3308 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3309 if (adev->rmmio == NULL) { 3310 return -ENOMEM; 3311 } 3312 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3313 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3314 3315 /* io port mapping */ 3316 for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { 3317 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) { 3318 adev->rio_mem_size = pci_resource_len(adev->pdev, i); 3319 adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size); 3320 break; 3321 } 3322 } 3323 if (adev->rio_mem == NULL) 3324 DRM_INFO("PCI I/O BAR is not found.\n"); 3325 3326 /* enable PCIE atomic ops */ 3327 r = pci_enable_atomic_ops_to_root(adev->pdev, 3328 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3329 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3330 if (r) { 3331 adev->have_atomics_support = false; 3332 DRM_INFO("PCIE atomic ops is not supported\n"); 3333 } else { 3334 adev->have_atomics_support = true; 3335 } 3336 3337 amdgpu_device_get_pcie_info(adev); 3338 3339 if (amdgpu_mcbp) 3340 DRM_INFO("MCBP is enabled\n"); 3341 3342 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) 3343 adev->enable_mes = true; 3344 3345 /* detect hw virtualization here */ 3346 amdgpu_detect_virtualization(adev); 3347 3348 r = amdgpu_device_get_job_timeout_settings(adev); 3349 if (r) { 3350 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3351 goto failed_unmap; 3352 } 3353 3354 /* early init functions */ 3355 r = amdgpu_device_ip_early_init(adev); 3356 if (r) 3357 goto failed_unmap; 3358 3359 /* doorbell bar mapping and doorbell index init*/ 3360 amdgpu_device_doorbell_init(adev); 3361 3362 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3363 /* this will fail for cards that aren't VGA class devices, just 3364 * ignore it */ 3365 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3366 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode); 3367 3368 if (amdgpu_device_supports_atpx(ddev)) 3369 atpx = true; 3370 if (amdgpu_has_atpx() && 3371 (amdgpu_is_atpx_hybrid() || 3372 amdgpu_has_atpx_dgpu_power_cntl()) && 3373 !pci_is_thunderbolt_attached(adev->pdev)) 3374 vga_switcheroo_register_client(adev->pdev, 3375 &amdgpu_switcheroo_ops, atpx); 3376 if (atpx) 3377 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3378 3379 if (amdgpu_emu_mode == 1) { 3380 /* post the asic on emulation 
mode */ 3381 emu_soc_asic_init(adev); 3382 goto fence_driver_init; 3383 } 3384 3385 /* detect if we are with an SRIOV vbios */ 3386 amdgpu_device_detect_sriov_bios(adev); 3387 3388 /* check if we need to reset the asic 3389 * E.g., driver was not cleanly unloaded previously, etc. 3390 */ 3391 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3392 r = amdgpu_asic_reset(adev); 3393 if (r) { 3394 dev_err(adev->dev, "asic reset on init failed\n"); 3395 goto failed; 3396 } 3397 } 3398 3399 pci_enable_pcie_error_reporting(adev->ddev.pdev); 3400 3401 /* Post card if necessary */ 3402 if (amdgpu_device_need_post(adev)) { 3403 if (!adev->bios) { 3404 dev_err(adev->dev, "no vBIOS found\n"); 3405 r = -EINVAL; 3406 goto failed; 3407 } 3408 DRM_INFO("GPU posting now...\n"); 3409 r = amdgpu_device_asic_init(adev); 3410 if (r) { 3411 dev_err(adev->dev, "gpu post error!\n"); 3412 goto failed; 3413 } 3414 } 3415 3416 if (adev->is_atom_fw) { 3417 /* Initialize clocks */ 3418 r = amdgpu_atomfirmware_get_clock_info(adev); 3419 if (r) { 3420 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3421 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3422 goto failed; 3423 } 3424 } else { 3425 /* Initialize clocks */ 3426 r = amdgpu_atombios_get_clock_info(adev); 3427 if (r) { 3428 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3429 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3430 goto failed; 3431 } 3432 /* init i2c buses */ 3433 if (!amdgpu_device_has_dc_support(adev)) 3434 amdgpu_atombios_i2c_init(adev); 3435 } 3436 3437 fence_driver_init: 3438 /* Fence driver */ 3439 r = amdgpu_fence_driver_init(adev); 3440 if (r) { 3441 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n"); 3442 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3443 goto failed; 3444 } 3445 3446 /* init the mode config */ 3447 drm_mode_config_init(adev_to_drm(adev)); 3448 3449 r = amdgpu_device_ip_init(adev); 3450 if (r) { 3451 /* failed in exclusive mode due to timeout */ 3452 if (amdgpu_sriov_vf(adev) && 3453 !amdgpu_sriov_runtime(adev) && 3454 amdgpu_virt_mmio_blocked(adev) && 3455 !amdgpu_virt_wait_reset(adev)) { 3456 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3457 /* Don't send request since VF is inactive. */ 3458 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3459 adev->virt.ops = NULL; 3460 r = -EAGAIN; 3461 goto failed; 3462 } 3463 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3464 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3465 goto failed; 3466 } 3467 3468 dev_info(adev->dev, 3469 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3470 adev->gfx.config.max_shader_engines, 3471 adev->gfx.config.max_sh_per_se, 3472 adev->gfx.config.max_cu_per_sh, 3473 adev->gfx.cu_info.number); 3474 3475 adev->accel_working = true; 3476 3477 amdgpu_vm_check_compute_bug(adev); 3478 3479 /* Initialize the buffer migration limit. */ 3480 if (amdgpu_moverate >= 0) 3481 max_MBps = amdgpu_moverate; 3482 else 3483 max_MBps = 8; /* Allow 8 MB/s. */ 3484 /* Get a log2 for easy divisions. 
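 * For example, with the default max_MBps of 8 chosen just above,
 * log2_max_MBps becomes ilog2(8) = 3, so later throughput accounting can
 * use a shift instead of a division.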
*/ 3485 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3486 3487 amdgpu_fbdev_init(adev); 3488 3489 r = amdgpu_pm_sysfs_init(adev); 3490 if (r) { 3491 adev->pm_sysfs_en = false; 3492 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3493 } else 3494 adev->pm_sysfs_en = true; 3495 3496 r = amdgpu_ucode_sysfs_init(adev); 3497 if (r) { 3498 adev->ucode_sysfs_en = false; 3499 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3500 } else 3501 adev->ucode_sysfs_en = true; 3502 3503 if ((amdgpu_testing & 1)) { 3504 if (adev->accel_working) 3505 amdgpu_test_moves(adev); 3506 else 3507 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n"); 3508 } 3509 if (amdgpu_benchmarking) { 3510 if (adev->accel_working) 3511 amdgpu_benchmark(adev, amdgpu_benchmarking); 3512 else 3513 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); 3514 } 3515 3516 /* 3517 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3518 * Otherwise the mgpu fan boost feature will be skipped due to the 3519 * gpu instance is counted less. 3520 */ 3521 amdgpu_register_gpu_instance(adev); 3522 3523 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3524 * explicit gating rather than handling it automatically. 3525 */ 3526 r = amdgpu_device_ip_late_init(adev); 3527 if (r) { 3528 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3529 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3530 goto failed; 3531 } 3532 3533 /* must succeed. */ 3534 amdgpu_ras_resume(adev); 3535 3536 queue_delayed_work(system_wq, &adev->delayed_init_work, 3537 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3538 3539 if (amdgpu_sriov_vf(adev)) 3540 flush_delayed_work(&adev->delayed_init_work); 3541 3542 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3543 if (r) 3544 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3545 3546 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3547 r = amdgpu_pmu_init(adev); 3548 if (r) 3549 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3550 3551 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3552 if (amdgpu_device_cache_pci_state(adev->pdev)) 3553 pci_restore_state(pdev); 3554 3555 return 0; 3556 3557 failed: 3558 amdgpu_vf_error_trans_all(adev); 3559 if (atpx) 3560 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3561 3562 failed_unmap: 3563 iounmap(adev->rmmio); 3564 adev->rmmio = NULL; 3565 3566 return r; 3567 } 3568 3569 /** 3570 * amdgpu_device_fini - tear down the driver 3571 * 3572 * @adev: amdgpu_device pointer 3573 * 3574 * Tear down the driver info (all asics). 3575 * Called at driver shutdown. 
3576 */ 3577 void amdgpu_device_fini(struct amdgpu_device *adev) 3578 { 3579 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3580 flush_delayed_work(&adev->delayed_init_work); 3581 adev->shutdown = true; 3582 3583 kfree(adev->pci_state); 3584 3585 /* make sure IB test finished before entering exclusive mode 3586 * to avoid preemption on IB test 3587 * */ 3588 if (amdgpu_sriov_vf(adev)) { 3589 amdgpu_virt_request_full_gpu(adev, false); 3590 amdgpu_virt_fini_data_exchange(adev); 3591 } 3592 3593 /* disable all interrupts */ 3594 amdgpu_irq_disable_all(adev); 3595 if (adev->mode_info.mode_config_initialized){ 3596 if (!amdgpu_device_has_dc_support(adev)) 3597 drm_helper_force_disable_all(adev_to_drm(adev)); 3598 else 3599 drm_atomic_helper_shutdown(adev_to_drm(adev)); 3600 } 3601 amdgpu_fence_driver_fini(adev); 3602 if (adev->pm_sysfs_en) 3603 amdgpu_pm_sysfs_fini(adev); 3604 amdgpu_fbdev_fini(adev); 3605 amdgpu_device_ip_fini(adev); 3606 release_firmware(adev->firmware.gpu_info_fw); 3607 adev->firmware.gpu_info_fw = NULL; 3608 adev->accel_working = false; 3609 /* free i2c buses */ 3610 if (!amdgpu_device_has_dc_support(adev)) 3611 amdgpu_i2c_fini(adev); 3612 3613 if (amdgpu_emu_mode != 1) 3614 amdgpu_atombios_fini(adev); 3615 3616 kfree(adev->bios); 3617 adev->bios = NULL; 3618 if (amdgpu_has_atpx() && 3619 (amdgpu_is_atpx_hybrid() || 3620 amdgpu_has_atpx_dgpu_power_cntl()) && 3621 !pci_is_thunderbolt_attached(adev->pdev)) 3622 vga_switcheroo_unregister_client(adev->pdev); 3623 if (amdgpu_device_supports_atpx(adev_to_drm(adev))) 3624 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3625 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3626 vga_client_register(adev->pdev, NULL, NULL, NULL); 3627 if (adev->rio_mem) 3628 pci_iounmap(adev->pdev, adev->rio_mem); 3629 adev->rio_mem = NULL; 3630 iounmap(adev->rmmio); 3631 adev->rmmio = NULL; 3632 amdgpu_device_doorbell_fini(adev); 3633 3634 if (adev->ucode_sysfs_en) 3635 amdgpu_ucode_sysfs_fini(adev); 3636 3637 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 3638 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3639 amdgpu_pmu_fini(adev); 3640 if (adev->mman.discovery_bin) 3641 amdgpu_discovery_fini(adev); 3642 } 3643 3644 3645 /* 3646 * Suspend & resume. 3647 */ 3648 /** 3649 * amdgpu_device_suspend - initiate device suspend 3650 * 3651 * @dev: drm dev pointer 3652 * @fbcon : notify the fbdev of suspend 3653 * 3654 * Puts the hw in the suspend state (all asics). 3655 * Returns 0 for success or an error on failure. 3656 * Called at driver suspend. 
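 *
 * In outline: polling and fbcon are quiesced, on non-DC paths the displays
 * are turned off and framebuffers/cursors unpinned, RAS is suspended, IP
 * suspend phase 1 runs, KFD is suspended, VRAM is evicted, the fence driver
 * is suspended, then either IP suspend phase 2 runs or (for S0ix) a D3
 * entry notification is sent, and VRAM is evicted once more so the GART
 * page table is backed up by the CPU.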
3657 */ 3658 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 3659 { 3660 struct amdgpu_device *adev; 3661 struct drm_crtc *crtc; 3662 struct drm_connector *connector; 3663 struct drm_connector_list_iter iter; 3664 int r; 3665 3666 adev = drm_to_adev(dev); 3667 3668 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3669 return 0; 3670 3671 adev->in_suspend = true; 3672 drm_kms_helper_poll_disable(dev); 3673 3674 if (fbcon) 3675 amdgpu_fbdev_set_suspend(adev, 1); 3676 3677 cancel_delayed_work_sync(&adev->delayed_init_work); 3678 3679 if (!amdgpu_device_has_dc_support(adev)) { 3680 /* turn off display hw */ 3681 drm_modeset_lock_all(dev); 3682 drm_connector_list_iter_begin(dev, &iter); 3683 drm_for_each_connector_iter(connector, &iter) 3684 drm_helper_connector_dpms(connector, 3685 DRM_MODE_DPMS_OFF); 3686 drm_connector_list_iter_end(&iter); 3687 drm_modeset_unlock_all(dev); 3688 /* unpin the front buffers and cursors */ 3689 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3690 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3691 struct drm_framebuffer *fb = crtc->primary->fb; 3692 struct amdgpu_bo *robj; 3693 3694 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3695 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3696 r = amdgpu_bo_reserve(aobj, true); 3697 if (r == 0) { 3698 amdgpu_bo_unpin(aobj); 3699 amdgpu_bo_unreserve(aobj); 3700 } 3701 } 3702 3703 if (fb == NULL || fb->obj[0] == NULL) { 3704 continue; 3705 } 3706 robj = gem_to_amdgpu_bo(fb->obj[0]); 3707 /* don't unpin kernel fb objects */ 3708 if (!amdgpu_fbdev_robj_is_fb(adev, robj)) { 3709 r = amdgpu_bo_reserve(robj, true); 3710 if (r == 0) { 3711 amdgpu_bo_unpin(robj); 3712 amdgpu_bo_unreserve(robj); 3713 } 3714 } 3715 } 3716 } 3717 3718 amdgpu_ras_suspend(adev); 3719 3720 r = amdgpu_device_ip_suspend_phase1(adev); 3721 3722 amdgpu_amdkfd_suspend(adev, !fbcon); 3723 3724 /* evict vram memory */ 3725 amdgpu_bo_evict_vram(adev); 3726 3727 amdgpu_fence_driver_suspend(adev); 3728 3729 if (!amdgpu_acpi_is_s0ix_supported(adev) || amdgpu_in_reset(adev)) 3730 r = amdgpu_device_ip_suspend_phase2(adev); 3731 else 3732 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry); 3733 /* evict remaining vram memory 3734 * This second call to evict vram is to evict the gart page table 3735 * using the CPU. 3736 */ 3737 amdgpu_bo_evict_vram(adev); 3738 3739 return 0; 3740 } 3741 3742 /** 3743 * amdgpu_device_resume - initiate device resume 3744 * 3745 * @dev: drm dev pointer 3746 * @fbcon : notify the fbdev of resume 3747 * 3748 * Bring the hw back to operating state (all asics). 3749 * Returns 0 for success or an error on failure. 3750 * Called at driver resume. 
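 *
 * In outline: for S0ix a D0 entry notification is sent, the card is
 * re-posted if needed, amdgpu_device_ip_resume() and the fence driver run,
 * late init follows, cursors are re-pinned on non-DC paths, KFD resumes,
 * the deferred IB tests are flushed, the display state is restored,
 * polling is re-enabled and RAS resumes.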
3751 */ 3752 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 3753 { 3754 struct drm_connector *connector; 3755 struct drm_connector_list_iter iter; 3756 struct amdgpu_device *adev = drm_to_adev(dev); 3757 struct drm_crtc *crtc; 3758 int r = 0; 3759 3760 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3761 return 0; 3762 3763 if (amdgpu_acpi_is_s0ix_supported(adev)) 3764 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D0Entry); 3765 3766 /* post card */ 3767 if (amdgpu_device_need_post(adev)) { 3768 r = amdgpu_device_asic_init(adev); 3769 if (r) 3770 dev_err(adev->dev, "amdgpu asic init failed\n"); 3771 } 3772 3773 r = amdgpu_device_ip_resume(adev); 3774 if (r) { 3775 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 3776 return r; 3777 } 3778 amdgpu_fence_driver_resume(adev); 3779 3780 3781 r = amdgpu_device_ip_late_init(adev); 3782 if (r) 3783 return r; 3784 3785 queue_delayed_work(system_wq, &adev->delayed_init_work, 3786 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3787 3788 if (!amdgpu_device_has_dc_support(adev)) { 3789 /* pin cursors */ 3790 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3791 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3792 3793 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3794 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3795 r = amdgpu_bo_reserve(aobj, true); 3796 if (r == 0) { 3797 r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM); 3798 if (r != 0) 3799 dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r); 3800 amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj); 3801 amdgpu_bo_unreserve(aobj); 3802 } 3803 } 3804 } 3805 } 3806 r = amdgpu_amdkfd_resume(adev, !fbcon); 3807 if (r) 3808 return r; 3809 3810 /* Make sure IB tests flushed */ 3811 flush_delayed_work(&adev->delayed_init_work); 3812 3813 /* blat the mode back in */ 3814 if (fbcon) { 3815 if (!amdgpu_device_has_dc_support(adev)) { 3816 /* pre DCE11 */ 3817 drm_helper_resume_force_mode(dev); 3818 3819 /* turn on display hw */ 3820 drm_modeset_lock_all(dev); 3821 3822 drm_connector_list_iter_begin(dev, &iter); 3823 drm_for_each_connector_iter(connector, &iter) 3824 drm_helper_connector_dpms(connector, 3825 DRM_MODE_DPMS_ON); 3826 drm_connector_list_iter_end(&iter); 3827 3828 drm_modeset_unlock_all(dev); 3829 } 3830 amdgpu_fbdev_set_suspend(adev, 0); 3831 } 3832 3833 drm_kms_helper_poll_enable(dev); 3834 3835 amdgpu_ras_resume(adev); 3836 3837 /* 3838 * Most of the connector probing functions try to acquire runtime pm 3839 * refs to ensure that the GPU is powered on when connector polling is 3840 * performed. Since we're calling this from a runtime PM callback, 3841 * trying to acquire rpm refs will cause us to deadlock. 3842 * 3843 * Since we're guaranteed to be holding the rpm lock, it's safe to 3844 * temporarily disable the rpm helpers so this doesn't deadlock us. 3845 */ 3846 #ifdef CONFIG_PM 3847 dev->dev->power.disable_depth++; 3848 #endif 3849 if (!amdgpu_device_has_dc_support(adev)) 3850 drm_helper_hpd_irq_event(dev); 3851 else 3852 drm_kms_helper_hotplug_event(dev); 3853 #ifdef CONFIG_PM 3854 dev->dev->power.disable_depth--; 3855 #endif 3856 adev->in_suspend = false; 3857 3858 return 0; 3859 } 3860 3861 /** 3862 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 3863 * 3864 * @adev: amdgpu_device pointer 3865 * 3866 * The list of all the hardware IPs that make up the asic is walked and 3867 * the check_soft_reset callbacks are run. 
check_soft_reset determines 3868 * if the asic is still hung or not. 3869 * Returns true if any of the IPs are still in a hung state, false if not. 3870 */ 3871 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 3872 { 3873 int i; 3874 bool asic_hang = false; 3875 3876 if (amdgpu_sriov_vf(adev)) 3877 return true; 3878 3879 if (amdgpu_asic_need_full_reset(adev)) 3880 return true; 3881 3882 for (i = 0; i < adev->num_ip_blocks; i++) { 3883 if (!adev->ip_blocks[i].status.valid) 3884 continue; 3885 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 3886 adev->ip_blocks[i].status.hang = 3887 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 3888 if (adev->ip_blocks[i].status.hang) { 3889 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 3890 asic_hang = true; 3891 } 3892 } 3893 return asic_hang; 3894 } 3895 3896 /** 3897 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 3898 * 3899 * @adev: amdgpu_device pointer 3900 * 3901 * The list of all the hardware IPs that make up the asic is walked and the 3902 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 3903 * handles any IP specific hardware or software state changes that are 3904 * necessary for a soft reset to succeed. 3905 * Returns 0 on success, negative error code on failure. 3906 */ 3907 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 3908 { 3909 int i, r = 0; 3910 3911 for (i = 0; i < adev->num_ip_blocks; i++) { 3912 if (!adev->ip_blocks[i].status.valid) 3913 continue; 3914 if (adev->ip_blocks[i].status.hang && 3915 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 3916 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 3917 if (r) 3918 return r; 3919 } 3920 } 3921 3922 return 0; 3923 } 3924 3925 /** 3926 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 3927 * 3928 * @adev: amdgpu_device pointer 3929 * 3930 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 3931 * reset is necessary to recover. 3932 * Returns true if a full asic reset is required, false if not. 3933 */ 3934 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 3935 { 3936 int i; 3937 3938 if (amdgpu_asic_need_full_reset(adev)) 3939 return true; 3940 3941 for (i = 0; i < adev->num_ip_blocks; i++) { 3942 if (!adev->ip_blocks[i].status.valid) 3943 continue; 3944 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 3945 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 3946 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 3947 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 3948 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3949 if (adev->ip_blocks[i].status.hang) { 3950 dev_info(adev->dev, "Some block need full reset!\n"); 3951 return true; 3952 } 3953 } 3954 } 3955 return false; 3956 } 3957 3958 /** 3959 * amdgpu_device_ip_soft_reset - do a soft reset 3960 * 3961 * @adev: amdgpu_device pointer 3962 * 3963 * The list of all the hardware IPs that make up the asic is walked and the 3964 * soft_reset callbacks are run if the block is hung. soft_reset handles any 3965 * IP specific hardware or software state changes that are necessary to soft 3966 * reset the IP. 3967 * Returns 0 on success, negative error code on failure. 
3968 */ 3969 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 3970 { 3971 int i, r = 0; 3972 3973 for (i = 0; i < adev->num_ip_blocks; i++) { 3974 if (!adev->ip_blocks[i].status.valid) 3975 continue; 3976 if (adev->ip_blocks[i].status.hang && 3977 adev->ip_blocks[i].version->funcs->soft_reset) { 3978 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 3979 if (r) 3980 return r; 3981 } 3982 } 3983 3984 return 0; 3985 } 3986 3987 /** 3988 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 3989 * 3990 * @adev: amdgpu_device pointer 3991 * 3992 * The list of all the hardware IPs that make up the asic is walked and the 3993 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 3994 * handles any IP specific hardware or software state changes that are 3995 * necessary after the IP has been soft reset. 3996 * Returns 0 on success, negative error code on failure. 3997 */ 3998 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 3999 { 4000 int i, r = 0; 4001 4002 for (i = 0; i < adev->num_ip_blocks; i++) { 4003 if (!adev->ip_blocks[i].status.valid) 4004 continue; 4005 if (adev->ip_blocks[i].status.hang && 4006 adev->ip_blocks[i].version->funcs->post_soft_reset) 4007 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4008 if (r) 4009 return r; 4010 } 4011 4012 return 0; 4013 } 4014 4015 /** 4016 * amdgpu_device_recover_vram - Recover some VRAM contents 4017 * 4018 * @adev: amdgpu_device pointer 4019 * 4020 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4021 * restore things like GPUVM page tables after a GPU reset where 4022 * the contents of VRAM might be lost. 4023 * 4024 * Returns: 4025 * 0 on success, negative error code on failure. 4026 */ 4027 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4028 { 4029 struct dma_fence *fence = NULL, *next = NULL; 4030 struct amdgpu_bo *shadow; 4031 long r = 1, tmo; 4032 4033 if (amdgpu_sriov_runtime(adev)) 4034 tmo = msecs_to_jiffies(8000); 4035 else 4036 tmo = msecs_to_jiffies(100); 4037 4038 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4039 mutex_lock(&adev->shadow_list_lock); 4040 list_for_each_entry(shadow, &adev->shadow_list, shadow_list) { 4041 4042 /* No need to recover an evicted BO */ 4043 if (shadow->tbo.mem.mem_type != TTM_PL_TT || 4044 shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET || 4045 shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM) 4046 continue; 4047 4048 r = amdgpu_bo_restore_shadow(shadow, &next); 4049 if (r) 4050 break; 4051 4052 if (fence) { 4053 tmo = dma_fence_wait_timeout(fence, false, tmo); 4054 dma_fence_put(fence); 4055 fence = next; 4056 if (tmo == 0) { 4057 r = -ETIMEDOUT; 4058 break; 4059 } else if (tmo < 0) { 4060 r = tmo; 4061 break; 4062 } 4063 } else { 4064 fence = next; 4065 } 4066 } 4067 mutex_unlock(&adev->shadow_list_lock); 4068 4069 if (fence) 4070 tmo = dma_fence_wait_timeout(fence, false, tmo); 4071 dma_fence_put(fence); 4072 4073 if (r < 0 || tmo <= 0) { 4074 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4075 return -EIO; 4076 } 4077 4078 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4079 return 0; 4080 } 4081 4082 4083 /** 4084 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4085 * 4086 * @adev: amdgpu_device pointer 4087 * @from_hypervisor: request from hypervisor 4088 * 4089 * do VF FLR and reinitialize Asic 4090 * return 0 means succeeded otherwise failed 4091 */ 4092 static int 
amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4093 bool from_hypervisor) 4094 { 4095 int r; 4096 4097 if (from_hypervisor) 4098 r = amdgpu_virt_request_full_gpu(adev, true); 4099 else 4100 r = amdgpu_virt_reset_gpu(adev); 4101 if (r) 4102 return r; 4103 4104 amdgpu_amdkfd_pre_reset(adev); 4105 4106 /* Resume IP prior to SMC */ 4107 r = amdgpu_device_ip_reinit_early_sriov(adev); 4108 if (r) 4109 goto error; 4110 4111 amdgpu_virt_init_data_exchange(adev); 4112 /* we need recover gart prior to run SMC/CP/SDMA resume */ 4113 amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT)); 4114 4115 r = amdgpu_device_fw_loading(adev); 4116 if (r) 4117 return r; 4118 4119 /* now we are okay to resume SMC/CP/SDMA */ 4120 r = amdgpu_device_ip_reinit_late_sriov(adev); 4121 if (r) 4122 goto error; 4123 4124 amdgpu_irq_gpu_reset_resume_helper(adev); 4125 r = amdgpu_ib_ring_tests(adev); 4126 amdgpu_amdkfd_post_reset(adev); 4127 4128 error: 4129 amdgpu_virt_release_full_gpu(adev, true); 4130 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4131 amdgpu_inc_vram_lost(adev); 4132 r = amdgpu_device_recover_vram(adev); 4133 } 4134 4135 return r; 4136 } 4137 4138 /** 4139 * amdgpu_device_has_job_running - check if there is any job in mirror list 4140 * 4141 * @adev: amdgpu_device pointer 4142 * 4143 * check if there is any job in mirror list 4144 */ 4145 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4146 { 4147 int i; 4148 struct drm_sched_job *job; 4149 4150 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4151 struct amdgpu_ring *ring = adev->rings[i]; 4152 4153 if (!ring || !ring->sched.thread) 4154 continue; 4155 4156 spin_lock(&ring->sched.job_list_lock); 4157 job = list_first_entry_or_null(&ring->sched.ring_mirror_list, 4158 struct drm_sched_job, node); 4159 spin_unlock(&ring->sched.job_list_lock); 4160 if (job) 4161 return true; 4162 } 4163 return false; 4164 } 4165 4166 /** 4167 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4168 * 4169 * @adev: amdgpu_device pointer 4170 * 4171 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4172 * a hung GPU. 
4173 */ 4174 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4175 { 4176 if (!amdgpu_device_ip_check_soft_reset(adev)) { 4177 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n"); 4178 return false; 4179 } 4180 4181 if (amdgpu_gpu_recovery == 0) 4182 goto disabled; 4183 4184 if (amdgpu_sriov_vf(adev)) 4185 return true; 4186 4187 if (amdgpu_gpu_recovery == -1) { 4188 switch (adev->asic_type) { 4189 case CHIP_BONAIRE: 4190 case CHIP_HAWAII: 4191 case CHIP_TOPAZ: 4192 case CHIP_TONGA: 4193 case CHIP_FIJI: 4194 case CHIP_POLARIS10: 4195 case CHIP_POLARIS11: 4196 case CHIP_POLARIS12: 4197 case CHIP_VEGAM: 4198 case CHIP_VEGA20: 4199 case CHIP_VEGA10: 4200 case CHIP_VEGA12: 4201 case CHIP_RAVEN: 4202 case CHIP_ARCTURUS: 4203 case CHIP_RENOIR: 4204 case CHIP_NAVI10: 4205 case CHIP_NAVI14: 4206 case CHIP_NAVI12: 4207 case CHIP_SIENNA_CICHLID: 4208 break; 4209 default: 4210 goto disabled; 4211 } 4212 } 4213 4214 return true; 4215 4216 disabled: 4217 dev_info(adev->dev, "GPU recovery disabled.\n"); 4218 return false; 4219 } 4220 4221 4222 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4223 struct amdgpu_job *job, 4224 bool *need_full_reset_arg) 4225 { 4226 int i, r = 0; 4227 bool need_full_reset = *need_full_reset_arg; 4228 4229 amdgpu_debugfs_wait_dump(adev); 4230 4231 if (amdgpu_sriov_vf(adev)) { 4232 /* stop the data exchange thread */ 4233 amdgpu_virt_fini_data_exchange(adev); 4234 } 4235 4236 /* block all schedulers and reset given job's ring */ 4237 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4238 struct amdgpu_ring *ring = adev->rings[i]; 4239 4240 if (!ring || !ring->sched.thread) 4241 continue; 4242 4243 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4244 amdgpu_fence_driver_force_completion(ring); 4245 } 4246 4247 if(job) 4248 drm_sched_increase_karma(&job->base); 4249 4250 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4251 if (!amdgpu_sriov_vf(adev)) { 4252 4253 if (!need_full_reset) 4254 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4255 4256 if (!need_full_reset) { 4257 amdgpu_device_ip_pre_soft_reset(adev); 4258 r = amdgpu_device_ip_soft_reset(adev); 4259 amdgpu_device_ip_post_soft_reset(adev); 4260 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4261 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4262 need_full_reset = true; 4263 } 4264 } 4265 4266 if (need_full_reset) 4267 r = amdgpu_device_ip_suspend(adev); 4268 4269 *need_full_reset_arg = need_full_reset; 4270 } 4271 4272 return r; 4273 } 4274 4275 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, 4276 struct list_head *device_list_handle, 4277 bool *need_full_reset_arg, 4278 bool skip_hw_reset) 4279 { 4280 struct amdgpu_device *tmp_adev = NULL; 4281 bool need_full_reset = *need_full_reset_arg, vram_lost = false; 4282 int r = 0; 4283 4284 /* 4285 * ASIC reset has to be done on all HGMI hive nodes ASAP 4286 * to allow proper links negotiation in FW (within 1 sec) 4287 */ 4288 if (!skip_hw_reset && need_full_reset) { 4289 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4290 /* For XGMI run all resets in parallel to speed up the process */ 4291 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4292 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4293 r = -EALREADY; 4294 } else 4295 r = amdgpu_asic_reset(tmp_adev); 4296 4297 if (r) { 4298 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4299 r, 
adev_to_drm(tmp_adev)->unique); 4300 break; 4301 } 4302 } 4303 4304 /* For XGMI wait for all resets to complete before proceed */ 4305 if (!r) { 4306 list_for_each_entry(tmp_adev, device_list_handle, 4307 gmc.xgmi.head) { 4308 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4309 flush_work(&tmp_adev->xgmi_reset_work); 4310 r = tmp_adev->asic_reset_res; 4311 if (r) 4312 break; 4313 } 4314 } 4315 } 4316 } 4317 4318 if (!r && amdgpu_ras_intr_triggered()) { 4319 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4320 if (tmp_adev->mmhub.funcs && 4321 tmp_adev->mmhub.funcs->reset_ras_error_count) 4322 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev); 4323 } 4324 4325 amdgpu_ras_intr_cleared(); 4326 } 4327 4328 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4329 if (need_full_reset) { 4330 /* post card */ 4331 if (amdgpu_device_asic_init(tmp_adev)) 4332 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4333 4334 if (!r) { 4335 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4336 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4337 if (r) 4338 goto out; 4339 4340 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4341 if (vram_lost) { 4342 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4343 amdgpu_inc_vram_lost(tmp_adev); 4344 } 4345 4346 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT)); 4347 if (r) 4348 goto out; 4349 4350 r = amdgpu_device_fw_loading(tmp_adev); 4351 if (r) 4352 return r; 4353 4354 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4355 if (r) 4356 goto out; 4357 4358 if (vram_lost) 4359 amdgpu_device_fill_reset_magic(tmp_adev); 4360 4361 /* 4362 * Add this ASIC as tracked as reset was already 4363 * complete successfully. 4364 */ 4365 amdgpu_register_gpu_instance(tmp_adev); 4366 4367 r = amdgpu_device_ip_late_init(tmp_adev); 4368 if (r) 4369 goto out; 4370 4371 amdgpu_fbdev_set_suspend(tmp_adev, 0); 4372 4373 /* 4374 * The GPU enters bad state once faulty pages 4375 * by ECC has reached the threshold, and ras 4376 * recovery is scheduled next. So add one check 4377 * here to break recovery if it indeed exceeds 4378 * bad page threshold, and remind user to 4379 * retire this GPU or setting one bigger 4380 * bad_page_threshold value to fix this once 4381 * probing driver again. 4382 */ 4383 if (!amdgpu_ras_check_err_threshold(tmp_adev)) { 4384 /* must succeed. 
*/ 4385 amdgpu_ras_resume(tmp_adev); 4386 } else { 4387 r = -EINVAL; 4388 goto out; 4389 } 4390 4391 /* Update PSP FW topology after reset */ 4392 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4393 r = amdgpu_xgmi_update_topology(hive, tmp_adev); 4394 } 4395 } 4396 4397 out: 4398 if (!r) { 4399 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 4400 r = amdgpu_ib_ring_tests(tmp_adev); 4401 if (r) { 4402 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 4403 r = amdgpu_device_ip_suspend(tmp_adev); 4404 need_full_reset = true; 4405 r = -EAGAIN; 4406 goto end; 4407 } 4408 } 4409 4410 if (!r) 4411 r = amdgpu_device_recover_vram(tmp_adev); 4412 else 4413 tmp_adev->asic_reset_res = r; 4414 } 4415 4416 end: 4417 *need_full_reset_arg = need_full_reset; 4418 return r; 4419 } 4420 4421 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, 4422 struct amdgpu_hive_info *hive) 4423 { 4424 if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0) 4425 return false; 4426 4427 if (hive) { 4428 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock); 4429 } else { 4430 down_write(&adev->reset_sem); 4431 } 4432 4433 atomic_inc(&adev->gpu_reset_counter); 4434 switch (amdgpu_asic_reset_method(adev)) { 4435 case AMD_RESET_METHOD_MODE1: 4436 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 4437 break; 4438 case AMD_RESET_METHOD_MODE2: 4439 adev->mp1_state = PP_MP1_STATE_RESET; 4440 break; 4441 default: 4442 adev->mp1_state = PP_MP1_STATE_NONE; 4443 break; 4444 } 4445 4446 return true; 4447 } 4448 4449 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) 4450 { 4451 amdgpu_vf_error_trans_all(adev); 4452 adev->mp1_state = PP_MP1_STATE_NONE; 4453 atomic_set(&adev->in_gpu_reset, 0); 4454 up_write(&adev->reset_sem); 4455 } 4456 4457 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 4458 { 4459 struct pci_dev *p = NULL; 4460 4461 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4462 adev->pdev->bus->number, 1); 4463 if (p) { 4464 pm_runtime_enable(&(p->dev)); 4465 pm_runtime_resume(&(p->dev)); 4466 } 4467 } 4468 4469 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 4470 { 4471 enum amd_reset_method reset_method; 4472 struct pci_dev *p = NULL; 4473 u64 expires; 4474 4475 /* 4476 * For now, only BACO and mode1 reset are confirmed 4477 * to suffer the audio issue without proper suspended. 4478 */ 4479 reset_method = amdgpu_asic_reset_method(adev); 4480 if ((reset_method != AMD_RESET_METHOD_BACO) && 4481 (reset_method != AMD_RESET_METHOD_MODE1)) 4482 return -EINVAL; 4483 4484 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4485 adev->pdev->bus->number, 1); 4486 if (!p) 4487 return -ENODEV; 4488 4489 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 4490 if (!expires) 4491 /* 4492 * If we cannot get the audio device autosuspend delay, 4493 * a fixed 4S interval will be used. Considering 3S is 4494 * the audio controller default autosuspend delay setting. 4495 * 4S used here is guaranteed to cover that. 4496 */ 4497 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 4498 4499 while (!pm_runtime_status_suspended(&(p->dev))) { 4500 if (!pm_runtime_suspend(&(p->dev))) 4501 break; 4502 4503 if (expires < ktime_get_mono_fast_ns()) { 4504 dev_warn(adev->dev, "failed to suspend display audio\n"); 4505 /* TODO: abort the succeeding gpu reset? 
*/ 4506 return -ETIMEDOUT; 4507 } 4508 } 4509 4510 pm_runtime_disable(&(p->dev)); 4511 4512 return 0; 4513 } 4514 4515 /** 4516 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 4517 * 4518 * @adev: amdgpu_device pointer 4519 * @job: which job trigger hang 4520 * 4521 * Attempt to reset the GPU if it has hung (all asics). 4522 * Attempt to do soft-reset or full-reset and reinitialize Asic 4523 * Returns 0 for success or an error on failure. 4524 */ 4525 4526 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 4527 struct amdgpu_job *job) 4528 { 4529 struct list_head device_list, *device_list_handle = NULL; 4530 bool need_full_reset = false; 4531 bool job_signaled = false; 4532 struct amdgpu_hive_info *hive = NULL; 4533 struct amdgpu_device *tmp_adev = NULL; 4534 int i, r = 0; 4535 bool need_emergency_restart = false; 4536 bool audio_suspended = false; 4537 4538 /* 4539 * Special case: RAS triggered and full reset isn't supported 4540 */ 4541 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 4542 4543 /* 4544 * Flush RAM to disk so that after reboot 4545 * the user can read log and see why the system rebooted. 4546 */ 4547 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 4548 DRM_WARN("Emergency reboot."); 4549 4550 ksys_sync_helper(); 4551 emergency_restart(); 4552 } 4553 4554 dev_info(adev->dev, "GPU %s begin!\n", 4555 need_emergency_restart ? "jobs stop":"reset"); 4556 4557 /* 4558 * Here we trylock to avoid chain of resets executing from 4559 * either trigger by jobs on different adevs in XGMI hive or jobs on 4560 * different schedulers for same device while this TO handler is running. 4561 * We always reset all schedulers for device and all devices for XGMI 4562 * hive so that should take care of them too. 4563 */ 4564 hive = amdgpu_get_xgmi_hive(adev); 4565 if (hive) { 4566 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) { 4567 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", 4568 job ? job->base.id : -1, hive->hive_id); 4569 amdgpu_put_xgmi_hive(hive); 4570 return 0; 4571 } 4572 mutex_lock(&hive->hive_lock); 4573 } 4574 4575 /* 4576 * Build list of devices to reset. 4577 * In case we are in XGMI hive mode, resort the device list 4578 * to put adev in the 1st position. 4579 */ 4580 INIT_LIST_HEAD(&device_list); 4581 if (adev->gmc.xgmi.num_physical_nodes > 1) { 4582 if (!hive) 4583 return -ENODEV; 4584 if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list)) 4585 list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list); 4586 device_list_handle = &hive->device_list; 4587 } else { 4588 list_add_tail(&adev->gmc.xgmi.head, &device_list); 4589 device_list_handle = &device_list; 4590 } 4591 4592 /* block all schedulers and reset given job's ring */ 4593 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4594 if (!amdgpu_device_lock_adev(tmp_adev, hive)) { 4595 dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress", 4596 job ? job->base.id : -1); 4597 r = 0; 4598 goto skip_recovery; 4599 } 4600 4601 /* 4602 * Try to put the audio codec into suspend state 4603 * before gpu reset started. 4604 * 4605 * Due to the power domain of the graphics device 4606 * is shared with AZ power domain. Without this, 4607 * we may change the audio hardware from behind 4608 * the audio driver's back. That will trigger 4609 * some audio codec errors. 
4610 */ 4611 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 4612 audio_suspended = true; 4613 4614 amdgpu_ras_set_error_query_ready(tmp_adev, false); 4615 4616 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 4617 4618 if (!amdgpu_sriov_vf(tmp_adev)) 4619 amdgpu_amdkfd_pre_reset(tmp_adev); 4620 4621 /* 4622 * Mark these ASICs to be reseted as untracked first 4623 * And add them back after reset completed 4624 */ 4625 amdgpu_unregister_gpu_instance(tmp_adev); 4626 4627 amdgpu_fbdev_set_suspend(tmp_adev, 1); 4628 4629 /* disable ras on ALL IPs */ 4630 if (!need_emergency_restart && 4631 amdgpu_device_ip_need_full_reset(tmp_adev)) 4632 amdgpu_ras_suspend(tmp_adev); 4633 4634 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4635 struct amdgpu_ring *ring = tmp_adev->rings[i]; 4636 4637 if (!ring || !ring->sched.thread) 4638 continue; 4639 4640 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 4641 4642 if (need_emergency_restart) 4643 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 4644 } 4645 } 4646 4647 if (need_emergency_restart) 4648 goto skip_sched_resume; 4649 4650 /* 4651 * Must check guilty signal here since after this point all old 4652 * HW fences are force signaled. 4653 * 4654 * job->base holds a reference to parent fence 4655 */ 4656 if (job && job->base.s_fence->parent && 4657 dma_fence_is_signaled(job->base.s_fence->parent)) { 4658 job_signaled = true; 4659 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 4660 goto skip_hw_reset; 4661 } 4662 4663 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 4664 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4665 r = amdgpu_device_pre_asic_reset(tmp_adev, 4666 (tmp_adev == adev) ? job : NULL, 4667 &need_full_reset); 4668 /*TODO Should we stop ?*/ 4669 if (r) { 4670 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 4671 r, adev_to_drm(tmp_adev)->unique); 4672 tmp_adev->asic_reset_res = r; 4673 } 4674 } 4675 4676 /* Actual ASIC resets if needed.*/ 4677 /* TODO Implement XGMI hive reset logic for SRIOV */ 4678 if (amdgpu_sriov_vf(adev)) { 4679 r = amdgpu_device_reset_sriov(adev, job ? false : true); 4680 if (r) 4681 adev->asic_reset_res = r; 4682 } else { 4683 r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false); 4684 if (r && r == -EAGAIN) 4685 goto retry; 4686 } 4687 4688 skip_hw_reset: 4689 4690 /* Post ASIC reset for all devs .*/ 4691 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4692 4693 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4694 struct amdgpu_ring *ring = tmp_adev->rings[i]; 4695 4696 if (!ring || !ring->sched.thread) 4697 continue; 4698 4699 /* No point to resubmit jobs if we didn't HW reset*/ 4700 if (!tmp_adev->asic_reset_res && !job_signaled) 4701 drm_sched_resubmit_jobs(&ring->sched); 4702 4703 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); 4704 } 4705 4706 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) { 4707 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 4708 } 4709 4710 tmp_adev->asic_reset_res = 0; 4711 4712 if (r) { 4713 /* bad news, how to tell it to userspace ? 
*/ 4714 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 4715 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 4716 } else { 4717 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 4718 } 4719 } 4720 4721 skip_sched_resume: 4722 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4723 /*unlock kfd: SRIOV would do it separately */ 4724 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 4725 amdgpu_amdkfd_post_reset(tmp_adev); 4726 if (audio_suspended) 4727 amdgpu_device_resume_display_audio(tmp_adev); 4728 amdgpu_device_unlock_adev(tmp_adev); 4729 } 4730 4731 skip_recovery: 4732 if (hive) { 4733 atomic_set(&hive->in_reset, 0); 4734 mutex_unlock(&hive->hive_lock); 4735 amdgpu_put_xgmi_hive(hive); 4736 } 4737 4738 if (r) 4739 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 4740 return r; 4741 } 4742 4743 /** 4744 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 4745 * 4746 * @adev: amdgpu_device pointer 4747 * 4748 * Fetchs and stores in the driver the PCIE capabilities (gen speed 4749 * and lanes) of the slot the device is in. Handles APUs and 4750 * virtualized environments where PCIE config space may not be available. 4751 */ 4752 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 4753 { 4754 struct pci_dev *pdev; 4755 enum pci_bus_speed speed_cap, platform_speed_cap; 4756 enum pcie_link_width platform_link_width; 4757 4758 if (amdgpu_pcie_gen_cap) 4759 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 4760 4761 if (amdgpu_pcie_lane_cap) 4762 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 4763 4764 /* covers APUs as well */ 4765 if (pci_is_root_bus(adev->pdev->bus)) { 4766 if (adev->pm.pcie_gen_mask == 0) 4767 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 4768 if (adev->pm.pcie_mlw_mask == 0) 4769 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 4770 return; 4771 } 4772 4773 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 4774 return; 4775 4776 pcie_bandwidth_available(adev->pdev, NULL, 4777 &platform_speed_cap, &platform_link_width); 4778 4779 if (adev->pm.pcie_gen_mask == 0) { 4780 /* asic caps */ 4781 pdev = adev->pdev; 4782 speed_cap = pcie_get_speed_cap(pdev); 4783 if (speed_cap == PCI_SPEED_UNKNOWN) { 4784 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4785 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4786 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 4787 } else { 4788 if (speed_cap == PCIE_SPEED_16_0GT) 4789 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4790 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4791 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 4792 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 4793 else if (speed_cap == PCIE_SPEED_8_0GT) 4794 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4795 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4796 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 4797 else if (speed_cap == PCIE_SPEED_5_0GT) 4798 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4799 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 4800 else 4801 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 4802 } 4803 /* platform caps */ 4804 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 4805 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4806 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 4807 } else { 4808 if (platform_speed_cap == PCIE_SPEED_16_0GT) 4809 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 
4810 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4811 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 4812 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 4813 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 4814 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4815 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4816 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 4817 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 4818 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4819 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 4820 else 4821 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 4822 4823 } 4824 } 4825 if (adev->pm.pcie_mlw_mask == 0) { 4826 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 4827 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 4828 } else { 4829 switch (platform_link_width) { 4830 case PCIE_LNK_X32: 4831 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 4832 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 4833 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 4834 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4835 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4836 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4837 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4838 break; 4839 case PCIE_LNK_X16: 4840 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 4841 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 4842 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4843 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4844 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4845 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4846 break; 4847 case PCIE_LNK_X12: 4848 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 4849 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4850 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4851 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4852 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4853 break; 4854 case PCIE_LNK_X8: 4855 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4856 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4857 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4858 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4859 break; 4860 case PCIE_LNK_X4: 4861 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4862 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4863 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4864 break; 4865 case PCIE_LNK_X2: 4866 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4867 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4868 break; 4869 case PCIE_LNK_X1: 4870 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 4871 break; 4872 default: 4873 break; 4874 } 4875 } 4876 } 4877 } 4878 4879 int amdgpu_device_baco_enter(struct drm_device *dev) 4880 { 4881 struct amdgpu_device *adev = drm_to_adev(dev); 4882 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 4883 4884 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 4885 return -ENOTSUPP; 4886 4887 if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt) 4888 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 4889 4890 return amdgpu_dpm_baco_enter(adev); 4891 } 4892 4893 int amdgpu_device_baco_exit(struct drm_device *dev) 4894 { 4895 struct amdgpu_device *adev = drm_to_adev(dev); 4896 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 4897 int ret = 0; 4898 4899 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 4900 return -ENOTSUPP; 4901 4902 ret = amdgpu_dpm_baco_exit(adev); 4903 if (ret) 4904 return ret; 4905 4906 if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt) 4907 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 4908 4909 return 0; 4910 } 4911 4912 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev) 4913 { 4914 int i; 4915 4916 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4917 struct amdgpu_ring *ring = 
adev->rings[i]; 4918 4919 if (!ring || !ring->sched.thread) 4920 continue; 4921 4922 cancel_delayed_work_sync(&ring->sched.work_tdr); 4923 } 4924 } 4925 4926 /** 4927 * amdgpu_pci_error_detected - Called when a PCI error is detected. 4928 * @pdev: PCI device struct 4929 * @state: PCI channel state 4930 * 4931 * Description: Called when a PCI error is detected. 4932 * 4933 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 4934 */ 4935 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 4936 { 4937 struct drm_device *dev = pci_get_drvdata(pdev); 4938 struct amdgpu_device *adev = drm_to_adev(dev); 4939 int i; 4940 4941 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 4942 4943 if (adev->gmc.xgmi.num_physical_nodes > 1) { 4944 DRM_WARN("No support for XGMI hive yet..."); 4945 return PCI_ERS_RESULT_DISCONNECT; 4946 } 4947 4948 switch (state) { 4949 case pci_channel_io_normal: 4950 return PCI_ERS_RESULT_CAN_RECOVER; 4951 /* Fatal error, prepare for slot reset */ 4952 case pci_channel_io_frozen: 4953 /* 4954 * Cancel and wait for all TDRs in progress if failing to 4955 * set adev->in_gpu_reset in amdgpu_device_lock_adev 4956 * 4957 * Locking adev->reset_sem will prevent any external access 4958 * to GPU during PCI error recovery 4959 */ 4960 while (!amdgpu_device_lock_adev(adev, NULL)) 4961 amdgpu_cancel_all_tdr(adev); 4962 4963 /* 4964 * Block any work scheduling as we do for regular GPU reset 4965 * for the duration of the recovery 4966 */ 4967 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4968 struct amdgpu_ring *ring = adev->rings[i]; 4969 4970 if (!ring || !ring->sched.thread) 4971 continue; 4972 4973 drm_sched_stop(&ring->sched, NULL); 4974 } 4975 return PCI_ERS_RESULT_NEED_RESET; 4976 case pci_channel_io_perm_failure: 4977 /* Permanent error, prepare for device removal */ 4978 return PCI_ERS_RESULT_DISCONNECT; 4979 } 4980 4981 return PCI_ERS_RESULT_NEED_RESET; 4982 } 4983 4984 /** 4985 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 4986 * @pdev: pointer to PCI device 4987 */ 4988 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 4989 { 4990 4991 DRM_INFO("PCI error: mmio enabled callback!!\n"); 4992 4993 /* TODO - dump whatever for debugging purposes */ 4994 4995 /* This called only if amdgpu_pci_error_detected returns 4996 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 4997 * works, no need to reset slot. 4998 */ 4999 5000 return PCI_ERS_RESULT_RECOVERED; 5001 } 5002 5003 /** 5004 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 5005 * @pdev: PCI device struct 5006 * 5007 * Description: This routine is called by the pci error recovery 5008 * code after the PCI slot has been reset, just before we 5009 * should resume normal operations. 
5010 */ 5011 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 5012 { 5013 struct drm_device *dev = pci_get_drvdata(pdev); 5014 struct amdgpu_device *adev = drm_to_adev(dev); 5015 int r, i; 5016 bool need_full_reset = true; 5017 u32 memsize; 5018 struct list_head device_list; 5019 5020 DRM_INFO("PCI error: slot reset callback!!\n"); 5021 5022 INIT_LIST_HEAD(&device_list); 5023 list_add_tail(&adev->gmc.xgmi.head, &device_list); 5024 5025 /* wait for asic to come out of reset */ 5026 msleep(500); 5027 5028 /* Restore PCI confspace */ 5029 amdgpu_device_load_pci_state(pdev); 5030 5031 /* confirm ASIC came out of reset */ 5032 for (i = 0; i < adev->usec_timeout; i++) { 5033 memsize = amdgpu_asic_get_config_memsize(adev); 5034 5035 if (memsize != 0xffffffff) 5036 break; 5037 udelay(1); 5038 } 5039 if (memsize == 0xffffffff) { 5040 r = -ETIME; 5041 goto out; 5042 } 5043 5044 adev->in_pci_err_recovery = true; 5045 r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset); 5046 adev->in_pci_err_recovery = false; 5047 if (r) 5048 goto out; 5049 5050 r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true); 5051 5052 out: 5053 if (!r) { 5054 if (amdgpu_device_cache_pci_state(adev->pdev)) 5055 pci_restore_state(adev->pdev); 5056 5057 DRM_INFO("PCIe error recovery succeeded\n"); 5058 } else { 5059 DRM_ERROR("PCIe error recovery failed, err:%d", r); 5060 amdgpu_device_unlock_adev(adev); 5061 } 5062 5063 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 5064 } 5065 5066 /** 5067 * amdgpu_pci_resume() - resume normal ops after PCI reset 5068 * @pdev: pointer to PCI device 5069 * 5070 * Called when the error recovery driver tells us that its 5071 * OK to resume normal operation. 5072 */ 5073 void amdgpu_pci_resume(struct pci_dev *pdev) 5074 { 5075 struct drm_device *dev = pci_get_drvdata(pdev); 5076 struct amdgpu_device *adev = drm_to_adev(dev); 5077 int i; 5078 5079 5080 DRM_INFO("PCI error: resume callback!!\n"); 5081 5082 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5083 struct amdgpu_ring *ring = adev->rings[i]; 5084 5085 if (!ring || !ring->sched.thread) 5086 continue; 5087 5088 5089 drm_sched_resubmit_jobs(&ring->sched); 5090 drm_sched_start(&ring->sched, true); 5091 } 5092 5093 amdgpu_device_unlock_adev(adev); 5094 } 5095 5096 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 5097 { 5098 struct drm_device *dev = pci_get_drvdata(pdev); 5099 struct amdgpu_device *adev = drm_to_adev(dev); 5100 int r; 5101 5102 r = pci_save_state(pdev); 5103 if (!r) { 5104 kfree(adev->pci_state); 5105 5106 adev->pci_state = pci_store_saved_state(pdev); 5107 5108 if (!adev->pci_state) { 5109 DRM_ERROR("Failed to store PCI saved state"); 5110 return false; 5111 } 5112 } else { 5113 DRM_WARN("Failed to save PCI state, err:%d\n", r); 5114 return false; 5115 } 5116 5117 return true; 5118 } 5119 5120 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 5121 { 5122 struct drm_device *dev = pci_get_drvdata(pdev); 5123 struct amdgpu_device *adev = drm_to_adev(dev); 5124 int r; 5125 5126 if (!adev->pci_state) 5127 return false; 5128 5129 r = pci_load_saved_state(pdev, adev->pci_state); 5130 5131 if (!r) { 5132 pci_restore_state(pdev); 5133 } else { 5134 DRM_WARN("Failed to load PCI state, err:%d\n", r); 5135 return false; 5136 } 5137 5138 return true; 5139 } 5140 5141 5142