/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/pci.h>
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/green_sardine_gpu_info.bin");

#define AMDGPU_RESUME_MS	2000

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"NAVI10",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
"VANGOGH", 120 "DIMGREY_CAVEFISH", 121 "LAST", 122 }; 123 124 /** 125 * DOC: pcie_replay_count 126 * 127 * The amdgpu driver provides a sysfs API for reporting the total number 128 * of PCIe replays (NAKs) 129 * The file pcie_replay_count is used for this and returns the total 130 * number of replays as a sum of the NAKs generated and NAKs received 131 */ 132 133 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 134 struct device_attribute *attr, char *buf) 135 { 136 struct drm_device *ddev = dev_get_drvdata(dev); 137 struct amdgpu_device *adev = drm_to_adev(ddev); 138 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 139 140 return snprintf(buf, PAGE_SIZE, "%llu\n", cnt); 141 } 142 143 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 144 amdgpu_device_get_pcie_replay_count, NULL); 145 146 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 147 148 /** 149 * DOC: product_name 150 * 151 * The amdgpu driver provides a sysfs API for reporting the product name 152 * for the device 153 * The file serial_number is used for this and returns the product name 154 * as returned from the FRU. 155 * NOTE: This is only available for certain server cards 156 */ 157 158 static ssize_t amdgpu_device_get_product_name(struct device *dev, 159 struct device_attribute *attr, char *buf) 160 { 161 struct drm_device *ddev = dev_get_drvdata(dev); 162 struct amdgpu_device *adev = drm_to_adev(ddev); 163 164 return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name); 165 } 166 167 static DEVICE_ATTR(product_name, S_IRUGO, 168 amdgpu_device_get_product_name, NULL); 169 170 /** 171 * DOC: product_number 172 * 173 * The amdgpu driver provides a sysfs API for reporting the part number 174 * for the device 175 * The file serial_number is used for this and returns the part number 176 * as returned from the FRU. 177 * NOTE: This is only available for certain server cards 178 */ 179 180 static ssize_t amdgpu_device_get_product_number(struct device *dev, 181 struct device_attribute *attr, char *buf) 182 { 183 struct drm_device *ddev = dev_get_drvdata(dev); 184 struct amdgpu_device *adev = drm_to_adev(ddev); 185 186 return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number); 187 } 188 189 static DEVICE_ATTR(product_number, S_IRUGO, 190 amdgpu_device_get_product_number, NULL); 191 192 /** 193 * DOC: serial_number 194 * 195 * The amdgpu driver provides a sysfs API for reporting the serial number 196 * for the device 197 * The file serial_number is used for this and returns the serial number 198 * as returned from the FRU. 199 * NOTE: This is only available for certain server cards 200 */ 201 202 static ssize_t amdgpu_device_get_serial_number(struct device *dev, 203 struct device_attribute *attr, char *buf) 204 { 205 struct drm_device *ddev = dev_get_drvdata(dev); 206 struct amdgpu_device *adev = drm_to_adev(ddev); 207 208 return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial); 209 } 210 211 static DEVICE_ATTR(serial_number, S_IRUGO, 212 amdgpu_device_get_serial_number, NULL); 213 214 /** 215 * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control 216 * 217 * @dev: drm_device pointer 218 * 219 * Returns true if the device is a dGPU with HG/PX power control, 220 * otherwise return false. 
/**
 * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with HG/PX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->flags & AMD_IS_PX)
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise return false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size in bytes; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       uint32_t *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0;
	uint64_t last;


#ifdef CONFIG_64BIT
	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		void __iomem *addr = adev->mman.aper_base_kaddr + pos;
		size_t count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_asic_flush_hdp(adev, NULL);
		} else {
			amdgpu_asic_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

		if (count == size)
			return;

		pos += count;
		buf += count / 4;
		size -= count;
	}
#endif

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		uint32_t tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *buf++);
		else
			*buf++ = RREG32_NO_KIQ(mmMM_DATA);
	}
	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
}

/*
 * register access helper functions.
 */
/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (adev->in_pci_err_recovery)
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}
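/*
 * A minimal usage sketch for the helpers above (illustrative only; the
 * offset, buffer and flags are hypothetical): copy a few dwords out of VRAM
 * into a stack buffer and read a single register.
 *
 *	uint32_t data[4];
 *	uint32_t val;
 *
 *	amdgpu_device_vram_access(adev, 0x1000, data, sizeof(data), false);
 *	val = amdgpu_device_rreg(adev, reg_offset, AMDGPU_REGS_NO_KIQ);
 */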
/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 *
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (adev->in_pci_err_recovery)
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 *
 */
/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (adev->in_pci_err_recovery)
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (adev->in_pci_err_recovery)
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_sem)) {
			amdgpu_kiq_wreg(adev, reg, v);
			up_read(&adev->reset_sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/*
 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range
 *
 * this function is invoked only for debugfs register access
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v)
{
	if (adev->in_pci_err_recovery)
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_io_rreg - read an IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 *
 * Returns the 32 bit value from the offset specified.
 */
u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
{
	if (adev->in_pci_err_recovery)
		return 0;

	if ((reg * 4) < adev->rio_mem_size)
		return ioread32(adev->rio_mem + (reg * 4));
	else {
		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
		return ioread32(adev->rio_mem + (mmMM_DATA * 4));
	}
}
/**
 * amdgpu_io_wreg - write to an IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
{
	if (adev->in_pci_err_recovery)
		return;

	if ((reg * 4) < adev->rio_mem_size)
		iowrite32(v, adev->rio_mem + (reg * 4));
	else {
		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
		iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
	}
}

/**
 * amdgpu_mm_rdoorbell - read a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (CIK).
 */
u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
{
	if (adev->in_pci_err_recovery)
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return readl(adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell - write a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (CIK).
 */
void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
{
	if (adev->in_pci_err_recovery)
		return;

	if (index < adev->doorbell.num_doorbells) {
		writel(v, adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
{
	if (adev->in_pci_err_recovery)
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}
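/*
 * Illustrative only: ring code typically bumps its write pointer and then
 * "rings" the doorbell so the hardware picks up the new work. A rough
 * sketch, assuming a ring that has a valid doorbell index assigned:
 *
 *	if (ring->use_doorbell)
 *		amdgpu_mm_wdoorbell(adev, ring->doorbell_index,
 *				    lower_32_bits(ring->wptr));
 */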
/**
 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
{
	if (adev->in_pci_err_recovery)
		return;

	if (index < adev->doorbell.num_doorbells) {
		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 pcie_index, u32 pcie_data,
				u32 reg_addr)
{
	unsigned long flags;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 pcie_index, u32 pcie_data,
				  u32 reg_addr)
{
	unsigned long flags;
	u64 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 pcie_index, u32 pcie_data,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
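/*
 * Usage sketch (illustrative, not a definitive implementation): an ASIC's
 * pcie_rreg/pcie_wreg callbacks are typically thin wrappers that pass the
 * chip's PCIE index/data register pair to the helpers above, roughly:
 *
 *	static u32 example_pcie_rreg(struct amdgpu_device *adev, u32 reg)
 *	{
 *		return amdgpu_device_indirect_rreg(adev, mmPCIE_INDEX2,
 *						   mmPCIE_DATA2, reg);
 *	}
 *
 * mmPCIE_INDEX2/mmPCIE_DATA2 stand in for whatever index/data pair the
 * ASIC actually uses.
 */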
/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 pcie_index, u32 pcie_data,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}
/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	amdgpu_asic_pre_asic_init(adev);

	return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}

/**
 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
				       &adev->vram_scratch.robj,
				       &adev->vram_scratch.gpu_addr,
				       (void **)&adev->vram_scratch.ptr);
}

/**
 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}
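/*
 * The register array is consumed as {offset, and_mask, or_mask} triplets.
 * A hypothetical golden-register table and its application would look
 * roughly like this (the register names and values below are made up for
 * illustration):
 *
 *	static const u32 example_golden_settings[] = {
 *		mmEXAMPLE_REG_A, 0x0000ffff, 0x00000100,
 *		mmEXAMPLE_REG_B, 0xffffffff, 0xdeadbeef,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *						ARRAY_SIZE(example_golden_settings));
 */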
/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/*
 * GPU doorbell aperture helpers function.
 */
/**
 * amdgpu_device_doorbell_init - Init doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Init doorbell driver information (CIK)
 * Returns 0 on success, error on failure.
 */
static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
{

	/* No doorbell on SI hardware generation */
	if (adev->asic_type < CHIP_BONAIRE) {
		adev->doorbell.base = 0;
		adev->doorbell.size = 0;
		adev->doorbell.num_doorbells = 0;
		adev->doorbell.ptr = NULL;
		return 0;
	}

	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
		return -EINVAL;

	amdgpu_asic_init_doorbell_index(adev);

	/* doorbell bar mapping */
	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
	adev->doorbell.size = pci_resource_len(adev->pdev, 2);

	adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
					     adev->doorbell_index.max_assignment+1);
	if (adev->doorbell.num_doorbells == 0)
		return -EINVAL;

	/* For Vega, reserve and map two pages on doorbell BAR since SDMA
	 * paging queue doorbell use the second page. The
	 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
	 * doorbells are in the first page. So with paging queue enabled,
	 * the max num_doorbells should be extended by one more page
	 * (0x400 dwords).
	 */
	if (adev->asic_type >= CHIP_VEGA10)
		adev->doorbell.num_doorbells += 0x400;

	adev->doorbell.ptr = ioremap(adev->doorbell.base,
				     adev->doorbell.num_doorbells *
				     sizeof(u32));
	if (adev->doorbell.ptr == NULL)
		return -ENOMEM;

	return 0;
}

/**
 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down doorbell driver information (CIK)
 */
static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
{
	iounmap(adev->doorbell.ptr);
	adev->doorbell.ptr = NULL;
}



/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}
/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or an -error on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}
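/*
 * Typical (illustrative) use of a writeback slot by ring code: allocate a
 * slot, derive its GPU address and CPU pointer from the returned dword
 * offset, and release it again when done. A sketch, not lifted verbatim
 * from any one caller:
 *
 *	u32 wb;
 *
 *	if (!amdgpu_device_wb_get(adev, &wb)) {
 *		u64 gpu_addr = adev->wb.gpu_addr + (wb * 4);
 *		volatile u32 *cpu_ptr = &adev->wb.wb[wb];
 *
 *		// ... let the GPU write status to gpu_addr, poll *cpu_ptr ...
 *		amdgpu_device_wb_free(adev, wb);
 *	}
 */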
/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
	u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
	struct pci_bus *root;
	struct resource *res;
	unsigned i;
	u16 cmd;
	int r;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_device_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_device_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw needs to be posted or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if post is needed or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In the whole GPU pass-through virtualization case,
		 * after a VM reboot some old SMC firmware still needs the driver
		 * to do a vPost, otherwise the GPU hangs. SMC firmware versions
		 * above 22.15 don't have this flaw, so we force a vPost for SMC
		 * versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;
			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @cookie: amdgpu_device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
{
	struct amdgpu_device *adev = cookie;
	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}
/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines number of bits in page table versus page directory,
 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
 * page table and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}
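/*
 * Worked example for the validators above (numbers only, nothing device
 * specific): with 4KB pages there is a 12-bit page offset, so a vm_block_size
 * of 9 means each page table covers 2^9 * 4KB = 2MB of GPU address space and
 * the remaining address bits are resolved through the page directory. These
 * checks apply to module parameters such as
 *
 *	modprobe amdgpu vm_size=256 vm_block_size=9 smu_memory_pool_size=2
 *
 * where the values shown are purely illustrative.
 */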
/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	amdgpu_gmc_tmz_set(adev);

	amdgpu_gmc_noretry_set(adev);

	return 0;
}
/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes the
 * asics before or after they are powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(dev->pdev, PCI_D0);
		amdgpu_device_load_pci_state(dev->pdev);
		r = pci_enable_device(dev->pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
		drm_kms_helper_poll_enable(dev);
	} else {
		pr_info("switched off\n");
		drm_kms_helper_poll_disable(dev);
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(dev->pdev);
		/* Shut down the device */
		pci_disable_device(dev->pdev);
		pci_set_power_state(dev->pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}
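/*
 * Example caller (a sketch; the exact call sites vary by ASIC): an IP block
 * or power-management code can gate or ungate another block by type, e.g.
 *
 *	amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *					       AMD_CG_STATE_GATE);
 *
 * The matching ungate uses AMD_CG_STATE_UNGATE.
 */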
/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u32 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}
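/*
 * Usage sketch (illustrative only): before touching a block's hardware
 * state, callers can drain it with the helpers above, for example
 *
 *	if (!amdgpu_device_ip_is_idle(adev, AMD_IP_BLOCK_TYPE_GMC))
 *		r = amdgpu_device_ip_wait_for_idle(adev, AMD_IP_BLOCK_TYPE_GMC);
 */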
/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if equal or greater
 * return 1 if smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}
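/*
 * Usage sketch: each ASIC's <soc>_set_ip_blocks() registers its IP blocks in
 * initialization order with this helper, roughly along the lines of (the
 * block names below are only examples):
 *
 *	amdgpu_device_ip_block_add(adev, &vi_common_ip_block);
 *	amdgpu_device_ip_block_add(adev, &gmc_v8_0_ip_block);
 *	amdgpu_device_ip_block_add(adev, &tonga_ih_ip_block);
 */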
/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		struct drm_device *ddev = adev_to_drm(adev);
		const char *pci_address_name = pci_name(ddev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}
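/*
 * The configuration string parsed above is a semicolon-separated list of
 * "<pci address>,<number of crtcs>" entries (or "all"). Purely as an
 * illustration, something like
 *
 *	modprobe amdgpu virtual_display=0000:03:00.0,2
 *
 * would enable a virtual display with two CRTCs on the device at PCI address
 * 0000:03:00.0; the address shown is hypothetical.
 */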
/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	char fw_name[40];
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	if (adev->mman.discovery_bin) {
		amdgpu_discovery_get_gfx_info(adev);

		/*
		 * FIXME: The bounding box is still needed by Navi12, so
		 * temporarily read it from gpu_info firmware. Should be dropped
		 * when DAL no longer needs it.
		 */
		if (adev->asic_type != CHIP_NAVI12)
			return 0;
	}

	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_VERDE:
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_OLAND:
	case CHIP_HAINAN:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TOPAZ:
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
	case CHIP_VEGA20:
	case CHIP_SIENNA_CICHLID:
	case CHIP_NAVY_FLOUNDER:
	case CHIP_DIMGREY_CAVEFISH:
	default:
		return 0;
	case CHIP_VEGA10:
		chip_name = "vega10";
		break;
	case CHIP_VEGA12:
		chip_name = "vega12";
		break;
	case CHIP_RAVEN:
		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
			chip_name = "raven2";
		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
			chip_name = "picasso";
		else
			chip_name = "raven";
		break;
	case CHIP_ARCTURUS:
		chip_name = "arcturus";
		break;
	case CHIP_RENOIR:
		if (adev->apu_flags & AMD_APU_IS_RENOIR)
			chip_name = "renoir";
		else
			chip_name = "green_sardine";
		break;
	case CHIP_NAVI10:
		chip_name = "navi10";
		break;
	case CHIP_NAVI14:
		chip_name = "navi14";
		break;
	case CHIP_NAVI12:
		chip_name = "navi12";
		break;
	case CHIP_VANGOGH:
		chip_name = "vangogh";
		break;
	}

	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
	err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
	if (err) {
		dev_err(adev->dev,
			"Failed to load gpu_info firmware \"%s\"\n",
			fw_name);
		goto out;
	}
	err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
	if (err) {
		dev_err(adev->dev,
			"Failed to validate gpu_info firmware \"%s\"\n",
			fw_name);
		goto out;
	}

	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);

	switch (hdr->version_major) {
	case 1:
	{
		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
								le32_to_cpu(hdr->header.ucode_array_offset_bytes));

		/*
		 * Should be dropped when DAL no longer needs it.
		 */
		if (adev->asic_type == CHIP_NAVI12)
			goto parse_soc_bounding_box;

		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
		adev->gfx.config.max_texture_channel_caches =
			le32_to_cpu(gpu_info_fw->gc_num_tccs);
		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
		adev->gfx.config.double_offchip_lds_buf =
			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
		adev->gfx.cu_info.max_waves_per_simd =
			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
		adev->gfx.cu_info.max_scratch_slots_per_cu =
			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
		if (hdr->version_minor >= 1) {
			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->gfx.config.num_sc_per_sh =
				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
			adev->gfx.config.num_packer_per_sc =
				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
		}

parse_soc_bounding_box:
		/*
		 * soc bounding box info is not integrated in discovery table,
		 * we always need to parse it from gpu info firmware if needed.
		 */
		if (hdr->version_minor == 2) {
			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
		}
		break;
	}
	default:
		dev_err(adev->dev,
			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
		err = -EINVAL;
		goto out;
	}
out:
	return err;
}
1919 */ 1920 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 1921 { 1922 int i, r; 1923 1924 amdgpu_device_enable_virtual_display(adev); 1925 1926 if (amdgpu_sriov_vf(adev)) { 1927 r = amdgpu_virt_request_full_gpu(adev, true); 1928 if (r) 1929 return r; 1930 } 1931 1932 switch (adev->asic_type) { 1933 #ifdef CONFIG_DRM_AMDGPU_SI 1934 case CHIP_VERDE: 1935 case CHIP_TAHITI: 1936 case CHIP_PITCAIRN: 1937 case CHIP_OLAND: 1938 case CHIP_HAINAN: 1939 adev->family = AMDGPU_FAMILY_SI; 1940 r = si_set_ip_blocks(adev); 1941 if (r) 1942 return r; 1943 break; 1944 #endif 1945 #ifdef CONFIG_DRM_AMDGPU_CIK 1946 case CHIP_BONAIRE: 1947 case CHIP_HAWAII: 1948 case CHIP_KAVERI: 1949 case CHIP_KABINI: 1950 case CHIP_MULLINS: 1951 if (adev->flags & AMD_IS_APU) 1952 adev->family = AMDGPU_FAMILY_KV; 1953 else 1954 adev->family = AMDGPU_FAMILY_CI; 1955 1956 r = cik_set_ip_blocks(adev); 1957 if (r) 1958 return r; 1959 break; 1960 #endif 1961 case CHIP_TOPAZ: 1962 case CHIP_TONGA: 1963 case CHIP_FIJI: 1964 case CHIP_POLARIS10: 1965 case CHIP_POLARIS11: 1966 case CHIP_POLARIS12: 1967 case CHIP_VEGAM: 1968 case CHIP_CARRIZO: 1969 case CHIP_STONEY: 1970 if (adev->flags & AMD_IS_APU) 1971 adev->family = AMDGPU_FAMILY_CZ; 1972 else 1973 adev->family = AMDGPU_FAMILY_VI; 1974 1975 r = vi_set_ip_blocks(adev); 1976 if (r) 1977 return r; 1978 break; 1979 case CHIP_VEGA10: 1980 case CHIP_VEGA12: 1981 case CHIP_VEGA20: 1982 case CHIP_RAVEN: 1983 case CHIP_ARCTURUS: 1984 case CHIP_RENOIR: 1985 if (adev->flags & AMD_IS_APU) 1986 adev->family = AMDGPU_FAMILY_RV; 1987 else 1988 adev->family = AMDGPU_FAMILY_AI; 1989 1990 r = soc15_set_ip_blocks(adev); 1991 if (r) 1992 return r; 1993 break; 1994 case CHIP_NAVI10: 1995 case CHIP_NAVI14: 1996 case CHIP_NAVI12: 1997 case CHIP_SIENNA_CICHLID: 1998 case CHIP_NAVY_FLOUNDER: 1999 case CHIP_DIMGREY_CAVEFISH: 2000 case CHIP_VANGOGH: 2001 if (adev->asic_type == CHIP_VANGOGH) 2002 adev->family = AMDGPU_FAMILY_VGH; 2003 else 2004 adev->family = AMDGPU_FAMILY_NV; 2005 2006 r = nv_set_ip_blocks(adev); 2007 if (r) 2008 return r; 2009 break; 2010 default: 2011 /* FIXME: not supported yet */ 2012 return -EINVAL; 2013 } 2014 2015 amdgpu_amdkfd_device_probe(adev); 2016 2017 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2018 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2019 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2020 2021 for (i = 0; i < adev->num_ip_blocks; i++) { 2022 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2023 DRM_ERROR("disabled ip block: %d <%s>\n", 2024 i, adev->ip_blocks[i].version->funcs->name); 2025 adev->ip_blocks[i].status.valid = false; 2026 } else { 2027 if (adev->ip_blocks[i].version->funcs->early_init) { 2028 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2029 if (r == -ENOENT) { 2030 adev->ip_blocks[i].status.valid = false; 2031 } else if (r) { 2032 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2033 adev->ip_blocks[i].version->funcs->name, r); 2034 return r; 2035 } else { 2036 adev->ip_blocks[i].status.valid = true; 2037 } 2038 } else { 2039 adev->ip_blocks[i].status.valid = true; 2040 } 2041 } 2042 /* get the vbios after the asic_funcs are set up */ 2043 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2044 r = amdgpu_device_parse_gpu_info_fw(adev); 2045 if (r) 2046 return r; 2047 2048 /* Read BIOS */ 2049 if (!amdgpu_get_bios(adev)) 2050 return -EINVAL; 2051 2052 r = amdgpu_atombios_init(adev); 2053 if (r) { 2054 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2055 
amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2056 return r; 2057 } 2058 } 2059 } 2060 2061 adev->cg_flags &= amdgpu_cg_mask; 2062 adev->pg_flags &= amdgpu_pg_mask; 2063 2064 return 0; 2065 } 2066 2067 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2068 { 2069 int i, r; 2070 2071 for (i = 0; i < adev->num_ip_blocks; i++) { 2072 if (!adev->ip_blocks[i].status.sw) 2073 continue; 2074 if (adev->ip_blocks[i].status.hw) 2075 continue; 2076 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2077 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2078 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2079 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2080 if (r) { 2081 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2082 adev->ip_blocks[i].version->funcs->name, r); 2083 return r; 2084 } 2085 adev->ip_blocks[i].status.hw = true; 2086 } 2087 } 2088 2089 return 0; 2090 } 2091 2092 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2093 { 2094 int i, r; 2095 2096 for (i = 0; i < adev->num_ip_blocks; i++) { 2097 if (!adev->ip_blocks[i].status.sw) 2098 continue; 2099 if (adev->ip_blocks[i].status.hw) 2100 continue; 2101 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2102 if (r) { 2103 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2104 adev->ip_blocks[i].version->funcs->name, r); 2105 return r; 2106 } 2107 adev->ip_blocks[i].status.hw = true; 2108 } 2109 2110 return 0; 2111 } 2112 2113 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2114 { 2115 int r = 0; 2116 int i; 2117 uint32_t smu_version; 2118 2119 if (adev->asic_type >= CHIP_VEGA10) { 2120 for (i = 0; i < adev->num_ip_blocks; i++) { 2121 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2122 continue; 2123 2124 /* no need to do the fw loading again if already done*/ 2125 if (adev->ip_blocks[i].status.hw == true) 2126 break; 2127 2128 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2129 r = adev->ip_blocks[i].version->funcs->resume(adev); 2130 if (r) { 2131 DRM_ERROR("resume of IP block <%s> failed %d\n", 2132 adev->ip_blocks[i].version->funcs->name, r); 2133 return r; 2134 } 2135 } else { 2136 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2137 if (r) { 2138 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2139 adev->ip_blocks[i].version->funcs->name, r); 2140 return r; 2141 } 2142 } 2143 2144 adev->ip_blocks[i].status.hw = true; 2145 break; 2146 } 2147 } 2148 2149 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2150 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2151 2152 return r; 2153 } 2154 2155 /** 2156 * amdgpu_device_ip_init - run init for hardware IPs 2157 * 2158 * @adev: amdgpu_device pointer 2159 * 2160 * Main initialization pass for hardware IPs. The list of all the hardware 2161 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2162 * are run. sw_init initializes the software state associated with each IP 2163 * and hw_init initializes the hardware associated with each IP. 2164 * Returns 0 on success, negative error code on failure. 
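 *
 * A condensed sketch of the ordering implemented below; the sw_init step
 * is the loop in this function, while the three helpers are real functions
 * defined earlier in this file:
 *
 *   sw_init for every valid block (GMC also gets an early hw_init so
 *                                  VRAM scratch/writeback/CSA can be set up)
 *   amdgpu_device_ip_hw_init_phase1(adev);   - COMMON, IH (and PSP for VFs)
 *   amdgpu_device_fw_loading(adev);          - PSP/SMU firmware
 *   amdgpu_device_ip_hw_init_phase2(adev);   - every remaining block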
2165 */ 2166 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2167 { 2168 int i, r; 2169 2170 r = amdgpu_ras_init(adev); 2171 if (r) 2172 return r; 2173 2174 for (i = 0; i < adev->num_ip_blocks; i++) { 2175 if (!adev->ip_blocks[i].status.valid) 2176 continue; 2177 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2178 if (r) { 2179 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2180 adev->ip_blocks[i].version->funcs->name, r); 2181 goto init_failed; 2182 } 2183 adev->ip_blocks[i].status.sw = true; 2184 2185 /* need to do gmc hw init early so we can allocate gpu mem */ 2186 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2187 r = amdgpu_device_vram_scratch_init(adev); 2188 if (r) { 2189 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2190 goto init_failed; 2191 } 2192 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2193 if (r) { 2194 DRM_ERROR("hw_init %d failed %d\n", i, r); 2195 goto init_failed; 2196 } 2197 r = amdgpu_device_wb_init(adev); 2198 if (r) { 2199 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2200 goto init_failed; 2201 } 2202 adev->ip_blocks[i].status.hw = true; 2203 2204 /* right after GMC hw init, we create CSA */ 2205 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2206 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2207 AMDGPU_GEM_DOMAIN_VRAM, 2208 AMDGPU_CSA_SIZE); 2209 if (r) { 2210 DRM_ERROR("allocate CSA failed %d\n", r); 2211 goto init_failed; 2212 } 2213 } 2214 } 2215 } 2216 2217 if (amdgpu_sriov_vf(adev)) 2218 amdgpu_virt_init_data_exchange(adev); 2219 2220 r = amdgpu_ib_pool_init(adev); 2221 if (r) { 2222 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2223 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2224 goto init_failed; 2225 } 2226 2227 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2228 if (r) 2229 goto init_failed; 2230 2231 r = amdgpu_device_ip_hw_init_phase1(adev); 2232 if (r) 2233 goto init_failed; 2234 2235 r = amdgpu_device_fw_loading(adev); 2236 if (r) 2237 goto init_failed; 2238 2239 r = amdgpu_device_ip_hw_init_phase2(adev); 2240 if (r) 2241 goto init_failed; 2242 2243 /* 2244 * retired pages will be loaded from eeprom and reserved here, 2245 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2246 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2247 * for I2C communication which only true at this point. 2248 * 2249 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2250 * failure from bad gpu situation and stop amdgpu init process 2251 * accordingly. For other failed cases, it will still release all 2252 * the resource and print error message, rather than returning one 2253 * negative value to upper level. 2254 * 2255 * Note: theoretically, this should be called before all vram allocations 2256 * to protect retired page from abusing 2257 */ 2258 r = amdgpu_ras_recovery_init(adev); 2259 if (r) 2260 goto init_failed; 2261 2262 if (adev->gmc.xgmi.num_physical_nodes > 1) 2263 amdgpu_xgmi_add_device(adev); 2264 amdgpu_amdkfd_device_init(adev); 2265 2266 amdgpu_fru_get_product_info(adev); 2267 2268 init_failed: 2269 if (amdgpu_sriov_vf(adev)) 2270 amdgpu_virt_release_full_gpu(adev, true); 2271 2272 return r; 2273 } 2274 2275 /** 2276 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2277 * 2278 * @adev: amdgpu_device pointer 2279 * 2280 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2281 * this function before a GPU reset. 
If the value is retained after a 2282 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 2283 */ 2284 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2285 { 2286 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2287 } 2288 2289 /** 2290 * amdgpu_device_check_vram_lost - check if vram is valid 2291 * 2292 * @adev: amdgpu_device pointer 2293 * 2294 * Checks the reset magic value written to the gart pointer in VRAM. 2295 * The driver calls this after a GPU reset to see if the contents of 2296 * VRAM is lost or not. 2297 * Returns true if vram is lost, false if not. 2298 */ 2299 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2300 { 2301 if (memcmp(adev->gart.ptr, adev->reset_magic, 2302 AMDGPU_RESET_MAGIC_NUM)) 2303 return true; 2304 2305 if (!amdgpu_in_reset(adev)) 2306 return false; 2307 2308 /* 2309 * For all ASICs with baco/mode1 reset, the VRAM is 2310 * always assumed to be lost. 2311 */ 2312 switch (amdgpu_asic_reset_method(adev)) { 2313 case AMD_RESET_METHOD_BACO: 2314 case AMD_RESET_METHOD_MODE1: 2315 return true; 2316 default: 2317 return false; 2318 } 2319 } 2320 2321 /** 2322 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2323 * 2324 * @adev: amdgpu_device pointer 2325 * @state: clockgating state (gate or ungate) 2326 * 2327 * The list of all the hardware IPs that make up the asic is walked and the 2328 * set_clockgating_state callbacks are run. 2329 * Late initialization pass enabling clockgating for hardware IPs. 2330 * Fini or suspend pass disabling clockgating for hardware IPs. 2331 * Returns 0 on success, negative error code on failure. 2332 */ 2333 2334 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2335 enum amd_clockgating_state state) 2336 { 2337 int i, j, r; 2338 2339 if (amdgpu_emu_mode == 1) 2340 return 0; 2341 2342 for (j = 0; j < adev->num_ip_blocks; j++) { 2343 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2344 if (!adev->ip_blocks[i].status.late_initialized) 2345 continue; 2346 /* skip CG for VCE/UVD, it's handled specially */ 2347 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2348 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2349 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2350 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2351 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2352 /* enable clockgating to save power */ 2353 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2354 state); 2355 if (r) { 2356 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2357 adev->ip_blocks[i].version->funcs->name, r); 2358 return r; 2359 } 2360 } 2361 } 2362 2363 return 0; 2364 } 2365 2366 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state) 2367 { 2368 int i, j, r; 2369 2370 if (amdgpu_emu_mode == 1) 2371 return 0; 2372 2373 for (j = 0; j < adev->num_ip_blocks; j++) { 2374 i = state == AMD_PG_STATE_GATE ?
j : adev->num_ip_blocks - j - 1; 2375 if (!adev->ip_blocks[i].status.late_initialized) 2376 continue; 2377 /* skip CG for VCE/UVD, it's handled specially */ 2378 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2379 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2380 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2381 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2382 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2383 /* enable powergating to save power */ 2384 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2385 state); 2386 if (r) { 2387 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2388 adev->ip_blocks[i].version->funcs->name, r); 2389 return r; 2390 } 2391 } 2392 } 2393 return 0; 2394 } 2395 2396 static int amdgpu_device_enable_mgpu_fan_boost(void) 2397 { 2398 struct amdgpu_gpu_instance *gpu_ins; 2399 struct amdgpu_device *adev; 2400 int i, ret = 0; 2401 2402 mutex_lock(&mgpu_info.mutex); 2403 2404 /* 2405 * MGPU fan boost feature should be enabled 2406 * only when there are two or more dGPUs in 2407 * the system 2408 */ 2409 if (mgpu_info.num_dgpu < 2) 2410 goto out; 2411 2412 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2413 gpu_ins = &(mgpu_info.gpu_ins[i]); 2414 adev = gpu_ins->adev; 2415 if (!(adev->flags & AMD_IS_APU) && 2416 !gpu_ins->mgpu_fan_enabled) { 2417 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2418 if (ret) 2419 break; 2420 2421 gpu_ins->mgpu_fan_enabled = 1; 2422 } 2423 } 2424 2425 out: 2426 mutex_unlock(&mgpu_info.mutex); 2427 2428 return ret; 2429 } 2430 2431 /** 2432 * amdgpu_device_ip_late_init - run late init for hardware IPs 2433 * 2434 * @adev: amdgpu_device pointer 2435 * 2436 * Late initialization pass for hardware IPs. The list of all the hardware 2437 * IPs that make up the asic is walked and the late_init callbacks are run. 2438 * late_init covers any special initialization that an IP requires 2439 * after all of the have been initialized or something that needs to happen 2440 * late in the init process. 2441 * Returns 0 on success, negative error code on failure. 2442 */ 2443 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2444 { 2445 struct amdgpu_gpu_instance *gpu_instance; 2446 int i = 0, r; 2447 2448 for (i = 0; i < adev->num_ip_blocks; i++) { 2449 if (!adev->ip_blocks[i].status.hw) 2450 continue; 2451 if (adev->ip_blocks[i].version->funcs->late_init) { 2452 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2453 if (r) { 2454 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2455 adev->ip_blocks[i].version->funcs->name, r); 2456 return r; 2457 } 2458 } 2459 adev->ip_blocks[i].status.late_initialized = true; 2460 } 2461 2462 amdgpu_ras_set_error_query_ready(adev, true); 2463 2464 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2465 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2466 2467 amdgpu_device_fill_reset_magic(adev); 2468 2469 r = amdgpu_device_enable_mgpu_fan_boost(); 2470 if (r) 2471 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2472 2473 2474 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2475 mutex_lock(&mgpu_info.mutex); 2476 2477 /* 2478 * Reset device p-state to low as this was booted with high. 2479 * 2480 * This should be performed only after all devices from the same 2481 * hive get initialized. 2482 * 2483 * However, it's unknown how many device in the hive in advance. 2484 * As this is counted one by one during devices initializations. 
2485 * 2486 * So, we wait for all XGMI interlinked devices initialized. 2487 * This may bring some delays as those devices may come from 2488 * different hives. But that should be OK. 2489 */ 2490 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2491 for (i = 0; i < mgpu_info.num_gpu; i++) { 2492 gpu_instance = &(mgpu_info.gpu_ins[i]); 2493 if (gpu_instance->adev->flags & AMD_IS_APU) 2494 continue; 2495 2496 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2497 AMDGPU_XGMI_PSTATE_MIN); 2498 if (r) { 2499 DRM_ERROR("pstate setting failed (%d).\n", r); 2500 break; 2501 } 2502 } 2503 } 2504 2505 mutex_unlock(&mgpu_info.mutex); 2506 } 2507 2508 return 0; 2509 } 2510 2511 /** 2512 * amdgpu_device_ip_fini - run fini for hardware IPs 2513 * 2514 * @adev: amdgpu_device pointer 2515 * 2516 * Main teardown pass for hardware IPs. The list of all the hardware 2517 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2518 * are run. hw_fini tears down the hardware associated with each IP 2519 * and sw_fini tears down any software state associated with each IP. 2520 * Returns 0 on success, negative error code on failure. 2521 */ 2522 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2523 { 2524 int i, r; 2525 2526 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2527 amdgpu_virt_release_ras_err_handler_data(adev); 2528 2529 amdgpu_ras_pre_fini(adev); 2530 2531 if (adev->gmc.xgmi.num_physical_nodes > 1) 2532 amdgpu_xgmi_remove_device(adev); 2533 2534 amdgpu_amdkfd_device_fini(adev); 2535 2536 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2537 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2538 2539 /* need to disable SMC first */ 2540 for (i = 0; i < adev->num_ip_blocks; i++) { 2541 if (!adev->ip_blocks[i].status.hw) 2542 continue; 2543 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2544 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2545 /* XXX handle errors */ 2546 if (r) { 2547 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2548 adev->ip_blocks[i].version->funcs->name, r); 2549 } 2550 adev->ip_blocks[i].status.hw = false; 2551 break; 2552 } 2553 } 2554 2555 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2556 if (!adev->ip_blocks[i].status.hw) 2557 continue; 2558 2559 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2560 /* XXX handle errors */ 2561 if (r) { 2562 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2563 adev->ip_blocks[i].version->funcs->name, r); 2564 } 2565 2566 adev->ip_blocks[i].status.hw = false; 2567 } 2568 2569 2570 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2571 if (!adev->ip_blocks[i].status.sw) 2572 continue; 2573 2574 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2575 amdgpu_ucode_free_bo(adev); 2576 amdgpu_free_static_csa(&adev->virt.csa_obj); 2577 amdgpu_device_wb_fini(adev); 2578 amdgpu_device_vram_scratch_fini(adev); 2579 amdgpu_ib_pool_fini(adev); 2580 } 2581 2582 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2583 /* XXX handle errors */ 2584 if (r) { 2585 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2586 adev->ip_blocks[i].version->funcs->name, r); 2587 } 2588 adev->ip_blocks[i].status.sw = false; 2589 adev->ip_blocks[i].status.valid = false; 2590 } 2591 2592 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2593 if (!adev->ip_blocks[i].status.late_initialized) 2594 continue; 2595 if (adev->ip_blocks[i].version->funcs->late_fini) 2596 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2597 
adev->ip_blocks[i].status.late_initialized = false; 2598 } 2599 2600 amdgpu_ras_fini(adev); 2601 2602 if (amdgpu_sriov_vf(adev)) 2603 if (amdgpu_virt_release_full_gpu(adev, false)) 2604 DRM_ERROR("failed to release exclusive mode on fini\n"); 2605 2606 return 0; 2607 } 2608 2609 /** 2610 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2611 * 2612 * @work: work_struct. 2613 */ 2614 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2615 { 2616 struct amdgpu_device *adev = 2617 container_of(work, struct amdgpu_device, delayed_init_work.work); 2618 int r; 2619 2620 r = amdgpu_ib_ring_tests(adev); 2621 if (r) 2622 DRM_ERROR("ib ring test failed (%d).\n", r); 2623 } 2624 2625 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2626 { 2627 struct amdgpu_device *adev = 2628 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2629 2630 mutex_lock(&adev->gfx.gfx_off_mutex); 2631 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { 2632 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2633 adev->gfx.gfx_off_state = true; 2634 } 2635 mutex_unlock(&adev->gfx.gfx_off_mutex); 2636 } 2637 2638 /** 2639 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2640 * 2641 * @adev: amdgpu_device pointer 2642 * 2643 * Main suspend function for hardware IPs. The list of all the hardware 2644 * IPs that make up the asic is walked, clockgating is disabled and the 2645 * suspend callbacks are run. suspend puts the hardware and software state 2646 * in each IP into a state suitable for suspend. 2647 * Returns 0 on success, negative error code on failure. 2648 */ 2649 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2650 { 2651 int i, r; 2652 2653 if (!amdgpu_acpi_is_s0ix_supported() || amdgpu_in_reset(adev)) { 2654 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2655 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2656 } 2657 2658 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2659 if (!adev->ip_blocks[i].status.valid) 2660 continue; 2661 2662 /* displays are handled separately */ 2663 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2664 continue; 2665 2666 /* XXX handle errors */ 2667 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2668 /* XXX handle errors */ 2669 if (r) { 2670 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2671 adev->ip_blocks[i].version->funcs->name, r); 2672 return r; 2673 } 2674 2675 adev->ip_blocks[i].status.hw = false; 2676 } 2677 2678 return 0; 2679 } 2680 2681 /** 2682 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2683 * 2684 * @adev: amdgpu_device pointer 2685 * 2686 * Main suspend function for hardware IPs. The list of all the hardware 2687 * IPs that make up the asic is walked, clockgating is disabled and the 2688 * suspend callbacks are run. suspend puts the hardware and software state 2689 * in each IP into a state suitable for suspend. 2690 * Returns 0 on success, negative error code on failure. 
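 *
 * Together with phase 1 this gives the overall flow used by
 * amdgpu_device_ip_suspend() below (sketch only):
 *
 *   amdgpu_device_ip_suspend_phase1(adev);   - display (DCE) blocks only
 *   amdgpu_device_ip_suspend_phase2(adev);   - all remaining blocks, walked
 *                                              in reverse init order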
2691 */ 2692 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2693 { 2694 int i, r; 2695 2696 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2697 if (!adev->ip_blocks[i].status.valid) 2698 continue; 2699 /* displays are handled in phase1 */ 2700 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2701 continue; 2702 /* PSP lost connection when err_event_athub occurs */ 2703 if (amdgpu_ras_intr_triggered() && 2704 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2705 adev->ip_blocks[i].status.hw = false; 2706 continue; 2707 } 2708 /* XXX handle errors */ 2709 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2710 /* XXX handle errors */ 2711 if (r) { 2712 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2713 adev->ip_blocks[i].version->funcs->name, r); 2714 } 2715 adev->ip_blocks[i].status.hw = false; 2716 /* handle putting the SMC in the appropriate state */ 2717 if(!amdgpu_sriov_vf(adev)){ 2718 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2719 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 2720 if (r) { 2721 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 2722 adev->mp1_state, r); 2723 return r; 2724 } 2725 } 2726 } 2727 adev->ip_blocks[i].status.hw = false; 2728 } 2729 2730 return 0; 2731 } 2732 2733 /** 2734 * amdgpu_device_ip_suspend - run suspend for hardware IPs 2735 * 2736 * @adev: amdgpu_device pointer 2737 * 2738 * Main suspend function for hardware IPs. The list of all the hardware 2739 * IPs that make up the asic is walked, clockgating is disabled and the 2740 * suspend callbacks are run. suspend puts the hardware and software state 2741 * in each IP into a state suitable for suspend. 2742 * Returns 0 on success, negative error code on failure. 2743 */ 2744 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 2745 { 2746 int r; 2747 2748 if (amdgpu_sriov_vf(adev)) 2749 amdgpu_virt_request_full_gpu(adev, false); 2750 2751 r = amdgpu_device_ip_suspend_phase1(adev); 2752 if (r) 2753 return r; 2754 r = amdgpu_device_ip_suspend_phase2(adev); 2755 2756 if (amdgpu_sriov_vf(adev)) 2757 amdgpu_virt_release_full_gpu(adev, false); 2758 2759 return r; 2760 } 2761 2762 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 2763 { 2764 int i, r; 2765 2766 static enum amd_ip_block_type ip_order[] = { 2767 AMD_IP_BLOCK_TYPE_GMC, 2768 AMD_IP_BLOCK_TYPE_COMMON, 2769 AMD_IP_BLOCK_TYPE_PSP, 2770 AMD_IP_BLOCK_TYPE_IH, 2771 }; 2772 2773 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2774 int j; 2775 struct amdgpu_ip_block *block; 2776 2777 block = &adev->ip_blocks[i]; 2778 block->status.hw = false; 2779 2780 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 2781 2782 if (block->version->type != ip_order[j] || 2783 !block->status.valid) 2784 continue; 2785 2786 r = block->version->funcs->hw_init(adev); 2787 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2788 if (r) 2789 return r; 2790 block->status.hw = true; 2791 } 2792 } 2793 2794 return 0; 2795 } 2796 2797 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 2798 { 2799 int i, r; 2800 2801 static enum amd_ip_block_type ip_order[] = { 2802 AMD_IP_BLOCK_TYPE_SMC, 2803 AMD_IP_BLOCK_TYPE_DCE, 2804 AMD_IP_BLOCK_TYPE_GFX, 2805 AMD_IP_BLOCK_TYPE_SDMA, 2806 AMD_IP_BLOCK_TYPE_UVD, 2807 AMD_IP_BLOCK_TYPE_VCE, 2808 AMD_IP_BLOCK_TYPE_VCN 2809 }; 2810 2811 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2812 int j; 2813 struct amdgpu_ip_block *block; 2814 2815 for (j = 0; j < adev->num_ip_blocks; j++) { 
2816 block = &adev->ip_blocks[j]; 2817 2818 if (block->version->type != ip_order[i] || 2819 !block->status.valid || 2820 block->status.hw) 2821 continue; 2822 2823 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 2824 r = block->version->funcs->resume(adev); 2825 else 2826 r = block->version->funcs->hw_init(adev); 2827 2828 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2829 if (r) 2830 return r; 2831 block->status.hw = true; 2832 } 2833 } 2834 2835 return 0; 2836 } 2837 2838 /** 2839 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 2840 * 2841 * @adev: amdgpu_device pointer 2842 * 2843 * First resume function for hardware IPs. The list of all the hardware 2844 * IPs that make up the asic is walked and the resume callbacks are run for 2845 * COMMON, GMC, and IH. resume puts the hardware into a functional state 2846 * after a suspend and updates the software state as necessary. This 2847 * function is also used for restoring the GPU after a GPU reset. 2848 * Returns 0 on success, negative error code on failure. 2849 */ 2850 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 2851 { 2852 int i, r; 2853 2854 for (i = 0; i < adev->num_ip_blocks; i++) { 2855 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2856 continue; 2857 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2858 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2859 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2860 2861 r = adev->ip_blocks[i].version->funcs->resume(adev); 2862 if (r) { 2863 DRM_ERROR("resume of IP block <%s> failed %d\n", 2864 adev->ip_blocks[i].version->funcs->name, r); 2865 return r; 2866 } 2867 adev->ip_blocks[i].status.hw = true; 2868 } 2869 } 2870 2871 return 0; 2872 } 2873 2874 /** 2875 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 2876 * 2877 * @adev: amdgpu_device pointer 2878 * 2879 * First resume function for hardware IPs. The list of all the hardware 2880 * IPs that make up the asic is walked and the resume callbacks are run for 2881 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 2882 * functional state after a suspend and updates the software state as 2883 * necessary. This function is also used for restoring the GPU after a GPU 2884 * reset. 2885 * Returns 0 on success, negative error code on failure. 2886 */ 2887 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 2888 { 2889 int i, r; 2890 2891 for (i = 0; i < adev->num_ip_blocks; i++) { 2892 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2893 continue; 2894 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2895 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2896 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 2897 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 2898 continue; 2899 r = adev->ip_blocks[i].version->funcs->resume(adev); 2900 if (r) { 2901 DRM_ERROR("resume of IP block <%s> failed %d\n", 2902 adev->ip_blocks[i].version->funcs->name, r); 2903 return r; 2904 } 2905 adev->ip_blocks[i].status.hw = true; 2906 } 2907 2908 return 0; 2909 } 2910 2911 /** 2912 * amdgpu_device_ip_resume - run resume for hardware IPs 2913 * 2914 * @adev: amdgpu_device pointer 2915 * 2916 * Main resume function for hardware IPs. 
The hardware IPs 2917 * are split into two resume functions because they 2918 * are also used in recovering from a GPU reset and some additional 2919 * steps need to be taken between them. In this case (S3/S4) they are 2920 * run sequentially. 2921 * Returns 0 on success, negative error code on failure. 2922 */ 2923 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 2924 { 2925 int r; 2926 2927 r = amdgpu_device_ip_resume_phase1(adev); 2928 if (r) 2929 return r; 2930 2931 r = amdgpu_device_fw_loading(adev); 2932 if (r) 2933 return r; 2934 2935 r = amdgpu_device_ip_resume_phase2(adev); 2936 2937 return r; 2938 } 2939 2940 /** 2941 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 2942 * 2943 * @adev: amdgpu_device pointer 2944 * 2945 * Query the VBIOS data tables to determine if the board supports SR-IOV. 2946 */ 2947 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 2948 { 2949 if (amdgpu_sriov_vf(adev)) { 2950 if (adev->is_atom_fw) { 2951 if (amdgpu_atomfirmware_gpu_supports_virtualization(adev)) 2952 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 2953 } else { 2954 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 2955 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 2956 } 2957 2958 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 2959 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 2960 } 2961 } 2962 2963 /** 2964 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 2965 * 2966 * @asic_type: AMD asic type 2967 * 2968 * Check if there is DC (new modesetting infrastructure) support for an asic. 2969 * Returns true if DC has support, false if not. 2970 */ 2971 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 2972 { 2973 switch (asic_type) { 2974 #if defined(CONFIG_DRM_AMD_DC) 2975 #if defined(CONFIG_DRM_AMD_DC_SI) 2976 case CHIP_TAHITI: 2977 case CHIP_PITCAIRN: 2978 case CHIP_VERDE: 2979 case CHIP_OLAND: 2980 #endif 2981 case CHIP_BONAIRE: 2982 case CHIP_KAVERI: 2983 case CHIP_KABINI: 2984 case CHIP_MULLINS: 2985 /* 2986 * We have systems in the wild with these ASICs that require 2987 * LVDS and VGA support which is not supported with DC. 2988 * 2989 * Fall back to the non-DC driver here by default so as not to 2990 * cause regressions.
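 *
 * In other words, for these chips the amdgpu.dc module parameter acts as
 * an explicit opt-in (sketch of the check below; the default is "auto"):
 *
 *   amdgpu.dc unset or 0   ->  legacy display path (return false)
 *   amdgpu.dc=1            ->  use DC despite the caveat above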
2991 */ 2992 return amdgpu_dc > 0; 2993 case CHIP_HAWAII: 2994 case CHIP_CARRIZO: 2995 case CHIP_STONEY: 2996 case CHIP_POLARIS10: 2997 case CHIP_POLARIS11: 2998 case CHIP_POLARIS12: 2999 case CHIP_VEGAM: 3000 case CHIP_TONGA: 3001 case CHIP_FIJI: 3002 case CHIP_VEGA10: 3003 case CHIP_VEGA12: 3004 case CHIP_VEGA20: 3005 #if defined(CONFIG_DRM_AMD_DC_DCN) 3006 case CHIP_RAVEN: 3007 case CHIP_NAVI10: 3008 case CHIP_NAVI14: 3009 case CHIP_NAVI12: 3010 case CHIP_RENOIR: 3011 case CHIP_SIENNA_CICHLID: 3012 case CHIP_NAVY_FLOUNDER: 3013 case CHIP_DIMGREY_CAVEFISH: 3014 case CHIP_VANGOGH: 3015 #endif 3016 return amdgpu_dc != 0; 3017 #endif 3018 default: 3019 if (amdgpu_dc > 0) 3020 DRM_INFO("Display Core has been requested via kernel parameter " 3021 "but isn't supported by ASIC, ignoring\n"); 3022 return false; 3023 } 3024 } 3025 3026 /** 3027 * amdgpu_device_has_dc_support - check if dc is supported 3028 * 3029 * @adev: amdgpu_device pointer 3030 * 3031 * Returns true for supported, false for not supported 3032 */ 3033 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3034 { 3035 if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display) 3036 return false; 3037 3038 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3039 } 3040 3041 3042 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3043 { 3044 struct amdgpu_device *adev = 3045 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3046 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3047 3048 /* It's a bug to not have a hive within this function */ 3049 if (WARN_ON(!hive)) 3050 return; 3051 3052 /* 3053 * Use task barrier to synchronize all xgmi reset works across the 3054 * hive. task_barrier_enter and task_barrier_exit will block 3055 * until all the threads running the xgmi reset works reach 3056 * those points. task_barrier_full will do both blocks. 3057 */ 3058 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3059 3060 task_barrier_enter(&hive->tb); 3061 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3062 3063 if (adev->asic_reset_res) 3064 goto fail; 3065 3066 task_barrier_exit(&hive->tb); 3067 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3068 3069 if (adev->asic_reset_res) 3070 goto fail; 3071 3072 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count) 3073 adev->mmhub.funcs->reset_ras_error_count(adev); 3074 } else { 3075 3076 task_barrier_full(&hive->tb); 3077 adev->asic_reset_res = amdgpu_asic_reset(adev); 3078 } 3079 3080 fail: 3081 if (adev->asic_reset_res) 3082 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3083 adev->asic_reset_res, adev_to_drm(adev)->unique); 3084 amdgpu_put_xgmi_hive(hive); 3085 } 3086 3087 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3088 { 3089 char *input = amdgpu_lockup_timeout; 3090 char *timeout_setting = NULL; 3091 int index = 0; 3092 long timeout; 3093 int ret = 0; 3094 3095 /* 3096 * By default timeout for non compute jobs is 10000. 3097 * And there is no timeout enforced on compute jobs. 3098 * In SR-IOV or passthrough mode, timeout for compute 3099 * jobs are 60000 by default. 
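 *
 * The comma-separated amdgpu.lockup_timeout parameter parsed below assigns
 * the values (in ms) positionally, for example:
 *
 *   amdgpu.lockup_timeout=10000,60000,10000,10000
 *                          gfx  compute  sdma  video
 *
 * A single value applies to all non-compute queues, 0 keeps the default
 * above, and a negative value disables the timeout (MAX_SCHEDULE_TIMEOUT).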
3100 */ 3101 adev->gfx_timeout = msecs_to_jiffies(10000); 3102 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3103 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3104 adev->compute_timeout = msecs_to_jiffies(60000); 3105 else 3106 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; 3107 3108 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3109 while ((timeout_setting = strsep(&input, ",")) && 3110 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3111 ret = kstrtol(timeout_setting, 0, &timeout); 3112 if (ret) 3113 return ret; 3114 3115 if (timeout == 0) { 3116 index++; 3117 continue; 3118 } else if (timeout < 0) { 3119 timeout = MAX_SCHEDULE_TIMEOUT; 3120 } else { 3121 timeout = msecs_to_jiffies(timeout); 3122 } 3123 3124 switch (index++) { 3125 case 0: 3126 adev->gfx_timeout = timeout; 3127 break; 3128 case 1: 3129 adev->compute_timeout = timeout; 3130 break; 3131 case 2: 3132 adev->sdma_timeout = timeout; 3133 break; 3134 case 3: 3135 adev->video_timeout = timeout; 3136 break; 3137 default: 3138 break; 3139 } 3140 } 3141 /* 3142 * There is only one value specified and 3143 * it should apply to all non-compute jobs. 3144 */ 3145 if (index == 1) { 3146 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3147 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3148 adev->compute_timeout = adev->gfx_timeout; 3149 } 3150 } 3151 3152 return ret; 3153 } 3154 3155 static const struct attribute *amdgpu_dev_attributes[] = { 3156 &dev_attr_product_name.attr, 3157 &dev_attr_product_number.attr, 3158 &dev_attr_serial_number.attr, 3159 &dev_attr_pcie_replay_count.attr, 3160 NULL 3161 }; 3162 3163 3164 /** 3165 * amdgpu_device_init - initialize the driver 3166 * 3167 * @adev: amdgpu_device pointer 3168 * @flags: driver flags 3169 * 3170 * Initializes the driver info and hw (all asics). 3171 * Returns 0 for success or an error on failure. 3172 * Called at driver startup. 
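 *
 * Hedged sketch of the expected call site (the real caller lives in the
 * KMS load path, e.g. amdgpu_driver_load_kms(), not in this file; adev and
 * flags come from the PCI probe path):
 *
 *   r = amdgpu_device_init(adev, flags);
 *   if (r)
 *           return r;   - fail the probe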
3173 */ 3174 int amdgpu_device_init(struct amdgpu_device *adev, 3175 uint32_t flags) 3176 { 3177 struct drm_device *ddev = adev_to_drm(adev); 3178 struct pci_dev *pdev = adev->pdev; 3179 int r, i; 3180 bool boco = false; 3181 u32 max_MBps; 3182 3183 adev->shutdown = false; 3184 adev->flags = flags; 3185 3186 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3187 adev->asic_type = amdgpu_force_asic_type; 3188 else 3189 adev->asic_type = flags & AMD_ASIC_MASK; 3190 3191 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3192 if (amdgpu_emu_mode == 1) 3193 adev->usec_timeout *= 10; 3194 adev->gmc.gart_size = 512 * 1024 * 1024; 3195 adev->accel_working = false; 3196 adev->num_rings = 0; 3197 adev->mman.buffer_funcs = NULL; 3198 adev->mman.buffer_funcs_ring = NULL; 3199 adev->vm_manager.vm_pte_funcs = NULL; 3200 adev->vm_manager.vm_pte_num_scheds = 0; 3201 adev->gmc.gmc_funcs = NULL; 3202 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3203 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3204 3205 adev->smc_rreg = &amdgpu_invalid_rreg; 3206 adev->smc_wreg = &amdgpu_invalid_wreg; 3207 adev->pcie_rreg = &amdgpu_invalid_rreg; 3208 adev->pcie_wreg = &amdgpu_invalid_wreg; 3209 adev->pciep_rreg = &amdgpu_invalid_rreg; 3210 adev->pciep_wreg = &amdgpu_invalid_wreg; 3211 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3212 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3213 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3214 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3215 adev->didt_rreg = &amdgpu_invalid_rreg; 3216 adev->didt_wreg = &amdgpu_invalid_wreg; 3217 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3218 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3219 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3220 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3221 3222 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3223 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3224 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3225 3226 /* mutex initialization are all done here so we 3227 * can recall function without having locking issues */ 3228 atomic_set(&adev->irq.ih.lock, 0); 3229 mutex_init(&adev->firmware.mutex); 3230 mutex_init(&adev->pm.mutex); 3231 mutex_init(&adev->gfx.gpu_clock_mutex); 3232 mutex_init(&adev->srbm_mutex); 3233 mutex_init(&adev->gfx.pipe_reserve_mutex); 3234 mutex_init(&adev->gfx.gfx_off_mutex); 3235 mutex_init(&adev->grbm_idx_mutex); 3236 mutex_init(&adev->mn_lock); 3237 mutex_init(&adev->virt.vf_errors.lock); 3238 hash_init(adev->mn_hash); 3239 atomic_set(&adev->in_gpu_reset, 0); 3240 init_rwsem(&adev->reset_sem); 3241 mutex_init(&adev->psp.mutex); 3242 mutex_init(&adev->notifier_lock); 3243 3244 r = amdgpu_device_check_arguments(adev); 3245 if (r) 3246 return r; 3247 3248 spin_lock_init(&adev->mmio_idx_lock); 3249 spin_lock_init(&adev->smc_idx_lock); 3250 spin_lock_init(&adev->pcie_idx_lock); 3251 spin_lock_init(&adev->uvd_ctx_idx_lock); 3252 spin_lock_init(&adev->didt_idx_lock); 3253 spin_lock_init(&adev->gc_cac_idx_lock); 3254 spin_lock_init(&adev->se_cac_idx_lock); 3255 spin_lock_init(&adev->audio_endpt_idx_lock); 3256 spin_lock_init(&adev->mm_stats.lock); 3257 3258 INIT_LIST_HEAD(&adev->shadow_list); 3259 mutex_init(&adev->shadow_list_lock); 3260 3261 INIT_DELAYED_WORK(&adev->delayed_init_work, 3262 amdgpu_device_delayed_init_work_handler); 3263 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3264 amdgpu_device_delay_enable_gfx_off); 3265 3266 
INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3267 3268 adev->gfx.gfx_off_req_count = 1; 3269 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3270 3271 atomic_set(&adev->throttling_logging_enabled, 1); 3272 /* 3273 * If throttling continues, logging will be performed every minute 3274 * to avoid log flooding. "-1" is subtracted since the thermal 3275 * throttling interrupt comes every second. Thus, the total logging 3276 * interval is 59 seconds(retelimited printk interval) + 1(waiting 3277 * for throttling interrupt) = 60 seconds. 3278 */ 3279 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3280 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3281 3282 /* Registers mapping */ 3283 /* TODO: block userspace mapping of io register */ 3284 if (adev->asic_type >= CHIP_BONAIRE) { 3285 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3286 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3287 } else { 3288 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3289 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3290 } 3291 3292 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3293 if (adev->rmmio == NULL) { 3294 return -ENOMEM; 3295 } 3296 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3297 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3298 3299 /* io port mapping */ 3300 for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { 3301 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) { 3302 adev->rio_mem_size = pci_resource_len(adev->pdev, i); 3303 adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size); 3304 break; 3305 } 3306 } 3307 if (adev->rio_mem == NULL) 3308 DRM_INFO("PCI I/O BAR is not found.\n"); 3309 3310 /* enable PCIE atomic ops */ 3311 r = pci_enable_atomic_ops_to_root(adev->pdev, 3312 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3313 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3314 if (r) { 3315 adev->have_atomics_support = false; 3316 DRM_INFO("PCIE atomic ops is not supported\n"); 3317 } else { 3318 adev->have_atomics_support = true; 3319 } 3320 3321 amdgpu_device_get_pcie_info(adev); 3322 3323 if (amdgpu_mcbp) 3324 DRM_INFO("MCBP is enabled\n"); 3325 3326 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) 3327 adev->enable_mes = true; 3328 3329 /* detect hw virtualization here */ 3330 amdgpu_detect_virtualization(adev); 3331 3332 r = amdgpu_device_get_job_timeout_settings(adev); 3333 if (r) { 3334 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3335 goto failed_unmap; 3336 } 3337 3338 /* early init functions */ 3339 r = amdgpu_device_ip_early_init(adev); 3340 if (r) 3341 goto failed_unmap; 3342 3343 /* doorbell bar mapping and doorbell index init*/ 3344 amdgpu_device_doorbell_init(adev); 3345 3346 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3347 /* this will fail for cards that aren't VGA class devices, just 3348 * ignore it */ 3349 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3350 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode); 3351 3352 if (amdgpu_device_supports_boco(ddev)) 3353 boco = true; 3354 if (amdgpu_has_atpx() && 3355 (amdgpu_is_atpx_hybrid() || 3356 amdgpu_has_atpx_dgpu_power_cntl()) && 3357 !pci_is_thunderbolt_attached(adev->pdev)) 3358 vga_switcheroo_register_client(adev->pdev, 3359 &amdgpu_switcheroo_ops, boco); 3360 if (boco) 3361 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3362 3363 if (amdgpu_emu_mode == 1) { 3364 /* post the asic on emulation 
mode */ 3365 emu_soc_asic_init(adev); 3366 goto fence_driver_init; 3367 } 3368 3369 /* detect if we are with an SRIOV vbios */ 3370 amdgpu_device_detect_sriov_bios(adev); 3371 3372 /* check if we need to reset the asic 3373 * E.g., driver was not cleanly unloaded previously, etc. 3374 */ 3375 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3376 r = amdgpu_asic_reset(adev); 3377 if (r) { 3378 dev_err(adev->dev, "asic reset on init failed\n"); 3379 goto failed; 3380 } 3381 } 3382 3383 pci_enable_pcie_error_reporting(adev->ddev.pdev); 3384 3385 /* Post card if necessary */ 3386 if (amdgpu_device_need_post(adev)) { 3387 if (!adev->bios) { 3388 dev_err(adev->dev, "no vBIOS found\n"); 3389 r = -EINVAL; 3390 goto failed; 3391 } 3392 DRM_INFO("GPU posting now...\n"); 3393 r = amdgpu_device_asic_init(adev); 3394 if (r) { 3395 dev_err(adev->dev, "gpu post error!\n"); 3396 goto failed; 3397 } 3398 } 3399 3400 if (adev->is_atom_fw) { 3401 /* Initialize clocks */ 3402 r = amdgpu_atomfirmware_get_clock_info(adev); 3403 if (r) { 3404 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3405 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3406 goto failed; 3407 } 3408 } else { 3409 /* Initialize clocks */ 3410 r = amdgpu_atombios_get_clock_info(adev); 3411 if (r) { 3412 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3413 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3414 goto failed; 3415 } 3416 /* init i2c buses */ 3417 if (!amdgpu_device_has_dc_support(adev)) 3418 amdgpu_atombios_i2c_init(adev); 3419 } 3420 3421 fence_driver_init: 3422 /* Fence driver */ 3423 r = amdgpu_fence_driver_init(adev); 3424 if (r) { 3425 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n"); 3426 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3427 goto failed; 3428 } 3429 3430 /* init the mode config */ 3431 drm_mode_config_init(adev_to_drm(adev)); 3432 3433 r = amdgpu_device_ip_init(adev); 3434 if (r) { 3435 /* failed in exclusive mode due to timeout */ 3436 if (amdgpu_sriov_vf(adev) && 3437 !amdgpu_sriov_runtime(adev) && 3438 amdgpu_virt_mmio_blocked(adev) && 3439 !amdgpu_virt_wait_reset(adev)) { 3440 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3441 /* Don't send request since VF is inactive. */ 3442 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3443 adev->virt.ops = NULL; 3444 r = -EAGAIN; 3445 goto failed; 3446 } 3447 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3448 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3449 goto failed; 3450 } 3451 3452 dev_info(adev->dev, 3453 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3454 adev->gfx.config.max_shader_engines, 3455 adev->gfx.config.max_sh_per_se, 3456 adev->gfx.config.max_cu_per_sh, 3457 adev->gfx.cu_info.number); 3458 3459 adev->accel_working = true; 3460 3461 amdgpu_vm_check_compute_bug(adev); 3462 3463 /* Initialize the buffer migration limit. */ 3464 if (amdgpu_moverate >= 0) 3465 max_MBps = amdgpu_moverate; 3466 else 3467 max_MBps = 8; /* Allow 8 MB/s. */ 3468 /* Get a log2 for easy divisions. 
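 *
 * For example, the default max_MBps of 8 stores log2_max_MBps = 3, so the
 * buffer-migration throttling code can divide by the rate with a right
 * shift (x >> 3) instead of a division.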
*/ 3469 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3470 3471 amdgpu_fbdev_init(adev); 3472 3473 r = amdgpu_pm_sysfs_init(adev); 3474 if (r) { 3475 adev->pm_sysfs_en = false; 3476 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3477 } else 3478 adev->pm_sysfs_en = true; 3479 3480 r = amdgpu_ucode_sysfs_init(adev); 3481 if (r) { 3482 adev->ucode_sysfs_en = false; 3483 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3484 } else 3485 adev->ucode_sysfs_en = true; 3486 3487 if ((amdgpu_testing & 1)) { 3488 if (adev->accel_working) 3489 amdgpu_test_moves(adev); 3490 else 3491 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n"); 3492 } 3493 if (amdgpu_benchmarking) { 3494 if (adev->accel_working) 3495 amdgpu_benchmark(adev, amdgpu_benchmarking); 3496 else 3497 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); 3498 } 3499 3500 /* 3501 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3502 * Otherwise the mgpu fan boost feature will be skipped due to the 3503 * gpu instance is counted less. 3504 */ 3505 amdgpu_register_gpu_instance(adev); 3506 3507 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3508 * explicit gating rather than handling it automatically. 3509 */ 3510 r = amdgpu_device_ip_late_init(adev); 3511 if (r) { 3512 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3513 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3514 goto failed; 3515 } 3516 3517 /* must succeed. */ 3518 amdgpu_ras_resume(adev); 3519 3520 queue_delayed_work(system_wq, &adev->delayed_init_work, 3521 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3522 3523 if (amdgpu_sriov_vf(adev)) 3524 flush_delayed_work(&adev->delayed_init_work); 3525 3526 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3527 if (r) 3528 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3529 3530 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3531 r = amdgpu_pmu_init(adev); 3532 if (r) 3533 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3534 3535 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3536 if (amdgpu_device_cache_pci_state(adev->pdev)) 3537 pci_restore_state(pdev); 3538 3539 return 0; 3540 3541 failed: 3542 amdgpu_vf_error_trans_all(adev); 3543 if (boco) 3544 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3545 3546 failed_unmap: 3547 iounmap(adev->rmmio); 3548 adev->rmmio = NULL; 3549 3550 return r; 3551 } 3552 3553 /** 3554 * amdgpu_device_fini - tear down the driver 3555 * 3556 * @adev: amdgpu_device pointer 3557 * 3558 * Tear down the driver info (all asics). 3559 * Called at driver shutdown. 
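 *
 * Teardown below runs roughly in reverse of amdgpu_device_init():
 * interrupts are disabled first, then the display state, fence driver and
 * IP blocks are torn down, and the MMIO/doorbell mappings and sysfs
 * entries are released at the end.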
3560 */ 3561 void amdgpu_device_fini(struct amdgpu_device *adev) 3562 { 3563 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3564 flush_delayed_work(&adev->delayed_init_work); 3565 adev->shutdown = true; 3566 3567 kfree(adev->pci_state); 3568 3569 /* make sure IB test finished before entering exclusive mode 3570 * to avoid preemption on IB test 3571 * */ 3572 if (amdgpu_sriov_vf(adev)) { 3573 amdgpu_virt_request_full_gpu(adev, false); 3574 amdgpu_virt_fini_data_exchange(adev); 3575 } 3576 3577 /* disable all interrupts */ 3578 amdgpu_irq_disable_all(adev); 3579 if (adev->mode_info.mode_config_initialized){ 3580 if (!amdgpu_device_has_dc_support(adev)) 3581 drm_helper_force_disable_all(adev_to_drm(adev)); 3582 else 3583 drm_atomic_helper_shutdown(adev_to_drm(adev)); 3584 } 3585 amdgpu_fence_driver_fini(adev); 3586 if (adev->pm_sysfs_en) 3587 amdgpu_pm_sysfs_fini(adev); 3588 amdgpu_fbdev_fini(adev); 3589 amdgpu_device_ip_fini(adev); 3590 release_firmware(adev->firmware.gpu_info_fw); 3591 adev->firmware.gpu_info_fw = NULL; 3592 adev->accel_working = false; 3593 /* free i2c buses */ 3594 if (!amdgpu_device_has_dc_support(adev)) 3595 amdgpu_i2c_fini(adev); 3596 3597 if (amdgpu_emu_mode != 1) 3598 amdgpu_atombios_fini(adev); 3599 3600 kfree(adev->bios); 3601 adev->bios = NULL; 3602 if (amdgpu_has_atpx() && 3603 (amdgpu_is_atpx_hybrid() || 3604 amdgpu_has_atpx_dgpu_power_cntl()) && 3605 !pci_is_thunderbolt_attached(adev->pdev)) 3606 vga_switcheroo_unregister_client(adev->pdev); 3607 if (amdgpu_device_supports_boco(adev_to_drm(adev))) 3608 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3609 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3610 vga_client_register(adev->pdev, NULL, NULL, NULL); 3611 if (adev->rio_mem) 3612 pci_iounmap(adev->pdev, adev->rio_mem); 3613 adev->rio_mem = NULL; 3614 iounmap(adev->rmmio); 3615 adev->rmmio = NULL; 3616 amdgpu_device_doorbell_fini(adev); 3617 3618 if (adev->ucode_sysfs_en) 3619 amdgpu_ucode_sysfs_fini(adev); 3620 3621 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 3622 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3623 amdgpu_pmu_fini(adev); 3624 if (adev->mman.discovery_bin) 3625 amdgpu_discovery_fini(adev); 3626 } 3627 3628 3629 /* 3630 * Suspend & resume. 3631 */ 3632 /** 3633 * amdgpu_device_suspend - initiate device suspend 3634 * 3635 * @dev: drm dev pointer 3636 * @fbcon : notify the fbdev of suspend 3637 * 3638 * Puts the hw in the suspend state (all asics). 3639 * Returns 0 for success or an error on failure. 3640 * Called at driver suspend. 
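 *
 * Minimal sketch of how a system-sleep PM callback is expected to use
 * this (the real handlers live in amdgpu_drv.c; the name below is only
 * illustrative):
 *
 *   static int amdgpu_pmops_suspend(struct device *dev)
 *   {
 *           struct drm_device *drm_dev = dev_get_drvdata(dev);
 *
 *           return amdgpu_device_suspend(drm_dev, true);
 *   }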
3641 */ 3642 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 3643 { 3644 struct amdgpu_device *adev; 3645 struct drm_crtc *crtc; 3646 struct drm_connector *connector; 3647 struct drm_connector_list_iter iter; 3648 int r; 3649 3650 adev = drm_to_adev(dev); 3651 3652 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3653 return 0; 3654 3655 adev->in_suspend = true; 3656 drm_kms_helper_poll_disable(dev); 3657 3658 if (fbcon) 3659 amdgpu_fbdev_set_suspend(adev, 1); 3660 3661 cancel_delayed_work_sync(&adev->delayed_init_work); 3662 3663 if (!amdgpu_device_has_dc_support(adev)) { 3664 /* turn off display hw */ 3665 drm_modeset_lock_all(dev); 3666 drm_connector_list_iter_begin(dev, &iter); 3667 drm_for_each_connector_iter(connector, &iter) 3668 drm_helper_connector_dpms(connector, 3669 DRM_MODE_DPMS_OFF); 3670 drm_connector_list_iter_end(&iter); 3671 drm_modeset_unlock_all(dev); 3672 /* unpin the front buffers and cursors */ 3673 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3674 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3675 struct drm_framebuffer *fb = crtc->primary->fb; 3676 struct amdgpu_bo *robj; 3677 3678 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3679 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3680 r = amdgpu_bo_reserve(aobj, true); 3681 if (r == 0) { 3682 amdgpu_bo_unpin(aobj); 3683 amdgpu_bo_unreserve(aobj); 3684 } 3685 } 3686 3687 if (fb == NULL || fb->obj[0] == NULL) { 3688 continue; 3689 } 3690 robj = gem_to_amdgpu_bo(fb->obj[0]); 3691 /* don't unpin kernel fb objects */ 3692 if (!amdgpu_fbdev_robj_is_fb(adev, robj)) { 3693 r = amdgpu_bo_reserve(robj, true); 3694 if (r == 0) { 3695 amdgpu_bo_unpin(robj); 3696 amdgpu_bo_unreserve(robj); 3697 } 3698 } 3699 } 3700 } 3701 3702 amdgpu_ras_suspend(adev); 3703 3704 r = amdgpu_device_ip_suspend_phase1(adev); 3705 3706 amdgpu_amdkfd_suspend(adev, !fbcon); 3707 3708 /* evict vram memory */ 3709 amdgpu_bo_evict_vram(adev); 3710 3711 amdgpu_fence_driver_suspend(adev); 3712 3713 if (!amdgpu_acpi_is_s0ix_supported() || amdgpu_in_reset(adev)) 3714 r = amdgpu_device_ip_suspend_phase2(adev); 3715 else 3716 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry); 3717 /* evict remaining vram memory 3718 * This second call to evict vram is to evict the gart page table 3719 * using the CPU. 3720 */ 3721 amdgpu_bo_evict_vram(adev); 3722 3723 return 0; 3724 } 3725 3726 /** 3727 * amdgpu_device_resume - initiate device resume 3728 * 3729 * @dev: drm dev pointer 3730 * @fbcon : notify the fbdev of resume 3731 * 3732 * Bring the hw back to operating state (all asics). 3733 * Returns 0 for success or an error on failure. 3734 * Called at driver resume. 
3735 */ 3736 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 3737 { 3738 struct drm_connector *connector; 3739 struct drm_connector_list_iter iter; 3740 struct amdgpu_device *adev = drm_to_adev(dev); 3741 struct drm_crtc *crtc; 3742 int r = 0; 3743 3744 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3745 return 0; 3746 3747 if (amdgpu_acpi_is_s0ix_supported()) 3748 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D0Entry); 3749 3750 /* post card */ 3751 if (amdgpu_device_need_post(adev)) { 3752 r = amdgpu_device_asic_init(adev); 3753 if (r) 3754 dev_err(adev->dev, "amdgpu asic init failed\n"); 3755 } 3756 3757 r = amdgpu_device_ip_resume(adev); 3758 if (r) { 3759 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 3760 return r; 3761 } 3762 amdgpu_fence_driver_resume(adev); 3763 3764 3765 r = amdgpu_device_ip_late_init(adev); 3766 if (r) 3767 return r; 3768 3769 queue_delayed_work(system_wq, &adev->delayed_init_work, 3770 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3771 3772 if (!amdgpu_device_has_dc_support(adev)) { 3773 /* pin cursors */ 3774 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3775 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3776 3777 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3778 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3779 r = amdgpu_bo_reserve(aobj, true); 3780 if (r == 0) { 3781 r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM); 3782 if (r != 0) 3783 dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r); 3784 amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj); 3785 amdgpu_bo_unreserve(aobj); 3786 } 3787 } 3788 } 3789 } 3790 r = amdgpu_amdkfd_resume(adev, !fbcon); 3791 if (r) 3792 return r; 3793 3794 /* Make sure IB tests flushed */ 3795 flush_delayed_work(&adev->delayed_init_work); 3796 3797 /* blat the mode back in */ 3798 if (fbcon) { 3799 if (!amdgpu_device_has_dc_support(adev)) { 3800 /* pre DCE11 */ 3801 drm_helper_resume_force_mode(dev); 3802 3803 /* turn on display hw */ 3804 drm_modeset_lock_all(dev); 3805 3806 drm_connector_list_iter_begin(dev, &iter); 3807 drm_for_each_connector_iter(connector, &iter) 3808 drm_helper_connector_dpms(connector, 3809 DRM_MODE_DPMS_ON); 3810 drm_connector_list_iter_end(&iter); 3811 3812 drm_modeset_unlock_all(dev); 3813 } 3814 amdgpu_fbdev_set_suspend(adev, 0); 3815 } 3816 3817 drm_kms_helper_poll_enable(dev); 3818 3819 amdgpu_ras_resume(adev); 3820 3821 /* 3822 * Most of the connector probing functions try to acquire runtime pm 3823 * refs to ensure that the GPU is powered on when connector polling is 3824 * performed. Since we're calling this from a runtime PM callback, 3825 * trying to acquire rpm refs will cause us to deadlock. 3826 * 3827 * Since we're guaranteed to be holding the rpm lock, it's safe to 3828 * temporarily disable the rpm helpers so this doesn't deadlock us. 3829 */ 3830 #ifdef CONFIG_PM 3831 dev->dev->power.disable_depth++; 3832 #endif 3833 if (!amdgpu_device_has_dc_support(adev)) 3834 drm_helper_hpd_irq_event(dev); 3835 else 3836 drm_kms_helper_hotplug_event(dev); 3837 #ifdef CONFIG_PM 3838 dev->dev->power.disable_depth--; 3839 #endif 3840 adev->in_suspend = false; 3841 3842 return 0; 3843 } 3844 3845 /** 3846 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 3847 * 3848 * @adev: amdgpu_device pointer 3849 * 3850 * The list of all the hardware IPs that make up the asic is walked and 3851 * the check_soft_reset callbacks are run. check_soft_reset determines 3852 * if the asic is still hung or not. 
3853 * Returns true if any of the IPs are still in a hung state, false if not. 3854 */ 3855 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 3856 { 3857 int i; 3858 bool asic_hang = false; 3859 3860 if (amdgpu_sriov_vf(adev)) 3861 return true; 3862 3863 if (amdgpu_asic_need_full_reset(adev)) 3864 return true; 3865 3866 for (i = 0; i < adev->num_ip_blocks; i++) { 3867 if (!adev->ip_blocks[i].status.valid) 3868 continue; 3869 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 3870 adev->ip_blocks[i].status.hang = 3871 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 3872 if (adev->ip_blocks[i].status.hang) { 3873 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 3874 asic_hang = true; 3875 } 3876 } 3877 return asic_hang; 3878 } 3879 3880 /** 3881 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 3882 * 3883 * @adev: amdgpu_device pointer 3884 * 3885 * The list of all the hardware IPs that make up the asic is walked and the 3886 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 3887 * handles any IP specific hardware or software state changes that are 3888 * necessary for a soft reset to succeed. 3889 * Returns 0 on success, negative error code on failure. 3890 */ 3891 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 3892 { 3893 int i, r = 0; 3894 3895 for (i = 0; i < adev->num_ip_blocks; i++) { 3896 if (!adev->ip_blocks[i].status.valid) 3897 continue; 3898 if (adev->ip_blocks[i].status.hang && 3899 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 3900 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 3901 if (r) 3902 return r; 3903 } 3904 } 3905 3906 return 0; 3907 } 3908 3909 /** 3910 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 3911 * 3912 * @adev: amdgpu_device pointer 3913 * 3914 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 3915 * reset is necessary to recover. 3916 * Returns true if a full asic reset is required, false if not. 3917 */ 3918 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 3919 { 3920 int i; 3921 3922 if (amdgpu_asic_need_full_reset(adev)) 3923 return true; 3924 3925 for (i = 0; i < adev->num_ip_blocks; i++) { 3926 if (!adev->ip_blocks[i].status.valid) 3927 continue; 3928 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 3929 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 3930 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 3931 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 3932 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3933 if (adev->ip_blocks[i].status.hang) { 3934 dev_info(adev->dev, "Some block need full reset!\n"); 3935 return true; 3936 } 3937 } 3938 } 3939 return false; 3940 } 3941 3942 /** 3943 * amdgpu_device_ip_soft_reset - do a soft reset 3944 * 3945 * @adev: amdgpu_device pointer 3946 * 3947 * The list of all the hardware IPs that make up the asic is walked and the 3948 * soft_reset callbacks are run if the block is hung. soft_reset handles any 3949 * IP specific hardware or software state changes that are necessary to soft 3950 * reset the IP. 3951 * Returns 0 on success, negative error code on failure. 
3952 */ 3953 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 3954 { 3955 int i, r = 0; 3956 3957 for (i = 0; i < adev->num_ip_blocks; i++) { 3958 if (!adev->ip_blocks[i].status.valid) 3959 continue; 3960 if (adev->ip_blocks[i].status.hang && 3961 adev->ip_blocks[i].version->funcs->soft_reset) { 3962 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 3963 if (r) 3964 return r; 3965 } 3966 } 3967 3968 return 0; 3969 } 3970 3971 /** 3972 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 3973 * 3974 * @adev: amdgpu_device pointer 3975 * 3976 * The list of all the hardware IPs that make up the asic is walked and the 3977 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 3978 * handles any IP specific hardware or software state changes that are 3979 * necessary after the IP has been soft reset. 3980 * Returns 0 on success, negative error code on failure. 3981 */ 3982 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 3983 { 3984 int i, r = 0; 3985 3986 for (i = 0; i < adev->num_ip_blocks; i++) { 3987 if (!adev->ip_blocks[i].status.valid) 3988 continue; 3989 if (adev->ip_blocks[i].status.hang && 3990 adev->ip_blocks[i].version->funcs->post_soft_reset) 3991 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 3992 if (r) 3993 return r; 3994 } 3995 3996 return 0; 3997 } 3998 3999 /** 4000 * amdgpu_device_recover_vram - Recover some VRAM contents 4001 * 4002 * @adev: amdgpu_device pointer 4003 * 4004 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4005 * restore things like GPUVM page tables after a GPU reset where 4006 * the contents of VRAM might be lost. 4007 * 4008 * Returns: 4009 * 0 on success, negative error code on failure. 4010 */ 4011 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4012 { 4013 struct dma_fence *fence = NULL, *next = NULL; 4014 struct amdgpu_bo *shadow; 4015 long r = 1, tmo; 4016 4017 if (amdgpu_sriov_runtime(adev)) 4018 tmo = msecs_to_jiffies(8000); 4019 else 4020 tmo = msecs_to_jiffies(100); 4021 4022 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4023 mutex_lock(&adev->shadow_list_lock); 4024 list_for_each_entry(shadow, &adev->shadow_list, shadow_list) { 4025 4026 /* No need to recover an evicted BO */ 4027 if (shadow->tbo.mem.mem_type != TTM_PL_TT || 4028 shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET || 4029 shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM) 4030 continue; 4031 4032 r = amdgpu_bo_restore_shadow(shadow, &next); 4033 if (r) 4034 break; 4035 4036 if (fence) { 4037 tmo = dma_fence_wait_timeout(fence, false, tmo); 4038 dma_fence_put(fence); 4039 fence = next; 4040 if (tmo == 0) { 4041 r = -ETIMEDOUT; 4042 break; 4043 } else if (tmo < 0) { 4044 r = tmo; 4045 break; 4046 } 4047 } else { 4048 fence = next; 4049 } 4050 } 4051 mutex_unlock(&adev->shadow_list_lock); 4052 4053 if (fence) 4054 tmo = dma_fence_wait_timeout(fence, false, tmo); 4055 dma_fence_put(fence); 4056 4057 if (r < 0 || tmo <= 0) { 4058 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4059 return -EIO; 4060 } 4061 4062 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4063 return 0; 4064 } 4065 4066 4067 /** 4068 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4069 * 4070 * @adev: amdgpu_device pointer 4071 * @from_hypervisor: request from hypervisor 4072 * 4073 * do VF FLR and reinitialize Asic 4074 * return 0 means succeeded otherwise failed 4075 */ 4076 static int 
amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4077 bool from_hypervisor) 4078 { 4079 int r; 4080 4081 if (from_hypervisor) 4082 r = amdgpu_virt_request_full_gpu(adev, true); 4083 else 4084 r = amdgpu_virt_reset_gpu(adev); 4085 if (r) 4086 return r; 4087 4088 amdgpu_amdkfd_pre_reset(adev); 4089 4090 /* Resume IP prior to SMC */ 4091 r = amdgpu_device_ip_reinit_early_sriov(adev); 4092 if (r) 4093 goto error; 4094 4095 amdgpu_virt_init_data_exchange(adev); 4096 /* we need recover gart prior to run SMC/CP/SDMA resume */ 4097 amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT)); 4098 4099 r = amdgpu_device_fw_loading(adev); 4100 if (r) 4101 return r; 4102 4103 /* now we are okay to resume SMC/CP/SDMA */ 4104 r = amdgpu_device_ip_reinit_late_sriov(adev); 4105 if (r) 4106 goto error; 4107 4108 amdgpu_irq_gpu_reset_resume_helper(adev); 4109 r = amdgpu_ib_ring_tests(adev); 4110 amdgpu_amdkfd_post_reset(adev); 4111 4112 error: 4113 amdgpu_virt_release_full_gpu(adev, true); 4114 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4115 amdgpu_inc_vram_lost(adev); 4116 r = amdgpu_device_recover_vram(adev); 4117 } 4118 4119 return r; 4120 } 4121 4122 /** 4123 * amdgpu_device_has_job_running - check if there is any job in mirror list 4124 * 4125 * @adev: amdgpu_device pointer 4126 * 4127 * check if there is any job in mirror list 4128 */ 4129 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4130 { 4131 int i; 4132 struct drm_sched_job *job; 4133 4134 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4135 struct amdgpu_ring *ring = adev->rings[i]; 4136 4137 if (!ring || !ring->sched.thread) 4138 continue; 4139 4140 spin_lock(&ring->sched.job_list_lock); 4141 job = list_first_entry_or_null(&ring->sched.ring_mirror_list, 4142 struct drm_sched_job, node); 4143 spin_unlock(&ring->sched.job_list_lock); 4144 if (job) 4145 return true; 4146 } 4147 return false; 4148 } 4149 4150 /** 4151 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4152 * 4153 * @adev: amdgpu_device pointer 4154 * 4155 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4156 * a hung GPU. 
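 *
 * The decision follows the amdgpu_gpu_recovery module parameter: 0 never
 * recovers, a positive value always recovers, and -1 (auto) uses the
 * per-ASIC default encoded in the switch below; SR-IOV VFs recover unless
 * recovery is explicitly disabled.
 *
 * Illustrative sketch (hedged, not a quote of the actual timeout handler)
 * of how this pairs with amdgpu_device_gpu_recover():
 *
 *   if (amdgpu_device_should_recover_gpu(ring->adev))
 *           amdgpu_device_gpu_recover(ring->adev, job);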
4157 */ 4158 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4159 { 4160 if (!amdgpu_device_ip_check_soft_reset(adev)) { 4161 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n"); 4162 return false; 4163 } 4164 4165 if (amdgpu_gpu_recovery == 0) 4166 goto disabled; 4167 4168 if (amdgpu_sriov_vf(adev)) 4169 return true; 4170 4171 if (amdgpu_gpu_recovery == -1) { 4172 switch (adev->asic_type) { 4173 case CHIP_BONAIRE: 4174 case CHIP_HAWAII: 4175 case CHIP_TOPAZ: 4176 case CHIP_TONGA: 4177 case CHIP_FIJI: 4178 case CHIP_POLARIS10: 4179 case CHIP_POLARIS11: 4180 case CHIP_POLARIS12: 4181 case CHIP_VEGAM: 4182 case CHIP_VEGA20: 4183 case CHIP_VEGA10: 4184 case CHIP_VEGA12: 4185 case CHIP_RAVEN: 4186 case CHIP_ARCTURUS: 4187 case CHIP_RENOIR: 4188 case CHIP_NAVI10: 4189 case CHIP_NAVI14: 4190 case CHIP_NAVI12: 4191 case CHIP_SIENNA_CICHLID: 4192 break; 4193 default: 4194 goto disabled; 4195 } 4196 } 4197 4198 return true; 4199 4200 disabled: 4201 dev_info(adev->dev, "GPU recovery disabled.\n"); 4202 return false; 4203 } 4204 4205 4206 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4207 struct amdgpu_job *job, 4208 bool *need_full_reset_arg) 4209 { 4210 int i, r = 0; 4211 bool need_full_reset = *need_full_reset_arg; 4212 4213 amdgpu_debugfs_wait_dump(adev); 4214 4215 if (amdgpu_sriov_vf(adev)) { 4216 /* stop the data exchange thread */ 4217 amdgpu_virt_fini_data_exchange(adev); 4218 } 4219 4220 /* block all schedulers and reset given job's ring */ 4221 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4222 struct amdgpu_ring *ring = adev->rings[i]; 4223 4224 if (!ring || !ring->sched.thread) 4225 continue; 4226 4227 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4228 amdgpu_fence_driver_force_completion(ring); 4229 } 4230 4231 if (job) 4232 drm_sched_increase_karma(&job->base); 4233 4234 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4235 if (!amdgpu_sriov_vf(adev)) { 4236 4237 if (!need_full_reset) 4238 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4239 4240 if (!need_full_reset) { 4241 amdgpu_device_ip_pre_soft_reset(adev); 4242 r = amdgpu_device_ip_soft_reset(adev); 4243 amdgpu_device_ip_post_soft_reset(adev); 4244 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4245 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4246 need_full_reset = true; 4247 } 4248 } 4249 4250 if (need_full_reset) 4251 r = amdgpu_device_ip_suspend(adev); 4252 4253 *need_full_reset_arg = need_full_reset; 4254 } 4255 4256 return r; 4257 } 4258 4259 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, 4260 struct list_head *device_list_handle, 4261 bool *need_full_reset_arg, 4262 bool skip_hw_reset) 4263 { 4264 struct amdgpu_device *tmp_adev = NULL; 4265 bool need_full_reset = *need_full_reset_arg, vram_lost = false; 4266 int r = 0; 4267 4268 /* 4269 * ASIC reset has to be done on all XGMI hive nodes ASAP 4270 * to allow proper link negotiation in FW (within 1 sec) 4271 */ 4272 if (!skip_hw_reset && need_full_reset) { 4273 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4274 /* For XGMI run all resets in parallel to speed up the process */ 4275 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4276 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4277 r = -EALREADY; 4278 } else 4279 r = amdgpu_asic_reset(tmp_adev); 4280 4281 if (r) { 4282 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4283 r,
adev_to_drm(tmp_adev)->unique); 4284 break; 4285 } 4286 } 4287 4288 /* For XGMI wait for all resets to complete before proceed */ 4289 if (!r) { 4290 list_for_each_entry(tmp_adev, device_list_handle, 4291 gmc.xgmi.head) { 4292 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4293 flush_work(&tmp_adev->xgmi_reset_work); 4294 r = tmp_adev->asic_reset_res; 4295 if (r) 4296 break; 4297 } 4298 } 4299 } 4300 } 4301 4302 if (!r && amdgpu_ras_intr_triggered()) { 4303 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4304 if (tmp_adev->mmhub.funcs && 4305 tmp_adev->mmhub.funcs->reset_ras_error_count) 4306 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev); 4307 } 4308 4309 amdgpu_ras_intr_cleared(); 4310 } 4311 4312 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4313 if (need_full_reset) { 4314 /* post card */ 4315 if (amdgpu_device_asic_init(tmp_adev)) 4316 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4317 4318 if (!r) { 4319 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4320 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4321 if (r) 4322 goto out; 4323 4324 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4325 if (vram_lost) { 4326 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4327 amdgpu_inc_vram_lost(tmp_adev); 4328 } 4329 4330 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT)); 4331 if (r) 4332 goto out; 4333 4334 r = amdgpu_device_fw_loading(tmp_adev); 4335 if (r) 4336 return r; 4337 4338 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4339 if (r) 4340 goto out; 4341 4342 if (vram_lost) 4343 amdgpu_device_fill_reset_magic(tmp_adev); 4344 4345 /* 4346 * Add this ASIC as tracked as reset was already 4347 * complete successfully. 4348 */ 4349 amdgpu_register_gpu_instance(tmp_adev); 4350 4351 r = amdgpu_device_ip_late_init(tmp_adev); 4352 if (r) 4353 goto out; 4354 4355 amdgpu_fbdev_set_suspend(tmp_adev, 0); 4356 4357 /* 4358 * The GPU enters bad state once faulty pages 4359 * by ECC has reached the threshold, and ras 4360 * recovery is scheduled next. So add one check 4361 * here to break recovery if it indeed exceeds 4362 * bad page threshold, and remind user to 4363 * retire this GPU or setting one bigger 4364 * bad_page_threshold value to fix this once 4365 * probing driver again. 4366 */ 4367 if (!amdgpu_ras_check_err_threshold(tmp_adev)) { 4368 /* must succeed. 
*/ 4369 amdgpu_ras_resume(tmp_adev); 4370 } else { 4371 r = -EINVAL; 4372 goto out; 4373 } 4374 4375 /* Update PSP FW topology after reset */ 4376 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4377 r = amdgpu_xgmi_update_topology(hive, tmp_adev); 4378 } 4379 } 4380 4381 out: 4382 if (!r) { 4383 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 4384 r = amdgpu_ib_ring_tests(tmp_adev); 4385 if (r) { 4386 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 4387 r = amdgpu_device_ip_suspend(tmp_adev); 4388 need_full_reset = true; 4389 r = -EAGAIN; 4390 goto end; 4391 } 4392 } 4393 4394 if (!r) 4395 r = amdgpu_device_recover_vram(tmp_adev); 4396 else 4397 tmp_adev->asic_reset_res = r; 4398 } 4399 4400 end: 4401 *need_full_reset_arg = need_full_reset; 4402 return r; 4403 } 4404 4405 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, 4406 struct amdgpu_hive_info *hive) 4407 { 4408 if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0) 4409 return false; 4410 4411 if (hive) { 4412 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock); 4413 } else { 4414 down_write(&adev->reset_sem); 4415 } 4416 4417 atomic_inc(&adev->gpu_reset_counter); 4418 switch (amdgpu_asic_reset_method(adev)) { 4419 case AMD_RESET_METHOD_MODE1: 4420 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 4421 break; 4422 case AMD_RESET_METHOD_MODE2: 4423 adev->mp1_state = PP_MP1_STATE_RESET; 4424 break; 4425 default: 4426 adev->mp1_state = PP_MP1_STATE_NONE; 4427 break; 4428 } 4429 4430 return true; 4431 } 4432 4433 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) 4434 { 4435 amdgpu_vf_error_trans_all(adev); 4436 adev->mp1_state = PP_MP1_STATE_NONE; 4437 atomic_set(&adev->in_gpu_reset, 0); 4438 up_write(&adev->reset_sem); 4439 } 4440 4441 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 4442 { 4443 struct pci_dev *p = NULL; 4444 4445 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4446 adev->pdev->bus->number, 1); 4447 if (p) { 4448 pm_runtime_enable(&(p->dev)); 4449 pm_runtime_resume(&(p->dev)); 4450 } 4451 } 4452 4453 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 4454 { 4455 enum amd_reset_method reset_method; 4456 struct pci_dev *p = NULL; 4457 u64 expires; 4458 4459 /* 4460 * For now, only BACO and mode1 reset are confirmed 4461 * to suffer the audio issue without proper suspended. 4462 */ 4463 reset_method = amdgpu_asic_reset_method(adev); 4464 if ((reset_method != AMD_RESET_METHOD_BACO) && 4465 (reset_method != AMD_RESET_METHOD_MODE1)) 4466 return -EINVAL; 4467 4468 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4469 adev->pdev->bus->number, 1); 4470 if (!p) 4471 return -ENODEV; 4472 4473 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 4474 if (!expires) 4475 /* 4476 * If we cannot get the audio device autosuspend delay, 4477 * a fixed 4S interval will be used. Considering 3S is 4478 * the audio controller default autosuspend delay setting. 4479 * 4S used here is guaranteed to cover that. 4480 */ 4481 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 4482 4483 while (!pm_runtime_status_suspended(&(p->dev))) { 4484 if (!pm_runtime_suspend(&(p->dev))) 4485 break; 4486 4487 if (expires < ktime_get_mono_fast_ns()) { 4488 dev_warn(adev->dev, "failed to suspend display audio\n"); 4489 /* TODO: abort the succeeding gpu reset? 
*/ 4490 return -ETIMEDOUT; 4491 } 4492 } 4493 4494 pm_runtime_disable(&(p->dev)); 4495 4496 return 0; 4497 } 4498 4499 /** 4500 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 4501 * 4502 * @adev: amdgpu_device pointer 4503 * @job: which job triggered the hang 4504 * 4505 * Attempt to reset the GPU if it has hung (all asics). 4506 * Attempt a soft reset or a full reset and reinitialize the ASIC. 4507 * Returns 0 for success or an error on failure. 4508 */ 4509 4510 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 4511 struct amdgpu_job *job) 4512 { 4513 struct list_head device_list, *device_list_handle = NULL; 4514 bool need_full_reset = false; 4515 bool job_signaled = false; 4516 struct amdgpu_hive_info *hive = NULL; 4517 struct amdgpu_device *tmp_adev = NULL; 4518 int i, r = 0; 4519 bool need_emergency_restart = false; 4520 bool audio_suspended = false; 4521 4522 /* 4523 * Special case: RAS triggered and full reset isn't supported 4524 */ 4525 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 4526 4527 /* 4528 * Flush RAM to disk so that after reboot 4529 * the user can read the log and see why the system rebooted. 4530 */ 4531 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 4532 DRM_WARN("Emergency reboot."); 4533 4534 ksys_sync_helper(); 4535 emergency_restart(); 4536 } 4537 4538 dev_info(adev->dev, "GPU %s begin!\n", 4539 need_emergency_restart ? "jobs stop":"reset"); 4540 4541 /* 4542 * Here we trylock to avoid a chain of resets executing while this 4543 * timeout (TO) handler is running, whether triggered by jobs on different 4544 * adevs in an XGMI hive or by jobs on different schedulers for the same device. 4545 * We always reset all schedulers for a device and all devices in an XGMI 4546 * hive, so that should take care of them too. 4547 */ 4548 hive = amdgpu_get_xgmi_hive(adev); 4549 if (hive) { 4550 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) { 4551 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", 4552 job ? job->base.id : -1, hive->hive_id); 4553 amdgpu_put_xgmi_hive(hive); 4554 return 0; 4555 } 4556 mutex_lock(&hive->hive_lock); 4557 } 4558 4559 /* 4560 * Build list of devices to reset. 4561 * If we are in XGMI hive mode, re-sort the device list 4562 * to put adev in the first position. 4563 */ 4564 INIT_LIST_HEAD(&device_list); 4565 if (adev->gmc.xgmi.num_physical_nodes > 1) { 4566 if (!hive) 4567 return -ENODEV; 4568 if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list)) 4569 list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list); 4570 device_list_handle = &hive->device_list; 4571 } else { 4572 list_add_tail(&adev->gmc.xgmi.head, &device_list); 4573 device_list_handle = &device_list; 4574 } 4575 4576 /* block all schedulers and reset given job's ring */ 4577 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4578 if (!amdgpu_device_lock_adev(tmp_adev, hive)) { 4579 dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress", 4580 job ? job->base.id : -1); 4581 r = 0; 4582 goto skip_recovery; 4583 } 4584 4585 /* 4586 * Try to put the audio codec into the suspended state 4587 * before the gpu reset starts. 4588 * 4589 * Because the power domain of the graphics device 4590 * is shared with the AZ (audio) power domain, skipping this 4591 * could change the audio hardware from behind 4592 * the audio driver's back. That would trigger 4593 * some audio codec errors.
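 *
 * Note: amdgpu_device_suspend_display_audio() below also disables runtime
 * PM on the audio function; the matching amdgpu_device_resume_display_audio()
 * call under skip_sched_resume re-enables and resumes it once recovery is
 * done, guarded by audio_suspended being set here.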
4594 */ 4595 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 4596 audio_suspended = true; 4597 4598 amdgpu_ras_set_error_query_ready(tmp_adev, false); 4599 4600 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 4601 4602 if (!amdgpu_sriov_vf(tmp_adev)) 4603 amdgpu_amdkfd_pre_reset(tmp_adev); 4604 4605 /* 4606 * Mark the ASICs to be reset as untracked first, 4607 * and add them back after the reset completes. 4608 */ 4609 amdgpu_unregister_gpu_instance(tmp_adev); 4610 4611 amdgpu_fbdev_set_suspend(tmp_adev, 1); 4612 4613 /* disable ras on ALL IPs */ 4614 if (!need_emergency_restart && 4615 amdgpu_device_ip_need_full_reset(tmp_adev)) 4616 amdgpu_ras_suspend(tmp_adev); 4617 4618 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4619 struct amdgpu_ring *ring = tmp_adev->rings[i]; 4620 4621 if (!ring || !ring->sched.thread) 4622 continue; 4623 4624 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 4625 4626 if (need_emergency_restart) 4627 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 4628 } 4629 } 4630 4631 if (need_emergency_restart) 4632 goto skip_sched_resume; 4633 4634 /* 4635 * Must check whether the guilty job has already signaled here, since after 4636 * this point all old HW fences are force signaled. 4637 * 4638 * job->base holds a reference to the parent fence 4639 */ 4640 if (job && job->base.s_fence->parent && 4641 dma_fence_is_signaled(job->base.s_fence->parent)) { 4642 job_signaled = true; 4643 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 4644 goto skip_hw_reset; 4645 } 4646 4647 retry: /* Pre asic reset for the rest of the adevs in the XGMI hive. */ 4648 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4649 r = amdgpu_device_pre_asic_reset(tmp_adev, 4650 (tmp_adev == adev) ? job : NULL, 4651 &need_full_reset); 4652 /* TODO: should we stop here? */ 4653 if (r) { 4654 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 4655 r, adev_to_drm(tmp_adev)->unique); 4656 tmp_adev->asic_reset_res = r; 4657 } 4658 } 4659 4660 /* Actual ASIC resets if needed. */ 4661 /* TODO Implement XGMI hive reset logic for SRIOV */ 4662 if (amdgpu_sriov_vf(adev)) { 4663 r = amdgpu_device_reset_sriov(adev, job ? false : true); 4664 if (r) 4665 adev->asic_reset_res = r; 4666 } else { 4667 r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false); 4668 if (r && r == -EAGAIN) 4669 goto retry; 4670 } 4671 4672 skip_hw_reset: 4673 4674 /* Post ASIC reset for all devs. */ 4675 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4676 4677 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4678 struct amdgpu_ring *ring = tmp_adev->rings[i]; 4679 4680 if (!ring || !ring->sched.thread) 4681 continue; 4682 4683 /* No point in resubmitting jobs if we didn't HW reset */ 4684 if (!tmp_adev->asic_reset_res && !job_signaled) 4685 drm_sched_resubmit_jobs(&ring->sched); 4686 4687 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); 4688 } 4689 4690 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) { 4691 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 4692 } 4693 4694 tmp_adev->asic_reset_res = 0; 4695 4696 if (r) { 4697 /* bad news, how do we tell it to userspace ?
*/ 4698 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 4699 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 4700 } else { 4701 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 4702 } 4703 } 4704 4705 skip_sched_resume: 4706 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4707 /* unlock kfd: SRIOV would do it separately */ 4708 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 4709 amdgpu_amdkfd_post_reset(tmp_adev); 4710 if (audio_suspended) 4711 amdgpu_device_resume_display_audio(tmp_adev); 4712 amdgpu_device_unlock_adev(tmp_adev); 4713 } 4714 4715 skip_recovery: 4716 if (hive) { 4717 atomic_set(&hive->in_reset, 0); 4718 mutex_unlock(&hive->hive_lock); 4719 amdgpu_put_xgmi_hive(hive); 4720 } 4721 4722 if (r) 4723 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 4724 return r; 4725 } 4726 4727 /** 4728 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot 4729 * 4730 * @adev: amdgpu_device pointer 4731 * 4732 * Fetches and stores in the driver the PCIE capabilities (gen speed 4733 * and lanes) of the slot the device is in. Handles APUs and 4734 * virtualized environments where PCIE config space may not be available. 4735 */ 4736 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 4737 { 4738 struct pci_dev *pdev; 4739 enum pci_bus_speed speed_cap, platform_speed_cap; 4740 enum pcie_link_width platform_link_width; 4741 4742 if (amdgpu_pcie_gen_cap) 4743 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 4744 4745 if (amdgpu_pcie_lane_cap) 4746 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 4747 4748 /* covers APUs as well */ 4749 if (pci_is_root_bus(adev->pdev->bus)) { 4750 if (adev->pm.pcie_gen_mask == 0) 4751 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 4752 if (adev->pm.pcie_mlw_mask == 0) 4753 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 4754 return; 4755 } 4756 4757 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 4758 return; 4759 4760 pcie_bandwidth_available(adev->pdev, NULL, 4761 &platform_speed_cap, &platform_link_width); 4762 4763 if (adev->pm.pcie_gen_mask == 0) { 4764 /* asic caps */ 4765 pdev = adev->pdev; 4766 speed_cap = pcie_get_speed_cap(pdev); 4767 if (speed_cap == PCI_SPEED_UNKNOWN) { 4768 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4769 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4770 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 4771 } else { 4772 if (speed_cap == PCIE_SPEED_16_0GT) 4773 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4774 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4775 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 4776 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 4777 else if (speed_cap == PCIE_SPEED_8_0GT) 4778 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4779 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4780 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 4781 else if (speed_cap == PCIE_SPEED_5_0GT) 4782 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4783 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 4784 else 4785 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 4786 } 4787 /* platform caps */ 4788 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 4789 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4790 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 4791 } else { 4792 if (platform_speed_cap == PCIE_SPEED_16_0GT) 4793 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4794 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4795 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 4796 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 4797 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 4798 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4799 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4800 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 4801 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 4802 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4803 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 4804 else 4805 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 4806 4807 } 4808 } 4809 if (adev->pm.pcie_mlw_mask == 0) { 4810 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 4811 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 4812 } else { 4813 switch (platform_link_width) { 4814 case PCIE_LNK_X32: 4815 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 4816 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 4817 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 4818 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4819 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4820 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4821 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4822 break; 4823 case PCIE_LNK_X16: 4824 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 4825 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 4826 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4827 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4828 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4829 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4830 break; 4831 case PCIE_LNK_X12: 4832 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 4833 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4834 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4835 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4836 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4837 break; 4838 case PCIE_LNK_X8: 4839 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4840 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4841 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4842 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4843 break; 4844 case PCIE_LNK_X4: 4845 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4846 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4847 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4848 break; 4849 case PCIE_LNK_X2: 4850 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4851 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4852 break; 4853 case PCIE_LNK_X1: 4854 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 4855 break; 4856 default: 4857 break; 4858 } 4859 } 4860 } 4861 } 4862 4863 int amdgpu_device_baco_enter(struct drm_device *dev) 4864 { 4865 struct amdgpu_device *adev = drm_to_adev(dev); 4866 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 4867 4868 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 4869 return -ENOTSUPP; 4870 4871 if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt) 4872 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 4873 4874 return amdgpu_dpm_baco_enter(adev); 4875 } 4876 4877 int amdgpu_device_baco_exit(struct drm_device *dev) 4878 { 4879 struct amdgpu_device *adev = drm_to_adev(dev); 4880 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 4881 int ret = 0; 4882 4883 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 4884 return -ENOTSUPP; 4885 4886 ret = amdgpu_dpm_baco_exit(adev); 4887 if (ret) 4888 return ret; 4889 4890 if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt) 4891 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 4892 4893 return 0; 4894 } 4895 4896 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev) 4897 { 4898 int i; 4899 4900 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4901 struct amdgpu_ring *ring = 
adev->rings[i]; 4902 4903 if (!ring || !ring->sched.thread) 4904 continue; 4905 4906 cancel_delayed_work_sync(&ring->sched.work_tdr); 4907 } 4908 } 4909 4910 /** 4911 * amdgpu_pci_error_detected - Called when a PCI error is detected. 4912 * @pdev: PCI device struct 4913 * @state: PCI channel state 4914 * 4915 * Description: Called when a PCI error is detected. 4916 * 4917 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 4918 */ 4919 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 4920 { 4921 struct drm_device *dev = pci_get_drvdata(pdev); 4922 struct amdgpu_device *adev = drm_to_adev(dev); 4923 int i; 4924 4925 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 4926 4927 if (adev->gmc.xgmi.num_physical_nodes > 1) { 4928 DRM_WARN("No support for XGMI hive yet..."); 4929 return PCI_ERS_RESULT_DISCONNECT; 4930 } 4931 4932 switch (state) { 4933 case pci_channel_io_normal: 4934 return PCI_ERS_RESULT_CAN_RECOVER; 4935 /* Fatal error, prepare for slot reset */ 4936 case pci_channel_io_frozen: 4937 /* 4938 * Cancel and wait for all TDRs in progress if failing to 4939 * set adev->in_gpu_reset in amdgpu_device_lock_adev 4940 * 4941 * Locking adev->reset_sem will prevent any external access 4942 * to GPU during PCI error recovery 4943 */ 4944 while (!amdgpu_device_lock_adev(adev, NULL)) 4945 amdgpu_cancel_all_tdr(adev); 4946 4947 /* 4948 * Block any work scheduling as we do for regular GPU reset 4949 * for the duration of the recovery 4950 */ 4951 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4952 struct amdgpu_ring *ring = adev->rings[i]; 4953 4954 if (!ring || !ring->sched.thread) 4955 continue; 4956 4957 drm_sched_stop(&ring->sched, NULL); 4958 } 4959 return PCI_ERS_RESULT_NEED_RESET; 4960 case pci_channel_io_perm_failure: 4961 /* Permanent error, prepare for device removal */ 4962 return PCI_ERS_RESULT_DISCONNECT; 4963 } 4964 4965 return PCI_ERS_RESULT_NEED_RESET; 4966 } 4967 4968 /** 4969 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 4970 * @pdev: pointer to PCI device 4971 */ 4972 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 4973 { 4974 4975 DRM_INFO("PCI error: mmio enabled callback!!\n"); 4976 4977 /* TODO - dump whatever for debugging purposes */ 4978 4979 /* This called only if amdgpu_pci_error_detected returns 4980 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 4981 * works, no need to reset slot. 4982 */ 4983 4984 return PCI_ERS_RESULT_RECOVERED; 4985 } 4986 4987 /** 4988 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 4989 * @pdev: PCI device struct 4990 * 4991 * Description: This routine is called by the pci error recovery 4992 * code after the PCI slot has been reset, just before we 4993 * should resume normal operations. 
4994 */ 4995 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 4996 { 4997 struct drm_device *dev = pci_get_drvdata(pdev); 4998 struct amdgpu_device *adev = drm_to_adev(dev); 4999 int r, i; 5000 bool need_full_reset = true; 5001 u32 memsize; 5002 struct list_head device_list; 5003 5004 DRM_INFO("PCI error: slot reset callback!!\n"); 5005 5006 INIT_LIST_HEAD(&device_list); 5007 list_add_tail(&adev->gmc.xgmi.head, &device_list); 5008 5009 /* wait for asic to come out of reset */ 5010 msleep(500); 5011 5012 /* Restore PCI confspace */ 5013 amdgpu_device_load_pci_state(pdev); 5014 5015 /* confirm ASIC came out of reset */ 5016 for (i = 0; i < adev->usec_timeout; i++) { 5017 memsize = amdgpu_asic_get_config_memsize(adev); 5018 5019 if (memsize != 0xffffffff) 5020 break; 5021 udelay(1); 5022 } 5023 if (memsize == 0xffffffff) { 5024 r = -ETIME; 5025 goto out; 5026 } 5027 5028 adev->in_pci_err_recovery = true; 5029 r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset); 5030 adev->in_pci_err_recovery = false; 5031 if (r) 5032 goto out; 5033 5034 r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true); 5035 5036 out: 5037 if (!r) { 5038 if (amdgpu_device_cache_pci_state(adev->pdev)) 5039 pci_restore_state(adev->pdev); 5040 5041 DRM_INFO("PCIe error recovery succeeded\n"); 5042 } else { 5043 DRM_ERROR("PCIe error recovery failed, err:%d", r); 5044 amdgpu_device_unlock_adev(adev); 5045 } 5046 5047 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 5048 } 5049 5050 /** 5051 * amdgpu_pci_resume() - resume normal ops after PCI reset 5052 * @pdev: pointer to PCI device 5053 * 5054 * Called when the error recovery driver tells us that its 5055 * OK to resume normal operation. Use completion to allow 5056 * halted scsi ops to resume. 5057 */ 5058 void amdgpu_pci_resume(struct pci_dev *pdev) 5059 { 5060 struct drm_device *dev = pci_get_drvdata(pdev); 5061 struct amdgpu_device *adev = drm_to_adev(dev); 5062 int i; 5063 5064 5065 DRM_INFO("PCI error: resume callback!!\n"); 5066 5067 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5068 struct amdgpu_ring *ring = adev->rings[i]; 5069 5070 if (!ring || !ring->sched.thread) 5071 continue; 5072 5073 5074 drm_sched_resubmit_jobs(&ring->sched); 5075 drm_sched_start(&ring->sched, true); 5076 } 5077 5078 amdgpu_device_unlock_adev(adev); 5079 } 5080 5081 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 5082 { 5083 struct drm_device *dev = pci_get_drvdata(pdev); 5084 struct amdgpu_device *adev = drm_to_adev(dev); 5085 int r; 5086 5087 r = pci_save_state(pdev); 5088 if (!r) { 5089 kfree(adev->pci_state); 5090 5091 adev->pci_state = pci_store_saved_state(pdev); 5092 5093 if (!adev->pci_state) { 5094 DRM_ERROR("Failed to store PCI saved state"); 5095 return false; 5096 } 5097 } else { 5098 DRM_WARN("Failed to save PCI state, err:%d\n", r); 5099 return false; 5100 } 5101 5102 return true; 5103 } 5104 5105 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 5106 { 5107 struct drm_device *dev = pci_get_drvdata(pdev); 5108 struct amdgpu_device *adev = drm_to_adev(dev); 5109 int r; 5110 5111 if (!adev->pci_state) 5112 return false; 5113 5114 r = pci_load_saved_state(pdev, adev->pci_state); 5115 5116 if (!r) { 5117 pci_restore_state(pdev); 5118 } else { 5119 DRM_WARN("Failed to load PCI state, err:%d\n", r); 5120 return false; 5121 } 5122 5123 return true; 5124 } 5125 5126 5127
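
/*
 * Illustrative sketch only (assumption, not part of this file): how the PCI
 * error callbacks above are typically wired into the driver's struct
 * pci_driver. The real hookup lives in the PCI driver registration code
 * (amdgpu_drv.c in this tree); the variable names used below are
 * placeholders, not the actual symbols.
 */
#if 0
static struct pci_error_handlers amdgpu_pci_err_handler_example = {
	.error_detected	= amdgpu_pci_error_detected,
	.mmio_enabled	= amdgpu_pci_mmio_enabled,
	.slot_reset	= amdgpu_pci_slot_reset,
	.resume		= amdgpu_pci_resume,
};

static struct pci_driver amdgpu_pci_driver_example = {
	/* .name, .id_table, .probe, .remove, .driver.pm, ... */
	.err_handler	= &amdgpu_pci_err_handler_example,
};
#endif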