/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/pci.h>
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"NAVI10",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"LAST",
};
/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
		amdgpu_device_get_pcie_replay_count, NULL);

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * DOC: product_name
 *
 * The amdgpu driver provides a sysfs API for reporting the product name
 * for the device.
 * The file product_name is used for this and returns the product name
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_name(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
}

static DEVICE_ATTR(product_name, S_IRUGO,
		amdgpu_device_get_product_name, NULL);

/**
 * DOC: product_number
 *
 * The amdgpu driver provides a sysfs API for reporting the part number
 * for the device.
 * The file product_number is used for this and returns the part number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
}

static DEVICE_ATTR(product_number, S_IRUGO,
		amdgpu_device_get_product_number, NULL);

/**
 * DOC: serial_number
 *
 * The amdgpu driver provides a sysfs API for reporting the serial number
 * for the device.
 * The file serial_number is used for this and returns the serial number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_serial_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
}

static DEVICE_ATTR(serial_number, S_IRUGO,
		amdgpu_device_get_serial_number, NULL);
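/*
 * Illustrative sketch (not part of the driver): the attributes above are
 * plain read-only sysfs files, so userspace can query them with a simple
 * read.  The PCI path below is a made-up example; the real location depends
 * on the board's PCI topology.
 *
 *	char buf[64];
 *	int fd = open("/sys/bus/pci/devices/0000:03:00.0/pcie_replay_count",
 *		      O_RDONLY);
 *
 *	if (fd >= 0 && read(fd, buf, sizeof(buf) - 1) > 0)
 *		printf("replays: %s", buf);
 */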
/**
 * amdgpu_device_supports_atpx - Is the device a dGPU with HG/PX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with HG/PX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_atpx(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->flags & AMD_IS_PX)
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power resources (BOCO),
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3)
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise return false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       uint32_t *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0;
	uint64_t last;

#ifdef CONFIG_64BIT
	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		void __iomem *addr = adev->mman.aper_base_kaddr + pos;
		size_t count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_asic_flush_hdp(adev, NULL);
		} else {
			amdgpu_asic_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

		if (count == size)
			return;

		pos += count;
		buf += count / 4;
		size -= count;
	}
#endif

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		uint32_t tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *buf++);
		else
			*buf++ = RREG32_NO_KIQ(mmMM_DATA);
	}
	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
}
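/*
 * Illustrative sketch (not part of the driver): reading one dword back from
 * the start of VRAM with the helper above.  The offset and caller are
 * hypothetical; real users include the debugfs VRAM interface and RAS
 * error-injection paths.
 *
 *	uint32_t value;
 *
 *	amdgpu_device_vram_access(adev, 0, &value, sizeof(value), false);
 *	dev_info(adev->dev, "VRAM[0] = 0x%08x\n", value);
 */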
/*
 * register access helper functions.
 */
/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (adev->in_pci_err_recovery)
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (adev->in_pci_err_recovery)
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */
/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (adev->in_pci_err_recovery)
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}
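/*
 * Illustrative sketch (not part of the driver): a hypothetical caller doing
 * a read-modify-write through amdgpu_device_rreg() above and the matching
 * amdgpu_device_wreg() defined below.  mmFOO_BAR is a made-up register
 * offset; AMDGPU_REGS_NO_KIQ bypasses the KIQ path under SR-IOV.
 *
 *	uint32_t val;
 *
 *	val = amdgpu_device_rreg(adev, mmFOO_BAR, 0);
 *	val |= 0x1;
 *	amdgpu_device_wreg(adev, mmFOO_BAR, val, AMDGPU_REGS_NO_KIQ);
 */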
/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (adev->in_pci_err_recovery)
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_sem)) {
			amdgpu_kiq_wreg(adev, reg, v);
			up_read(&adev->reset_sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/*
 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range
 *
 * This function is invoked only for debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v)
{
	if (adev->in_pci_err_recovery)
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_io_rreg - read an IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 *
 * Returns the 32 bit value from the offset specified.
 */
u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
{
	if (adev->in_pci_err_recovery)
		return 0;

	if ((reg * 4) < adev->rio_mem_size)
		return ioread32(adev->rio_mem + (reg * 4));
	else {
		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
		return ioread32(adev->rio_mem + (mmMM_DATA * 4));
	}
}

/**
 * amdgpu_io_wreg - write to an IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
{
	if (adev->in_pci_err_recovery)
		return;

	if ((reg * 4) < adev->rio_mem_size)
		iowrite32(v, adev->rio_mem + (reg * 4));
	else {
		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
		iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
	}
}

/**
 * amdgpu_mm_rdoorbell - read a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (CIK).
 */
u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
{
	if (adev->in_pci_err_recovery)
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return readl(adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}
/**
 * amdgpu_mm_wdoorbell - write a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (CIK).
 */
void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
{
	if (adev->in_pci_err_recovery)
		return;

	if (index < adev->doorbell.num_doorbells) {
		writel(v, adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
{
	if (adev->in_pci_err_recovery)
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
{
	if (adev->in_pci_err_recovery)
		return;

	if (index < adev->doorbell.num_doorbells) {
		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 pcie_index, u32 pcie_data,
				u32 reg_addr)
{
	unsigned long flags;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}
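/*
 * Illustrative sketch (not part of the driver): ASIC code typically wires
 * its pcie_rreg/pcie_wreg callbacks to these indirect-access helpers.  The
 * register names below (mmPCIE_INDEX2/mmPCIE_DATA2) are assumptions for the
 * example; the actual offsets come from the per-ASIC register headers.
 *
 *	static u32 example_pcie_rreg(struct amdgpu_device *adev, u32 reg)
 *	{
 *		return amdgpu_device_indirect_rreg(adev, mmPCIE_INDEX2,
 *						   mmPCIE_DATA2, reg);
 *	}
 */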
/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 pcie_index, u32 pcie_data,
				  u32 reg_addr)
{
	unsigned long flags;
	u64 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 pcie_index, u32 pcie_data,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 pcie_index, u32 pcie_data,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}
/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	amdgpu_asic_pre_asic_init(adev);

	return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}

/**
 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
				       &adev->vram_scratch.robj,
				       &adev->vram_scratch.gpu_addr,
				       (void **)&adev->vram_scratch.ptr);
}

/**
 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
}
/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}
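/*
 * Illustrative sketch (not part of the driver): golden register tables are
 * flat arrays of {register, AND mask, OR mask} triplets.  The offsets and
 * masks below are made up; real tables live in the per-ASIC golden-settings
 * headers.
 *
 *	static const u32 example_golden_settings[] = {
 *		mmFOO_CTRL, 0xffffffff, 0x00000100,
 *		mmBAR_CNTL, 0x0000000f, 0x00000002,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev,
 *						example_golden_settings,
 *						ARRAY_SIZE(example_golden_settings));
 */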
/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * GPU doorbell aperture helpers function.
 */
/**
 * amdgpu_device_doorbell_init - Init doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Init doorbell driver information (CIK)
 * Returns 0 on success, error on failure.
 */
static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
{

	/* No doorbell on SI hardware generation */
	if (adev->asic_type < CHIP_BONAIRE) {
		adev->doorbell.base = 0;
		adev->doorbell.size = 0;
		adev->doorbell.num_doorbells = 0;
		adev->doorbell.ptr = NULL;
		return 0;
	}

	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
		return -EINVAL;

	amdgpu_asic_init_doorbell_index(adev);

	/* doorbell bar mapping */
	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
	adev->doorbell.size = pci_resource_len(adev->pdev, 2);

	adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
					     adev->doorbell_index.max_assignment+1);
	if (adev->doorbell.num_doorbells == 0)
		return -EINVAL;

	/* For Vega, reserve and map two pages on doorbell BAR since SDMA
	 * paging queue doorbell use the second page. The
	 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
	 * doorbells are in the first page. So with paging queue enabled,
	 * the max num_doorbells should be increased by one page (0x400 dwords).
	 */
	if (adev->asic_type >= CHIP_VEGA10)
		adev->doorbell.num_doorbells += 0x400;

	adev->doorbell.ptr = ioremap(adev->doorbell.base,
				     adev->doorbell.num_doorbells *
				     sizeof(u32));
	if (adev->doorbell.ptr == NULL)
		return -ENOMEM;

	return 0;
}

/**
 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down doorbell driver information (CIK)
 */
static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
{
	iounmap(adev->doorbell.ptr);
	adev->doorbell.ptr = NULL;
}



/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}
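/*
 * Illustrative sketch (not part of the driver): ring and fence code grabs a
 * writeback slot, lets the GPU write status into it, and frees it again on
 * teardown.  Error handling and the actual consumer are omitted here.
 *
 *	u32 wb;
 *	u64 gpu_addr;
 *
 *	if (!amdgpu_device_wb_get(adev, &wb)) {
 *		gpu_addr = adev->wb.gpu_addr + (wb * 4);
 *		// ... point a ring rptr/wptr or fence at gpu_addr ...
 *		amdgpu_device_wb_free(adev, wb);
 *	}
 */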
/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned i;
	u16 cmd;
	int r;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_device_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_device_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup,
 * or if posting is needed because a hw reset was performed.
 * Returns true if posting is needed, false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* For FIJI: in the whole-GPU pass-through virtualization case,
		 * after VM reboot some old SMC firmware still needs the driver
		 * to do a vPost, otherwise the GPU hangs.  SMC firmware versions
		 * above 22.15 don't have this flaw, so force vPost for versions
		 * below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;
			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @cookie: amdgpu_device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
{
	struct amdgpu_device *adev = cookie;
	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in the page table versus
 * the page directory: a page is 4KB, so we have a 12 bit offset, a minimum
 * of 9 bits in the page table, and the remaining bits are in the page
 * directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}
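/*
 * Worked example (not part of the driver): with 4KB pages there is a 12-bit
 * in-page offset, so a vm_block_size of 9 means each page table covers
 * 2^9 pages = 512 * 4KB = 2MB of address space, while the remaining virtual
 * address bits select page directory entries.  Larger values trade fewer
 * directory levels for bigger page tables.
 */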
/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("SMU memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	amdgpu_gmc_tmz_set(adev);

	amdgpu_gmc_noretry_set(adev);

	return 0;
}

/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asic before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_atpx(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(pdev, PCI_D0);
		amdgpu_device_load_pci_state(pdev);
		r = pci_enable_device(pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
	} else {
		pr_info("switched off\n");
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(pdev);
		/* Shut down the device */
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};
/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u32 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}
/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if equal or greater
 * return 1 if smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		  ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}
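/*
 * Illustrative sketch (not part of the driver): IP-version checks are a
 * common way to gate features on a specific hardware generation.  The block
 * type and version numbers below are arbitrary examples.
 *
 *	if (amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX,
 *					       9, 0) == 0) {
 *		// GFX IP is version 9.0 or newer
 *	}
 */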
/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		const char *pci_address_name = pci_name(adev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	char fw_name[40];
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	if (adev->mman.discovery_bin) {
		amdgpu_discovery_get_gfx_info(adev);

		/*
		 * FIXME: The bounding box is still needed by Navi12, so
		 * temporarily read it from gpu_info firmware. Should be dropped
		 * when DAL no longer needs it.
		 */
		if (adev->asic_type != CHIP_NAVI12)
			return 0;
	}

	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_VERDE:
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_OLAND:
	case CHIP_HAINAN:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TOPAZ:
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
	case CHIP_VEGA20:
	case CHIP_SIENNA_CICHLID:
	case CHIP_NAVY_FLOUNDER:
	case CHIP_DIMGREY_CAVEFISH:
	default:
		return 0;
	case CHIP_VEGA10:
		chip_name = "vega10";
		break;
	case CHIP_VEGA12:
		chip_name = "vega12";
		break;
	case CHIP_RAVEN:
		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
			chip_name = "raven2";
		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
			chip_name = "picasso";
		else
			chip_name = "raven";
		break;
	case CHIP_ARCTURUS:
		chip_name = "arcturus";
		break;
	case CHIP_RENOIR:
		if (adev->apu_flags & AMD_APU_IS_RENOIR)
			chip_name = "renoir";
		else
			chip_name = "green_sardine";
		break;
	case CHIP_NAVI10:
		chip_name = "navi10";
		break;
	case CHIP_NAVI14:
		chip_name = "navi14";
		break;
	case CHIP_NAVI12:
		chip_name = "navi12";
		break;
	case CHIP_VANGOGH:
		chip_name = "vangogh";
		break;
	}

	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
	err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
	if (err) {
		dev_err(adev->dev,
			"Failed to load gpu_info firmware \"%s\"\n",
			fw_name);
		goto out;
	}
	err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
	if (err) {
		dev_err(adev->dev,
			"Failed to validate gpu_info firmware \"%s\"\n",
			fw_name);
		goto out;
	}

	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);

	switch (hdr->version_major) {
	case 1:
	{
		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
								le32_to_cpu(hdr->header.ucode_array_offset_bytes));

		/*
		 * Should be dropped when DAL no longer needs it.
		 */
		if (adev->asic_type == CHIP_NAVI12)
			goto parse_soc_bounding_box;

		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
		adev->gfx.config.max_texture_channel_caches =
			le32_to_cpu(gpu_info_fw->gc_num_tccs);
		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
		adev->gfx.config.double_offchip_lds_buf =
			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
		adev->gfx.cu_info.max_waves_per_simd =
			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
		adev->gfx.cu_info.max_scratch_slots_per_cu =
			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
		if (hdr->version_minor >= 1) {
			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->gfx.config.num_sc_per_sh =
				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
			adev->gfx.config.num_packer_per_sc =
				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
		}

parse_soc_bounding_box:
		/*
		 * soc bounding box info is not integrated in the discovery table,
		 * we always need to parse it from gpu info firmware if needed.
		 */
		if (hdr->version_minor == 2) {
			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
		}
		break;
	}
	default:
		dev_err(adev->dev,
			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
		err = -EINVAL;
		goto out;
	}
out:
	return err;
}
1947 */ 1948 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 1949 { 1950 int i, r; 1951 1952 amdgpu_device_enable_virtual_display(adev); 1953 1954 if (amdgpu_sriov_vf(adev)) { 1955 r = amdgpu_virt_request_full_gpu(adev, true); 1956 if (r) 1957 return r; 1958 } 1959 1960 switch (adev->asic_type) { 1961 #ifdef CONFIG_DRM_AMDGPU_SI 1962 case CHIP_VERDE: 1963 case CHIP_TAHITI: 1964 case CHIP_PITCAIRN: 1965 case CHIP_OLAND: 1966 case CHIP_HAINAN: 1967 adev->family = AMDGPU_FAMILY_SI; 1968 r = si_set_ip_blocks(adev); 1969 if (r) 1970 return r; 1971 break; 1972 #endif 1973 #ifdef CONFIG_DRM_AMDGPU_CIK 1974 case CHIP_BONAIRE: 1975 case CHIP_HAWAII: 1976 case CHIP_KAVERI: 1977 case CHIP_KABINI: 1978 case CHIP_MULLINS: 1979 if (adev->flags & AMD_IS_APU) 1980 adev->family = AMDGPU_FAMILY_KV; 1981 else 1982 adev->family = AMDGPU_FAMILY_CI; 1983 1984 r = cik_set_ip_blocks(adev); 1985 if (r) 1986 return r; 1987 break; 1988 #endif 1989 case CHIP_TOPAZ: 1990 case CHIP_TONGA: 1991 case CHIP_FIJI: 1992 case CHIP_POLARIS10: 1993 case CHIP_POLARIS11: 1994 case CHIP_POLARIS12: 1995 case CHIP_VEGAM: 1996 case CHIP_CARRIZO: 1997 case CHIP_STONEY: 1998 if (adev->flags & AMD_IS_APU) 1999 adev->family = AMDGPU_FAMILY_CZ; 2000 else 2001 adev->family = AMDGPU_FAMILY_VI; 2002 2003 r = vi_set_ip_blocks(adev); 2004 if (r) 2005 return r; 2006 break; 2007 case CHIP_VEGA10: 2008 case CHIP_VEGA12: 2009 case CHIP_VEGA20: 2010 case CHIP_RAVEN: 2011 case CHIP_ARCTURUS: 2012 case CHIP_RENOIR: 2013 if (adev->flags & AMD_IS_APU) 2014 adev->family = AMDGPU_FAMILY_RV; 2015 else 2016 adev->family = AMDGPU_FAMILY_AI; 2017 2018 r = soc15_set_ip_blocks(adev); 2019 if (r) 2020 return r; 2021 break; 2022 case CHIP_NAVI10: 2023 case CHIP_NAVI14: 2024 case CHIP_NAVI12: 2025 case CHIP_SIENNA_CICHLID: 2026 case CHIP_NAVY_FLOUNDER: 2027 case CHIP_DIMGREY_CAVEFISH: 2028 case CHIP_VANGOGH: 2029 if (adev->asic_type == CHIP_VANGOGH) 2030 adev->family = AMDGPU_FAMILY_VGH; 2031 else 2032 adev->family = AMDGPU_FAMILY_NV; 2033 2034 r = nv_set_ip_blocks(adev); 2035 if (r) 2036 return r; 2037 break; 2038 default: 2039 /* FIXME: not supported yet */ 2040 return -EINVAL; 2041 } 2042 2043 amdgpu_amdkfd_device_probe(adev); 2044 2045 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2046 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2047 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2048 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2049 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2050 2051 for (i = 0; i < adev->num_ip_blocks; i++) { 2052 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2053 DRM_ERROR("disabled ip block: %d <%s>\n", 2054 i, adev->ip_blocks[i].version->funcs->name); 2055 adev->ip_blocks[i].status.valid = false; 2056 } else { 2057 if (adev->ip_blocks[i].version->funcs->early_init) { 2058 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2059 if (r == -ENOENT) { 2060 adev->ip_blocks[i].status.valid = false; 2061 } else if (r) { 2062 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2063 adev->ip_blocks[i].version->funcs->name, r); 2064 return r; 2065 } else { 2066 adev->ip_blocks[i].status.valid = true; 2067 } 2068 } else { 2069 adev->ip_blocks[i].status.valid = true; 2070 } 2071 } 2072 /* get the vbios after the asic_funcs are set up */ 2073 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2074 r = amdgpu_device_parse_gpu_info_fw(adev); 2075 if (r) 2076 return r; 2077 2078 /* Read BIOS */ 2079 if (!amdgpu_get_bios(adev)) 2080 return -EINVAL; 2081 2082 r = 
amdgpu_atombios_init(adev); 2083 if (r) { 2084 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2085 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2086 return r; 2087 } 2088 } 2089 } 2090 2091 adev->cg_flags &= amdgpu_cg_mask; 2092 adev->pg_flags &= amdgpu_pg_mask; 2093 2094 return 0; 2095 } 2096 2097 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2098 { 2099 int i, r; 2100 2101 for (i = 0; i < adev->num_ip_blocks; i++) { 2102 if (!adev->ip_blocks[i].status.sw) 2103 continue; 2104 if (adev->ip_blocks[i].status.hw) 2105 continue; 2106 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2107 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2108 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2109 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2110 if (r) { 2111 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2112 adev->ip_blocks[i].version->funcs->name, r); 2113 return r; 2114 } 2115 adev->ip_blocks[i].status.hw = true; 2116 } 2117 } 2118 2119 return 0; 2120 } 2121 2122 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2123 { 2124 int i, r; 2125 2126 for (i = 0; i < adev->num_ip_blocks; i++) { 2127 if (!adev->ip_blocks[i].status.sw) 2128 continue; 2129 if (adev->ip_blocks[i].status.hw) 2130 continue; 2131 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2132 if (r) { 2133 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2134 adev->ip_blocks[i].version->funcs->name, r); 2135 return r; 2136 } 2137 adev->ip_blocks[i].status.hw = true; 2138 } 2139 2140 return 0; 2141 } 2142 2143 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2144 { 2145 int r = 0; 2146 int i; 2147 uint32_t smu_version; 2148 2149 if (adev->asic_type >= CHIP_VEGA10) { 2150 for (i = 0; i < adev->num_ip_blocks; i++) { 2151 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2152 continue; 2153 2154 /* no need to do the fw loading again if already done*/ 2155 if (adev->ip_blocks[i].status.hw == true) 2156 break; 2157 2158 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2159 r = adev->ip_blocks[i].version->funcs->resume(adev); 2160 if (r) { 2161 DRM_ERROR("resume of IP block <%s> failed %d\n", 2162 adev->ip_blocks[i].version->funcs->name, r); 2163 return r; 2164 } 2165 } else { 2166 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2167 if (r) { 2168 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2169 adev->ip_blocks[i].version->funcs->name, r); 2170 return r; 2171 } 2172 } 2173 2174 adev->ip_blocks[i].status.hw = true; 2175 break; 2176 } 2177 } 2178 2179 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2180 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2181 2182 return r; 2183 } 2184 2185 /** 2186 * amdgpu_device_ip_init - run init for hardware IPs 2187 * 2188 * @adev: amdgpu_device pointer 2189 * 2190 * Main initialization pass for hardware IPs. The list of all the hardware 2191 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2192 * are run. sw_init initializes the software state associated with each IP 2193 * and hw_init initializes the hardware associated with each IP. 2194 * Returns 0 on success, negative error code on failure. 
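 *
 * The GMC block is brought up first so that GPU memory (vram scratch,
 * writeback buffers and, for SR-IOV/MCBP, the static CSA) can be allocated;
 * the remaining blocks are then brought up via the phase1/phase2 hw_init
 * helpers with firmware loading in between.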
2195 */
2196 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2197 {
2198 int i, r;
2199
2200 r = amdgpu_ras_init(adev);
2201 if (r)
2202 return r;
2203
2204 for (i = 0; i < adev->num_ip_blocks; i++) {
2205 if (!adev->ip_blocks[i].status.valid)
2206 continue;
2207 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2208 if (r) {
2209 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2210 adev->ip_blocks[i].version->funcs->name, r);
2211 goto init_failed;
2212 }
2213 adev->ip_blocks[i].status.sw = true;
2214
2215 /* need to do gmc hw init early so we can allocate gpu mem */
2216 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2217 r = amdgpu_device_vram_scratch_init(adev);
2218 if (r) {
2219 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2220 goto init_failed;
2221 }
2222 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2223 if (r) {
2224 DRM_ERROR("hw_init %d failed %d\n", i, r);
2225 goto init_failed;
2226 }
2227 r = amdgpu_device_wb_init(adev);
2228 if (r) {
2229 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2230 goto init_failed;
2231 }
2232 adev->ip_blocks[i].status.hw = true;
2233
2234 /* right after GMC hw init, we create CSA */
2235 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2236 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2237 AMDGPU_GEM_DOMAIN_VRAM, 2238 AMDGPU_CSA_SIZE);
2239 if (r) {
2240 DRM_ERROR("allocate CSA failed %d\n", r);
2241 goto init_failed;
2242 }
2243 }
2244 }
2245 }
2246
2247 if (amdgpu_sriov_vf(adev))
2248 amdgpu_virt_init_data_exchange(adev);
2249
2250 r = amdgpu_ib_pool_init(adev);
2251 if (r) {
2252 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2253 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2254 goto init_failed;
2255 }
2256
2257 r = amdgpu_ucode_create_bo(adev); /* create the ucode bo once sw_init is complete */
2258 if (r)
2259 goto init_failed;
2260
2261 r = amdgpu_device_ip_hw_init_phase1(adev);
2262 if (r)
2263 goto init_failed;
2264
2265 r = amdgpu_device_fw_loading(adev);
2266 if (r)
2267 goto init_failed;
2268
2269 r = amdgpu_device_ip_hw_init_phase2(adev);
2270 if (r)
2271 goto init_failed;
2272
2273 /*
2274 * Retired pages will be loaded from eeprom and reserved here;
2275 * this must be called after amdgpu_device_ip_hw_init_phase2 since,
2276 * for some ASICs, the RAS EEPROM code relies on the SMU being fully
2277 * functional for I2C communication, which is only true at this point.
2278 *
2279 * amdgpu_ras_recovery_init may fail, but the caller only cares about
2280 * failures caused by a bad gpu state and stops the amdgpu init process
2281 * accordingly. For other failures, it still releases all the
2282 * resources and prints an error message, rather than returning a
2283 * negative value to the upper level.
2284 *
2285 * Note: theoretically, this should be called before all vram allocations
2286 * to protect retired pages from being reused.
2287 */
2288 r = amdgpu_ras_recovery_init(adev);
2289 if (r)
2290 goto init_failed;
2291
2292 if (adev->gmc.xgmi.num_physical_nodes > 1)
2293 amdgpu_xgmi_add_device(adev);
2294 amdgpu_amdkfd_device_init(adev);
2295
2296 amdgpu_fru_get_product_info(adev);
2297
2298 init_failed:
2299 if (amdgpu_sriov_vf(adev))
2300 amdgpu_virt_release_full_gpu(adev, true);
2301
2302 return r;
2303 }
2304
2305 /**
2306 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2307 *
2308 * @adev: amdgpu_device pointer
2309 *
2310 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2311 * this function before a GPU reset.
If the value is retained after a
2312 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2313 */
2314 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2315 {
2316 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2317 }
2318
2319 /**
2320 * amdgpu_device_check_vram_lost - check if vram is valid
2321 *
2322 * @adev: amdgpu_device pointer
2323 *
2324 * Checks the reset magic value written to the gart pointer in VRAM.
2325 * The driver calls this after a GPU reset to see if the contents of
2326 * VRAM have been lost or not.
2327 * Returns true if vram is lost, false if not.
2328 */
2329 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2330 {
2331 if (memcmp(adev->gart.ptr, adev->reset_magic, 2332 AMDGPU_RESET_MAGIC_NUM))
2333 return true;
2334
2335 if (!amdgpu_in_reset(adev))
2336 return false;
2337
2338 /*
2339 * For all ASICs with baco/mode1 reset, the VRAM is
2340 * always assumed to be lost.
2341 */
2342 switch (amdgpu_asic_reset_method(adev)) {
2343 case AMD_RESET_METHOD_BACO:
2344 case AMD_RESET_METHOD_MODE1:
2345 return true;
2346 default:
2347 return false;
2348 }
2349 }
2350
2351 /**
2352 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2353 *
2354 * @adev: amdgpu_device pointer
2355 * @state: clockgating state (gate or ungate)
2356 *
2357 * The list of all the hardware IPs that make up the asic is walked and the
2358 * set_clockgating_state callbacks are run.
2359 * During late init this pass enables clockgating for the hardware IPs;
2360 * during fini or suspend it disables clockgating for the hardware IPs.
2361 * Returns 0 on success, negative error code on failure.
2362 */
2363
2364 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2365 enum amd_clockgating_state state)
2366 {
2367 int i, j, r;
2368
2369 if (amdgpu_emu_mode == 1)
2370 return 0;
2371
2372 for (j = 0; j < adev->num_ip_blocks; j++) {
2373 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2374 if (!adev->ip_blocks[i].status.late_initialized)
2375 continue;
2376 /* skip CG for VCE/UVD, it's handled specially */
2377 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2378 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2379 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2380 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2381 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2382 /* enable clockgating to save power */
2383 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2384 state);
2385 if (r) {
2386 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2387 adev->ip_blocks[i].version->funcs->name, r);
2388 return r;
2389 }
2390 }
2391 }
2392
2393 return 0;
2394 }
2395
2396 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2397 {
2398 int i, j, r;
2399
2400 if (amdgpu_emu_mode == 1)
2401 return 0;
2402
2403 for (j = 0; j < adev->num_ip_blocks; j++) {
2404 i = state == AMD_PG_STATE_GATE ?
j : adev->num_ip_blocks - j - 1;
2405 if (!adev->ip_blocks[i].status.late_initialized)
2406 continue;
2407 /* skip PG for VCE/UVD, it's handled specially */
2408 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2409 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2410 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2411 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2412 adev->ip_blocks[i].version->funcs->set_powergating_state) {
2413 /* enable powergating to save power */
2414 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2415 state);
2416 if (r) {
2417 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2418 adev->ip_blocks[i].version->funcs->name, r);
2419 return r;
2420 }
2421 }
2422 }
2423 return 0;
2424 }
2425
2426 static int amdgpu_device_enable_mgpu_fan_boost(void)
2427 {
2428 struct amdgpu_gpu_instance *gpu_ins;
2429 struct amdgpu_device *adev;
2430 int i, ret = 0;
2431
2432 mutex_lock(&mgpu_info.mutex);
2433
2434 /*
2435 * MGPU fan boost feature should be enabled
2436 * only when there are two or more dGPUs in
2437 * the system
2438 */
2439 if (mgpu_info.num_dgpu < 2)
2440 goto out;
2441
2442 for (i = 0; i < mgpu_info.num_dgpu; i++) {
2443 gpu_ins = &(mgpu_info.gpu_ins[i]);
2444 adev = gpu_ins->adev;
2445 if (!(adev->flags & AMD_IS_APU) && 2446 !gpu_ins->mgpu_fan_enabled) {
2447 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2448 if (ret)
2449 break;
2450
2451 gpu_ins->mgpu_fan_enabled = 1;
2452 }
2453 }
2454
2455 out:
2456 mutex_unlock(&mgpu_info.mutex);
2457
2458 return ret;
2459 }
2460
2461 /**
2462 * amdgpu_device_ip_late_init - run late init for hardware IPs
2463 *
2464 * @adev: amdgpu_device pointer
2465 *
2466 * Late initialization pass for hardware IPs. The list of all the hardware
2467 * IPs that make up the asic is walked and the late_init callbacks are run.
2468 * late_init covers any special initialization that an IP requires
2469 * after all of the IP blocks have been initialized or something that needs
2470 * to happen late in the init process.
2471 * Returns 0 on success, negative error code on failure.
2472 */
2473 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2474 {
2475 struct amdgpu_gpu_instance *gpu_instance;
2476 int i = 0, r;
2477
2478 for (i = 0; i < adev->num_ip_blocks; i++) {
2479 if (!adev->ip_blocks[i].status.hw)
2480 continue;
2481 if (adev->ip_blocks[i].version->funcs->late_init) {
2482 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2483 if (r) {
2484 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2485 adev->ip_blocks[i].version->funcs->name, r);
2486 return r;
2487 }
2488 }
2489 adev->ip_blocks[i].status.late_initialized = true;
2490 }
2491
2492 amdgpu_ras_set_error_query_ready(adev, true);
2493
2494 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2495 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2496
2497 amdgpu_device_fill_reset_magic(adev);
2498
2499 r = amdgpu_device_enable_mgpu_fan_boost();
2500 if (r)
2501 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2502
2503
2504 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2505 mutex_lock(&mgpu_info.mutex);
2506
2507 /*
2508 * Reset the device p-state to low, as it was booted with the high p-state.
2509 *
2510 * This should be performed only after all devices from the same
2511 * hive get initialized.
2512 *
2513 * However, the number of devices in a hive is not known in advance;
2514 * they are counted one by one as the devices initialize.
2515 * 2516 * So, we wait for all XGMI interlinked devices initialized. 2517 * This may bring some delays as those devices may come from 2518 * different hives. But that should be OK. 2519 */ 2520 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2521 for (i = 0; i < mgpu_info.num_gpu; i++) { 2522 gpu_instance = &(mgpu_info.gpu_ins[i]); 2523 if (gpu_instance->adev->flags & AMD_IS_APU) 2524 continue; 2525 2526 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2527 AMDGPU_XGMI_PSTATE_MIN); 2528 if (r) { 2529 DRM_ERROR("pstate setting failed (%d).\n", r); 2530 break; 2531 } 2532 } 2533 } 2534 2535 mutex_unlock(&mgpu_info.mutex); 2536 } 2537 2538 return 0; 2539 } 2540 2541 /** 2542 * amdgpu_device_ip_fini - run fini for hardware IPs 2543 * 2544 * @adev: amdgpu_device pointer 2545 * 2546 * Main teardown pass for hardware IPs. The list of all the hardware 2547 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2548 * are run. hw_fini tears down the hardware associated with each IP 2549 * and sw_fini tears down any software state associated with each IP. 2550 * Returns 0 on success, negative error code on failure. 2551 */ 2552 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2553 { 2554 int i, r; 2555 2556 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2557 amdgpu_virt_release_ras_err_handler_data(adev); 2558 2559 amdgpu_ras_pre_fini(adev); 2560 2561 if (adev->gmc.xgmi.num_physical_nodes > 1) 2562 amdgpu_xgmi_remove_device(adev); 2563 2564 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2565 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2566 2567 amdgpu_amdkfd_device_fini(adev); 2568 2569 /* need to disable SMC first */ 2570 for (i = 0; i < adev->num_ip_blocks; i++) { 2571 if (!adev->ip_blocks[i].status.hw) 2572 continue; 2573 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2574 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2575 /* XXX handle errors */ 2576 if (r) { 2577 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2578 adev->ip_blocks[i].version->funcs->name, r); 2579 } 2580 adev->ip_blocks[i].status.hw = false; 2581 break; 2582 } 2583 } 2584 2585 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2586 if (!adev->ip_blocks[i].status.hw) 2587 continue; 2588 2589 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2590 /* XXX handle errors */ 2591 if (r) { 2592 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2593 adev->ip_blocks[i].version->funcs->name, r); 2594 } 2595 2596 adev->ip_blocks[i].status.hw = false; 2597 } 2598 2599 2600 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2601 if (!adev->ip_blocks[i].status.sw) 2602 continue; 2603 2604 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2605 amdgpu_ucode_free_bo(adev); 2606 amdgpu_free_static_csa(&adev->virt.csa_obj); 2607 amdgpu_device_wb_fini(adev); 2608 amdgpu_device_vram_scratch_fini(adev); 2609 amdgpu_ib_pool_fini(adev); 2610 } 2611 2612 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2613 /* XXX handle errors */ 2614 if (r) { 2615 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2616 adev->ip_blocks[i].version->funcs->name, r); 2617 } 2618 adev->ip_blocks[i].status.sw = false; 2619 adev->ip_blocks[i].status.valid = false; 2620 } 2621 2622 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2623 if (!adev->ip_blocks[i].status.late_initialized) 2624 continue; 2625 if (adev->ip_blocks[i].version->funcs->late_fini) 2626 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2627 
adev->ip_blocks[i].status.late_initialized = false; 2628 } 2629 2630 amdgpu_ras_fini(adev); 2631 2632 if (amdgpu_sriov_vf(adev)) 2633 if (amdgpu_virt_release_full_gpu(adev, false)) 2634 DRM_ERROR("failed to release exclusive mode on fini\n"); 2635 2636 return 0; 2637 } 2638 2639 /** 2640 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2641 * 2642 * @work: work_struct. 2643 */ 2644 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2645 { 2646 struct amdgpu_device *adev = 2647 container_of(work, struct amdgpu_device, delayed_init_work.work); 2648 int r; 2649 2650 r = amdgpu_ib_ring_tests(adev); 2651 if (r) 2652 DRM_ERROR("ib ring test failed (%d).\n", r); 2653 } 2654 2655 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2656 { 2657 struct amdgpu_device *adev = 2658 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2659 2660 mutex_lock(&adev->gfx.gfx_off_mutex); 2661 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { 2662 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2663 adev->gfx.gfx_off_state = true; 2664 } 2665 mutex_unlock(&adev->gfx.gfx_off_mutex); 2666 } 2667 2668 /** 2669 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2670 * 2671 * @adev: amdgpu_device pointer 2672 * 2673 * Main suspend function for hardware IPs. The list of all the hardware 2674 * IPs that make up the asic is walked, clockgating is disabled and the 2675 * suspend callbacks are run. suspend puts the hardware and software state 2676 * in each IP into a state suitable for suspend. 2677 * Returns 0 on success, negative error code on failure. 2678 */ 2679 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2680 { 2681 int i, r; 2682 2683 if (adev->in_poweroff_reboot_com || 2684 !amdgpu_acpi_is_s0ix_supported(adev) || amdgpu_in_reset(adev)) { 2685 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2686 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2687 } 2688 2689 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2690 if (!adev->ip_blocks[i].status.valid) 2691 continue; 2692 2693 /* displays are handled separately */ 2694 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2695 continue; 2696 2697 /* XXX handle errors */ 2698 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2699 /* XXX handle errors */ 2700 if (r) { 2701 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2702 adev->ip_blocks[i].version->funcs->name, r); 2703 return r; 2704 } 2705 2706 adev->ip_blocks[i].status.hw = false; 2707 } 2708 2709 return 0; 2710 } 2711 2712 /** 2713 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2714 * 2715 * @adev: amdgpu_device pointer 2716 * 2717 * Main suspend function for hardware IPs. The list of all the hardware 2718 * IPs that make up the asic is walked, clockgating is disabled and the 2719 * suspend callbacks are run. suspend puts the hardware and software state 2720 * in each IP into a state suitable for suspend. 2721 * Returns 0 on success, negative error code on failure. 
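 *
 * Display (DCE) blocks are suspended in phase 1; this phase suspends all
 * other blocks and, on bare metal, also moves the SMC into the requested
 * mp1 state.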
2722 */ 2723 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2724 { 2725 int i, r; 2726 2727 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2728 if (!adev->ip_blocks[i].status.valid) 2729 continue; 2730 /* displays are handled in phase1 */ 2731 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2732 continue; 2733 /* PSP lost connection when err_event_athub occurs */ 2734 if (amdgpu_ras_intr_triggered() && 2735 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2736 adev->ip_blocks[i].status.hw = false; 2737 continue; 2738 } 2739 /* XXX handle errors */ 2740 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2741 /* XXX handle errors */ 2742 if (r) { 2743 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2744 adev->ip_blocks[i].version->funcs->name, r); 2745 } 2746 adev->ip_blocks[i].status.hw = false; 2747 /* handle putting the SMC in the appropriate state */ 2748 if(!amdgpu_sriov_vf(adev)){ 2749 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2750 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 2751 if (r) { 2752 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 2753 adev->mp1_state, r); 2754 return r; 2755 } 2756 } 2757 } 2758 adev->ip_blocks[i].status.hw = false; 2759 } 2760 2761 return 0; 2762 } 2763 2764 /** 2765 * amdgpu_device_ip_suspend - run suspend for hardware IPs 2766 * 2767 * @adev: amdgpu_device pointer 2768 * 2769 * Main suspend function for hardware IPs. The list of all the hardware 2770 * IPs that make up the asic is walked, clockgating is disabled and the 2771 * suspend callbacks are run. suspend puts the hardware and software state 2772 * in each IP into a state suitable for suspend. 2773 * Returns 0 on success, negative error code on failure. 2774 */ 2775 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 2776 { 2777 int r; 2778 2779 if (amdgpu_sriov_vf(adev)) 2780 amdgpu_virt_request_full_gpu(adev, false); 2781 2782 r = amdgpu_device_ip_suspend_phase1(adev); 2783 if (r) 2784 return r; 2785 r = amdgpu_device_ip_suspend_phase2(adev); 2786 2787 if (amdgpu_sriov_vf(adev)) 2788 amdgpu_virt_release_full_gpu(adev, false); 2789 2790 return r; 2791 } 2792 2793 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 2794 { 2795 int i, r; 2796 2797 static enum amd_ip_block_type ip_order[] = { 2798 AMD_IP_BLOCK_TYPE_GMC, 2799 AMD_IP_BLOCK_TYPE_COMMON, 2800 AMD_IP_BLOCK_TYPE_PSP, 2801 AMD_IP_BLOCK_TYPE_IH, 2802 }; 2803 2804 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2805 int j; 2806 struct amdgpu_ip_block *block; 2807 2808 block = &adev->ip_blocks[i]; 2809 block->status.hw = false; 2810 2811 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 2812 2813 if (block->version->type != ip_order[j] || 2814 !block->status.valid) 2815 continue; 2816 2817 r = block->version->funcs->hw_init(adev); 2818 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2819 if (r) 2820 return r; 2821 block->status.hw = true; 2822 } 2823 } 2824 2825 return 0; 2826 } 2827 2828 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 2829 { 2830 int i, r; 2831 2832 static enum amd_ip_block_type ip_order[] = { 2833 AMD_IP_BLOCK_TYPE_SMC, 2834 AMD_IP_BLOCK_TYPE_DCE, 2835 AMD_IP_BLOCK_TYPE_GFX, 2836 AMD_IP_BLOCK_TYPE_SDMA, 2837 AMD_IP_BLOCK_TYPE_UVD, 2838 AMD_IP_BLOCK_TYPE_VCE, 2839 AMD_IP_BLOCK_TYPE_VCN 2840 }; 2841 2842 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2843 int j; 2844 struct amdgpu_ip_block *block; 2845 2846 for (j = 0; j < adev->num_ip_blocks; j++) { 
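/* bring up each matching block that is valid and not already running; the SMC is resumed rather than re-initialized */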
2847 block = &adev->ip_blocks[j]; 2848 2849 if (block->version->type != ip_order[i] || 2850 !block->status.valid || 2851 block->status.hw) 2852 continue; 2853 2854 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 2855 r = block->version->funcs->resume(adev); 2856 else 2857 r = block->version->funcs->hw_init(adev); 2858 2859 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2860 if (r) 2861 return r; 2862 block->status.hw = true; 2863 } 2864 } 2865 2866 return 0; 2867 } 2868 2869 /** 2870 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 2871 * 2872 * @adev: amdgpu_device pointer 2873 * 2874 * First resume function for hardware IPs. The list of all the hardware 2875 * IPs that make up the asic is walked and the resume callbacks are run for 2876 * COMMON, GMC, and IH. resume puts the hardware into a functional state 2877 * after a suspend and updates the software state as necessary. This 2878 * function is also used for restoring the GPU after a GPU reset. 2879 * Returns 0 on success, negative error code on failure. 2880 */ 2881 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 2882 { 2883 int i, r; 2884 2885 for (i = 0; i < adev->num_ip_blocks; i++) { 2886 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2887 continue; 2888 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2889 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2890 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2891 2892 r = adev->ip_blocks[i].version->funcs->resume(adev); 2893 if (r) { 2894 DRM_ERROR("resume of IP block <%s> failed %d\n", 2895 adev->ip_blocks[i].version->funcs->name, r); 2896 return r; 2897 } 2898 adev->ip_blocks[i].status.hw = true; 2899 } 2900 } 2901 2902 return 0; 2903 } 2904 2905 /** 2906 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 2907 * 2908 * @adev: amdgpu_device pointer 2909 * 2910 * First resume function for hardware IPs. The list of all the hardware 2911 * IPs that make up the asic is walked and the resume callbacks are run for 2912 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 2913 * functional state after a suspend and updates the software state as 2914 * necessary. This function is also used for restoring the GPU after a GPU 2915 * reset. 2916 * Returns 0 on success, negative error code on failure. 2917 */ 2918 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 2919 { 2920 int i, r; 2921 2922 for (i = 0; i < adev->num_ip_blocks; i++) { 2923 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2924 continue; 2925 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2926 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2927 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 2928 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 2929 continue; 2930 r = adev->ip_blocks[i].version->funcs->resume(adev); 2931 if (r) { 2932 DRM_ERROR("resume of IP block <%s> failed %d\n", 2933 adev->ip_blocks[i].version->funcs->name, r); 2934 return r; 2935 } 2936 adev->ip_blocks[i].status.hw = true; 2937 } 2938 2939 return 0; 2940 } 2941 2942 /** 2943 * amdgpu_device_ip_resume - run resume for hardware IPs 2944 * 2945 * @adev: amdgpu_device pointer 2946 * 2947 * Main resume function for hardware IPs. 
The hardware IPs
2948 * are split into two resume functions because they are
2949 * also used in recovering from a GPU reset and some additional
2950 * steps need to be taken between them. In this case (S3/S4) they are
2951 * run sequentially.
2952 * Returns 0 on success, negative error code on failure.
2953 */
2954 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
2955 {
2956 int r;
2957
2958 r = amdgpu_device_ip_resume_phase1(adev);
2959 if (r)
2960 return r;
2961
2962 r = amdgpu_device_fw_loading(adev);
2963 if (r)
2964 return r;
2965
2966 r = amdgpu_device_ip_resume_phase2(adev);
2967
2968 return r;
2969 }
2970
2971 /**
2972 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2973 *
2974 * @adev: amdgpu_device pointer
2975 *
2976 * Query the VBIOS data tables to determine if the board supports SR-IOV.
2977 */
2978 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
2979 {
2980 if (amdgpu_sriov_vf(adev)) {
2981 if (adev->is_atom_fw) {
2982 if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2983 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2984 } else {
2985 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2986 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2987 }
2988
2989 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2990 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
2991 }
2992 }
2993
2994 /**
2995 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2996 *
2997 * @asic_type: AMD asic type
2998 *
2999 * Check if there is DC (new modesetting infrastructure) support for an asic.
3000 * Returns true if DC has support, false if not.
3001 */
3002 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3003 {
3004 switch (asic_type) {
3005 #if defined(CONFIG_DRM_AMD_DC)
3006 #if defined(CONFIG_DRM_AMD_DC_SI)
3007 case CHIP_TAHITI:
3008 case CHIP_PITCAIRN:
3009 case CHIP_VERDE:
3010 case CHIP_OLAND:
3011 #endif
3012 case CHIP_BONAIRE:
3013 case CHIP_KAVERI:
3014 case CHIP_KABINI:
3015 case CHIP_MULLINS:
3016 /*
3017 * We have systems in the wild with these ASICs that require
3018 * LVDS and VGA support which is not supported with DC.
3019 *
3020 * Fall back to the non-DC driver here by default so as not to
3021 * cause regressions.
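 *
 * (DC is used on these parts only when it has been explicitly requested,
 * e.g. with amdgpu.dc=1 on the kernel command line; the auto default
 * keeps them on the legacy display path.)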
3022 */ 3023 return amdgpu_dc > 0; 3024 case CHIP_HAWAII: 3025 case CHIP_CARRIZO: 3026 case CHIP_STONEY: 3027 case CHIP_POLARIS10: 3028 case CHIP_POLARIS11: 3029 case CHIP_POLARIS12: 3030 case CHIP_VEGAM: 3031 case CHIP_TONGA: 3032 case CHIP_FIJI: 3033 case CHIP_VEGA10: 3034 case CHIP_VEGA12: 3035 case CHIP_VEGA20: 3036 #if defined(CONFIG_DRM_AMD_DC_DCN) 3037 case CHIP_RAVEN: 3038 case CHIP_NAVI10: 3039 case CHIP_NAVI14: 3040 case CHIP_NAVI12: 3041 case CHIP_RENOIR: 3042 case CHIP_SIENNA_CICHLID: 3043 case CHIP_NAVY_FLOUNDER: 3044 case CHIP_DIMGREY_CAVEFISH: 3045 case CHIP_VANGOGH: 3046 #endif 3047 return amdgpu_dc != 0; 3048 #endif 3049 default: 3050 if (amdgpu_dc > 0) 3051 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3052 "but isn't supported by ASIC, ignoring\n"); 3053 return false; 3054 } 3055 } 3056 3057 /** 3058 * amdgpu_device_has_dc_support - check if dc is supported 3059 * 3060 * @adev: amdgpu_device pointer 3061 * 3062 * Returns true for supported, false for not supported 3063 */ 3064 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3065 { 3066 if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display) 3067 return false; 3068 3069 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3070 } 3071 3072 3073 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3074 { 3075 struct amdgpu_device *adev = 3076 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3077 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3078 3079 /* It's a bug to not have a hive within this function */ 3080 if (WARN_ON(!hive)) 3081 return; 3082 3083 /* 3084 * Use task barrier to synchronize all xgmi reset works across the 3085 * hive. task_barrier_enter and task_barrier_exit will block 3086 * until all the threads running the xgmi reset works reach 3087 * those points. task_barrier_full will do both blocks. 3088 */ 3089 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3090 3091 task_barrier_enter(&hive->tb); 3092 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3093 3094 if (adev->asic_reset_res) 3095 goto fail; 3096 3097 task_barrier_exit(&hive->tb); 3098 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3099 3100 if (adev->asic_reset_res) 3101 goto fail; 3102 3103 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count) 3104 adev->mmhub.funcs->reset_ras_error_count(adev); 3105 } else { 3106 3107 task_barrier_full(&hive->tb); 3108 adev->asic_reset_res = amdgpu_asic_reset(adev); 3109 } 3110 3111 fail: 3112 if (adev->asic_reset_res) 3113 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3114 adev->asic_reset_res, adev_to_drm(adev)->unique); 3115 amdgpu_put_xgmi_hive(hive); 3116 } 3117 3118 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3119 { 3120 char *input = amdgpu_lockup_timeout; 3121 char *timeout_setting = NULL; 3122 int index = 0; 3123 long timeout; 3124 int ret = 0; 3125 3126 /* 3127 * By default timeout for non compute jobs is 10000. 3128 * And there is no timeout enforced on compute jobs. 3129 * In SR-IOV or passthrough mode, timeout for compute 3130 * jobs are 60000 by default. 3131 */ 3132 adev->gfx_timeout = msecs_to_jiffies(10000); 3133 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3134 if (amdgpu_sriov_vf(adev)) 3135 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3136 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3137 else if (amdgpu_passthrough(adev)) 3138 adev->compute_timeout = msecs_to_jiffies(60000); 3139 else 3140 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; 3141 3142 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3143 while ((timeout_setting = strsep(&input, ",")) && 3144 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3145 ret = kstrtol(timeout_setting, 0, &timeout); 3146 if (ret) 3147 return ret; 3148 3149 if (timeout == 0) { 3150 index++; 3151 continue; 3152 } else if (timeout < 0) { 3153 timeout = MAX_SCHEDULE_TIMEOUT; 3154 } else { 3155 timeout = msecs_to_jiffies(timeout); 3156 } 3157 3158 switch (index++) { 3159 case 0: 3160 adev->gfx_timeout = timeout; 3161 break; 3162 case 1: 3163 adev->compute_timeout = timeout; 3164 break; 3165 case 2: 3166 adev->sdma_timeout = timeout; 3167 break; 3168 case 3: 3169 adev->video_timeout = timeout; 3170 break; 3171 default: 3172 break; 3173 } 3174 } 3175 /* 3176 * There is only one value specified and 3177 * it should apply to all non-compute jobs. 3178 */ 3179 if (index == 1) { 3180 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3181 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3182 adev->compute_timeout = adev->gfx_timeout; 3183 } 3184 } 3185 3186 return ret; 3187 } 3188 3189 static const struct attribute *amdgpu_dev_attributes[] = { 3190 &dev_attr_product_name.attr, 3191 &dev_attr_product_number.attr, 3192 &dev_attr_serial_number.attr, 3193 &dev_attr_pcie_replay_count.attr, 3194 NULL 3195 }; 3196 3197 3198 /** 3199 * amdgpu_device_init - initialize the driver 3200 * 3201 * @adev: amdgpu_device pointer 3202 * @flags: driver flags 3203 * 3204 * Initializes the driver info and hw (all asics). 3205 * Returns 0 for success or an error on failure. 3206 * Called at driver startup. 
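 *
 * It maps the MMIO registers, sets up the locks and work items, runs early
 * IP init, posts the card and reads the clock info from the vbios if needed,
 * brings up the fence driver and all IP blocks, and schedules the delayed
 * init work that runs the IB ring tests.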
3207 */ 3208 int amdgpu_device_init(struct amdgpu_device *adev, 3209 uint32_t flags) 3210 { 3211 struct drm_device *ddev = adev_to_drm(adev); 3212 struct pci_dev *pdev = adev->pdev; 3213 int r, i; 3214 bool atpx = false; 3215 u32 max_MBps; 3216 3217 adev->shutdown = false; 3218 adev->flags = flags; 3219 3220 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3221 adev->asic_type = amdgpu_force_asic_type; 3222 else 3223 adev->asic_type = flags & AMD_ASIC_MASK; 3224 3225 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3226 if (amdgpu_emu_mode == 1) 3227 adev->usec_timeout *= 10; 3228 adev->gmc.gart_size = 512 * 1024 * 1024; 3229 adev->accel_working = false; 3230 adev->num_rings = 0; 3231 adev->mman.buffer_funcs = NULL; 3232 adev->mman.buffer_funcs_ring = NULL; 3233 adev->vm_manager.vm_pte_funcs = NULL; 3234 adev->vm_manager.vm_pte_num_scheds = 0; 3235 adev->gmc.gmc_funcs = NULL; 3236 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3237 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3238 3239 adev->smc_rreg = &amdgpu_invalid_rreg; 3240 adev->smc_wreg = &amdgpu_invalid_wreg; 3241 adev->pcie_rreg = &amdgpu_invalid_rreg; 3242 adev->pcie_wreg = &amdgpu_invalid_wreg; 3243 adev->pciep_rreg = &amdgpu_invalid_rreg; 3244 adev->pciep_wreg = &amdgpu_invalid_wreg; 3245 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3246 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3247 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3248 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3249 adev->didt_rreg = &amdgpu_invalid_rreg; 3250 adev->didt_wreg = &amdgpu_invalid_wreg; 3251 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3252 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3253 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3254 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3255 3256 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3257 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3258 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3259 3260 /* mutex initialization are all done here so we 3261 * can recall function without having locking issues */ 3262 atomic_set(&adev->irq.ih.lock, 0); 3263 mutex_init(&adev->firmware.mutex); 3264 mutex_init(&adev->pm.mutex); 3265 mutex_init(&adev->gfx.gpu_clock_mutex); 3266 mutex_init(&adev->srbm_mutex); 3267 mutex_init(&adev->gfx.pipe_reserve_mutex); 3268 mutex_init(&adev->gfx.gfx_off_mutex); 3269 mutex_init(&adev->grbm_idx_mutex); 3270 mutex_init(&adev->mn_lock); 3271 mutex_init(&adev->virt.vf_errors.lock); 3272 hash_init(adev->mn_hash); 3273 atomic_set(&adev->in_gpu_reset, 0); 3274 init_rwsem(&adev->reset_sem); 3275 mutex_init(&adev->psp.mutex); 3276 mutex_init(&adev->notifier_lock); 3277 3278 r = amdgpu_device_check_arguments(adev); 3279 if (r) 3280 return r; 3281 3282 spin_lock_init(&adev->mmio_idx_lock); 3283 spin_lock_init(&adev->smc_idx_lock); 3284 spin_lock_init(&adev->pcie_idx_lock); 3285 spin_lock_init(&adev->uvd_ctx_idx_lock); 3286 spin_lock_init(&adev->didt_idx_lock); 3287 spin_lock_init(&adev->gc_cac_idx_lock); 3288 spin_lock_init(&adev->se_cac_idx_lock); 3289 spin_lock_init(&adev->audio_endpt_idx_lock); 3290 spin_lock_init(&adev->mm_stats.lock); 3291 3292 INIT_LIST_HEAD(&adev->shadow_list); 3293 mutex_init(&adev->shadow_list_lock); 3294 3295 INIT_DELAYED_WORK(&adev->delayed_init_work, 3296 amdgpu_device_delayed_init_work_handler); 3297 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3298 amdgpu_device_delay_enable_gfx_off); 3299 3300 
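/* xgmi_reset_work is used by amdgpu_device_xgmi_reset_func to run asic resets in lock-step across all devices in an XGMI hive */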
INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3301 3302 adev->gfx.gfx_off_req_count = 1; 3303 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3304 3305 atomic_set(&adev->throttling_logging_enabled, 1); 3306 /* 3307 * If throttling continues, logging will be performed every minute 3308 * to avoid log flooding. "-1" is subtracted since the thermal 3309 * throttling interrupt comes every second. Thus, the total logging 3310 * interval is 59 seconds(retelimited printk interval) + 1(waiting 3311 * for throttling interrupt) = 60 seconds. 3312 */ 3313 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3314 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3315 3316 /* Registers mapping */ 3317 /* TODO: block userspace mapping of io register */ 3318 if (adev->asic_type >= CHIP_BONAIRE) { 3319 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3320 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3321 } else { 3322 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3323 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3324 } 3325 3326 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3327 if (adev->rmmio == NULL) { 3328 return -ENOMEM; 3329 } 3330 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3331 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3332 3333 /* io port mapping */ 3334 for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { 3335 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) { 3336 adev->rio_mem_size = pci_resource_len(adev->pdev, i); 3337 adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size); 3338 break; 3339 } 3340 } 3341 if (adev->rio_mem == NULL) 3342 DRM_INFO("PCI I/O BAR is not found.\n"); 3343 3344 /* enable PCIE atomic ops */ 3345 r = pci_enable_atomic_ops_to_root(adev->pdev, 3346 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3347 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3348 if (r) { 3349 adev->have_atomics_support = false; 3350 DRM_INFO("PCIE atomic ops is not supported\n"); 3351 } else { 3352 adev->have_atomics_support = true; 3353 } 3354 3355 amdgpu_device_get_pcie_info(adev); 3356 3357 if (amdgpu_mcbp) 3358 DRM_INFO("MCBP is enabled\n"); 3359 3360 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) 3361 adev->enable_mes = true; 3362 3363 /* detect hw virtualization here */ 3364 amdgpu_detect_virtualization(adev); 3365 3366 r = amdgpu_device_get_job_timeout_settings(adev); 3367 if (r) { 3368 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3369 goto failed_unmap; 3370 } 3371 3372 /* early init functions */ 3373 r = amdgpu_device_ip_early_init(adev); 3374 if (r) 3375 goto failed_unmap; 3376 3377 /* doorbell bar mapping and doorbell index init*/ 3378 amdgpu_device_doorbell_init(adev); 3379 3380 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3381 /* this will fail for cards that aren't VGA class devices, just 3382 * ignore it */ 3383 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3384 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode); 3385 3386 if (amdgpu_device_supports_atpx(ddev)) 3387 atpx = true; 3388 if (amdgpu_has_atpx() && 3389 (amdgpu_is_atpx_hybrid() || 3390 amdgpu_has_atpx_dgpu_power_cntl()) && 3391 !pci_is_thunderbolt_attached(adev->pdev)) 3392 vga_switcheroo_register_client(adev->pdev, 3393 &amdgpu_switcheroo_ops, atpx); 3394 if (atpx) 3395 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3396 3397 if (amdgpu_emu_mode == 1) { 3398 /* post the asic on emulation 
mode */ 3399 emu_soc_asic_init(adev); 3400 goto fence_driver_init; 3401 } 3402 3403 /* detect if we are with an SRIOV vbios */ 3404 amdgpu_device_detect_sriov_bios(adev); 3405 3406 /* check if we need to reset the asic 3407 * E.g., driver was not cleanly unloaded previously, etc. 3408 */ 3409 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3410 r = amdgpu_asic_reset(adev); 3411 if (r) { 3412 dev_err(adev->dev, "asic reset on init failed\n"); 3413 goto failed; 3414 } 3415 } 3416 3417 pci_enable_pcie_error_reporting(adev->pdev); 3418 3419 /* Post card if necessary */ 3420 if (amdgpu_device_need_post(adev)) { 3421 if (!adev->bios) { 3422 dev_err(adev->dev, "no vBIOS found\n"); 3423 r = -EINVAL; 3424 goto failed; 3425 } 3426 DRM_INFO("GPU posting now...\n"); 3427 r = amdgpu_device_asic_init(adev); 3428 if (r) { 3429 dev_err(adev->dev, "gpu post error!\n"); 3430 goto failed; 3431 } 3432 } 3433 3434 if (adev->is_atom_fw) { 3435 /* Initialize clocks */ 3436 r = amdgpu_atomfirmware_get_clock_info(adev); 3437 if (r) { 3438 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3439 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3440 goto failed; 3441 } 3442 } else { 3443 /* Initialize clocks */ 3444 r = amdgpu_atombios_get_clock_info(adev); 3445 if (r) { 3446 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3447 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3448 goto failed; 3449 } 3450 /* init i2c buses */ 3451 if (!amdgpu_device_has_dc_support(adev)) 3452 amdgpu_atombios_i2c_init(adev); 3453 } 3454 3455 fence_driver_init: 3456 /* Fence driver */ 3457 r = amdgpu_fence_driver_init(adev); 3458 if (r) { 3459 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n"); 3460 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3461 goto failed; 3462 } 3463 3464 /* init the mode config */ 3465 drm_mode_config_init(adev_to_drm(adev)); 3466 3467 r = amdgpu_device_ip_init(adev); 3468 if (r) { 3469 /* failed in exclusive mode due to timeout */ 3470 if (amdgpu_sriov_vf(adev) && 3471 !amdgpu_sriov_runtime(adev) && 3472 amdgpu_virt_mmio_blocked(adev) && 3473 !amdgpu_virt_wait_reset(adev)) { 3474 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3475 /* Don't send request since VF is inactive. */ 3476 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3477 adev->virt.ops = NULL; 3478 r = -EAGAIN; 3479 goto failed; 3480 } 3481 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3482 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3483 goto failed; 3484 } 3485 3486 dev_info(adev->dev, 3487 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3488 adev->gfx.config.max_shader_engines, 3489 adev->gfx.config.max_sh_per_se, 3490 adev->gfx.config.max_cu_per_sh, 3491 adev->gfx.cu_info.number); 3492 3493 adev->accel_working = true; 3494 3495 amdgpu_vm_check_compute_bug(adev); 3496 3497 /* Initialize the buffer migration limit. */ 3498 if (amdgpu_moverate >= 0) 3499 max_MBps = amdgpu_moverate; 3500 else 3501 max_MBps = 8; /* Allow 8 MB/s. */ 3502 /* Get a log2 for easy divisions. 
*/ 3503 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3504 3505 amdgpu_fbdev_init(adev); 3506 3507 r = amdgpu_pm_sysfs_init(adev); 3508 if (r) { 3509 adev->pm_sysfs_en = false; 3510 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3511 } else 3512 adev->pm_sysfs_en = true; 3513 3514 r = amdgpu_ucode_sysfs_init(adev); 3515 if (r) { 3516 adev->ucode_sysfs_en = false; 3517 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3518 } else 3519 adev->ucode_sysfs_en = true; 3520 3521 if ((amdgpu_testing & 1)) { 3522 if (adev->accel_working) 3523 amdgpu_test_moves(adev); 3524 else 3525 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n"); 3526 } 3527 if (amdgpu_benchmarking) { 3528 if (adev->accel_working) 3529 amdgpu_benchmark(adev, amdgpu_benchmarking); 3530 else 3531 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); 3532 } 3533 3534 /* 3535 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3536 * Otherwise the mgpu fan boost feature will be skipped due to the 3537 * gpu instance is counted less. 3538 */ 3539 amdgpu_register_gpu_instance(adev); 3540 3541 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3542 * explicit gating rather than handling it automatically. 3543 */ 3544 r = amdgpu_device_ip_late_init(adev); 3545 if (r) { 3546 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3547 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3548 goto failed; 3549 } 3550 3551 /* must succeed. */ 3552 amdgpu_ras_resume(adev); 3553 3554 queue_delayed_work(system_wq, &adev->delayed_init_work, 3555 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3556 3557 if (amdgpu_sriov_vf(adev)) 3558 flush_delayed_work(&adev->delayed_init_work); 3559 3560 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3561 if (r) 3562 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3563 3564 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3565 r = amdgpu_pmu_init(adev); 3566 if (r) 3567 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3568 3569 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3570 if (amdgpu_device_cache_pci_state(adev->pdev)) 3571 pci_restore_state(pdev); 3572 3573 return 0; 3574 3575 failed: 3576 amdgpu_vf_error_trans_all(adev); 3577 if (atpx) 3578 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3579 3580 failed_unmap: 3581 iounmap(adev->rmmio); 3582 adev->rmmio = NULL; 3583 3584 return r; 3585 } 3586 3587 /** 3588 * amdgpu_device_fini - tear down the driver 3589 * 3590 * @adev: amdgpu_device pointer 3591 * 3592 * Tear down the driver info (all asics). 3593 * Called at driver shutdown. 
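 *
 * This disables interrupts, shuts the displays down, tears down the fence
 * driver and all IP blocks, releases the gpu_info firmware and vbios, and
 * unmaps the MMIO and doorbell BARs.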
3594 */ 3595 void amdgpu_device_fini(struct amdgpu_device *adev) 3596 { 3597 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3598 flush_delayed_work(&adev->delayed_init_work); 3599 adev->shutdown = true; 3600 3601 kfree(adev->pci_state); 3602 3603 /* make sure IB test finished before entering exclusive mode 3604 * to avoid preemption on IB test 3605 * */ 3606 if (amdgpu_sriov_vf(adev)) { 3607 amdgpu_virt_request_full_gpu(adev, false); 3608 amdgpu_virt_fini_data_exchange(adev); 3609 } 3610 3611 /* disable all interrupts */ 3612 amdgpu_irq_disable_all(adev); 3613 if (adev->mode_info.mode_config_initialized){ 3614 if (!amdgpu_device_has_dc_support(adev)) 3615 drm_helper_force_disable_all(adev_to_drm(adev)); 3616 else 3617 drm_atomic_helper_shutdown(adev_to_drm(adev)); 3618 } 3619 amdgpu_fence_driver_fini(adev); 3620 if (adev->pm_sysfs_en) 3621 amdgpu_pm_sysfs_fini(adev); 3622 amdgpu_fbdev_fini(adev); 3623 amdgpu_device_ip_fini(adev); 3624 release_firmware(adev->firmware.gpu_info_fw); 3625 adev->firmware.gpu_info_fw = NULL; 3626 adev->accel_working = false; 3627 /* free i2c buses */ 3628 if (!amdgpu_device_has_dc_support(adev)) 3629 amdgpu_i2c_fini(adev); 3630 3631 if (amdgpu_emu_mode != 1) 3632 amdgpu_atombios_fini(adev); 3633 3634 kfree(adev->bios); 3635 adev->bios = NULL; 3636 if (amdgpu_has_atpx() && 3637 (amdgpu_is_atpx_hybrid() || 3638 amdgpu_has_atpx_dgpu_power_cntl()) && 3639 !pci_is_thunderbolt_attached(adev->pdev)) 3640 vga_switcheroo_unregister_client(adev->pdev); 3641 if (amdgpu_device_supports_atpx(adev_to_drm(adev))) 3642 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3643 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3644 vga_client_register(adev->pdev, NULL, NULL, NULL); 3645 if (adev->rio_mem) 3646 pci_iounmap(adev->pdev, adev->rio_mem); 3647 adev->rio_mem = NULL; 3648 iounmap(adev->rmmio); 3649 adev->rmmio = NULL; 3650 amdgpu_device_doorbell_fini(adev); 3651 3652 if (adev->ucode_sysfs_en) 3653 amdgpu_ucode_sysfs_fini(adev); 3654 3655 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 3656 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3657 amdgpu_pmu_fini(adev); 3658 if (adev->mman.discovery_bin) 3659 amdgpu_discovery_fini(adev); 3660 } 3661 3662 3663 /* 3664 * Suspend & resume. 3665 */ 3666 /** 3667 * amdgpu_device_suspend - initiate device suspend 3668 * 3669 * @dev: drm dev pointer 3670 * @fbcon : notify the fbdev of suspend 3671 * 3672 * Puts the hw in the suspend state (all asics). 3673 * Returns 0 for success or an error on failure. 3674 * Called at driver suspend. 
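 *
 * For non-DC asics the display hardware is turned off and the framebuffers
 * and cursors are unpinned; VRAM is then evicted (twice, the second pass
 * moves the GART page table with the CPU) around the IP suspend phases.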
3675 */ 3676 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 3677 { 3678 struct amdgpu_device *adev; 3679 struct drm_crtc *crtc; 3680 struct drm_connector *connector; 3681 struct drm_connector_list_iter iter; 3682 int r; 3683 3684 adev = drm_to_adev(dev); 3685 3686 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3687 return 0; 3688 3689 adev->in_suspend = true; 3690 drm_kms_helper_poll_disable(dev); 3691 3692 if (fbcon) 3693 amdgpu_fbdev_set_suspend(adev, 1); 3694 3695 cancel_delayed_work_sync(&adev->delayed_init_work); 3696 3697 if (!amdgpu_device_has_dc_support(adev)) { 3698 /* turn off display hw */ 3699 drm_modeset_lock_all(dev); 3700 drm_connector_list_iter_begin(dev, &iter); 3701 drm_for_each_connector_iter(connector, &iter) 3702 drm_helper_connector_dpms(connector, 3703 DRM_MODE_DPMS_OFF); 3704 drm_connector_list_iter_end(&iter); 3705 drm_modeset_unlock_all(dev); 3706 /* unpin the front buffers and cursors */ 3707 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3708 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3709 struct drm_framebuffer *fb = crtc->primary->fb; 3710 struct amdgpu_bo *robj; 3711 3712 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3713 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3714 r = amdgpu_bo_reserve(aobj, true); 3715 if (r == 0) { 3716 amdgpu_bo_unpin(aobj); 3717 amdgpu_bo_unreserve(aobj); 3718 } 3719 } 3720 3721 if (fb == NULL || fb->obj[0] == NULL) { 3722 continue; 3723 } 3724 robj = gem_to_amdgpu_bo(fb->obj[0]); 3725 /* don't unpin kernel fb objects */ 3726 if (!amdgpu_fbdev_robj_is_fb(adev, robj)) { 3727 r = amdgpu_bo_reserve(robj, true); 3728 if (r == 0) { 3729 amdgpu_bo_unpin(robj); 3730 amdgpu_bo_unreserve(robj); 3731 } 3732 } 3733 } 3734 } 3735 3736 amdgpu_ras_suspend(adev); 3737 3738 r = amdgpu_device_ip_suspend_phase1(adev); 3739 3740 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 3741 3742 /* evict vram memory */ 3743 amdgpu_bo_evict_vram(adev); 3744 3745 amdgpu_fence_driver_suspend(adev); 3746 3747 if (adev->in_poweroff_reboot_com || 3748 !amdgpu_acpi_is_s0ix_supported(adev) || amdgpu_in_reset(adev)) 3749 r = amdgpu_device_ip_suspend_phase2(adev); 3750 else 3751 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry); 3752 /* evict remaining vram memory 3753 * This second call to evict vram is to evict the gart page table 3754 * using the CPU. 3755 */ 3756 amdgpu_bo_evict_vram(adev); 3757 3758 return 0; 3759 } 3760 3761 /** 3762 * amdgpu_device_resume - initiate device resume 3763 * 3764 * @dev: drm dev pointer 3765 * @fbcon : notify the fbdev of resume 3766 * 3767 * Bring the hw back to operating state (all asics). 3768 * Returns 0 for success or an error on failure. 3769 * Called at driver resume. 
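 *
 * This re-posts the card if necessary, resumes the IP blocks and the fence
 * driver, re-pins the cursors for non-DC asics, restores the display state
 * and re-enables hotplug detection and RAS.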
3770 */ 3771 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 3772 { 3773 struct drm_connector *connector; 3774 struct drm_connector_list_iter iter; 3775 struct amdgpu_device *adev = drm_to_adev(dev); 3776 struct drm_crtc *crtc; 3777 int r = 0; 3778 3779 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3780 return 0; 3781 3782 if (amdgpu_acpi_is_s0ix_supported(adev)) 3783 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D0Entry); 3784 3785 /* post card */ 3786 if (amdgpu_device_need_post(adev)) { 3787 r = amdgpu_device_asic_init(adev); 3788 if (r) 3789 dev_err(adev->dev, "amdgpu asic init failed\n"); 3790 } 3791 3792 r = amdgpu_device_ip_resume(adev); 3793 if (r) { 3794 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 3795 return r; 3796 } 3797 amdgpu_fence_driver_resume(adev); 3798 3799 3800 r = amdgpu_device_ip_late_init(adev); 3801 if (r) 3802 return r; 3803 3804 queue_delayed_work(system_wq, &adev->delayed_init_work, 3805 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3806 3807 if (!amdgpu_device_has_dc_support(adev)) { 3808 /* pin cursors */ 3809 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3810 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3811 3812 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3813 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3814 r = amdgpu_bo_reserve(aobj, true); 3815 if (r == 0) { 3816 r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM); 3817 if (r != 0) 3818 dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r); 3819 amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj); 3820 amdgpu_bo_unreserve(aobj); 3821 } 3822 } 3823 } 3824 } 3825 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 3826 if (r) 3827 return r; 3828 3829 /* Make sure IB tests flushed */ 3830 flush_delayed_work(&adev->delayed_init_work); 3831 3832 /* blat the mode back in */ 3833 if (fbcon) { 3834 if (!amdgpu_device_has_dc_support(adev)) { 3835 /* pre DCE11 */ 3836 drm_helper_resume_force_mode(dev); 3837 3838 /* turn on display hw */ 3839 drm_modeset_lock_all(dev); 3840 3841 drm_connector_list_iter_begin(dev, &iter); 3842 drm_for_each_connector_iter(connector, &iter) 3843 drm_helper_connector_dpms(connector, 3844 DRM_MODE_DPMS_ON); 3845 drm_connector_list_iter_end(&iter); 3846 3847 drm_modeset_unlock_all(dev); 3848 } 3849 amdgpu_fbdev_set_suspend(adev, 0); 3850 } 3851 3852 drm_kms_helper_poll_enable(dev); 3853 3854 amdgpu_ras_resume(adev); 3855 3856 /* 3857 * Most of the connector probing functions try to acquire runtime pm 3858 * refs to ensure that the GPU is powered on when connector polling is 3859 * performed. Since we're calling this from a runtime PM callback, 3860 * trying to acquire rpm refs will cause us to deadlock. 3861 * 3862 * Since we're guaranteed to be holding the rpm lock, it's safe to 3863 * temporarily disable the rpm helpers so this doesn't deadlock us. 3864 */ 3865 #ifdef CONFIG_PM 3866 dev->dev->power.disable_depth++; 3867 #endif 3868 if (!amdgpu_device_has_dc_support(adev)) 3869 drm_helper_hpd_irq_event(dev); 3870 else 3871 drm_kms_helper_hotplug_event(dev); 3872 #ifdef CONFIG_PM 3873 dev->dev->power.disable_depth--; 3874 #endif 3875 adev->in_suspend = false; 3876 3877 return 0; 3878 } 3879 3880 /** 3881 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 3882 * 3883 * @adev: amdgpu_device pointer 3884 * 3885 * The list of all the hardware IPs that make up the asic is walked and 3886 * the check_soft_reset callbacks are run. 
check_soft_reset determines 3887 * if the asic is still hung or not. 3888 * Returns true if any of the IPs are still in a hung state, false if not. 3889 */ 3890 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 3891 { 3892 int i; 3893 bool asic_hang = false; 3894 3895 if (amdgpu_sriov_vf(adev)) 3896 return true; 3897 3898 if (amdgpu_asic_need_full_reset(adev)) 3899 return true; 3900 3901 for (i = 0; i < adev->num_ip_blocks; i++) { 3902 if (!adev->ip_blocks[i].status.valid) 3903 continue; 3904 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 3905 adev->ip_blocks[i].status.hang = 3906 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 3907 if (adev->ip_blocks[i].status.hang) { 3908 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 3909 asic_hang = true; 3910 } 3911 } 3912 return asic_hang; 3913 } 3914 3915 /** 3916 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 3917 * 3918 * @adev: amdgpu_device pointer 3919 * 3920 * The list of all the hardware IPs that make up the asic is walked and the 3921 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 3922 * handles any IP specific hardware or software state changes that are 3923 * necessary for a soft reset to succeed. 3924 * Returns 0 on success, negative error code on failure. 3925 */ 3926 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 3927 { 3928 int i, r = 0; 3929 3930 for (i = 0; i < adev->num_ip_blocks; i++) { 3931 if (!adev->ip_blocks[i].status.valid) 3932 continue; 3933 if (adev->ip_blocks[i].status.hang && 3934 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 3935 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 3936 if (r) 3937 return r; 3938 } 3939 } 3940 3941 return 0; 3942 } 3943 3944 /** 3945 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 3946 * 3947 * @adev: amdgpu_device pointer 3948 * 3949 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 3950 * reset is necessary to recover. 3951 * Returns true if a full asic reset is required, false if not. 3952 */ 3953 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 3954 { 3955 int i; 3956 3957 if (amdgpu_asic_need_full_reset(adev)) 3958 return true; 3959 3960 for (i = 0; i < adev->num_ip_blocks; i++) { 3961 if (!adev->ip_blocks[i].status.valid) 3962 continue; 3963 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 3964 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 3965 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 3966 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 3967 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3968 if (adev->ip_blocks[i].status.hang) { 3969 dev_info(adev->dev, "Some block need full reset!\n"); 3970 return true; 3971 } 3972 } 3973 } 3974 return false; 3975 } 3976 3977 /** 3978 * amdgpu_device_ip_soft_reset - do a soft reset 3979 * 3980 * @adev: amdgpu_device pointer 3981 * 3982 * The list of all the hardware IPs that make up the asic is walked and the 3983 * soft_reset callbacks are run if the block is hung. soft_reset handles any 3984 * IP specific hardware or software state changes that are necessary to soft 3985 * reset the IP. 3986 * Returns 0 on success, negative error code on failure. 
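 *
 * For context, a condensed sketch of how the soft-reset helpers are chained
 * by the recovery code later in this file (see
 * amdgpu_device_pre_asic_reset()):
 *
 *	amdgpu_device_ip_pre_soft_reset(adev);
 *	r = amdgpu_device_ip_soft_reset(adev);
 *	amdgpu_device_ip_post_soft_reset(adev);
 *	if (r || amdgpu_device_ip_check_soft_reset(adev))
 *		need_full_reset = true;  // fall back to a full ASIC reset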
3987 */ 3988 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 3989 { 3990 int i, r = 0; 3991 3992 for (i = 0; i < adev->num_ip_blocks; i++) { 3993 if (!adev->ip_blocks[i].status.valid) 3994 continue; 3995 if (adev->ip_blocks[i].status.hang && 3996 adev->ip_blocks[i].version->funcs->soft_reset) { 3997 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 3998 if (r) 3999 return r; 4000 } 4001 } 4002 4003 return 0; 4004 } 4005 4006 /** 4007 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4008 * 4009 * @adev: amdgpu_device pointer 4010 * 4011 * The list of all the hardware IPs that make up the asic is walked and the 4012 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4013 * handles any IP specific hardware or software state changes that are 4014 * necessary after the IP has been soft reset. 4015 * Returns 0 on success, negative error code on failure. 4016 */ 4017 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4018 { 4019 int i, r = 0; 4020 4021 for (i = 0; i < adev->num_ip_blocks; i++) { 4022 if (!adev->ip_blocks[i].status.valid) 4023 continue; 4024 if (adev->ip_blocks[i].status.hang && 4025 adev->ip_blocks[i].version->funcs->post_soft_reset) 4026 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4027 if (r) 4028 return r; 4029 } 4030 4031 return 0; 4032 } 4033 4034 /** 4035 * amdgpu_device_recover_vram - Recover some VRAM contents 4036 * 4037 * @adev: amdgpu_device pointer 4038 * 4039 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4040 * restore things like GPUVM page tables after a GPU reset where 4041 * the contents of VRAM might be lost. 4042 * 4043 * Returns: 4044 * 0 on success, negative error code on failure. 4045 */ 4046 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4047 { 4048 struct dma_fence *fence = NULL, *next = NULL; 4049 struct amdgpu_bo *shadow; 4050 long r = 1, tmo; 4051 4052 if (amdgpu_sriov_runtime(adev)) 4053 tmo = msecs_to_jiffies(8000); 4054 else 4055 tmo = msecs_to_jiffies(100); 4056 4057 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4058 mutex_lock(&adev->shadow_list_lock); 4059 list_for_each_entry(shadow, &adev->shadow_list, shadow_list) { 4060 4061 /* No need to recover an evicted BO */ 4062 if (shadow->tbo.mem.mem_type != TTM_PL_TT || 4063 shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET || 4064 shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM) 4065 continue; 4066 4067 r = amdgpu_bo_restore_shadow(shadow, &next); 4068 if (r) 4069 break; 4070 4071 if (fence) { 4072 tmo = dma_fence_wait_timeout(fence, false, tmo); 4073 dma_fence_put(fence); 4074 fence = next; 4075 if (tmo == 0) { 4076 r = -ETIMEDOUT; 4077 break; 4078 } else if (tmo < 0) { 4079 r = tmo; 4080 break; 4081 } 4082 } else { 4083 fence = next; 4084 } 4085 } 4086 mutex_unlock(&adev->shadow_list_lock); 4087 4088 if (fence) 4089 tmo = dma_fence_wait_timeout(fence, false, tmo); 4090 dma_fence_put(fence); 4091 4092 if (r < 0 || tmo <= 0) { 4093 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4094 return -EIO; 4095 } 4096 4097 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4098 return 0; 4099 } 4100 4101 4102 /** 4103 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4104 * 4105 * @adev: amdgpu_device pointer 4106 * @from_hypervisor: request from hypervisor 4107 * 4108 * do VF FLR and reinitialize Asic 4109 * return 0 means succeeded otherwise failed 4110 */ 4111 static int 
amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4112 bool from_hypervisor) 4113 { 4114 int r; 4115 4116 if (from_hypervisor) 4117 r = amdgpu_virt_request_full_gpu(adev, true); 4118 else 4119 r = amdgpu_virt_reset_gpu(adev); 4120 if (r) 4121 return r; 4122 4123 amdgpu_amdkfd_pre_reset(adev); 4124 4125 /* Resume IP prior to SMC */ 4126 r = amdgpu_device_ip_reinit_early_sriov(adev); 4127 if (r) 4128 goto error; 4129 4130 amdgpu_virt_init_data_exchange(adev); 4131 /* we need recover gart prior to run SMC/CP/SDMA resume */ 4132 amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT)); 4133 4134 r = amdgpu_device_fw_loading(adev); 4135 if (r) 4136 return r; 4137 4138 /* now we are okay to resume SMC/CP/SDMA */ 4139 r = amdgpu_device_ip_reinit_late_sriov(adev); 4140 if (r) 4141 goto error; 4142 4143 amdgpu_irq_gpu_reset_resume_helper(adev); 4144 r = amdgpu_ib_ring_tests(adev); 4145 amdgpu_amdkfd_post_reset(adev); 4146 4147 error: 4148 amdgpu_virt_release_full_gpu(adev, true); 4149 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4150 amdgpu_inc_vram_lost(adev); 4151 r = amdgpu_device_recover_vram(adev); 4152 } 4153 4154 return r; 4155 } 4156 4157 /** 4158 * amdgpu_device_has_job_running - check if there is any job in mirror list 4159 * 4160 * @adev: amdgpu_device pointer 4161 * 4162 * check if there is any job in mirror list 4163 */ 4164 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4165 { 4166 int i; 4167 struct drm_sched_job *job; 4168 4169 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4170 struct amdgpu_ring *ring = adev->rings[i]; 4171 4172 if (!ring || !ring->sched.thread) 4173 continue; 4174 4175 spin_lock(&ring->sched.job_list_lock); 4176 job = list_first_entry_or_null(&ring->sched.pending_list, 4177 struct drm_sched_job, list); 4178 spin_unlock(&ring->sched.job_list_lock); 4179 if (job) 4180 return true; 4181 } 4182 return false; 4183 } 4184 4185 /** 4186 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4187 * 4188 * @adev: amdgpu_device pointer 4189 * 4190 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4191 * a hung GPU. 
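 *
 * The decision is driven by the amdgpu_gpu_recovery module parameter, as
 * handled below:
 *
 *	 0		- recovery disabled
 *	-1		- auto: enabled only for the ASIC families listed in
 *			  the switch statement below; SR-IOV VFs are allowed
 *			  to recover whenever recovery is not explicitly
 *			  disabled
 *	 other values	- recovery enabled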
4192 */ 4193 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4194 { 4195 if (!amdgpu_device_ip_check_soft_reset(adev)) { 4196 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n"); 4197 return false; 4198 } 4199 4200 if (amdgpu_gpu_recovery == 0) 4201 goto disabled; 4202 4203 if (amdgpu_sriov_vf(adev)) 4204 return true; 4205 4206 if (amdgpu_gpu_recovery == -1) { 4207 switch (adev->asic_type) { 4208 case CHIP_BONAIRE: 4209 case CHIP_HAWAII: 4210 case CHIP_TOPAZ: 4211 case CHIP_TONGA: 4212 case CHIP_FIJI: 4213 case CHIP_POLARIS10: 4214 case CHIP_POLARIS11: 4215 case CHIP_POLARIS12: 4216 case CHIP_VEGAM: 4217 case CHIP_VEGA20: 4218 case CHIP_VEGA10: 4219 case CHIP_VEGA12: 4220 case CHIP_RAVEN: 4221 case CHIP_ARCTURUS: 4222 case CHIP_RENOIR: 4223 case CHIP_NAVI10: 4224 case CHIP_NAVI14: 4225 case CHIP_NAVI12: 4226 case CHIP_SIENNA_CICHLID: 4227 case CHIP_NAVY_FLOUNDER: 4228 case CHIP_DIMGREY_CAVEFISH: 4229 break; 4230 default: 4231 goto disabled; 4232 } 4233 } 4234 4235 return true; 4236 4237 disabled: 4238 dev_info(adev->dev, "GPU recovery disabled.\n"); 4239 return false; 4240 } 4241 4242 4243 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4244 struct amdgpu_job *job, 4245 bool *need_full_reset_arg) 4246 { 4247 int i, r = 0; 4248 bool need_full_reset = *need_full_reset_arg; 4249 4250 amdgpu_debugfs_wait_dump(adev); 4251 4252 if (amdgpu_sriov_vf(adev)) { 4253 /* stop the data exchange thread */ 4254 amdgpu_virt_fini_data_exchange(adev); 4255 } 4256 4257 /* block all schedulers and reset given job's ring */ 4258 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4259 struct amdgpu_ring *ring = adev->rings[i]; 4260 4261 if (!ring || !ring->sched.thread) 4262 continue; 4263 4264 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4265 amdgpu_fence_driver_force_completion(ring); 4266 } 4267 4268 if(job) 4269 drm_sched_increase_karma(&job->base); 4270 4271 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4272 if (!amdgpu_sriov_vf(adev)) { 4273 4274 if (!need_full_reset) 4275 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4276 4277 if (!need_full_reset) { 4278 amdgpu_device_ip_pre_soft_reset(adev); 4279 r = amdgpu_device_ip_soft_reset(adev); 4280 amdgpu_device_ip_post_soft_reset(adev); 4281 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4282 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4283 need_full_reset = true; 4284 } 4285 } 4286 4287 if (need_full_reset) 4288 r = amdgpu_device_ip_suspend(adev); 4289 4290 *need_full_reset_arg = need_full_reset; 4291 } 4292 4293 return r; 4294 } 4295 4296 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, 4297 struct list_head *device_list_handle, 4298 bool *need_full_reset_arg, 4299 bool skip_hw_reset) 4300 { 4301 struct amdgpu_device *tmp_adev = NULL; 4302 bool need_full_reset = *need_full_reset_arg, vram_lost = false; 4303 int r = 0; 4304 4305 /* 4306 * ASIC reset has to be done on all HGMI hive nodes ASAP 4307 * to allow proper links negotiation in FW (within 1 sec) 4308 */ 4309 if (!skip_hw_reset && need_full_reset) { 4310 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4311 /* For XGMI run all resets in parallel to speed up the process */ 4312 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4313 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4314 r = -EALREADY; 4315 } else 4316 r = amdgpu_asic_reset(tmp_adev); 4317 4318 if (r) { 4319 dev_err(tmp_adev->dev, "ASIC reset 
failed with error, %d for drm dev, %s", 4320 r, adev_to_drm(tmp_adev)->unique); 4321 break; 4322 } 4323 } 4324 4325 /* For XGMI wait for all resets to complete before proceed */ 4326 if (!r) { 4327 list_for_each_entry(tmp_adev, device_list_handle, 4328 gmc.xgmi.head) { 4329 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4330 flush_work(&tmp_adev->xgmi_reset_work); 4331 r = tmp_adev->asic_reset_res; 4332 if (r) 4333 break; 4334 } 4335 } 4336 } 4337 } 4338 4339 if (!r && amdgpu_ras_intr_triggered()) { 4340 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4341 if (tmp_adev->mmhub.funcs && 4342 tmp_adev->mmhub.funcs->reset_ras_error_count) 4343 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev); 4344 } 4345 4346 amdgpu_ras_intr_cleared(); 4347 } 4348 4349 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4350 if (need_full_reset) { 4351 /* post card */ 4352 if (amdgpu_device_asic_init(tmp_adev)) 4353 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4354 4355 if (!r) { 4356 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4357 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4358 if (r) 4359 goto out; 4360 4361 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4362 if (vram_lost) { 4363 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4364 amdgpu_inc_vram_lost(tmp_adev); 4365 } 4366 4367 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT)); 4368 if (r) 4369 goto out; 4370 4371 r = amdgpu_device_fw_loading(tmp_adev); 4372 if (r) 4373 return r; 4374 4375 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4376 if (r) 4377 goto out; 4378 4379 if (vram_lost) 4380 amdgpu_device_fill_reset_magic(tmp_adev); 4381 4382 /* 4383 * Add this ASIC as tracked as reset was already 4384 * complete successfully. 4385 */ 4386 amdgpu_register_gpu_instance(tmp_adev); 4387 4388 r = amdgpu_device_ip_late_init(tmp_adev); 4389 if (r) 4390 goto out; 4391 4392 amdgpu_fbdev_set_suspend(tmp_adev, 0); 4393 4394 /* 4395 * The GPU enters bad state once faulty pages 4396 * by ECC has reached the threshold, and ras 4397 * recovery is scheduled next. So add one check 4398 * here to break recovery if it indeed exceeds 4399 * bad page threshold, and remind user to 4400 * retire this GPU or setting one bigger 4401 * bad_page_threshold value to fix this once 4402 * probing driver again. 4403 */ 4404 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 4405 /* must succeed. 
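 * amdgpu_ras_resume() re-arms the RAS features that were suspended at
 * the start of recovery; reaching this branch means the bad-page count
 * reported by the RAS EEPROM is still below the configured threshold.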
*/ 4406 amdgpu_ras_resume(tmp_adev); 4407 } else { 4408 r = -EINVAL; 4409 goto out; 4410 } 4411 4412 /* Update PSP FW topology after reset */ 4413 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4414 r = amdgpu_xgmi_update_topology(hive, tmp_adev); 4415 } 4416 } 4417 4418 out: 4419 if (!r) { 4420 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 4421 r = amdgpu_ib_ring_tests(tmp_adev); 4422 if (r) { 4423 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 4424 r = amdgpu_device_ip_suspend(tmp_adev); 4425 need_full_reset = true; 4426 r = -EAGAIN; 4427 goto end; 4428 } 4429 } 4430 4431 if (!r) 4432 r = amdgpu_device_recover_vram(tmp_adev); 4433 else 4434 tmp_adev->asic_reset_res = r; 4435 } 4436 4437 end: 4438 *need_full_reset_arg = need_full_reset; 4439 return r; 4440 } 4441 4442 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, 4443 struct amdgpu_hive_info *hive) 4444 { 4445 if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0) 4446 return false; 4447 4448 if (hive) { 4449 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock); 4450 } else { 4451 down_write(&adev->reset_sem); 4452 } 4453 4454 switch (amdgpu_asic_reset_method(adev)) { 4455 case AMD_RESET_METHOD_MODE1: 4456 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 4457 break; 4458 case AMD_RESET_METHOD_MODE2: 4459 adev->mp1_state = PP_MP1_STATE_RESET; 4460 break; 4461 default: 4462 adev->mp1_state = PP_MP1_STATE_NONE; 4463 break; 4464 } 4465 4466 return true; 4467 } 4468 4469 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) 4470 { 4471 amdgpu_vf_error_trans_all(adev); 4472 adev->mp1_state = PP_MP1_STATE_NONE; 4473 atomic_set(&adev->in_gpu_reset, 0); 4474 up_write(&adev->reset_sem); 4475 } 4476 4477 /* 4478 * to lockup a list of amdgpu devices in a hive safely, if not a hive 4479 * with multiple nodes, it will be similar as amdgpu_device_lock_adev. 4480 * 4481 * unlock won't require roll back. 4482 */ 4483 static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive) 4484 { 4485 struct amdgpu_device *tmp_adev = NULL; 4486 4487 if (adev->gmc.xgmi.num_physical_nodes > 1) { 4488 if (!hive) { 4489 dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes"); 4490 return -ENODEV; 4491 } 4492 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 4493 if (!amdgpu_device_lock_adev(tmp_adev, hive)) 4494 goto roll_back; 4495 } 4496 } else if (!amdgpu_device_lock_adev(adev, hive)) 4497 return -EAGAIN; 4498 4499 return 0; 4500 roll_back: 4501 if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) { 4502 /* 4503 * if the lockup iteration break in the middle of a hive, 4504 * it may means there may has a race issue, 4505 * or a hive device locked up independently. 4506 * we may be in trouble and may not, so will try to roll back 4507 * the lock and give out a warnning. 4508 */ 4509 dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. 
Rolling back to unlock"); 4510 list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) { 4511 amdgpu_device_unlock_adev(tmp_adev); 4512 } 4513 } 4514 return -EAGAIN; 4515 } 4516 4517 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 4518 { 4519 struct pci_dev *p = NULL; 4520 4521 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4522 adev->pdev->bus->number, 1); 4523 if (p) { 4524 pm_runtime_enable(&(p->dev)); 4525 pm_runtime_resume(&(p->dev)); 4526 } 4527 } 4528 4529 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 4530 { 4531 enum amd_reset_method reset_method; 4532 struct pci_dev *p = NULL; 4533 u64 expires; 4534 4535 /* 4536 * For now, only BACO and mode1 reset are confirmed 4537 * to suffer the audio issue without proper suspended. 4538 */ 4539 reset_method = amdgpu_asic_reset_method(adev); 4540 if ((reset_method != AMD_RESET_METHOD_BACO) && 4541 (reset_method != AMD_RESET_METHOD_MODE1)) 4542 return -EINVAL; 4543 4544 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4545 adev->pdev->bus->number, 1); 4546 if (!p) 4547 return -ENODEV; 4548 4549 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 4550 if (!expires) 4551 /* 4552 * If we cannot get the audio device autosuspend delay, 4553 * a fixed 4S interval will be used. Considering 3S is 4554 * the audio controller default autosuspend delay setting. 4555 * 4S used here is guaranteed to cover that. 4556 */ 4557 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 4558 4559 while (!pm_runtime_status_suspended(&(p->dev))) { 4560 if (!pm_runtime_suspend(&(p->dev))) 4561 break; 4562 4563 if (expires < ktime_get_mono_fast_ns()) { 4564 dev_warn(adev->dev, "failed to suspend display audio\n"); 4565 /* TODO: abort the succeeding gpu reset? */ 4566 return -ETIMEDOUT; 4567 } 4568 } 4569 4570 pm_runtime_disable(&(p->dev)); 4571 4572 return 0; 4573 } 4574 4575 /** 4576 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 4577 * 4578 * @adev: amdgpu_device pointer 4579 * @job: which job trigger hang 4580 * 4581 * Attempt to reset the GPU if it has hung (all asics). 4582 * Attempt to do soft-reset or full-reset and reinitialize Asic 4583 * Returns 0 for success or an error on failure. 4584 */ 4585 4586 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 4587 struct amdgpu_job *job) 4588 { 4589 struct list_head device_list, *device_list_handle = NULL; 4590 bool need_full_reset = false; 4591 bool job_signaled = false; 4592 struct amdgpu_hive_info *hive = NULL; 4593 struct amdgpu_device *tmp_adev = NULL; 4594 int i, r = 0; 4595 bool need_emergency_restart = false; 4596 bool audio_suspended = false; 4597 4598 /* 4599 * Special case: RAS triggered and full reset isn't supported 4600 */ 4601 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 4602 4603 /* 4604 * Flush RAM to disk so that after reboot 4605 * the user can read log and see why the system rebooted. 4606 */ 4607 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 4608 DRM_WARN("Emergency reboot."); 4609 4610 ksys_sync_helper(); 4611 emergency_restart(); 4612 } 4613 4614 dev_info(adev->dev, "GPU %s begin!\n", 4615 need_emergency_restart ? "jobs stop":"reset"); 4616 4617 /* 4618 * Here we trylock to avoid chain of resets executing from 4619 * either trigger by jobs on different adevs in XGMI hive or jobs on 4620 * different schedulers for same device while this TO handler is running. 
4621 * We always reset all schedulers for device and all devices for XGMI 4622 * hive so that should take care of them too. 4623 */ 4624 hive = amdgpu_get_xgmi_hive(adev); 4625 if (hive) { 4626 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) { 4627 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", 4628 job ? job->base.id : -1, hive->hive_id); 4629 amdgpu_put_xgmi_hive(hive); 4630 if (job) 4631 drm_sched_increase_karma(&job->base); 4632 return 0; 4633 } 4634 mutex_lock(&hive->hive_lock); 4635 } 4636 4637 /* 4638 * lock the device before we try to operate the linked list 4639 * if didn't get the device lock, don't touch the linked list since 4640 * others may iterating it. 4641 */ 4642 r = amdgpu_device_lock_hive_adev(adev, hive); 4643 if (r) { 4644 dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress", 4645 job ? job->base.id : -1); 4646 4647 /* even we skipped this reset, still need to set the job to guilty */ 4648 if (job) 4649 drm_sched_increase_karma(&job->base); 4650 goto skip_recovery; 4651 } 4652 4653 /* 4654 * Build list of devices to reset. 4655 * In case we are in XGMI hive mode, resort the device list 4656 * to put adev in the 1st position. 4657 */ 4658 INIT_LIST_HEAD(&device_list); 4659 if (adev->gmc.xgmi.num_physical_nodes > 1) { 4660 if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list)) 4661 list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list); 4662 device_list_handle = &hive->device_list; 4663 } else { 4664 list_add_tail(&adev->gmc.xgmi.head, &device_list); 4665 device_list_handle = &device_list; 4666 } 4667 4668 /* block all schedulers and reset given job's ring */ 4669 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4670 /* 4671 * Try to put the audio codec into suspend state 4672 * before gpu reset started. 4673 * 4674 * Due to the power domain of the graphics device 4675 * is shared with AZ power domain. Without this, 4676 * we may change the audio hardware from behind 4677 * the audio driver's back. That will trigger 4678 * some audio codec errors. 4679 */ 4680 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 4681 audio_suspended = true; 4682 4683 amdgpu_ras_set_error_query_ready(tmp_adev, false); 4684 4685 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 4686 4687 if (!amdgpu_sriov_vf(tmp_adev)) 4688 amdgpu_amdkfd_pre_reset(tmp_adev); 4689 4690 /* 4691 * Mark these ASICs to be reseted as untracked first 4692 * And add them back after reset completed 4693 */ 4694 amdgpu_unregister_gpu_instance(tmp_adev); 4695 4696 amdgpu_fbdev_set_suspend(tmp_adev, 1); 4697 4698 /* disable ras on ALL IPs */ 4699 if (!need_emergency_restart && 4700 amdgpu_device_ip_need_full_reset(tmp_adev)) 4701 amdgpu_ras_suspend(tmp_adev); 4702 4703 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4704 struct amdgpu_ring *ring = tmp_adev->rings[i]; 4705 4706 if (!ring || !ring->sched.thread) 4707 continue; 4708 4709 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 4710 4711 if (need_emergency_restart) 4712 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 4713 } 4714 atomic_inc(&tmp_adev->gpu_reset_counter); 4715 } 4716 4717 if (need_emergency_restart) 4718 goto skip_sched_resume; 4719 4720 /* 4721 * Must check guilty signal here since after this point all old 4722 * HW fences are force signaled. 
4723 * 4724 * job->base holds a reference to parent fence 4725 */ 4726 if (job && job->base.s_fence->parent && 4727 dma_fence_is_signaled(job->base.s_fence->parent)) { 4728 job_signaled = true; 4729 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 4730 goto skip_hw_reset; 4731 } 4732 4733 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 4734 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4735 r = amdgpu_device_pre_asic_reset(tmp_adev, 4736 (tmp_adev == adev) ? job : NULL, 4737 &need_full_reset); 4738 /*TODO Should we stop ?*/ 4739 if (r) { 4740 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 4741 r, adev_to_drm(tmp_adev)->unique); 4742 tmp_adev->asic_reset_res = r; 4743 } 4744 } 4745 4746 /* Actual ASIC resets if needed.*/ 4747 /* TODO Implement XGMI hive reset logic for SRIOV */ 4748 if (amdgpu_sriov_vf(adev)) { 4749 r = amdgpu_device_reset_sriov(adev, job ? false : true); 4750 if (r) 4751 adev->asic_reset_res = r; 4752 } else { 4753 r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false); 4754 if (r && r == -EAGAIN) 4755 goto retry; 4756 } 4757 4758 skip_hw_reset: 4759 4760 /* Post ASIC reset for all devs .*/ 4761 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4762 4763 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4764 struct amdgpu_ring *ring = tmp_adev->rings[i]; 4765 4766 if (!ring || !ring->sched.thread) 4767 continue; 4768 4769 /* No point to resubmit jobs if we didn't HW reset*/ 4770 if (!tmp_adev->asic_reset_res && !job_signaled) 4771 drm_sched_resubmit_jobs(&ring->sched); 4772 4773 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); 4774 } 4775 4776 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) { 4777 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 4778 } 4779 4780 tmp_adev->asic_reset_res = 0; 4781 4782 if (r) { 4783 /* bad news, how to tell it to userspace ? */ 4784 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 4785 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 4786 } else { 4787 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 4788 } 4789 } 4790 4791 skip_sched_resume: 4792 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4793 /*unlock kfd: SRIOV would do it separately */ 4794 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 4795 amdgpu_amdkfd_post_reset(tmp_adev); 4796 if (audio_suspended) 4797 amdgpu_device_resume_display_audio(tmp_adev); 4798 amdgpu_device_unlock_adev(tmp_adev); 4799 } 4800 4801 skip_recovery: 4802 if (hive) { 4803 atomic_set(&hive->in_reset, 0); 4804 mutex_unlock(&hive->hive_lock); 4805 amdgpu_put_xgmi_hive(hive); 4806 } 4807 4808 if (r && r != -EAGAIN) 4809 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 4810 return r; 4811 } 4812 4813 /** 4814 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 4815 * 4816 * @adev: amdgpu_device pointer 4817 * 4818 * Fetchs and stores in the driver the PCIE capabilities (gen speed 4819 * and lanes) of the slot the device is in. Handles APUs and 4820 * virtualized environments where PCIE config space may not be available. 
4821 */ 4822 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 4823 { 4824 struct pci_dev *pdev; 4825 enum pci_bus_speed speed_cap, platform_speed_cap; 4826 enum pcie_link_width platform_link_width; 4827 4828 if (amdgpu_pcie_gen_cap) 4829 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 4830 4831 if (amdgpu_pcie_lane_cap) 4832 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 4833 4834 /* covers APUs as well */ 4835 if (pci_is_root_bus(adev->pdev->bus)) { 4836 if (adev->pm.pcie_gen_mask == 0) 4837 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 4838 if (adev->pm.pcie_mlw_mask == 0) 4839 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 4840 return; 4841 } 4842 4843 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 4844 return; 4845 4846 pcie_bandwidth_available(adev->pdev, NULL, 4847 &platform_speed_cap, &platform_link_width); 4848 4849 if (adev->pm.pcie_gen_mask == 0) { 4850 /* asic caps */ 4851 pdev = adev->pdev; 4852 speed_cap = pcie_get_speed_cap(pdev); 4853 if (speed_cap == PCI_SPEED_UNKNOWN) { 4854 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4855 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4856 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 4857 } else { 4858 if (speed_cap == PCIE_SPEED_32_0GT) 4859 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4860 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4861 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 4862 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 4863 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 4864 else if (speed_cap == PCIE_SPEED_16_0GT) 4865 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4866 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4867 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 4868 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 4869 else if (speed_cap == PCIE_SPEED_8_0GT) 4870 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4871 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4872 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 4873 else if (speed_cap == PCIE_SPEED_5_0GT) 4874 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4875 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 4876 else 4877 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 4878 } 4879 /* platform caps */ 4880 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 4881 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4882 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 4883 } else { 4884 if (platform_speed_cap == PCIE_SPEED_32_0GT) 4885 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4886 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4887 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 4888 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 4889 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 4890 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 4891 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4892 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4893 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 4894 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 4895 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 4896 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4897 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4898 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 4899 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 4900 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4901 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 4902 else 4903 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 4904 4905 } 4906 } 4907 if (adev->pm.pcie_mlw_mask == 0) { 4908 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 4909 adev->pm.pcie_mlw_mask 
|= AMDGPU_DEFAULT_PCIE_MLW_MASK; 4910 } else { 4911 switch (platform_link_width) { 4912 case PCIE_LNK_X32: 4913 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 4914 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 4915 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 4916 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4917 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4918 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4919 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4920 break; 4921 case PCIE_LNK_X16: 4922 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 4923 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 4924 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4925 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4926 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4927 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4928 break; 4929 case PCIE_LNK_X12: 4930 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 4931 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4932 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4933 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4934 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4935 break; 4936 case PCIE_LNK_X8: 4937 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4938 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4939 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4940 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4941 break; 4942 case PCIE_LNK_X4: 4943 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4944 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4945 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4946 break; 4947 case PCIE_LNK_X2: 4948 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4949 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4950 break; 4951 case PCIE_LNK_X1: 4952 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 4953 break; 4954 default: 4955 break; 4956 } 4957 } 4958 } 4959 } 4960 4961 int amdgpu_device_baco_enter(struct drm_device *dev) 4962 { 4963 struct amdgpu_device *adev = drm_to_adev(dev); 4964 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 4965 4966 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 4967 return -ENOTSUPP; 4968 4969 if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt) 4970 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 4971 4972 return amdgpu_dpm_baco_enter(adev); 4973 } 4974 4975 int amdgpu_device_baco_exit(struct drm_device *dev) 4976 { 4977 struct amdgpu_device *adev = drm_to_adev(dev); 4978 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 4979 int ret = 0; 4980 4981 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 4982 return -ENOTSUPP; 4983 4984 ret = amdgpu_dpm_baco_exit(adev); 4985 if (ret) 4986 return ret; 4987 4988 if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt) 4989 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 4990 4991 return 0; 4992 } 4993 4994 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev) 4995 { 4996 int i; 4997 4998 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4999 struct amdgpu_ring *ring = adev->rings[i]; 5000 5001 if (!ring || !ring->sched.thread) 5002 continue; 5003 5004 cancel_delayed_work_sync(&ring->sched.work_tdr); 5005 } 5006 } 5007 5008 /** 5009 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5010 * @pdev: PCI device struct 5011 * @state: PCI channel state 5012 * 5013 * Description: Called when a PCI error is detected. 5014 * 5015 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
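 *
 * Together with amdgpu_pci_mmio_enabled(), amdgpu_pci_slot_reset() and
 * amdgpu_pci_resume() below, this callback is intended to be plugged into
 * a struct pci_error_handlers instance registered by the PCI driver.
 * Illustrative sketch only (the instance name is an assumption, not
 * something defined in this file):
 *
 *	static const struct pci_error_handlers amdgpu_pci_err_handler = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};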
5016 */ 5017 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 5018 { 5019 struct drm_device *dev = pci_get_drvdata(pdev); 5020 struct amdgpu_device *adev = drm_to_adev(dev); 5021 int i; 5022 5023 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 5024 5025 if (adev->gmc.xgmi.num_physical_nodes > 1) { 5026 DRM_WARN("No support for XGMI hive yet..."); 5027 return PCI_ERS_RESULT_DISCONNECT; 5028 } 5029 5030 switch (state) { 5031 case pci_channel_io_normal: 5032 return PCI_ERS_RESULT_CAN_RECOVER; 5033 /* Fatal error, prepare for slot reset */ 5034 case pci_channel_io_frozen: 5035 /* 5036 * Cancel and wait for all TDRs in progress if failing to 5037 * set adev->in_gpu_reset in amdgpu_device_lock_adev 5038 * 5039 * Locking adev->reset_sem will prevent any external access 5040 * to GPU during PCI error recovery 5041 */ 5042 while (!amdgpu_device_lock_adev(adev, NULL)) 5043 amdgpu_cancel_all_tdr(adev); 5044 5045 /* 5046 * Block any work scheduling as we do for regular GPU reset 5047 * for the duration of the recovery 5048 */ 5049 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5050 struct amdgpu_ring *ring = adev->rings[i]; 5051 5052 if (!ring || !ring->sched.thread) 5053 continue; 5054 5055 drm_sched_stop(&ring->sched, NULL); 5056 } 5057 atomic_inc(&adev->gpu_reset_counter); 5058 return PCI_ERS_RESULT_NEED_RESET; 5059 case pci_channel_io_perm_failure: 5060 /* Permanent error, prepare for device removal */ 5061 return PCI_ERS_RESULT_DISCONNECT; 5062 } 5063 5064 return PCI_ERS_RESULT_NEED_RESET; 5065 } 5066 5067 /** 5068 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 5069 * @pdev: pointer to PCI device 5070 */ 5071 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 5072 { 5073 5074 DRM_INFO("PCI error: mmio enabled callback!!\n"); 5075 5076 /* TODO - dump whatever for debugging purposes */ 5077 5078 /* This called only if amdgpu_pci_error_detected returns 5079 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 5080 * works, no need to reset slot. 5081 */ 5082 5083 return PCI_ERS_RESULT_RECOVERED; 5084 } 5085 5086 /** 5087 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 5088 * @pdev: PCI device struct 5089 * 5090 * Description: This routine is called by the pci error recovery 5091 * code after the PCI slot has been reset, just before we 5092 * should resume normal operations. 
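 *
 * The implementation below restores the cached PCI config space, waits for
 * the ASIC to become accessible again, and then reuses the regular
 * pre-reset/reset helpers with the hardware reset step skipped, since the
 * slot reset has already reset the device.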
5093 */ 5094 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 5095 { 5096 struct drm_device *dev = pci_get_drvdata(pdev); 5097 struct amdgpu_device *adev = drm_to_adev(dev); 5098 int r, i; 5099 bool need_full_reset = true; 5100 u32 memsize; 5101 struct list_head device_list; 5102 5103 DRM_INFO("PCI error: slot reset callback!!\n"); 5104 5105 INIT_LIST_HEAD(&device_list); 5106 list_add_tail(&adev->gmc.xgmi.head, &device_list); 5107 5108 /* wait for asic to come out of reset */ 5109 msleep(500); 5110 5111 /* Restore PCI confspace */ 5112 amdgpu_device_load_pci_state(pdev); 5113 5114 /* confirm ASIC came out of reset */ 5115 for (i = 0; i < adev->usec_timeout; i++) { 5116 memsize = amdgpu_asic_get_config_memsize(adev); 5117 5118 if (memsize != 0xffffffff) 5119 break; 5120 udelay(1); 5121 } 5122 if (memsize == 0xffffffff) { 5123 r = -ETIME; 5124 goto out; 5125 } 5126 5127 adev->in_pci_err_recovery = true; 5128 r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset); 5129 adev->in_pci_err_recovery = false; 5130 if (r) 5131 goto out; 5132 5133 r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true); 5134 5135 out: 5136 if (!r) { 5137 if (amdgpu_device_cache_pci_state(adev->pdev)) 5138 pci_restore_state(adev->pdev); 5139 5140 DRM_INFO("PCIe error recovery succeeded\n"); 5141 } else { 5142 DRM_ERROR("PCIe error recovery failed, err:%d", r); 5143 amdgpu_device_unlock_adev(adev); 5144 } 5145 5146 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 5147 } 5148 5149 /** 5150 * amdgpu_pci_resume() - resume normal ops after PCI reset 5151 * @pdev: pointer to PCI device 5152 * 5153 * Called when the error recovery driver tells us that its 5154 * OK to resume normal operation. 5155 */ 5156 void amdgpu_pci_resume(struct pci_dev *pdev) 5157 { 5158 struct drm_device *dev = pci_get_drvdata(pdev); 5159 struct amdgpu_device *adev = drm_to_adev(dev); 5160 int i; 5161 5162 5163 DRM_INFO("PCI error: resume callback!!\n"); 5164 5165 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5166 struct amdgpu_ring *ring = adev->rings[i]; 5167 5168 if (!ring || !ring->sched.thread) 5169 continue; 5170 5171 5172 drm_sched_resubmit_jobs(&ring->sched); 5173 drm_sched_start(&ring->sched, true); 5174 } 5175 5176 amdgpu_device_unlock_adev(adev); 5177 } 5178 5179 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 5180 { 5181 struct drm_device *dev = pci_get_drvdata(pdev); 5182 struct amdgpu_device *adev = drm_to_adev(dev); 5183 int r; 5184 5185 r = pci_save_state(pdev); 5186 if (!r) { 5187 kfree(adev->pci_state); 5188 5189 adev->pci_state = pci_store_saved_state(pdev); 5190 5191 if (!adev->pci_state) { 5192 DRM_ERROR("Failed to store PCI saved state"); 5193 return false; 5194 } 5195 } else { 5196 DRM_WARN("Failed to save PCI state, err:%d\n", r); 5197 return false; 5198 } 5199 5200 return true; 5201 } 5202 5203 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 5204 { 5205 struct drm_device *dev = pci_get_drvdata(pdev); 5206 struct amdgpu_device *adev = drm_to_adev(dev); 5207 int r; 5208 5209 if (!adev->pci_state) 5210 return false; 5211 5212 r = pci_load_saved_state(pdev, adev->pci_state); 5213 5214 if (!r) { 5215 pci_restore_state(pdev); 5216 } else { 5217 DRM_WARN("Failed to load PCI state, err:%d\n", r); 5218 return false; 5219 } 5220 5221 return true; 5222 } 5223 5224 5225