1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 
23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 34 #include <drm/drm_atomic_helper.h> 35 #include <drm/drm_probe_helper.h> 36 #include <drm/amdgpu_drm.h> 37 #include <linux/vgaarb.h> 38 #include <linux/vga_switcheroo.h> 39 #include <linux/efi.h> 40 #include "amdgpu.h" 41 #include "amdgpu_trace.h" 42 #include "amdgpu_i2c.h" 43 #include "atom.h" 44 #include "amdgpu_atombios.h" 45 #include "amdgpu_atomfirmware.h" 46 #include "amd_pcie.h" 47 #ifdef CONFIG_DRM_AMDGPU_SI 48 #include "si.h" 49 #endif 50 #ifdef CONFIG_DRM_AMDGPU_CIK 51 #include "cik.h" 52 #endif 53 #include "vi.h" 54 #include "soc15.h" 55 #include "nv.h" 56 #include "bif/bif_4_1_d.h" 57 #include <linux/pci.h> 58 #include <linux/firmware.h> 59 #include "amdgpu_vf_error.h" 60 61 #include "amdgpu_amdkfd.h" 62 #include "amdgpu_pm.h" 63 64 #include "amdgpu_xgmi.h" 65 #include "amdgpu_ras.h" 66 #include "amdgpu_pmu.h" 67 #include "amdgpu_fru_eeprom.h" 68 69 #include <linux/suspend.h> 70 #include <drm/task_barrier.h> 71 #include <linux/pm_runtime.h> 72 73 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 74 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 75 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 76 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 77 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 78 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 79 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin"); 80 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin"); 81 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin"); 82 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 83 MODULE_FIRMWARE("amdgpu/sienna_cichlid_gpu_info.bin"); 84 MODULE_FIRMWARE("amdgpu/navy_flounder_gpu_info.bin"); 85 86 #define AMDGPU_RESUME_MS 2000 87 88 const char *amdgpu_asic_name[] = { 89 "TAHITI", 90 "PITCAIRN", 91 "VERDE", 92 "OLAND", 93 "HAINAN", 94 "BONAIRE", 95 "KAVERI", 96 "KABINI", 
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"NAVI10",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"LAST",
};

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = ddev->dev_private;
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
		amdgpu_device_get_pcie_replay_count, NULL);

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * DOC: product_name
 *
 * The amdgpu driver provides a sysfs API for reporting the product name
 * for the device.
 * The file product_name is used for this and returns the product name
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_name(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = ddev->dev_private;

	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
}

static DEVICE_ATTR(product_name, S_IRUGO,
		amdgpu_device_get_product_name, NULL);

/**
 * DOC: product_number
 *
 * The amdgpu driver provides a sysfs API for reporting the part number
 * for the device.
 * The file product_number is used for this and returns the part number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = ddev->dev_private;

	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
}

static DEVICE_ATTR(product_number, S_IRUGO,
		amdgpu_device_get_product_number, NULL);

/**
 * DOC: serial_number
 *
 * The amdgpu driver provides a sysfs API for reporting the serial number
 * for the device.
 * The file serial_number is used for this and returns the serial number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_serial_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = ddev->dev_private;

	return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
}

static DEVICE_ATTR(serial_number, S_IRUGO,
		amdgpu_device_get_serial_number, NULL);

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with HG/PX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = dev->dev_private;

	/* AMD_IS_PX is set at probe time for PowerXpress/Hybrid-Graphics parts */
	if (adev->flags & AMD_IS_PX)
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise return false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = dev->dev_private;

	/* BACO support is asic specific; delegate to the asic callback */
	return amdgpu_asic_supports_baco(adev);
}

/**
 * VRAM access helper functions.
 *
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must > @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       uint32_t *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0;
	uint64_t last;


#ifdef CONFIG_64BIT
	/* Fast path: copy through the CPU-visible BAR mapping for the part
	 * of the request that falls inside visible VRAM.
	 */
	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		void __iomem *addr = adev->mman.aper_base_kaddr + pos;
		size_t count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* barrier before the HDP flush so the writes land first */
			mb();
			amdgpu_asic_flush_hdp(adev, NULL);
		} else {
			/* invalidate HDP before reading so we see fresh data */
			amdgpu_asic_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

		if (count == size)
			return;

		/* fall through to the slow path for the remainder */
		pos += count;
		buf += count / 4;
		size -= count;
	}
#endif

	/* Slow path: one dword at a time through the MM_INDEX/MM_DATA
	 * aperture, serialized by mmio_idx_lock. MM_INDEX_HI holds the
	 * upper address bits and is only rewritten when they change.
	 */
	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		uint32_t tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *buf++);
		else
			*buf++ = RREG32_NO_KIQ(mmMM_DATA);
	}
	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
}

/*
 * MMIO register access helper functions.
 */
/**
 * amdgpu_mm_rreg - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
			uint32_t acc_flags)
{
	uint32_t ret;

	/* Under SR-IOV at runtime, reads go through the KIQ unless the
	 * caller explicitly asked for a direct access.
	 */
	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
		return amdgpu_kiq_rreg(adev, reg);

	if ((reg * 4) < adev->rmmio_size)
		ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
	else {
		unsigned long flags;

		/* Registers beyond the BAR mapping are reached indirectly
		 * via the MM_INDEX/MM_DATA pair, under mmio_idx_lock.
		 */
		spin_lock_irqsave(&adev->mmio_idx_lock, flags);
		writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
		ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	}
	trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 *
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) {
	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value want to be written to the register
 *
 */
/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
373 */ 374 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) { 375 if (offset < adev->rmmio_size) 376 writeb(value, adev->rmmio + offset); 377 else 378 BUG(); 379 } 380 381 void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags) 382 { 383 trace_amdgpu_mm_wreg(adev->pdev->device, reg, v); 384 385 if ((reg * 4) < adev->rmmio_size) 386 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 387 else { 388 unsigned long flags; 389 390 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 391 writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4)); 392 writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4)); 393 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 394 } 395 } 396 397 /** 398 * amdgpu_mm_wreg - write to a memory mapped IO register 399 * 400 * @adev: amdgpu_device pointer 401 * @reg: dword aligned register offset 402 * @v: 32 bit value to write to the register 403 * @acc_flags: access flags which require special behavior 404 * 405 * Writes the value specified to the offset specified. 
 */
void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
		    uint32_t acc_flags)
{
	/* Under SR-IOV at runtime, writes go through the KIQ unless the
	 * caller explicitly asked for a direct access.
	 */
	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
		return amdgpu_kiq_wreg(adev, reg, v);

	amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
}

/*
 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range
 *
 * this function is invoked only for the debugfs register access
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
			     uint32_t acc_flags)
{
	/* When the guest has full register access under SR-IOV and the asic
	 * declares an RLC-protected range, registers inside that range must
	 * be written through the RLC scratch interface instead of MMIO.
	 */
	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {

		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
	}

	amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
}

/**
 * amdgpu_io_rreg - read an IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 *
 * Returns the 32 bit value from the offset specified.
 */
u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
{
	if ((reg * 4) < adev->rio_mem_size)
		return ioread32(adev->rio_mem + (reg * 4));
	else {
		/* indirect access through the MM_INDEX/MM_DATA pair */
		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
		return ioread32(adev->rio_mem + (mmMM_DATA * 4));
	}
}

/**
 * amdgpu_io_wreg - write to an IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
{
	if ((reg * 4) < adev->rio_mem_size)
		iowrite32(v, adev->rio_mem + (reg * 4));
	else {
		/* indirect access through the MM_INDEX/MM_DATA pair */
		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
		iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
	}
}

/**
 * amdgpu_mm_rdoorbell - read a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (CIK).
 */
u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
{
	if (index < adev->doorbell.num_doorbells) {
		return readl(adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell - write a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (CIK).
 */
void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
{
	if (index < adev->doorbell.num_doorbells) {
		writel(v, adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
{
	if (index < adev->doorbell.num_doorbells) {
		/* 64-bit doorbells are accessed atomically to avoid tearing */
		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
{
	if (index < adev->doorbell.num_doorbells) {
		/* 64-bit doorbells are accessed atomically to avoid tearing */
		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
				       &adev->vram_scratch.robj,
				       &adev->vram_scratch.gpu_addr,
				       (void **)&adev->vram_scratch.ptr);
}

/**
 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	/* entries come in (reg, and_mask, or_mask) triplets */
	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			/* full mask: overwrite the register outright */
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			/* AI (SOC15) and later only set bits inside the mask */
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	/* 0x7c is the asic-specific PCI config reset register */
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/*
 * GPU doorbell aperture helpers function.
 */
/**
 * amdgpu_device_doorbell_init - Init doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Init doorbell driver information (CIK)
 * Returns 0 on success, error on failure.
 */
static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
{

	/* No doorbell on SI hardware generation */
	if (adev->asic_type < CHIP_BONAIRE) {
		adev->doorbell.base = 0;
		adev->doorbell.size = 0;
		adev->doorbell.num_doorbells = 0;
		adev->doorbell.ptr = NULL;
		return 0;
	}

	/* the doorbell aperture is PCI BAR 2 */
	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
		return -EINVAL;

	amdgpu_asic_init_doorbell_index(adev);

	/* doorbell bar mapping */
	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
	adev->doorbell.size = pci_resource_len(adev->pdev, 2);

	adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
					     adev->doorbell_index.max_assignment+1);
	if (adev->doorbell.num_doorbells == 0)
		return -EINVAL;

	/* For Vega, reserve and map two pages on doorbell BAR since SDMA
	 * paging queue doorbell use the second page. The
	 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
	 * doorbells are in the first page. So with paging queue enabled,
	 * the max num_doorbells should + 1 page (0x400 in dword)
	 */
	if (adev->asic_type >= CHIP_VEGA10)
		adev->doorbell.num_doorbells += 0x400;

	adev->doorbell.ptr = ioremap(adev->doorbell.base,
				     adev->doorbell.num_doorbells *
				     sizeof(u32));
	if (adev->doorbell.ptr == NULL)
		return -ENOMEM;

	return 0;
}

/**
 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down doorbell driver information (CIK)
 */
static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
{
	iounmap(adev->doorbell.ptr);
	adev->doorbell.ptr = NULL;
}



/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers,etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or an -error on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
873 */ 874 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 875 { 876 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 877 878 if (offset < adev->wb.num_wb) { 879 __set_bit(offset, adev->wb.used); 880 *wb = offset << 3; /* convert to dw offset */ 881 return 0; 882 } else { 883 return -EINVAL; 884 } 885 } 886 887 /** 888 * amdgpu_device_wb_free - Free a wb entry 889 * 890 * @adev: amdgpu_device pointer 891 * @wb: wb index 892 * 893 * Free a wb slot allocated for use by the driver (all asics) 894 */ 895 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 896 { 897 wb >>= 3; 898 if (wb < adev->wb.num_wb) 899 __clear_bit(wb, adev->wb.used); 900 } 901 902 /** 903 * amdgpu_device_resize_fb_bar - try to resize FB BAR 904 * 905 * @adev: amdgpu_device pointer 906 * 907 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 908 * to fail, but if any of the BARs is not accessible after the size we abort 909 * driver loading by returning -ENODEV. 
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
	/* BAR size is encoded as log2(size in MB) for pci_resize_resource() */
	u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
	struct pci_bus *root;
	struct resource *res;
	unsigned i;
	u16 cmd;
	int r;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_device_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_device_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	/* re-enable memory decoding now that the BARs are settled */
	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if need or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old smc fw still need driver do vPost otherwise gpu hang, while
		 * those smc fw version above 22.15 doesn't have this flaw, so we force
		 * vpost executed for smc version below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			/* the SMC version is stored at dword 69 of the image */
			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @cookie: amdgpu_device pointer
1039 * @state: enable/disable vga decode 1040 * 1041 * Enable/disable vga decode (all asics). 1042 * Returns VGA resource flags. 1043 */ 1044 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state) 1045 { 1046 struct amdgpu_device *adev = cookie; 1047 amdgpu_asic_set_vga_state(adev, state); 1048 if (state) 1049 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1050 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1051 else 1052 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1053 } 1054 1055 /** 1056 * amdgpu_device_check_block_size - validate the vm block size 1057 * 1058 * @adev: amdgpu_device pointer 1059 * 1060 * Validates the vm block size specified via module parameter. 1061 * The vm block size defines number of bits in page table versus page directory, 1062 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1063 * page table and the remaining bits are in the page directory. 1064 */ 1065 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1066 { 1067 /* defines number of bits in page table versus page directory, 1068 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1069 * page table and the remaining bits are in the page directory */ 1070 if (amdgpu_vm_block_size == -1) 1071 return; 1072 1073 if (amdgpu_vm_block_size < 9) { 1074 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1075 amdgpu_vm_block_size); 1076 amdgpu_vm_block_size = -1; 1077 } 1078 } 1079 1080 /** 1081 * amdgpu_device_check_vm_size - validate the vm size 1082 * 1083 * @adev: amdgpu_device pointer 1084 * 1085 * Validates the vm size in GB specified via module parameter. 1086 * The VM size is the size of the GPU virtual memory space in GB. 
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

/**
 * amdgpu_device_check_smu_prv_buffer_size - validate the SMU memory pool size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the SMU memory pool size specified via module parameter
 * against the available system memory; disables the pool (size 0)
 * when the request cannot be satisfied.
 */
static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	/* 1 or 2 GB pools need at least 3 GB of RAM, 4 or 8 GB pools need 7 GB */
	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	/* the module parameter is in units of 256 MB (1 << 28 bytes) */
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("No enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
1148 */ 1149 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1150 { 1151 if (amdgpu_sched_jobs < 4) { 1152 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1153 amdgpu_sched_jobs); 1154 amdgpu_sched_jobs = 4; 1155 } else if (!is_power_of_2(amdgpu_sched_jobs)){ 1156 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1157 amdgpu_sched_jobs); 1158 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1159 } 1160 1161 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1162 /* gart size must be greater or equal to 32M */ 1163 dev_warn(adev->dev, "gart size (%d) too small\n", 1164 amdgpu_gart_size); 1165 amdgpu_gart_size = -1; 1166 } 1167 1168 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1169 /* gtt size must be greater or equal to 32M */ 1170 dev_warn(adev->dev, "gtt size (%d) too small\n", 1171 amdgpu_gtt_size); 1172 amdgpu_gtt_size = -1; 1173 } 1174 1175 /* valid range is between 4 and 9 inclusive */ 1176 if (amdgpu_vm_fragment_size != -1 && 1177 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1178 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1179 amdgpu_vm_fragment_size = -1; 1180 } 1181 1182 if (amdgpu_sched_hw_submission < 2) { 1183 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1184 amdgpu_sched_hw_submission); 1185 amdgpu_sched_hw_submission = 2; 1186 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1187 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1188 amdgpu_sched_hw_submission); 1189 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1190 } 1191 1192 amdgpu_device_check_smu_prv_buffer_size(adev); 1193 1194 amdgpu_device_check_vm_size(adev); 1195 1196 amdgpu_device_check_block_size(adev); 1197 1198 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1199 1200 amdgpu_gmc_tmz_set(adev); 1201 1202 return 0; 1203 } 1204 1205 /** 1206 * 
amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	/* BOCO-capable boards are powered down via runtime PM instead */
	if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(dev->pdev, PCI_D0);
		pci_restore_state(dev->pdev);
		r = pci_enable_device(dev->pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
		drm_kms_helper_poll_enable(dev);
	} else {
		pr_info("switched off\n");
		drm_kms_helper_poll_disable(dev);
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
		pci_save_state(dev->pdev);
		/* Shut down the device */
		pci_disable_device(dev->pdev);
		pci_set_power_state(dev->pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u32 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if equal or greater
 * return 1 if smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		  ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display.  This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		struct drm_device *ddev = adev->ddev;
		const char *pci_address_name = pci_name(ddev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		/* parameter format: "bdf1,crtcs;bdf2,crtcs;..." or "all[,crtcs]" */
		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				/* clamp requested crtc count to [1, 6]; default to 1 */
				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	char fw_name[40];
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	if (adev->discovery_bin) {
		amdgpu_discovery_get_gfx_info(adev);

		/*
		 * FIXME: The bounding box is still needed by Navi12, so
		 * temporarily read it from gpu_info firmware. Should be dropped
		 * when DAL no longer needs it.
		 */
		if (adev->asic_type != CHIP_NAVI12)
			return 0;
	}

	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_VERDE:
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_OLAND:
	case CHIP_HAINAN:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TOPAZ:
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
	case CHIP_VEGA20:
	default:
		/* asics without a gpu_info firmware get everything from the vbios */
		return 0;
	case CHIP_VEGA10:
		chip_name = "vega10";
		break;
	case CHIP_VEGA12:
		chip_name = "vega12";
		break;
	case CHIP_RAVEN:
		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
			chip_name = "raven2";
		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
			chip_name = "picasso";
		else
			chip_name = "raven";
		break;
	case CHIP_ARCTURUS:
		chip_name = "arcturus";
		break;
	case CHIP_RENOIR:
		chip_name = "renoir";
		break;
	case CHIP_NAVI10:
		chip_name = "navi10";
		break;
	case CHIP_NAVI14:
		chip_name = "navi14";
		break;
	case CHIP_NAVI12:
		chip_name = "navi12";
		break;
	case CHIP_SIENNA_CICHLID:
		chip_name = "sienna_cichlid";
		break;
	case CHIP_NAVY_FLOUNDER:
		chip_name = "navy_flounder";
		break;
	}

	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
	err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
	if (err) {
		dev_err(adev->dev,
			"Failed to load gpu_info firmware \"%s\"\n",
			fw_name);
		goto out;
	}
	err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
	if (err) {
		dev_err(adev->dev,
			"Failed to validate gpu_info firmware \"%s\"\n",
			fw_name);
		goto out;
	}

	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);

	switch (hdr->version_major) {
	case 1:
	{
		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
								le32_to_cpu(hdr->header.ucode_array_offset_bytes));

		/*
		 * Should be dropped when DAL no longer needs it.
		 */
		if (adev->asic_type == CHIP_NAVI12)
			goto parse_soc_bounding_box;

		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
		adev->gfx.config.max_texture_channel_caches =
			le32_to_cpu(gpu_info_fw->gc_num_tccs);
		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
		adev->gfx.config.double_offchip_lds_buf =
			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
		adev->gfx.cu_info.max_waves_per_simd =
			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
		adev->gfx.cu_info.max_scratch_slots_per_cu =
			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
		if (hdr->version_minor >= 1) {
			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->gfx.config.num_sc_per_sh =
				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
			adev->gfx.config.num_packer_per_sc =
				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
		}

parse_soc_bounding_box:
		/*
		 * soc bounding box info is not integrated in discovery table,
		 * we always need to parse it from gpu info firmware if needed.
		 */
		if (hdr->version_minor == 2) {
			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
		}
		break;
	}
	default:
		dev_err(adev->dev,
			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
		err = -EINVAL;
		goto out;
	}
out:
	return err;
}

/**
 * amdgpu_device_ip_early_init - run early init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Early initialization pass for hardware IPs. The hardware IPs that make
 * up each asic are discovered and each IP's early_init callback is run. This
 * is the first stage in initializing the asic.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
{
	int i, r;

	amdgpu_device_enable_virtual_display(adev);

	if (amdgpu_sriov_vf(adev)) {
		r = amdgpu_virt_request_full_gpu(adev, true);
		if (r)
			return r;
	}

	/* select the IP block list (family) for this asic */
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_VERDE:
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_OLAND:
	case CHIP_HAINAN:
		adev->family = AMDGPU_FAMILY_SI;
		r = si_set_ip_blocks(adev);
		if (r)
			return r;
		break;
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
		if (adev->flags & AMD_IS_APU)
			adev->family = AMDGPU_FAMILY_KV;
		else
			adev->family = AMDGPU_FAMILY_CI;

		r = cik_set_ip_blocks(adev);
		if (r)
			return r;
		break;
#endif
	case CHIP_TOPAZ:
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		if (adev->flags & AMD_IS_APU)
			adev->family = AMDGPU_FAMILY_CZ;
		else
			adev->family = AMDGPU_FAMILY_VI;

		r = vi_set_ip_blocks(adev);
		if (r)
			return r;
		break;
	case CHIP_VEGA10:
	case CHIP_VEGA12:
	case CHIP_VEGA20:
	case CHIP_RAVEN:
	case CHIP_ARCTURUS:
	case CHIP_RENOIR:
		if (adev->flags & AMD_IS_APU)
			adev->family = AMDGPU_FAMILY_RV;
		else
			adev->family = AMDGPU_FAMILY_AI;

		r = soc15_set_ip_blocks(adev);
		if (r)
			return r;
		break;
	case CHIP_NAVI10:
	case CHIP_NAVI14:
	case CHIP_NAVI12:
	case CHIP_SIENNA_CICHLID:
	case CHIP_NAVY_FLOUNDER:
		adev->family = AMDGPU_FAMILY_NV;

		r = nv_set_ip_blocks(adev);
		if (r)
			return r;
		break;
	default:
		/* FIXME: not supported yet */
		return -EINVAL;
	}

	amdgpu_amdkfd_device_probe(adev);

	adev->pm.pp_feature = amdgpu_pp_feature_mask;
	if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
		adev->pm.pp_feature &= ~PP_GFXOFF_MASK;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		/* honor the ip_block_mask module parameter */
		if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
			DRM_ERROR("disabled ip block: %d <%s>\n",
				  i, adev->ip_blocks[i].version->funcs->name);
			adev->ip_blocks[i].status.valid = false;
		} else {
			if (adev->ip_blocks[i].version->funcs->early_init) {
				r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
				if (r == -ENOENT) {
					/* -ENOENT means the IP opted out on this asic */
					adev->ip_blocks[i].status.valid = false;
				} else if (r) {
					DRM_ERROR("early_init of IP block <%s> failed %d\n",
						  adev->ip_blocks[i].version->funcs->name, r);
					return r;
				} else {
					adev->ip_blocks[i].status.valid = true;
				}
			} else {
				adev->ip_blocks[i].status.valid = true;
			}
		}
		/* get the vbios after the asic_funcs are set up */
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
			r = amdgpu_device_parse_gpu_info_fw(adev);
			if (r)
				return r;

			/* Read BIOS */
			if (!amdgpu_get_bios(adev))
				return -EINVAL;

			r = amdgpu_atombios_init(adev);
			if (r) {
				dev_err(adev->dev, "amdgpu_atombios_init failed\n");
				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
				return r;
			}
		}
	}

	adev->cg_flags &= amdgpu_cg_mask;
	adev->pg_flags &= amdgpu_pg_mask;

	return 0;
}

/*
 * amdgpu_device_ip_hw_init_phase1 - hw init for COMMON, IH and (SR-IOV) PSP
 *
 * Brings up the IP blocks that everything else depends on; the remaining
 * blocks are initialized later in amdgpu_device_ip_hw_init_phase2().
 */
static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.sw)
			continue;
		if (adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
		    (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
			r = adev->ip_blocks[i].version->funcs->hw_init(adev);
			if (r) {
				DRM_ERROR("hw_init of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
			adev->ip_blocks[i].status.hw = true;
		}
	}

	return 0;
}

/*
 * amdgpu_device_ip_hw_init_phase2 - hw init for all remaining IP blocks
 *
 * Runs hw_init for every software-initialized block not already brought
 * up in phase1 or by amdgpu_device_fw_loading().
 */
static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.sw)
			continue;
		if (adev->ip_blocks[i].status.hw)
			continue;
		r = adev->ip_blocks[i].version->funcs->hw_init(adev);
		if (r) {
			DRM_ERROR("hw_init of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
			return r;
		}
		adev->ip_blocks[i].status.hw = true;
	}

	return 0;
}

/*
 * amdgpu_device_fw_loading - bring up the PSP block and load SMU firmware
 *
 * On VEGA10 and newer, initializes (or resumes) the PSP block so it can
 * load firmware for the other IPs, then loads the SMU firmware where the
 * driver is responsible for it.
 */
static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
{
	int r = 0;
	int i;
	uint32_t smu_version;

	if (adev->asic_type >= CHIP_VEGA10) {
		for (i = 0; i < adev->num_ip_blocks; i++) {
			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
				continue;

			/* no need to do the fw loading again if already done*/
			if (adev->ip_blocks[i].status.hw == true)
				break;

			if (adev->in_gpu_reset || adev->in_suspend) {
				r = adev->ip_blocks[i].version->funcs->resume(adev);
				if (r) {
					DRM_ERROR("resume of IP block <%s> failed %d\n",
						  adev->ip_blocks[i].version->funcs->name, r);
					return r;
				}
			} else {
				r = adev->ip_blocks[i].version->funcs->hw_init(adev);
				if (r) {
					DRM_ERROR("hw_init of IP block <%s> failed %d\n",
						  adev->ip_blocks[i].version->funcs->name, r);
					return r;
				}
			}

			adev->ip_blocks[i].status.hw = true;
			break;
		}
	}

	if (!amdgpu_sriov_vf(adev) || adev->asic_type
== CHIP_TONGA)
		r = amdgpu_pm_load_smu_firmware(adev, &smu_version);

	return r;
}

/**
 * amdgpu_device_ip_init - run init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main initialization pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
 * are run. sw_init initializes the software state associated with each IP
 * and hw_init initializes the hardware associated with each IP.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_init(struct amdgpu_device *adev)
{
	int i, r;

	r = amdgpu_ras_init(adev);
	if (r)
		return r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
		if (r) {
			DRM_ERROR("sw_init of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
			goto init_failed;
		}
		adev->ip_blocks[i].status.sw = true;

		/* need to do gmc hw init early so we can allocate gpu mem */
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
			r = amdgpu_device_vram_scratch_init(adev);
			if (r) {
				DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
				goto init_failed;
			}
			r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
			if (r) {
				DRM_ERROR("hw_init %d failed %d\n", i, r);
				goto init_failed;
			}
			r = amdgpu_device_wb_init(adev);
			if (r) {
				DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
				goto init_failed;
			}
			adev->ip_blocks[i].status.hw = true;

			/* right after GMC hw init, we create CSA */
			if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
				r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
							       AMDGPU_GEM_DOMAIN_VRAM,
							       AMDGPU_CSA_SIZE);
				if (r) {
					DRM_ERROR("allocate CSA failed %d\n", r);
					goto init_failed;
				}
			}
		}
	}

	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_init_data_exchange(adev);

	r = amdgpu_ib_pool_init(adev);
	if (r) {
		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
		goto init_failed;
	}

	r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
	if (r)
		goto init_failed;

	r = amdgpu_device_ip_hw_init_phase1(adev);
	if (r)
		goto init_failed;

	r = amdgpu_device_fw_loading(adev);
	if (r)
		goto init_failed;

	r = amdgpu_device_ip_hw_init_phase2(adev);
	if (r)
		goto init_failed;

	/*
	 * retired pages will be loaded from eeprom and reserved here,
	 * it should be called after amdgpu_device_ip_hw_init_phase2 since
	 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
	 * for I2C communication which is only true at this point.
	 * recovery_init may fail, but it can free all resources allocated by
	 * itself and its failure should not stop amdgpu init process.
	 *
	 * Note: theoretically, this should be called before all vram allocations
	 * to protect retired page from abusing
	 */
	amdgpu_ras_recovery_init(adev);

	if (adev->gmc.xgmi.num_physical_nodes > 1)
		amdgpu_xgmi_add_device(adev);
	amdgpu_amdkfd_device_init(adev);

	amdgpu_fru_get_product_info(adev);

init_failed:
	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_release_full_gpu(adev, true);

	return r;
}

/**
 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
 *
 * @adev: amdgpu_device pointer
 *
 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
 * this function before a GPU reset.
If the value is retained after a
 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
 */
static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
{
	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
}

/**
 * amdgpu_device_check_vram_lost - check if vram is valid
 *
 * @adev: amdgpu_device pointer
 *
 * Checks the reset magic value written to the gart pointer in VRAM.
 * The driver calls this after a GPU reset to see if the contents of
 * VRAM is lost or not.
 * returns true if vram is lost, false if not.
 */
static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
{
	if (memcmp(adev->gart.ptr, adev->reset_magic,
			AMDGPU_RESET_MAGIC_NUM))
		return true;

	if (!adev->in_gpu_reset)
		return false;

	/*
	 * For all ASICs with baco/mode1 reset, the VRAM is
	 * always assumed to be lost.
	 */
	switch (amdgpu_asic_reset_method(adev)) {
	case AMD_RESET_METHOD_BACO:
	case AMD_RESET_METHOD_MODE1:
		return true;
	default:
		return false;
	}
}

/**
 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
 *
 * @adev: amdgpu_device pointer
 * @state: clockgating state (gate or ungate)
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * set_clockgating_state callbacks are run.
 * Late initialization pass enabling clockgating for hardware IPs.
 * Fini or suspend, pass disabling clockgating for hardware IPs.
 * Returns 0 on success, negative error code on failure.
 */

static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
				      enum amd_clockgating_state state)
{
	int i, j, r;

	if (amdgpu_emu_mode == 1)
		return 0;

	for (j = 0; j < adev->num_ip_blocks; j++) {
		/* gate front-to-back, ungate back-to-front */
		i = state == AMD_CG_STATE_GATE ?
			j : adev->num_ip_blocks - j - 1;
		if (!adev->ip_blocks[i].status.late_initialized)
			continue;
		/* skip CG for VCE/UVD, it's handled specially */
		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
		    adev->ip_blocks[i].version->funcs->set_clockgating_state) {
			/* enable clockgating to save power */
			r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
										     state);
			if (r) {
				DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
	}

	return 0;
}

/*
 * amdgpu_device_set_pg_state - set powergating for amdgpu device
 *
 * Counterpart of amdgpu_device_set_cg_state() for powergating: walks all
 * late-initialized IP blocks and runs their set_powergating_state callbacks.
 */
static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
{
	int i, j, r;

	if (amdgpu_emu_mode == 1)
		return 0;

	for (j = 0; j < adev->num_ip_blocks; j++) {
		/* gate front-to-back, ungate back-to-front */
		i = state == AMD_PG_STATE_GATE ?
j : adev->num_ip_blocks - j - 1; 2179 if (!adev->ip_blocks[i].status.late_initialized) 2180 continue; 2181 /* skip CG for VCE/UVD, it's handled specially */ 2182 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2183 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2184 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2185 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2186 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2187 /* enable powergating to save power */ 2188 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2189 state); 2190 if (r) { 2191 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2192 adev->ip_blocks[i].version->funcs->name, r); 2193 return r; 2194 } 2195 } 2196 } 2197 return 0; 2198 } 2199 2200 static int amdgpu_device_enable_mgpu_fan_boost(void) 2201 { 2202 struct amdgpu_gpu_instance *gpu_ins; 2203 struct amdgpu_device *adev; 2204 int i, ret = 0; 2205 2206 mutex_lock(&mgpu_info.mutex); 2207 2208 /* 2209 * MGPU fan boost feature should be enabled 2210 * only when there are two or more dGPUs in 2211 * the system 2212 */ 2213 if (mgpu_info.num_dgpu < 2) 2214 goto out; 2215 2216 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2217 gpu_ins = &(mgpu_info.gpu_ins[i]); 2218 adev = gpu_ins->adev; 2219 if (!(adev->flags & AMD_IS_APU) && 2220 !gpu_ins->mgpu_fan_enabled && 2221 adev->powerplay.pp_funcs && 2222 adev->powerplay.pp_funcs->enable_mgpu_fan_boost) { 2223 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2224 if (ret) 2225 break; 2226 2227 gpu_ins->mgpu_fan_enabled = 1; 2228 } 2229 } 2230 2231 out: 2232 mutex_unlock(&mgpu_info.mutex); 2233 2234 return ret; 2235 } 2236 2237 /** 2238 * amdgpu_device_ip_late_init - run late init for hardware IPs 2239 * 2240 * @adev: amdgpu_device pointer 2241 * 2242 * Late initialization pass for hardware IPs. 
The list of all the hardware
 * IPs that make up the asic is walked and the late_init callbacks are run.
 * late_init covers any special initialization that an IP requires
 * after all of the IPs have been initialized or something that needs to happen
 * late in the init process.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
{
	struct amdgpu_gpu_instance *gpu_instance;
	int i = 0, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		/* only blocks whose hardware came up get late_init */
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_init) {
			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
			if (r) {
				DRM_ERROR("late_init of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
		adev->ip_blocks[i].status.late_initialized = true;
	}

	/* RAS error queries are only meaningful once late init completed */
	amdgpu_ras_set_error_query_ready(adev, true);

	/* enable clock/power gating now that every IP is fully up */
	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

	amdgpu_device_fill_reset_magic(adev);

	r = amdgpu_device_enable_mgpu_fan_boost();
	if (r)
		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);


	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		mutex_lock(&mgpu_info.mutex);

		/*
		 * Reset device p-state to low as this was booted with high.
		 *
		 * This should be performed only after all devices from the same
		 * hive get initialized.
		 *
		 * However, it's unknown how many devices are in the hive in
		 * advance, as this is counted one by one during devices
		 * initializations.
		 *
		 * So, we wait for all XGMI interlinked devices initialized.
		 * This may bring some delays as those devices may come from
		 * different hives. But that should be OK.
		 */
		if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
			for (i = 0; i < mgpu_info.num_gpu; i++) {
				gpu_instance = &(mgpu_info.gpu_ins[i]);
				if (gpu_instance->adev->flags & AMD_IS_APU)
					continue;

				r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
						AMDGPU_XGMI_PSTATE_MIN);
				if (r) {
					DRM_ERROR("pstate setting failed (%d).\n", r);
					break;
				}
			}
		}

		mutex_unlock(&mgpu_info.mutex);
	}

	return 0;
}

/**
 * amdgpu_device_ip_fini - run fini for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main teardown pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
 * are run. hw_fini tears down the hardware associated with each IP
 * and sw_fini tears down any software state associated with each IP.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
{
	int i, r;

	if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
		amdgpu_virt_release_ras_err_handler_data(adev);

	amdgpu_ras_pre_fini(adev);

	if (adev->gmc.xgmi.num_physical_nodes > 1)
		amdgpu_xgmi_remove_device(adev);

	amdgpu_amdkfd_device_fini(adev);

	/* undo the gating enabled in late init before tearing down hw */
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);

	/* need to disable SMC first */
	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
			r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
			/* XXX handle errors */
			if (r) {
				DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
			}
			adev->ip_blocks[i].status.hw = false;
			break;
		}
	}

	/* hw_fini the remaining blocks in reverse init order */
	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
		if (!adev->ip_blocks[i].status.hw)
			continue;

		r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
		/* XXX handle errors */
		if (r) {
			DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
		}

		adev->ip_blocks[i].status.hw = false;
	}


	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
		if (!adev->ip_blocks[i].status.sw)
			continue;

		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
			/* free objects that live in GMC-managed memory before
			 * GMC itself is torn down */
			amdgpu_ucode_free_bo(adev);
			amdgpu_free_static_csa(&adev->virt.csa_obj);
			amdgpu_device_wb_fini(adev);
			amdgpu_device_vram_scratch_fini(adev);
			amdgpu_ib_pool_fini(adev);
		}

		r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
		/* XXX handle errors */
		if (r) {
			DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
		}
		adev->ip_blocks[i].status.sw = false;
		adev->ip_blocks[i].status.valid = false;
	}

	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
		if (!adev->ip_blocks[i].status.late_initialized)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_fini)
			adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
		adev->ip_blocks[i].status.late_initialized = false;
	}

	amdgpu_ras_fini(adev);

	if (amdgpu_sriov_vf(adev))
		if (amdgpu_virt_release_full_gpu(adev, false))
			DRM_ERROR("failed to release exclusive mode on fini\n");

	return 0;
}

/**
 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
 *
 * @work: work_struct.
 */
static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
{
	struct amdgpu_device *adev =
		container_of(work, struct amdgpu_device, delayed_init_work.work);
	int r;

	/* deferred IB ring sanity test; failure is logged, not fatal */
	r = amdgpu_ib_ring_tests(adev);
	if (r)
		DRM_ERROR("ib ring test failed (%d).\n", r);
}

/* Delayed work: re-enable GFXOFF once no request against it is pending
 * (gfx_off_req_count == 0) and it is not already enabled. */
static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
{
	struct amdgpu_device *adev =
		container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);

	mutex_lock(&adev->gfx.gfx_off_mutex);
	if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
		/* only mark GFXOFF enabled if the SMU call succeeded */
		if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
			adev->gfx.gfx_off_state = true;
	}
	mutex_unlock(&adev->gfx.gfx_off_mutex);
}

/**
 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
 *
 * @adev: amdgpu_device pointer
 *
 * Main suspend function for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked, clockgating is disabled and the
 * suspend callbacks are run. suspend puts the hardware and software state
 * in each IP into a state suitable for suspend.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
{
	int i, r;

	/* disable power/clock gating so blocks suspend from a known state */
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);

	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
		if (!adev->ip_blocks[i].status.valid)
			continue;

		/* displays are handled separately: phase1 suspends only DCE,
		 * everything else is done in phase2 */
		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
			continue;

		/* XXX handle errors */
		r = adev->ip_blocks[i].version->funcs->suspend(adev);
		if (r) {
			DRM_ERROR("suspend of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
			return r;
		}

		adev->ip_blocks[i].status.hw = false;
	}

	return 0;
}

/**
 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
 *
 * @adev: amdgpu_device pointer
 *
 * Main suspend function for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked, clockgating is disabled and the
 * suspend callbacks are run. suspend puts the hardware and software state
 * in each IP into a state suitable for suspend.
 * Returns 0 on success, negative error code on failure.
2495 */ 2496 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2497 { 2498 int i, r; 2499 2500 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2501 if (!adev->ip_blocks[i].status.valid) 2502 continue; 2503 /* displays are handled in phase1 */ 2504 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2505 continue; 2506 /* PSP lost connection when err_event_athub occurs */ 2507 if (amdgpu_ras_intr_triggered() && 2508 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2509 adev->ip_blocks[i].status.hw = false; 2510 continue; 2511 } 2512 /* XXX handle errors */ 2513 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2514 /* XXX handle errors */ 2515 if (r) { 2516 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2517 adev->ip_blocks[i].version->funcs->name, r); 2518 } 2519 adev->ip_blocks[i].status.hw = false; 2520 /* handle putting the SMC in the appropriate state */ 2521 if(!amdgpu_sriov_vf(adev)){ 2522 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2523 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 2524 if (r) { 2525 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 2526 adev->mp1_state, r); 2527 return r; 2528 } 2529 } 2530 } 2531 adev->ip_blocks[i].status.hw = false; 2532 } 2533 2534 return 0; 2535 } 2536 2537 /** 2538 * amdgpu_device_ip_suspend - run suspend for hardware IPs 2539 * 2540 * @adev: amdgpu_device pointer 2541 * 2542 * Main suspend function for hardware IPs. The list of all the hardware 2543 * IPs that make up the asic is walked, clockgating is disabled and the 2544 * suspend callbacks are run. suspend puts the hardware and software state 2545 * in each IP into a state suitable for suspend. 2546 * Returns 0 on success, negative error code on failure. 
 */
int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
{
	int r;

	/* under SR-IOV, take exclusive GPU access for the whole sequence */
	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_request_full_gpu(adev, false);

	r = amdgpu_device_ip_suspend_phase1(adev);
	if (r)
		return r;
	r = amdgpu_device_ip_suspend_phase2(adev);

	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_release_full_gpu(adev, false);

	return r;
}

/* Re-init the minimal set of IP blocks a VF needs first after a reset,
 * in the fixed order given by ip_order (GMC before everything else). */
static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
{
	int i, r;

	static enum amd_ip_block_type ip_order[] = {
		AMD_IP_BLOCK_TYPE_GMC,
		AMD_IP_BLOCK_TYPE_COMMON,
		AMD_IP_BLOCK_TYPE_PSP,
		AMD_IP_BLOCK_TYPE_IH,
	};

	for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
		int j;
		struct amdgpu_ip_block *block;

		for (j = 0; j < adev->num_ip_blocks; j++) {
			block = &adev->ip_blocks[j];

			/* every block starts over as "hw down" on this pass */
			block->status.hw = false;
			if (block->version->type != ip_order[i] ||
				!block->status.valid)
				continue;

			r = block->version->funcs->hw_init(adev);
			DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
			if (r)
				return r;
			block->status.hw = true;
		}
	}

	return 0;
}

/* Re-init the remaining VF IP blocks after the early set, in fixed order. */
static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
{
	int i, r;

	static enum amd_ip_block_type ip_order[] = {
		AMD_IP_BLOCK_TYPE_SMC,
		AMD_IP_BLOCK_TYPE_DCE,
		AMD_IP_BLOCK_TYPE_GFX,
		AMD_IP_BLOCK_TYPE_SDMA,
		AMD_IP_BLOCK_TYPE_UVD,
		AMD_IP_BLOCK_TYPE_VCE,
		AMD_IP_BLOCK_TYPE_VCN
	};

	for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
		int j;
		struct amdgpu_ip_block *block;

		for (j = 0; j < adev->num_ip_blocks; j++) {
			block = &adev->ip_blocks[j];

			if (block->version->type != ip_order[i] ||
				!block->status.valid ||
				block->status.hw)
				continue;

			/* SMC is brought back via resume, all others via hw_init */
			if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
				r = block->version->funcs->resume(adev);
			else
				r = block->version->funcs->hw_init(adev);

			DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
			if (r)
				return r;
			block->status.hw = true;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * First resume function for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the resume callbacks are run for
 * COMMON, GMC, and IH. resume puts the hardware into a functional state
 * after a suspend and updates the software state as necessary. This
 * function is also used for restoring the GPU after a GPU reset.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {

			r = adev->ip_blocks[i].version->funcs->resume(adev);
			if (r) {
				DRM_ERROR("resume of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
			adev->ip_blocks[i].status.hw = true;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Second resume function for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the resume callbacks are run for
 * all blocks except COMMON, GMC, and IH.
resume puts the hardware into a
 * functional state after a suspend and updates the software state as
 * necessary. This function is also used for restoring the GPU after a GPU
 * reset.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
			continue;
		/* COMMON/GMC/IH were resumed in phase1; PSP is skipped here
		 * (handled between the phases — see amdgpu_device_ip_resume) */
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
			continue;
		r = adev->ip_blocks[i].version->funcs->resume(adev);
		if (r) {
			DRM_ERROR("resume of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
			return r;
		}
		adev->ip_blocks[i].status.hw = true;
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main resume function for hardware IPs. The hardware IPs
 * are split into two resume functions because they
 * are also used in recovering from a GPU reset and some additional
 * steps need to be taken between them. In this case (S3/S4) they are
 * run sequentially.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
{
	int r;

	r = amdgpu_device_ip_resume_phase1(adev);
	if (r)
		return r;

	/* firmware is (re)loaded between the two phases; phase2 blocks
	 * depend on it being in place */
	r = amdgpu_device_fw_loading(adev);
	if (r)
		return r;

	r = amdgpu_device_ip_resume_phase2(adev);

	return r;
}

/**
 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
 *
 * @adev: amdgpu_device pointer
 *
 * Query the VBIOS data tables to determine if the board supports SR-IOV.
 */
static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev)) {
		if (adev->is_atom_fw) {
			if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
		} else {
			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
		}

		/* running as VF but the VBIOS has no SR-IOV support: report it */
		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
	}
}

/**
 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
 *
 * @asic_type: AMD asic type
 *
 * Check if there is DC (new modesetting infrastructure) support for an asic.
 * returns true if DC has support, false if not.
 */
bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
{
	switch (asic_type) {
#if defined(CONFIG_DRM_AMD_DC)
	case CHIP_BONAIRE:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
		/*
		 * We have systems in the wild with these ASICs that require
		 * LVDS and VGA support which is not supported with DC.
		 *
		 * Fallback to the non-DC driver here by default so as not to
		 * cause regressions.
		 */
		return amdgpu_dc > 0;
	case CHIP_HAWAII:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_VEGA10:
	case CHIP_VEGA12:
	case CHIP_VEGA20:
#if defined(CONFIG_DRM_AMD_DC_DCN)
	case CHIP_RAVEN:
	case CHIP_NAVI10:
	case CHIP_NAVI14:
	case CHIP_NAVI12:
	case CHIP_RENOIR:
#endif
#if defined(CONFIG_DRM_AMD_DC_DCN3_0)
	case CHIP_SIENNA_CICHLID:
	case CHIP_NAVY_FLOUNDER:
#endif
		return amdgpu_dc != 0;
#endif
	default:
		if (amdgpu_dc > 0)
			DRM_INFO("Display Core has been requested via kernel parameter "
					 "but isn't supported by ASIC, ignoring\n");
		return false;
	}
}

/**
 * amdgpu_device_has_dc_support - check if dc is supported
 *
 * @adev: amdgpu_device_pointer
 *
 * Returns true for supported, false for not supported
 */
bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
{
	/* DC is never used for virtual functions */
	if (amdgpu_sriov_vf(adev))
		return false;

	return amdgpu_device_asic_has_dc_support(adev->asic_type);
}


/* Work item run on every device of an XGMI hive to perform a synchronized
 * ASIC reset; the task barrier keeps all hive members in lockstep. */
static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
{
	struct amdgpu_device *adev =
		container_of(__work, struct amdgpu_device, xgmi_reset_work);
	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);

	/* It's a bug to not have a hive within this function */
	if (WARN_ON(!hive))
		return;

	/*
	 * Use task barrier to synchronize all xgmi reset works across the
	 * hive. task_barrier_enter and task_barrier_exit will block
	 * until all the threads running the xgmi reset works reach
	 * those points. task_barrier_full will do both blocks.
	 */
	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {

		task_barrier_enter(&hive->tb);
		adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev);

		if (adev->asic_reset_res)
			goto fail;

		task_barrier_exit(&hive->tb);
		adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev);

		if (adev->asic_reset_res)
			goto fail;

		if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
			adev->mmhub.funcs->reset_ras_error_count(adev);
	} else {

		task_barrier_full(&hive->tb);
		adev->asic_reset_res = amdgpu_asic_reset(adev);
	}

fail:
	if (adev->asic_reset_res)
		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
			 adev->asic_reset_res, adev->ddev->unique);
}

/* Parse the amdgpu_lockup_timeout module parameter (comma-separated list:
 * gfx,compute,sdma,video) into the per-engine job timeouts on @adev. */
static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
{
	char *input = amdgpu_lockup_timeout;
	char *timeout_setting = NULL;
	int index = 0;
	long timeout;
	int ret = 0;

	/*
	 * By default timeout for non compute jobs is 10000.
	 * And there is no timeout enforced on compute jobs.
	 * In SR-IOV or passthrough mode, timeout for compute
	 * jobs are 60000 by default.
	 */
	adev->gfx_timeout = msecs_to_jiffies(10000);
	adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
	if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
		adev->compute_timeout = msecs_to_jiffies(60000);
	else
		adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;

	if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
		while ((timeout_setting = strsep(&input, ",")) &&
				strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
			ret = kstrtol(timeout_setting, 0, &timeout);
			if (ret)
				return ret;

			/* 0 keeps the default for that engine, negative
			 * means "never time out" */
			if (timeout == 0) {
				index++;
				continue;
			} else if (timeout < 0) {
				timeout = MAX_SCHEDULE_TIMEOUT;
			} else {
				timeout = msecs_to_jiffies(timeout);
			}

			switch (index++) {
			case 0:
				adev->gfx_timeout = timeout;
				break;
			case 1:
				adev->compute_timeout = timeout;
				break;
			case 2:
				adev->sdma_timeout = timeout;
				break;
			case 3:
				adev->video_timeout = timeout;
				break;
			default:
				break;
			}
		}
		/*
		 * There is only one value specified and
		 * it should apply to all non-compute jobs.
		 */
		if (index == 1) {
			adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
			if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
				adev->compute_timeout = adev->gfx_timeout;
		}
	}

	return ret;
}

/* sysfs attributes exposed per device; registered in amdgpu_device_init() */
static const struct attribute *amdgpu_dev_attributes[] = {
	&dev_attr_product_name.attr,
	&dev_attr_product_number.attr,
	&dev_attr_serial_number.attr,
	&dev_attr_pcie_replay_count.attr,
	NULL
};

/**
 * amdgpu_device_init - initialize the driver
 *
 * @adev: amdgpu_device pointer
 * @ddev: drm dev pointer
 * @pdev: pci dev pointer
 * @flags: driver flags
 *
 * Initializes the driver info and hw (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver startup.
 */
int amdgpu_device_init(struct amdgpu_device *adev,
		       struct drm_device *ddev,
		       struct pci_dev *pdev,
		       uint32_t flags)
{
	int r, i;
	bool boco = false;
	u32 max_MBps;

	adev->shutdown = false;
	adev->dev = &pdev->dev;
	adev->ddev = ddev;
	adev->pdev = pdev;
	adev->flags = flags;

	/* the asic type can be overridden from the command line */
	if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
		adev->asic_type = amdgpu_force_asic_type;
	else
		adev->asic_type = flags & AMD_ASIC_MASK;

	adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
	if (amdgpu_emu_mode == 1)
		adev->usec_timeout *= 10;
	adev->gmc.gart_size = 512 * 1024 * 1024;
	adev->accel_working = false;
	adev->num_rings = 0;
	adev->mman.buffer_funcs = NULL;
	adev->mman.buffer_funcs_ring = NULL;
	adev->vm_manager.vm_pte_funcs = NULL;
	adev->vm_manager.vm_pte_num_scheds = 0;
	adev->gmc.gmc_funcs = NULL;
	adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
	bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);

	/* register accessors start out as "invalid"; the IP blocks install
	 * the real ones during their init */
	adev->smc_rreg = &amdgpu_invalid_rreg;
	adev->smc_wreg = &amdgpu_invalid_wreg;
	adev->pcie_rreg = &amdgpu_invalid_rreg;
	adev->pcie_wreg = &amdgpu_invalid_wreg;
	adev->pciep_rreg = &amdgpu_invalid_rreg;
	adev->pciep_wreg = &amdgpu_invalid_wreg;
	adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
	adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
	adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
	adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
	adev->didt_rreg = &amdgpu_invalid_rreg;
	adev->didt_wreg = &amdgpu_invalid_wreg;
	adev->gc_cac_rreg = &amdgpu_invalid_rreg;
	adev->gc_cac_wreg = &amdgpu_invalid_wreg;
	adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
	adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;

	DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
		 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
		 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);

	/* mutex initialization are all done here so we
	 * can recall function without having locking issues */
	atomic_set(&adev->irq.ih.lock, 0);
	mutex_init(&adev->firmware.mutex);
	mutex_init(&adev->pm.mutex);
	mutex_init(&adev->gfx.gpu_clock_mutex);
	mutex_init(&adev->srbm_mutex);
	mutex_init(&adev->gfx.pipe_reserve_mutex);
	mutex_init(&adev->gfx.gfx_off_mutex);
	mutex_init(&adev->grbm_idx_mutex);
	mutex_init(&adev->mn_lock);
	mutex_init(&adev->virt.vf_errors.lock);
	hash_init(adev->mn_hash);
	mutex_init(&adev->lock_reset);
	mutex_init(&adev->psp.mutex);
	mutex_init(&adev->notifier_lock);

	r = amdgpu_device_check_arguments(adev);
	if (r)
		return r;

	spin_lock_init(&adev->mmio_idx_lock);
	spin_lock_init(&adev->smc_idx_lock);
	spin_lock_init(&adev->pcie_idx_lock);
	spin_lock_init(&adev->uvd_ctx_idx_lock);
	spin_lock_init(&adev->didt_idx_lock);
	spin_lock_init(&adev->gc_cac_idx_lock);
	spin_lock_init(&adev->se_cac_idx_lock);
	spin_lock_init(&adev->audio_endpt_idx_lock);
	spin_lock_init(&adev->mm_stats.lock);

	INIT_LIST_HEAD(&adev->shadow_list);
	mutex_init(&adev->shadow_list_lock);

	INIT_DELAYED_WORK(&adev->delayed_init_work,
			  amdgpu_device_delayed_init_work_handler);
	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
			  amdgpu_device_delay_enable_gfx_off);

	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);

	/* start with one GFXOFF request held so it stays disabled until
	 * the delayed work releases it */
	adev->gfx.gfx_off_req_count = 1;
	adev->pm.ac_power = power_supply_is_system_supplied() > 0;

	atomic_set(&adev->throttling_logging_enabled, 1);
	/*
	 * If throttling continues, logging will be performed every minute
	 * to avoid log flooding. "-1" is subtracted since the thermal
	 * throttling interrupt comes every second. Thus, the total logging
	 * interval is 59 seconds(retelimited printk interval) + 1(waiting
	 * for throttling interrupt) = 60 seconds.
	 */
	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);

	/* Registers mapping */
	/* TODO: block userspace mapping of io register */
	if (adev->asic_type >= CHIP_BONAIRE) {
		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
	} else {
		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
	}

	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
	if (adev->rmmio == NULL) {
		return -ENOMEM;
	}
	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
	DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);

	/* io port mapping */
	for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
		if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
			adev->rio_mem_size = pci_resource_len(adev->pdev, i);
			adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
			break;
		}
	}
	if (adev->rio_mem == NULL)
		DRM_INFO("PCI I/O BAR is not found.\n");

	/* enable PCIE atomic ops */
	r = pci_enable_atomic_ops_to_root(adev->pdev,
					  PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
					  PCI_EXP_DEVCAP2_ATOMIC_COMP64);
	if (r) {
		adev->have_atomics_support = false;
		DRM_INFO("PCIE atomic ops is not supported\n");
	} else {
		adev->have_atomics_support = true;
	}

	amdgpu_device_get_pcie_info(adev);

	if (amdgpu_mcbp)
		DRM_INFO("MCBP is enabled\n");

	if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
		adev->enable_mes = true;

	/* detect hw virtualization here */
	amdgpu_detect_virtualization(adev);

	r = amdgpu_device_get_job_timeout_settings(adev);
	if (r) {
		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
		return r;
	}

	/* early init functions */
	r = amdgpu_device_ip_early_init(adev);
	if (r)
		return r;

	/* doorbell bar mapping and doorbell index init*/
	amdgpu_device_doorbell_init(adev);

	/* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
	/* this will fail for cards that aren't VGA class devices, just
	 * ignore it */
	vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);

	if (amdgpu_device_supports_boco(ddev))
		boco = true;
	if (amdgpu_has_atpx() &&
	    (amdgpu_is_atpx_hybrid() ||
	     amdgpu_has_atpx_dgpu_power_cntl()) &&
	    !pci_is_thunderbolt_attached(adev->pdev))
		vga_switcheroo_register_client(adev->pdev,
					       &amdgpu_switcheroo_ops, boco);
	if (boco)
		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);

	if (amdgpu_emu_mode == 1) {
		/* post the asic on emulation mode */
		emu_soc_asic_init(adev);
		goto fence_driver_init;
	}

	/* detect if we are with an SRIOV vbios */
	amdgpu_device_detect_sriov_bios(adev);

	/* check if we need to reset the asic
	 * E.g., driver was not cleanly unloaded previously, etc.
	 */
	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
		r = amdgpu_asic_reset(adev);
		if (r) {
			dev_err(adev->dev, "asic reset on init failed\n");
			goto failed;
		}
	}

	/* Post card if necessary */
	if (amdgpu_device_need_post(adev)) {
		if (!adev->bios) {
			dev_err(adev->dev, "no vBIOS found\n");
			r = -EINVAL;
			goto failed;
		}
		DRM_INFO("GPU posting now...\n");
		r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
		if (r) {
			dev_err(adev->dev, "gpu post error!\n");
			goto failed;
		}
	}

	if (adev->is_atom_fw) {
		/* Initialize clocks */
		r = amdgpu_atomfirmware_get_clock_info(adev);
		if (r) {
			dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
			goto failed;
		}
	} else {
		/* Initialize clocks */
		r = amdgpu_atombios_get_clock_info(adev);
		if (r) {
			dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
			goto failed;
		}
		/* init i2c buses */
		if (!amdgpu_device_has_dc_support(adev))
			amdgpu_atombios_i2c_init(adev);
	}

fence_driver_init:
	/* Fence driver */
	r = amdgpu_fence_driver_init(adev);
	if (r) {
		dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
		goto failed;
	}

	/* init the mode config */
	drm_mode_config_init(adev->ddev);

	r = amdgpu_device_ip_init(adev);
	if (r) {
		/* failed in exclusive mode due to timeout */
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    amdgpu_virt_mmio_blocked(adev) &&
		    !amdgpu_virt_wait_reset(adev)) {
			dev_err(adev->dev, "VF exclusive mode timeout\n");
			/* Don't send request since VF is inactive. */
			adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
			adev->virt.ops = NULL;
			r = -EAGAIN;
			goto failed;
		}
		dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
		goto failed;
	}

	dev_info(adev->dev,
		"SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
			adev->gfx.config.max_shader_engines,
			adev->gfx.config.max_sh_per_se,
			adev->gfx.config.max_cu_per_sh,
			adev->gfx.cu_info.number);

	adev->accel_working = true;

	amdgpu_vm_check_compute_bug(adev);

	/* Initialize the buffer migration limit. */
	if (amdgpu_moverate >= 0)
		max_MBps = amdgpu_moverate;
	else
		max_MBps = 8; /* Allow 8 MB/s. */
	/* Get a log2 for easy divisions. */
	adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));

	amdgpu_fbdev_init(adev);

	r = amdgpu_pm_sysfs_init(adev);
	if (r) {
		adev->pm_sysfs_en = false;
		DRM_ERROR("registering pm debugfs failed (%d).\n", r);
	} else
		adev->pm_sysfs_en = true;

	r = amdgpu_ucode_sysfs_init(adev);
	if (r) {
		adev->ucode_sysfs_en = false;
		DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
	} else
		adev->ucode_sysfs_en = true;

	if ((amdgpu_testing & 1)) {
		if (adev->accel_working)
			amdgpu_test_moves(adev);
		else
			DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
	}
	if (amdgpu_benchmarking) {
		if (adev->accel_working)
			amdgpu_benchmark(adev, amdgpu_benchmarking);
		else
			DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
	}

	/*
	 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
	 * Otherwise the mgpu fan boost feature will be skipped due to the
	 * gpu instance is counted less.
	 */
	amdgpu_register_gpu_instance(adev);

	/* enable clockgating, etc. after ib tests, etc. since some blocks require
	 * explicit gating rather than handling it automatically.
	 */
	r = amdgpu_device_ip_late_init(adev);
	if (r) {
		dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
		goto failed;
	}

	/* must succeed. */
	amdgpu_ras_resume(adev);

	queue_delayed_work(system_wq, &adev->delayed_init_work,
			   msecs_to_jiffies(AMDGPU_RESUME_MS));

	if (amdgpu_sriov_vf(adev))
		flush_delayed_work(&adev->delayed_init_work);

	r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
	if (r) {
		dev_err(adev->dev, "Could not create amdgpu device attr\n");
		/* NOTE(review): returns directly, bypassing the "failed:"
		 * cleanup path used by earlier errors — verify intentional */
		return r;
	}

	/* NOTE(review): when CONFIG_PERF_EVENTS is off, the following "if (r)"
	 * re-tests the (successful) sysfs_create_files result — harmless but
	 * worth confirming against upstream */
	if (IS_ENABLED(CONFIG_PERF_EVENTS))
		r = amdgpu_pmu_init(adev);
	if (r)
		dev_err(adev->dev, "amdgpu_pmu_init failed\n");

	return 0;

failed:
	amdgpu_vf_error_trans_all(adev);
	if (boco)
		vga_switcheroo_fini_domain_pm_ops(adev->dev);

	return r;
}

/**
 * amdgpu_device_fini - tear down the driver
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down the driver info (all asics).
 * Called at driver shutdown.
 */
void amdgpu_device_fini(struct amdgpu_device *adev)
{
	int r;

	DRM_INFO("amdgpu: finishing device.\n");
	/* Let any pending delayed init (IB tests etc.) finish before teardown. */
	flush_delayed_work(&adev->delayed_init_work);
	adev->shutdown = true;

	/* make sure IB test finished before entering exclusive mode
	 * to avoid preemption on IB test
	 * */
	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_request_full_gpu(adev, false);

	/* disable all interrupts */
	amdgpu_irq_disable_all(adev);
	/* Shut the displays down before tearing down the rest of the hw;
	 * legacy (non-DC) and atomic (DC) paths use different helpers. */
	if (adev->mode_info.mode_config_initialized){
		if (!amdgpu_device_has_dc_support(adev))
			drm_helper_force_disable_all(adev->ddev);
		else
			drm_atomic_helper_shutdown(adev->ddev);
	}
	amdgpu_fence_driver_fini(adev);
	if (adev->pm_sysfs_en)
		amdgpu_pm_sysfs_fini(adev);
	amdgpu_fbdev_fini(adev);
	/* Tear down all IP blocks; return code is intentionally ignored here
	 * since we are on the shutdown path and cannot recover anyway. */
	r = amdgpu_device_ip_fini(adev);
	release_firmware(adev->firmware.gpu_info_fw);
	adev->firmware.gpu_info_fw = NULL;
	adev->accel_working = false;
	/* free i2c buses */
	if (!amdgpu_device_has_dc_support(adev))
		amdgpu_i2c_fini(adev);

	if (amdgpu_emu_mode != 1)
		amdgpu_atombios_fini(adev);

	kfree(adev->bios);
	adev->bios = NULL;
	/* Unregister from vga_switcheroo only for configs that registered
	 * during init (ATPX-controlled dGPUs not behind Thunderbolt). */
	if (amdgpu_has_atpx() &&
	    (amdgpu_is_atpx_hybrid() ||
	     amdgpu_has_atpx_dgpu_power_cntl()) &&
	    !pci_is_thunderbolt_attached(adev->pdev))
		vga_switcheroo_unregister_client(adev->pdev);
	if (amdgpu_device_supports_boco(adev->ddev))
		vga_switcheroo_fini_domain_pm_ops(adev->dev);
	/* Passing NULLs drops our VGA-arbiter client callbacks. */
	vga_client_register(adev->pdev, NULL, NULL, NULL);
	if (adev->rio_mem)
		pci_iounmap(adev->pdev, adev->rio_mem);
	adev->rio_mem = NULL;
	iounmap(adev->rmmio);
	adev->rmmio = NULL;
	amdgpu_device_doorbell_fini(adev);

	if (adev->ucode_sysfs_en)
		amdgpu_ucode_sysfs_fini(adev);

	sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
	if (IS_ENABLED(CONFIG_PERF_EVENTS))
		amdgpu_pmu_fini(adev);
	if (adev->discovery_bin)
		amdgpu_discovery_fini(adev);
}


/*
 * Suspend & resume.
 */
/**
 * amdgpu_device_suspend - initiate device suspend
 *
 * @dev: drm dev pointer
 * @fbcon : notify the fbdev of suspend
 *
 * Puts the hw in the suspend state (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver suspend.
 */
int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
{
	struct amdgpu_device *adev;
	struct drm_crtc *crtc;
	struct drm_connector *connector;
	struct drm_connector_list_iter iter;
	int r;

	if (dev == NULL || dev->dev_private == NULL) {
		return -ENODEV;
	}

	adev = dev->dev_private;

	/* Nothing to do if the device is already switched off (PX/boco). */
	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
		return 0;

	adev->in_suspend = true;
	drm_kms_helper_poll_disable(dev);

	if (fbcon)
		amdgpu_fbdev_set_suspend(adev, 1);

	/* Make sure delayed init (IB tests, RAS resume) is not still queued. */
	cancel_delayed_work_sync(&adev->delayed_init_work);

	if (!amdgpu_device_has_dc_support(adev)) {
		/* turn off display hw */
		drm_modeset_lock_all(dev);
		drm_connector_list_iter_begin(dev, &iter);
		drm_for_each_connector_iter(connector, &iter)
			drm_helper_connector_dpms(connector,
						  DRM_MODE_DPMS_OFF);
		drm_connector_list_iter_end(&iter);
		drm_modeset_unlock_all(dev);
		/* unpin the front buffers and cursors */
		list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
			struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
			struct drm_framebuffer *fb = crtc->primary->fb;
			struct amdgpu_bo *robj;

			if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
				struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
				r = amdgpu_bo_reserve(aobj, true);
				if (r == 0) {
					amdgpu_bo_unpin(aobj);
					amdgpu_bo_unreserve(aobj);
				}
			}

			if (fb == NULL || fb->obj[0] == NULL) {
				continue;
			}
			robj = gem_to_amdgpu_bo(fb->obj[0]);
			/* don't unpin kernel fb objects */
			if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
				r = amdgpu_bo_reserve(robj, true);
				if (r == 0) {
					amdgpu_bo_unpin(robj);
					amdgpu_bo_unreserve(robj);
				}
			}
		}
	}

	amdgpu_ras_suspend(adev);

	/* NOTE(review): r from the phase1/phase2 suspends below is not
	 * propagated; the function always returns 0 — presumably deliberate
	 * best-effort on the suspend path, verify against callers. */
	r = amdgpu_device_ip_suspend_phase1(adev);

	amdgpu_amdkfd_suspend(adev, !fbcon);

	/* evict vram memory */
	amdgpu_bo_evict_vram(adev);

	amdgpu_fence_driver_suspend(adev);

	r = amdgpu_device_ip_suspend_phase2(adev);

	/* evict remaining vram memory
	 * This second call to evict vram is to evict the gart page table
	 * using the CPU.
	 */
	amdgpu_bo_evict_vram(adev);

	return 0;
}

/**
 * amdgpu_device_resume - initiate device resume
 *
 * @dev: drm dev pointer
 * @fbcon : notify the fbdev of resume
 *
 * Bring the hw back to operating state (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver resume.
 */
int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
{
	struct drm_connector *connector;
	struct drm_connector_list_iter iter;
	struct amdgpu_device *adev = dev->dev_private;
	struct drm_crtc *crtc;
	int r = 0;

	/* Nothing to do if the device is switched off (PX/boco). */
	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
		return 0;

	/* post card */
	if (amdgpu_device_need_post(adev)) {
		r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
		if (r)
			DRM_ERROR("amdgpu asic init failed\n");
	}

	r = amdgpu_device_ip_resume(adev);
	if (r) {
		DRM_ERROR("amdgpu_device_ip_resume failed (%d).\n", r);
		return r;
	}
	amdgpu_fence_driver_resume(adev);


	r = amdgpu_device_ip_late_init(adev);
	if (r)
		return r;

	/* Re-arm the delayed init work (IB tests etc.) after resume. */
	queue_delayed_work(system_wq, &adev->delayed_init_work,
			   msecs_to_jiffies(AMDGPU_RESUME_MS));

	if (!amdgpu_device_has_dc_support(adev)) {
		/* pin cursors */
		list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
			struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);

			if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
				struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
				r = amdgpu_bo_reserve(aobj, true);
				if (r == 0) {
					r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
					if (r != 0)
						DRM_ERROR("Failed to pin cursor BO (%d)\n", r);
					amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
					amdgpu_bo_unreserve(aobj);
				}
			}
		}
	}
	r = amdgpu_amdkfd_resume(adev, !fbcon);
	if (r)
		return r;

	/* Make sure IB tests flushed */
	flush_delayed_work(&adev->delayed_init_work);

	/* blat the mode back in */
	if (fbcon) {
		if (!amdgpu_device_has_dc_support(adev)) {
			/* pre DCE11 */
			drm_helper_resume_force_mode(dev);

			/* turn on display hw */
			drm_modeset_lock_all(dev);

			drm_connector_list_iter_begin(dev, &iter);
			drm_for_each_connector_iter(connector, &iter)
				drm_helper_connector_dpms(connector,
							  DRM_MODE_DPMS_ON);
			drm_connector_list_iter_end(&iter);

			drm_modeset_unlock_all(dev);
		}
		amdgpu_fbdev_set_suspend(adev, 0);
	}

	drm_kms_helper_poll_enable(dev);

	amdgpu_ras_resume(adev);

	/*
	 * Most of the connector probing functions try to acquire runtime pm
	 * refs to ensure that the GPU is powered on when connector polling is
	 * performed. Since we're calling this from a runtime PM callback,
	 * trying to acquire rpm refs will cause us to deadlock.
	 *
	 * Since we're guaranteed to be holding the rpm lock, it's safe to
	 * temporarily disable the rpm helpers so this doesn't deadlock us.
	 */
#ifdef CONFIG_PM
	dev->dev->power.disable_depth++;
#endif
	if (!amdgpu_device_has_dc_support(adev))
		drm_helper_hpd_irq_event(dev);
	else
		drm_kms_helper_hotplug_event(dev);
#ifdef CONFIG_PM
	dev->dev->power.disable_depth--;
#endif
	adev->in_suspend = false;

	return 0;
}

/**
 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
 *
 * @adev: amdgpu_device pointer
 *
 * The list of all the hardware IPs that make up the asic is walked and
 * the check_soft_reset callbacks are run. check_soft_reset determines
 * if the asic is still hung or not.
 * Returns true if any of the IPs are still in a hung state, false if not.
3640 */ 3641 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 3642 { 3643 int i; 3644 bool asic_hang = false; 3645 3646 if (amdgpu_sriov_vf(adev)) 3647 return true; 3648 3649 if (amdgpu_asic_need_full_reset(adev)) 3650 return true; 3651 3652 for (i = 0; i < adev->num_ip_blocks; i++) { 3653 if (!adev->ip_blocks[i].status.valid) 3654 continue; 3655 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 3656 adev->ip_blocks[i].status.hang = 3657 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 3658 if (adev->ip_blocks[i].status.hang) { 3659 DRM_INFO("IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 3660 asic_hang = true; 3661 } 3662 } 3663 return asic_hang; 3664 } 3665 3666 /** 3667 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 3668 * 3669 * @adev: amdgpu_device pointer 3670 * 3671 * The list of all the hardware IPs that make up the asic is walked and the 3672 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 3673 * handles any IP specific hardware or software state changes that are 3674 * necessary for a soft reset to succeed. 3675 * Returns 0 on success, negative error code on failure. 3676 */ 3677 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 3678 { 3679 int i, r = 0; 3680 3681 for (i = 0; i < adev->num_ip_blocks; i++) { 3682 if (!adev->ip_blocks[i].status.valid) 3683 continue; 3684 if (adev->ip_blocks[i].status.hang && 3685 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 3686 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 3687 if (r) 3688 return r; 3689 } 3690 } 3691 3692 return 0; 3693 } 3694 3695 /** 3696 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 3697 * 3698 * @adev: amdgpu_device pointer 3699 * 3700 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 3701 * reset is necessary to recover. 3702 * Returns true if a full asic reset is required, false if not. 
3703 */ 3704 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 3705 { 3706 int i; 3707 3708 if (amdgpu_asic_need_full_reset(adev)) 3709 return true; 3710 3711 for (i = 0; i < adev->num_ip_blocks; i++) { 3712 if (!adev->ip_blocks[i].status.valid) 3713 continue; 3714 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 3715 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 3716 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 3717 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 3718 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3719 if (adev->ip_blocks[i].status.hang) { 3720 DRM_INFO("Some block need full reset!\n"); 3721 return true; 3722 } 3723 } 3724 } 3725 return false; 3726 } 3727 3728 /** 3729 * amdgpu_device_ip_soft_reset - do a soft reset 3730 * 3731 * @adev: amdgpu_device pointer 3732 * 3733 * The list of all the hardware IPs that make up the asic is walked and the 3734 * soft_reset callbacks are run if the block is hung. soft_reset handles any 3735 * IP specific hardware or software state changes that are necessary to soft 3736 * reset the IP. 3737 * Returns 0 on success, negative error code on failure. 3738 */ 3739 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 3740 { 3741 int i, r = 0; 3742 3743 for (i = 0; i < adev->num_ip_blocks; i++) { 3744 if (!adev->ip_blocks[i].status.valid) 3745 continue; 3746 if (adev->ip_blocks[i].status.hang && 3747 adev->ip_blocks[i].version->funcs->soft_reset) { 3748 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 3749 if (r) 3750 return r; 3751 } 3752 } 3753 3754 return 0; 3755 } 3756 3757 /** 3758 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 3759 * 3760 * @adev: amdgpu_device pointer 3761 * 3762 * The list of all the hardware IPs that make up the asic is walked and the 3763 * post_soft_reset callbacks are run if the asic was hung. 
post_soft_reset 3764 * handles any IP specific hardware or software state changes that are 3765 * necessary after the IP has been soft reset. 3766 * Returns 0 on success, negative error code on failure. 3767 */ 3768 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 3769 { 3770 int i, r = 0; 3771 3772 for (i = 0; i < adev->num_ip_blocks; i++) { 3773 if (!adev->ip_blocks[i].status.valid) 3774 continue; 3775 if (adev->ip_blocks[i].status.hang && 3776 adev->ip_blocks[i].version->funcs->post_soft_reset) 3777 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 3778 if (r) 3779 return r; 3780 } 3781 3782 return 0; 3783 } 3784 3785 /** 3786 * amdgpu_device_recover_vram - Recover some VRAM contents 3787 * 3788 * @adev: amdgpu_device pointer 3789 * 3790 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 3791 * restore things like GPUVM page tables after a GPU reset where 3792 * the contents of VRAM might be lost. 3793 * 3794 * Returns: 3795 * 0 on success, negative error code on failure. 
3796 */ 3797 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 3798 { 3799 struct dma_fence *fence = NULL, *next = NULL; 3800 struct amdgpu_bo *shadow; 3801 long r = 1, tmo; 3802 3803 if (amdgpu_sriov_runtime(adev)) 3804 tmo = msecs_to_jiffies(8000); 3805 else 3806 tmo = msecs_to_jiffies(100); 3807 3808 DRM_INFO("recover vram bo from shadow start\n"); 3809 mutex_lock(&adev->shadow_list_lock); 3810 list_for_each_entry(shadow, &adev->shadow_list, shadow_list) { 3811 3812 /* No need to recover an evicted BO */ 3813 if (shadow->tbo.mem.mem_type != TTM_PL_TT || 3814 shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET || 3815 shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM) 3816 continue; 3817 3818 r = amdgpu_bo_restore_shadow(shadow, &next); 3819 if (r) 3820 break; 3821 3822 if (fence) { 3823 tmo = dma_fence_wait_timeout(fence, false, tmo); 3824 dma_fence_put(fence); 3825 fence = next; 3826 if (tmo == 0) { 3827 r = -ETIMEDOUT; 3828 break; 3829 } else if (tmo < 0) { 3830 r = tmo; 3831 break; 3832 } 3833 } else { 3834 fence = next; 3835 } 3836 } 3837 mutex_unlock(&adev->shadow_list_lock); 3838 3839 if (fence) 3840 tmo = dma_fence_wait_timeout(fence, false, tmo); 3841 dma_fence_put(fence); 3842 3843 if (r < 0 || tmo <= 0) { 3844 DRM_ERROR("recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 3845 return -EIO; 3846 } 3847 3848 DRM_INFO("recover vram bo from shadow done\n"); 3849 return 0; 3850 } 3851 3852 3853 /** 3854 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 3855 * 3856 * @adev: amdgpu device pointer 3857 * @from_hypervisor: request from hypervisor 3858 * 3859 * do VF FLR and reinitialize Asic 3860 * return 0 means succeeded otherwise failed 3861 */ 3862 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 3863 bool from_hypervisor) 3864 { 3865 int r; 3866 3867 if (from_hypervisor) 3868 r = amdgpu_virt_request_full_gpu(adev, true); 3869 else 3870 r = amdgpu_virt_reset_gpu(adev); 3871 if (r) 3872 return r; 3873 3874 
amdgpu_amdkfd_pre_reset(adev); 3875 3876 /* Resume IP prior to SMC */ 3877 r = amdgpu_device_ip_reinit_early_sriov(adev); 3878 if (r) 3879 goto error; 3880 3881 amdgpu_virt_init_data_exchange(adev); 3882 /* we need recover gart prior to run SMC/CP/SDMA resume */ 3883 amdgpu_gtt_mgr_recover(&adev->mman.bdev.man[TTM_PL_TT]); 3884 3885 r = amdgpu_device_fw_loading(adev); 3886 if (r) 3887 return r; 3888 3889 /* now we are okay to resume SMC/CP/SDMA */ 3890 r = amdgpu_device_ip_reinit_late_sriov(adev); 3891 if (r) 3892 goto error; 3893 3894 amdgpu_irq_gpu_reset_resume_helper(adev); 3895 r = amdgpu_ib_ring_tests(adev); 3896 amdgpu_amdkfd_post_reset(adev); 3897 3898 error: 3899 amdgpu_virt_release_full_gpu(adev, true); 3900 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 3901 amdgpu_inc_vram_lost(adev); 3902 r = amdgpu_device_recover_vram(adev); 3903 } 3904 3905 return r; 3906 } 3907 3908 /** 3909 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 3910 * 3911 * @adev: amdgpu device pointer 3912 * 3913 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 3914 * a hung GPU. 
3915 */ 3916 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 3917 { 3918 if (!amdgpu_device_ip_check_soft_reset(adev)) { 3919 DRM_INFO("Timeout, but no hardware hang detected.\n"); 3920 return false; 3921 } 3922 3923 if (amdgpu_gpu_recovery == 0) 3924 goto disabled; 3925 3926 if (amdgpu_sriov_vf(adev)) 3927 return true; 3928 3929 if (amdgpu_gpu_recovery == -1) { 3930 switch (adev->asic_type) { 3931 case CHIP_BONAIRE: 3932 case CHIP_HAWAII: 3933 case CHIP_TOPAZ: 3934 case CHIP_TONGA: 3935 case CHIP_FIJI: 3936 case CHIP_POLARIS10: 3937 case CHIP_POLARIS11: 3938 case CHIP_POLARIS12: 3939 case CHIP_VEGAM: 3940 case CHIP_VEGA20: 3941 case CHIP_VEGA10: 3942 case CHIP_VEGA12: 3943 case CHIP_RAVEN: 3944 case CHIP_ARCTURUS: 3945 case CHIP_RENOIR: 3946 case CHIP_NAVI10: 3947 case CHIP_NAVI14: 3948 case CHIP_NAVI12: 3949 case CHIP_SIENNA_CICHLID: 3950 break; 3951 default: 3952 goto disabled; 3953 } 3954 } 3955 3956 return true; 3957 3958 disabled: 3959 DRM_INFO("GPU recovery disabled.\n"); 3960 return false; 3961 } 3962 3963 3964 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 3965 struct amdgpu_job *job, 3966 bool *need_full_reset_arg) 3967 { 3968 int i, r = 0; 3969 bool need_full_reset = *need_full_reset_arg; 3970 3971 amdgpu_debugfs_wait_dump(adev); 3972 3973 /* block all schedulers and reset given job's ring */ 3974 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 3975 struct amdgpu_ring *ring = adev->rings[i]; 3976 3977 if (!ring || !ring->sched.thread) 3978 continue; 3979 3980 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 3981 amdgpu_fence_driver_force_completion(ring); 3982 } 3983 3984 if(job) 3985 drm_sched_increase_karma(&job->base); 3986 3987 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 3988 if (!amdgpu_sriov_vf(adev)) { 3989 3990 if (!need_full_reset) 3991 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 3992 3993 if (!need_full_reset) { 3994 
amdgpu_device_ip_pre_soft_reset(adev); 3995 r = amdgpu_device_ip_soft_reset(adev); 3996 amdgpu_device_ip_post_soft_reset(adev); 3997 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 3998 DRM_INFO("soft reset failed, will fallback to full reset!\n"); 3999 need_full_reset = true; 4000 } 4001 } 4002 4003 if (need_full_reset) 4004 r = amdgpu_device_ip_suspend(adev); 4005 4006 *need_full_reset_arg = need_full_reset; 4007 } 4008 4009 return r; 4010 } 4011 4012 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, 4013 struct list_head *device_list_handle, 4014 bool *need_full_reset_arg) 4015 { 4016 struct amdgpu_device *tmp_adev = NULL; 4017 bool need_full_reset = *need_full_reset_arg, vram_lost = false; 4018 int r = 0; 4019 4020 /* 4021 * ASIC reset has to be done on all HGMI hive nodes ASAP 4022 * to allow proper links negotiation in FW (within 1 sec) 4023 */ 4024 if (need_full_reset) { 4025 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4026 /* For XGMI run all resets in parallel to speed up the process */ 4027 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4028 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4029 r = -EALREADY; 4030 } else 4031 r = amdgpu_asic_reset(tmp_adev); 4032 4033 if (r) { 4034 DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s", 4035 r, tmp_adev->ddev->unique); 4036 break; 4037 } 4038 } 4039 4040 /* For XGMI wait for all resets to complete before proceed */ 4041 if (!r) { 4042 list_for_each_entry(tmp_adev, device_list_handle, 4043 gmc.xgmi.head) { 4044 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4045 flush_work(&tmp_adev->xgmi_reset_work); 4046 r = tmp_adev->asic_reset_res; 4047 if (r) 4048 break; 4049 } 4050 } 4051 } 4052 } 4053 4054 if (!r && amdgpu_ras_intr_triggered()) { 4055 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4056 if (tmp_adev->mmhub.funcs && 4057 tmp_adev->mmhub.funcs->reset_ras_error_count) 4058 
tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev); 4059 } 4060 4061 amdgpu_ras_intr_cleared(); 4062 } 4063 4064 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4065 if (need_full_reset) { 4066 /* post card */ 4067 if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context)) 4068 DRM_WARN("asic atom init failed!"); 4069 4070 if (!r) { 4071 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4072 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4073 if (r) 4074 goto out; 4075 4076 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4077 if (vram_lost) { 4078 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4079 amdgpu_inc_vram_lost(tmp_adev); 4080 } 4081 4082 r = amdgpu_gtt_mgr_recover( 4083 &tmp_adev->mman.bdev.man[TTM_PL_TT]); 4084 if (r) 4085 goto out; 4086 4087 r = amdgpu_device_fw_loading(tmp_adev); 4088 if (r) 4089 return r; 4090 4091 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4092 if (r) 4093 goto out; 4094 4095 if (vram_lost) 4096 amdgpu_device_fill_reset_magic(tmp_adev); 4097 4098 /* 4099 * Add this ASIC as tracked as reset was already 4100 * complete successfully. 4101 */ 4102 amdgpu_register_gpu_instance(tmp_adev); 4103 4104 r = amdgpu_device_ip_late_init(tmp_adev); 4105 if (r) 4106 goto out; 4107 4108 amdgpu_fbdev_set_suspend(tmp_adev, 0); 4109 4110 /* must succeed. 
*/ 4111 amdgpu_ras_resume(tmp_adev); 4112 4113 /* Update PSP FW topology after reset */ 4114 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4115 r = amdgpu_xgmi_update_topology(hive, tmp_adev); 4116 } 4117 } 4118 4119 4120 out: 4121 if (!r) { 4122 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 4123 r = amdgpu_ib_ring_tests(tmp_adev); 4124 if (r) { 4125 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 4126 r = amdgpu_device_ip_suspend(tmp_adev); 4127 need_full_reset = true; 4128 r = -EAGAIN; 4129 goto end; 4130 } 4131 } 4132 4133 if (!r) 4134 r = amdgpu_device_recover_vram(tmp_adev); 4135 else 4136 tmp_adev->asic_reset_res = r; 4137 } 4138 4139 end: 4140 *need_full_reset_arg = need_full_reset; 4141 return r; 4142 } 4143 4144 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock) 4145 { 4146 if (trylock) { 4147 if (!mutex_trylock(&adev->lock_reset)) 4148 return false; 4149 } else 4150 mutex_lock(&adev->lock_reset); 4151 4152 atomic_inc(&adev->gpu_reset_counter); 4153 adev->in_gpu_reset = true; 4154 switch (amdgpu_asic_reset_method(adev)) { 4155 case AMD_RESET_METHOD_MODE1: 4156 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 4157 break; 4158 case AMD_RESET_METHOD_MODE2: 4159 adev->mp1_state = PP_MP1_STATE_RESET; 4160 break; 4161 default: 4162 adev->mp1_state = PP_MP1_STATE_NONE; 4163 break; 4164 } 4165 4166 return true; 4167 } 4168 4169 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) 4170 { 4171 amdgpu_vf_error_trans_all(adev); 4172 adev->mp1_state = PP_MP1_STATE_NONE; 4173 adev->in_gpu_reset = false; 4174 mutex_unlock(&adev->lock_reset); 4175 } 4176 4177 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 4178 { 4179 struct pci_dev *p = NULL; 4180 4181 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4182 adev->pdev->bus->number, 1); 4183 if (p) { 4184 pm_runtime_enable(&(p->dev)); 4185 pm_runtime_resume(&(p->dev)); 4186 } 4187 } 4188 4189 static int 
amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 4190 { 4191 enum amd_reset_method reset_method; 4192 struct pci_dev *p = NULL; 4193 u64 expires; 4194 4195 /* 4196 * For now, only BACO and mode1 reset are confirmed 4197 * to suffer the audio issue without proper suspended. 4198 */ 4199 reset_method = amdgpu_asic_reset_method(adev); 4200 if ((reset_method != AMD_RESET_METHOD_BACO) && 4201 (reset_method != AMD_RESET_METHOD_MODE1)) 4202 return -EINVAL; 4203 4204 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4205 adev->pdev->bus->number, 1); 4206 if (!p) 4207 return -ENODEV; 4208 4209 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 4210 if (!expires) 4211 /* 4212 * If we cannot get the audio device autosuspend delay, 4213 * a fixed 4S interval will be used. Considering 3S is 4214 * the audio controller default autosuspend delay setting. 4215 * 4S used here is guaranteed to cover that. 4216 */ 4217 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 4218 4219 while (!pm_runtime_status_suspended(&(p->dev))) { 4220 if (!pm_runtime_suspend(&(p->dev))) 4221 break; 4222 4223 if (expires < ktime_get_mono_fast_ns()) { 4224 dev_warn(adev->dev, "failed to suspend display audio\n"); 4225 /* TODO: abort the succeeding gpu reset? */ 4226 return -ETIMEDOUT; 4227 } 4228 } 4229 4230 pm_runtime_disable(&(p->dev)); 4231 4232 return 0; 4233 } 4234 4235 /** 4236 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 4237 * 4238 * @adev: amdgpu device pointer 4239 * @job: which job trigger hang 4240 * 4241 * Attempt to reset the GPU if it has hung (all asics). 4242 * Attempt to do soft-reset or full-reset and reinitialize Asic 4243 * Returns 0 for success or an error on failure. 
4244 */ 4245 4246 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 4247 struct amdgpu_job *job) 4248 { 4249 struct list_head device_list, *device_list_handle = NULL; 4250 bool need_full_reset = false; 4251 bool job_signaled = false; 4252 struct amdgpu_hive_info *hive = NULL; 4253 struct amdgpu_device *tmp_adev = NULL; 4254 int i, r = 0; 4255 bool need_emergency_restart = false; 4256 bool audio_suspended = false; 4257 4258 /** 4259 * Special case: RAS triggered and full reset isn't supported 4260 */ 4261 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 4262 4263 /* 4264 * Flush RAM to disk so that after reboot 4265 * the user can read log and see why the system rebooted. 4266 */ 4267 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 4268 DRM_WARN("Emergency reboot."); 4269 4270 ksys_sync_helper(); 4271 emergency_restart(); 4272 } 4273 4274 dev_info(adev->dev, "GPU %s begin!\n", 4275 need_emergency_restart ? "jobs stop":"reset"); 4276 4277 /* 4278 * Here we trylock to avoid chain of resets executing from 4279 * either trigger by jobs on different adevs in XGMI hive or jobs on 4280 * different schedulers for same device while this TO handler is running. 4281 * We always reset all schedulers for device and all devices for XGMI 4282 * hive so that should take care of them too. 4283 */ 4284 hive = amdgpu_get_xgmi_hive(adev, true); 4285 if (hive && !mutex_trylock(&hive->reset_lock)) { 4286 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", 4287 job ? job->base.id : -1, hive->hive_id); 4288 mutex_unlock(&hive->hive_lock); 4289 return 0; 4290 } 4291 4292 /* 4293 * Build list of devices to reset. 4294 * In case we are in XGMI hive mode, resort the device list 4295 * to put adev in the 1st position. 
4296 */ 4297 INIT_LIST_HEAD(&device_list); 4298 if (adev->gmc.xgmi.num_physical_nodes > 1) { 4299 if (!hive) 4300 return -ENODEV; 4301 if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list)) 4302 list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list); 4303 device_list_handle = &hive->device_list; 4304 } else { 4305 list_add_tail(&adev->gmc.xgmi.head, &device_list); 4306 device_list_handle = &device_list; 4307 } 4308 4309 /* block all schedulers and reset given job's ring */ 4310 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4311 if (!amdgpu_device_lock_adev(tmp_adev, !hive)) { 4312 DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress", 4313 job ? job->base.id : -1); 4314 mutex_unlock(&hive->hive_lock); 4315 return 0; 4316 } 4317 4318 /* 4319 * Try to put the audio codec into suspend state 4320 * before gpu reset started. 4321 * 4322 * Due to the power domain of the graphics device 4323 * is shared with AZ power domain. Without this, 4324 * we may change the audio hardware from behind 4325 * the audio driver's back. That will trigger 4326 * some audio codec errors. 
4327 */ 4328 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 4329 audio_suspended = true; 4330 4331 amdgpu_ras_set_error_query_ready(tmp_adev, false); 4332 4333 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 4334 4335 if (!amdgpu_sriov_vf(tmp_adev)) 4336 amdgpu_amdkfd_pre_reset(tmp_adev); 4337 4338 /* 4339 * Mark these ASICs to be reseted as untracked first 4340 * And add them back after reset completed 4341 */ 4342 amdgpu_unregister_gpu_instance(tmp_adev); 4343 4344 amdgpu_fbdev_set_suspend(tmp_adev, 1); 4345 4346 /* disable ras on ALL IPs */ 4347 if (!need_emergency_restart && 4348 amdgpu_device_ip_need_full_reset(tmp_adev)) 4349 amdgpu_ras_suspend(tmp_adev); 4350 4351 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4352 struct amdgpu_ring *ring = tmp_adev->rings[i]; 4353 4354 if (!ring || !ring->sched.thread) 4355 continue; 4356 4357 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 4358 4359 if (need_emergency_restart) 4360 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 4361 } 4362 } 4363 4364 if (need_emergency_restart) 4365 goto skip_sched_resume; 4366 4367 /* 4368 * Must check guilty signal here since after this point all old 4369 * HW fences are force signaled. 4370 * 4371 * job->base holds a reference to parent fence 4372 */ 4373 if (job && job->base.s_fence->parent && 4374 dma_fence_is_signaled(job->base.s_fence->parent)) { 4375 job_signaled = true; 4376 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 4377 goto skip_hw_reset; 4378 } 4379 4380 retry: /* Rest of adevs pre asic reset from XGMI hive. 
*/ 4381 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4382 r = amdgpu_device_pre_asic_reset(tmp_adev, 4383 NULL, 4384 &need_full_reset); 4385 /*TODO Should we stop ?*/ 4386 if (r) { 4387 DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ", 4388 r, tmp_adev->ddev->unique); 4389 tmp_adev->asic_reset_res = r; 4390 } 4391 } 4392 4393 /* Actual ASIC resets if needed.*/ 4394 /* TODO Implement XGMI hive reset logic for SRIOV */ 4395 if (amdgpu_sriov_vf(adev)) { 4396 r = amdgpu_device_reset_sriov(adev, job ? false : true); 4397 if (r) 4398 adev->asic_reset_res = r; 4399 } else { 4400 r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset); 4401 if (r && r == -EAGAIN) 4402 goto retry; 4403 } 4404 4405 skip_hw_reset: 4406 4407 /* Post ASIC reset for all devs .*/ 4408 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4409 4410 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4411 struct amdgpu_ring *ring = tmp_adev->rings[i]; 4412 4413 if (!ring || !ring->sched.thread) 4414 continue; 4415 4416 /* No point to resubmit jobs if we didn't HW reset*/ 4417 if (!tmp_adev->asic_reset_res && !job_signaled) 4418 drm_sched_resubmit_jobs(&ring->sched); 4419 4420 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); 4421 } 4422 4423 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) { 4424 drm_helper_resume_force_mode(tmp_adev->ddev); 4425 } 4426 4427 tmp_adev->asic_reset_res = 0; 4428 4429 if (r) { 4430 /* bad news, how to tell it to userspace ? 
*/ 4431 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 4432 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 4433 } else { 4434 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 4435 } 4436 } 4437 4438 skip_sched_resume: 4439 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4440 /*unlock kfd: SRIOV would do it separately */ 4441 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 4442 amdgpu_amdkfd_post_reset(tmp_adev); 4443 if (audio_suspended) 4444 amdgpu_device_resume_display_audio(tmp_adev); 4445 amdgpu_device_unlock_adev(tmp_adev); 4446 } 4447 4448 if (hive) { 4449 mutex_unlock(&hive->reset_lock); 4450 mutex_unlock(&hive->hive_lock); 4451 } 4452 4453 if (r) 4454 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 4455 return r; 4456 } 4457 4458 /** 4459 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 4460 * 4461 * @adev: amdgpu_device pointer 4462 * 4463 * Fetchs and stores in the driver the PCIE capabilities (gen speed 4464 * and lanes) of the slot the device is in. Handles APUs and 4465 * virtualized environments where PCIE config space may not be available. 
4466 */ 4467 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 4468 { 4469 struct pci_dev *pdev; 4470 enum pci_bus_speed speed_cap, platform_speed_cap; 4471 enum pcie_link_width platform_link_width; 4472 4473 if (amdgpu_pcie_gen_cap) 4474 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 4475 4476 if (amdgpu_pcie_lane_cap) 4477 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 4478 4479 /* covers APUs as well */ 4480 if (pci_is_root_bus(adev->pdev->bus)) { 4481 if (adev->pm.pcie_gen_mask == 0) 4482 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 4483 if (adev->pm.pcie_mlw_mask == 0) 4484 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 4485 return; 4486 } 4487 4488 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 4489 return; 4490 4491 pcie_bandwidth_available(adev->pdev, NULL, 4492 &platform_speed_cap, &platform_link_width); 4493 4494 if (adev->pm.pcie_gen_mask == 0) { 4495 /* asic caps */ 4496 pdev = adev->pdev; 4497 speed_cap = pcie_get_speed_cap(pdev); 4498 if (speed_cap == PCI_SPEED_UNKNOWN) { 4499 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4500 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4501 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 4502 } else { 4503 if (speed_cap == PCIE_SPEED_16_0GT) 4504 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4505 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4506 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 4507 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 4508 else if (speed_cap == PCIE_SPEED_8_0GT) 4509 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4510 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4511 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 4512 else if (speed_cap == PCIE_SPEED_5_0GT) 4513 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4514 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 4515 else 4516 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 4517 } 4518 /* platform caps */ 4519 if (platform_speed_cap == 
PCI_SPEED_UNKNOWN) { 4520 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4521 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 4522 } else { 4523 if (platform_speed_cap == PCIE_SPEED_16_0GT) 4524 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4525 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4526 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 4527 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 4528 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 4529 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4530 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4531 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 4532 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 4533 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4534 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 4535 else 4536 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 4537 4538 } 4539 } 4540 if (adev->pm.pcie_mlw_mask == 0) { 4541 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 4542 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 4543 } else { 4544 switch (platform_link_width) { 4545 case PCIE_LNK_X32: 4546 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 4547 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 4548 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 4549 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4550 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4551 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4552 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4553 break; 4554 case PCIE_LNK_X16: 4555 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 4556 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 4557 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4558 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4559 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4560 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4561 break; 4562 case PCIE_LNK_X12: 4563 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 4564 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4565 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4566 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4567 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4568 break; 4569 case PCIE_LNK_X8: 4570 
adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4571 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4572 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4573 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4574 break; 4575 case PCIE_LNK_X4: 4576 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4577 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4578 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4579 break; 4580 case PCIE_LNK_X2: 4581 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4582 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4583 break; 4584 case PCIE_LNK_X1: 4585 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 4586 break; 4587 default: 4588 break; 4589 } 4590 } 4591 } 4592 } 4593 4594 int amdgpu_device_baco_enter(struct drm_device *dev) 4595 { 4596 struct amdgpu_device *adev = dev->dev_private; 4597 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 4598 4599 if (!amdgpu_device_supports_baco(adev->ddev)) 4600 return -ENOTSUPP; 4601 4602 if (ras && ras->supported) 4603 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 4604 4605 return amdgpu_dpm_baco_enter(adev); 4606 } 4607 4608 int amdgpu_device_baco_exit(struct drm_device *dev) 4609 { 4610 struct amdgpu_device *adev = dev->dev_private; 4611 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 4612 int ret = 0; 4613 4614 if (!amdgpu_device_supports_baco(adev->ddev)) 4615 return -ENOTSUPP; 4616 4617 ret = amdgpu_dpm_baco_exit(adev); 4618 if (ret) 4619 return ret; 4620 4621 if (ras && ras->supported) 4622 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 4623 4624 return 0; 4625 } 4626