1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 34 #include <drm/drm_atomic_helper.h> 35 #include <drm/drm_probe_helper.h> 36 #include <drm/amdgpu_drm.h> 37 #include <linux/vgaarb.h> 38 #include <linux/vga_switcheroo.h> 39 #include <linux/efi.h> 40 #include "amdgpu.h" 41 #include "amdgpu_trace.h" 42 #include "amdgpu_i2c.h" 43 #include "atom.h" 44 #include "amdgpu_atombios.h" 45 #include "amdgpu_atomfirmware.h" 46 #include "amd_pcie.h" 47 #ifdef CONFIG_DRM_AMDGPU_SI 48 #include "si.h" 49 #endif 50 #ifdef CONFIG_DRM_AMDGPU_CIK 51 #include "cik.h" 52 #endif 53 #include "vi.h" 54 #include "soc15.h" 55 #include "nv.h" 56 #include "bif/bif_4_1_d.h" 57 #include <linux/pci.h> 58 #include <linux/firmware.h> 59 #include "amdgpu_vf_error.h" 60 61 #include "amdgpu_amdkfd.h" 62 #include "amdgpu_pm.h" 63 64 #include "amdgpu_xgmi.h" 65 #include "amdgpu_ras.h" 66 #include "amdgpu_pmu.h" 67 #include "amdgpu_fru_eeprom.h" 68 69 #include <linux/suspend.h> 70 #include <drm/task_barrier.h> 71 #include <linux/pm_runtime.h> 72 73 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 74 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 75 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 76 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 77 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 78 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 79 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin"); 80 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin"); 81 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin"); 82 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 83 MODULE_FIRMWARE("amdgpu/sienna_cichlid_gpu_info.bin"); 84 MODULE_FIRMWARE("amdgpu/navy_flounder_gpu_info.bin"); 85 86 #define AMDGPU_RESUME_MS 2000 87 88 const char *amdgpu_asic_name[] = { 89 "TAHITI", 90 "PITCAIRN", 91 "VERDE", 92 "OLAND", 93 "HAINAN", 94 "BONAIRE", 95 "KAVERI", 96 "KABINI", 97 "HAWAII", 98 "MULLINS", 99 "TOPAZ", 100 "TONGA", 101 "FIJI", 102 "CARRIZO", 103 "STONEY", 104 "POLARIS10", 105 "POLARIS11", 106 "POLARIS12", 107 "VEGAM", 108 "VEGA10", 109 "VEGA12", 110 "VEGA20", 111 "RAVEN", 112 "ARCTURUS", 113 "RENOIR", 114 "NAVI10", 115 "NAVI14", 116 "NAVI12", 117 "SIENNA_CICHLID", 118 "NAVY_FLOUNDER", 
119 "LAST", 120 }; 121 122 /** 123 * DOC: pcie_replay_count 124 * 125 * The amdgpu driver provides a sysfs API for reporting the total number 126 * of PCIe replays (NAKs) 127 * The file pcie_replay_count is used for this and returns the total 128 * number of replays as a sum of the NAKs generated and NAKs received 129 */ 130 131 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 132 struct device_attribute *attr, char *buf) 133 { 134 struct drm_device *ddev = dev_get_drvdata(dev); 135 struct amdgpu_device *adev = ddev->dev_private; 136 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 137 138 return snprintf(buf, PAGE_SIZE, "%llu\n", cnt); 139 } 140 141 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 142 amdgpu_device_get_pcie_replay_count, NULL); 143 144 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 145 146 /** 147 * DOC: product_name 148 * 149 * The amdgpu driver provides a sysfs API for reporting the product name 150 * for the device 151 * The file serial_number is used for this and returns the product name 152 * as returned from the FRU. 153 * NOTE: This is only available for certain server cards 154 */ 155 156 static ssize_t amdgpu_device_get_product_name(struct device *dev, 157 struct device_attribute *attr, char *buf) 158 { 159 struct drm_device *ddev = dev_get_drvdata(dev); 160 struct amdgpu_device *adev = ddev->dev_private; 161 162 return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name); 163 } 164 165 static DEVICE_ATTR(product_name, S_IRUGO, 166 amdgpu_device_get_product_name, NULL); 167 168 /** 169 * DOC: product_number 170 * 171 * The amdgpu driver provides a sysfs API for reporting the part number 172 * for the device 173 * The file serial_number is used for this and returns the part number 174 * as returned from the FRU. 175 * NOTE: This is only available for certain server cards 176 */ 177 178 static ssize_t amdgpu_device_get_product_number(struct device *dev, 179 struct device_attribute *attr, char *buf) 180 { 181 struct drm_device *ddev = dev_get_drvdata(dev); 182 struct amdgpu_device *adev = ddev->dev_private; 183 184 return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number); 185 } 186 187 static DEVICE_ATTR(product_number, S_IRUGO, 188 amdgpu_device_get_product_number, NULL); 189 190 /** 191 * DOC: serial_number 192 * 193 * The amdgpu driver provides a sysfs API for reporting the serial number 194 * for the device 195 * The file serial_number is used for this and returns the serial number 196 * as returned from the FRU. 197 * NOTE: This is only available for certain server cards 198 */ 199 200 static ssize_t amdgpu_device_get_serial_number(struct device *dev, 201 struct device_attribute *attr, char *buf) 202 { 203 struct drm_device *ddev = dev_get_drvdata(dev); 204 struct amdgpu_device *adev = ddev->dev_private; 205 206 return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial); 207 } 208 209 static DEVICE_ATTR(serial_number, S_IRUGO, 210 amdgpu_device_get_serial_number, NULL); 211 212 /** 213 * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control 214 * 215 * @dev: drm_device pointer 216 * 217 * Returns true if the device is a dGPU with HG/PX power control, 218 * otherwise return false. 
219 */ 220 bool amdgpu_device_supports_boco(struct drm_device *dev) 221 { 222 struct amdgpu_device *adev = dev->dev_private; 223 224 if (adev->flags & AMD_IS_PX) 225 return true; 226 return false; 227 } 228 229 /** 230 * amdgpu_device_supports_baco - Does the device support BACO 231 * 232 * @dev: drm_device pointer 233 * 234 * Returns true if the device supporte BACO, 235 * otherwise return false. 236 */ 237 bool amdgpu_device_supports_baco(struct drm_device *dev) 238 { 239 struct amdgpu_device *adev = dev->dev_private; 240 241 return amdgpu_asic_supports_baco(adev); 242 } 243 244 /** 245 * VRAM access helper functions. 246 * 247 * amdgpu_device_vram_access - read/write a buffer in vram 248 * 249 * @adev: amdgpu_device pointer 250 * @pos: offset of the buffer in vram 251 * @buf: virtual address of the buffer in system memory 252 * @size: read/write size, sizeof(@buf) must > @size 253 * @write: true - write to vram, otherwise - read from vram 254 */ 255 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, 256 uint32_t *buf, size_t size, bool write) 257 { 258 unsigned long flags; 259 uint32_t hi = ~0; 260 uint64_t last; 261 262 263 #ifdef CONFIG_64BIT 264 last = min(pos + size, adev->gmc.visible_vram_size); 265 if (last > pos) { 266 void __iomem *addr = adev->mman.aper_base_kaddr + pos; 267 size_t count = last - pos; 268 269 if (write) { 270 memcpy_toio(addr, buf, count); 271 mb(); 272 amdgpu_asic_flush_hdp(adev, NULL); 273 } else { 274 amdgpu_asic_invalidate_hdp(adev, NULL); 275 mb(); 276 memcpy_fromio(buf, addr, count); 277 } 278 279 if (count == size) 280 return; 281 282 pos += count; 283 buf += count / 4; 284 size -= count; 285 } 286 #endif 287 288 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 289 for (last = pos + size; pos < last; pos += 4) { 290 uint32_t tmp = pos >> 31; 291 292 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); 293 if (tmp != hi) { 294 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp); 295 hi = tmp; 296 } 297 if (write) 298 WREG32_NO_KIQ(mmMM_DATA, *buf++); 299 else 300 *buf++ = RREG32_NO_KIQ(mmMM_DATA); 301 } 302 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 303 } 304 305 /* 306 * MMIO register access helper functions. 307 */ 308 /** 309 * amdgpu_mm_rreg - read a memory mapped IO register 310 * 311 * @adev: amdgpu_device pointer 312 * @reg: dword aligned register offset 313 * @acc_flags: access flags which require special behavior 314 * 315 * Returns the 32 bit value from the offset specified. 316 */ 317 uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg, 318 uint32_t acc_flags) 319 { 320 uint32_t ret; 321 322 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)) 323 return amdgpu_kiq_rreg(adev, reg); 324 325 if ((reg * 4) < adev->rmmio_size) 326 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 327 else { 328 unsigned long flags; 329 330 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 331 writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4)); 332 ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4)); 333 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 334 } 335 trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret); 336 return ret; 337 } 338 339 /* 340 * MMIO register read with bytes helper functions 341 * @offset:bytes offset from MMIO start 342 * 343 */ 344 345 /** 346 * amdgpu_mm_rreg8 - read a memory mapped IO register 347 * 348 * @adev: amdgpu_device pointer 349 * @offset: byte aligned register offset 350 * 351 * Returns the 8 bit value from the offset specified. 
352 */ 353 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) { 354 if (offset < adev->rmmio_size) 355 return (readb(adev->rmmio + offset)); 356 BUG(); 357 } 358 359 /* 360 * MMIO register write with bytes helper functions 361 * @offset:bytes offset from MMIO start 362 * @value: the value want to be written to the register 363 * 364 */ 365 /** 366 * amdgpu_mm_wreg8 - read a memory mapped IO register 367 * 368 * @adev: amdgpu_device pointer 369 * @offset: byte aligned register offset 370 * @value: 8 bit value to write 371 * 372 * Writes the value specified to the offset specified. 373 */ 374 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) { 375 if (offset < adev->rmmio_size) 376 writeb(value, adev->rmmio + offset); 377 else 378 BUG(); 379 } 380 381 void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags) 382 { 383 trace_amdgpu_mm_wreg(adev->pdev->device, reg, v); 384 385 if ((reg * 4) < adev->rmmio_size) 386 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 387 else { 388 unsigned long flags; 389 390 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 391 writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4)); 392 writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4)); 393 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 394 } 395 } 396 397 /** 398 * amdgpu_mm_wreg - write to a memory mapped IO register 399 * 400 * @adev: amdgpu_device pointer 401 * @reg: dword aligned register offset 402 * @v: 32 bit value to write to the register 403 * @acc_flags: access flags which require special behavior 404 * 405 * Writes the value specified to the offset specified. 406 */ 407 void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, 408 uint32_t acc_flags) 409 { 410 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)) 411 return amdgpu_kiq_wreg(adev, reg, v); 412 413 amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags); 414 } 415 416 /* 417 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range 418 * 419 * this function is invoked only the debugfs register access 420 * */ 421 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v, 422 uint32_t acc_flags) 423 { 424 if (amdgpu_sriov_fullaccess(adev) && 425 adev->gfx.rlc.funcs && 426 adev->gfx.rlc.funcs->is_rlcg_access_range) { 427 428 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 429 return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v); 430 } 431 432 amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags); 433 } 434 435 /** 436 * amdgpu_io_rreg - read an IO register 437 * 438 * @adev: amdgpu_device pointer 439 * @reg: dword aligned register offset 440 * 441 * Returns the 32 bit value from the offset specified. 442 */ 443 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg) 444 { 445 if ((reg * 4) < adev->rio_mem_size) 446 return ioread32(adev->rio_mem + (reg * 4)); 447 else { 448 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4)); 449 return ioread32(adev->rio_mem + (mmMM_DATA * 4)); 450 } 451 } 452 453 /** 454 * amdgpu_io_wreg - write to an IO register 455 * 456 * @adev: amdgpu_device pointer 457 * @reg: dword aligned register offset 458 * @v: 32 bit value to write to the register 459 * 460 * Writes the value specified to the offset specified. 
461 */ 462 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v) 463 { 464 if ((reg * 4) < adev->rio_mem_size) 465 iowrite32(v, adev->rio_mem + (reg * 4)); 466 else { 467 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4)); 468 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4)); 469 } 470 } 471 472 /** 473 * amdgpu_mm_rdoorbell - read a doorbell dword 474 * 475 * @adev: amdgpu_device pointer 476 * @index: doorbell index 477 * 478 * Returns the value in the doorbell aperture at the 479 * requested doorbell index (CIK). 480 */ 481 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) 482 { 483 if (index < adev->doorbell.num_doorbells) { 484 return readl(adev->doorbell.ptr + index); 485 } else { 486 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 487 return 0; 488 } 489 } 490 491 /** 492 * amdgpu_mm_wdoorbell - write a doorbell dword 493 * 494 * @adev: amdgpu_device pointer 495 * @index: doorbell index 496 * @v: value to write 497 * 498 * Writes @v to the doorbell aperture at the 499 * requested doorbell index (CIK). 500 */ 501 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) 502 { 503 if (index < adev->doorbell.num_doorbells) { 504 writel(v, adev->doorbell.ptr + index); 505 } else { 506 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 507 } 508 } 509 510 /** 511 * amdgpu_mm_rdoorbell64 - read a doorbell Qword 512 * 513 * @adev: amdgpu_device pointer 514 * @index: doorbell index 515 * 516 * Returns the value in the doorbell aperture at the 517 * requested doorbell index (VEGA10+). 518 */ 519 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) 520 { 521 if (index < adev->doorbell.num_doorbells) { 522 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); 523 } else { 524 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 525 return 0; 526 } 527 } 528 529 /** 530 * amdgpu_mm_wdoorbell64 - write a doorbell Qword 531 * 532 * @adev: amdgpu_device pointer 533 * @index: doorbell index 534 * @v: value to write 535 * 536 * Writes @v to the doorbell aperture at the 537 * requested doorbell index (VEGA10+). 538 */ 539 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) 540 { 541 if (index < adev->doorbell.num_doorbells) { 542 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); 543 } else { 544 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 545 } 546 } 547 548 /** 549 * amdgpu_invalid_rreg - dummy reg read function 550 * 551 * @adev: amdgpu device pointer 552 * @reg: offset of register 553 * 554 * Dummy register read function. Used for register blocks 555 * that certain asics don't have (all asics). 556 * Returns the value in the register. 557 */ 558 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 559 { 560 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 561 BUG(); 562 return 0; 563 } 564 565 /** 566 * amdgpu_invalid_wreg - dummy reg write function 567 * 568 * @adev: amdgpu device pointer 569 * @reg: offset of register 570 * @v: value to write to the register 571 * 572 * Dummy register read function. Used for register blocks 573 * that certain asics don't have (all asics). 
574 */ 575 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 576 { 577 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 578 reg, v); 579 BUG(); 580 } 581 582 /** 583 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 584 * 585 * @adev: amdgpu device pointer 586 * @reg: offset of register 587 * 588 * Dummy register read function. Used for register blocks 589 * that certain asics don't have (all asics). 590 * Returns the value in the register. 591 */ 592 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 593 { 594 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 595 BUG(); 596 return 0; 597 } 598 599 /** 600 * amdgpu_invalid_wreg64 - dummy reg write function 601 * 602 * @adev: amdgpu device pointer 603 * @reg: offset of register 604 * @v: value to write to the register 605 * 606 * Dummy register read function. Used for register blocks 607 * that certain asics don't have (all asics). 608 */ 609 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 610 { 611 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 612 reg, v); 613 BUG(); 614 } 615 616 /** 617 * amdgpu_block_invalid_rreg - dummy reg read function 618 * 619 * @adev: amdgpu device pointer 620 * @block: offset of instance 621 * @reg: offset of register 622 * 623 * Dummy register read function. Used for register blocks 624 * that certain asics don't have (all asics). 625 * Returns the value in the register. 626 */ 627 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 628 uint32_t block, uint32_t reg) 629 { 630 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 631 reg, block); 632 BUG(); 633 return 0; 634 } 635 636 /** 637 * amdgpu_block_invalid_wreg - dummy reg write function 638 * 639 * @adev: amdgpu device pointer 640 * @block: offset of instance 641 * @reg: offset of register 642 * @v: value to write to the register 643 * 644 * Dummy register read function. Used for register blocks 645 * that certain asics don't have (all asics). 646 */ 647 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 648 uint32_t block, 649 uint32_t reg, uint32_t v) 650 { 651 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 652 reg, block, v); 653 BUG(); 654 } 655 656 /** 657 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page 658 * 659 * @adev: amdgpu device pointer 660 * 661 * Allocates a scratch page of VRAM for use by various things in the 662 * driver. 663 */ 664 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev) 665 { 666 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, 667 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM, 668 &adev->vram_scratch.robj, 669 &adev->vram_scratch.gpu_addr, 670 (void **)&adev->vram_scratch.ptr); 671 } 672 673 /** 674 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page 675 * 676 * @adev: amdgpu device pointer 677 * 678 * Frees the VRAM scratch page. 679 */ 680 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev) 681 { 682 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL); 683 } 684 685 /** 686 * amdgpu_device_program_register_sequence - program an array of registers. 687 * 688 * @adev: amdgpu_device pointer 689 * @registers: pointer to the register array 690 * @array_size: size of the register array 691 * 692 * Programs an array or registers with and and or masks. 
693 * This is a helper for setting golden registers. 694 */ 695 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 696 const u32 *registers, 697 const u32 array_size) 698 { 699 u32 tmp, reg, and_mask, or_mask; 700 int i; 701 702 if (array_size % 3) 703 return; 704 705 for (i = 0; i < array_size; i +=3) { 706 reg = registers[i + 0]; 707 and_mask = registers[i + 1]; 708 or_mask = registers[i + 2]; 709 710 if (and_mask == 0xffffffff) { 711 tmp = or_mask; 712 } else { 713 tmp = RREG32(reg); 714 tmp &= ~and_mask; 715 if (adev->family >= AMDGPU_FAMILY_AI) 716 tmp |= (or_mask & and_mask); 717 else 718 tmp |= or_mask; 719 } 720 WREG32(reg, tmp); 721 } 722 } 723 724 /** 725 * amdgpu_device_pci_config_reset - reset the GPU 726 * 727 * @adev: amdgpu_device pointer 728 * 729 * Resets the GPU using the pci config reset sequence. 730 * Only applicable to asics prior to vega10. 731 */ 732 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 733 { 734 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 735 } 736 737 /* 738 * GPU doorbell aperture helpers function. 739 */ 740 /** 741 * amdgpu_device_doorbell_init - Init doorbell driver information. 742 * 743 * @adev: amdgpu_device pointer 744 * 745 * Init doorbell driver information (CIK) 746 * Returns 0 on success, error on failure. 747 */ 748 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) 749 { 750 751 /* No doorbell on SI hardware generation */ 752 if (adev->asic_type < CHIP_BONAIRE) { 753 adev->doorbell.base = 0; 754 adev->doorbell.size = 0; 755 adev->doorbell.num_doorbells = 0; 756 adev->doorbell.ptr = NULL; 757 return 0; 758 } 759 760 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) 761 return -EINVAL; 762 763 amdgpu_asic_init_doorbell_index(adev); 764 765 /* doorbell bar mapping */ 766 adev->doorbell.base = pci_resource_start(adev->pdev, 2); 767 adev->doorbell.size = pci_resource_len(adev->pdev, 2); 768 769 adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32), 770 adev->doorbell_index.max_assignment+1); 771 if (adev->doorbell.num_doorbells == 0) 772 return -EINVAL; 773 774 /* For Vega, reserve and map two pages on doorbell BAR since SDMA 775 * paging queue doorbell use the second page. The 776 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the 777 * doorbells are in the first page. So with paging queue enabled, 778 * the max num_doorbells should + 1 page (0x400 in dword) 779 */ 780 if (adev->asic_type >= CHIP_VEGA10) 781 adev->doorbell.num_doorbells += 0x400; 782 783 adev->doorbell.ptr = ioremap(adev->doorbell.base, 784 adev->doorbell.num_doorbells * 785 sizeof(u32)); 786 if (adev->doorbell.ptr == NULL) 787 return -ENOMEM; 788 789 return 0; 790 } 791 792 /** 793 * amdgpu_device_doorbell_fini - Tear down doorbell driver information. 794 * 795 * @adev: amdgpu_device pointer 796 * 797 * Tear down doorbell driver information (CIK) 798 */ 799 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) 800 { 801 iounmap(adev->doorbell.ptr); 802 adev->doorbell.ptr = NULL; 803 } 804 805 806 807 /* 808 * amdgpu_device_wb_*() 809 * Writeback is the method by which the GPU updates special pages in memory 810 * with the status of certain GPU events (fences, ring pointers,etc.). 811 */ 812 813 /** 814 * amdgpu_device_wb_fini - Disable Writeback and free memory 815 * 816 * @adev: amdgpu_device pointer 817 * 818 * Disables Writeback and frees the Writeback memory (all asics). 819 * Used at driver shutdown. 
820 */ 821 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 822 { 823 if (adev->wb.wb_obj) { 824 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 825 &adev->wb.gpu_addr, 826 (void **)&adev->wb.wb); 827 adev->wb.wb_obj = NULL; 828 } 829 } 830 831 /** 832 * amdgpu_device_wb_init- Init Writeback driver info and allocate memory 833 * 834 * @adev: amdgpu_device pointer 835 * 836 * Initializes writeback and allocates writeback memory (all asics). 837 * Used at driver startup. 838 * Returns 0 on success or an -error on failure. 839 */ 840 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 841 { 842 int r; 843 844 if (adev->wb.wb_obj == NULL) { 845 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 846 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 847 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 848 &adev->wb.wb_obj, &adev->wb.gpu_addr, 849 (void **)&adev->wb.wb); 850 if (r) { 851 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 852 return r; 853 } 854 855 adev->wb.num_wb = AMDGPU_MAX_WB; 856 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 857 858 /* clear wb memory */ 859 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 860 } 861 862 return 0; 863 } 864 865 /** 866 * amdgpu_device_wb_get - Allocate a wb entry 867 * 868 * @adev: amdgpu_device pointer 869 * @wb: wb index 870 * 871 * Allocate a wb slot for use by the driver (all asics). 872 * Returns 0 on success or -EINVAL on failure. 873 */ 874 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 875 { 876 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 877 878 if (offset < adev->wb.num_wb) { 879 __set_bit(offset, adev->wb.used); 880 *wb = offset << 3; /* convert to dw offset */ 881 return 0; 882 } else { 883 return -EINVAL; 884 } 885 } 886 887 /** 888 * amdgpu_device_wb_free - Free a wb entry 889 * 890 * @adev: amdgpu_device pointer 891 * @wb: wb index 892 * 893 * Free a wb slot allocated for use by the driver (all asics) 894 */ 895 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 896 { 897 wb >>= 3; 898 if (wb < adev->wb.num_wb) 899 __clear_bit(wb, adev->wb.used); 900 } 901 902 /** 903 * amdgpu_device_resize_fb_bar - try to resize FB BAR 904 * 905 * @adev: amdgpu_device pointer 906 * 907 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 908 * to fail, but if any of the BARs is not accessible after the size we abort 909 * driver loading by returning -ENODEV. 
910 */ 911 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 912 { 913 u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size); 914 u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1; 915 struct pci_bus *root; 916 struct resource *res; 917 unsigned i; 918 u16 cmd; 919 int r; 920 921 /* Bypass for VF */ 922 if (amdgpu_sriov_vf(adev)) 923 return 0; 924 925 /* skip if the bios has already enabled large BAR */ 926 if (adev->gmc.real_vram_size && 927 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 928 return 0; 929 930 /* Check if the root BUS has 64bit memory resources */ 931 root = adev->pdev->bus; 932 while (root->parent) 933 root = root->parent; 934 935 pci_bus_for_each_resource(root, res, i) { 936 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 937 res->start > 0x100000000ull) 938 break; 939 } 940 941 /* Trying to resize is pointless without a root hub window above 4GB */ 942 if (!res) 943 return 0; 944 945 /* Disable memory decoding while we change the BAR addresses and size */ 946 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 947 pci_write_config_word(adev->pdev, PCI_COMMAND, 948 cmd & ~PCI_COMMAND_MEMORY); 949 950 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 951 amdgpu_device_doorbell_fini(adev); 952 if (adev->asic_type >= CHIP_BONAIRE) 953 pci_release_resource(adev->pdev, 2); 954 955 pci_release_resource(adev->pdev, 0); 956 957 r = pci_resize_resource(adev->pdev, 0, rbar_size); 958 if (r == -ENOSPC) 959 DRM_INFO("Not enough PCI address space for a large BAR."); 960 else if (r && r != -ENOTSUPP) 961 DRM_ERROR("Problem resizing BAR0 (%d).", r); 962 963 pci_assign_unassigned_bus_resources(adev->pdev->bus); 964 965 /* When the doorbell or fb BAR isn't available we have no chance of 966 * using the device. 967 */ 968 r = amdgpu_device_doorbell_init(adev); 969 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 970 return -ENODEV; 971 972 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 973 974 return 0; 975 } 976 977 /* 978 * GPU helpers function. 979 */ 980 /** 981 * amdgpu_device_need_post - check if the hw need post or not 982 * 983 * @adev: amdgpu_device pointer 984 * 985 * Check if the asic has been initialized (all asics) at driver startup 986 * or post is needed if hw reset is performed. 987 * Returns true if need or false if not. 
988 */ 989 bool amdgpu_device_need_post(struct amdgpu_device *adev) 990 { 991 uint32_t reg; 992 993 if (amdgpu_sriov_vf(adev)) 994 return false; 995 996 if (amdgpu_passthrough(adev)) { 997 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 998 * some old smc fw still need driver do vPost otherwise gpu hang, while 999 * those smc fw version above 22.15 doesn't have this flaw, so we force 1000 * vpost executed for smc version below 22.15 1001 */ 1002 if (adev->asic_type == CHIP_FIJI) { 1003 int err; 1004 uint32_t fw_ver; 1005 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1006 /* force vPost if error occured */ 1007 if (err) 1008 return true; 1009 1010 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1011 if (fw_ver < 0x00160e00) 1012 return true; 1013 } 1014 } 1015 1016 if (adev->has_hw_reset) { 1017 adev->has_hw_reset = false; 1018 return true; 1019 } 1020 1021 /* bios scratch used on CIK+ */ 1022 if (adev->asic_type >= CHIP_BONAIRE) 1023 return amdgpu_atombios_scratch_need_asic_init(adev); 1024 1025 /* check MEM_SIZE for older asics */ 1026 reg = amdgpu_asic_get_config_memsize(adev); 1027 1028 if ((reg != 0) && (reg != 0xffffffff)) 1029 return false; 1030 1031 return true; 1032 } 1033 1034 /* if we get transitioned to only one device, take VGA back */ 1035 /** 1036 * amdgpu_device_vga_set_decode - enable/disable vga decode 1037 * 1038 * @cookie: amdgpu_device pointer 1039 * @state: enable/disable vga decode 1040 * 1041 * Enable/disable vga decode (all asics). 1042 * Returns VGA resource flags. 1043 */ 1044 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state) 1045 { 1046 struct amdgpu_device *adev = cookie; 1047 amdgpu_asic_set_vga_state(adev, state); 1048 if (state) 1049 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1050 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1051 else 1052 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1053 } 1054 1055 /** 1056 * amdgpu_device_check_block_size - validate the vm block size 1057 * 1058 * @adev: amdgpu_device pointer 1059 * 1060 * Validates the vm block size specified via module parameter. 1061 * The vm block size defines number of bits in page table versus page directory, 1062 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1063 * page table and the remaining bits are in the page directory. 1064 */ 1065 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1066 { 1067 /* defines number of bits in page table versus page directory, 1068 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1069 * page table and the remaining bits are in the page directory */ 1070 if (amdgpu_vm_block_size == -1) 1071 return; 1072 1073 if (amdgpu_vm_block_size < 9) { 1074 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1075 amdgpu_vm_block_size); 1076 amdgpu_vm_block_size = -1; 1077 } 1078 } 1079 1080 /** 1081 * amdgpu_device_check_vm_size - validate the vm size 1082 * 1083 * @adev: amdgpu_device pointer 1084 * 1085 * Validates the vm size in GB specified via module parameter. 1086 * The VM size is the size of the GPU virtual memory space in GB. 
1087 */ 1088 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1089 { 1090 /* no need to check the default value */ 1091 if (amdgpu_vm_size == -1) 1092 return; 1093 1094 if (amdgpu_vm_size < 1) { 1095 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1096 amdgpu_vm_size); 1097 amdgpu_vm_size = -1; 1098 } 1099 } 1100 1101 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1102 { 1103 struct sysinfo si; 1104 bool is_os_64 = (sizeof(void *) == 8); 1105 uint64_t total_memory; 1106 uint64_t dram_size_seven_GB = 0x1B8000000; 1107 uint64_t dram_size_three_GB = 0xB8000000; 1108 1109 if (amdgpu_smu_memory_pool_size == 0) 1110 return; 1111 1112 if (!is_os_64) { 1113 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1114 goto def_value; 1115 } 1116 si_meminfo(&si); 1117 total_memory = (uint64_t)si.totalram * si.mem_unit; 1118 1119 if ((amdgpu_smu_memory_pool_size == 1) || 1120 (amdgpu_smu_memory_pool_size == 2)) { 1121 if (total_memory < dram_size_three_GB) 1122 goto def_value1; 1123 } else if ((amdgpu_smu_memory_pool_size == 4) || 1124 (amdgpu_smu_memory_pool_size == 8)) { 1125 if (total_memory < dram_size_seven_GB) 1126 goto def_value1; 1127 } else { 1128 DRM_WARN("Smu memory pool size not supported\n"); 1129 goto def_value; 1130 } 1131 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1132 1133 return; 1134 1135 def_value1: 1136 DRM_WARN("No enough system memory\n"); 1137 def_value: 1138 adev->pm.smu_prv_buffer_size = 0; 1139 } 1140 1141 /** 1142 * amdgpu_device_check_arguments - validate module params 1143 * 1144 * @adev: amdgpu_device pointer 1145 * 1146 * Validates certain module parameters and updates 1147 * the associated values used by the driver (all asics). 1148 */ 1149 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1150 { 1151 if (amdgpu_sched_jobs < 4) { 1152 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1153 amdgpu_sched_jobs); 1154 amdgpu_sched_jobs = 4; 1155 } else if (!is_power_of_2(amdgpu_sched_jobs)){ 1156 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1157 amdgpu_sched_jobs); 1158 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1159 } 1160 1161 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1162 /* gart size must be greater or equal to 32M */ 1163 dev_warn(adev->dev, "gart size (%d) too small\n", 1164 amdgpu_gart_size); 1165 amdgpu_gart_size = -1; 1166 } 1167 1168 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1169 /* gtt size must be greater or equal to 32M */ 1170 dev_warn(adev->dev, "gtt size (%d) too small\n", 1171 amdgpu_gtt_size); 1172 amdgpu_gtt_size = -1; 1173 } 1174 1175 /* valid range is between 4 and 9 inclusive */ 1176 if (amdgpu_vm_fragment_size != -1 && 1177 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1178 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1179 amdgpu_vm_fragment_size = -1; 1180 } 1181 1182 if (amdgpu_sched_hw_submission < 2) { 1183 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1184 amdgpu_sched_hw_submission); 1185 amdgpu_sched_hw_submission = 2; 1186 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1187 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1188 amdgpu_sched_hw_submission); 1189 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1190 } 1191 1192 amdgpu_device_check_smu_prv_buffer_size(adev); 1193 1194 amdgpu_device_check_vm_size(adev); 1195 1196 
amdgpu_device_check_block_size(adev); 1197 1198 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1199 1200 amdgpu_gmc_tmz_set(adev); 1201 1202 return 0; 1203 } 1204 1205 /** 1206 * amdgpu_switcheroo_set_state - set switcheroo state 1207 * 1208 * @pdev: pci dev pointer 1209 * @state: vga_switcheroo state 1210 * 1211 * Callback for the switcheroo driver. Suspends or resumes the 1212 * the asics before or after it is powered up using ACPI methods. 1213 */ 1214 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switcheroo_state state) 1215 { 1216 struct drm_device *dev = pci_get_drvdata(pdev); 1217 int r; 1218 1219 if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF) 1220 return; 1221 1222 if (state == VGA_SWITCHEROO_ON) { 1223 pr_info("switched on\n"); 1224 /* don't suspend or resume card normally */ 1225 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1226 1227 pci_set_power_state(dev->pdev, PCI_D0); 1228 pci_restore_state(dev->pdev); 1229 r = pci_enable_device(dev->pdev); 1230 if (r) 1231 DRM_WARN("pci_enable_device failed (%d)\n", r); 1232 amdgpu_device_resume(dev, true); 1233 1234 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1235 drm_kms_helper_poll_enable(dev); 1236 } else { 1237 pr_info("switched off\n"); 1238 drm_kms_helper_poll_disable(dev); 1239 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1240 amdgpu_device_suspend(dev, true); 1241 pci_save_state(dev->pdev); 1242 /* Shut down the device */ 1243 pci_disable_device(dev->pdev); 1244 pci_set_power_state(dev->pdev, PCI_D3cold); 1245 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1246 } 1247 } 1248 1249 /** 1250 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1251 * 1252 * @pdev: pci dev pointer 1253 * 1254 * Callback for the switcheroo driver. Check of the switcheroo 1255 * state can be changed. 1256 * Returns true if the state can be changed, false if not. 1257 */ 1258 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1259 { 1260 struct drm_device *dev = pci_get_drvdata(pdev); 1261 1262 /* 1263 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1264 * locking inversion with the driver load path. And the access here is 1265 * completely racy anyway. So don't bother with locking for now. 1266 */ 1267 return atomic_read(&dev->open_count) == 0; 1268 } 1269 1270 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1271 .set_gpu_state = amdgpu_switcheroo_set_state, 1272 .reprobe = NULL, 1273 .can_switch = amdgpu_switcheroo_can_switch, 1274 }; 1275 1276 /** 1277 * amdgpu_device_ip_set_clockgating_state - set the CG state 1278 * 1279 * @dev: amdgpu_device pointer 1280 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1281 * @state: clockgating state (gate or ungate) 1282 * 1283 * Sets the requested clockgating state for all instances of 1284 * the hardware IP specified. 1285 * Returns the error code from the last instance. 
1286 */ 1287 int amdgpu_device_ip_set_clockgating_state(void *dev, 1288 enum amd_ip_block_type block_type, 1289 enum amd_clockgating_state state) 1290 { 1291 struct amdgpu_device *adev = dev; 1292 int i, r = 0; 1293 1294 for (i = 0; i < adev->num_ip_blocks; i++) { 1295 if (!adev->ip_blocks[i].status.valid) 1296 continue; 1297 if (adev->ip_blocks[i].version->type != block_type) 1298 continue; 1299 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1300 continue; 1301 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1302 (void *)adev, state); 1303 if (r) 1304 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1305 adev->ip_blocks[i].version->funcs->name, r); 1306 } 1307 return r; 1308 } 1309 1310 /** 1311 * amdgpu_device_ip_set_powergating_state - set the PG state 1312 * 1313 * @dev: amdgpu_device pointer 1314 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1315 * @state: powergating state (gate or ungate) 1316 * 1317 * Sets the requested powergating state for all instances of 1318 * the hardware IP specified. 1319 * Returns the error code from the last instance. 1320 */ 1321 int amdgpu_device_ip_set_powergating_state(void *dev, 1322 enum amd_ip_block_type block_type, 1323 enum amd_powergating_state state) 1324 { 1325 struct amdgpu_device *adev = dev; 1326 int i, r = 0; 1327 1328 for (i = 0; i < adev->num_ip_blocks; i++) { 1329 if (!adev->ip_blocks[i].status.valid) 1330 continue; 1331 if (adev->ip_blocks[i].version->type != block_type) 1332 continue; 1333 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 1334 continue; 1335 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 1336 (void *)adev, state); 1337 if (r) 1338 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 1339 adev->ip_blocks[i].version->funcs->name, r); 1340 } 1341 return r; 1342 } 1343 1344 /** 1345 * amdgpu_device_ip_get_clockgating_state - get the CG state 1346 * 1347 * @adev: amdgpu_device pointer 1348 * @flags: clockgating feature flags 1349 * 1350 * Walks the list of IPs on the device and updates the clockgating 1351 * flags for each IP. 1352 * Updates @flags with the feature flags for each hardware IP where 1353 * clockgating is enabled. 1354 */ 1355 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 1356 u32 *flags) 1357 { 1358 int i; 1359 1360 for (i = 0; i < adev->num_ip_blocks; i++) { 1361 if (!adev->ip_blocks[i].status.valid) 1362 continue; 1363 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 1364 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 1365 } 1366 } 1367 1368 /** 1369 * amdgpu_device_ip_wait_for_idle - wait for idle 1370 * 1371 * @adev: amdgpu_device pointer 1372 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1373 * 1374 * Waits for the request hardware IP to be idle. 1375 * Returns 0 for success or a negative error code on failure. 
1376 */ 1377 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 1378 enum amd_ip_block_type block_type) 1379 { 1380 int i, r; 1381 1382 for (i = 0; i < adev->num_ip_blocks; i++) { 1383 if (!adev->ip_blocks[i].status.valid) 1384 continue; 1385 if (adev->ip_blocks[i].version->type == block_type) { 1386 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 1387 if (r) 1388 return r; 1389 break; 1390 } 1391 } 1392 return 0; 1393 1394 } 1395 1396 /** 1397 * amdgpu_device_ip_is_idle - is the hardware IP idle 1398 * 1399 * @adev: amdgpu_device pointer 1400 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1401 * 1402 * Check if the hardware IP is idle or not. 1403 * Returns true if it the IP is idle, false if not. 1404 */ 1405 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 1406 enum amd_ip_block_type block_type) 1407 { 1408 int i; 1409 1410 for (i = 0; i < adev->num_ip_blocks; i++) { 1411 if (!adev->ip_blocks[i].status.valid) 1412 continue; 1413 if (adev->ip_blocks[i].version->type == block_type) 1414 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 1415 } 1416 return true; 1417 1418 } 1419 1420 /** 1421 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 1422 * 1423 * @adev: amdgpu_device pointer 1424 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 1425 * 1426 * Returns a pointer to the hardware IP block structure 1427 * if it exists for the asic, otherwise NULL. 1428 */ 1429 struct amdgpu_ip_block * 1430 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 1431 enum amd_ip_block_type type) 1432 { 1433 int i; 1434 1435 for (i = 0; i < adev->num_ip_blocks; i++) 1436 if (adev->ip_blocks[i].version->type == type) 1437 return &adev->ip_blocks[i]; 1438 1439 return NULL; 1440 } 1441 1442 /** 1443 * amdgpu_device_ip_block_version_cmp 1444 * 1445 * @adev: amdgpu_device pointer 1446 * @type: enum amd_ip_block_type 1447 * @major: major version 1448 * @minor: minor version 1449 * 1450 * return 0 if equal or greater 1451 * return 1 if smaller or the ip_block doesn't exist 1452 */ 1453 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 1454 enum amd_ip_block_type type, 1455 u32 major, u32 minor) 1456 { 1457 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 1458 1459 if (ip_block && ((ip_block->version->major > major) || 1460 ((ip_block->version->major == major) && 1461 (ip_block->version->minor >= minor)))) 1462 return 0; 1463 1464 return 1; 1465 } 1466 1467 /** 1468 * amdgpu_device_ip_block_add 1469 * 1470 * @adev: amdgpu_device pointer 1471 * @ip_block_version: pointer to the IP to add 1472 * 1473 * Adds the IP block driver information to the collection of IPs 1474 * on the asic. 1475 */ 1476 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 1477 const struct amdgpu_ip_block_version *ip_block_version) 1478 { 1479 if (!ip_block_version) 1480 return -EINVAL; 1481 1482 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 1483 ip_block_version->funcs->name); 1484 1485 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 1486 1487 return 0; 1488 } 1489 1490 /** 1491 * amdgpu_device_enable_virtual_display - enable virtual display feature 1492 * 1493 * @adev: amdgpu_device pointer 1494 * 1495 * Enabled the virtual display feature if the user has enabled it via 1496 * the module parameter virtual_display. This feature provides a virtual 1497 * display hardware on headless boards or in virtualized environments. 
1498 * This function parses and validates the configuration string specified by 1499 * the user and configues the virtual display configuration (number of 1500 * virtual connectors, crtcs, etc.) specified. 1501 */ 1502 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 1503 { 1504 adev->enable_virtual_display = false; 1505 1506 if (amdgpu_virtual_display) { 1507 struct drm_device *ddev = adev->ddev; 1508 const char *pci_address_name = pci_name(ddev->pdev); 1509 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 1510 1511 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 1512 pciaddstr_tmp = pciaddstr; 1513 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 1514 pciaddname = strsep(&pciaddname_tmp, ","); 1515 if (!strcmp("all", pciaddname) 1516 || !strcmp(pci_address_name, pciaddname)) { 1517 long num_crtc; 1518 int res = -1; 1519 1520 adev->enable_virtual_display = true; 1521 1522 if (pciaddname_tmp) 1523 res = kstrtol(pciaddname_tmp, 10, 1524 &num_crtc); 1525 1526 if (!res) { 1527 if (num_crtc < 1) 1528 num_crtc = 1; 1529 if (num_crtc > 6) 1530 num_crtc = 6; 1531 adev->mode_info.num_crtc = num_crtc; 1532 } else { 1533 adev->mode_info.num_crtc = 1; 1534 } 1535 break; 1536 } 1537 } 1538 1539 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 1540 amdgpu_virtual_display, pci_address_name, 1541 adev->enable_virtual_display, adev->mode_info.num_crtc); 1542 1543 kfree(pciaddstr); 1544 } 1545 } 1546 1547 /** 1548 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 1549 * 1550 * @adev: amdgpu_device pointer 1551 * 1552 * Parses the asic configuration parameters specified in the gpu info 1553 * firmware and makes them availale to the driver for use in configuring 1554 * the asic. 1555 * Returns 0 on success, -EINVAL on failure. 1556 */ 1557 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 1558 { 1559 const char *chip_name; 1560 char fw_name[40]; 1561 int err; 1562 const struct gpu_info_firmware_header_v1_0 *hdr; 1563 1564 adev->firmware.gpu_info_fw = NULL; 1565 1566 if (adev->discovery_bin) { 1567 amdgpu_discovery_get_gfx_info(adev); 1568 1569 /* 1570 * FIXME: The bounding box is still needed by Navi12, so 1571 * temporarily read it from gpu_info firmware. Should be droped 1572 * when DAL no longer needs it. 
1573 */ 1574 if (adev->asic_type != CHIP_NAVI12) 1575 return 0; 1576 } 1577 1578 switch (adev->asic_type) { 1579 #ifdef CONFIG_DRM_AMDGPU_SI 1580 case CHIP_VERDE: 1581 case CHIP_TAHITI: 1582 case CHIP_PITCAIRN: 1583 case CHIP_OLAND: 1584 case CHIP_HAINAN: 1585 #endif 1586 #ifdef CONFIG_DRM_AMDGPU_CIK 1587 case CHIP_BONAIRE: 1588 case CHIP_HAWAII: 1589 case CHIP_KAVERI: 1590 case CHIP_KABINI: 1591 case CHIP_MULLINS: 1592 #endif 1593 case CHIP_TOPAZ: 1594 case CHIP_TONGA: 1595 case CHIP_FIJI: 1596 case CHIP_POLARIS10: 1597 case CHIP_POLARIS11: 1598 case CHIP_POLARIS12: 1599 case CHIP_VEGAM: 1600 case CHIP_CARRIZO: 1601 case CHIP_STONEY: 1602 case CHIP_VEGA20: 1603 default: 1604 return 0; 1605 case CHIP_VEGA10: 1606 chip_name = "vega10"; 1607 break; 1608 case CHIP_VEGA12: 1609 chip_name = "vega12"; 1610 break; 1611 case CHIP_RAVEN: 1612 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1613 chip_name = "raven2"; 1614 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1615 chip_name = "picasso"; 1616 else 1617 chip_name = "raven"; 1618 break; 1619 case CHIP_ARCTURUS: 1620 chip_name = "arcturus"; 1621 break; 1622 case CHIP_RENOIR: 1623 chip_name = "renoir"; 1624 break; 1625 case CHIP_NAVI10: 1626 chip_name = "navi10"; 1627 break; 1628 case CHIP_NAVI14: 1629 chip_name = "navi14"; 1630 break; 1631 case CHIP_NAVI12: 1632 chip_name = "navi12"; 1633 break; 1634 case CHIP_SIENNA_CICHLID: 1635 chip_name = "sienna_cichlid"; 1636 break; 1637 case CHIP_NAVY_FLOUNDER: 1638 chip_name = "navy_flounder"; 1639 break; 1640 } 1641 1642 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 1643 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); 1644 if (err) { 1645 dev_err(adev->dev, 1646 "Failed to load gpu_info firmware \"%s\"\n", 1647 fw_name); 1648 goto out; 1649 } 1650 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); 1651 if (err) { 1652 dev_err(adev->dev, 1653 "Failed to validate gpu_info firmware \"%s\"\n", 1654 fw_name); 1655 goto out; 1656 } 1657 1658 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 1659 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 1660 1661 switch (hdr->version_major) { 1662 case 1: 1663 { 1664 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 1665 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 1666 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1667 1668 /* 1669 * Should be droped when DAL no longer needs it. 
1670 */ 1671 if (adev->asic_type == CHIP_NAVI12) 1672 goto parse_soc_bounding_box; 1673 1674 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 1675 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 1676 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 1677 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 1678 adev->gfx.config.max_texture_channel_caches = 1679 le32_to_cpu(gpu_info_fw->gc_num_tccs); 1680 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 1681 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 1682 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 1683 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 1684 adev->gfx.config.double_offchip_lds_buf = 1685 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 1686 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 1687 adev->gfx.cu_info.max_waves_per_simd = 1688 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 1689 adev->gfx.cu_info.max_scratch_slots_per_cu = 1690 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 1691 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 1692 if (hdr->version_minor >= 1) { 1693 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 1694 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 1695 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1696 adev->gfx.config.num_sc_per_sh = 1697 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 1698 adev->gfx.config.num_packer_per_sc = 1699 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 1700 } 1701 1702 parse_soc_bounding_box: 1703 /* 1704 * soc bounding box info is not integrated in disocovery table, 1705 * we always need to parse it from gpu info firmware if needed. 1706 */ 1707 if (hdr->version_minor == 2) { 1708 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 1709 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 1710 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1711 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 1712 } 1713 break; 1714 } 1715 default: 1716 dev_err(adev->dev, 1717 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 1718 err = -EINVAL; 1719 goto out; 1720 } 1721 out: 1722 return err; 1723 } 1724 1725 /** 1726 * amdgpu_device_ip_early_init - run early init for hardware IPs 1727 * 1728 * @adev: amdgpu_device pointer 1729 * 1730 * Early initialization pass for hardware IPs. The hardware IPs that make 1731 * up each asic are discovered each IP's early_init callback is run. This 1732 * is the first stage in initializing the asic. 1733 * Returns 0 on success, negative error code on failure. 
1734 */ 1735 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 1736 { 1737 int i, r; 1738 1739 amdgpu_device_enable_virtual_display(adev); 1740 1741 if (amdgpu_sriov_vf(adev)) { 1742 r = amdgpu_virt_request_full_gpu(adev, true); 1743 if (r) 1744 return r; 1745 } 1746 1747 switch (adev->asic_type) { 1748 #ifdef CONFIG_DRM_AMDGPU_SI 1749 case CHIP_VERDE: 1750 case CHIP_TAHITI: 1751 case CHIP_PITCAIRN: 1752 case CHIP_OLAND: 1753 case CHIP_HAINAN: 1754 adev->family = AMDGPU_FAMILY_SI; 1755 r = si_set_ip_blocks(adev); 1756 if (r) 1757 return r; 1758 break; 1759 #endif 1760 #ifdef CONFIG_DRM_AMDGPU_CIK 1761 case CHIP_BONAIRE: 1762 case CHIP_HAWAII: 1763 case CHIP_KAVERI: 1764 case CHIP_KABINI: 1765 case CHIP_MULLINS: 1766 if (adev->flags & AMD_IS_APU) 1767 adev->family = AMDGPU_FAMILY_KV; 1768 else 1769 adev->family = AMDGPU_FAMILY_CI; 1770 1771 r = cik_set_ip_blocks(adev); 1772 if (r) 1773 return r; 1774 break; 1775 #endif 1776 case CHIP_TOPAZ: 1777 case CHIP_TONGA: 1778 case CHIP_FIJI: 1779 case CHIP_POLARIS10: 1780 case CHIP_POLARIS11: 1781 case CHIP_POLARIS12: 1782 case CHIP_VEGAM: 1783 case CHIP_CARRIZO: 1784 case CHIP_STONEY: 1785 if (adev->flags & AMD_IS_APU) 1786 adev->family = AMDGPU_FAMILY_CZ; 1787 else 1788 adev->family = AMDGPU_FAMILY_VI; 1789 1790 r = vi_set_ip_blocks(adev); 1791 if (r) 1792 return r; 1793 break; 1794 case CHIP_VEGA10: 1795 case CHIP_VEGA12: 1796 case CHIP_VEGA20: 1797 case CHIP_RAVEN: 1798 case CHIP_ARCTURUS: 1799 case CHIP_RENOIR: 1800 if (adev->flags & AMD_IS_APU) 1801 adev->family = AMDGPU_FAMILY_RV; 1802 else 1803 adev->family = AMDGPU_FAMILY_AI; 1804 1805 r = soc15_set_ip_blocks(adev); 1806 if (r) 1807 return r; 1808 break; 1809 case CHIP_NAVI10: 1810 case CHIP_NAVI14: 1811 case CHIP_NAVI12: 1812 case CHIP_SIENNA_CICHLID: 1813 case CHIP_NAVY_FLOUNDER: 1814 adev->family = AMDGPU_FAMILY_NV; 1815 1816 r = nv_set_ip_blocks(adev); 1817 if (r) 1818 return r; 1819 break; 1820 default: 1821 /* FIXME: not supported yet */ 1822 return -EINVAL; 1823 } 1824 1825 amdgpu_amdkfd_device_probe(adev); 1826 1827 adev->pm.pp_feature = amdgpu_pp_feature_mask; 1828 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 1829 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 1830 1831 for (i = 0; i < adev->num_ip_blocks; i++) { 1832 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 1833 DRM_ERROR("disabled ip block: %d <%s>\n", 1834 i, adev->ip_blocks[i].version->funcs->name); 1835 adev->ip_blocks[i].status.valid = false; 1836 } else { 1837 if (adev->ip_blocks[i].version->funcs->early_init) { 1838 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 1839 if (r == -ENOENT) { 1840 adev->ip_blocks[i].status.valid = false; 1841 } else if (r) { 1842 DRM_ERROR("early_init of IP block <%s> failed %d\n", 1843 adev->ip_blocks[i].version->funcs->name, r); 1844 return r; 1845 } else { 1846 adev->ip_blocks[i].status.valid = true; 1847 } 1848 } else { 1849 adev->ip_blocks[i].status.valid = true; 1850 } 1851 } 1852 /* get the vbios after the asic_funcs are set up */ 1853 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 1854 r = amdgpu_device_parse_gpu_info_fw(adev); 1855 if (r) 1856 return r; 1857 1858 /* Read BIOS */ 1859 if (!amdgpu_get_bios(adev)) 1860 return -EINVAL; 1861 1862 r = amdgpu_atombios_init(adev); 1863 if (r) { 1864 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 1865 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 1866 return r; 1867 } 1868 } 1869 } 1870 1871 adev->cg_flags &= amdgpu_cg_mask; 1872 
adev->pg_flags &= amdgpu_pg_mask; 1873 1874 return 0; 1875 } 1876 1877 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 1878 { 1879 int i, r; 1880 1881 for (i = 0; i < adev->num_ip_blocks; i++) { 1882 if (!adev->ip_blocks[i].status.sw) 1883 continue; 1884 if (adev->ip_blocks[i].status.hw) 1885 continue; 1886 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 1887 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 1888 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 1889 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 1890 if (r) { 1891 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 1892 adev->ip_blocks[i].version->funcs->name, r); 1893 return r; 1894 } 1895 adev->ip_blocks[i].status.hw = true; 1896 } 1897 } 1898 1899 return 0; 1900 } 1901 1902 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 1903 { 1904 int i, r; 1905 1906 for (i = 0; i < adev->num_ip_blocks; i++) { 1907 if (!adev->ip_blocks[i].status.sw) 1908 continue; 1909 if (adev->ip_blocks[i].status.hw) 1910 continue; 1911 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 1912 if (r) { 1913 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 1914 adev->ip_blocks[i].version->funcs->name, r); 1915 return r; 1916 } 1917 adev->ip_blocks[i].status.hw = true; 1918 } 1919 1920 return 0; 1921 } 1922 1923 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 1924 { 1925 int r = 0; 1926 int i; 1927 uint32_t smu_version; 1928 1929 if (adev->asic_type >= CHIP_VEGA10) { 1930 for (i = 0; i < adev->num_ip_blocks; i++) { 1931 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 1932 continue; 1933 1934 /* no need to do the fw loading again if already done*/ 1935 if (adev->ip_blocks[i].status.hw == true) 1936 break; 1937 1938 if (adev->in_gpu_reset || adev->in_suspend) { 1939 r = adev->ip_blocks[i].version->funcs->resume(adev); 1940 if (r) { 1941 DRM_ERROR("resume of IP block <%s> failed %d\n", 1942 adev->ip_blocks[i].version->funcs->name, r); 1943 return r; 1944 } 1945 } else { 1946 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 1947 if (r) { 1948 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 1949 adev->ip_blocks[i].version->funcs->name, r); 1950 return r; 1951 } 1952 } 1953 1954 adev->ip_blocks[i].status.hw = true; 1955 break; 1956 } 1957 } 1958 1959 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 1960 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 1961 1962 return r; 1963 } 1964 1965 /** 1966 * amdgpu_device_ip_init - run init for hardware IPs 1967 * 1968 * @adev: amdgpu_device pointer 1969 * 1970 * Main initialization pass for hardware IPs. The list of all the hardware 1971 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 1972 * are run. sw_init initializes the software state associated with each IP 1973 * and hw_init initializes the hardware associated with each IP. 1974 * Returns 0 on success, negative error code on failure. 
 */
static int amdgpu_device_ip_init(struct amdgpu_device *adev)
{
	int i, r;

	r = amdgpu_ras_init(adev);
	if (r)
		return r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
		if (r) {
			DRM_ERROR("sw_init of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
			goto init_failed;
		}
		adev->ip_blocks[i].status.sw = true;

		/* need to do gmc hw init early so we can allocate gpu mem */
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
			r = amdgpu_device_vram_scratch_init(adev);
			if (r) {
				DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
				goto init_failed;
			}
			r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
			if (r) {
				DRM_ERROR("hw_init %d failed %d\n", i, r);
				goto init_failed;
			}
			r = amdgpu_device_wb_init(adev);
			if (r) {
				DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
				goto init_failed;
			}
			adev->ip_blocks[i].status.hw = true;

			/* right after GMC hw init, we create CSA */
			if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
				r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
							       AMDGPU_GEM_DOMAIN_VRAM,
							       AMDGPU_CSA_SIZE);
				if (r) {
					DRM_ERROR("allocate CSA failed %d\n", r);
					goto init_failed;
				}
			}
		}
	}

	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_init_data_exchange(adev);

	r = amdgpu_ib_pool_init(adev);
	if (r) {
		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
		goto init_failed;
	}

	r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete */
	if (r)
		goto init_failed;

	r = amdgpu_device_ip_hw_init_phase1(adev);
	if (r)
		goto init_failed;

	r = amdgpu_device_fw_loading(adev);
	if (r)
		goto init_failed;

	r = amdgpu_device_ip_hw_init_phase2(adev);
	if (r)
		goto init_failed;

	/*
	 * retired pages will be loaded from eeprom and reserved here,
	 * it should be called after amdgpu_device_ip_hw_init_phase2 since
	 * for some ASICs the RAS EEPROM code relies on the SMU being fully
	 * functional for I2C communication, which is only true at this point.
	 * recovery_init may fail, but it can free all resources allocated by
	 * itself and its failure should not stop the amdgpu init process.
	 *
	 * Note: theoretically, this should be called before all vram allocations
	 * to protect retired pages from being used.
	 */
	amdgpu_ras_recovery_init(adev);

	if (adev->gmc.xgmi.num_physical_nodes > 1)
		amdgpu_xgmi_add_device(adev);
	amdgpu_amdkfd_device_init(adev);

	amdgpu_fru_get_product_info(adev);

init_failed:
	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_release_full_gpu(adev, true);

	return r;
}

/**
 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
 *
 * @adev: amdgpu_device pointer
 *
 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
 * this function before a GPU reset. If the value is retained after a
 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
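 *
 * Illustrative pairing with amdgpu_device_check_vram_lost() (a sketch of the
 * reset flow, not code taken from this file):
 *
 *   amdgpu_device_fill_reset_magic(adev);        // cache first bytes of GART
 *   ...ASIC reset...
 *   lost = amdgpu_device_check_vram_lost(adev);  // compare against the cache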
 */
static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
{
	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
}

/**
 * amdgpu_device_check_vram_lost - check if vram is valid
 *
 * @adev: amdgpu_device pointer
 *
 * Checks the reset magic value written to the gart pointer in VRAM.
 * The driver calls this after a GPU reset to see if the contents of
 * VRAM are lost or not.
 * Returns true if vram is lost, false if not.
 */
static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
{
	if (memcmp(adev->gart.ptr, adev->reset_magic,
			AMDGPU_RESET_MAGIC_NUM))
		return true;

	if (!adev->in_gpu_reset)
		return false;

	/*
	 * For all ASICs with baco/mode1 reset, the VRAM is
	 * always assumed to be lost.
	 */
	switch (amdgpu_asic_reset_method(adev)) {
	case AMD_RESET_METHOD_BACO:
	case AMD_RESET_METHOD_MODE1:
		return true;
	default:
		return false;
	}
}

/**
 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
 *
 * @adev: amdgpu_device pointer
 * @state: clockgating state (gate or ungate)
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * set_clockgating_state callbacks are run. During the late init pass this
 * enables clockgating for hardware IPs; during fini or suspend it disables
 * clockgating for hardware IPs.
 * Returns 0 on success, negative error code on failure.
 */

static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
				      enum amd_clockgating_state state)
{
	int i, j, r;

	if (amdgpu_emu_mode == 1)
		return 0;

	for (j = 0; j < adev->num_ip_blocks; j++) {
		i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
		if (!adev->ip_blocks[i].status.late_initialized)
			continue;
		/* skip CG for VCE/UVD, it's handled specially */
		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
		    adev->ip_blocks[i].version->funcs->set_clockgating_state) {
			/* enable clockgating to save power */
			r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
										     state);
			if (r) {
				DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
	}

	return 0;
}

static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
{
	int i, j, r;

	if (amdgpu_emu_mode == 1)
		return 0;

	for (j = 0; j < adev->num_ip_blocks; j++) {
		i = state == AMD_PG_STATE_GATE ?
			j : adev->num_ip_blocks - j - 1;
		if (!adev->ip_blocks[i].status.late_initialized)
			continue;
		/* skip PG for VCE/UVD, it's handled specially */
		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
			/* enable powergating to save power */
			r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
										      state);
			if (r) {
				DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
	}
	return 0;
}

static int amdgpu_device_enable_mgpu_fan_boost(void)
{
	struct amdgpu_gpu_instance *gpu_ins;
	struct amdgpu_device *adev;
	int i, ret = 0;

	mutex_lock(&mgpu_info.mutex);

	/*
	 * MGPU fan boost feature should be enabled
	 * only when there are two or more dGPUs in
	 * the system
	 */
	if (mgpu_info.num_dgpu < 2)
		goto out;

	for (i = 0; i < mgpu_info.num_dgpu; i++) {
		gpu_ins = &(mgpu_info.gpu_ins[i]);
		adev = gpu_ins->adev;
		if (!(adev->flags & AMD_IS_APU) &&
		    !gpu_ins->mgpu_fan_enabled &&
		    adev->powerplay.pp_funcs &&
		    adev->powerplay.pp_funcs->enable_mgpu_fan_boost) {
			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
			if (ret)
				break;

			gpu_ins->mgpu_fan_enabled = 1;
		}
	}

out:
	mutex_unlock(&mgpu_info.mutex);

	return ret;
}

/**
 * amdgpu_device_ip_late_init - run late init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Late initialization pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the late_init callbacks are run.
 * late_init covers any special initialization that an IP requires
 * after all of them have been initialized or something that needs to happen
 * late in the init process.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
{
	struct amdgpu_gpu_instance *gpu_instance;
	int i = 0, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_init) {
			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
			if (r) {
				DRM_ERROR("late_init of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
		adev->ip_blocks[i].status.late_initialized = true;
	}

	amdgpu_ras_set_error_query_ready(adev, true);

	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

	amdgpu_device_fill_reset_magic(adev);

	r = amdgpu_device_enable_mgpu_fan_boost();
	if (r)
		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);


	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		mutex_lock(&mgpu_info.mutex);

		/*
		 * Reset device p-state to low as this was booted with high.
		 *
		 * This should be performed only after all devices from the same
		 * hive get initialized.
		 *
		 * However, it's unknown in advance how many devices are in the hive.
2290 * As this is counted one by one during devices initializations. 2291 * 2292 * So, we wait for all XGMI interlinked devices initialized. 2293 * This may bring some delays as those devices may come from 2294 * different hives. But that should be OK. 2295 */ 2296 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2297 for (i = 0; i < mgpu_info.num_gpu; i++) { 2298 gpu_instance = &(mgpu_info.gpu_ins[i]); 2299 if (gpu_instance->adev->flags & AMD_IS_APU) 2300 continue; 2301 2302 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2303 AMDGPU_XGMI_PSTATE_MIN); 2304 if (r) { 2305 DRM_ERROR("pstate setting failed (%d).\n", r); 2306 break; 2307 } 2308 } 2309 } 2310 2311 mutex_unlock(&mgpu_info.mutex); 2312 } 2313 2314 return 0; 2315 } 2316 2317 /** 2318 * amdgpu_device_ip_fini - run fini for hardware IPs 2319 * 2320 * @adev: amdgpu_device pointer 2321 * 2322 * Main teardown pass for hardware IPs. The list of all the hardware 2323 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2324 * are run. hw_fini tears down the hardware associated with each IP 2325 * and sw_fini tears down any software state associated with each IP. 2326 * Returns 0 on success, negative error code on failure. 2327 */ 2328 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2329 { 2330 int i, r; 2331 2332 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2333 amdgpu_virt_release_ras_err_handler_data(adev); 2334 2335 amdgpu_ras_pre_fini(adev); 2336 2337 if (adev->gmc.xgmi.num_physical_nodes > 1) 2338 amdgpu_xgmi_remove_device(adev); 2339 2340 amdgpu_amdkfd_device_fini(adev); 2341 2342 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2343 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2344 2345 /* need to disable SMC first */ 2346 for (i = 0; i < adev->num_ip_blocks; i++) { 2347 if (!adev->ip_blocks[i].status.hw) 2348 continue; 2349 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2350 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2351 /* XXX handle errors */ 2352 if (r) { 2353 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2354 adev->ip_blocks[i].version->funcs->name, r); 2355 } 2356 adev->ip_blocks[i].status.hw = false; 2357 break; 2358 } 2359 } 2360 2361 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2362 if (!adev->ip_blocks[i].status.hw) 2363 continue; 2364 2365 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2366 /* XXX handle errors */ 2367 if (r) { 2368 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2369 adev->ip_blocks[i].version->funcs->name, r); 2370 } 2371 2372 adev->ip_blocks[i].status.hw = false; 2373 } 2374 2375 2376 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2377 if (!adev->ip_blocks[i].status.sw) 2378 continue; 2379 2380 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2381 amdgpu_ucode_free_bo(adev); 2382 amdgpu_free_static_csa(&adev->virt.csa_obj); 2383 amdgpu_device_wb_fini(adev); 2384 amdgpu_device_vram_scratch_fini(adev); 2385 amdgpu_ib_pool_fini(adev); 2386 } 2387 2388 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2389 /* XXX handle errors */ 2390 if (r) { 2391 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2392 adev->ip_blocks[i].version->funcs->name, r); 2393 } 2394 adev->ip_blocks[i].status.sw = false; 2395 adev->ip_blocks[i].status.valid = false; 2396 } 2397 2398 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2399 if (!adev->ip_blocks[i].status.late_initialized) 2400 continue; 2401 if (adev->ip_blocks[i].version->funcs->late_fini) 2402 
adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2403 adev->ip_blocks[i].status.late_initialized = false; 2404 } 2405 2406 amdgpu_ras_fini(adev); 2407 2408 if (amdgpu_sriov_vf(adev)) 2409 if (amdgpu_virt_release_full_gpu(adev, false)) 2410 DRM_ERROR("failed to release exclusive mode on fini\n"); 2411 2412 return 0; 2413 } 2414 2415 /** 2416 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2417 * 2418 * @work: work_struct. 2419 */ 2420 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2421 { 2422 struct amdgpu_device *adev = 2423 container_of(work, struct amdgpu_device, delayed_init_work.work); 2424 int r; 2425 2426 r = amdgpu_ib_ring_tests(adev); 2427 if (r) 2428 DRM_ERROR("ib ring test failed (%d).\n", r); 2429 } 2430 2431 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2432 { 2433 struct amdgpu_device *adev = 2434 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2435 2436 mutex_lock(&adev->gfx.gfx_off_mutex); 2437 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { 2438 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2439 adev->gfx.gfx_off_state = true; 2440 } 2441 mutex_unlock(&adev->gfx.gfx_off_mutex); 2442 } 2443 2444 /** 2445 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2446 * 2447 * @adev: amdgpu_device pointer 2448 * 2449 * Main suspend function for hardware IPs. The list of all the hardware 2450 * IPs that make up the asic is walked, clockgating is disabled and the 2451 * suspend callbacks are run. suspend puts the hardware and software state 2452 * in each IP into a state suitable for suspend. 2453 * Returns 0 on success, negative error code on failure. 2454 */ 2455 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2456 { 2457 int i, r; 2458 2459 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2460 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2461 2462 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2463 if (!adev->ip_blocks[i].status.valid) 2464 continue; 2465 2466 /* displays are handled separately */ 2467 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2468 continue; 2469 2470 /* XXX handle errors */ 2471 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2472 /* XXX handle errors */ 2473 if (r) { 2474 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2475 adev->ip_blocks[i].version->funcs->name, r); 2476 return r; 2477 } 2478 2479 adev->ip_blocks[i].status.hw = false; 2480 } 2481 2482 return 0; 2483 } 2484 2485 /** 2486 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2487 * 2488 * @adev: amdgpu_device pointer 2489 * 2490 * Main suspend function for hardware IPs. The list of all the hardware 2491 * IPs that make up the asic is walked, clockgating is disabled and the 2492 * suspend callbacks are run. suspend puts the hardware and software state 2493 * in each IP into a state suitable for suspend. 2494 * Returns 0 on success, negative error code on failure. 
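 *
 * Note that phase 1 only suspends the display (DCE) hardware; this second
 * phase handles all remaining IP blocks and, on bare metal, also moves the
 * SMC into the appropriate mp1 state.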
2495 */ 2496 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2497 { 2498 int i, r; 2499 2500 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2501 if (!adev->ip_blocks[i].status.valid) 2502 continue; 2503 /* displays are handled in phase1 */ 2504 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2505 continue; 2506 /* PSP lost connection when err_event_athub occurs */ 2507 if (amdgpu_ras_intr_triggered() && 2508 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2509 adev->ip_blocks[i].status.hw = false; 2510 continue; 2511 } 2512 /* XXX handle errors */ 2513 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2514 /* XXX handle errors */ 2515 if (r) { 2516 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2517 adev->ip_blocks[i].version->funcs->name, r); 2518 } 2519 adev->ip_blocks[i].status.hw = false; 2520 /* handle putting the SMC in the appropriate state */ 2521 if(!amdgpu_sriov_vf(adev)){ 2522 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2523 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 2524 if (r) { 2525 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 2526 adev->mp1_state, r); 2527 return r; 2528 } 2529 } 2530 } 2531 adev->ip_blocks[i].status.hw = false; 2532 } 2533 2534 return 0; 2535 } 2536 2537 /** 2538 * amdgpu_device_ip_suspend - run suspend for hardware IPs 2539 * 2540 * @adev: amdgpu_device pointer 2541 * 2542 * Main suspend function for hardware IPs. The list of all the hardware 2543 * IPs that make up the asic is walked, clockgating is disabled and the 2544 * suspend callbacks are run. suspend puts the hardware and software state 2545 * in each IP into a state suitable for suspend. 2546 * Returns 0 on success, negative error code on failure. 2547 */ 2548 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 2549 { 2550 int r; 2551 2552 if (amdgpu_sriov_vf(adev)) 2553 amdgpu_virt_request_full_gpu(adev, false); 2554 2555 r = amdgpu_device_ip_suspend_phase1(adev); 2556 if (r) 2557 return r; 2558 r = amdgpu_device_ip_suspend_phase2(adev); 2559 2560 if (amdgpu_sriov_vf(adev)) 2561 amdgpu_virt_release_full_gpu(adev, false); 2562 2563 return r; 2564 } 2565 2566 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 2567 { 2568 int i, r; 2569 2570 static enum amd_ip_block_type ip_order[] = { 2571 AMD_IP_BLOCK_TYPE_GMC, 2572 AMD_IP_BLOCK_TYPE_COMMON, 2573 AMD_IP_BLOCK_TYPE_PSP, 2574 AMD_IP_BLOCK_TYPE_IH, 2575 }; 2576 2577 for (i = 0; i < adev->num_ip_blocks; i++) 2578 adev->ip_blocks[i].status.hw = false; 2579 2580 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2581 int j; 2582 struct amdgpu_ip_block *block; 2583 2584 for (j = 0; j < adev->num_ip_blocks; j++) { 2585 block = &adev->ip_blocks[j]; 2586 2587 if (block->version->type != ip_order[i] || 2588 !block->status.valid) 2589 continue; 2590 2591 r = block->version->funcs->hw_init(adev); 2592 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2593 if (r) 2594 return r; 2595 block->status.hw = true; 2596 } 2597 } 2598 2599 return 0; 2600 } 2601 2602 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 2603 { 2604 int i, r; 2605 2606 static enum amd_ip_block_type ip_order[] = { 2607 AMD_IP_BLOCK_TYPE_SMC, 2608 AMD_IP_BLOCK_TYPE_DCE, 2609 AMD_IP_BLOCK_TYPE_GFX, 2610 AMD_IP_BLOCK_TYPE_SDMA, 2611 AMD_IP_BLOCK_TYPE_UVD, 2612 AMD_IP_BLOCK_TYPE_VCE, 2613 AMD_IP_BLOCK_TYPE_VCN 2614 }; 2615 2616 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2617 int j; 2618 struct amdgpu_ip_block *block; 
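		/*
		 * Walk every IP instance matching the current type in the fixed
		 * order above; SMC is resumed rather than re-initialized, all
		 * other blocks go through hw_init again.
		 */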
2619 2620 for (j = 0; j < adev->num_ip_blocks; j++) { 2621 block = &adev->ip_blocks[j]; 2622 2623 if (block->version->type != ip_order[i] || 2624 !block->status.valid || 2625 block->status.hw) 2626 continue; 2627 2628 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 2629 r = block->version->funcs->resume(adev); 2630 else 2631 r = block->version->funcs->hw_init(adev); 2632 2633 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2634 if (r) 2635 return r; 2636 block->status.hw = true; 2637 } 2638 } 2639 2640 return 0; 2641 } 2642 2643 /** 2644 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 2645 * 2646 * @adev: amdgpu_device pointer 2647 * 2648 * First resume function for hardware IPs. The list of all the hardware 2649 * IPs that make up the asic is walked and the resume callbacks are run for 2650 * COMMON, GMC, and IH. resume puts the hardware into a functional state 2651 * after a suspend and updates the software state as necessary. This 2652 * function is also used for restoring the GPU after a GPU reset. 2653 * Returns 0 on success, negative error code on failure. 2654 */ 2655 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 2656 { 2657 int i, r; 2658 2659 for (i = 0; i < adev->num_ip_blocks; i++) { 2660 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2661 continue; 2662 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2663 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2664 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2665 2666 r = adev->ip_blocks[i].version->funcs->resume(adev); 2667 if (r) { 2668 DRM_ERROR("resume of IP block <%s> failed %d\n", 2669 adev->ip_blocks[i].version->funcs->name, r); 2670 return r; 2671 } 2672 adev->ip_blocks[i].status.hw = true; 2673 } 2674 } 2675 2676 return 0; 2677 } 2678 2679 /** 2680 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 2681 * 2682 * @adev: amdgpu_device pointer 2683 * 2684 * First resume function for hardware IPs. The list of all the hardware 2685 * IPs that make up the asic is walked and the resume callbacks are run for 2686 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 2687 * functional state after a suspend and updates the software state as 2688 * necessary. This function is also used for restoring the GPU after a GPU 2689 * reset. 2690 * Returns 0 on success, negative error code on failure. 2691 */ 2692 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 2693 { 2694 int i, r; 2695 2696 for (i = 0; i < adev->num_ip_blocks; i++) { 2697 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2698 continue; 2699 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2700 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2701 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 2702 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 2703 continue; 2704 r = adev->ip_blocks[i].version->funcs->resume(adev); 2705 if (r) { 2706 DRM_ERROR("resume of IP block <%s> failed %d\n", 2707 adev->ip_blocks[i].version->funcs->name, r); 2708 return r; 2709 } 2710 adev->ip_blocks[i].status.hw = true; 2711 } 2712 2713 return 0; 2714 } 2715 2716 /** 2717 * amdgpu_device_ip_resume - run resume for hardware IPs 2718 * 2719 * @adev: amdgpu_device pointer 2720 * 2721 * Main resume function for hardware IPs. 
The hardware IPs
 * are split into two resume functions because they are also used in
 * recovering from a GPU reset, and some additional steps need to be taken
 * between them. In this case (S3/S4) they are run sequentially.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
{
	int r;

	r = amdgpu_device_ip_resume_phase1(adev);
	if (r)
		return r;

	r = amdgpu_device_fw_loading(adev);
	if (r)
		return r;

	r = amdgpu_device_ip_resume_phase2(adev);

	return r;
}

/**
 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
 *
 * @adev: amdgpu_device pointer
 *
 * Query the VBIOS data tables to determine if the board supports SR-IOV.
 */
static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev)) {
		if (adev->is_atom_fw) {
			if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
		} else {
			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
		}

		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
	}
}

/**
 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
 *
 * @asic_type: AMD asic type
 *
 * Check if there is DC (new modesetting infrastructure) support for an asic.
 * Returns true if DC has support, false if not.
 */
bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
{
	switch (asic_type) {
#if defined(CONFIG_DRM_AMD_DC)
	case CHIP_BONAIRE:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
		/*
		 * We have systems in the wild with these ASICs that require
		 * LVDS and VGA support which is not supported with DC.
		 *
		 * Fallback to the non-DC driver here by default so as not to
		 * cause regressions.
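		 *
		 * (amdgpu_dc is the "dc" module parameter; assuming its usual
		 * default of -1/auto, these ASICs therefore stay on the legacy
		 * display path unless dc=1 is passed explicitly.)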
		 */
		return amdgpu_dc > 0;
	case CHIP_HAWAII:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_VEGA10:
	case CHIP_VEGA12:
	case CHIP_VEGA20:
#if defined(CONFIG_DRM_AMD_DC_DCN)
	case CHIP_RAVEN:
	case CHIP_NAVI10:
	case CHIP_NAVI14:
	case CHIP_NAVI12:
	case CHIP_RENOIR:
#endif
#if defined(CONFIG_DRM_AMD_DC_DCN3_0)
	case CHIP_SIENNA_CICHLID:
	case CHIP_NAVY_FLOUNDER:
#endif
		return amdgpu_dc != 0;
#endif
	default:
		if (amdgpu_dc > 0)
			DRM_INFO("Display Core has been requested via kernel parameter "
				 "but isn't supported by ASIC, ignoring\n");
		return false;
	}
}

/**
 * amdgpu_device_has_dc_support - check if dc is supported
 *
 * @adev: amdgpu_device pointer
 *
 * Returns true for supported, false for not supported
 */
bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev))
		return false;

	return amdgpu_device_asic_has_dc_support(adev->asic_type);
}


static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
{
	struct amdgpu_device *adev =
		container_of(__work, struct amdgpu_device, xgmi_reset_work);
	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);

	/* It's a bug to not have a hive within this function */
	if (WARN_ON(!hive))
		return;

	/*
	 * Use task barrier to synchronize all xgmi reset works across the
	 * hive. task_barrier_enter and task_barrier_exit will block
	 * until all the threads running the xgmi reset works reach
	 * those points. task_barrier_full will do both blocks.
	 */
	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {

		task_barrier_enter(&hive->tb);
		adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev);

		if (adev->asic_reset_res)
			goto fail;

		task_barrier_exit(&hive->tb);
		adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev);

		if (adev->asic_reset_res)
			goto fail;

		if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
			adev->mmhub.funcs->reset_ras_error_count(adev);
	} else {

		task_barrier_full(&hive->tb);
		adev->asic_reset_res = amdgpu_asic_reset(adev);
	}

fail:
	if (adev->asic_reset_res)
		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
			 adev->asic_reset_res, adev->ddev->unique);
}

static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
{
	char *input = amdgpu_lockup_timeout;
	char *timeout_setting = NULL;
	int index = 0;
	long timeout;
	int ret = 0;

	/*
	 * By default, the timeout for non-compute jobs is 10000
	 * and there is no timeout enforced on compute jobs.
	 * In SR-IOV or passthrough mode, the timeout for compute
	 * jobs is 60000 by default.
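	 *
	 * As an illustrative example (not taken from this file), a boot option
	 * such as amdgpu.lockup_timeout=10000,60000,10000,10000 would set the
	 * gfx, compute, sdma and video timeouts in that order, matching the
	 * parser below; a single value applies to all non-compute queues.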
2898 */ 2899 adev->gfx_timeout = msecs_to_jiffies(10000); 2900 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 2901 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 2902 adev->compute_timeout = msecs_to_jiffies(60000); 2903 else 2904 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; 2905 2906 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 2907 while ((timeout_setting = strsep(&input, ",")) && 2908 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 2909 ret = kstrtol(timeout_setting, 0, &timeout); 2910 if (ret) 2911 return ret; 2912 2913 if (timeout == 0) { 2914 index++; 2915 continue; 2916 } else if (timeout < 0) { 2917 timeout = MAX_SCHEDULE_TIMEOUT; 2918 } else { 2919 timeout = msecs_to_jiffies(timeout); 2920 } 2921 2922 switch (index++) { 2923 case 0: 2924 adev->gfx_timeout = timeout; 2925 break; 2926 case 1: 2927 adev->compute_timeout = timeout; 2928 break; 2929 case 2: 2930 adev->sdma_timeout = timeout; 2931 break; 2932 case 3: 2933 adev->video_timeout = timeout; 2934 break; 2935 default: 2936 break; 2937 } 2938 } 2939 /* 2940 * There is only one value specified and 2941 * it should apply to all non-compute jobs. 2942 */ 2943 if (index == 1) { 2944 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 2945 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 2946 adev->compute_timeout = adev->gfx_timeout; 2947 } 2948 } 2949 2950 return ret; 2951 } 2952 2953 static const struct attribute *amdgpu_dev_attributes[] = { 2954 &dev_attr_product_name.attr, 2955 &dev_attr_product_number.attr, 2956 &dev_attr_serial_number.attr, 2957 &dev_attr_pcie_replay_count.attr, 2958 NULL 2959 }; 2960 2961 /** 2962 * amdgpu_device_init - initialize the driver 2963 * 2964 * @adev: amdgpu_device pointer 2965 * @ddev: drm dev pointer 2966 * @pdev: pci dev pointer 2967 * @flags: driver flags 2968 * 2969 * Initializes the driver info and hw (all asics). 2970 * Returns 0 for success or an error on failure. 2971 * Called at driver startup. 
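 *
 * Broadly, the function below sets up software state (mutexes, spinlocks,
 * work items, register mappings), runs early_init/sw_init/hw_init for the IP
 * blocks, posts the card if necessary, and finishes with late init, sysfs
 * registration and the delayed IB-test work.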
2972 */ 2973 int amdgpu_device_init(struct amdgpu_device *adev, 2974 struct drm_device *ddev, 2975 struct pci_dev *pdev, 2976 uint32_t flags) 2977 { 2978 int r, i; 2979 bool boco = false; 2980 u32 max_MBps; 2981 2982 adev->shutdown = false; 2983 adev->dev = &pdev->dev; 2984 adev->ddev = ddev; 2985 adev->pdev = pdev; 2986 adev->flags = flags; 2987 2988 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 2989 adev->asic_type = amdgpu_force_asic_type; 2990 else 2991 adev->asic_type = flags & AMD_ASIC_MASK; 2992 2993 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 2994 if (amdgpu_emu_mode == 1) 2995 adev->usec_timeout *= 10; 2996 adev->gmc.gart_size = 512 * 1024 * 1024; 2997 adev->accel_working = false; 2998 adev->num_rings = 0; 2999 adev->mman.buffer_funcs = NULL; 3000 adev->mman.buffer_funcs_ring = NULL; 3001 adev->vm_manager.vm_pte_funcs = NULL; 3002 adev->vm_manager.vm_pte_num_scheds = 0; 3003 adev->gmc.gmc_funcs = NULL; 3004 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3005 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3006 3007 adev->smc_rreg = &amdgpu_invalid_rreg; 3008 adev->smc_wreg = &amdgpu_invalid_wreg; 3009 adev->pcie_rreg = &amdgpu_invalid_rreg; 3010 adev->pcie_wreg = &amdgpu_invalid_wreg; 3011 adev->pciep_rreg = &amdgpu_invalid_rreg; 3012 adev->pciep_wreg = &amdgpu_invalid_wreg; 3013 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3014 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3015 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3016 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3017 adev->didt_rreg = &amdgpu_invalid_rreg; 3018 adev->didt_wreg = &amdgpu_invalid_wreg; 3019 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3020 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3021 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3022 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3023 3024 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3025 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3026 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3027 3028 /* mutex initialization are all done here so we 3029 * can recall function without having locking issues */ 3030 atomic_set(&adev->irq.ih.lock, 0); 3031 mutex_init(&adev->firmware.mutex); 3032 mutex_init(&adev->pm.mutex); 3033 mutex_init(&adev->gfx.gpu_clock_mutex); 3034 mutex_init(&adev->srbm_mutex); 3035 mutex_init(&adev->gfx.pipe_reserve_mutex); 3036 mutex_init(&adev->gfx.gfx_off_mutex); 3037 mutex_init(&adev->grbm_idx_mutex); 3038 mutex_init(&adev->mn_lock); 3039 mutex_init(&adev->virt.vf_errors.lock); 3040 hash_init(adev->mn_hash); 3041 mutex_init(&adev->lock_reset); 3042 mutex_init(&adev->psp.mutex); 3043 mutex_init(&adev->notifier_lock); 3044 3045 r = amdgpu_device_check_arguments(adev); 3046 if (r) 3047 return r; 3048 3049 spin_lock_init(&adev->mmio_idx_lock); 3050 spin_lock_init(&adev->smc_idx_lock); 3051 spin_lock_init(&adev->pcie_idx_lock); 3052 spin_lock_init(&adev->uvd_ctx_idx_lock); 3053 spin_lock_init(&adev->didt_idx_lock); 3054 spin_lock_init(&adev->gc_cac_idx_lock); 3055 spin_lock_init(&adev->se_cac_idx_lock); 3056 spin_lock_init(&adev->audio_endpt_idx_lock); 3057 spin_lock_init(&adev->mm_stats.lock); 3058 3059 INIT_LIST_HEAD(&adev->shadow_list); 3060 mutex_init(&adev->shadow_list_lock); 3061 3062 INIT_DELAYED_WORK(&adev->delayed_init_work, 3063 amdgpu_device_delayed_init_work_handler); 3064 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3065 amdgpu_device_delay_enable_gfx_off); 3066 3067 
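	/*
	 * xgmi_reset_work runs amdgpu_device_xgmi_reset_func() above, which
	 * performs this device's share of a hive-wide reset.
	 */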
	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);

	adev->gfx.gfx_off_req_count = 1;
	adev->pm.ac_power = power_supply_is_system_supplied() > 0;

	atomic_set(&adev->throttling_logging_enabled, 1);
	/*
	 * If throttling continues, logging will be performed every minute
	 * to avoid log flooding. "-1" is subtracted since the thermal
	 * throttling interrupt comes every second. Thus, the total logging
	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
	 * for throttling interrupt) = 60 seconds.
	 */
	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);

	/* Registers mapping */
	/* TODO: block userspace mapping of io register */
	if (adev->asic_type >= CHIP_BONAIRE) {
		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
	} else {
		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
	}

	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
	if (adev->rmmio == NULL) {
		return -ENOMEM;
	}
	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
	DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);

	/* io port mapping */
	for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
		if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
			adev->rio_mem_size = pci_resource_len(adev->pdev, i);
			adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
			break;
		}
	}
	if (adev->rio_mem == NULL)
		DRM_INFO("PCI I/O BAR is not found.\n");

	/* enable PCIE atomic ops */
	r = pci_enable_atomic_ops_to_root(adev->pdev,
					  PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
					  PCI_EXP_DEVCAP2_ATOMIC_COMP64);
	if (r) {
		adev->have_atomics_support = false;
		DRM_INFO("PCIE atomic ops are not supported\n");
	} else {
		adev->have_atomics_support = true;
	}

	amdgpu_device_get_pcie_info(adev);

	if (amdgpu_mcbp)
		DRM_INFO("MCBP is enabled\n");

	if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
		adev->enable_mes = true;

	/* detect hw virtualization here */
	amdgpu_detect_virtualization(adev);

	r = amdgpu_device_get_job_timeout_settings(adev);
	if (r) {
		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
		return r;
	}

	/* early init functions */
	r = amdgpu_device_ip_early_init(adev);
	if (r)
		return r;

	/* doorbell bar mapping and doorbell index init*/
	amdgpu_device_doorbell_init(adev);

	/* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
	/* this will fail for cards that aren't VGA class devices, just
	 * ignore it */
	vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);

	if (amdgpu_device_supports_boco(ddev))
		boco = true;
	if (amdgpu_has_atpx() &&
	    (amdgpu_is_atpx_hybrid() ||
	     amdgpu_has_atpx_dgpu_power_cntl()) &&
	    !pci_is_thunderbolt_attached(adev->pdev))
		vga_switcheroo_register_client(adev->pdev,
					       &amdgpu_switcheroo_ops, boco);
	if (boco)
		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);

	if (amdgpu_emu_mode == 1) {
		/* post the asic on emulation mode */
		emu_soc_asic_init(adev);
		goto fence_driver_init;
	}
3168 3169 /* detect if we are with an SRIOV vbios */ 3170 amdgpu_device_detect_sriov_bios(adev); 3171 3172 /* check if we need to reset the asic 3173 * E.g., driver was not cleanly unloaded previously, etc. 3174 */ 3175 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3176 r = amdgpu_asic_reset(adev); 3177 if (r) { 3178 dev_err(adev->dev, "asic reset on init failed\n"); 3179 goto failed; 3180 } 3181 } 3182 3183 /* Post card if necessary */ 3184 if (amdgpu_device_need_post(adev)) { 3185 if (!adev->bios) { 3186 dev_err(adev->dev, "no vBIOS found\n"); 3187 r = -EINVAL; 3188 goto failed; 3189 } 3190 DRM_INFO("GPU posting now...\n"); 3191 r = amdgpu_atom_asic_init(adev->mode_info.atom_context); 3192 if (r) { 3193 dev_err(adev->dev, "gpu post error!\n"); 3194 goto failed; 3195 } 3196 } 3197 3198 if (adev->is_atom_fw) { 3199 /* Initialize clocks */ 3200 r = amdgpu_atomfirmware_get_clock_info(adev); 3201 if (r) { 3202 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3203 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3204 goto failed; 3205 } 3206 } else { 3207 /* Initialize clocks */ 3208 r = amdgpu_atombios_get_clock_info(adev); 3209 if (r) { 3210 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3211 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3212 goto failed; 3213 } 3214 /* init i2c buses */ 3215 if (!amdgpu_device_has_dc_support(adev)) 3216 amdgpu_atombios_i2c_init(adev); 3217 } 3218 3219 fence_driver_init: 3220 /* Fence driver */ 3221 r = amdgpu_fence_driver_init(adev); 3222 if (r) { 3223 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n"); 3224 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3225 goto failed; 3226 } 3227 3228 /* init the mode config */ 3229 drm_mode_config_init(adev->ddev); 3230 3231 r = amdgpu_device_ip_init(adev); 3232 if (r) { 3233 /* failed in exclusive mode due to timeout */ 3234 if (amdgpu_sriov_vf(adev) && 3235 !amdgpu_sriov_runtime(adev) && 3236 amdgpu_virt_mmio_blocked(adev) && 3237 !amdgpu_virt_wait_reset(adev)) { 3238 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3239 /* Don't send request since VF is inactive. */ 3240 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3241 adev->virt.ops = NULL; 3242 r = -EAGAIN; 3243 goto failed; 3244 } 3245 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3246 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3247 goto failed; 3248 } 3249 3250 dev_info(adev->dev, 3251 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3252 adev->gfx.config.max_shader_engines, 3253 adev->gfx.config.max_sh_per_se, 3254 adev->gfx.config.max_cu_per_sh, 3255 adev->gfx.cu_info.number); 3256 3257 adev->accel_working = true; 3258 3259 amdgpu_vm_check_compute_bug(adev); 3260 3261 /* Initialize the buffer migration limit. */ 3262 if (amdgpu_moverate >= 0) 3263 max_MBps = amdgpu_moverate; 3264 else 3265 max_MBps = 8; /* Allow 8 MB/s. */ 3266 /* Get a log2 for easy divisions. 
*/ 3267 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3268 3269 amdgpu_fbdev_init(adev); 3270 3271 r = amdgpu_pm_sysfs_init(adev); 3272 if (r) { 3273 adev->pm_sysfs_en = false; 3274 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3275 } else 3276 adev->pm_sysfs_en = true; 3277 3278 r = amdgpu_ucode_sysfs_init(adev); 3279 if (r) { 3280 adev->ucode_sysfs_en = false; 3281 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3282 } else 3283 adev->ucode_sysfs_en = true; 3284 3285 if ((amdgpu_testing & 1)) { 3286 if (adev->accel_working) 3287 amdgpu_test_moves(adev); 3288 else 3289 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n"); 3290 } 3291 if (amdgpu_benchmarking) { 3292 if (adev->accel_working) 3293 amdgpu_benchmark(adev, amdgpu_benchmarking); 3294 else 3295 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); 3296 } 3297 3298 /* 3299 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3300 * Otherwise the mgpu fan boost feature will be skipped due to the 3301 * gpu instance is counted less. 3302 */ 3303 amdgpu_register_gpu_instance(adev); 3304 3305 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3306 * explicit gating rather than handling it automatically. 3307 */ 3308 r = amdgpu_device_ip_late_init(adev); 3309 if (r) { 3310 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3311 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3312 goto failed; 3313 } 3314 3315 /* must succeed. */ 3316 amdgpu_ras_resume(adev); 3317 3318 queue_delayed_work(system_wq, &adev->delayed_init_work, 3319 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3320 3321 if (amdgpu_sriov_vf(adev)) 3322 flush_delayed_work(&adev->delayed_init_work); 3323 3324 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3325 if (r) { 3326 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3327 return r; 3328 } 3329 3330 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3331 r = amdgpu_pmu_init(adev); 3332 if (r) 3333 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3334 3335 return 0; 3336 3337 failed: 3338 amdgpu_vf_error_trans_all(adev); 3339 if (boco) 3340 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3341 3342 return r; 3343 } 3344 3345 /** 3346 * amdgpu_device_fini - tear down the driver 3347 * 3348 * @adev: amdgpu_device pointer 3349 * 3350 * Tear down the driver info (all asics). 3351 * Called at driver shutdown. 
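 *
 * This largely mirrors amdgpu_device_init() in reverse: pending work is
 * flushed, interrupts are disabled, the IP blocks are finalized, and the
 * MMIO, I/O and doorbell mappings are released.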
3352 */ 3353 void amdgpu_device_fini(struct amdgpu_device *adev) 3354 { 3355 int r; 3356 3357 DRM_INFO("amdgpu: finishing device.\n"); 3358 flush_delayed_work(&adev->delayed_init_work); 3359 adev->shutdown = true; 3360 3361 /* make sure IB test finished before entering exclusive mode 3362 * to avoid preemption on IB test 3363 * */ 3364 if (amdgpu_sriov_vf(adev)) 3365 amdgpu_virt_request_full_gpu(adev, false); 3366 3367 /* disable all interrupts */ 3368 amdgpu_irq_disable_all(adev); 3369 if (adev->mode_info.mode_config_initialized){ 3370 if (!amdgpu_device_has_dc_support(adev)) 3371 drm_helper_force_disable_all(adev->ddev); 3372 else 3373 drm_atomic_helper_shutdown(adev->ddev); 3374 } 3375 amdgpu_fence_driver_fini(adev); 3376 if (adev->pm_sysfs_en) 3377 amdgpu_pm_sysfs_fini(adev); 3378 amdgpu_fbdev_fini(adev); 3379 r = amdgpu_device_ip_fini(adev); 3380 release_firmware(adev->firmware.gpu_info_fw); 3381 adev->firmware.gpu_info_fw = NULL; 3382 adev->accel_working = false; 3383 /* free i2c buses */ 3384 if (!amdgpu_device_has_dc_support(adev)) 3385 amdgpu_i2c_fini(adev); 3386 3387 if (amdgpu_emu_mode != 1) 3388 amdgpu_atombios_fini(adev); 3389 3390 kfree(adev->bios); 3391 adev->bios = NULL; 3392 if (amdgpu_has_atpx() && 3393 (amdgpu_is_atpx_hybrid() || 3394 amdgpu_has_atpx_dgpu_power_cntl()) && 3395 !pci_is_thunderbolt_attached(adev->pdev)) 3396 vga_switcheroo_unregister_client(adev->pdev); 3397 if (amdgpu_device_supports_boco(adev->ddev)) 3398 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3399 vga_client_register(adev->pdev, NULL, NULL, NULL); 3400 if (adev->rio_mem) 3401 pci_iounmap(adev->pdev, adev->rio_mem); 3402 adev->rio_mem = NULL; 3403 iounmap(adev->rmmio); 3404 adev->rmmio = NULL; 3405 amdgpu_device_doorbell_fini(adev); 3406 3407 if (adev->ucode_sysfs_en) 3408 amdgpu_ucode_sysfs_fini(adev); 3409 3410 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 3411 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3412 amdgpu_pmu_fini(adev); 3413 if (adev->discovery_bin) 3414 amdgpu_discovery_fini(adev); 3415 } 3416 3417 3418 /* 3419 * Suspend & resume. 3420 */ 3421 /** 3422 * amdgpu_device_suspend - initiate device suspend 3423 * 3424 * @dev: drm dev pointer 3425 * @fbcon : notify the fbdev of suspend 3426 * 3427 * Puts the hw in the suspend state (all asics). 3428 * Returns 0 for success or an error on failure. 3429 * Called at driver suspend. 
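 *
 * The sequence below unpins the framebuffers and cursors (non-DC path),
 * runs suspend phase 1 for the display hardware, evicts VRAM, suspends the
 * fence driver, runs suspend phase 2, and then evicts VRAM once more so the
 * GART page table is also copied out by the CPU.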
3430 */ 3431 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 3432 { 3433 struct amdgpu_device *adev; 3434 struct drm_crtc *crtc; 3435 struct drm_connector *connector; 3436 struct drm_connector_list_iter iter; 3437 int r; 3438 3439 if (dev == NULL || dev->dev_private == NULL) { 3440 return -ENODEV; 3441 } 3442 3443 adev = dev->dev_private; 3444 3445 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3446 return 0; 3447 3448 adev->in_suspend = true; 3449 drm_kms_helper_poll_disable(dev); 3450 3451 if (fbcon) 3452 amdgpu_fbdev_set_suspend(adev, 1); 3453 3454 cancel_delayed_work_sync(&adev->delayed_init_work); 3455 3456 if (!amdgpu_device_has_dc_support(adev)) { 3457 /* turn off display hw */ 3458 drm_modeset_lock_all(dev); 3459 drm_connector_list_iter_begin(dev, &iter); 3460 drm_for_each_connector_iter(connector, &iter) 3461 drm_helper_connector_dpms(connector, 3462 DRM_MODE_DPMS_OFF); 3463 drm_connector_list_iter_end(&iter); 3464 drm_modeset_unlock_all(dev); 3465 /* unpin the front buffers and cursors */ 3466 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3467 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3468 struct drm_framebuffer *fb = crtc->primary->fb; 3469 struct amdgpu_bo *robj; 3470 3471 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3472 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3473 r = amdgpu_bo_reserve(aobj, true); 3474 if (r == 0) { 3475 amdgpu_bo_unpin(aobj); 3476 amdgpu_bo_unreserve(aobj); 3477 } 3478 } 3479 3480 if (fb == NULL || fb->obj[0] == NULL) { 3481 continue; 3482 } 3483 robj = gem_to_amdgpu_bo(fb->obj[0]); 3484 /* don't unpin kernel fb objects */ 3485 if (!amdgpu_fbdev_robj_is_fb(adev, robj)) { 3486 r = amdgpu_bo_reserve(robj, true); 3487 if (r == 0) { 3488 amdgpu_bo_unpin(robj); 3489 amdgpu_bo_unreserve(robj); 3490 } 3491 } 3492 } 3493 } 3494 3495 amdgpu_ras_suspend(adev); 3496 3497 r = amdgpu_device_ip_suspend_phase1(adev); 3498 3499 amdgpu_amdkfd_suspend(adev, !fbcon); 3500 3501 /* evict vram memory */ 3502 amdgpu_bo_evict_vram(adev); 3503 3504 amdgpu_fence_driver_suspend(adev); 3505 3506 r = amdgpu_device_ip_suspend_phase2(adev); 3507 3508 /* evict remaining vram memory 3509 * This second call to evict vram is to evict the gart page table 3510 * using the CPU. 3511 */ 3512 amdgpu_bo_evict_vram(adev); 3513 3514 return 0; 3515 } 3516 3517 /** 3518 * amdgpu_device_resume - initiate device resume 3519 * 3520 * @dev: drm dev pointer 3521 * @fbcon : notify the fbdev of resume 3522 * 3523 * Bring the hw back to operating state (all asics). 3524 * Returns 0 for success or an error on failure. 3525 * Called at driver resume. 
3526 */ 3527 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 3528 { 3529 struct drm_connector *connector; 3530 struct drm_connector_list_iter iter; 3531 struct amdgpu_device *adev = dev->dev_private; 3532 struct drm_crtc *crtc; 3533 int r = 0; 3534 3535 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3536 return 0; 3537 3538 /* post card */ 3539 if (amdgpu_device_need_post(adev)) { 3540 r = amdgpu_atom_asic_init(adev->mode_info.atom_context); 3541 if (r) 3542 DRM_ERROR("amdgpu asic init failed\n"); 3543 } 3544 3545 r = amdgpu_device_ip_resume(adev); 3546 if (r) { 3547 DRM_ERROR("amdgpu_device_ip_resume failed (%d).\n", r); 3548 return r; 3549 } 3550 amdgpu_fence_driver_resume(adev); 3551 3552 3553 r = amdgpu_device_ip_late_init(adev); 3554 if (r) 3555 return r; 3556 3557 queue_delayed_work(system_wq, &adev->delayed_init_work, 3558 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3559 3560 if (!amdgpu_device_has_dc_support(adev)) { 3561 /* pin cursors */ 3562 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3563 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3564 3565 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3566 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3567 r = amdgpu_bo_reserve(aobj, true); 3568 if (r == 0) { 3569 r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM); 3570 if (r != 0) 3571 DRM_ERROR("Failed to pin cursor BO (%d)\n", r); 3572 amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj); 3573 amdgpu_bo_unreserve(aobj); 3574 } 3575 } 3576 } 3577 } 3578 r = amdgpu_amdkfd_resume(adev, !fbcon); 3579 if (r) 3580 return r; 3581 3582 /* Make sure IB tests flushed */ 3583 flush_delayed_work(&adev->delayed_init_work); 3584 3585 /* blat the mode back in */ 3586 if (fbcon) { 3587 if (!amdgpu_device_has_dc_support(adev)) { 3588 /* pre DCE11 */ 3589 drm_helper_resume_force_mode(dev); 3590 3591 /* turn on display hw */ 3592 drm_modeset_lock_all(dev); 3593 3594 drm_connector_list_iter_begin(dev, &iter); 3595 drm_for_each_connector_iter(connector, &iter) 3596 drm_helper_connector_dpms(connector, 3597 DRM_MODE_DPMS_ON); 3598 drm_connector_list_iter_end(&iter); 3599 3600 drm_modeset_unlock_all(dev); 3601 } 3602 amdgpu_fbdev_set_suspend(adev, 0); 3603 } 3604 3605 drm_kms_helper_poll_enable(dev); 3606 3607 amdgpu_ras_resume(adev); 3608 3609 /* 3610 * Most of the connector probing functions try to acquire runtime pm 3611 * refs to ensure that the GPU is powered on when connector polling is 3612 * performed. Since we're calling this from a runtime PM callback, 3613 * trying to acquire rpm refs will cause us to deadlock. 3614 * 3615 * Since we're guaranteed to be holding the rpm lock, it's safe to 3616 * temporarily disable the rpm helpers so this doesn't deadlock us. 3617 */ 3618 #ifdef CONFIG_PM 3619 dev->dev->power.disable_depth++; 3620 #endif 3621 if (!amdgpu_device_has_dc_support(adev)) 3622 drm_helper_hpd_irq_event(dev); 3623 else 3624 drm_kms_helper_hotplug_event(dev); 3625 #ifdef CONFIG_PM 3626 dev->dev->power.disable_depth--; 3627 #endif 3628 adev->in_suspend = false; 3629 3630 return 0; 3631 } 3632 3633 /** 3634 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 3635 * 3636 * @adev: amdgpu_device pointer 3637 * 3638 * The list of all the hardware IPs that make up the asic is walked and 3639 * the check_soft_reset callbacks are run. check_soft_reset determines 3640 * if the asic is still hung or not. 3641 * Returns true if any of the IPs are still in a hung state, false if not. 
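 *
 * For SR-IOV VFs, and when the ASIC reports that it needs a full reset, this
 * returns true without consulting the per-IP check_soft_reset callbacks.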
3642 */ 3643 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 3644 { 3645 int i; 3646 bool asic_hang = false; 3647 3648 if (amdgpu_sriov_vf(adev)) 3649 return true; 3650 3651 if (amdgpu_asic_need_full_reset(adev)) 3652 return true; 3653 3654 for (i = 0; i < adev->num_ip_blocks; i++) { 3655 if (!adev->ip_blocks[i].status.valid) 3656 continue; 3657 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 3658 adev->ip_blocks[i].status.hang = 3659 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 3660 if (adev->ip_blocks[i].status.hang) { 3661 DRM_INFO("IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 3662 asic_hang = true; 3663 } 3664 } 3665 return asic_hang; 3666 } 3667 3668 /** 3669 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 3670 * 3671 * @adev: amdgpu_device pointer 3672 * 3673 * The list of all the hardware IPs that make up the asic is walked and the 3674 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 3675 * handles any IP specific hardware or software state changes that are 3676 * necessary for a soft reset to succeed. 3677 * Returns 0 on success, negative error code on failure. 3678 */ 3679 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 3680 { 3681 int i, r = 0; 3682 3683 for (i = 0; i < adev->num_ip_blocks; i++) { 3684 if (!adev->ip_blocks[i].status.valid) 3685 continue; 3686 if (adev->ip_blocks[i].status.hang && 3687 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 3688 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 3689 if (r) 3690 return r; 3691 } 3692 } 3693 3694 return 0; 3695 } 3696 3697 /** 3698 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 3699 * 3700 * @adev: amdgpu_device pointer 3701 * 3702 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 3703 * reset is necessary to recover. 3704 * Returns true if a full asic reset is required, false if not. 3705 */ 3706 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 3707 { 3708 int i; 3709 3710 if (amdgpu_asic_need_full_reset(adev)) 3711 return true; 3712 3713 for (i = 0; i < adev->num_ip_blocks; i++) { 3714 if (!adev->ip_blocks[i].status.valid) 3715 continue; 3716 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 3717 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 3718 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 3719 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 3720 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3721 if (adev->ip_blocks[i].status.hang) { 3722 DRM_INFO("Some block need full reset!\n"); 3723 return true; 3724 } 3725 } 3726 } 3727 return false; 3728 } 3729 3730 /** 3731 * amdgpu_device_ip_soft_reset - do a soft reset 3732 * 3733 * @adev: amdgpu_device pointer 3734 * 3735 * The list of all the hardware IPs that make up the asic is walked and the 3736 * soft_reset callbacks are run if the block is hung. soft_reset handles any 3737 * IP specific hardware or software state changes that are necessary to soft 3738 * reset the IP. 3739 * Returns 0 on success, negative error code on failure. 
3740 */ 3741 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 3742 { 3743 int i, r = 0; 3744 3745 for (i = 0; i < adev->num_ip_blocks; i++) { 3746 if (!adev->ip_blocks[i].status.valid) 3747 continue; 3748 if (adev->ip_blocks[i].status.hang && 3749 adev->ip_blocks[i].version->funcs->soft_reset) { 3750 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 3751 if (r) 3752 return r; 3753 } 3754 } 3755 3756 return 0; 3757 } 3758 3759 /** 3760 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 3761 * 3762 * @adev: amdgpu_device pointer 3763 * 3764 * The list of all the hardware IPs that make up the asic is walked and the 3765 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 3766 * handles any IP specific hardware or software state changes that are 3767 * necessary after the IP has been soft reset. 3768 * Returns 0 on success, negative error code on failure. 3769 */ 3770 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 3771 { 3772 int i, r = 0; 3773 3774 for (i = 0; i < adev->num_ip_blocks; i++) { 3775 if (!adev->ip_blocks[i].status.valid) 3776 continue; 3777 if (adev->ip_blocks[i].status.hang && 3778 adev->ip_blocks[i].version->funcs->post_soft_reset) 3779 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 3780 if (r) 3781 return r; 3782 } 3783 3784 return 0; 3785 } 3786 3787 /** 3788 * amdgpu_device_recover_vram - Recover some VRAM contents 3789 * 3790 * @adev: amdgpu_device pointer 3791 * 3792 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 3793 * restore things like GPUVM page tables after a GPU reset where 3794 * the contents of VRAM might be lost. 3795 * 3796 * Returns: 3797 * 0 on success, negative error code on failure. 3798 */ 3799 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 3800 { 3801 struct dma_fence *fence = NULL, *next = NULL; 3802 struct amdgpu_bo *shadow; 3803 long r = 1, tmo; 3804 3805 if (amdgpu_sriov_runtime(adev)) 3806 tmo = msecs_to_jiffies(8000); 3807 else 3808 tmo = msecs_to_jiffies(100); 3809 3810 DRM_INFO("recover vram bo from shadow start\n"); 3811 mutex_lock(&adev->shadow_list_lock); 3812 list_for_each_entry(shadow, &adev->shadow_list, shadow_list) { 3813 3814 /* No need to recover an evicted BO */ 3815 if (shadow->tbo.mem.mem_type != TTM_PL_TT || 3816 shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET || 3817 shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM) 3818 continue; 3819 3820 r = amdgpu_bo_restore_shadow(shadow, &next); 3821 if (r) 3822 break; 3823 3824 if (fence) { 3825 tmo = dma_fence_wait_timeout(fence, false, tmo); 3826 dma_fence_put(fence); 3827 fence = next; 3828 if (tmo == 0) { 3829 r = -ETIMEDOUT; 3830 break; 3831 } else if (tmo < 0) { 3832 r = tmo; 3833 break; 3834 } 3835 } else { 3836 fence = next; 3837 } 3838 } 3839 mutex_unlock(&adev->shadow_list_lock); 3840 3841 if (fence) 3842 tmo = dma_fence_wait_timeout(fence, false, tmo); 3843 dma_fence_put(fence); 3844 3845 if (r < 0 || tmo <= 0) { 3846 DRM_ERROR("recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 3847 return -EIO; 3848 } 3849 3850 DRM_INFO("recover vram bo from shadow done\n"); 3851 return 0; 3852 } 3853 3854 3855 /** 3856 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 3857 * 3858 * @adev: amdgpu device pointer 3859 * @from_hypervisor: request from hypervisor 3860 * 3861 * do VF FLR and reinitialize Asic 3862 * return 0 means succeeded otherwise failed 3863 */ 3864 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 
/**
 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
 *
 * @adev: amdgpu device pointer
 * @from_hypervisor: request from hypervisor
 *
 * Do a VF FLR and reinitialize the ASIC.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
				     bool from_hypervisor)
{
	int r;

	if (from_hypervisor)
		r = amdgpu_virt_request_full_gpu(adev, true);
	else
		r = amdgpu_virt_reset_gpu(adev);
	if (r)
		return r;

	amdgpu_amdkfd_pre_reset(adev);

	/* Resume IP prior to SMC */
	r = amdgpu_device_ip_reinit_early_sriov(adev);
	if (r)
		goto error;

	amdgpu_virt_init_data_exchange(adev);
	/* we need to recover the GART prior to resuming SMC/CP/SDMA */
	amdgpu_gtt_mgr_recover(&adev->mman.bdev.man[TTM_PL_TT]);

	r = amdgpu_device_fw_loading(adev);
	if (r)
		return r;

	/* now we are okay to resume SMC/CP/SDMA */
	r = amdgpu_device_ip_reinit_late_sriov(adev);
	if (r)
		goto error;

	amdgpu_irq_gpu_reset_resume_helper(adev);
	r = amdgpu_ib_ring_tests(adev);
	amdgpu_amdkfd_post_reset(adev);

error:
	amdgpu_virt_release_full_gpu(adev, true);
	if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
		amdgpu_inc_vram_lost(adev);
		r = amdgpu_device_recover_vram(adev);
	}

	return r;
}

/**
 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
 *
 * @adev: amdgpu device pointer
 *
 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
 * a hung GPU.
 */
bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
{
	if (!amdgpu_device_ip_check_soft_reset(adev)) {
		DRM_INFO("Timeout, but no hardware hang detected.\n");
		return false;
	}

	if (amdgpu_gpu_recovery == 0)
		goto disabled;

	if (amdgpu_sriov_vf(adev))
		return true;

	if (amdgpu_gpu_recovery == -1) {
		switch (adev->asic_type) {
		case CHIP_BONAIRE:
		case CHIP_HAWAII:
		case CHIP_TOPAZ:
		case CHIP_TONGA:
		case CHIP_FIJI:
		case CHIP_POLARIS10:
		case CHIP_POLARIS11:
		case CHIP_POLARIS12:
		case CHIP_VEGAM:
		case CHIP_VEGA20:
		case CHIP_VEGA10:
		case CHIP_VEGA12:
		case CHIP_RAVEN:
		case CHIP_ARCTURUS:
		case CHIP_RENOIR:
		case CHIP_NAVI10:
		case CHIP_NAVI14:
		case CHIP_NAVI12:
		case CHIP_SIENNA_CICHLID:
			break;
		default:
			goto disabled;
		}
	}

	return true;

disabled:
	DRM_INFO("GPU recovery disabled.\n");
	return false;
}
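
/*
 * As the checks above show, amdgpu_gpu_recovery acts as a tri-state knob:
 * 0 disables recovery entirely, -1 ("auto") enables it only for the ASICs
 * listed in the switch, and any other value enables it unconditionally.
 * It is presumably set via the amdgpu.gpu_recovery module parameter, e.g.
 * amdgpu.gpu_recovery=1 on the kernel command line (illustrative).  SR-IOV
 * VFs always attempt recovery regardless of the parameter.
 */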

static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
					struct amdgpu_job *job,
					bool *need_full_reset_arg)
{
	int i, r = 0;
	bool need_full_reset = *need_full_reset_arg;

	amdgpu_debugfs_wait_dump(adev);

	/* block all schedulers and reset given job's ring */
	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
		amdgpu_fence_driver_force_completion(ring);
	}

	if (job)
		drm_sched_increase_karma(&job->base);

	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
	if (!amdgpu_sriov_vf(adev)) {

		if (!need_full_reset)
			need_full_reset = amdgpu_device_ip_need_full_reset(adev);

		if (!need_full_reset) {
			amdgpu_device_ip_pre_soft_reset(adev);
			r = amdgpu_device_ip_soft_reset(adev);
			amdgpu_device_ip_post_soft_reset(adev);
			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
				DRM_INFO("soft reset failed, will fallback to full reset!\n");
				need_full_reset = true;
			}
		}

		if (need_full_reset)
			r = amdgpu_device_ip_suspend(adev);

		*need_full_reset_arg = need_full_reset;
	}

	return r;
}
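
/*
 * amdgpu_do_asic_reset() performs the reset that amdgpu_device_pre_asic_reset()
 * prepared for.  For XGMI hives the full ASIC resets are kicked off in
 * parallel (one work item per node) and then waited on, because the firmware
 * expects all hive nodes to reset within a narrow window (about 1 second, per
 * the comment below) so that the links can renegotiate.  Rough shape of that
 * sequence (illustrative only):
 *
 *	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head)
 *		queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work);
 *	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head)
 *		flush_work(&tmp_adev->xgmi_reset_work);
 */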

static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
				struct list_head *device_list_handle,
				bool *need_full_reset_arg)
{
	struct amdgpu_device *tmp_adev = NULL;
	bool need_full_reset = *need_full_reset_arg, vram_lost = false;
	int r = 0;

	/*
	 * ASIC reset has to be done on all XGMI hive nodes ASAP
	 * to allow proper links negotiation in FW (within 1 sec)
	 */
	if (need_full_reset) {
		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
			/* For XGMI run all resets in parallel to speed up the process */
			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
					r = -EALREADY;
			} else
				r = amdgpu_asic_reset(tmp_adev);

			if (r) {
				DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
					  r, tmp_adev->ddev->unique);
				break;
			}
		}

		/* For XGMI wait for all resets to complete before proceed */
		if (!r) {
			list_for_each_entry(tmp_adev, device_list_handle,
					    gmc.xgmi.head) {
				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
					flush_work(&tmp_adev->xgmi_reset_work);
					r = tmp_adev->asic_reset_res;
					if (r)
						break;
				}
			}
		}
	}

	if (!r && amdgpu_ras_intr_triggered()) {
		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
			if (tmp_adev->mmhub.funcs &&
			    tmp_adev->mmhub.funcs->reset_ras_error_count)
				tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
		}

		amdgpu_ras_intr_cleared();
	}

	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		if (need_full_reset) {
			/* post card */
			if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context))
				DRM_WARN("asic atom init failed!");

			if (!r) {
				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
				r = amdgpu_device_ip_resume_phase1(tmp_adev);
				if (r)
					goto out;

				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
				if (vram_lost) {
					DRM_INFO("VRAM is lost due to GPU reset!\n");
					amdgpu_inc_vram_lost(tmp_adev);
				}

				r = amdgpu_gtt_mgr_recover(
					&tmp_adev->mman.bdev.man[TTM_PL_TT]);
				if (r)
					goto out;

				r = amdgpu_device_fw_loading(tmp_adev);
				if (r)
					return r;

				r = amdgpu_device_ip_resume_phase2(tmp_adev);
				if (r)
					goto out;

				if (vram_lost)
					amdgpu_device_fill_reset_magic(tmp_adev);

				/*
				 * Add this ASIC as tracked again, as the reset
				 * has already completed successfully.
				 */
				amdgpu_register_gpu_instance(tmp_adev);

				r = amdgpu_device_ip_late_init(tmp_adev);
				if (r)
					goto out;

				amdgpu_fbdev_set_suspend(tmp_adev, 0);

				/* must succeed. */
				amdgpu_ras_resume(tmp_adev);

				/* Update PSP FW topology after reset */
				if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
					r = amdgpu_xgmi_update_topology(hive, tmp_adev);
			}
		}

out:
		if (!r) {
			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
			r = amdgpu_ib_ring_tests(tmp_adev);
			if (r) {
				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
				r = amdgpu_device_ip_suspend(tmp_adev);
				need_full_reset = true;
				r = -EAGAIN;
				goto end;
			}
		}

		if (!r)
			r = amdgpu_device_recover_vram(tmp_adev);
		else
			tmp_adev->asic_reset_res = r;
	}

end:
	*need_full_reset_arg = need_full_reset;
	return r;
}

static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
{
	if (trylock) {
		if (!mutex_trylock(&adev->lock_reset))
			return false;
	} else
		mutex_lock(&adev->lock_reset);

	atomic_inc(&adev->gpu_reset_counter);
	adev->in_gpu_reset = true;
	switch (amdgpu_asic_reset_method(adev)) {
	case AMD_RESET_METHOD_MODE1:
		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
		break;
	case AMD_RESET_METHOD_MODE2:
		adev->mp1_state = PP_MP1_STATE_RESET;
		break;
	default:
		adev->mp1_state = PP_MP1_STATE_NONE;
		break;
	}

	return true;
}

static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
{
	amdgpu_vf_error_trans_all(adev);
	adev->mp1_state = PP_MP1_STATE_NONE;
	adev->in_gpu_reset = false;
	mutex_unlock(&adev->lock_reset);
}

static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
{
	struct pci_dev *p = NULL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
			adev->pdev->bus->number, 1);
	if (p) {
		pm_runtime_enable(&(p->dev));
		pm_runtime_resume(&(p->dev));
	}
}

static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
{
	enum amd_reset_method reset_method;
	struct pci_dev *p = NULL;
	u64 expires;

	/*
	 * For now, only BACO and mode1 reset are confirmed to suffer
	 * from the audio issue if the audio device is not properly
	 * suspended beforehand.
	 */
	reset_method = amdgpu_asic_reset_method(adev);
	if ((reset_method != AMD_RESET_METHOD_BACO) &&
	    (reset_method != AMD_RESET_METHOD_MODE1))
		return -EINVAL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
			adev->pdev->bus->number, 1);
	if (!p)
		return -ENODEV;

	expires = pm_runtime_autosuspend_expiration(&(p->dev));
	if (!expires)
		/*
		 * If we cannot get the audio device autosuspend delay,
		 * fall back to a fixed 4s interval.  The audio controller's
		 * default autosuspend delay is 3s, so 4s is guaranteed to
		 * cover it.
		 */
		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;

	while (!pm_runtime_status_suspended(&(p->dev))) {
		if (!pm_runtime_suspend(&(p->dev)))
			break;

		if (expires < ktime_get_mono_fast_ns()) {
			dev_warn(adev->dev, "failed to suspend display audio\n");
			/* TODO: abort the succeeding gpu reset? */
			return -ETIMEDOUT;
		}
	}

	pm_runtime_disable(&(p->dev));

	return 0;
}
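
/*
 * The helpers above are used in pairs by amdgpu_device_gpu_recover() below:
 * each device in the reset list is locked with amdgpu_device_lock_adev() and
 * has its HDA controller runtime-suspended with
 * amdgpu_device_suspend_display_audio() before the reset, then the audio
 * device is resumed and the adev unlocked once scheduling has been
 * restarted.  Rough per-device shape (illustrative only):
 *
 *	amdgpu_device_lock_adev(tmp_adev, !hive);
 *	if (!amdgpu_device_suspend_display_audio(tmp_adev))
 *		audio_suspended = true;
 *	... reset and resume ...
 *	if (audio_suspended)
 *		amdgpu_device_resume_display_audio(tmp_adev);
 *	amdgpu_device_unlock_adev(tmp_adev);
 */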

/**
 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
 *
 * @adev: amdgpu device pointer
 * @job: which job triggered the hang
 *
 * Attempt to reset the GPU if it has hung (all asics).
 * Attempt to do soft-reset or full-reset and reinitialize the ASIC.
 * Returns 0 for success or an error on failure.
 */

int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
			      struct amdgpu_job *job)
{
	struct list_head device_list, *device_list_handle = NULL;
	bool need_full_reset = false;
	bool job_signaled = false;
	struct amdgpu_hive_info *hive = NULL;
	struct amdgpu_device *tmp_adev = NULL;
	int i, r = 0;
	bool need_emergency_restart = false;
	bool audio_suspended = false;

	/*
	 * Special case: RAS triggered and full reset isn't supported
	 */
	need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);

	/*
	 * Flush RAM to disk so that after reboot
	 * the user can read the log and see why the system rebooted.
	 */
	if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
		DRM_WARN("Emergency reboot.");

		ksys_sync_helper();
		emergency_restart();
	}

	dev_info(adev->dev, "GPU %s begin!\n",
		need_emergency_restart ? "jobs stop" : "reset");

	/*
	 * We trylock here to avoid a chain of resets, executing either from
	 * jobs on different adevs in an XGMI hive or from jobs on different
	 * schedulers for the same device, while this TO handler is running.
	 * We always reset all schedulers for a device and all devices for an
	 * XGMI hive, so that should take care of them too.
	 */
	hive = amdgpu_get_xgmi_hive(adev, true);
	if (hive && !mutex_trylock(&hive->reset_lock)) {
		DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
			  job ? job->base.id : -1, hive->hive_id);
		mutex_unlock(&hive->hive_lock);
		return 0;
	}

	/*
	 * Build the list of devices to reset.
	 * In XGMI hive mode, reorder the device list
	 * to put adev in the 1st position.
	 */
	INIT_LIST_HEAD(&device_list);
	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (!hive)
			return -ENODEV;
		if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
			list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
		device_list_handle = &hive->device_list;
	} else {
		list_add_tail(&adev->gmc.xgmi.head, &device_list);
		device_list_handle = &device_list;
	}
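
	/*
	 * Per-device preparation: everything in the loop below has to happen
	 * before any hardware reset is attempted - locking the adev,
	 * quiescing the audio codec, disabling RAS error queries, letting
	 * KFD know a reset is coming, unregistering the GPU instance and
	 * stopping the DRM schedulers so no new jobs reach the hardware.
	 */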
	/* block all schedulers and reset given job's ring */
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		if (!amdgpu_device_lock_adev(tmp_adev, !hive)) {
			DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
				  job ? job->base.id : -1);
			mutex_unlock(&hive->hive_lock);
			return 0;
		}

		/*
		 * Try to put the audio codec into suspend state
		 * before gpu reset started.
		 *
		 * The power domain of the graphics device is shared with
		 * the AZ power domain.  Without this, we may change the
		 * audio hardware from behind the audio driver's back and
		 * trigger audio codec errors.
		 */
		if (!amdgpu_device_suspend_display_audio(tmp_adev))
			audio_suspended = true;

		amdgpu_ras_set_error_query_ready(tmp_adev, false);

		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);

		if (!amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_pre_reset(tmp_adev);

		/*
		 * Mark these ASICs to be reset as untracked first,
		 * and add them back after the reset has completed.
		 */
		amdgpu_unregister_gpu_instance(tmp_adev);

		amdgpu_fbdev_set_suspend(tmp_adev, 1);

		/* disable ras on ALL IPs */
		if (!need_emergency_restart &&
		      amdgpu_device_ip_need_full_reset(tmp_adev))
			amdgpu_ras_suspend(tmp_adev);

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, job ? &job->base : NULL);

			if (need_emergency_restart)
				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
		}
	}

	if (need_emergency_restart)
		goto skip_sched_resume;

	/*
	 * Must check guilty signal here since after this point all old
	 * HW fences are force signaled.
	 *
	 * job->base holds a reference to parent fence
	 */
	if (job && job->base.s_fence->parent &&
	    dma_fence_is_signaled(job->base.s_fence->parent)) {
		job_signaled = true;
		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
		goto skip_hw_reset;
	}

retry:	/* Rest of adevs pre asic reset from XGMI hive. */
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		r = amdgpu_device_pre_asic_reset(tmp_adev,
						 NULL,
						 &need_full_reset);
		/* TODO Should we stop? */
		if (r) {
			DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
				  r, tmp_adev->ddev->unique);
			tmp_adev->asic_reset_res = r;
		}
	}

	/* Actual ASIC resets if needed. */
	/* TODO Implement XGMI hive reset logic for SRIOV */
	if (amdgpu_sriov_vf(adev)) {
		r = amdgpu_device_reset_sriov(adev, job ? false : true);
		if (r)
			adev->asic_reset_res = r;
	} else {
		r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset);
		if (r && r == -EAGAIN)
			goto retry;
	}

skip_hw_reset:

	/* Post ASIC reset for all devs. */
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			/* No point in resubmitting jobs if we didn't HW reset */
			if (!tmp_adev->asic_reset_res && !job_signaled)
				drm_sched_resubmit_jobs(&ring->sched);

			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
		}

		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
			drm_helper_resume_force_mode(tmp_adev->ddev);
		}

		tmp_adev->asic_reset_res = 0;

		if (r) {
			/* bad news, how to tell it to userspace? */
			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
		}
	}
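
	/*
	 * skip_sched_resume is also the landing point for the emergency
	 * restart path above: scheduler restart is skipped, but KFD still
	 * has to be unblocked, the audio device resumed and the per-device
	 * reset locks dropped for every device in the list.
	 */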
skip_sched_resume:
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		/* unlock kfd: SRIOV would do it separately */
		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);
		if (audio_suspended)
			amdgpu_device_resume_display_audio(tmp_adev);
		amdgpu_device_unlock_adev(tmp_adev);
	}

	if (hive) {
		mutex_unlock(&hive->reset_lock);
		mutex_unlock(&hive->hive_lock);
	}

	if (r)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
	return r;
}

/**
 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIe capabilities (gen speed
 * and lanes) of the slot the device is in.  Handles APUs and
 * virtualized environments where PCIe config space may not be available.
 */
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
{
	struct pci_dev *pdev;
	enum pci_bus_speed speed_cap, platform_speed_cap;
	enum pcie_link_width platform_link_width;

	if (amdgpu_pcie_gen_cap)
		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;

	if (amdgpu_pcie_lane_cap)
		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;

	/* covers APUs as well */
	if (pci_is_root_bus(adev->pdev->bus)) {
		if (adev->pm.pcie_gen_mask == 0)
			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
		if (adev->pm.pcie_mlw_mask == 0)
			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
		return;
	}

	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
		return;

	pcie_bandwidth_available(adev->pdev, NULL,
				 &platform_speed_cap, &platform_link_width);

	if (adev->pm.pcie_gen_mask == 0) {
		/* asic caps */
		pdev = adev->pdev;
		speed_cap = pcie_get_speed_cap(pdev);
		if (speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
		} else {
			if (speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
		}
		/* platform caps */
		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
		} else {
			if (platform_speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;

		}
	}
	if (adev->pm.pcie_mlw_mask == 0) {
		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
		} else {
			switch (platform_link_width) {
			case PCIE_LNK_X32:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X16:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X12:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X8:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X4:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X2:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X1:
				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
				break;
			default:
				break;
			}
		}
	}
}
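
/*
 * amdgpu_device_baco_enter()/amdgpu_device_baco_exit() below are meant to be
 * used as a pair around a BACO (Bus Active, Chip Off) power-down, with
 * doorbell interrupts disabled for the duration when RAS is supported.
 * A hypothetical caller, e.g. in the runtime PM path, would look roughly
 * like this (illustrative sketch only, error handling elided):
 *
 *	r = amdgpu_device_baco_enter(drm_dev);
 *	if (r)
 *		return r;
 *	... device sits in BACO ...
 *	r = amdgpu_device_baco_exit(drm_dev);
 */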
int amdgpu_device_baco_enter(struct drm_device *dev)
{
	struct amdgpu_device *adev = dev->dev_private;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!amdgpu_device_supports_baco(adev->ddev))
		return -ENOTSUPP;

	if (ras && ras->supported)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

	return amdgpu_dpm_baco_enter(adev);
}

int amdgpu_device_baco_exit(struct drm_device *dev)
{
	struct amdgpu_device *adev = dev->dev_private;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	if (!amdgpu_device_supports_baco(adev->ddev))
		return -ENOTSUPP;

	ret = amdgpu_dpm_baco_exit(adev);
	if (ret)
		return ret;

	if (ras && ras->supported)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

	return 0;
}