1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 34 #include <drm/drm_atomic_helper.h> 35 #include <drm/drm_probe_helper.h> 36 #include <drm/amdgpu_drm.h> 37 #include <linux/vgaarb.h> 38 #include <linux/vga_switcheroo.h> 39 #include <linux/efi.h> 40 #include "amdgpu.h" 41 #include "amdgpu_trace.h" 42 #include "amdgpu_i2c.h" 43 #include "atom.h" 44 #include "amdgpu_atombios.h" 45 #include "amdgpu_atomfirmware.h" 46 #include "amd_pcie.h" 47 #ifdef CONFIG_DRM_AMDGPU_SI 48 #include "si.h" 49 #endif 50 #ifdef CONFIG_DRM_AMDGPU_CIK 51 #include "cik.h" 52 #endif 53 #include "vi.h" 54 #include "soc15.h" 55 #include "nv.h" 56 #include "bif/bif_4_1_d.h" 57 #include <linux/pci.h> 58 #include <linux/firmware.h> 59 #include "amdgpu_vf_error.h" 60 61 #include "amdgpu_amdkfd.h" 62 #include "amdgpu_pm.h" 63 64 #include "amdgpu_xgmi.h" 65 #include "amdgpu_ras.h" 66 #include "amdgpu_pmu.h" 67 #include "amdgpu_fru_eeprom.h" 68 69 #include <linux/suspend.h> 70 #include <drm/task_barrier.h> 71 #include <linux/pm_runtime.h> 72 73 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 74 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 75 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 76 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 77 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 78 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 79 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin"); 80 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin"); 81 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin"); 82 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 83 MODULE_FIRMWARE("amdgpu/sienna_cichlid_gpu_info.bin"); 84 MODULE_FIRMWARE("amdgpu/navy_flounder_gpu_info.bin"); 85 86 #define AMDGPU_RESUME_MS 2000 87 88 const char *amdgpu_asic_name[] = { 89 "TAHITI", 90 "PITCAIRN", 91 "VERDE", 92 "OLAND", 93 "HAINAN", 94 "BONAIRE", 95 "KAVERI", 96 "KABINI", 97 "HAWAII", 98 "MULLINS", 99 "TOPAZ", 100 "TONGA", 101 "FIJI", 102 "CARRIZO", 103 "STONEY", 104 "POLARIS10", 105 "POLARIS11", 106 "POLARIS12", 107 "VEGAM", 108 "VEGA10", 109 "VEGA12", 110 "VEGA20", 111 "RAVEN", 112 "ARCTURUS", 113 "RENOIR", 114 "NAVI10", 115 "NAVI14", 116 "NAVI12", 117 "SIENNA_CICHLID", 118 "NAVY_FLOUNDER", 
119 "LAST", 120 }; 121 122 /** 123 * DOC: pcie_replay_count 124 * 125 * The amdgpu driver provides a sysfs API for reporting the total number 126 * of PCIe replays (NAKs) 127 * The file pcie_replay_count is used for this and returns the total 128 * number of replays as a sum of the NAKs generated and NAKs received 129 */ 130 131 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 132 struct device_attribute *attr, char *buf) 133 { 134 struct drm_device *ddev = dev_get_drvdata(dev); 135 struct amdgpu_device *adev = ddev->dev_private; 136 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 137 138 return snprintf(buf, PAGE_SIZE, "%llu\n", cnt); 139 } 140 141 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 142 amdgpu_device_get_pcie_replay_count, NULL); 143 144 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 145 146 /** 147 * DOC: product_name 148 * 149 * The amdgpu driver provides a sysfs API for reporting the product name 150 * for the device 151 * The file serial_number is used for this and returns the product name 152 * as returned from the FRU. 153 * NOTE: This is only available for certain server cards 154 */ 155 156 static ssize_t amdgpu_device_get_product_name(struct device *dev, 157 struct device_attribute *attr, char *buf) 158 { 159 struct drm_device *ddev = dev_get_drvdata(dev); 160 struct amdgpu_device *adev = ddev->dev_private; 161 162 return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name); 163 } 164 165 static DEVICE_ATTR(product_name, S_IRUGO, 166 amdgpu_device_get_product_name, NULL); 167 168 /** 169 * DOC: product_number 170 * 171 * The amdgpu driver provides a sysfs API for reporting the part number 172 * for the device 173 * The file serial_number is used for this and returns the part number 174 * as returned from the FRU. 175 * NOTE: This is only available for certain server cards 176 */ 177 178 static ssize_t amdgpu_device_get_product_number(struct device *dev, 179 struct device_attribute *attr, char *buf) 180 { 181 struct drm_device *ddev = dev_get_drvdata(dev); 182 struct amdgpu_device *adev = ddev->dev_private; 183 184 return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number); 185 } 186 187 static DEVICE_ATTR(product_number, S_IRUGO, 188 amdgpu_device_get_product_number, NULL); 189 190 /** 191 * DOC: serial_number 192 * 193 * The amdgpu driver provides a sysfs API for reporting the serial number 194 * for the device 195 * The file serial_number is used for this and returns the serial number 196 * as returned from the FRU. 197 * NOTE: This is only available for certain server cards 198 */ 199 200 static ssize_t amdgpu_device_get_serial_number(struct device *dev, 201 struct device_attribute *attr, char *buf) 202 { 203 struct drm_device *ddev = dev_get_drvdata(dev); 204 struct amdgpu_device *adev = ddev->dev_private; 205 206 return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial); 207 } 208 209 static DEVICE_ATTR(serial_number, S_IRUGO, 210 amdgpu_device_get_serial_number, NULL); 211 212 /** 213 * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control 214 * 215 * @dev: drm_device pointer 216 * 217 * Returns true if the device is a dGPU with HG/PX power control, 218 * otherwise return false. 
219 */ 220 bool amdgpu_device_supports_boco(struct drm_device *dev) 221 { 222 struct amdgpu_device *adev = dev->dev_private; 223 224 if (adev->flags & AMD_IS_PX) 225 return true; 226 return false; 227 } 228 229 /** 230 * amdgpu_device_supports_baco - Does the device support BACO 231 * 232 * @dev: drm_device pointer 233 * 234 * Returns true if the device supporte BACO, 235 * otherwise return false. 236 */ 237 bool amdgpu_device_supports_baco(struct drm_device *dev) 238 { 239 struct amdgpu_device *adev = dev->dev_private; 240 241 return amdgpu_asic_supports_baco(adev); 242 } 243 244 /** 245 * VRAM access helper functions. 246 * 247 * amdgpu_device_vram_access - read/write a buffer in vram 248 * 249 * @adev: amdgpu_device pointer 250 * @pos: offset of the buffer in vram 251 * @buf: virtual address of the buffer in system memory 252 * @size: read/write size, sizeof(@buf) must > @size 253 * @write: true - write to vram, otherwise - read from vram 254 */ 255 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, 256 uint32_t *buf, size_t size, bool write) 257 { 258 unsigned long flags; 259 uint32_t hi = ~0; 260 uint64_t last; 261 262 263 #ifdef CONFIG_64BIT 264 last = min(pos + size, adev->gmc.visible_vram_size); 265 if (last > pos) { 266 void __iomem *addr = adev->mman.aper_base_kaddr + pos; 267 size_t count = last - pos; 268 269 if (write) { 270 memcpy_toio(addr, buf, count); 271 mb(); 272 amdgpu_asic_flush_hdp(adev, NULL); 273 } else { 274 amdgpu_asic_invalidate_hdp(adev, NULL); 275 mb(); 276 memcpy_fromio(buf, addr, count); 277 } 278 279 if (count == size) 280 return; 281 282 pos += count; 283 buf += count / 4; 284 size -= count; 285 } 286 #endif 287 288 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 289 for (last = pos + size; pos < last; pos += 4) { 290 uint32_t tmp = pos >> 31; 291 292 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); 293 if (tmp != hi) { 294 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp); 295 hi = tmp; 296 } 297 if (write) 298 WREG32_NO_KIQ(mmMM_DATA, *buf++); 299 else 300 *buf++ = RREG32_NO_KIQ(mmMM_DATA); 301 } 302 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 303 } 304 305 /* 306 * MMIO register access helper functions. 307 */ 308 /** 309 * amdgpu_mm_rreg - read a memory mapped IO register 310 * 311 * @adev: amdgpu_device pointer 312 * @reg: dword aligned register offset 313 * @acc_flags: access flags which require special behavior 314 * 315 * Returns the 32 bit value from the offset specified. 316 */ 317 uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg, 318 uint32_t acc_flags) 319 { 320 uint32_t ret; 321 322 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)) 323 return amdgpu_kiq_rreg(adev, reg); 324 325 if ((reg * 4) < adev->rmmio_size) 326 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 327 else { 328 unsigned long flags; 329 330 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 331 writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4)); 332 ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4)); 333 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 334 } 335 trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret); 336 return ret; 337 } 338 339 /* 340 * MMIO register read with bytes helper functions 341 * @offset:bytes offset from MMIO start 342 * 343 */ 344 345 /** 346 * amdgpu_mm_rreg8 - read a memory mapped IO register 347 * 348 * @adev: amdgpu_device pointer 349 * @offset: byte aligned register offset 350 * 351 * Returns the 8 bit value from the offset specified. 
352 */ 353 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) { 354 if (offset < adev->rmmio_size) 355 return (readb(adev->rmmio + offset)); 356 BUG(); 357 } 358 359 /* 360 * MMIO register write with bytes helper functions 361 * @offset:bytes offset from MMIO start 362 * @value: the value want to be written to the register 363 * 364 */ 365 /** 366 * amdgpu_mm_wreg8 - read a memory mapped IO register 367 * 368 * @adev: amdgpu_device pointer 369 * @offset: byte aligned register offset 370 * @value: 8 bit value to write 371 * 372 * Writes the value specified to the offset specified. 373 */ 374 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) { 375 if (offset < adev->rmmio_size) 376 writeb(value, adev->rmmio + offset); 377 else 378 BUG(); 379 } 380 381 void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags) 382 { 383 trace_amdgpu_mm_wreg(adev->pdev->device, reg, v); 384 385 if ((reg * 4) < adev->rmmio_size) 386 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 387 else { 388 unsigned long flags; 389 390 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 391 writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4)); 392 writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4)); 393 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 394 } 395 } 396 397 /** 398 * amdgpu_mm_wreg - write to a memory mapped IO register 399 * 400 * @adev: amdgpu_device pointer 401 * @reg: dword aligned register offset 402 * @v: 32 bit value to write to the register 403 * @acc_flags: access flags which require special behavior 404 * 405 * Writes the value specified to the offset specified. 406 */ 407 void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, 408 uint32_t acc_flags) 409 { 410 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)) 411 return amdgpu_kiq_wreg(adev, reg, v); 412 413 amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags); 414 } 415 416 /* 417 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range 418 * 419 * this function is invoked only the debugfs register access 420 * */ 421 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v, 422 uint32_t acc_flags) 423 { 424 if (amdgpu_sriov_fullaccess(adev) && 425 adev->gfx.rlc.funcs && 426 adev->gfx.rlc.funcs->is_rlcg_access_range) { 427 428 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 429 return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v); 430 } 431 432 amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags); 433 } 434 435 /** 436 * amdgpu_io_rreg - read an IO register 437 * 438 * @adev: amdgpu_device pointer 439 * @reg: dword aligned register offset 440 * 441 * Returns the 32 bit value from the offset specified. 442 */ 443 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg) 444 { 445 if ((reg * 4) < adev->rio_mem_size) 446 return ioread32(adev->rio_mem + (reg * 4)); 447 else { 448 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4)); 449 return ioread32(adev->rio_mem + (mmMM_DATA * 4)); 450 } 451 } 452 453 /** 454 * amdgpu_io_wreg - write to an IO register 455 * 456 * @adev: amdgpu_device pointer 457 * @reg: dword aligned register offset 458 * @v: 32 bit value to write to the register 459 * 460 * Writes the value specified to the offset specified. 
461 */ 462 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v) 463 { 464 if ((reg * 4) < adev->rio_mem_size) 465 iowrite32(v, adev->rio_mem + (reg * 4)); 466 else { 467 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4)); 468 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4)); 469 } 470 } 471 472 /** 473 * amdgpu_mm_rdoorbell - read a doorbell dword 474 * 475 * @adev: amdgpu_device pointer 476 * @index: doorbell index 477 * 478 * Returns the value in the doorbell aperture at the 479 * requested doorbell index (CIK). 480 */ 481 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) 482 { 483 if (index < adev->doorbell.num_doorbells) { 484 return readl(adev->doorbell.ptr + index); 485 } else { 486 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 487 return 0; 488 } 489 } 490 491 /** 492 * amdgpu_mm_wdoorbell - write a doorbell dword 493 * 494 * @adev: amdgpu_device pointer 495 * @index: doorbell index 496 * @v: value to write 497 * 498 * Writes @v to the doorbell aperture at the 499 * requested doorbell index (CIK). 500 */ 501 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) 502 { 503 if (index < adev->doorbell.num_doorbells) { 504 writel(v, adev->doorbell.ptr + index); 505 } else { 506 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 507 } 508 } 509 510 /** 511 * amdgpu_mm_rdoorbell64 - read a doorbell Qword 512 * 513 * @adev: amdgpu_device pointer 514 * @index: doorbell index 515 * 516 * Returns the value in the doorbell aperture at the 517 * requested doorbell index (VEGA10+). 518 */ 519 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) 520 { 521 if (index < adev->doorbell.num_doorbells) { 522 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); 523 } else { 524 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 525 return 0; 526 } 527 } 528 529 /** 530 * amdgpu_mm_wdoorbell64 - write a doorbell Qword 531 * 532 * @adev: amdgpu_device pointer 533 * @index: doorbell index 534 * @v: value to write 535 * 536 * Writes @v to the doorbell aperture at the 537 * requested doorbell index (VEGA10+). 538 */ 539 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) 540 { 541 if (index < adev->doorbell.num_doorbells) { 542 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); 543 } else { 544 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 545 } 546 } 547 548 /** 549 * amdgpu_invalid_rreg - dummy reg read function 550 * 551 * @adev: amdgpu device pointer 552 * @reg: offset of register 553 * 554 * Dummy register read function. Used for register blocks 555 * that certain asics don't have (all asics). 556 * Returns the value in the register. 557 */ 558 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 559 { 560 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 561 BUG(); 562 return 0; 563 } 564 565 /** 566 * amdgpu_invalid_wreg - dummy reg write function 567 * 568 * @adev: amdgpu device pointer 569 * @reg: offset of register 570 * @v: value to write to the register 571 * 572 * Dummy register read function. Used for register blocks 573 * that certain asics don't have (all asics). 
574 */ 575 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 576 { 577 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 578 reg, v); 579 BUG(); 580 } 581 582 /** 583 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 584 * 585 * @adev: amdgpu device pointer 586 * @reg: offset of register 587 * 588 * Dummy register read function. Used for register blocks 589 * that certain asics don't have (all asics). 590 * Returns the value in the register. 591 */ 592 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 593 { 594 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 595 BUG(); 596 return 0; 597 } 598 599 /** 600 * amdgpu_invalid_wreg64 - dummy reg write function 601 * 602 * @adev: amdgpu device pointer 603 * @reg: offset of register 604 * @v: value to write to the register 605 * 606 * Dummy register read function. Used for register blocks 607 * that certain asics don't have (all asics). 608 */ 609 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 610 { 611 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 612 reg, v); 613 BUG(); 614 } 615 616 /** 617 * amdgpu_block_invalid_rreg - dummy reg read function 618 * 619 * @adev: amdgpu device pointer 620 * @block: offset of instance 621 * @reg: offset of register 622 * 623 * Dummy register read function. Used for register blocks 624 * that certain asics don't have (all asics). 625 * Returns the value in the register. 626 */ 627 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 628 uint32_t block, uint32_t reg) 629 { 630 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 631 reg, block); 632 BUG(); 633 return 0; 634 } 635 636 /** 637 * amdgpu_block_invalid_wreg - dummy reg write function 638 * 639 * @adev: amdgpu device pointer 640 * @block: offset of instance 641 * @reg: offset of register 642 * @v: value to write to the register 643 * 644 * Dummy register read function. Used for register blocks 645 * that certain asics don't have (all asics). 646 */ 647 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 648 uint32_t block, 649 uint32_t reg, uint32_t v) 650 { 651 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 652 reg, block, v); 653 BUG(); 654 } 655 656 /** 657 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page 658 * 659 * @adev: amdgpu device pointer 660 * 661 * Allocates a scratch page of VRAM for use by various things in the 662 * driver. 663 */ 664 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev) 665 { 666 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, 667 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM, 668 &adev->vram_scratch.robj, 669 &adev->vram_scratch.gpu_addr, 670 (void **)&adev->vram_scratch.ptr); 671 } 672 673 /** 674 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page 675 * 676 * @adev: amdgpu device pointer 677 * 678 * Frees the VRAM scratch page. 679 */ 680 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev) 681 { 682 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL); 683 } 684 685 /** 686 * amdgpu_device_program_register_sequence - program an array of registers. 687 * 688 * @adev: amdgpu_device pointer 689 * @registers: pointer to the register array 690 * @array_size: size of the register array 691 * 692 * Programs an array or registers with and and or masks. 
693 * This is a helper for setting golden registers. 694 */ 695 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 696 const u32 *registers, 697 const u32 array_size) 698 { 699 u32 tmp, reg, and_mask, or_mask; 700 int i; 701 702 if (array_size % 3) 703 return; 704 705 for (i = 0; i < array_size; i +=3) { 706 reg = registers[i + 0]; 707 and_mask = registers[i + 1]; 708 or_mask = registers[i + 2]; 709 710 if (and_mask == 0xffffffff) { 711 tmp = or_mask; 712 } else { 713 tmp = RREG32(reg); 714 tmp &= ~and_mask; 715 if (adev->family >= AMDGPU_FAMILY_AI) 716 tmp |= (or_mask & and_mask); 717 else 718 tmp |= or_mask; 719 } 720 WREG32(reg, tmp); 721 } 722 } 723 724 /** 725 * amdgpu_device_pci_config_reset - reset the GPU 726 * 727 * @adev: amdgpu_device pointer 728 * 729 * Resets the GPU using the pci config reset sequence. 730 * Only applicable to asics prior to vega10. 731 */ 732 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 733 { 734 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 735 } 736 737 /* 738 * GPU doorbell aperture helpers function. 739 */ 740 /** 741 * amdgpu_device_doorbell_init - Init doorbell driver information. 742 * 743 * @adev: amdgpu_device pointer 744 * 745 * Init doorbell driver information (CIK) 746 * Returns 0 on success, error on failure. 747 */ 748 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) 749 { 750 751 /* No doorbell on SI hardware generation */ 752 if (adev->asic_type < CHIP_BONAIRE) { 753 adev->doorbell.base = 0; 754 adev->doorbell.size = 0; 755 adev->doorbell.num_doorbells = 0; 756 adev->doorbell.ptr = NULL; 757 return 0; 758 } 759 760 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) 761 return -EINVAL; 762 763 amdgpu_asic_init_doorbell_index(adev); 764 765 /* doorbell bar mapping */ 766 adev->doorbell.base = pci_resource_start(adev->pdev, 2); 767 adev->doorbell.size = pci_resource_len(adev->pdev, 2); 768 769 adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32), 770 adev->doorbell_index.max_assignment+1); 771 if (adev->doorbell.num_doorbells == 0) 772 return -EINVAL; 773 774 /* For Vega, reserve and map two pages on doorbell BAR since SDMA 775 * paging queue doorbell use the second page. The 776 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the 777 * doorbells are in the first page. So with paging queue enabled, 778 * the max num_doorbells should + 1 page (0x400 in dword) 779 */ 780 if (adev->asic_type >= CHIP_VEGA10) 781 adev->doorbell.num_doorbells += 0x400; 782 783 adev->doorbell.ptr = ioremap(adev->doorbell.base, 784 adev->doorbell.num_doorbells * 785 sizeof(u32)); 786 if (adev->doorbell.ptr == NULL) 787 return -ENOMEM; 788 789 return 0; 790 } 791 792 /** 793 * amdgpu_device_doorbell_fini - Tear down doorbell driver information. 794 * 795 * @adev: amdgpu_device pointer 796 * 797 * Tear down doorbell driver information (CIK) 798 */ 799 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) 800 { 801 iounmap(adev->doorbell.ptr); 802 adev->doorbell.ptr = NULL; 803 } 804 805 806 807 /* 808 * amdgpu_device_wb_*() 809 * Writeback is the method by which the GPU updates special pages in memory 810 * with the status of certain GPU events (fences, ring pointers,etc.). 811 */ 812 813 /** 814 * amdgpu_device_wb_fini - Disable Writeback and free memory 815 * 816 * @adev: amdgpu_device pointer 817 * 818 * Disables Writeback and frees the Writeback memory (all asics). 819 * Used at driver shutdown. 
820 */ 821 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 822 { 823 if (adev->wb.wb_obj) { 824 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 825 &adev->wb.gpu_addr, 826 (void **)&adev->wb.wb); 827 adev->wb.wb_obj = NULL; 828 } 829 } 830 831 /** 832 * amdgpu_device_wb_init- Init Writeback driver info and allocate memory 833 * 834 * @adev: amdgpu_device pointer 835 * 836 * Initializes writeback and allocates writeback memory (all asics). 837 * Used at driver startup. 838 * Returns 0 on success or an -error on failure. 839 */ 840 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 841 { 842 int r; 843 844 if (adev->wb.wb_obj == NULL) { 845 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 846 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 847 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 848 &adev->wb.wb_obj, &adev->wb.gpu_addr, 849 (void **)&adev->wb.wb); 850 if (r) { 851 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 852 return r; 853 } 854 855 adev->wb.num_wb = AMDGPU_MAX_WB; 856 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 857 858 /* clear wb memory */ 859 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 860 } 861 862 return 0; 863 } 864 865 /** 866 * amdgpu_device_wb_get - Allocate a wb entry 867 * 868 * @adev: amdgpu_device pointer 869 * @wb: wb index 870 * 871 * Allocate a wb slot for use by the driver (all asics). 872 * Returns 0 on success or -EINVAL on failure. 873 */ 874 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 875 { 876 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 877 878 if (offset < adev->wb.num_wb) { 879 __set_bit(offset, adev->wb.used); 880 *wb = offset << 3; /* convert to dw offset */ 881 return 0; 882 } else { 883 return -EINVAL; 884 } 885 } 886 887 /** 888 * amdgpu_device_wb_free - Free a wb entry 889 * 890 * @adev: amdgpu_device pointer 891 * @wb: wb index 892 * 893 * Free a wb slot allocated for use by the driver (all asics) 894 */ 895 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 896 { 897 wb >>= 3; 898 if (wb < adev->wb.num_wb) 899 __clear_bit(wb, adev->wb.used); 900 } 901 902 /** 903 * amdgpu_device_resize_fb_bar - try to resize FB BAR 904 * 905 * @adev: amdgpu_device pointer 906 * 907 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 908 * to fail, but if any of the BARs is not accessible after the size we abort 909 * driver loading by returning -ENODEV. 
910 */ 911 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 912 { 913 u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size); 914 u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1; 915 struct pci_bus *root; 916 struct resource *res; 917 unsigned i; 918 u16 cmd; 919 int r; 920 921 /* Bypass for VF */ 922 if (amdgpu_sriov_vf(adev)) 923 return 0; 924 925 /* skip if the bios has already enabled large BAR */ 926 if (adev->gmc.real_vram_size && 927 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 928 return 0; 929 930 /* Check if the root BUS has 64bit memory resources */ 931 root = adev->pdev->bus; 932 while (root->parent) 933 root = root->parent; 934 935 pci_bus_for_each_resource(root, res, i) { 936 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 937 res->start > 0x100000000ull) 938 break; 939 } 940 941 /* Trying to resize is pointless without a root hub window above 4GB */ 942 if (!res) 943 return 0; 944 945 /* Disable memory decoding while we change the BAR addresses and size */ 946 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 947 pci_write_config_word(adev->pdev, PCI_COMMAND, 948 cmd & ~PCI_COMMAND_MEMORY); 949 950 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 951 amdgpu_device_doorbell_fini(adev); 952 if (adev->asic_type >= CHIP_BONAIRE) 953 pci_release_resource(adev->pdev, 2); 954 955 pci_release_resource(adev->pdev, 0); 956 957 r = pci_resize_resource(adev->pdev, 0, rbar_size); 958 if (r == -ENOSPC) 959 DRM_INFO("Not enough PCI address space for a large BAR."); 960 else if (r && r != -ENOTSUPP) 961 DRM_ERROR("Problem resizing BAR0 (%d).", r); 962 963 pci_assign_unassigned_bus_resources(adev->pdev->bus); 964 965 /* When the doorbell or fb BAR isn't available we have no chance of 966 * using the device. 967 */ 968 r = amdgpu_device_doorbell_init(adev); 969 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 970 return -ENODEV; 971 972 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 973 974 return 0; 975 } 976 977 /* 978 * GPU helpers function. 979 */ 980 /** 981 * amdgpu_device_need_post - check if the hw need post or not 982 * 983 * @adev: amdgpu_device pointer 984 * 985 * Check if the asic has been initialized (all asics) at driver startup 986 * or post is needed if hw reset is performed. 987 * Returns true if need or false if not. 
988 */ 989 bool amdgpu_device_need_post(struct amdgpu_device *adev) 990 { 991 uint32_t reg; 992 993 if (amdgpu_sriov_vf(adev)) 994 return false; 995 996 if (amdgpu_passthrough(adev)) { 997 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 998 * some old smc fw still need driver do vPost otherwise gpu hang, while 999 * those smc fw version above 22.15 doesn't have this flaw, so we force 1000 * vpost executed for smc version below 22.15 1001 */ 1002 if (adev->asic_type == CHIP_FIJI) { 1003 int err; 1004 uint32_t fw_ver; 1005 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1006 /* force vPost if error occured */ 1007 if (err) 1008 return true; 1009 1010 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1011 if (fw_ver < 0x00160e00) 1012 return true; 1013 } 1014 } 1015 1016 if (adev->has_hw_reset) { 1017 adev->has_hw_reset = false; 1018 return true; 1019 } 1020 1021 /* bios scratch used on CIK+ */ 1022 if (adev->asic_type >= CHIP_BONAIRE) 1023 return amdgpu_atombios_scratch_need_asic_init(adev); 1024 1025 /* check MEM_SIZE for older asics */ 1026 reg = amdgpu_asic_get_config_memsize(adev); 1027 1028 if ((reg != 0) && (reg != 0xffffffff)) 1029 return false; 1030 1031 return true; 1032 } 1033 1034 /* if we get transitioned to only one device, take VGA back */ 1035 /** 1036 * amdgpu_device_vga_set_decode - enable/disable vga decode 1037 * 1038 * @cookie: amdgpu_device pointer 1039 * @state: enable/disable vga decode 1040 * 1041 * Enable/disable vga decode (all asics). 1042 * Returns VGA resource flags. 1043 */ 1044 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state) 1045 { 1046 struct amdgpu_device *adev = cookie; 1047 amdgpu_asic_set_vga_state(adev, state); 1048 if (state) 1049 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1050 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1051 else 1052 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1053 } 1054 1055 /** 1056 * amdgpu_device_check_block_size - validate the vm block size 1057 * 1058 * @adev: amdgpu_device pointer 1059 * 1060 * Validates the vm block size specified via module parameter. 1061 * The vm block size defines number of bits in page table versus page directory, 1062 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1063 * page table and the remaining bits are in the page directory. 1064 */ 1065 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1066 { 1067 /* defines number of bits in page table versus page directory, 1068 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1069 * page table and the remaining bits are in the page directory */ 1070 if (amdgpu_vm_block_size == -1) 1071 return; 1072 1073 if (amdgpu_vm_block_size < 9) { 1074 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1075 amdgpu_vm_block_size); 1076 amdgpu_vm_block_size = -1; 1077 } 1078 } 1079 1080 /** 1081 * amdgpu_device_check_vm_size - validate the vm size 1082 * 1083 * @adev: amdgpu_device pointer 1084 * 1085 * Validates the vm size in GB specified via module parameter. 1086 * The VM size is the size of the GPU virtual memory space in GB. 
1087 */ 1088 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1089 { 1090 /* no need to check the default value */ 1091 if (amdgpu_vm_size == -1) 1092 return; 1093 1094 if (amdgpu_vm_size < 1) { 1095 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1096 amdgpu_vm_size); 1097 amdgpu_vm_size = -1; 1098 } 1099 } 1100 1101 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1102 { 1103 struct sysinfo si; 1104 bool is_os_64 = (sizeof(void *) == 8); 1105 uint64_t total_memory; 1106 uint64_t dram_size_seven_GB = 0x1B8000000; 1107 uint64_t dram_size_three_GB = 0xB8000000; 1108 1109 if (amdgpu_smu_memory_pool_size == 0) 1110 return; 1111 1112 if (!is_os_64) { 1113 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1114 goto def_value; 1115 } 1116 si_meminfo(&si); 1117 total_memory = (uint64_t)si.totalram * si.mem_unit; 1118 1119 if ((amdgpu_smu_memory_pool_size == 1) || 1120 (amdgpu_smu_memory_pool_size == 2)) { 1121 if (total_memory < dram_size_three_GB) 1122 goto def_value1; 1123 } else if ((amdgpu_smu_memory_pool_size == 4) || 1124 (amdgpu_smu_memory_pool_size == 8)) { 1125 if (total_memory < dram_size_seven_GB) 1126 goto def_value1; 1127 } else { 1128 DRM_WARN("Smu memory pool size not supported\n"); 1129 goto def_value; 1130 } 1131 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1132 1133 return; 1134 1135 def_value1: 1136 DRM_WARN("No enough system memory\n"); 1137 def_value: 1138 adev->pm.smu_prv_buffer_size = 0; 1139 } 1140 1141 /** 1142 * amdgpu_device_check_arguments - validate module params 1143 * 1144 * @adev: amdgpu_device pointer 1145 * 1146 * Validates certain module parameters and updates 1147 * the associated values used by the driver (all asics). 1148 */ 1149 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1150 { 1151 if (amdgpu_sched_jobs < 4) { 1152 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1153 amdgpu_sched_jobs); 1154 amdgpu_sched_jobs = 4; 1155 } else if (!is_power_of_2(amdgpu_sched_jobs)){ 1156 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1157 amdgpu_sched_jobs); 1158 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1159 } 1160 1161 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1162 /* gart size must be greater or equal to 32M */ 1163 dev_warn(adev->dev, "gart size (%d) too small\n", 1164 amdgpu_gart_size); 1165 amdgpu_gart_size = -1; 1166 } 1167 1168 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1169 /* gtt size must be greater or equal to 32M */ 1170 dev_warn(adev->dev, "gtt size (%d) too small\n", 1171 amdgpu_gtt_size); 1172 amdgpu_gtt_size = -1; 1173 } 1174 1175 /* valid range is between 4 and 9 inclusive */ 1176 if (amdgpu_vm_fragment_size != -1 && 1177 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1178 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1179 amdgpu_vm_fragment_size = -1; 1180 } 1181 1182 if (amdgpu_sched_hw_submission < 2) { 1183 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1184 amdgpu_sched_hw_submission); 1185 amdgpu_sched_hw_submission = 2; 1186 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1187 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1188 amdgpu_sched_hw_submission); 1189 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1190 } 1191 1192 amdgpu_device_check_smu_prv_buffer_size(adev); 1193 1194 amdgpu_device_check_vm_size(adev); 1195 1196 
amdgpu_device_check_block_size(adev); 1197 1198 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1199 1200 amdgpu_gmc_tmz_set(adev); 1201 1202 return 0; 1203 } 1204 1205 /** 1206 * amdgpu_switcheroo_set_state - set switcheroo state 1207 * 1208 * @pdev: pci dev pointer 1209 * @state: vga_switcheroo state 1210 * 1211 * Callback for the switcheroo driver. Suspends or resumes the 1212 * the asics before or after it is powered up using ACPI methods. 1213 */ 1214 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switcheroo_state state) 1215 { 1216 struct drm_device *dev = pci_get_drvdata(pdev); 1217 int r; 1218 1219 if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF) 1220 return; 1221 1222 if (state == VGA_SWITCHEROO_ON) { 1223 pr_info("switched on\n"); 1224 /* don't suspend or resume card normally */ 1225 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1226 1227 pci_set_power_state(dev->pdev, PCI_D0); 1228 pci_restore_state(dev->pdev); 1229 r = pci_enable_device(dev->pdev); 1230 if (r) 1231 DRM_WARN("pci_enable_device failed (%d)\n", r); 1232 amdgpu_device_resume(dev, true); 1233 1234 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1235 drm_kms_helper_poll_enable(dev); 1236 } else { 1237 pr_info("switched off\n"); 1238 drm_kms_helper_poll_disable(dev); 1239 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1240 amdgpu_device_suspend(dev, true); 1241 pci_save_state(dev->pdev); 1242 /* Shut down the device */ 1243 pci_disable_device(dev->pdev); 1244 pci_set_power_state(dev->pdev, PCI_D3cold); 1245 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1246 } 1247 } 1248 1249 /** 1250 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1251 * 1252 * @pdev: pci dev pointer 1253 * 1254 * Callback for the switcheroo driver. Check of the switcheroo 1255 * state can be changed. 1256 * Returns true if the state can be changed, false if not. 1257 */ 1258 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1259 { 1260 struct drm_device *dev = pci_get_drvdata(pdev); 1261 1262 /* 1263 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1264 * locking inversion with the driver load path. And the access here is 1265 * completely racy anyway. So don't bother with locking for now. 1266 */ 1267 return atomic_read(&dev->open_count) == 0; 1268 } 1269 1270 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1271 .set_gpu_state = amdgpu_switcheroo_set_state, 1272 .reprobe = NULL, 1273 .can_switch = amdgpu_switcheroo_can_switch, 1274 }; 1275 1276 /** 1277 * amdgpu_device_ip_set_clockgating_state - set the CG state 1278 * 1279 * @dev: amdgpu_device pointer 1280 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1281 * @state: clockgating state (gate or ungate) 1282 * 1283 * Sets the requested clockgating state for all instances of 1284 * the hardware IP specified. 1285 * Returns the error code from the last instance. 
1286 */ 1287 int amdgpu_device_ip_set_clockgating_state(void *dev, 1288 enum amd_ip_block_type block_type, 1289 enum amd_clockgating_state state) 1290 { 1291 struct amdgpu_device *adev = dev; 1292 int i, r = 0; 1293 1294 for (i = 0; i < adev->num_ip_blocks; i++) { 1295 if (!adev->ip_blocks[i].status.valid) 1296 continue; 1297 if (adev->ip_blocks[i].version->type != block_type) 1298 continue; 1299 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1300 continue; 1301 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1302 (void *)adev, state); 1303 if (r) 1304 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1305 adev->ip_blocks[i].version->funcs->name, r); 1306 } 1307 return r; 1308 } 1309 1310 /** 1311 * amdgpu_device_ip_set_powergating_state - set the PG state 1312 * 1313 * @dev: amdgpu_device pointer 1314 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1315 * @state: powergating state (gate or ungate) 1316 * 1317 * Sets the requested powergating state for all instances of 1318 * the hardware IP specified. 1319 * Returns the error code from the last instance. 1320 */ 1321 int amdgpu_device_ip_set_powergating_state(void *dev, 1322 enum amd_ip_block_type block_type, 1323 enum amd_powergating_state state) 1324 { 1325 struct amdgpu_device *adev = dev; 1326 int i, r = 0; 1327 1328 for (i = 0; i < adev->num_ip_blocks; i++) { 1329 if (!adev->ip_blocks[i].status.valid) 1330 continue; 1331 if (adev->ip_blocks[i].version->type != block_type) 1332 continue; 1333 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 1334 continue; 1335 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 1336 (void *)adev, state); 1337 if (r) 1338 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 1339 adev->ip_blocks[i].version->funcs->name, r); 1340 } 1341 return r; 1342 } 1343 1344 /** 1345 * amdgpu_device_ip_get_clockgating_state - get the CG state 1346 * 1347 * @adev: amdgpu_device pointer 1348 * @flags: clockgating feature flags 1349 * 1350 * Walks the list of IPs on the device and updates the clockgating 1351 * flags for each IP. 1352 * Updates @flags with the feature flags for each hardware IP where 1353 * clockgating is enabled. 1354 */ 1355 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 1356 u32 *flags) 1357 { 1358 int i; 1359 1360 for (i = 0; i < adev->num_ip_blocks; i++) { 1361 if (!adev->ip_blocks[i].status.valid) 1362 continue; 1363 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 1364 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 1365 } 1366 } 1367 1368 /** 1369 * amdgpu_device_ip_wait_for_idle - wait for idle 1370 * 1371 * @adev: amdgpu_device pointer 1372 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1373 * 1374 * Waits for the request hardware IP to be idle. 1375 * Returns 0 for success or a negative error code on failure. 
1376 */ 1377 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 1378 enum amd_ip_block_type block_type) 1379 { 1380 int i, r; 1381 1382 for (i = 0; i < adev->num_ip_blocks; i++) { 1383 if (!adev->ip_blocks[i].status.valid) 1384 continue; 1385 if (adev->ip_blocks[i].version->type == block_type) { 1386 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 1387 if (r) 1388 return r; 1389 break; 1390 } 1391 } 1392 return 0; 1393 1394 } 1395 1396 /** 1397 * amdgpu_device_ip_is_idle - is the hardware IP idle 1398 * 1399 * @adev: amdgpu_device pointer 1400 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1401 * 1402 * Check if the hardware IP is idle or not. 1403 * Returns true if it the IP is idle, false if not. 1404 */ 1405 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 1406 enum amd_ip_block_type block_type) 1407 { 1408 int i; 1409 1410 for (i = 0; i < adev->num_ip_blocks; i++) { 1411 if (!adev->ip_blocks[i].status.valid) 1412 continue; 1413 if (adev->ip_blocks[i].version->type == block_type) 1414 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 1415 } 1416 return true; 1417 1418 } 1419 1420 /** 1421 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 1422 * 1423 * @adev: amdgpu_device pointer 1424 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 1425 * 1426 * Returns a pointer to the hardware IP block structure 1427 * if it exists for the asic, otherwise NULL. 1428 */ 1429 struct amdgpu_ip_block * 1430 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 1431 enum amd_ip_block_type type) 1432 { 1433 int i; 1434 1435 for (i = 0; i < adev->num_ip_blocks; i++) 1436 if (adev->ip_blocks[i].version->type == type) 1437 return &adev->ip_blocks[i]; 1438 1439 return NULL; 1440 } 1441 1442 /** 1443 * amdgpu_device_ip_block_version_cmp 1444 * 1445 * @adev: amdgpu_device pointer 1446 * @type: enum amd_ip_block_type 1447 * @major: major version 1448 * @minor: minor version 1449 * 1450 * return 0 if equal or greater 1451 * return 1 if smaller or the ip_block doesn't exist 1452 */ 1453 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 1454 enum amd_ip_block_type type, 1455 u32 major, u32 minor) 1456 { 1457 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 1458 1459 if (ip_block && ((ip_block->version->major > major) || 1460 ((ip_block->version->major == major) && 1461 (ip_block->version->minor >= minor)))) 1462 return 0; 1463 1464 return 1; 1465 } 1466 1467 /** 1468 * amdgpu_device_ip_block_add 1469 * 1470 * @adev: amdgpu_device pointer 1471 * @ip_block_version: pointer to the IP to add 1472 * 1473 * Adds the IP block driver information to the collection of IPs 1474 * on the asic. 1475 */ 1476 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 1477 const struct amdgpu_ip_block_version *ip_block_version) 1478 { 1479 if (!ip_block_version) 1480 return -EINVAL; 1481 1482 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 1483 ip_block_version->funcs->name); 1484 1485 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 1486 1487 return 0; 1488 } 1489 1490 /** 1491 * amdgpu_device_enable_virtual_display - enable virtual display feature 1492 * 1493 * @adev: amdgpu_device pointer 1494 * 1495 * Enabled the virtual display feature if the user has enabled it via 1496 * the module parameter virtual_display. This feature provides a virtual 1497 * display hardware on headless boards or in virtualized environments. 
1498 * This function parses and validates the configuration string specified by 1499 * the user and configues the virtual display configuration (number of 1500 * virtual connectors, crtcs, etc.) specified. 1501 */ 1502 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 1503 { 1504 adev->enable_virtual_display = false; 1505 1506 if (amdgpu_virtual_display) { 1507 struct drm_device *ddev = adev->ddev; 1508 const char *pci_address_name = pci_name(ddev->pdev); 1509 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 1510 1511 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 1512 pciaddstr_tmp = pciaddstr; 1513 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 1514 pciaddname = strsep(&pciaddname_tmp, ","); 1515 if (!strcmp("all", pciaddname) 1516 || !strcmp(pci_address_name, pciaddname)) { 1517 long num_crtc; 1518 int res = -1; 1519 1520 adev->enable_virtual_display = true; 1521 1522 if (pciaddname_tmp) 1523 res = kstrtol(pciaddname_tmp, 10, 1524 &num_crtc); 1525 1526 if (!res) { 1527 if (num_crtc < 1) 1528 num_crtc = 1; 1529 if (num_crtc > 6) 1530 num_crtc = 6; 1531 adev->mode_info.num_crtc = num_crtc; 1532 } else { 1533 adev->mode_info.num_crtc = 1; 1534 } 1535 break; 1536 } 1537 } 1538 1539 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 1540 amdgpu_virtual_display, pci_address_name, 1541 adev->enable_virtual_display, adev->mode_info.num_crtc); 1542 1543 kfree(pciaddstr); 1544 } 1545 } 1546 1547 /** 1548 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 1549 * 1550 * @adev: amdgpu_device pointer 1551 * 1552 * Parses the asic configuration parameters specified in the gpu info 1553 * firmware and makes them availale to the driver for use in configuring 1554 * the asic. 1555 * Returns 0 on success, -EINVAL on failure. 1556 */ 1557 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 1558 { 1559 const char *chip_name; 1560 char fw_name[40]; 1561 int err; 1562 const struct gpu_info_firmware_header_v1_0 *hdr; 1563 1564 adev->firmware.gpu_info_fw = NULL; 1565 1566 if (adev->discovery_bin) { 1567 amdgpu_discovery_get_gfx_info(adev); 1568 1569 /* 1570 * FIXME: The bounding box is still needed by Navi12, so 1571 * temporarily read it from gpu_info firmware. Should be droped 1572 * when DAL no longer needs it. 
1573 */ 1574 if (adev->asic_type != CHIP_NAVI12) 1575 return 0; 1576 } 1577 1578 switch (adev->asic_type) { 1579 #ifdef CONFIG_DRM_AMDGPU_SI 1580 case CHIP_VERDE: 1581 case CHIP_TAHITI: 1582 case CHIP_PITCAIRN: 1583 case CHIP_OLAND: 1584 case CHIP_HAINAN: 1585 #endif 1586 #ifdef CONFIG_DRM_AMDGPU_CIK 1587 case CHIP_BONAIRE: 1588 case CHIP_HAWAII: 1589 case CHIP_KAVERI: 1590 case CHIP_KABINI: 1591 case CHIP_MULLINS: 1592 #endif 1593 case CHIP_TOPAZ: 1594 case CHIP_TONGA: 1595 case CHIP_FIJI: 1596 case CHIP_POLARIS10: 1597 case CHIP_POLARIS11: 1598 case CHIP_POLARIS12: 1599 case CHIP_VEGAM: 1600 case CHIP_CARRIZO: 1601 case CHIP_STONEY: 1602 case CHIP_VEGA20: 1603 default: 1604 return 0; 1605 case CHIP_VEGA10: 1606 chip_name = "vega10"; 1607 break; 1608 case CHIP_VEGA12: 1609 chip_name = "vega12"; 1610 break; 1611 case CHIP_RAVEN: 1612 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1613 chip_name = "raven2"; 1614 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1615 chip_name = "picasso"; 1616 else 1617 chip_name = "raven"; 1618 break; 1619 case CHIP_ARCTURUS: 1620 chip_name = "arcturus"; 1621 break; 1622 case CHIP_RENOIR: 1623 chip_name = "renoir"; 1624 break; 1625 case CHIP_NAVI10: 1626 chip_name = "navi10"; 1627 break; 1628 case CHIP_NAVI14: 1629 chip_name = "navi14"; 1630 break; 1631 case CHIP_NAVI12: 1632 chip_name = "navi12"; 1633 break; 1634 case CHIP_SIENNA_CICHLID: 1635 chip_name = "sienna_cichlid"; 1636 break; 1637 case CHIP_NAVY_FLOUNDER: 1638 chip_name = "navy_flounder"; 1639 break; 1640 } 1641 1642 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 1643 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); 1644 if (err) { 1645 dev_err(adev->dev, 1646 "Failed to load gpu_info firmware \"%s\"\n", 1647 fw_name); 1648 goto out; 1649 } 1650 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); 1651 if (err) { 1652 dev_err(adev->dev, 1653 "Failed to validate gpu_info firmware \"%s\"\n", 1654 fw_name); 1655 goto out; 1656 } 1657 1658 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 1659 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 1660 1661 switch (hdr->version_major) { 1662 case 1: 1663 { 1664 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 1665 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 1666 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1667 1668 /* 1669 * Should be droped when DAL no longer needs it. 
1670 */ 1671 if (adev->asic_type == CHIP_NAVI12) 1672 goto parse_soc_bounding_box; 1673 1674 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 1675 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 1676 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 1677 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 1678 adev->gfx.config.max_texture_channel_caches = 1679 le32_to_cpu(gpu_info_fw->gc_num_tccs); 1680 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 1681 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 1682 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 1683 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 1684 adev->gfx.config.double_offchip_lds_buf = 1685 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 1686 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 1687 adev->gfx.cu_info.max_waves_per_simd = 1688 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 1689 adev->gfx.cu_info.max_scratch_slots_per_cu = 1690 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 1691 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 1692 if (hdr->version_minor >= 1) { 1693 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 1694 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 1695 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1696 adev->gfx.config.num_sc_per_sh = 1697 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 1698 adev->gfx.config.num_packer_per_sc = 1699 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 1700 } 1701 1702 parse_soc_bounding_box: 1703 /* 1704 * soc bounding box info is not integrated in disocovery table, 1705 * we always need to parse it from gpu info firmware if needed. 1706 */ 1707 if (hdr->version_minor == 2) { 1708 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 1709 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 1710 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1711 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 1712 } 1713 break; 1714 } 1715 default: 1716 dev_err(adev->dev, 1717 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 1718 err = -EINVAL; 1719 goto out; 1720 } 1721 out: 1722 return err; 1723 } 1724 1725 /** 1726 * amdgpu_device_ip_early_init - run early init for hardware IPs 1727 * 1728 * @adev: amdgpu_device pointer 1729 * 1730 * Early initialization pass for hardware IPs. The hardware IPs that make 1731 * up each asic are discovered each IP's early_init callback is run. This 1732 * is the first stage in initializing the asic. 1733 * Returns 0 on success, negative error code on failure. 
1734 */ 1735 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 1736 { 1737 int i, r; 1738 1739 amdgpu_device_enable_virtual_display(adev); 1740 1741 if (amdgpu_sriov_vf(adev)) { 1742 r = amdgpu_virt_request_full_gpu(adev, true); 1743 if (r) 1744 return r; 1745 } 1746 1747 switch (adev->asic_type) { 1748 #ifdef CONFIG_DRM_AMDGPU_SI 1749 case CHIP_VERDE: 1750 case CHIP_TAHITI: 1751 case CHIP_PITCAIRN: 1752 case CHIP_OLAND: 1753 case CHIP_HAINAN: 1754 adev->family = AMDGPU_FAMILY_SI; 1755 r = si_set_ip_blocks(adev); 1756 if (r) 1757 return r; 1758 break; 1759 #endif 1760 #ifdef CONFIG_DRM_AMDGPU_CIK 1761 case CHIP_BONAIRE: 1762 case CHIP_HAWAII: 1763 case CHIP_KAVERI: 1764 case CHIP_KABINI: 1765 case CHIP_MULLINS: 1766 if (adev->flags & AMD_IS_APU) 1767 adev->family = AMDGPU_FAMILY_KV; 1768 else 1769 adev->family = AMDGPU_FAMILY_CI; 1770 1771 r = cik_set_ip_blocks(adev); 1772 if (r) 1773 return r; 1774 break; 1775 #endif 1776 case CHIP_TOPAZ: 1777 case CHIP_TONGA: 1778 case CHIP_FIJI: 1779 case CHIP_POLARIS10: 1780 case CHIP_POLARIS11: 1781 case CHIP_POLARIS12: 1782 case CHIP_VEGAM: 1783 case CHIP_CARRIZO: 1784 case CHIP_STONEY: 1785 if (adev->flags & AMD_IS_APU) 1786 adev->family = AMDGPU_FAMILY_CZ; 1787 else 1788 adev->family = AMDGPU_FAMILY_VI; 1789 1790 r = vi_set_ip_blocks(adev); 1791 if (r) 1792 return r; 1793 break; 1794 case CHIP_VEGA10: 1795 case CHIP_VEGA12: 1796 case CHIP_VEGA20: 1797 case CHIP_RAVEN: 1798 case CHIP_ARCTURUS: 1799 case CHIP_RENOIR: 1800 if (adev->flags & AMD_IS_APU) 1801 adev->family = AMDGPU_FAMILY_RV; 1802 else 1803 adev->family = AMDGPU_FAMILY_AI; 1804 1805 r = soc15_set_ip_blocks(adev); 1806 if (r) 1807 return r; 1808 break; 1809 case CHIP_NAVI10: 1810 case CHIP_NAVI14: 1811 case CHIP_NAVI12: 1812 case CHIP_SIENNA_CICHLID: 1813 case CHIP_NAVY_FLOUNDER: 1814 adev->family = AMDGPU_FAMILY_NV; 1815 1816 r = nv_set_ip_blocks(adev); 1817 if (r) 1818 return r; 1819 break; 1820 default: 1821 /* FIXME: not supported yet */ 1822 return -EINVAL; 1823 } 1824 1825 amdgpu_amdkfd_device_probe(adev); 1826 1827 adev->pm.pp_feature = amdgpu_pp_feature_mask; 1828 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 1829 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 1830 1831 for (i = 0; i < adev->num_ip_blocks; i++) { 1832 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 1833 DRM_ERROR("disabled ip block: %d <%s>\n", 1834 i, adev->ip_blocks[i].version->funcs->name); 1835 adev->ip_blocks[i].status.valid = false; 1836 } else { 1837 if (adev->ip_blocks[i].version->funcs->early_init) { 1838 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 1839 if (r == -ENOENT) { 1840 adev->ip_blocks[i].status.valid = false; 1841 } else if (r) { 1842 DRM_ERROR("early_init of IP block <%s> failed %d\n", 1843 adev->ip_blocks[i].version->funcs->name, r); 1844 return r; 1845 } else { 1846 adev->ip_blocks[i].status.valid = true; 1847 } 1848 } else { 1849 adev->ip_blocks[i].status.valid = true; 1850 } 1851 } 1852 /* get the vbios after the asic_funcs are set up */ 1853 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 1854 r = amdgpu_device_parse_gpu_info_fw(adev); 1855 if (r) 1856 return r; 1857 1858 /* Read BIOS */ 1859 if (!amdgpu_get_bios(adev)) 1860 return -EINVAL; 1861 1862 r = amdgpu_atombios_init(adev); 1863 if (r) { 1864 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 1865 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 1866 return r; 1867 } 1868 } 1869 } 1870 1871 adev->cg_flags &= amdgpu_cg_mask; 1872 
adev->pg_flags &= amdgpu_pg_mask; 1873 1874 return 0; 1875 } 1876 1877 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 1878 { 1879 int i, r; 1880 1881 for (i = 0; i < adev->num_ip_blocks; i++) { 1882 if (!adev->ip_blocks[i].status.sw) 1883 continue; 1884 if (adev->ip_blocks[i].status.hw) 1885 continue; 1886 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 1887 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 1888 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 1889 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 1890 if (r) { 1891 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 1892 adev->ip_blocks[i].version->funcs->name, r); 1893 return r; 1894 } 1895 adev->ip_blocks[i].status.hw = true; 1896 } 1897 } 1898 1899 return 0; 1900 } 1901 1902 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 1903 { 1904 int i, r; 1905 1906 for (i = 0; i < adev->num_ip_blocks; i++) { 1907 if (!adev->ip_blocks[i].status.sw) 1908 continue; 1909 if (adev->ip_blocks[i].status.hw) 1910 continue; 1911 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 1912 if (r) { 1913 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 1914 adev->ip_blocks[i].version->funcs->name, r); 1915 return r; 1916 } 1917 adev->ip_blocks[i].status.hw = true; 1918 } 1919 1920 return 0; 1921 } 1922 1923 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 1924 { 1925 int r = 0; 1926 int i; 1927 uint32_t smu_version; 1928 1929 if (adev->asic_type >= CHIP_VEGA10) { 1930 for (i = 0; i < adev->num_ip_blocks; i++) { 1931 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 1932 continue; 1933 1934 /* no need to do the fw loading again if already done*/ 1935 if (adev->ip_blocks[i].status.hw == true) 1936 break; 1937 1938 if (amdgpu_in_reset(adev) || adev->in_suspend) { 1939 r = adev->ip_blocks[i].version->funcs->resume(adev); 1940 if (r) { 1941 DRM_ERROR("resume of IP block <%s> failed %d\n", 1942 adev->ip_blocks[i].version->funcs->name, r); 1943 return r; 1944 } 1945 } else { 1946 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 1947 if (r) { 1948 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 1949 adev->ip_blocks[i].version->funcs->name, r); 1950 return r; 1951 } 1952 } 1953 1954 adev->ip_blocks[i].status.hw = true; 1955 break; 1956 } 1957 } 1958 1959 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 1960 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 1961 1962 return r; 1963 } 1964 1965 /** 1966 * amdgpu_device_ip_init - run init for hardware IPs 1967 * 1968 * @adev: amdgpu_device pointer 1969 * 1970 * Main initialization pass for hardware IPs. The list of all the hardware 1971 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 1972 * are run. sw_init initializes the software state associated with each IP 1973 * and hw_init initializes the hardware associated with each IP. 1974 * Returns 0 on success, negative error code on failure. 
1975 */ 1976 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 1977 { 1978 int i, r; 1979 1980 r = amdgpu_ras_init(adev); 1981 if (r) 1982 return r; 1983 1984 for (i = 0; i < adev->num_ip_blocks; i++) { 1985 if (!adev->ip_blocks[i].status.valid) 1986 continue; 1987 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 1988 if (r) { 1989 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 1990 adev->ip_blocks[i].version->funcs->name, r); 1991 goto init_failed; 1992 } 1993 adev->ip_blocks[i].status.sw = true; 1994 1995 /* need to do gmc hw init early so we can allocate gpu mem */ 1996 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 1997 r = amdgpu_device_vram_scratch_init(adev); 1998 if (r) { 1999 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2000 goto init_failed; 2001 } 2002 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2003 if (r) { 2004 DRM_ERROR("hw_init %d failed %d\n", i, r); 2005 goto init_failed; 2006 } 2007 r = amdgpu_device_wb_init(adev); 2008 if (r) { 2009 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2010 goto init_failed; 2011 } 2012 adev->ip_blocks[i].status.hw = true; 2013 2014 /* right after GMC hw init, we create CSA */ 2015 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2016 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2017 AMDGPU_GEM_DOMAIN_VRAM, 2018 AMDGPU_CSA_SIZE); 2019 if (r) { 2020 DRM_ERROR("allocate CSA failed %d\n", r); 2021 goto init_failed; 2022 } 2023 } 2024 } 2025 } 2026 2027 if (amdgpu_sriov_vf(adev)) 2028 amdgpu_virt_init_data_exchange(adev); 2029 2030 r = amdgpu_ib_pool_init(adev); 2031 if (r) { 2032 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2033 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2034 goto init_failed; 2035 } 2036 2037 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2038 if (r) 2039 goto init_failed; 2040 2041 r = amdgpu_device_ip_hw_init_phase1(adev); 2042 if (r) 2043 goto init_failed; 2044 2045 r = amdgpu_device_fw_loading(adev); 2046 if (r) 2047 goto init_failed; 2048 2049 r = amdgpu_device_ip_hw_init_phase2(adev); 2050 if (r) 2051 goto init_failed; 2052 2053 /* 2054 * retired pages will be loaded from eeprom and reserved here, 2055 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2056 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2057 * for I2C communication which only true at this point. 2058 * recovery_init may fail, but it can free all resources allocated by 2059 * itself and its failure should not stop amdgpu init process. 2060 * 2061 * Note: theoretically, this should be called before all vram allocations 2062 * to protect retired page from abusing 2063 */ 2064 amdgpu_ras_recovery_init(adev); 2065 2066 if (adev->gmc.xgmi.num_physical_nodes > 1) 2067 amdgpu_xgmi_add_device(adev); 2068 amdgpu_amdkfd_device_init(adev); 2069 2070 amdgpu_fru_get_product_info(adev); 2071 2072 init_failed: 2073 if (amdgpu_sriov_vf(adev)) 2074 amdgpu_virt_release_full_gpu(adev, true); 2075 2076 return r; 2077 } 2078 2079 /** 2080 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2081 * 2082 * @adev: amdgpu_device pointer 2083 * 2084 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2085 * this function before a GPU reset. If the value is retained after a 2086 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 
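 *
 * Pairs with amdgpu_device_check_vram_lost(); the intended round trip is
 * roughly:
 *
 *   amdgpu_device_fill_reset_magic(adev);            - snapshot before reset
 *   ... GPU reset ...
 *   vram_lost = amdgpu_device_check_vram_lost(adev); - compare the snapshot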
2087 */ 2088 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2089 { 2090 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2091 } 2092 2093 /** 2094 * amdgpu_device_check_vram_lost - check if vram is valid 2095 * 2096 * @adev: amdgpu_device pointer 2097 * 2098 * Checks the reset magic value written to the gart pointer in VRAM. 2099 * The driver calls this after a GPU reset to see if the contents of 2100 * VRAM is lost or now. 2101 * returns true if vram is lost, false if not. 2102 */ 2103 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2104 { 2105 if (memcmp(adev->gart.ptr, adev->reset_magic, 2106 AMDGPU_RESET_MAGIC_NUM)) 2107 return true; 2108 2109 if (!amdgpu_in_reset(adev)) 2110 return false; 2111 2112 /* 2113 * For all ASICs with baco/mode1 reset, the VRAM is 2114 * always assumed to be lost. 2115 */ 2116 switch (amdgpu_asic_reset_method(adev)) { 2117 case AMD_RESET_METHOD_BACO: 2118 case AMD_RESET_METHOD_MODE1: 2119 return true; 2120 default: 2121 return false; 2122 } 2123 } 2124 2125 /** 2126 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2127 * 2128 * @adev: amdgpu_device pointer 2129 * @state: clockgating state (gate or ungate) 2130 * 2131 * The list of all the hardware IPs that make up the asic is walked and the 2132 * set_clockgating_state callbacks are run. 2133 * Late initialization pass enabling clockgating for hardware IPs. 2134 * Fini or suspend, pass disabling clockgating for hardware IPs. 2135 * Returns 0 on success, negative error code on failure. 2136 */ 2137 2138 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2139 enum amd_clockgating_state state) 2140 { 2141 int i, j, r; 2142 2143 if (amdgpu_emu_mode == 1) 2144 return 0; 2145 2146 for (j = 0; j < adev->num_ip_blocks; j++) { 2147 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2148 if (!adev->ip_blocks[i].status.late_initialized) 2149 continue; 2150 /* skip CG for VCE/UVD, it's handled specially */ 2151 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2152 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2153 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2154 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2155 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2156 /* enable clockgating to save power */ 2157 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2158 state); 2159 if (r) { 2160 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2161 adev->ip_blocks[i].version->funcs->name, r); 2162 return r; 2163 } 2164 } 2165 } 2166 2167 return 0; 2168 } 2169 2170 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state) 2171 { 2172 int i, j, r; 2173 2174 if (amdgpu_emu_mode == 1) 2175 return 0; 2176 2177 for (j = 0; j < adev->num_ip_blocks; j++) { 2178 i = state == AMD_PG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1; 2179 if (!adev->ip_blocks[i].status.late_initialized) 2180 continue; 2181 /* skip CG for VCE/UVD, it's handled specially */ 2182 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2183 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2184 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2185 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2186 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2187 /* enable powergating to save power */ 2188 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2189 state); 2190 if (r) { 2191 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2192 adev->ip_blocks[i].version->funcs->name, r); 2193 return r; 2194 } 2195 } 2196 } 2197 return 0; 2198 } 2199 2200 static int amdgpu_device_enable_mgpu_fan_boost(void) 2201 { 2202 struct amdgpu_gpu_instance *gpu_ins; 2203 struct amdgpu_device *adev; 2204 int i, ret = 0; 2205 2206 mutex_lock(&mgpu_info.mutex); 2207 2208 /* 2209 * MGPU fan boost feature should be enabled 2210 * only when there are two or more dGPUs in 2211 * the system 2212 */ 2213 if (mgpu_info.num_dgpu < 2) 2214 goto out; 2215 2216 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2217 gpu_ins = &(mgpu_info.gpu_ins[i]); 2218 adev = gpu_ins->adev; 2219 if (!(adev->flags & AMD_IS_APU) && 2220 !gpu_ins->mgpu_fan_enabled && 2221 adev->powerplay.pp_funcs && 2222 adev->powerplay.pp_funcs->enable_mgpu_fan_boost) { 2223 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2224 if (ret) 2225 break; 2226 2227 gpu_ins->mgpu_fan_enabled = 1; 2228 } 2229 } 2230 2231 out: 2232 mutex_unlock(&mgpu_info.mutex); 2233 2234 return ret; 2235 } 2236 2237 /** 2238 * amdgpu_device_ip_late_init - run late init for hardware IPs 2239 * 2240 * @adev: amdgpu_device pointer 2241 * 2242 * Late initialization pass for hardware IPs. The list of all the hardware 2243 * IPs that make up the asic is walked and the late_init callbacks are run. 2244 * late_init covers any special initialization that an IP requires 2245 * after all of the have been initialized or something that needs to happen 2246 * late in the init process. 2247 * Returns 0 on success, negative error code on failure. 2248 */ 2249 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2250 { 2251 struct amdgpu_gpu_instance *gpu_instance; 2252 int i = 0, r; 2253 2254 for (i = 0; i < adev->num_ip_blocks; i++) { 2255 if (!adev->ip_blocks[i].status.hw) 2256 continue; 2257 if (adev->ip_blocks[i].version->funcs->late_init) { 2258 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2259 if (r) { 2260 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2261 adev->ip_blocks[i].version->funcs->name, r); 2262 return r; 2263 } 2264 } 2265 adev->ip_blocks[i].status.late_initialized = true; 2266 } 2267 2268 amdgpu_ras_set_error_query_ready(adev, true); 2269 2270 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2271 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2272 2273 amdgpu_device_fill_reset_magic(adev); 2274 2275 r = amdgpu_device_enable_mgpu_fan_boost(); 2276 if (r) 2277 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2278 2279 2280 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2281 mutex_lock(&mgpu_info.mutex); 2282 2283 /* 2284 * Reset device p-state to low as this was booted with high. 2285 * 2286 * This should be performed only after all devices from the same 2287 * hive get initialized. 2288 * 2289 * However, it's unknown how many device in the hive in advance. 
2290 * As this is counted one by one during devices initializations. 2291 * 2292 * So, we wait for all XGMI interlinked devices initialized. 2293 * This may bring some delays as those devices may come from 2294 * different hives. But that should be OK. 2295 */ 2296 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2297 for (i = 0; i < mgpu_info.num_gpu; i++) { 2298 gpu_instance = &(mgpu_info.gpu_ins[i]); 2299 if (gpu_instance->adev->flags & AMD_IS_APU) 2300 continue; 2301 2302 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2303 AMDGPU_XGMI_PSTATE_MIN); 2304 if (r) { 2305 DRM_ERROR("pstate setting failed (%d).\n", r); 2306 break; 2307 } 2308 } 2309 } 2310 2311 mutex_unlock(&mgpu_info.mutex); 2312 } 2313 2314 return 0; 2315 } 2316 2317 /** 2318 * amdgpu_device_ip_fini - run fini for hardware IPs 2319 * 2320 * @adev: amdgpu_device pointer 2321 * 2322 * Main teardown pass for hardware IPs. The list of all the hardware 2323 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2324 * are run. hw_fini tears down the hardware associated with each IP 2325 * and sw_fini tears down any software state associated with each IP. 2326 * Returns 0 on success, negative error code on failure. 2327 */ 2328 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2329 { 2330 int i, r; 2331 2332 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2333 amdgpu_virt_release_ras_err_handler_data(adev); 2334 2335 amdgpu_ras_pre_fini(adev); 2336 2337 if (adev->gmc.xgmi.num_physical_nodes > 1) 2338 amdgpu_xgmi_remove_device(adev); 2339 2340 amdgpu_amdkfd_device_fini(adev); 2341 2342 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2343 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2344 2345 /* need to disable SMC first */ 2346 for (i = 0; i < adev->num_ip_blocks; i++) { 2347 if (!adev->ip_blocks[i].status.hw) 2348 continue; 2349 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2350 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2351 /* XXX handle errors */ 2352 if (r) { 2353 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2354 adev->ip_blocks[i].version->funcs->name, r); 2355 } 2356 adev->ip_blocks[i].status.hw = false; 2357 break; 2358 } 2359 } 2360 2361 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2362 if (!adev->ip_blocks[i].status.hw) 2363 continue; 2364 2365 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2366 /* XXX handle errors */ 2367 if (r) { 2368 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2369 adev->ip_blocks[i].version->funcs->name, r); 2370 } 2371 2372 adev->ip_blocks[i].status.hw = false; 2373 } 2374 2375 2376 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2377 if (!adev->ip_blocks[i].status.sw) 2378 continue; 2379 2380 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2381 amdgpu_ucode_free_bo(adev); 2382 amdgpu_free_static_csa(&adev->virt.csa_obj); 2383 amdgpu_device_wb_fini(adev); 2384 amdgpu_device_vram_scratch_fini(adev); 2385 amdgpu_ib_pool_fini(adev); 2386 } 2387 2388 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2389 /* XXX handle errors */ 2390 if (r) { 2391 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2392 adev->ip_blocks[i].version->funcs->name, r); 2393 } 2394 adev->ip_blocks[i].status.sw = false; 2395 adev->ip_blocks[i].status.valid = false; 2396 } 2397 2398 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2399 if (!adev->ip_blocks[i].status.late_initialized) 2400 continue; 2401 if (adev->ip_blocks[i].version->funcs->late_fini) 2402 
adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2403 adev->ip_blocks[i].status.late_initialized = false; 2404 } 2405 2406 amdgpu_ras_fini(adev); 2407 2408 if (amdgpu_sriov_vf(adev)) 2409 if (amdgpu_virt_release_full_gpu(adev, false)) 2410 DRM_ERROR("failed to release exclusive mode on fini\n"); 2411 2412 return 0; 2413 } 2414 2415 /** 2416 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2417 * 2418 * @work: work_struct. 2419 */ 2420 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2421 { 2422 struct amdgpu_device *adev = 2423 container_of(work, struct amdgpu_device, delayed_init_work.work); 2424 int r; 2425 2426 r = amdgpu_ib_ring_tests(adev); 2427 if (r) 2428 DRM_ERROR("ib ring test failed (%d).\n", r); 2429 } 2430 2431 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2432 { 2433 struct amdgpu_device *adev = 2434 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2435 2436 mutex_lock(&adev->gfx.gfx_off_mutex); 2437 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { 2438 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2439 adev->gfx.gfx_off_state = true; 2440 } 2441 mutex_unlock(&adev->gfx.gfx_off_mutex); 2442 } 2443 2444 /** 2445 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2446 * 2447 * @adev: amdgpu_device pointer 2448 * 2449 * Main suspend function for hardware IPs. The list of all the hardware 2450 * IPs that make up the asic is walked, clockgating is disabled and the 2451 * suspend callbacks are run. suspend puts the hardware and software state 2452 * in each IP into a state suitable for suspend. 2453 * Returns 0 on success, negative error code on failure. 2454 */ 2455 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2456 { 2457 int i, r; 2458 2459 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2460 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2461 2462 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2463 if (!adev->ip_blocks[i].status.valid) 2464 continue; 2465 2466 /* displays are handled separately */ 2467 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2468 continue; 2469 2470 /* XXX handle errors */ 2471 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2472 /* XXX handle errors */ 2473 if (r) { 2474 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2475 adev->ip_blocks[i].version->funcs->name, r); 2476 return r; 2477 } 2478 2479 adev->ip_blocks[i].status.hw = false; 2480 } 2481 2482 return 0; 2483 } 2484 2485 /** 2486 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2487 * 2488 * @adev: amdgpu_device pointer 2489 * 2490 * Main suspend function for hardware IPs. The list of all the hardware 2491 * IPs that make up the asic is walked, clockgating is disabled and the 2492 * suspend callbacks are run. suspend puts the hardware and software state 2493 * in each IP into a state suitable for suspend. 2494 * Returns 0 on success, negative error code on failure. 
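 *
 * Phase 2 covers every block except the displays, which were already
 * suspended in phase 1.  PSP is skipped (and simply marked !hw) when a RAS
 * err_event_athub interrupt has been raised, since the PSP connection is
 * lost in that case; for bare-metal SMC blocks the MP1 state is also set
 * via amdgpu_dpm_set_mp1_state().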
2495 */ 2496 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2497 { 2498 int i, r; 2499 2500 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2501 if (!adev->ip_blocks[i].status.valid) 2502 continue; 2503 /* displays are handled in phase1 */ 2504 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2505 continue; 2506 /* PSP lost connection when err_event_athub occurs */ 2507 if (amdgpu_ras_intr_triggered() && 2508 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2509 adev->ip_blocks[i].status.hw = false; 2510 continue; 2511 } 2512 /* XXX handle errors */ 2513 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2514 /* XXX handle errors */ 2515 if (r) { 2516 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2517 adev->ip_blocks[i].version->funcs->name, r); 2518 } 2519 adev->ip_blocks[i].status.hw = false; 2520 /* handle putting the SMC in the appropriate state */ 2521 if(!amdgpu_sriov_vf(adev)){ 2522 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2523 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 2524 if (r) { 2525 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 2526 adev->mp1_state, r); 2527 return r; 2528 } 2529 } 2530 } 2531 adev->ip_blocks[i].status.hw = false; 2532 } 2533 2534 return 0; 2535 } 2536 2537 /** 2538 * amdgpu_device_ip_suspend - run suspend for hardware IPs 2539 * 2540 * @adev: amdgpu_device pointer 2541 * 2542 * Main suspend function for hardware IPs. The list of all the hardware 2543 * IPs that make up the asic is walked, clockgating is disabled and the 2544 * suspend callbacks are run. suspend puts the hardware and software state 2545 * in each IP into a state suitable for suspend. 2546 * Returns 0 on success, negative error code on failure. 2547 */ 2548 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 2549 { 2550 int r; 2551 2552 if (amdgpu_sriov_vf(adev)) 2553 amdgpu_virt_request_full_gpu(adev, false); 2554 2555 r = amdgpu_device_ip_suspend_phase1(adev); 2556 if (r) 2557 return r; 2558 r = amdgpu_device_ip_suspend_phase2(adev); 2559 2560 if (amdgpu_sriov_vf(adev)) 2561 amdgpu_virt_release_full_gpu(adev, false); 2562 2563 return r; 2564 } 2565 2566 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 2567 { 2568 int i, r; 2569 2570 static enum amd_ip_block_type ip_order[] = { 2571 AMD_IP_BLOCK_TYPE_GMC, 2572 AMD_IP_BLOCK_TYPE_COMMON, 2573 AMD_IP_BLOCK_TYPE_PSP, 2574 AMD_IP_BLOCK_TYPE_IH, 2575 }; 2576 2577 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2578 int j; 2579 struct amdgpu_ip_block *block; 2580 2581 for (j = 0; j < adev->num_ip_blocks; j++) { 2582 block = &adev->ip_blocks[j]; 2583 2584 block->status.hw = false; 2585 if (block->version->type != ip_order[i] || 2586 !block->status.valid) 2587 continue; 2588 2589 r = block->version->funcs->hw_init(adev); 2590 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2591 if (r) 2592 return r; 2593 block->status.hw = true; 2594 } 2595 } 2596 2597 return 0; 2598 } 2599 2600 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 2601 { 2602 int i, r; 2603 2604 static enum amd_ip_block_type ip_order[] = { 2605 AMD_IP_BLOCK_TYPE_SMC, 2606 AMD_IP_BLOCK_TYPE_DCE, 2607 AMD_IP_BLOCK_TYPE_GFX, 2608 AMD_IP_BLOCK_TYPE_SDMA, 2609 AMD_IP_BLOCK_TYPE_UVD, 2610 AMD_IP_BLOCK_TYPE_VCE, 2611 AMD_IP_BLOCK_TYPE_VCN 2612 }; 2613 2614 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2615 int j; 2616 struct amdgpu_ip_block *block; 2617 2618 for (j = 0; j < adev->num_ip_blocks; j++) { 2619 
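		/*
		 * Walk every IP block and bring up the ones whose type matches
		 * ip_order[i]; note that SMC blocks are brought back with
		 * resume() below while the other blocks use hw_init().
		 */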
block = &adev->ip_blocks[j]; 2620 2621 if (block->version->type != ip_order[i] || 2622 !block->status.valid || 2623 block->status.hw) 2624 continue; 2625 2626 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 2627 r = block->version->funcs->resume(adev); 2628 else 2629 r = block->version->funcs->hw_init(adev); 2630 2631 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2632 if (r) 2633 return r; 2634 block->status.hw = true; 2635 } 2636 } 2637 2638 return 0; 2639 } 2640 2641 /** 2642 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 2643 * 2644 * @adev: amdgpu_device pointer 2645 * 2646 * First resume function for hardware IPs. The list of all the hardware 2647 * IPs that make up the asic is walked and the resume callbacks are run for 2648 * COMMON, GMC, and IH. resume puts the hardware into a functional state 2649 * after a suspend and updates the software state as necessary. This 2650 * function is also used for restoring the GPU after a GPU reset. 2651 * Returns 0 on success, negative error code on failure. 2652 */ 2653 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 2654 { 2655 int i, r; 2656 2657 for (i = 0; i < adev->num_ip_blocks; i++) { 2658 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2659 continue; 2660 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2661 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2662 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2663 2664 r = adev->ip_blocks[i].version->funcs->resume(adev); 2665 if (r) { 2666 DRM_ERROR("resume of IP block <%s> failed %d\n", 2667 adev->ip_blocks[i].version->funcs->name, r); 2668 return r; 2669 } 2670 adev->ip_blocks[i].status.hw = true; 2671 } 2672 } 2673 2674 return 0; 2675 } 2676 2677 /** 2678 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 2679 * 2680 * @adev: amdgpu_device pointer 2681 * 2682 * First resume function for hardware IPs. The list of all the hardware 2683 * IPs that make up the asic is walked and the resume callbacks are run for 2684 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 2685 * functional state after a suspend and updates the software state as 2686 * necessary. This function is also used for restoring the GPU after a GPU 2687 * reset. 2688 * Returns 0 on success, negative error code on failure. 2689 */ 2690 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 2691 { 2692 int i, r; 2693 2694 for (i = 0; i < adev->num_ip_blocks; i++) { 2695 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2696 continue; 2697 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2698 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2699 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 2700 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 2701 continue; 2702 r = adev->ip_blocks[i].version->funcs->resume(adev); 2703 if (r) { 2704 DRM_ERROR("resume of IP block <%s> failed %d\n", 2705 adev->ip_blocks[i].version->funcs->name, r); 2706 return r; 2707 } 2708 adev->ip_blocks[i].status.hw = true; 2709 } 2710 2711 return 0; 2712 } 2713 2714 /** 2715 * amdgpu_device_ip_resume - run resume for hardware IPs 2716 * 2717 * @adev: amdgpu_device pointer 2718 * 2719 * Main resume function for hardware IPs. 
The hardware IPs 2720 * are split into two resume functions because they are 2721 * also used in recovering from a GPU reset and some additional 2722 * steps need to be taken between them. In this case (S3/S4) they are 2723 * run sequentially. 2724 * Returns 0 on success, negative error code on failure. 2725 */ 2726 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 2727 { 2728 int r; 2729 2730 r = amdgpu_device_ip_resume_phase1(adev); 2731 if (r) 2732 return r; 2733 2734 r = amdgpu_device_fw_loading(adev); 2735 if (r) 2736 return r; 2737 2738 r = amdgpu_device_ip_resume_phase2(adev); 2739 2740 return r; 2741 } 2742 2743 /** 2744 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 2745 * 2746 * @adev: amdgpu_device pointer 2747 * 2748 * Query the VBIOS data tables to determine if the board supports SR-IOV. 2749 */ 2750 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 2751 { 2752 if (amdgpu_sriov_vf(adev)) { 2753 if (adev->is_atom_fw) { 2754 if (amdgpu_atomfirmware_gpu_supports_virtualization(adev)) 2755 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 2756 } else { 2757 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 2758 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 2759 } 2760 2761 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 2762 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 2763 } 2764 } 2765 2766 /** 2767 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 2768 * 2769 * @asic_type: AMD asic type 2770 * 2771 * Check if there is DC (new modesetting infrastructure) support for an asic. 2772 * returns true if DC has support, false if not. 2773 */ 2774 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 2775 { 2776 switch (asic_type) { 2777 #if defined(CONFIG_DRM_AMD_DC) 2778 case CHIP_BONAIRE: 2779 case CHIP_KAVERI: 2780 case CHIP_KABINI: 2781 case CHIP_MULLINS: 2782 /* 2783 * We have systems in the wild with these ASICs that require 2784 * LVDS and VGA support which is not supported with DC. 2785 * 2786 * Fall back to the non-DC driver here by default so as not to 2787 * cause regressions.
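		 *
		 * Users can still opt in explicitly: the check below only picks
		 * DC when amdgpu_dc was set to a positive value, presumably via
		 * the amdgpu.dc=1 module parameter.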
2788 */ 2789 return amdgpu_dc > 0; 2790 case CHIP_HAWAII: 2791 case CHIP_CARRIZO: 2792 case CHIP_STONEY: 2793 case CHIP_POLARIS10: 2794 case CHIP_POLARIS11: 2795 case CHIP_POLARIS12: 2796 case CHIP_VEGAM: 2797 case CHIP_TONGA: 2798 case CHIP_FIJI: 2799 case CHIP_VEGA10: 2800 case CHIP_VEGA12: 2801 case CHIP_VEGA20: 2802 #if defined(CONFIG_DRM_AMD_DC_DCN) 2803 case CHIP_RAVEN: 2804 case CHIP_NAVI10: 2805 case CHIP_NAVI14: 2806 case CHIP_NAVI12: 2807 case CHIP_RENOIR: 2808 #endif 2809 #if defined(CONFIG_DRM_AMD_DC_DCN3_0) 2810 case CHIP_SIENNA_CICHLID: 2811 case CHIP_NAVY_FLOUNDER: 2812 #endif 2813 return amdgpu_dc != 0; 2814 #endif 2815 default: 2816 if (amdgpu_dc > 0) 2817 DRM_INFO("Display Core has been requested via kernel parameter " 2818 "but isn't supported by ASIC, ignoring\n"); 2819 return false; 2820 } 2821 } 2822 2823 /** 2824 * amdgpu_device_has_dc_support - check if dc is supported 2825 * 2826 * @adev: amdgpu_device_pointer 2827 * 2828 * Returns true for supported, false for not supported 2829 */ 2830 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 2831 { 2832 if (amdgpu_sriov_vf(adev)) 2833 return false; 2834 2835 return amdgpu_device_asic_has_dc_support(adev->asic_type); 2836 } 2837 2838 2839 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 2840 { 2841 struct amdgpu_device *adev = 2842 container_of(__work, struct amdgpu_device, xgmi_reset_work); 2843 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0); 2844 2845 /* It's a bug to not have a hive within this function */ 2846 if (WARN_ON(!hive)) 2847 return; 2848 2849 /* 2850 * Use task barrier to synchronize all xgmi reset works across the 2851 * hive. task_barrier_enter and task_barrier_exit will block 2852 * until all the threads running the xgmi reset works reach 2853 * those points. task_barrier_full will do both blocks. 2854 */ 2855 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 2856 2857 task_barrier_enter(&hive->tb); 2858 adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev); 2859 2860 if (adev->asic_reset_res) 2861 goto fail; 2862 2863 task_barrier_exit(&hive->tb); 2864 adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev); 2865 2866 if (adev->asic_reset_res) 2867 goto fail; 2868 2869 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count) 2870 adev->mmhub.funcs->reset_ras_error_count(adev); 2871 } else { 2872 2873 task_barrier_full(&hive->tb); 2874 adev->asic_reset_res = amdgpu_asic_reset(adev); 2875 } 2876 2877 fail: 2878 if (adev->asic_reset_res) 2879 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 2880 adev->asic_reset_res, adev->ddev->unique); 2881 } 2882 2883 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 2884 { 2885 char *input = amdgpu_lockup_timeout; 2886 char *timeout_setting = NULL; 2887 int index = 0; 2888 long timeout; 2889 int ret = 0; 2890 2891 /* 2892 * By default timeout for non compute jobs is 10000. 2893 * And there is no timeout enforced on compute jobs. 2894 * In SR-IOV or passthrough mode, timeout for compute 2895 * jobs are 60000 by default. 
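 *
 * The parser below accepts up to four comma separated values which are
 * applied in the order gfx, compute, sdma, video; 0 keeps the default
 * for that slot and a negative value means MAX_SCHEDULE_TIMEOUT.  For
 * example (illustrative values only):
 *
 *   amdgpu.lockup_timeout=10000,60000,10000,10000
 *
 * A single value is applied to all non-compute queues, as handled at
 * the end of the function.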
2896 */ 2897 adev->gfx_timeout = msecs_to_jiffies(10000); 2898 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 2899 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 2900 adev->compute_timeout = msecs_to_jiffies(60000); 2901 else 2902 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; 2903 2904 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 2905 while ((timeout_setting = strsep(&input, ",")) && 2906 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 2907 ret = kstrtol(timeout_setting, 0, &timeout); 2908 if (ret) 2909 return ret; 2910 2911 if (timeout == 0) { 2912 index++; 2913 continue; 2914 } else if (timeout < 0) { 2915 timeout = MAX_SCHEDULE_TIMEOUT; 2916 } else { 2917 timeout = msecs_to_jiffies(timeout); 2918 } 2919 2920 switch (index++) { 2921 case 0: 2922 adev->gfx_timeout = timeout; 2923 break; 2924 case 1: 2925 adev->compute_timeout = timeout; 2926 break; 2927 case 2: 2928 adev->sdma_timeout = timeout; 2929 break; 2930 case 3: 2931 adev->video_timeout = timeout; 2932 break; 2933 default: 2934 break; 2935 } 2936 } 2937 /* 2938 * There is only one value specified and 2939 * it should apply to all non-compute jobs. 2940 */ 2941 if (index == 1) { 2942 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 2943 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 2944 adev->compute_timeout = adev->gfx_timeout; 2945 } 2946 } 2947 2948 return ret; 2949 } 2950 2951 static const struct attribute *amdgpu_dev_attributes[] = { 2952 &dev_attr_product_name.attr, 2953 &dev_attr_product_number.attr, 2954 &dev_attr_serial_number.attr, 2955 &dev_attr_pcie_replay_count.attr, 2956 NULL 2957 }; 2958 2959 /** 2960 * amdgpu_device_init - initialize the driver 2961 * 2962 * @adev: amdgpu_device pointer 2963 * @ddev: drm dev pointer 2964 * @pdev: pci dev pointer 2965 * @flags: driver flags 2966 * 2967 * Initializes the driver info and hw (all asics). 2968 * Returns 0 for success or an error on failure. 2969 * Called at driver startup. 
2970 */ 2971 int amdgpu_device_init(struct amdgpu_device *adev, 2972 struct drm_device *ddev, 2973 struct pci_dev *pdev, 2974 uint32_t flags) 2975 { 2976 int r, i; 2977 bool boco = false; 2978 u32 max_MBps; 2979 2980 adev->shutdown = false; 2981 adev->dev = &pdev->dev; 2982 adev->ddev = ddev; 2983 adev->pdev = pdev; 2984 adev->flags = flags; 2985 2986 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 2987 adev->asic_type = amdgpu_force_asic_type; 2988 else 2989 adev->asic_type = flags & AMD_ASIC_MASK; 2990 2991 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 2992 if (amdgpu_emu_mode == 1) 2993 adev->usec_timeout *= 10; 2994 adev->gmc.gart_size = 512 * 1024 * 1024; 2995 adev->accel_working = false; 2996 adev->num_rings = 0; 2997 adev->mman.buffer_funcs = NULL; 2998 adev->mman.buffer_funcs_ring = NULL; 2999 adev->vm_manager.vm_pte_funcs = NULL; 3000 adev->vm_manager.vm_pte_num_scheds = 0; 3001 adev->gmc.gmc_funcs = NULL; 3002 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3003 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3004 3005 adev->smc_rreg = &amdgpu_invalid_rreg; 3006 adev->smc_wreg = &amdgpu_invalid_wreg; 3007 adev->pcie_rreg = &amdgpu_invalid_rreg; 3008 adev->pcie_wreg = &amdgpu_invalid_wreg; 3009 adev->pciep_rreg = &amdgpu_invalid_rreg; 3010 adev->pciep_wreg = &amdgpu_invalid_wreg; 3011 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3012 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3013 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3014 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3015 adev->didt_rreg = &amdgpu_invalid_rreg; 3016 adev->didt_wreg = &amdgpu_invalid_wreg; 3017 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3018 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3019 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3020 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3021 3022 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3023 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3024 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3025 3026 /* mutex initialization are all done here so we 3027 * can recall function without having locking issues */ 3028 atomic_set(&adev->irq.ih.lock, 0); 3029 mutex_init(&adev->firmware.mutex); 3030 mutex_init(&adev->pm.mutex); 3031 mutex_init(&adev->gfx.gpu_clock_mutex); 3032 mutex_init(&adev->srbm_mutex); 3033 mutex_init(&adev->gfx.pipe_reserve_mutex); 3034 mutex_init(&adev->gfx.gfx_off_mutex); 3035 mutex_init(&adev->grbm_idx_mutex); 3036 mutex_init(&adev->mn_lock); 3037 mutex_init(&adev->virt.vf_errors.lock); 3038 hash_init(adev->mn_hash); 3039 init_rwsem(&adev->reset_sem); 3040 atomic_set(&adev->in_gpu_reset, 0); 3041 mutex_init(&adev->psp.mutex); 3042 mutex_init(&adev->notifier_lock); 3043 3044 r = amdgpu_device_check_arguments(adev); 3045 if (r) 3046 return r; 3047 3048 spin_lock_init(&adev->mmio_idx_lock); 3049 spin_lock_init(&adev->smc_idx_lock); 3050 spin_lock_init(&adev->pcie_idx_lock); 3051 spin_lock_init(&adev->uvd_ctx_idx_lock); 3052 spin_lock_init(&adev->didt_idx_lock); 3053 spin_lock_init(&adev->gc_cac_idx_lock); 3054 spin_lock_init(&adev->se_cac_idx_lock); 3055 spin_lock_init(&adev->audio_endpt_idx_lock); 3056 spin_lock_init(&adev->mm_stats.lock); 3057 3058 INIT_LIST_HEAD(&adev->shadow_list); 3059 mutex_init(&adev->shadow_list_lock); 3060 3061 INIT_DELAYED_WORK(&adev->delayed_init_work, 3062 amdgpu_device_delayed_init_work_handler); 3063 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3064 
amdgpu_device_delay_enable_gfx_off); 3065 3066 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3067 3068 adev->gfx.gfx_off_req_count = 1; 3069 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3070 3071 atomic_set(&adev->throttling_logging_enabled, 1); 3072 /* 3073 * If throttling continues, logging will be performed every minute 3074 * to avoid log flooding. "-1" is subtracted since the thermal 3075 * throttling interrupt comes every second. Thus, the total logging 3076 * interval is 59 seconds(retelimited printk interval) + 1(waiting 3077 * for throttling interrupt) = 60 seconds. 3078 */ 3079 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3080 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3081 3082 /* Registers mapping */ 3083 /* TODO: block userspace mapping of io register */ 3084 if (adev->asic_type >= CHIP_BONAIRE) { 3085 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3086 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3087 } else { 3088 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3089 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3090 } 3091 3092 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3093 if (adev->rmmio == NULL) { 3094 return -ENOMEM; 3095 } 3096 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3097 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3098 3099 /* io port mapping */ 3100 for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { 3101 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) { 3102 adev->rio_mem_size = pci_resource_len(adev->pdev, i); 3103 adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size); 3104 break; 3105 } 3106 } 3107 if (adev->rio_mem == NULL) 3108 DRM_INFO("PCI I/O BAR is not found.\n"); 3109 3110 /* enable PCIE atomic ops */ 3111 r = pci_enable_atomic_ops_to_root(adev->pdev, 3112 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3113 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3114 if (r) { 3115 adev->have_atomics_support = false; 3116 DRM_INFO("PCIE atomic ops is not supported\n"); 3117 } else { 3118 adev->have_atomics_support = true; 3119 } 3120 3121 amdgpu_device_get_pcie_info(adev); 3122 3123 if (amdgpu_mcbp) 3124 DRM_INFO("MCBP is enabled\n"); 3125 3126 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) 3127 adev->enable_mes = true; 3128 3129 /* detect hw virtualization here */ 3130 amdgpu_detect_virtualization(adev); 3131 3132 r = amdgpu_device_get_job_timeout_settings(adev); 3133 if (r) { 3134 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3135 return r; 3136 } 3137 3138 /* early init functions */ 3139 r = amdgpu_device_ip_early_init(adev); 3140 if (r) 3141 return r; 3142 3143 /* doorbell bar mapping and doorbell index init*/ 3144 amdgpu_device_doorbell_init(adev); 3145 3146 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3147 /* this will fail for cards that aren't VGA class devices, just 3148 * ignore it */ 3149 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode); 3150 3151 if (amdgpu_device_supports_boco(ddev)) 3152 boco = true; 3153 if (amdgpu_has_atpx() && 3154 (amdgpu_is_atpx_hybrid() || 3155 amdgpu_has_atpx_dgpu_power_cntl()) && 3156 !pci_is_thunderbolt_attached(adev->pdev)) 3157 vga_switcheroo_register_client(adev->pdev, 3158 &amdgpu_switcheroo_ops, boco); 3159 if (boco) 3160 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3161 3162 if (amdgpu_emu_mode == 1) { 3163 /* post the asic on emulation mode */ 3164 
emu_soc_asic_init(adev); 3165 goto fence_driver_init; 3166 } 3167 3168 /* detect if we are with an SRIOV vbios */ 3169 amdgpu_device_detect_sriov_bios(adev); 3170 3171 /* check if we need to reset the asic 3172 * E.g., driver was not cleanly unloaded previously, etc. 3173 */ 3174 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3175 r = amdgpu_asic_reset(adev); 3176 if (r) { 3177 dev_err(adev->dev, "asic reset on init failed\n"); 3178 goto failed; 3179 } 3180 } 3181 3182 /* Post card if necessary */ 3183 if (amdgpu_device_need_post(adev)) { 3184 if (!adev->bios) { 3185 dev_err(adev->dev, "no vBIOS found\n"); 3186 r = -EINVAL; 3187 goto failed; 3188 } 3189 DRM_INFO("GPU posting now...\n"); 3190 r = amdgpu_atom_asic_init(adev->mode_info.atom_context); 3191 if (r) { 3192 dev_err(adev->dev, "gpu post error!\n"); 3193 goto failed; 3194 } 3195 } 3196 3197 if (adev->is_atom_fw) { 3198 /* Initialize clocks */ 3199 r = amdgpu_atomfirmware_get_clock_info(adev); 3200 if (r) { 3201 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3202 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3203 goto failed; 3204 } 3205 } else { 3206 /* Initialize clocks */ 3207 r = amdgpu_atombios_get_clock_info(adev); 3208 if (r) { 3209 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3210 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3211 goto failed; 3212 } 3213 /* init i2c buses */ 3214 if (!amdgpu_device_has_dc_support(adev)) 3215 amdgpu_atombios_i2c_init(adev); 3216 } 3217 3218 fence_driver_init: 3219 /* Fence driver */ 3220 r = amdgpu_fence_driver_init(adev); 3221 if (r) { 3222 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n"); 3223 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3224 goto failed; 3225 } 3226 3227 /* init the mode config */ 3228 drm_mode_config_init(adev->ddev); 3229 3230 r = amdgpu_device_ip_init(adev); 3231 if (r) { 3232 /* failed in exclusive mode due to timeout */ 3233 if (amdgpu_sriov_vf(adev) && 3234 !amdgpu_sriov_runtime(adev) && 3235 amdgpu_virt_mmio_blocked(adev) && 3236 !amdgpu_virt_wait_reset(adev)) { 3237 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3238 /* Don't send request since VF is inactive. */ 3239 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3240 adev->virt.ops = NULL; 3241 r = -EAGAIN; 3242 goto failed; 3243 } 3244 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3245 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3246 goto failed; 3247 } 3248 3249 dev_info(adev->dev, 3250 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3251 adev->gfx.config.max_shader_engines, 3252 adev->gfx.config.max_sh_per_se, 3253 adev->gfx.config.max_cu_per_sh, 3254 adev->gfx.cu_info.number); 3255 3256 adev->accel_working = true; 3257 3258 amdgpu_vm_check_compute_bug(adev); 3259 3260 /* Initialize the buffer migration limit. */ 3261 if (amdgpu_moverate >= 0) 3262 max_MBps = amdgpu_moverate; 3263 else 3264 max_MBps = 8; /* Allow 8 MB/s. */ 3265 /* Get a log2 for easy divisions. 
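	 * For example, the default of 8 MB/s gives ilog2(8) = 3, so a divide
	 * by the rate in the mm_stats accounting becomes a right shift by
	 * log2_max_MBps.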
*/ 3266 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3267 3268 amdgpu_fbdev_init(adev); 3269 3270 r = amdgpu_pm_sysfs_init(adev); 3271 if (r) { 3272 adev->pm_sysfs_en = false; 3273 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3274 } else 3275 adev->pm_sysfs_en = true; 3276 3277 r = amdgpu_ucode_sysfs_init(adev); 3278 if (r) { 3279 adev->ucode_sysfs_en = false; 3280 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3281 } else 3282 adev->ucode_sysfs_en = true; 3283 3284 if ((amdgpu_testing & 1)) { 3285 if (adev->accel_working) 3286 amdgpu_test_moves(adev); 3287 else 3288 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n"); 3289 } 3290 if (amdgpu_benchmarking) { 3291 if (adev->accel_working) 3292 amdgpu_benchmark(adev, amdgpu_benchmarking); 3293 else 3294 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); 3295 } 3296 3297 /* 3298 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3299 * Otherwise the mgpu fan boost feature will be skipped due to the 3300 * gpu instance is counted less. 3301 */ 3302 amdgpu_register_gpu_instance(adev); 3303 3304 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3305 * explicit gating rather than handling it automatically. 3306 */ 3307 r = amdgpu_device_ip_late_init(adev); 3308 if (r) { 3309 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3310 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3311 goto failed; 3312 } 3313 3314 /* must succeed. */ 3315 amdgpu_ras_resume(adev); 3316 3317 queue_delayed_work(system_wq, &adev->delayed_init_work, 3318 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3319 3320 if (amdgpu_sriov_vf(adev)) 3321 flush_delayed_work(&adev->delayed_init_work); 3322 3323 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3324 if (r) { 3325 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3326 return r; 3327 } 3328 3329 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3330 r = amdgpu_pmu_init(adev); 3331 if (r) 3332 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3333 3334 return 0; 3335 3336 failed: 3337 amdgpu_vf_error_trans_all(adev); 3338 if (boco) 3339 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3340 3341 return r; 3342 } 3343 3344 /** 3345 * amdgpu_device_fini - tear down the driver 3346 * 3347 * @adev: amdgpu_device pointer 3348 * 3349 * Tear down the driver info (all asics). 3350 * Called at driver shutdown. 
3351 */ 3352 void amdgpu_device_fini(struct amdgpu_device *adev) 3353 { 3354 int r; 3355 3356 DRM_INFO("amdgpu: finishing device.\n"); 3357 flush_delayed_work(&adev->delayed_init_work); 3358 adev->shutdown = true; 3359 3360 /* make sure IB test finished before entering exclusive mode 3361 * to avoid preemption on IB test 3362 * */ 3363 if (amdgpu_sriov_vf(adev)) 3364 amdgpu_virt_request_full_gpu(adev, false); 3365 3366 /* disable all interrupts */ 3367 amdgpu_irq_disable_all(adev); 3368 if (adev->mode_info.mode_config_initialized){ 3369 if (!amdgpu_device_has_dc_support(adev)) 3370 drm_helper_force_disable_all(adev->ddev); 3371 else 3372 drm_atomic_helper_shutdown(adev->ddev); 3373 } 3374 amdgpu_fence_driver_fini(adev); 3375 if (adev->pm_sysfs_en) 3376 amdgpu_pm_sysfs_fini(adev); 3377 amdgpu_fbdev_fini(adev); 3378 r = amdgpu_device_ip_fini(adev); 3379 release_firmware(adev->firmware.gpu_info_fw); 3380 adev->firmware.gpu_info_fw = NULL; 3381 adev->accel_working = false; 3382 /* free i2c buses */ 3383 if (!amdgpu_device_has_dc_support(adev)) 3384 amdgpu_i2c_fini(adev); 3385 3386 if (amdgpu_emu_mode != 1) 3387 amdgpu_atombios_fini(adev); 3388 3389 kfree(adev->bios); 3390 adev->bios = NULL; 3391 if (amdgpu_has_atpx() && 3392 (amdgpu_is_atpx_hybrid() || 3393 amdgpu_has_atpx_dgpu_power_cntl()) && 3394 !pci_is_thunderbolt_attached(adev->pdev)) 3395 vga_switcheroo_unregister_client(adev->pdev); 3396 if (amdgpu_device_supports_boco(adev->ddev)) 3397 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3398 vga_client_register(adev->pdev, NULL, NULL, NULL); 3399 if (adev->rio_mem) 3400 pci_iounmap(adev->pdev, adev->rio_mem); 3401 adev->rio_mem = NULL; 3402 iounmap(adev->rmmio); 3403 adev->rmmio = NULL; 3404 amdgpu_device_doorbell_fini(adev); 3405 3406 if (adev->ucode_sysfs_en) 3407 amdgpu_ucode_sysfs_fini(adev); 3408 3409 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 3410 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3411 amdgpu_pmu_fini(adev); 3412 if (adev->discovery_bin) 3413 amdgpu_discovery_fini(adev); 3414 } 3415 3416 3417 /* 3418 * Suspend & resume. 3419 */ 3420 /** 3421 * amdgpu_device_suspend - initiate device suspend 3422 * 3423 * @dev: drm dev pointer 3424 * @fbcon : notify the fbdev of suspend 3425 * 3426 * Puts the hw in the suspend state (all asics). 3427 * Returns 0 for success or an error on failure. 3428 * Called at driver suspend. 
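 *
 * The sequence implemented below is, roughly (abridged):
 *
 *   amdgpu_fbdev_set_suspend(adev, 1)       - if @fbcon
 *   ... DPMS off + unpin framebuffers/cursors on non-DC ASICs ...
 *   amdgpu_device_ip_suspend_phase1(adev)   - displays
 *   amdgpu_amdkfd_suspend(adev, !fbcon)
 *   amdgpu_bo_evict_vram(adev)
 *   amdgpu_fence_driver_suspend(adev)
 *   amdgpu_device_ip_suspend_phase2(adev)   - remaining IPs
 *   amdgpu_bo_evict_vram(adev)              - picks up the GART page table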
3429 */ 3430 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 3431 { 3432 struct amdgpu_device *adev; 3433 struct drm_crtc *crtc; 3434 struct drm_connector *connector; 3435 struct drm_connector_list_iter iter; 3436 int r; 3437 3438 if (dev == NULL || dev->dev_private == NULL) { 3439 return -ENODEV; 3440 } 3441 3442 adev = dev->dev_private; 3443 3444 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3445 return 0; 3446 3447 adev->in_suspend = true; 3448 drm_kms_helper_poll_disable(dev); 3449 3450 if (fbcon) 3451 amdgpu_fbdev_set_suspend(adev, 1); 3452 3453 cancel_delayed_work_sync(&adev->delayed_init_work); 3454 3455 if (!amdgpu_device_has_dc_support(adev)) { 3456 /* turn off display hw */ 3457 drm_modeset_lock_all(dev); 3458 drm_connector_list_iter_begin(dev, &iter); 3459 drm_for_each_connector_iter(connector, &iter) 3460 drm_helper_connector_dpms(connector, 3461 DRM_MODE_DPMS_OFF); 3462 drm_connector_list_iter_end(&iter); 3463 drm_modeset_unlock_all(dev); 3464 /* unpin the front buffers and cursors */ 3465 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3466 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3467 struct drm_framebuffer *fb = crtc->primary->fb; 3468 struct amdgpu_bo *robj; 3469 3470 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3471 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3472 r = amdgpu_bo_reserve(aobj, true); 3473 if (r == 0) { 3474 amdgpu_bo_unpin(aobj); 3475 amdgpu_bo_unreserve(aobj); 3476 } 3477 } 3478 3479 if (fb == NULL || fb->obj[0] == NULL) { 3480 continue; 3481 } 3482 robj = gem_to_amdgpu_bo(fb->obj[0]); 3483 /* don't unpin kernel fb objects */ 3484 if (!amdgpu_fbdev_robj_is_fb(adev, robj)) { 3485 r = amdgpu_bo_reserve(robj, true); 3486 if (r == 0) { 3487 amdgpu_bo_unpin(robj); 3488 amdgpu_bo_unreserve(robj); 3489 } 3490 } 3491 } 3492 } 3493 3494 amdgpu_ras_suspend(adev); 3495 3496 r = amdgpu_device_ip_suspend_phase1(adev); 3497 3498 amdgpu_amdkfd_suspend(adev, !fbcon); 3499 3500 /* evict vram memory */ 3501 amdgpu_bo_evict_vram(adev); 3502 3503 amdgpu_fence_driver_suspend(adev); 3504 3505 r = amdgpu_device_ip_suspend_phase2(adev); 3506 3507 /* evict remaining vram memory 3508 * This second call to evict vram is to evict the gart page table 3509 * using the CPU. 3510 */ 3511 amdgpu_bo_evict_vram(adev); 3512 3513 return 0; 3514 } 3515 3516 /** 3517 * amdgpu_device_resume - initiate device resume 3518 * 3519 * @dev: drm dev pointer 3520 * @fbcon : notify the fbdev of resume 3521 * 3522 * Bring the hw back to operating state (all asics). 3523 * Returns 0 for success or an error on failure. 3524 * Called at driver resume. 
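 *
 * Roughly the reverse of amdgpu_device_suspend() (abridged):
 *
 *   amdgpu_atom_asic_init()     - only if a post is needed
 *   amdgpu_device_ip_resume()   - phase1, firmware loading, phase2
 *   amdgpu_fence_driver_resume()
 *   amdgpu_device_ip_late_init()
 *   ... re-pin cursors, resume KFD, restore the mode, re-enable polling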
3525 */ 3526 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 3527 { 3528 struct drm_connector *connector; 3529 struct drm_connector_list_iter iter; 3530 struct amdgpu_device *adev = dev->dev_private; 3531 struct drm_crtc *crtc; 3532 int r = 0; 3533 3534 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3535 return 0; 3536 3537 /* post card */ 3538 if (amdgpu_device_need_post(adev)) { 3539 r = amdgpu_atom_asic_init(adev->mode_info.atom_context); 3540 if (r) 3541 DRM_ERROR("amdgpu asic init failed\n"); 3542 } 3543 3544 r = amdgpu_device_ip_resume(adev); 3545 if (r) { 3546 DRM_ERROR("amdgpu_device_ip_resume failed (%d).\n", r); 3547 return r; 3548 } 3549 amdgpu_fence_driver_resume(adev); 3550 3551 3552 r = amdgpu_device_ip_late_init(adev); 3553 if (r) 3554 return r; 3555 3556 queue_delayed_work(system_wq, &adev->delayed_init_work, 3557 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3558 3559 if (!amdgpu_device_has_dc_support(adev)) { 3560 /* pin cursors */ 3561 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3562 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3563 3564 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3565 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3566 r = amdgpu_bo_reserve(aobj, true); 3567 if (r == 0) { 3568 r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM); 3569 if (r != 0) 3570 DRM_ERROR("Failed to pin cursor BO (%d)\n", r); 3571 amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj); 3572 amdgpu_bo_unreserve(aobj); 3573 } 3574 } 3575 } 3576 } 3577 r = amdgpu_amdkfd_resume(adev, !fbcon); 3578 if (r) 3579 return r; 3580 3581 /* Make sure IB tests flushed */ 3582 flush_delayed_work(&adev->delayed_init_work); 3583 3584 /* blat the mode back in */ 3585 if (fbcon) { 3586 if (!amdgpu_device_has_dc_support(adev)) { 3587 /* pre DCE11 */ 3588 drm_helper_resume_force_mode(dev); 3589 3590 /* turn on display hw */ 3591 drm_modeset_lock_all(dev); 3592 3593 drm_connector_list_iter_begin(dev, &iter); 3594 drm_for_each_connector_iter(connector, &iter) 3595 drm_helper_connector_dpms(connector, 3596 DRM_MODE_DPMS_ON); 3597 drm_connector_list_iter_end(&iter); 3598 3599 drm_modeset_unlock_all(dev); 3600 } 3601 amdgpu_fbdev_set_suspend(adev, 0); 3602 } 3603 3604 drm_kms_helper_poll_enable(dev); 3605 3606 amdgpu_ras_resume(adev); 3607 3608 /* 3609 * Most of the connector probing functions try to acquire runtime pm 3610 * refs to ensure that the GPU is powered on when connector polling is 3611 * performed. Since we're calling this from a runtime PM callback, 3612 * trying to acquire rpm refs will cause us to deadlock. 3613 * 3614 * Since we're guaranteed to be holding the rpm lock, it's safe to 3615 * temporarily disable the rpm helpers so this doesn't deadlock us. 3616 */ 3617 #ifdef CONFIG_PM 3618 dev->dev->power.disable_depth++; 3619 #endif 3620 if (!amdgpu_device_has_dc_support(adev)) 3621 drm_helper_hpd_irq_event(dev); 3622 else 3623 drm_kms_helper_hotplug_event(dev); 3624 #ifdef CONFIG_PM 3625 dev->dev->power.disable_depth--; 3626 #endif 3627 adev->in_suspend = false; 3628 3629 return 0; 3630 } 3631 3632 /** 3633 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 3634 * 3635 * @adev: amdgpu_device pointer 3636 * 3637 * The list of all the hardware IPs that make up the asic is walked and 3638 * the check_soft_reset callbacks are run. check_soft_reset determines 3639 * if the asic is still hung or not. 3640 * Returns true if any of the IPs are still in a hung state, false if not. 
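 *
 * Note that SR-IOV VFs and ASICs for which amdgpu_asic_need_full_reset()
 * is true report a hang unconditionally, without consulting the per-IP
 * check_soft_reset hooks.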
3641 */ 3642 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 3643 { 3644 int i; 3645 bool asic_hang = false; 3646 3647 if (amdgpu_sriov_vf(adev)) 3648 return true; 3649 3650 if (amdgpu_asic_need_full_reset(adev)) 3651 return true; 3652 3653 for (i = 0; i < adev->num_ip_blocks; i++) { 3654 if (!adev->ip_blocks[i].status.valid) 3655 continue; 3656 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 3657 adev->ip_blocks[i].status.hang = 3658 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 3659 if (adev->ip_blocks[i].status.hang) { 3660 DRM_INFO("IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 3661 asic_hang = true; 3662 } 3663 } 3664 return asic_hang; 3665 } 3666 3667 /** 3668 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 3669 * 3670 * @adev: amdgpu_device pointer 3671 * 3672 * The list of all the hardware IPs that make up the asic is walked and the 3673 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 3674 * handles any IP specific hardware or software state changes that are 3675 * necessary for a soft reset to succeed. 3676 * Returns 0 on success, negative error code on failure. 3677 */ 3678 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 3679 { 3680 int i, r = 0; 3681 3682 for (i = 0; i < adev->num_ip_blocks; i++) { 3683 if (!adev->ip_blocks[i].status.valid) 3684 continue; 3685 if (adev->ip_blocks[i].status.hang && 3686 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 3687 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 3688 if (r) 3689 return r; 3690 } 3691 } 3692 3693 return 0; 3694 } 3695 3696 /** 3697 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 3698 * 3699 * @adev: amdgpu_device pointer 3700 * 3701 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 3702 * reset is necessary to recover. 3703 * Returns true if a full asic reset is required, false if not. 3704 */ 3705 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 3706 { 3707 int i; 3708 3709 if (amdgpu_asic_need_full_reset(adev)) 3710 return true; 3711 3712 for (i = 0; i < adev->num_ip_blocks; i++) { 3713 if (!adev->ip_blocks[i].status.valid) 3714 continue; 3715 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 3716 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 3717 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 3718 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 3719 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3720 if (adev->ip_blocks[i].status.hang) { 3721 DRM_INFO("Some block need full reset!\n"); 3722 return true; 3723 } 3724 } 3725 } 3726 return false; 3727 } 3728 3729 /** 3730 * amdgpu_device_ip_soft_reset - do a soft reset 3731 * 3732 * @adev: amdgpu_device pointer 3733 * 3734 * The list of all the hardware IPs that make up the asic is walked and the 3735 * soft_reset callbacks are run if the block is hung. soft_reset handles any 3736 * IP specific hardware or software state changes that are necessary to soft 3737 * reset the IP. 3738 * Returns 0 on success, negative error code on failure. 
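 *
 * During recovery (see amdgpu_device_pre_asic_reset() below) the soft
 * reset path is only attempted on bare metal when no block requires a
 * full reset:
 *
 *   amdgpu_device_ip_pre_soft_reset(adev);
 *   amdgpu_device_ip_soft_reset(adev);
 *   amdgpu_device_ip_post_soft_reset(adev);
 *   amdgpu_device_ip_check_soft_reset(adev)  - still hung? fall back to
 *                                              a full reset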
3739 */ 3740 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 3741 { 3742 int i, r = 0; 3743 3744 for (i = 0; i < adev->num_ip_blocks; i++) { 3745 if (!adev->ip_blocks[i].status.valid) 3746 continue; 3747 if (adev->ip_blocks[i].status.hang && 3748 adev->ip_blocks[i].version->funcs->soft_reset) { 3749 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 3750 if (r) 3751 return r; 3752 } 3753 } 3754 3755 return 0; 3756 } 3757 3758 /** 3759 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 3760 * 3761 * @adev: amdgpu_device pointer 3762 * 3763 * The list of all the hardware IPs that make up the asic is walked and the 3764 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 3765 * handles any IP specific hardware or software state changes that are 3766 * necessary after the IP has been soft reset. 3767 * Returns 0 on success, negative error code on failure. 3768 */ 3769 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 3770 { 3771 int i, r = 0; 3772 3773 for (i = 0; i < adev->num_ip_blocks; i++) { 3774 if (!adev->ip_blocks[i].status.valid) 3775 continue; 3776 if (adev->ip_blocks[i].status.hang && 3777 adev->ip_blocks[i].version->funcs->post_soft_reset) 3778 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 3779 if (r) 3780 return r; 3781 } 3782 3783 return 0; 3784 } 3785 3786 /** 3787 * amdgpu_device_recover_vram - Recover some VRAM contents 3788 * 3789 * @adev: amdgpu_device pointer 3790 * 3791 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 3792 * restore things like GPUVM page tables after a GPU reset where 3793 * the contents of VRAM might be lost. 3794 * 3795 * Returns: 3796 * 0 on success, negative error code on failure. 3797 */ 3798 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 3799 { 3800 struct dma_fence *fence = NULL, *next = NULL; 3801 struct amdgpu_bo *shadow; 3802 long r = 1, tmo; 3803 3804 if (amdgpu_sriov_runtime(adev)) 3805 tmo = msecs_to_jiffies(8000); 3806 else 3807 tmo = msecs_to_jiffies(100); 3808 3809 DRM_INFO("recover vram bo from shadow start\n"); 3810 mutex_lock(&adev->shadow_list_lock); 3811 list_for_each_entry(shadow, &adev->shadow_list, shadow_list) { 3812 3813 /* No need to recover an evicted BO */ 3814 if (shadow->tbo.mem.mem_type != TTM_PL_TT || 3815 shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET || 3816 shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM) 3817 continue; 3818 3819 r = amdgpu_bo_restore_shadow(shadow, &next); 3820 if (r) 3821 break; 3822 3823 if (fence) { 3824 tmo = dma_fence_wait_timeout(fence, false, tmo); 3825 dma_fence_put(fence); 3826 fence = next; 3827 if (tmo == 0) { 3828 r = -ETIMEDOUT; 3829 break; 3830 } else if (tmo < 0) { 3831 r = tmo; 3832 break; 3833 } 3834 } else { 3835 fence = next; 3836 } 3837 } 3838 mutex_unlock(&adev->shadow_list_lock); 3839 3840 if (fence) 3841 tmo = dma_fence_wait_timeout(fence, false, tmo); 3842 dma_fence_put(fence); 3843 3844 if (r < 0 || tmo <= 0) { 3845 DRM_ERROR("recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 3846 return -EIO; 3847 } 3848 3849 DRM_INFO("recover vram bo from shadow done\n"); 3850 return 0; 3851 } 3852 3853 3854 /** 3855 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 3856 * 3857 * @adev: amdgpu device pointer 3858 * @from_hypervisor: request from hypervisor 3859 * 3860 * do VF FLR and reinitialize Asic 3861 * return 0 means succeeded otherwise failed 3862 */ 3863 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 
3864 bool from_hypervisor) 3865 { 3866 int r; 3867 3868 if (from_hypervisor) 3869 r = amdgpu_virt_request_full_gpu(adev, true); 3870 else 3871 r = amdgpu_virt_reset_gpu(adev); 3872 if (r) 3873 return r; 3874 3875 amdgpu_amdkfd_pre_reset(adev); 3876 3877 /* Resume IP prior to SMC */ 3878 r = amdgpu_device_ip_reinit_early_sriov(adev); 3879 if (r) 3880 goto error; 3881 3882 amdgpu_virt_init_data_exchange(adev); 3883 /* we need recover gart prior to run SMC/CP/SDMA resume */ 3884 amdgpu_gtt_mgr_recover(&adev->mman.bdev.man[TTM_PL_TT]); 3885 3886 r = amdgpu_device_fw_loading(adev); 3887 if (r) 3888 return r; 3889 3890 /* now we are okay to resume SMC/CP/SDMA */ 3891 r = amdgpu_device_ip_reinit_late_sriov(adev); 3892 if (r) 3893 goto error; 3894 3895 amdgpu_irq_gpu_reset_resume_helper(adev); 3896 r = amdgpu_ib_ring_tests(adev); 3897 amdgpu_amdkfd_post_reset(adev); 3898 3899 error: 3900 amdgpu_virt_release_full_gpu(adev, true); 3901 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 3902 amdgpu_inc_vram_lost(adev); 3903 r = amdgpu_device_recover_vram(adev); 3904 } 3905 3906 return r; 3907 } 3908 3909 /** 3910 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 3911 * 3912 * @adev: amdgpu device pointer 3913 * 3914 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 3915 * a hung GPU. 3916 */ 3917 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 3918 { 3919 if (!amdgpu_device_ip_check_soft_reset(adev)) { 3920 DRM_INFO("Timeout, but no hardware hang detected.\n"); 3921 return false; 3922 } 3923 3924 if (amdgpu_gpu_recovery == 0) 3925 goto disabled; 3926 3927 if (amdgpu_sriov_vf(adev)) 3928 return true; 3929 3930 if (amdgpu_gpu_recovery == -1) { 3931 switch (adev->asic_type) { 3932 case CHIP_BONAIRE: 3933 case CHIP_HAWAII: 3934 case CHIP_TOPAZ: 3935 case CHIP_TONGA: 3936 case CHIP_FIJI: 3937 case CHIP_POLARIS10: 3938 case CHIP_POLARIS11: 3939 case CHIP_POLARIS12: 3940 case CHIP_VEGAM: 3941 case CHIP_VEGA20: 3942 case CHIP_VEGA10: 3943 case CHIP_VEGA12: 3944 case CHIP_RAVEN: 3945 case CHIP_ARCTURUS: 3946 case CHIP_RENOIR: 3947 case CHIP_NAVI10: 3948 case CHIP_NAVI14: 3949 case CHIP_NAVI12: 3950 case CHIP_SIENNA_CICHLID: 3951 break; 3952 default: 3953 goto disabled; 3954 } 3955 } 3956 3957 return true; 3958 3959 disabled: 3960 DRM_INFO("GPU recovery disabled.\n"); 3961 return false; 3962 } 3963 3964 3965 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 3966 struct amdgpu_job *job, 3967 bool *need_full_reset_arg) 3968 { 3969 int i, r = 0; 3970 bool need_full_reset = *need_full_reset_arg; 3971 3972 amdgpu_debugfs_wait_dump(adev); 3973 3974 /* block all schedulers and reset given job's ring */ 3975 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 3976 struct amdgpu_ring *ring = adev->rings[i]; 3977 3978 if (!ring || !ring->sched.thread) 3979 continue; 3980 3981 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 3982 amdgpu_fence_driver_force_completion(ring); 3983 } 3984 3985 if(job) 3986 drm_sched_increase_karma(&job->base); 3987 3988 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 3989 if (!amdgpu_sriov_vf(adev)) { 3990 3991 if (!need_full_reset) 3992 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 3993 3994 if (!need_full_reset) { 3995 amdgpu_device_ip_pre_soft_reset(adev); 3996 r = amdgpu_device_ip_soft_reset(adev); 3997 amdgpu_device_ip_post_soft_reset(adev); 3998 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 3999 DRM_INFO("soft reset 
failed, will fallback to full reset!\n"); 4000 need_full_reset = true; 4001 } 4002 } 4003 4004 if (need_full_reset) 4005 r = amdgpu_device_ip_suspend(adev); 4006 4007 *need_full_reset_arg = need_full_reset; 4008 } 4009 4010 return r; 4011 } 4012 4013 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, 4014 struct list_head *device_list_handle, 4015 bool *need_full_reset_arg) 4016 { 4017 struct amdgpu_device *tmp_adev = NULL; 4018 bool need_full_reset = *need_full_reset_arg, vram_lost = false; 4019 int r = 0; 4020 4021 /* 4022 * ASIC reset has to be done on all HGMI hive nodes ASAP 4023 * to allow proper links negotiation in FW (within 1 sec) 4024 */ 4025 if (need_full_reset) { 4026 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4027 /* For XGMI run all resets in parallel to speed up the process */ 4028 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4029 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4030 r = -EALREADY; 4031 } else 4032 r = amdgpu_asic_reset(tmp_adev); 4033 4034 if (r) { 4035 DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s", 4036 r, tmp_adev->ddev->unique); 4037 break; 4038 } 4039 } 4040 4041 /* For XGMI wait for all resets to complete before proceed */ 4042 if (!r) { 4043 list_for_each_entry(tmp_adev, device_list_handle, 4044 gmc.xgmi.head) { 4045 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4046 flush_work(&tmp_adev->xgmi_reset_work); 4047 r = tmp_adev->asic_reset_res; 4048 if (r) 4049 break; 4050 } 4051 } 4052 } 4053 } 4054 4055 if (!r && amdgpu_ras_intr_triggered()) { 4056 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4057 if (tmp_adev->mmhub.funcs && 4058 tmp_adev->mmhub.funcs->reset_ras_error_count) 4059 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev); 4060 } 4061 4062 amdgpu_ras_intr_cleared(); 4063 } 4064 4065 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4066 if (need_full_reset) { 4067 /* post card */ 4068 if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context)) { 4069 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4070 r = -EAGAIN; 4071 goto out; 4072 } 4073 4074 if (!r) { 4075 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4076 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4077 if (r) 4078 goto out; 4079 4080 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4081 if (vram_lost) { 4082 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4083 amdgpu_inc_vram_lost(tmp_adev); 4084 } 4085 4086 r = amdgpu_gtt_mgr_recover( 4087 &tmp_adev->mman.bdev.man[TTM_PL_TT]); 4088 if (r) 4089 goto out; 4090 4091 r = amdgpu_device_fw_loading(tmp_adev); 4092 if (r) 4093 return r; 4094 4095 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4096 if (r) 4097 goto out; 4098 4099 if (vram_lost) 4100 amdgpu_device_fill_reset_magic(tmp_adev); 4101 4102 /* 4103 * Add this ASIC as tracked as reset was already 4104 * complete successfully. 4105 */ 4106 amdgpu_register_gpu_instance(tmp_adev); 4107 4108 r = amdgpu_device_ip_late_init(tmp_adev); 4109 if (r) 4110 goto out; 4111 4112 amdgpu_fbdev_set_suspend(tmp_adev, 0); 4113 4114 /* must succeed. 
*/ 4115 amdgpu_ras_resume(tmp_adev); 4116 4117 /* Update PSP FW topology after reset */ 4118 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4119 r = amdgpu_xgmi_update_topology(hive, tmp_adev); 4120 } 4121 } 4122 4123 4124 out: 4125 if (!r) { 4126 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 4127 r = amdgpu_ib_ring_tests(tmp_adev); 4128 if (r) { 4129 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 4130 r = amdgpu_device_ip_suspend(tmp_adev); 4131 need_full_reset = true; 4132 r = -EAGAIN; 4133 goto end; 4134 } 4135 } 4136 4137 if (!r) 4138 r = amdgpu_device_recover_vram(tmp_adev); 4139 else 4140 tmp_adev->asic_reset_res = r; 4141 } 4142 4143 end: 4144 *need_full_reset_arg = need_full_reset; 4145 return r; 4146 } 4147 4148 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev) 4149 { 4150 if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0) 4151 return false; 4152 4153 down_write(&adev->reset_sem); 4154 4155 atomic_inc(&adev->gpu_reset_counter); 4156 switch (amdgpu_asic_reset_method(adev)) { 4157 case AMD_RESET_METHOD_MODE1: 4158 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 4159 break; 4160 case AMD_RESET_METHOD_MODE2: 4161 adev->mp1_state = PP_MP1_STATE_RESET; 4162 break; 4163 default: 4164 adev->mp1_state = PP_MP1_STATE_NONE; 4165 break; 4166 } 4167 4168 return true; 4169 } 4170 4171 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) 4172 { 4173 amdgpu_vf_error_trans_all(adev); 4174 adev->mp1_state = PP_MP1_STATE_NONE; 4175 atomic_set(&adev->in_gpu_reset, 0); 4176 up_write(&adev->reset_sem); 4177 } 4178 4179 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 4180 { 4181 struct pci_dev *p = NULL; 4182 4183 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4184 adev->pdev->bus->number, 1); 4185 if (p) { 4186 pm_runtime_enable(&(p->dev)); 4187 pm_runtime_resume(&(p->dev)); 4188 } 4189 } 4190 4191 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 4192 { 4193 enum amd_reset_method reset_method; 4194 struct pci_dev *p = NULL; 4195 u64 expires; 4196 4197 /* 4198 * For now, only BACO and mode1 reset are confirmed 4199 * to suffer the audio issue without proper suspended. 4200 */ 4201 reset_method = amdgpu_asic_reset_method(adev); 4202 if ((reset_method != AMD_RESET_METHOD_BACO) && 4203 (reset_method != AMD_RESET_METHOD_MODE1)) 4204 return -EINVAL; 4205 4206 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4207 adev->pdev->bus->number, 1); 4208 if (!p) 4209 return -ENODEV; 4210 4211 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 4212 if (!expires) 4213 /* 4214 * If we cannot get the audio device autosuspend delay, 4215 * a fixed 4S interval will be used. Considering 3S is 4216 * the audio controller default autosuspend delay setting. 4217 * 4S used here is guaranteed to cover that. 4218 */ 4219 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 4220 4221 while (!pm_runtime_status_suspended(&(p->dev))) { 4222 if (!pm_runtime_suspend(&(p->dev))) 4223 break; 4224 4225 if (expires < ktime_get_mono_fast_ns()) { 4226 dev_warn(adev->dev, "failed to suspend display audio\n"); 4227 /* TODO: abort the succeeding gpu reset? */ 4228 return -ETIMEDOUT; 4229 } 4230 } 4231 4232 pm_runtime_disable(&(p->dev)); 4233 4234 return 0; 4235 } 4236 4237 /** 4238 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 4239 * 4240 * @adev: amdgpu device pointer 4241 * @job: which job trigger hang 4242 * 4243 * Attempt to reset the GPU if it has hung (all asics). 
 * Attempt a soft reset or a full reset and reinitialize the ASIC.
 * Returns 0 for success or an error on failure.
 */

int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
			      struct amdgpu_job *job)
{
	struct list_head device_list, *device_list_handle = NULL;
	bool need_full_reset = false;
	bool job_signaled = false;
	struct amdgpu_hive_info *hive = NULL;
	struct amdgpu_device *tmp_adev = NULL;
	int i, r = 0;
	bool need_emergency_restart = false;
	bool audio_suspended = false;

	/*
	 * Special case: RAS triggered and full reset isn't supported
	 */
	need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);

	/*
	 * Flush RAM to disk so that after reboot
	 * the user can read the log and see why the system rebooted.
	 */
	if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
		DRM_WARN("Emergency reboot.");

		ksys_sync_helper();
		emergency_restart();
	}

	dev_info(adev->dev, "GPU %s begin!\n",
		 need_emergency_restart ? "jobs stop" : "reset");

	/*
	 * Here we trylock to avoid a chain of resets executing, triggered
	 * either by jobs on different adevs in the XGMI hive or by jobs on
	 * different schedulers for the same device, while this TO handler
	 * is running. We always reset all schedulers for a device and all
	 * devices in the XGMI hive, so that should take care of them too.
	 */
	hive = amdgpu_get_xgmi_hive(adev, false);
	if (hive) {
		if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
			DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
				 job ? job->base.id : -1, hive->hive_id);
			return 0;
		}
		mutex_lock(&hive->hive_lock);
	}

	/*
	 * Build the list of devices to reset.
	 * In case we are in XGMI hive mode, re-sort the device list
	 * to put adev in the 1st position.
	 */
	INIT_LIST_HEAD(&device_list);
	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (!hive)
			return -ENODEV;
		if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
			list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
		device_list_handle = &hive->device_list;
	} else {
		list_add_tail(&adev->gmc.xgmi.head, &device_list);
		device_list_handle = &device_list;
	}

	/* block all schedulers and reset given job's ring */
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		if (!amdgpu_device_lock_adev(tmp_adev)) {
			DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
				 job ? job->base.id : -1);
			r = 0;
			goto skip_recovery;
		}

		/*
		 * Try to put the audio codec into the suspend state
		 * before the gpu reset is started.
		 *
		 * Because the power domain of the graphics device is
		 * shared with the AZ power domain, without this we may
		 * change the audio hardware from behind the audio
		 * driver's back and trigger audio codec errors.
4331 */ 4332 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 4333 audio_suspended = true; 4334 4335 amdgpu_ras_set_error_query_ready(tmp_adev, false); 4336 4337 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 4338 4339 if (!amdgpu_sriov_vf(tmp_adev)) 4340 amdgpu_amdkfd_pre_reset(tmp_adev); 4341 4342 /* 4343 * Mark these ASICs to be reseted as untracked first 4344 * And add them back after reset completed 4345 */ 4346 amdgpu_unregister_gpu_instance(tmp_adev); 4347 4348 amdgpu_fbdev_set_suspend(tmp_adev, 1); 4349 4350 /* disable ras on ALL IPs */ 4351 if (!need_emergency_restart && 4352 amdgpu_device_ip_need_full_reset(tmp_adev)) 4353 amdgpu_ras_suspend(tmp_adev); 4354 4355 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4356 struct amdgpu_ring *ring = tmp_adev->rings[i]; 4357 4358 if (!ring || !ring->sched.thread) 4359 continue; 4360 4361 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 4362 4363 if (need_emergency_restart) 4364 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 4365 } 4366 } 4367 4368 if (need_emergency_restart) 4369 goto skip_sched_resume; 4370 4371 /* 4372 * Must check guilty signal here since after this point all old 4373 * HW fences are force signaled. 4374 * 4375 * job->base holds a reference to parent fence 4376 */ 4377 if (job && job->base.s_fence->parent && 4378 dma_fence_is_signaled(job->base.s_fence->parent)) { 4379 job_signaled = true; 4380 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 4381 goto skip_hw_reset; 4382 } 4383 4384 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 4385 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4386 r = amdgpu_device_pre_asic_reset(tmp_adev, 4387 NULL, 4388 &need_full_reset); 4389 /*TODO Should we stop ?*/ 4390 if (r) { 4391 DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ", 4392 r, tmp_adev->ddev->unique); 4393 tmp_adev->asic_reset_res = r; 4394 } 4395 } 4396 4397 /* Actual ASIC resets if needed.*/ 4398 /* TODO Implement XGMI hive reset logic for SRIOV */ 4399 if (amdgpu_sriov_vf(adev)) { 4400 r = amdgpu_device_reset_sriov(adev, job ? false : true); 4401 if (r) 4402 adev->asic_reset_res = r; 4403 } else { 4404 r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset); 4405 if (r && r == -EAGAIN) 4406 goto retry; 4407 } 4408 4409 skip_hw_reset: 4410 4411 /* Post ASIC reset for all devs .*/ 4412 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4413 4414 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4415 struct amdgpu_ring *ring = tmp_adev->rings[i]; 4416 4417 if (!ring || !ring->sched.thread) 4418 continue; 4419 4420 /* No point to resubmit jobs if we didn't HW reset*/ 4421 if (!tmp_adev->asic_reset_res && !job_signaled) 4422 drm_sched_resubmit_jobs(&ring->sched); 4423 4424 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); 4425 } 4426 4427 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) { 4428 drm_helper_resume_force_mode(tmp_adev->ddev); 4429 } 4430 4431 tmp_adev->asic_reset_res = 0; 4432 4433 if (r) { 4434 /* bad news, how to tell it to userspace ? 
 */
			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
		}
	}

skip_sched_resume:
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		/* unlock kfd: SR-IOV would do it separately */
		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);
		if (audio_suspended)
			amdgpu_device_resume_display_audio(tmp_adev);
		amdgpu_device_unlock_adev(tmp_adev);
	}

skip_recovery:
	if (hive) {
		atomic_set(&hive->in_reset, 0);
		mutex_unlock(&hive->hive_lock);
	}

	if (r)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
	return r;
}

/**
 * amdgpu_device_get_pcie_info - fetch PCIE info about the PCIE slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
 */
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
{
	struct pci_dev *pdev;
	enum pci_bus_speed speed_cap, platform_speed_cap;
	enum pcie_link_width platform_link_width;

	if (amdgpu_pcie_gen_cap)
		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;

	if (amdgpu_pcie_lane_cap)
		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;

	/* covers APUs as well */
	if (pci_is_root_bus(adev->pdev->bus)) {
		if (adev->pm.pcie_gen_mask == 0)
			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
		if (adev->pm.pcie_mlw_mask == 0)
			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
		return;
	}

	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
		return;

	pcie_bandwidth_available(adev->pdev, NULL,
				 &platform_speed_cap, &platform_link_width);

	if (adev->pm.pcie_gen_mask == 0) {
		/* asic caps */
		pdev = adev->pdev;
		speed_cap = pcie_get_speed_cap(pdev);
		if (speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
		} else {
			if (speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
		}
		/* platform caps */
		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
		} else {
			if (platform_speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4531 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 4532 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 4533 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 4534 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4535 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4536 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 4537 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 4538 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4539 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 4540 else 4541 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 4542 4543 } 4544 } 4545 if (adev->pm.pcie_mlw_mask == 0) { 4546 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 4547 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 4548 } else { 4549 switch (platform_link_width) { 4550 case PCIE_LNK_X32: 4551 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 4552 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 4553 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 4554 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4555 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4556 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4557 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4558 break; 4559 case PCIE_LNK_X16: 4560 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 4561 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 4562 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4563 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4564 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4565 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4566 break; 4567 case PCIE_LNK_X12: 4568 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 4569 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4570 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4571 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4572 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4573 break; 4574 case PCIE_LNK_X8: 4575 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4576 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4577 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4578 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4579 break; 4580 case PCIE_LNK_X4: 4581 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4582 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4583 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4584 break; 4585 case PCIE_LNK_X2: 4586 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4587 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4588 break; 4589 case PCIE_LNK_X1: 4590 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 4591 break; 4592 default: 4593 break; 4594 } 4595 } 4596 } 4597 } 4598 4599 int amdgpu_device_baco_enter(struct drm_device *dev) 4600 { 4601 struct amdgpu_device *adev = dev->dev_private; 4602 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 4603 4604 if (!amdgpu_device_supports_baco(adev->ddev)) 4605 return -ENOTSUPP; 4606 4607 if (ras && ras->supported) 4608 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 4609 4610 return amdgpu_dpm_baco_enter(adev); 4611 } 4612 4613 int amdgpu_device_baco_exit(struct drm_device *dev) 4614 { 4615 struct amdgpu_device *adev = dev->dev_private; 4616 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 4617 int ret = 0; 4618 4619 if (!amdgpu_device_supports_baco(adev->ddev)) 4620 return -ENOTSUPP; 4621 4622 ret = amdgpu_dpm_baco_exit(adev); 4623 if (ret) 4624 return ret; 4625 4626 if (ras && ras->supported) 4627 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 4628 4629 return 0; 4630 } 4631
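
/*
 * Editor's illustrative sketch (not part of the upstream driver): a minimal
 * example of how the recovery entry points above are typically driven from a
 * job-timeout path. The helper name example_handle_job_timeout() and the way
 * adev/job are obtained here are assumptions for illustration only; the real
 * hook lives in the scheduler timeout callback elsewhere in the driver.
 */
#if 0
static void example_handle_job_timeout(struct amdgpu_device *adev,
				       struct amdgpu_job *job)
{
	/* Bail out if no real hang was detected or recovery is disabled. */
	if (!amdgpu_device_should_recover_gpu(adev))
		return;

	/*
	 * Full flow: suspend display audio, block the schedulers, do a soft
	 * or full reset, resume the IP blocks and restart the schedulers
	 * (see amdgpu_device_gpu_recover() above).
	 */
	amdgpu_device_gpu_recover(adev, job);
}
#endif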
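
/*
 * Editor's illustrative sketch (not part of the upstream driver): the BACO
 * helpers above are intended to be used as a strict enter/exit pair, e.g.
 * around a runtime-suspend window. The function name example_baco_cycle()
 * and its caller context are assumptions for illustration only.
 */
#if 0
static int example_baco_cycle(struct drm_device *dev)
{
	int r;

	r = amdgpu_device_baco_enter(dev);	/* may return -ENOTSUPP */
	if (r)
		return r;

	/* ... device sits in BACO; doorbell interrupts stay quiesced ... */

	return amdgpu_device_baco_exit(dev);
}
#endif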