/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/pci.h>
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000

static const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"NAVI10",
	"LAST",
};

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = ddev->dev_private;
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
		amdgpu_device_get_pcie_replay_count, NULL);

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * amdgpu_device_is_px - Is the device a dGPU with HG/PX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with HG/PX power control,
 * otherwise return false.
 */
bool amdgpu_device_is_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = dev->dev_private;

	if (adev->flags & AMD_IS_PX)
		return true;
	return false;
}

/*
 * MMIO register access helper functions.
 */
/**
 * amdgpu_mm_rreg - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
			uint32_t acc_flags)
{
	uint32_t ret;

	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
		return amdgpu_virt_kiq_rreg(adev, reg);

	if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX))
		ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
	else {
		unsigned long flags;

		spin_lock_irqsave(&adev->mmio_idx_lock, flags);
		writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
		ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	}
	trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */
/**
 * amdgpu_mm_wreg8 - write to a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_mm_wreg - write to a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
		    uint32_t acc_flags)
{
	trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);

	if (adev->asic_type >= CHIP_VEGA10 && reg == 0) {
		adev->last_mm_index = v;
	}

	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
		return amdgpu_virt_kiq_wreg(adev, reg, v);

	if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX))
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	else {
		unsigned long flags;

		spin_lock_irqsave(&adev->mmio_idx_lock, flags);
		writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
		writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	}

	if (adev->asic_type >= CHIP_VEGA10 && reg == 1 && adev->last_mm_index == 0x5702C) {
		udelay(500);
	}
}

/**
 * amdgpu_io_rreg - read an IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 *
 * Returns the 32 bit value from the offset specified.
 */
u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
{
	if ((reg * 4) < adev->rio_mem_size)
		return ioread32(adev->rio_mem + (reg * 4));
	else {
		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
		return ioread32(adev->rio_mem + (mmMM_DATA * 4));
	}
}

/**
 * amdgpu_io_wreg - write to an IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
{
	if (adev->asic_type >= CHIP_VEGA10 && reg == 0) {
		adev->last_mm_index = v;
	}

	if ((reg * 4) < adev->rio_mem_size)
		iowrite32(v, adev->rio_mem + (reg * 4));
	else {
		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
		iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
	}

	if (adev->asic_type >= CHIP_VEGA10 && reg == 1 && adev->last_mm_index == 0x5702C) {
		udelay(500);
	}
}

/**
 * amdgpu_mm_rdoorbell - read a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (CIK).
 */
u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
{
	if (index < adev->doorbell.num_doorbells) {
		return readl(adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell - write a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (CIK).
 */
void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
{
	if (index < adev->doorbell.num_doorbells) {
		writel(v, adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
{
	if (index < adev->doorbell.num_doorbells) {
		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
{
	if (index < adev->doorbell.num_doorbells) {
		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 *
 * Dummy register read function.  Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function.  Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function.  Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function.  Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
				       &adev->vram_scratch.robj,
				       &adev->vram_scratch.gpu_addr,
				       (void **)&adev->vram_scratch.ptr);
}

/**
 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/*
 * GPU doorbell aperture helper functions.
 */
/**
 * amdgpu_device_doorbell_init - Init doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Init doorbell driver information (CIK).
 * Returns 0 on success, error on failure.
 */
static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
{

	/* No doorbell on SI hardware generation */
	if (adev->asic_type < CHIP_BONAIRE) {
		adev->doorbell.base = 0;
		adev->doorbell.size = 0;
		adev->doorbell.num_doorbells = 0;
		adev->doorbell.ptr = NULL;
		return 0;
	}

	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
		return -EINVAL;

	amdgpu_asic_init_doorbell_index(adev);

	/* doorbell bar mapping */
	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
	adev->doorbell.size = pci_resource_len(adev->pdev, 2);

	adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
					     adev->doorbell_index.max_assignment + 1);
	if (adev->doorbell.num_doorbells == 0)
		return -EINVAL;

	/* For Vega, reserve and map two pages on doorbell BAR since SDMA
	 * paging queue doorbell use the second page. The
	 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
	 * doorbells are in the first page. So with paging queue enabled,
	 * the max num_doorbells should be increased by one page (0x400 in dwords).
	 */
	if (adev->asic_type >= CHIP_VEGA10)
		adev->doorbell.num_doorbells += 0x400;

	adev->doorbell.ptr = ioremap(adev->doorbell.base,
				     adev->doorbell.num_doorbells *
				     sizeof(u32));
	if (adev->doorbell.ptr == NULL)
		return -ENOMEM;

	return 0;
}

/**
 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down doorbell driver information (CIK).
 */
static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
{
	iounmap(adev->doorbell.ptr);
	adev->doorbell.ptr = NULL;
}



/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics).
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is still not accessible after the resize we
 * abort driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
	u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
	struct pci_bus *root;
	struct resource *res;
	unsigned i;
	u16 cmd;
	int r;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_device_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_device_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if need or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old smc fw still need driver do vPost otherwise gpu hang, while
		 * those smc fw versions above 22.15 don't have this flaw, so we force
		 * vpost executed for smc version below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;
			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @cookie: amdgpu_device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
{
	struct amdgpu_device *adev = cookie;

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in the page table versus the
 * page directory; a page is 4KB so we have 12 bits of offset, a minimum of
 * 9 bits in the page table and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8) ? true : false;
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	int ret = 0;

	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	ret = amdgpu_device_get_job_timeout_settings(adev);
	if (ret) {
		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
		return ret;
	}

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	return ret;
}

/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes the
 * asic before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	if (amdgpu_device_is_px(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("amdgpu: switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		amdgpu_device_resume(dev, true, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
		drm_kms_helper_poll_enable(dev);
	} else {
		pr_info("amdgpu: switched off\n");
		drm_kms_helper_poll_disable(dev);
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true, true);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return dev->open_count == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u32 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if equal or greater
 * return 1 if smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		struct drm_device *ddev = adev->ddev;
		const char *pci_address_name = pci_name(ddev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	char fw_name[30];
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	switch (adev->asic_type) {
	case CHIP_TOPAZ:
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_VERDE:
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_OLAND:
	case CHIP_HAINAN:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_VEGA20:
	default:
		return 0;
	case CHIP_VEGA10:
		chip_name = "vega10";
		break;
	case CHIP_VEGA12:
		chip_name = "vega12";
		break;
	case CHIP_RAVEN:
		if (adev->rev_id >= 8)
			chip_name = "raven2";
		else if (adev->pdev->device == 0x15d8)
			chip_name = "picasso";
		else
			chip_name = "raven";
		break;
	case CHIP_NAVI10:
		chip_name = "navi10";
		break;
	}

	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
	err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
	if (err) {
		dev_err(adev->dev,
			"Failed to load gpu_info firmware \"%s\"\n",
			fw_name);
		goto out;
	}
	err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
	if (err) {
		dev_err(adev->dev,
			"Failed to validate gpu_info firmware \"%s\"\n",
			fw_name);
		goto out;
	}

	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);

	switch (hdr->version_major) {
	case 1:
	{
		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
								le32_to_cpu(hdr->header.ucode_array_offset_bytes));

		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
		adev->gfx.config.max_texture_channel_caches =
			le32_to_cpu(gpu_info_fw->gc_num_tccs);
		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
		adev->gfx.config.double_offchip_lds_buf =
			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
		adev->gfx.cu_info.max_waves_per_simd =
			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
		adev->gfx.cu_info.max_scratch_slots_per_cu =
			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
		if (hdr->version_minor >= 1) {
			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->gfx.config.num_sc_per_sh =
				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
			adev->gfx.config.num_packer_per_sc =
				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
		}
#ifdef CONFIG_DRM_AMD_DC_DCN2_0
		if (hdr->version_minor == 2) {
			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
		}
#endif
		break;
	}
	default:
		dev_err(adev->dev,
			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
		err = -EINVAL;
		goto out;
	}
out:
	return err;
}

/**
 * amdgpu_device_ip_early_init - run early init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Early initialization pass for hardware IPs. The hardware IPs that make
 * up each asic are discovered and each IP's early_init callback is run. This
 * is the first stage in initializing the asic.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
{
	int i, r;

	amdgpu_device_enable_virtual_display(adev);

	switch (adev->asic_type) {
	case CHIP_TOPAZ:
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		if (adev->asic_type == CHIP_CARRIZO || adev->asic_type == CHIP_STONEY)
			adev->family = AMDGPU_FAMILY_CZ;
		else
			adev->family = AMDGPU_FAMILY_VI;

		r = vi_set_ip_blocks(adev);
		if (r)
			return r;
		break;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_VERDE:
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_OLAND:
	case CHIP_HAINAN:
		adev->family = AMDGPU_FAMILY_SI;
		r = si_set_ip_blocks(adev);
		if (r)
			return r;
		break;
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
		if ((adev->asic_type == CHIP_BONAIRE) || (adev->asic_type == CHIP_HAWAII))
			adev->family = AMDGPU_FAMILY_CI;
		else
			adev->family = AMDGPU_FAMILY_KV;

		r = cik_set_ip_blocks(adev);
		if (r)
			return r;
		break;
#endif
	case CHIP_VEGA10:
	case CHIP_VEGA12:
	case CHIP_VEGA20:
	case CHIP_RAVEN:
		if (adev->asic_type == CHIP_RAVEN)
			adev->family = AMDGPU_FAMILY_RV;
		else
			adev->family = AMDGPU_FAMILY_AI;

		r = soc15_set_ip_blocks(adev);
		if (r)
			return r;
		break;
	case CHIP_NAVI10:
		adev->family = AMDGPU_FAMILY_NV;

		r = nv_set_ip_blocks(adev);
		if (r)
			return r;
		break;
	default:
		/* FIXME: not supported yet */
		return -EINVAL;
	}

	r = amdgpu_device_parse_gpu_info_fw(adev);
	if (r)
		return r;

	amdgpu_amdkfd_device_probe(adev);

	if (amdgpu_sriov_vf(adev)) {
		r = amdgpu_virt_request_full_gpu(adev, true);
		if (r)
			return -EAGAIN;

		/* query the reg access mode at the very beginning */
		amdgpu_virt_init_reg_access_mode(adev);
	}

	adev->pm.pp_feature = amdgpu_pp_feature_mask;
	if (amdgpu_sriov_vf(adev))
		adev->pm.pp_feature &= ~PP_GFXOFF_MASK;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
			DRM_ERROR("disabled ip block: %d <%s>\n",
				  i, adev->ip_blocks[i].version->funcs->name);
			adev->ip_blocks[i].status.valid = false;
		} else {
			if (adev->ip_blocks[i].version->funcs->early_init) {
				r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
				if (r == -ENOENT) {
					adev->ip_blocks[i].status.valid = false;
				} else if (r) {
					DRM_ERROR("early_init of IP block <%s> failed %d\n",
						  adev->ip_blocks[i].version->funcs->name, r);
					return r;
				} else {
					adev->ip_blocks[i].status.valid = true;
				}
			} else {
				adev->ip_blocks[i].status.valid = true;
			}
		}
		/* get the vbios after the asic_funcs are set up */
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
			/* Read BIOS */
			if (!amdgpu_get_bios(adev))
				return -EINVAL;

			r = amdgpu_atombios_init(adev);
			if (r) {
				dev_err(adev->dev, "amdgpu_atombios_init failed\n");
				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
				return r;
			}
		}
	}

	adev->cg_flags &= amdgpu_cg_mask;
	adev->pg_flags &= amdgpu_pg_mask;

	return 0;
}

static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.sw)
			continue;
		if (adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
		    (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
			r = adev->ip_blocks[i].version->funcs->hw_init(adev);
			if (r) {
				DRM_ERROR("hw_init of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
			adev->ip_blocks[i].status.hw = true;
		}
	}

	return 0;
}

static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.sw)
			continue;
		if (adev->ip_blocks[i].status.hw)
			continue;
		r = adev->ip_blocks[i].version->funcs->hw_init(adev);
		if (r) {
			DRM_ERROR("hw_init of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
			return r;
		}
		adev->ip_blocks[i].status.hw = true;
	}

	return 0;
}

static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
{
	int r = 0;
	int i;
	uint32_t smu_version;

	if (adev->asic_type >= CHIP_VEGA10) {
		for (i = 0; i < adev->num_ip_blocks; i++) {
			if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
				if (adev->in_gpu_reset || adev->in_suspend) {
					if (amdgpu_sriov_vf(adev) && adev->in_gpu_reset)
						break; /* sriov gpu reset, psp need to do hw_init before IH because of hw limit */
					r = adev->ip_blocks[i].version->funcs->resume(adev);
					if (r) {
						DRM_ERROR("resume of IP block <%s> failed %d\n",
							  adev->ip_blocks[i].version->funcs->name, r);
						return r;
					}
				} else {
					r = adev->ip_blocks[i].version->funcs->hw_init(adev);
					if (r) {
						DRM_ERROR("hw_init of IP block <%s> failed %d\n",
							  adev->ip_blocks[i].version->funcs->name, r);
						return r;
					}
				}
				adev->ip_blocks[i].status.hw = true;
			}
		}
	}
	r = amdgpu_pm_load_smu_firmware(adev, &smu_version);

	return r;
}

/**
 * amdgpu_device_ip_init - run init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main initialization pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
 * are run.  sw_init initializes the software state associated with each IP
 * and hw_init initializes the hardware associated with each IP.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_init(struct amdgpu_device *adev)
{
	int i, r;

	r = amdgpu_ras_init(adev);
	if (r)
		return r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
		if (r) {
			DRM_ERROR("sw_init of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
			goto init_failed;
		}
		adev->ip_blocks[i].status.sw = true;

		/* need to do gmc hw init early so we can allocate gpu mem */
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
			r = amdgpu_device_vram_scratch_init(adev);
			if (r) {
				DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
				goto init_failed;
			}
			r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
			if (r) {
				DRM_ERROR("hw_init %d failed %d\n", i, r);
				goto init_failed;
			}
			r = amdgpu_device_wb_init(adev);
			if (r) {
				DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
				goto init_failed;
			}
			adev->ip_blocks[i].status.hw = true;

			/* right after GMC hw init, we create CSA */
			if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
				r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
							       AMDGPU_GEM_DOMAIN_VRAM,
							       AMDGPU_CSA_SIZE);
				if (r) {
					DRM_ERROR("allocate CSA failed %d\n", r);
					goto init_failed;
				}
			}
		}
	}

	r = amdgpu_ib_pool_init(adev);
	if (r) {
		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
		goto init_failed;
	}

	r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete */
	if (r)
		goto init_failed;

	r = amdgpu_device_ip_hw_init_phase1(adev);
	if (r)
		goto init_failed;

	r = amdgpu_device_fw_loading(adev);
	if (r)
		goto init_failed;

	r = amdgpu_device_ip_hw_init_phase2(adev);
	if (r)
		goto init_failed;

	if (adev->gmc.xgmi.num_physical_nodes > 1)
		amdgpu_xgmi_add_device(adev);
	amdgpu_amdkfd_device_init(adev);

init_failed:
	if (amdgpu_sriov_vf(adev)) {
		if (!r)
			amdgpu_virt_init_data_exchange(adev);
		amdgpu_virt_release_full_gpu(adev, true);
	}

	return r;
}

/**
 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
 *
 * @adev: amdgpu_device pointer
 *
 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
 * this function before a GPU reset.  If the value is retained after a
 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
 */
static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
{
	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
}

/**
 * amdgpu_device_check_vram_lost - check if vram is valid
 *
 * @adev: amdgpu_device pointer
 *
 * Checks the reset magic value written to the gart pointer in VRAM.
 * The driver calls this after a GPU reset to see if the contents of
 * VRAM are lost or not.
 */
static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
{
	return !!memcmp(adev->gart.ptr, adev->reset_magic,
			AMDGPU_RESET_MAGIC_NUM);
}

/**
 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
 *
 * @adev: amdgpu_device pointer
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * set_clockgating_state callbacks are run.  During the late init pass this
 * enables clockgating for the hardware IPs; during fini or suspend it
 * disables clockgating.
 * Returns 0 on success, negative error code on failure.
 */

static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
				      enum amd_clockgating_state state)
{
	int i, j, r;

	if (amdgpu_emu_mode == 1)
		return 0;

	for (j = 0; j < adev->num_ip_blocks; j++) {
		i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
		if (!adev->ip_blocks[i].status.late_initialized)
			continue;
		/* skip CG for VCE/UVD, it's handled specially */
		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
		    adev->ip_blocks[i].version->funcs->set_clockgating_state) {
			/* enable clockgating to save power */
			r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
										     state);
			if (r) {
				DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
	}

	return 0;
}

static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
{
	int i, j, r;

	if (amdgpu_emu_mode == 1)
		return 0;

	for (j = 0; j < adev->num_ip_blocks; j++) {
		i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
		if (!adev->ip_blocks[i].status.late_initialized)
			continue;
		/* skip PG for VCE/UVD, it's handled specially */
		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
			/* enable powergating to save power */
			r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
										     state);
			if (r) {
				DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
	}
	return 0;
}

static int amdgpu_device_enable_mgpu_fan_boost(void)
{
	struct amdgpu_gpu_instance *gpu_ins;
	struct amdgpu_device *adev;
	int i, ret = 0;

	mutex_lock(&mgpu_info.mutex);

	/*
	 * MGPU fan boost feature should be enabled
	 * only when there are two or more dGPUs in
	 * the system
	 */
	if (mgpu_info.num_dgpu < 2)
		goto out;

	for (i = 0; i < mgpu_info.num_dgpu; i++) {
		gpu_ins = &(mgpu_info.gpu_ins[i]);
		adev = gpu_ins->adev;
		if (!(adev->flags & AMD_IS_APU) &&
		    !gpu_ins->mgpu_fan_enabled &&
		    adev->powerplay.pp_funcs &&
		    adev->powerplay.pp_funcs->enable_mgpu_fan_boost) {
			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
			if (ret)
				break;

			gpu_ins->mgpu_fan_enabled = 1;
		}
	}

out:
	mutex_unlock(&mgpu_info.mutex);

	return ret;
}

/**
 * amdgpu_device_ip_late_init - run late init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Late initialization pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the late_init callbacks are run.
 * late_init covers any special initialization that an IP requires
 * after all of the IPs have been initialized or something that needs to happen
 * late in the init process.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
{
	int i = 0, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_init) {
			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
			if (r) {
				DRM_ERROR("late_init of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
		adev->ip_blocks[i].status.late_initialized = true;
	}

	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

	amdgpu_device_fill_reset_magic(adev);

	r = amdgpu_device_enable_mgpu_fan_boost();
	if (r)
		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

	/* set to low pstate by default */
	amdgpu_xgmi_set_pstate(adev, 0);

	return 0;
}

/**
 * amdgpu_device_ip_fini - run fini for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main teardown pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
 * are run.  hw_fini tears down the hardware associated with each IP
hw_fini tears down the hardware associated with each IP 1986 * and sw_fini tears down any software state associated with each IP. 1987 * Returns 0 on success, negative error code on failure. 1988 */ 1989 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 1990 { 1991 int i, r; 1992 1993 amdgpu_ras_pre_fini(adev); 1994 1995 if (adev->gmc.xgmi.num_physical_nodes > 1) 1996 amdgpu_xgmi_remove_device(adev); 1997 1998 amdgpu_amdkfd_device_fini(adev); 1999 2000 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2001 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2002 2003 /* need to disable SMC first */ 2004 for (i = 0; i < adev->num_ip_blocks; i++) { 2005 if (!adev->ip_blocks[i].status.hw) 2006 continue; 2007 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2008 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2009 /* XXX handle errors */ 2010 if (r) { 2011 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2012 adev->ip_blocks[i].version->funcs->name, r); 2013 } 2014 adev->ip_blocks[i].status.hw = false; 2015 break; 2016 } 2017 } 2018 2019 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2020 if (!adev->ip_blocks[i].status.hw) 2021 continue; 2022 2023 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2024 /* XXX handle errors */ 2025 if (r) { 2026 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2027 adev->ip_blocks[i].version->funcs->name, r); 2028 } 2029 2030 adev->ip_blocks[i].status.hw = false; 2031 } 2032 2033 2034 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2035 if (!adev->ip_blocks[i].status.sw) 2036 continue; 2037 2038 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2039 amdgpu_ucode_free_bo(adev); 2040 amdgpu_free_static_csa(&adev->virt.csa_obj); 2041 amdgpu_device_wb_fini(adev); 2042 amdgpu_device_vram_scratch_fini(adev); 2043 amdgpu_ib_pool_fini(adev); 2044 } 2045 2046 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2047 /* XXX handle errors */ 2048 if (r) { 2049 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2050 adev->ip_blocks[i].version->funcs->name, r); 2051 } 2052 adev->ip_blocks[i].status.sw = false; 2053 adev->ip_blocks[i].status.valid = false; 2054 } 2055 2056 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2057 if (!adev->ip_blocks[i].status.late_initialized) 2058 continue; 2059 if (adev->ip_blocks[i].version->funcs->late_fini) 2060 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2061 adev->ip_blocks[i].status.late_initialized = false; 2062 } 2063 2064 amdgpu_ras_fini(adev); 2065 2066 if (amdgpu_sriov_vf(adev)) 2067 if (amdgpu_virt_release_full_gpu(adev, false)) 2068 DRM_ERROR("failed to release exclusive mode on fini\n"); 2069 2070 return 0; 2071 } 2072 2073 /** 2074 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2075 * 2076 * @work: work_struct. 
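 *
 * This handler runs the IB ring tests that amdgpu_device_init() and
 * amdgpu_device_resume() defer via adev->delayed_init_work so they happen
 * only after the hardware has had time to settle. A sketch of how the work
 * is scheduled and flushed elsewhere in this file:
 *
 *   queue_delayed_work(system_wq, &adev->delayed_init_work,
 *                      msecs_to_jiffies(AMDGPU_RESUME_MS));
 *   ...
 *   flush_delayed_work(&adev->delayed_init_work);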
2077 */ 2078 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2079 { 2080 struct amdgpu_device *adev = 2081 container_of(work, struct amdgpu_device, delayed_init_work.work); 2082 int r; 2083 2084 r = amdgpu_ib_ring_tests(adev); 2085 if (r) 2086 DRM_ERROR("ib ring test failed (%d).\n", r); 2087 } 2088 2089 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2090 { 2091 struct amdgpu_device *adev = 2092 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2093 2094 mutex_lock(&adev->gfx.gfx_off_mutex); 2095 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { 2096 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2097 adev->gfx.gfx_off_state = true; 2098 } 2099 mutex_unlock(&adev->gfx.gfx_off_mutex); 2100 } 2101 2102 /** 2103 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2104 * 2105 * @adev: amdgpu_device pointer 2106 * 2107 * Main suspend function for hardware IPs. The list of all the hardware 2108 * IPs that make up the asic is walked, clockgating is disabled and the 2109 * suspend callbacks are run. suspend puts the hardware and software state 2110 * in each IP into a state suitable for suspend. 2111 * Returns 0 on success, negative error code on failure. 2112 */ 2113 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2114 { 2115 int i, r; 2116 2117 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2118 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2119 2120 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2121 if (!adev->ip_blocks[i].status.valid) 2122 continue; 2123 /* displays are handled separately */ 2124 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) { 2125 /* XXX handle errors */ 2126 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2127 /* XXX handle errors */ 2128 if (r) { 2129 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2130 adev->ip_blocks[i].version->funcs->name, r); 2131 } 2132 } 2133 } 2134 2135 return 0; 2136 } 2137 2138 /** 2139 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2140 * 2141 * @adev: amdgpu_device pointer 2142 * 2143 * Main suspend function for hardware IPs. The list of all the hardware 2144 * IPs that make up the asic is walked, clockgating is disabled and the 2145 * suspend callbacks are run. suspend puts the hardware and software state 2146 * in each IP into a state suitable for suspend. 2147 * Returns 0 on success, negative error code on failure. 2148 */ 2149 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2150 { 2151 int i, r; 2152 2153 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2154 if (!adev->ip_blocks[i].status.valid) 2155 continue; 2156 /* displays are handled in phase1 */ 2157 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2158 continue; 2159 /* XXX handle errors */ 2160 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2161 /* XXX handle errors */ 2162 if (r) { 2163 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2164 adev->ip_blocks[i].version->funcs->name, r); 2165 } 2166 } 2167 2168 return 0; 2169 } 2170 2171 /** 2172 * amdgpu_device_ip_suspend - run suspend for hardware IPs 2173 * 2174 * @adev: amdgpu_device pointer 2175 * 2176 * Main suspend function for hardware IPs. The list of all the hardware 2177 * IPs that make up the asic is walked, clockgating is disabled and the 2178 * suspend callbacks are run. 
suspend puts the hardware and software state 2179 * in each IP into a state suitable for suspend. 2180 * Returns 0 on success, negative error code on failure. 2181 */ 2182 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 2183 { 2184 int r; 2185 2186 if (amdgpu_sriov_vf(adev)) 2187 amdgpu_virt_request_full_gpu(adev, false); 2188 2189 r = amdgpu_device_ip_suspend_phase1(adev); 2190 if (r) 2191 return r; 2192 r = amdgpu_device_ip_suspend_phase2(adev); 2193 2194 if (amdgpu_sriov_vf(adev)) 2195 amdgpu_virt_release_full_gpu(adev, false); 2196 2197 return r; 2198 } 2199 2200 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 2201 { 2202 int i, r; 2203 2204 static enum amd_ip_block_type ip_order[] = { 2205 AMD_IP_BLOCK_TYPE_GMC, 2206 AMD_IP_BLOCK_TYPE_COMMON, 2207 AMD_IP_BLOCK_TYPE_PSP, 2208 AMD_IP_BLOCK_TYPE_IH, 2209 }; 2210 2211 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2212 int j; 2213 struct amdgpu_ip_block *block; 2214 2215 for (j = 0; j < adev->num_ip_blocks; j++) { 2216 block = &adev->ip_blocks[j]; 2217 2218 if (block->version->type != ip_order[i] || 2219 !block->status.valid) 2220 continue; 2221 2222 r = block->version->funcs->hw_init(adev); 2223 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2224 if (r) 2225 return r; 2226 } 2227 } 2228 2229 return 0; 2230 } 2231 2232 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 2233 { 2234 int i, r; 2235 2236 static enum amd_ip_block_type ip_order[] = { 2237 AMD_IP_BLOCK_TYPE_SMC, 2238 AMD_IP_BLOCK_TYPE_DCE, 2239 AMD_IP_BLOCK_TYPE_GFX, 2240 AMD_IP_BLOCK_TYPE_SDMA, 2241 AMD_IP_BLOCK_TYPE_UVD, 2242 AMD_IP_BLOCK_TYPE_VCE 2243 }; 2244 2245 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2246 int j; 2247 struct amdgpu_ip_block *block; 2248 2249 for (j = 0; j < adev->num_ip_blocks; j++) { 2250 block = &adev->ip_blocks[j]; 2251 2252 if (block->version->type != ip_order[i] || 2253 !block->status.valid) 2254 continue; 2255 2256 r = block->version->funcs->hw_init(adev); 2257 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2258 if (r) 2259 return r; 2260 } 2261 } 2262 2263 return 0; 2264 } 2265 2266 /** 2267 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 2268 * 2269 * @adev: amdgpu_device pointer 2270 * 2271 * First resume function for hardware IPs. The list of all the hardware 2272 * IPs that make up the asic is walked and the resume callbacks are run for 2273 * COMMON, GMC, and IH. resume puts the hardware into a functional state 2274 * after a suspend and updates the software state as necessary. This 2275 * function is also used for restoring the GPU after a GPU reset. 2276 * Returns 0 on success, negative error code on failure. 
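 *
 * A sketch of the intended calling sequence, mirroring
 * amdgpu_device_ip_resume() below (firmware is loaded between the two
 * phases):
 *
 *   r = amdgpu_device_ip_resume_phase1(adev);
 *   if (!r)
 *           r = amdgpu_device_fw_loading(adev);
 *   if (!r)
 *           r = amdgpu_device_ip_resume_phase2(adev);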
2277 */ 2278 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 2279 { 2280 int i, r; 2281 2282 for (i = 0; i < adev->num_ip_blocks; i++) { 2283 if (!adev->ip_blocks[i].status.valid) 2284 continue; 2285 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2286 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2287 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2288 r = adev->ip_blocks[i].version->funcs->resume(adev); 2289 if (r) { 2290 DRM_ERROR("resume of IP block <%s> failed %d\n", 2291 adev->ip_blocks[i].version->funcs->name, r); 2292 return r; 2293 } 2294 } 2295 } 2296 2297 return 0; 2298 } 2299 2300 /** 2301 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 2302 * 2303 * @adev: amdgpu_device pointer 2304 * 2305 * Second resume function for hardware IPs. The list of all the hardware 2306 * IPs that make up the asic is walked and the resume callbacks are run for 2307 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 2308 * functional state after a suspend and updates the software state as 2309 * necessary. This function is also used for restoring the GPU after a GPU 2310 * reset. 2311 * Returns 0 on success, negative error code on failure. 2312 */ 2313 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 2314 { 2315 int i, r; 2316 2317 for (i = 0; i < adev->num_ip_blocks; i++) { 2318 if (!adev->ip_blocks[i].status.valid) 2319 continue; 2320 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2321 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2322 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 2323 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 2324 continue; 2325 r = adev->ip_blocks[i].version->funcs->resume(adev); 2326 if (r) { 2327 DRM_ERROR("resume of IP block <%s> failed %d\n", 2328 adev->ip_blocks[i].version->funcs->name, r); 2329 return r; 2330 } 2331 } 2332 2333 return 0; 2334 } 2335 2336 /** 2337 * amdgpu_device_ip_resume - run resume for hardware IPs 2338 * 2339 * @adev: amdgpu_device pointer 2340 * 2341 * Main resume function for hardware IPs. The hardware IPs 2342 * are split into two resume functions because they are 2343 * also used in recovering from a GPU reset and some additional 2344 * steps need to be taken between them. In this case (S3/S4) they are 2345 * run sequentially. 2346 * Returns 0 on success, negative error code on failure. 2347 */ 2348 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 2349 { 2350 int r; 2351 2352 r = amdgpu_device_ip_resume_phase1(adev); 2353 if (r) 2354 return r; 2355 2356 r = amdgpu_device_fw_loading(adev); 2357 if (r) 2358 return r; 2359 2360 r = amdgpu_device_ip_resume_phase2(adev); 2361 2362 return r; 2363 } 2364 2365 /** 2366 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 2367 * 2368 * @adev: amdgpu_device pointer 2369 * 2370 * Query the VBIOS data tables to determine if the board supports SR-IOV.
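 *
 * When support is detected the AMDGPU_SRIOV_CAPS_SRIOV_VBIOS bit is set in
 * adev->virt.caps; when it is not, an AMDGIM_ERROR_VF_NO_VBIOS error is
 * recorded for the VF. A caller could test the result roughly as:
 *
 *   if (adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)
 *           /* take the SR-IOV capable vbios path */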
2371 */ 2372 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 2373 { 2374 if (amdgpu_sriov_vf(adev)) { 2375 if (adev->is_atom_fw) { 2376 if (amdgpu_atomfirmware_gpu_supports_virtualization(adev)) 2377 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 2378 } else { 2379 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 2380 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 2381 } 2382 2383 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 2384 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 2385 } 2386 } 2387 2388 /** 2389 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 2390 * 2391 * @asic_type: AMD asic type 2392 * 2393 * Check if there is DC (new modesetting infrastructure) support for an asic. 2394 * Returns true if DC has support, false if not. 2395 */ 2396 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 2397 { 2398 switch (asic_type) { 2399 #if defined(CONFIG_DRM_AMD_DC) 2400 case CHIP_BONAIRE: 2401 case CHIP_KAVERI: 2402 case CHIP_KABINI: 2403 case CHIP_MULLINS: 2404 /* 2405 * We have systems in the wild with these ASICs that require 2406 * LVDS and VGA support which is not supported with DC. 2407 * 2408 * Fallback to the non-DC driver here by default so as not to 2409 * cause regressions. 2410 */ 2411 return amdgpu_dc > 0; 2412 case CHIP_HAWAII: 2413 case CHIP_CARRIZO: 2414 case CHIP_STONEY: 2415 case CHIP_POLARIS10: 2416 case CHIP_POLARIS11: 2417 case CHIP_POLARIS12: 2418 case CHIP_VEGAM: 2419 case CHIP_TONGA: 2420 case CHIP_FIJI: 2421 case CHIP_VEGA10: 2422 case CHIP_VEGA12: 2423 case CHIP_VEGA20: 2424 #if defined(CONFIG_DRM_AMD_DC_DCN1_0) 2425 case CHIP_RAVEN: 2426 #endif 2427 #if defined(CONFIG_DRM_AMD_DC_DCN2_0) 2428 case CHIP_NAVI10: 2429 #endif 2430 return amdgpu_dc != 0; 2431 #endif 2432 default: 2433 return false; 2434 } 2435 } 2436 2437 /** 2438 * amdgpu_device_has_dc_support - check if dc is supported 2439 * 2440 * @adev: amdgpu_device pointer 2441 * 2442 * Returns true for supported, false for not supported 2443 */ 2444 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 2445 { 2446 if (amdgpu_sriov_vf(adev)) 2447 return false; 2448 2449 return amdgpu_device_asic_has_dc_support(adev->asic_type); 2450 } 2451 2452 2453 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 2454 { 2455 struct amdgpu_device *adev = 2456 container_of(__work, struct amdgpu_device, xgmi_reset_work); 2457 2458 adev->asic_reset_res = amdgpu_asic_reset(adev); 2459 if (adev->asic_reset_res) 2460 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 2461 adev->asic_reset_res, adev->ddev->unique); 2462 } 2463 2464 2465 /** 2466 * amdgpu_device_init - initialize the driver 2467 * 2468 * @adev: amdgpu_device pointer 2469 * @ddev: drm dev pointer 2470 * @pdev: pci dev pointer 2471 * @flags: driver flags 2472 * 2473 * Initializes the driver info and hw (all asics). 2474 * Returns 0 for success or an error on failure. 2475 * Called at driver startup.
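 *
 * A hedged sketch of the expected usage from the probe/load path (the exact
 * caller lives outside this file):
 *
 *   adev = kzalloc(sizeof(*adev), GFP_KERNEL);
 *   r = amdgpu_device_init(adev, ddev, pdev, flags);
 *   if (r)
 *           /* bail out and free adev */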
2476 */ 2477 int amdgpu_device_init(struct amdgpu_device *adev, 2478 struct drm_device *ddev, 2479 struct pci_dev *pdev, 2480 uint32_t flags) 2481 { 2482 int r, i; 2483 bool runtime = false; 2484 u32 max_MBps; 2485 2486 adev->shutdown = false; 2487 adev->dev = &pdev->dev; 2488 adev->ddev = ddev; 2489 adev->pdev = pdev; 2490 adev->flags = flags; 2491 adev->asic_type = flags & AMD_ASIC_MASK; 2492 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 2493 if (amdgpu_emu_mode == 1) 2494 adev->usec_timeout *= 2; 2495 adev->gmc.gart_size = 512 * 1024 * 1024; 2496 adev->accel_working = false; 2497 adev->num_rings = 0; 2498 adev->mman.buffer_funcs = NULL; 2499 adev->mman.buffer_funcs_ring = NULL; 2500 adev->vm_manager.vm_pte_funcs = NULL; 2501 adev->vm_manager.vm_pte_num_rqs = 0; 2502 adev->gmc.gmc_funcs = NULL; 2503 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 2504 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 2505 2506 adev->smc_rreg = &amdgpu_invalid_rreg; 2507 adev->smc_wreg = &amdgpu_invalid_wreg; 2508 adev->pcie_rreg = &amdgpu_invalid_rreg; 2509 adev->pcie_wreg = &amdgpu_invalid_wreg; 2510 adev->pciep_rreg = &amdgpu_invalid_rreg; 2511 adev->pciep_wreg = &amdgpu_invalid_wreg; 2512 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 2513 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 2514 adev->didt_rreg = &amdgpu_invalid_rreg; 2515 adev->didt_wreg = &amdgpu_invalid_wreg; 2516 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 2517 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 2518 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 2519 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 2520 2521 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 2522 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 2523 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 2524 2525 /* mutex initialization are all done here so we 2526 * can recall function without having locking issues */ 2527 atomic_set(&adev->irq.ih.lock, 0); 2528 mutex_init(&adev->firmware.mutex); 2529 mutex_init(&adev->pm.mutex); 2530 mutex_init(&adev->gfx.gpu_clock_mutex); 2531 mutex_init(&adev->srbm_mutex); 2532 mutex_init(&adev->gfx.pipe_reserve_mutex); 2533 mutex_init(&adev->gfx.gfx_off_mutex); 2534 mutex_init(&adev->grbm_idx_mutex); 2535 mutex_init(&adev->mn_lock); 2536 mutex_init(&adev->virt.vf_errors.lock); 2537 hash_init(adev->mn_hash); 2538 mutex_init(&adev->lock_reset); 2539 mutex_init(&adev->virt.dpm_mutex); 2540 mutex_init(&adev->psp.mutex); 2541 2542 r = amdgpu_device_check_arguments(adev); 2543 if (r) 2544 return r; 2545 2546 spin_lock_init(&adev->mmio_idx_lock); 2547 spin_lock_init(&adev->smc_idx_lock); 2548 spin_lock_init(&adev->pcie_idx_lock); 2549 spin_lock_init(&adev->uvd_ctx_idx_lock); 2550 spin_lock_init(&adev->didt_idx_lock); 2551 spin_lock_init(&adev->gc_cac_idx_lock); 2552 spin_lock_init(&adev->se_cac_idx_lock); 2553 spin_lock_init(&adev->audio_endpt_idx_lock); 2554 spin_lock_init(&adev->mm_stats.lock); 2555 2556 INIT_LIST_HEAD(&adev->shadow_list); 2557 mutex_init(&adev->shadow_list_lock); 2558 2559 INIT_LIST_HEAD(&adev->ring_lru_list); 2560 spin_lock_init(&adev->ring_lru_list_lock); 2561 2562 INIT_DELAYED_WORK(&adev->delayed_init_work, 2563 amdgpu_device_delayed_init_work_handler); 2564 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 2565 amdgpu_device_delay_enable_gfx_off); 2566 2567 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 2568 2569 adev->gfx.gfx_off_req_count = 1; 2570 adev->pm.ac_power = 
power_supply_is_system_supplied() > 0 ? true : false; 2571 2572 /* Registers mapping */ 2573 /* TODO: block userspace mapping of io register */ 2574 if (adev->asic_type >= CHIP_BONAIRE) { 2575 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 2576 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 2577 } else { 2578 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 2579 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 2580 } 2581 2582 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 2583 if (adev->rmmio == NULL) { 2584 return -ENOMEM; 2585 } 2586 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 2587 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 2588 2589 /* io port mapping */ 2590 for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { 2591 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) { 2592 adev->rio_mem_size = pci_resource_len(adev->pdev, i); 2593 adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size); 2594 break; 2595 } 2596 } 2597 if (adev->rio_mem == NULL) 2598 DRM_INFO("PCI I/O BAR is not found.\n"); 2599 2600 /* enable PCIE atomic ops */ 2601 r = pci_enable_atomic_ops_to_root(adev->pdev, 2602 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 2603 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 2604 if (r) { 2605 adev->have_atomics_support = false; 2606 DRM_INFO("PCIE atomic ops is not supported\n"); 2607 } else { 2608 adev->have_atomics_support = true; 2609 } 2610 2611 amdgpu_device_get_pcie_info(adev); 2612 2613 if (amdgpu_mcbp) 2614 DRM_INFO("MCBP is enabled\n"); 2615 2616 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) 2617 adev->enable_mes = true; 2618 2619 if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10) { 2620 r = amdgpu_discovery_init(adev); 2621 if (r) { 2622 dev_err(adev->dev, "amdgpu_discovery_init failed\n"); 2623 return r; 2624 } 2625 } 2626 2627 /* early init functions */ 2628 r = amdgpu_device_ip_early_init(adev); 2629 if (r) 2630 return r; 2631 2632 /* doorbell bar mapping and doorbell index init*/ 2633 amdgpu_device_doorbell_init(adev); 2634 2635 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 2636 /* this will fail for cards that aren't VGA class devices, just 2637 * ignore it */ 2638 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode); 2639 2640 if (amdgpu_device_is_px(ddev)) 2641 runtime = true; 2642 if (!pci_is_thunderbolt_attached(adev->pdev)) 2643 vga_switcheroo_register_client(adev->pdev, 2644 &amdgpu_switcheroo_ops, runtime); 2645 if (runtime) 2646 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 2647 2648 if (amdgpu_emu_mode == 1) { 2649 /* post the asic on emulation mode */ 2650 emu_soc_asic_init(adev); 2651 goto fence_driver_init; 2652 } 2653 2654 /* detect if we are with an SRIOV vbios */ 2655 amdgpu_device_detect_sriov_bios(adev); 2656 2657 /* check if we need to reset the asic 2658 * E.g., driver was not cleanly unloaded previously, etc. 
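 * The check below relies on amdgpu_asic_need_reset_on_init() and is skipped
 * for SR-IOV VFs, where the host owns the real ASIC reset.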
2659 */ 2660 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 2661 r = amdgpu_asic_reset(adev); 2662 if (r) { 2663 dev_err(adev->dev, "asic reset on init failed\n"); 2664 goto failed; 2665 } 2666 } 2667 2668 /* Post card if necessary */ 2669 if (amdgpu_device_need_post(adev)) { 2670 if (!adev->bios) { 2671 dev_err(adev->dev, "no vBIOS found\n"); 2672 r = -EINVAL; 2673 goto failed; 2674 } 2675 DRM_INFO("GPU posting now...\n"); 2676 r = amdgpu_atom_asic_init(adev->mode_info.atom_context); 2677 if (r) { 2678 dev_err(adev->dev, "gpu post error!\n"); 2679 goto failed; 2680 } 2681 } 2682 2683 if (adev->is_atom_fw) { 2684 /* Initialize clocks */ 2685 r = amdgpu_atomfirmware_get_clock_info(adev); 2686 if (r) { 2687 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 2688 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 2689 goto failed; 2690 } 2691 } else { 2692 /* Initialize clocks */ 2693 r = amdgpu_atombios_get_clock_info(adev); 2694 if (r) { 2695 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 2696 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 2697 goto failed; 2698 } 2699 /* init i2c buses */ 2700 if (!amdgpu_device_has_dc_support(adev)) 2701 amdgpu_atombios_i2c_init(adev); 2702 } 2703 2704 fence_driver_init: 2705 /* Fence driver */ 2706 r = amdgpu_fence_driver_init(adev); 2707 if (r) { 2708 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n"); 2709 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 2710 goto failed; 2711 } 2712 2713 /* init the mode config */ 2714 drm_mode_config_init(adev->ddev); 2715 2716 r = amdgpu_device_ip_init(adev); 2717 if (r) { 2718 /* failed in exclusive mode due to timeout */ 2719 if (amdgpu_sriov_vf(adev) && 2720 !amdgpu_sriov_runtime(adev) && 2721 amdgpu_virt_mmio_blocked(adev) && 2722 !amdgpu_virt_wait_reset(adev)) { 2723 dev_err(adev->dev, "VF exclusive mode timeout\n"); 2724 /* Don't send request since VF is inactive. */ 2725 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 2726 adev->virt.ops = NULL; 2727 r = -EAGAIN; 2728 goto failed; 2729 } 2730 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 2731 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 2732 if (amdgpu_virt_request_full_gpu(adev, false)) 2733 amdgpu_virt_release_full_gpu(adev, false); 2734 goto failed; 2735 } 2736 2737 adev->accel_working = true; 2738 2739 amdgpu_vm_check_compute_bug(adev); 2740 2741 /* Initialize the buffer migration limit. */ 2742 if (amdgpu_moverate >= 0) 2743 max_MBps = amdgpu_moverate; 2744 else 2745 max_MBps = 8; /* Allow 8 MB/s. */ 2746 /* Get a log2 for easy divisions. 
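 * Storing ilog2(max_MBps) lets later throttling code replace divisions by the
 * rate with shifts, e.g. (bytes >> log2_max_MBps) roughly equals
 * bytes / max_MBps when max_MBps is a power of two.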
*/ 2747 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 2748 2749 amdgpu_fbdev_init(adev); 2750 2751 if (amdgpu_sriov_vf(adev) && amdgim_is_hwperf(adev)) 2752 amdgpu_pm_virt_sysfs_init(adev); 2753 2754 r = amdgpu_pm_sysfs_init(adev); 2755 if (r) 2756 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 2757 2758 r = amdgpu_ucode_sysfs_init(adev); 2759 if (r) 2760 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 2761 2762 r = amdgpu_debugfs_gem_init(adev); 2763 if (r) 2764 DRM_ERROR("registering gem debugfs failed (%d).\n", r); 2765 2766 r = amdgpu_debugfs_regs_init(adev); 2767 if (r) 2768 DRM_ERROR("registering register debugfs failed (%d).\n", r); 2769 2770 r = amdgpu_debugfs_firmware_init(adev); 2771 if (r) 2772 DRM_ERROR("registering firmware debugfs failed (%d).\n", r); 2773 2774 r = amdgpu_debugfs_init(adev); 2775 if (r) 2776 DRM_ERROR("Creating debugfs files failed (%d).\n", r); 2777 2778 if ((amdgpu_testing & 1)) { 2779 if (adev->accel_working) 2780 amdgpu_test_moves(adev); 2781 else 2782 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n"); 2783 } 2784 if (amdgpu_benchmarking) { 2785 if (adev->accel_working) 2786 amdgpu_benchmark(adev, amdgpu_benchmarking); 2787 else 2788 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); 2789 } 2790 2791 /* enable clockgating, etc. after ib tests, etc. since some blocks require 2792 * explicit gating rather than handling it automatically. 2793 */ 2794 r = amdgpu_device_ip_late_init(adev); 2795 if (r) { 2796 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 2797 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 2798 goto failed; 2799 } 2800 2801 /* must succeed. */ 2802 amdgpu_ras_resume(adev); 2803 2804 queue_delayed_work(system_wq, &adev->delayed_init_work, 2805 msecs_to_jiffies(AMDGPU_RESUME_MS)); 2806 2807 r = device_create_file(adev->dev, &dev_attr_pcie_replay_count); 2808 if (r) { 2809 dev_err(adev->dev, "Could not create pcie_replay_count"); 2810 return r; 2811 } 2812 2813 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 2814 r = amdgpu_pmu_init(adev); 2815 if (r) 2816 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 2817 2818 return 0; 2819 2820 failed: 2821 amdgpu_vf_error_trans_all(adev); 2822 if (runtime) 2823 vga_switcheroo_fini_domain_pm_ops(adev->dev); 2824 2825 return r; 2826 } 2827 2828 /** 2829 * amdgpu_device_fini - tear down the driver 2830 * 2831 * @adev: amdgpu_device pointer 2832 * 2833 * Tear down the driver info (all asics). 2834 * Called at driver shutdown. 
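 *
 * Teardown roughly mirrors amdgpu_device_init() in reverse: interrupts are
 * disabled, the fence driver and IP blocks are torn down, MMIO/doorbell
 * mappings are released and the sysfs/debugfs entries created at init time
 * are removed.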
2835 */ 2836 void amdgpu_device_fini(struct amdgpu_device *adev) 2837 { 2838 int r; 2839 2840 DRM_INFO("amdgpu: finishing device.\n"); 2841 adev->shutdown = true; 2842 /* disable all interrupts */ 2843 amdgpu_irq_disable_all(adev); 2844 if (adev->mode_info.mode_config_initialized){ 2845 if (!amdgpu_device_has_dc_support(adev)) 2846 drm_helper_force_disable_all(adev->ddev); 2847 else 2848 drm_atomic_helper_shutdown(adev->ddev); 2849 } 2850 amdgpu_fence_driver_fini(adev); 2851 amdgpu_pm_sysfs_fini(adev); 2852 amdgpu_fbdev_fini(adev); 2853 r = amdgpu_device_ip_fini(adev); 2854 if (adev->firmware.gpu_info_fw) { 2855 release_firmware(adev->firmware.gpu_info_fw); 2856 adev->firmware.gpu_info_fw = NULL; 2857 } 2858 adev->accel_working = false; 2859 cancel_delayed_work_sync(&adev->delayed_init_work); 2860 /* free i2c buses */ 2861 if (!amdgpu_device_has_dc_support(adev)) 2862 amdgpu_i2c_fini(adev); 2863 2864 if (amdgpu_emu_mode != 1) 2865 amdgpu_atombios_fini(adev); 2866 2867 kfree(adev->bios); 2868 adev->bios = NULL; 2869 if (!pci_is_thunderbolt_attached(adev->pdev)) 2870 vga_switcheroo_unregister_client(adev->pdev); 2871 if (adev->flags & AMD_IS_PX) 2872 vga_switcheroo_fini_domain_pm_ops(adev->dev); 2873 vga_client_register(adev->pdev, NULL, NULL, NULL); 2874 if (adev->rio_mem) 2875 pci_iounmap(adev->pdev, adev->rio_mem); 2876 adev->rio_mem = NULL; 2877 iounmap(adev->rmmio); 2878 adev->rmmio = NULL; 2879 amdgpu_device_doorbell_fini(adev); 2880 if (amdgpu_sriov_vf(adev) && amdgim_is_hwperf(adev)) 2881 amdgpu_pm_virt_sysfs_fini(adev); 2882 2883 amdgpu_debugfs_regs_cleanup(adev); 2884 device_remove_file(adev->dev, &dev_attr_pcie_replay_count); 2885 amdgpu_ucode_sysfs_fini(adev); 2886 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 2887 amdgpu_pmu_fini(adev); 2888 amdgpu_debugfs_preempt_cleanup(adev); 2889 if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10) 2890 amdgpu_discovery_fini(adev); 2891 } 2892 2893 2894 /* 2895 * Suspend & resume. 2896 */ 2897 /** 2898 * amdgpu_device_suspend - initiate device suspend 2899 * 2900 * @dev: drm dev pointer 2901 * @suspend: suspend state 2902 * @fbcon : notify the fbdev of suspend 2903 * 2904 * Puts the hw in the suspend state (all asics). 2905 * Returns 0 for success or an error on failure. 2906 * Called at driver suspend. 
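 *
 * A hedged sketch of how a PM callback might drive this (handler name
 * illustrative; the real callbacks live in the drm driver glue):
 *
 *   static int example_pmops_suspend(struct device *dev)
 *   {
 *           struct drm_device *drm_dev = dev_get_drvdata(dev);
 *
 *           return amdgpu_device_suspend(drm_dev, true, true);
 *   }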
2907 */ 2908 int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon) 2909 { 2910 struct amdgpu_device *adev; 2911 struct drm_crtc *crtc; 2912 struct drm_connector *connector; 2913 int r; 2914 2915 if (dev == NULL || dev->dev_private == NULL) { 2916 return -ENODEV; 2917 } 2918 2919 adev = dev->dev_private; 2920 2921 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 2922 return 0; 2923 2924 adev->in_suspend = true; 2925 drm_kms_helper_poll_disable(dev); 2926 2927 if (fbcon) 2928 amdgpu_fbdev_set_suspend(adev, 1); 2929 2930 cancel_delayed_work_sync(&adev->delayed_init_work); 2931 2932 if (!amdgpu_device_has_dc_support(adev)) { 2933 /* turn off display hw */ 2934 drm_modeset_lock_all(dev); 2935 list_for_each_entry(connector, &dev->mode_config.connector_list, head) { 2936 drm_helper_connector_dpms(connector, DRM_MODE_DPMS_OFF); 2937 } 2938 drm_modeset_unlock_all(dev); 2939 /* unpin the front buffers and cursors */ 2940 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 2941 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 2942 struct drm_framebuffer *fb = crtc->primary->fb; 2943 struct amdgpu_bo *robj; 2944 2945 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 2946 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 2947 r = amdgpu_bo_reserve(aobj, true); 2948 if (r == 0) { 2949 amdgpu_bo_unpin(aobj); 2950 amdgpu_bo_unreserve(aobj); 2951 } 2952 } 2953 2954 if (fb == NULL || fb->obj[0] == NULL) { 2955 continue; 2956 } 2957 robj = gem_to_amdgpu_bo(fb->obj[0]); 2958 /* don't unpin kernel fb objects */ 2959 if (!amdgpu_fbdev_robj_is_fb(adev, robj)) { 2960 r = amdgpu_bo_reserve(robj, true); 2961 if (r == 0) { 2962 amdgpu_bo_unpin(robj); 2963 amdgpu_bo_unreserve(robj); 2964 } 2965 } 2966 } 2967 } 2968 2969 amdgpu_amdkfd_suspend(adev); 2970 2971 amdgpu_ras_suspend(adev); 2972 2973 r = amdgpu_device_ip_suspend_phase1(adev); 2974 2975 /* evict vram memory */ 2976 amdgpu_bo_evict_vram(adev); 2977 2978 amdgpu_fence_driver_suspend(adev); 2979 2980 r = amdgpu_device_ip_suspend_phase2(adev); 2981 2982 /* evict remaining vram memory 2983 * This second call to evict vram is to evict the gart page table 2984 * using the CPU. 2985 */ 2986 amdgpu_bo_evict_vram(adev); 2987 2988 pci_save_state(dev->pdev); 2989 if (suspend) { 2990 /* Shut down the device */ 2991 pci_disable_device(dev->pdev); 2992 pci_set_power_state(dev->pdev, PCI_D3hot); 2993 } else { 2994 r = amdgpu_asic_reset(adev); 2995 if (r) 2996 DRM_ERROR("amdgpu asic reset failed\n"); 2997 } 2998 2999 return 0; 3000 } 3001 3002 /** 3003 * amdgpu_device_resume - initiate device resume 3004 * 3005 * @dev: drm dev pointer 3006 * @resume: resume state 3007 * @fbcon : notify the fbdev of resume 3008 * 3009 * Bring the hw back to operating state (all asics). 3010 * Returns 0 for success or an error on failure. 3011 * Called at driver resume. 
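 *
 * The resume sequence re-posts the card if needed, runs the IP resume
 * phases, restarts the fence driver and re-queues delayed_init_work so the
 * IB ring tests run again once the hardware is back up.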
3012 */ 3013 int amdgpu_device_resume(struct drm_device *dev, bool resume, bool fbcon) 3014 { 3015 struct drm_connector *connector; 3016 struct amdgpu_device *adev = dev->dev_private; 3017 struct drm_crtc *crtc; 3018 int r = 0; 3019 3020 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3021 return 0; 3022 3023 if (resume) { 3024 pci_set_power_state(dev->pdev, PCI_D0); 3025 pci_restore_state(dev->pdev); 3026 r = pci_enable_device(dev->pdev); 3027 if (r) 3028 return r; 3029 } 3030 3031 /* post card */ 3032 if (amdgpu_device_need_post(adev)) { 3033 r = amdgpu_atom_asic_init(adev->mode_info.atom_context); 3034 if (r) 3035 DRM_ERROR("amdgpu asic init failed\n"); 3036 } 3037 3038 r = amdgpu_device_ip_resume(adev); 3039 if (r) { 3040 DRM_ERROR("amdgpu_device_ip_resume failed (%d).\n", r); 3041 return r; 3042 } 3043 amdgpu_fence_driver_resume(adev); 3044 3045 3046 r = amdgpu_device_ip_late_init(adev); 3047 if (r) 3048 return r; 3049 3050 queue_delayed_work(system_wq, &adev->delayed_init_work, 3051 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3052 3053 if (!amdgpu_device_has_dc_support(adev)) { 3054 /* pin cursors */ 3055 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3056 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3057 3058 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3059 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3060 r = amdgpu_bo_reserve(aobj, true); 3061 if (r == 0) { 3062 r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM); 3063 if (r != 0) 3064 DRM_ERROR("Failed to pin cursor BO (%d)\n", r); 3065 amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj); 3066 amdgpu_bo_unreserve(aobj); 3067 } 3068 } 3069 } 3070 } 3071 r = amdgpu_amdkfd_resume(adev); 3072 if (r) 3073 return r; 3074 3075 /* Make sure IB tests flushed */ 3076 flush_delayed_work(&adev->delayed_init_work); 3077 3078 /* blat the mode back in */ 3079 if (fbcon) { 3080 if (!amdgpu_device_has_dc_support(adev)) { 3081 /* pre DCE11 */ 3082 drm_helper_resume_force_mode(dev); 3083 3084 /* turn on display hw */ 3085 drm_modeset_lock_all(dev); 3086 list_for_each_entry(connector, &dev->mode_config.connector_list, head) { 3087 drm_helper_connector_dpms(connector, DRM_MODE_DPMS_ON); 3088 } 3089 drm_modeset_unlock_all(dev); 3090 } 3091 amdgpu_fbdev_set_suspend(adev, 0); 3092 } 3093 3094 drm_kms_helper_poll_enable(dev); 3095 3096 amdgpu_ras_resume(adev); 3097 3098 /* 3099 * Most of the connector probing functions try to acquire runtime pm 3100 * refs to ensure that the GPU is powered on when connector polling is 3101 * performed. Since we're calling this from a runtime PM callback, 3102 * trying to acquire rpm refs will cause us to deadlock. 3103 * 3104 * Since we're guaranteed to be holding the rpm lock, it's safe to 3105 * temporarily disable the rpm helpers so this doesn't deadlock us. 3106 */ 3107 #ifdef CONFIG_PM 3108 dev->dev->power.disable_depth++; 3109 #endif 3110 if (!amdgpu_device_has_dc_support(adev)) 3111 drm_helper_hpd_irq_event(dev); 3112 else 3113 drm_kms_helper_hotplug_event(dev); 3114 #ifdef CONFIG_PM 3115 dev->dev->power.disable_depth--; 3116 #endif 3117 adev->in_suspend = false; 3118 3119 return 0; 3120 } 3121 3122 /** 3123 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 3124 * 3125 * @adev: amdgpu_device pointer 3126 * 3127 * The list of all the hardware IPs that make up the asic is walked and 3128 * the check_soft_reset callbacks are run. check_soft_reset determines 3129 * if the asic is still hung or not. 
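 * For SR-IOV VFs, or when the ASIC reports that it needs a full reset, this
 * helper conservatively reports a hang without consulting the per-IP
 * callbacks.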
3130 * Returns true if any of the IPs are still in a hung state, false if not. 3131 */ 3132 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 3133 { 3134 int i; 3135 bool asic_hang = false; 3136 3137 if (amdgpu_sriov_vf(adev)) 3138 return true; 3139 3140 if (amdgpu_asic_need_full_reset(adev)) 3141 return true; 3142 3143 for (i = 0; i < adev->num_ip_blocks; i++) { 3144 if (!adev->ip_blocks[i].status.valid) 3145 continue; 3146 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 3147 adev->ip_blocks[i].status.hang = 3148 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 3149 if (adev->ip_blocks[i].status.hang) { 3150 DRM_INFO("IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 3151 asic_hang = true; 3152 } 3153 } 3154 return asic_hang; 3155 } 3156 3157 /** 3158 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 3159 * 3160 * @adev: amdgpu_device pointer 3161 * 3162 * The list of all the hardware IPs that make up the asic is walked and the 3163 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 3164 * handles any IP specific hardware or software state changes that are 3165 * necessary for a soft reset to succeed. 3166 * Returns 0 on success, negative error code on failure. 3167 */ 3168 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 3169 { 3170 int i, r = 0; 3171 3172 for (i = 0; i < adev->num_ip_blocks; i++) { 3173 if (!adev->ip_blocks[i].status.valid) 3174 continue; 3175 if (adev->ip_blocks[i].status.hang && 3176 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 3177 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 3178 if (r) 3179 return r; 3180 } 3181 } 3182 3183 return 0; 3184 } 3185 3186 /** 3187 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 3188 * 3189 * @adev: amdgpu_device pointer 3190 * 3191 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 3192 * reset is necessary to recover. 3193 * Returns true if a full asic reset is required, false if not. 3194 */ 3195 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 3196 { 3197 int i; 3198 3199 if (amdgpu_asic_need_full_reset(adev)) 3200 return true; 3201 3202 for (i = 0; i < adev->num_ip_blocks; i++) { 3203 if (!adev->ip_blocks[i].status.valid) 3204 continue; 3205 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 3206 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 3207 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 3208 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 3209 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3210 if (adev->ip_blocks[i].status.hang) { 3211 DRM_INFO("Some block need full reset!\n"); 3212 return true; 3213 } 3214 } 3215 } 3216 return false; 3217 } 3218 3219 /** 3220 * amdgpu_device_ip_soft_reset - do a soft reset 3221 * 3222 * @adev: amdgpu_device pointer 3223 * 3224 * The list of all the hardware IPs that make up the asic is walked and the 3225 * soft_reset callbacks are run if the block is hung. soft_reset handles any 3226 * IP specific hardware or software state changes that are necessary to soft 3227 * reset the IP. 3228 * Returns 0 on success, negative error code on failure. 
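 *
 * The bare-metal recovery path in amdgpu_device_pre_asic_reset() uses the
 * soft reset hooks roughly as follows before falling back to a full reset:
 *
 *   amdgpu_device_ip_pre_soft_reset(adev);
 *   r = amdgpu_device_ip_soft_reset(adev);
 *   amdgpu_device_ip_post_soft_reset(adev);
 *   if (r || amdgpu_device_ip_check_soft_reset(adev))
 *           need_full_reset = true;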
3229 */ 3230 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 3231 { 3232 int i, r = 0; 3233 3234 for (i = 0; i < adev->num_ip_blocks; i++) { 3235 if (!adev->ip_blocks[i].status.valid) 3236 continue; 3237 if (adev->ip_blocks[i].status.hang && 3238 adev->ip_blocks[i].version->funcs->soft_reset) { 3239 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 3240 if (r) 3241 return r; 3242 } 3243 } 3244 3245 return 0; 3246 } 3247 3248 /** 3249 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 3250 * 3251 * @adev: amdgpu_device pointer 3252 * 3253 * The list of all the hardware IPs that make up the asic is walked and the 3254 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 3255 * handles any IP specific hardware or software state changes that are 3256 * necessary after the IP has been soft reset. 3257 * Returns 0 on success, negative error code on failure. 3258 */ 3259 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 3260 { 3261 int i, r = 0; 3262 3263 for (i = 0; i < adev->num_ip_blocks; i++) { 3264 if (!adev->ip_blocks[i].status.valid) 3265 continue; 3266 if (adev->ip_blocks[i].status.hang && 3267 adev->ip_blocks[i].version->funcs->post_soft_reset) 3268 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 3269 if (r) 3270 return r; 3271 } 3272 3273 return 0; 3274 } 3275 3276 /** 3277 * amdgpu_device_recover_vram - Recover some VRAM contents 3278 * 3279 * @adev: amdgpu_device pointer 3280 * 3281 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 3282 * restore things like GPUVM page tables after a GPU reset where 3283 * the contents of VRAM might be lost. 3284 * 3285 * Returns: 3286 * 0 on success, negative error code on failure. 3287 */ 3288 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 3289 { 3290 struct dma_fence *fence = NULL, *next = NULL; 3291 struct amdgpu_bo *shadow; 3292 long r = 1, tmo; 3293 3294 if (amdgpu_sriov_runtime(adev)) 3295 tmo = msecs_to_jiffies(8000); 3296 else 3297 tmo = msecs_to_jiffies(100); 3298 3299 DRM_INFO("recover vram bo from shadow start\n"); 3300 mutex_lock(&adev->shadow_list_lock); 3301 list_for_each_entry(shadow, &adev->shadow_list, shadow_list) { 3302 3303 /* No need to recover an evicted BO */ 3304 if (shadow->tbo.mem.mem_type != TTM_PL_TT || 3305 shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET || 3306 shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM) 3307 continue; 3308 3309 r = amdgpu_bo_restore_shadow(shadow, &next); 3310 if (r) 3311 break; 3312 3313 if (fence) { 3314 tmo = dma_fence_wait_timeout(fence, false, tmo); 3315 dma_fence_put(fence); 3316 fence = next; 3317 if (tmo == 0) { 3318 r = -ETIMEDOUT; 3319 break; 3320 } else if (tmo < 0) { 3321 r = tmo; 3322 break; 3323 } 3324 } else { 3325 fence = next; 3326 } 3327 } 3328 mutex_unlock(&adev->shadow_list_lock); 3329 3330 if (fence) 3331 tmo = dma_fence_wait_timeout(fence, false, tmo); 3332 dma_fence_put(fence); 3333 3334 if (r < 0 || tmo <= 0) { 3335 DRM_ERROR("recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 3336 return -EIO; 3337 } 3338 3339 DRM_INFO("recover vram bo from shadow done\n"); 3340 return 0; 3341 } 3342 3343 3344 /** 3345 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 3346 * 3347 * @adev: amdgpu device pointer 3348 * @from_hypervisor: request from hypervisor 3349 * 3350 * do VF FLR and reinitialize Asic 3351 * return 0 means succeeded otherwise failed 3352 */ 3353 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 
3354 bool from_hypervisor) 3355 { 3356 int r; 3357 3358 if (from_hypervisor) 3359 r = amdgpu_virt_request_full_gpu(adev, true); 3360 else 3361 r = amdgpu_virt_reset_gpu(adev); 3362 if (r) 3363 return r; 3364 3365 amdgpu_amdkfd_pre_reset(adev); 3366 3367 /* Resume IP prior to SMC */ 3368 r = amdgpu_device_ip_reinit_early_sriov(adev); 3369 if (r) 3370 goto error; 3371 3372 /* we need recover gart prior to run SMC/CP/SDMA resume */ 3373 amdgpu_gtt_mgr_recover(&adev->mman.bdev.man[TTM_PL_TT]); 3374 3375 r = amdgpu_device_fw_loading(adev); 3376 if (r) 3377 return r; 3378 3379 /* now we are okay to resume SMC/CP/SDMA */ 3380 r = amdgpu_device_ip_reinit_late_sriov(adev); 3381 if (r) 3382 goto error; 3383 3384 amdgpu_irq_gpu_reset_resume_helper(adev); 3385 r = amdgpu_ib_ring_tests(adev); 3386 amdgpu_amdkfd_post_reset(adev); 3387 3388 error: 3389 amdgpu_virt_init_data_exchange(adev); 3390 amdgpu_virt_release_full_gpu(adev, true); 3391 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 3392 atomic_inc(&adev->vram_lost_counter); 3393 r = amdgpu_device_recover_vram(adev); 3394 } 3395 3396 return r; 3397 } 3398 3399 /** 3400 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 3401 * 3402 * @adev: amdgpu device pointer 3403 * 3404 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 3405 * a hung GPU. 3406 */ 3407 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 3408 { 3409 if (!amdgpu_device_ip_check_soft_reset(adev)) { 3410 DRM_INFO("Timeout, but no hardware hang detected.\n"); 3411 return false; 3412 } 3413 3414 if (amdgpu_gpu_recovery == 0) 3415 goto disabled; 3416 3417 if (amdgpu_sriov_vf(adev)) 3418 return true; 3419 3420 if (amdgpu_gpu_recovery == -1) { 3421 switch (adev->asic_type) { 3422 case CHIP_BONAIRE: 3423 case CHIP_HAWAII: 3424 case CHIP_TOPAZ: 3425 case CHIP_TONGA: 3426 case CHIP_FIJI: 3427 case CHIP_POLARIS10: 3428 case CHIP_POLARIS11: 3429 case CHIP_POLARIS12: 3430 case CHIP_VEGAM: 3431 case CHIP_VEGA20: 3432 case CHIP_VEGA10: 3433 case CHIP_VEGA12: 3434 break; 3435 default: 3436 goto disabled; 3437 } 3438 } 3439 3440 return true; 3441 3442 disabled: 3443 DRM_INFO("GPU recovery disabled.\n"); 3444 return false; 3445 } 3446 3447 3448 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 3449 struct amdgpu_job *job, 3450 bool *need_full_reset_arg) 3451 { 3452 int i, r = 0; 3453 bool need_full_reset = *need_full_reset_arg; 3454 3455 /* block all schedulers and reset given job's ring */ 3456 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 3457 struct amdgpu_ring *ring = adev->rings[i]; 3458 3459 if (!ring || !ring->sched.thread) 3460 continue; 3461 3462 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 3463 amdgpu_fence_driver_force_completion(ring); 3464 } 3465 3466 if(job) 3467 drm_sched_increase_karma(&job->base); 3468 3469 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 3470 if (!amdgpu_sriov_vf(adev)) { 3471 3472 if (!need_full_reset) 3473 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 3474 3475 if (!need_full_reset) { 3476 amdgpu_device_ip_pre_soft_reset(adev); 3477 r = amdgpu_device_ip_soft_reset(adev); 3478 amdgpu_device_ip_post_soft_reset(adev); 3479 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 3480 DRM_INFO("soft reset failed, will fallback to full reset!\n"); 3481 need_full_reset = true; 3482 } 3483 } 3484 3485 if (need_full_reset) 3486 r = amdgpu_device_ip_suspend(adev); 3487 3488 *need_full_reset_arg = 
need_full_reset; 3489 } 3490 3491 return r; 3492 } 3493 3494 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, 3495 struct list_head *device_list_handle, 3496 bool *need_full_reset_arg) 3497 { 3498 struct amdgpu_device *tmp_adev = NULL; 3499 bool need_full_reset = *need_full_reset_arg, vram_lost = false; 3500 int r = 0; 3501 3502 /* 3503 * ASIC reset has to be done on all HGMI hive nodes ASAP 3504 * to allow proper links negotiation in FW (within 1 sec) 3505 */ 3506 if (need_full_reset) { 3507 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 3508 /* For XGMI run all resets in parallel to speed up the process */ 3509 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 3510 if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work)) 3511 r = -EALREADY; 3512 } else 3513 r = amdgpu_asic_reset(tmp_adev); 3514 3515 if (r) { 3516 DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s", 3517 r, tmp_adev->ddev->unique); 3518 break; 3519 } 3520 } 3521 3522 /* For XGMI wait for all PSP resets to complete before proceed */ 3523 if (!r) { 3524 list_for_each_entry(tmp_adev, device_list_handle, 3525 gmc.xgmi.head) { 3526 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 3527 flush_work(&tmp_adev->xgmi_reset_work); 3528 r = tmp_adev->asic_reset_res; 3529 if (r) 3530 break; 3531 } 3532 } 3533 3534 list_for_each_entry(tmp_adev, device_list_handle, 3535 gmc.xgmi.head) { 3536 amdgpu_ras_reserve_bad_pages(tmp_adev); 3537 } 3538 } 3539 } 3540 3541 3542 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 3543 if (need_full_reset) { 3544 /* post card */ 3545 if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context)) 3546 DRM_WARN("asic atom init failed!"); 3547 3548 if (!r) { 3549 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 3550 r = amdgpu_device_ip_resume_phase1(tmp_adev); 3551 if (r) 3552 goto out; 3553 3554 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 3555 if (vram_lost) { 3556 DRM_INFO("VRAM is lost due to GPU reset!\n"); 3557 atomic_inc(&tmp_adev->vram_lost_counter); 3558 } 3559 3560 r = amdgpu_gtt_mgr_recover( 3561 &tmp_adev->mman.bdev.man[TTM_PL_TT]); 3562 if (r) 3563 goto out; 3564 3565 r = amdgpu_device_fw_loading(tmp_adev); 3566 if (r) 3567 return r; 3568 3569 r = amdgpu_device_ip_resume_phase2(tmp_adev); 3570 if (r) 3571 goto out; 3572 3573 if (vram_lost) 3574 amdgpu_device_fill_reset_magic(tmp_adev); 3575 3576 /* 3577 * Add this ASIC as tracked as reset was already 3578 * complete successfully. 3579 */ 3580 amdgpu_register_gpu_instance(tmp_adev); 3581 3582 r = amdgpu_device_ip_late_init(tmp_adev); 3583 if (r) 3584 goto out; 3585 3586 /* must succeed. 
*/ 3587 amdgpu_ras_resume(tmp_adev); 3588 3589 /* Update PSP FW topology after reset */ 3590 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1) 3591 r = amdgpu_xgmi_update_topology(hive, tmp_adev); 3592 } 3593 } 3594 3595 3596 out: 3597 if (!r) { 3598 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 3599 r = amdgpu_ib_ring_tests(tmp_adev); 3600 if (r) { 3601 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 3602 r = amdgpu_device_ip_suspend(tmp_adev); 3603 need_full_reset = true; 3604 r = -EAGAIN; 3605 goto end; 3606 } 3607 } 3608 3609 if (!r) 3610 r = amdgpu_device_recover_vram(tmp_adev); 3611 else 3612 tmp_adev->asic_reset_res = r; 3613 } 3614 3615 end: 3616 *need_full_reset_arg = need_full_reset; 3617 return r; 3618 } 3619 3620 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock) 3621 { 3622 if (trylock) { 3623 if (!mutex_trylock(&adev->lock_reset)) 3624 return false; 3625 } else 3626 mutex_lock(&adev->lock_reset); 3627 3628 atomic_inc(&adev->gpu_reset_counter); 3629 adev->in_gpu_reset = 1; 3630 /* Block kfd: SRIOV would do it separately */ 3631 if (!amdgpu_sriov_vf(adev)) 3632 amdgpu_amdkfd_pre_reset(adev); 3633 3634 return true; 3635 } 3636 3637 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) 3638 { 3639 /* unlock kfd: SRIOV would do it separately */ 3640 if (!amdgpu_sriov_vf(adev)) 3641 amdgpu_amdkfd_post_reset(adev); 3642 amdgpu_vf_error_trans_all(adev); 3643 adev->in_gpu_reset = 0; 3644 mutex_unlock(&adev->lock_reset); 3645 } 3646 3647 3648 /** 3649 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 3650 * 3651 * @adev: amdgpu device pointer 3652 * @job: which job triggered the hang 3653 * 3654 * Attempt to reset the GPU if it has hung (all asics). 3655 * Attempt to do soft-reset or full-reset and reinitialize the ASIC. 3656 * Returns 0 for success or an error on failure. 3657 */ 3658 3659 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 3660 struct amdgpu_job *job) 3661 { 3662 struct list_head device_list, *device_list_handle = NULL; 3663 bool need_full_reset, job_signaled; 3664 struct amdgpu_hive_info *hive = NULL; 3665 struct amdgpu_device *tmp_adev = NULL; 3666 int i, r = 0; 3667 3668 need_full_reset = job_signaled = false; 3669 INIT_LIST_HEAD(&device_list); 3670 3671 dev_info(adev->dev, "GPU reset begin!\n"); 3672 3673 cancel_delayed_work_sync(&adev->delayed_init_work); 3674 3675 hive = amdgpu_get_xgmi_hive(adev, false); 3676 3677 /* 3678 * Here we trylock to avoid a chain of resets executing, triggered either 3679 * by jobs on different adevs in the XGMI hive or by jobs on 3680 * different schedulers for the same device while this TO handler is running. 3681 * We always reset all schedulers for a device and all devices in the XGMI 3682 * hive, so that should take care of them too.
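 * Per-device serialization uses adev->lock_reset (see
 * amdgpu_device_lock_adev() above), while hive->reset_lock serializes
 * recovery across all nodes of an XGMI hive.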
3683 */ 3684 3685 if (hive && !mutex_trylock(&hive->reset_lock)) { 3686 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", 3687 job->base.id, hive->hive_id); 3688 return 0; 3689 } 3690 3691 /* Start with adev pre asic reset first for soft reset check. */ 3692 if (!amdgpu_device_lock_adev(adev, !hive)) { 3693 DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress", 3694 job->base.id); 3695 return 0; 3696 } 3697 3698 /* Build list of devices to reset */ 3699 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3700 if (!hive) { 3701 amdgpu_device_unlock_adev(adev); 3702 return -ENODEV; 3703 } 3704 3705 /* 3706 * In case we are in XGMI hive mode, device reset is done for all the 3707 * nodes in the hive to retrain all XGMI links, and hence the reset 3708 * sequence is executed in a loop on all nodes. 3709 */ 3710 device_list_handle = &hive->device_list; 3711 } else { 3712 list_add_tail(&adev->gmc.xgmi.head, &device_list); 3713 device_list_handle = &device_list; 3714 } 3715 3716 /* 3717 * Mark these ASICs to be reset as untracked first 3718 * and add them back after reset completes 3719 */ 3720 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) 3721 amdgpu_unregister_gpu_instance(tmp_adev); 3722 3723 /* block all schedulers and reset given job's ring */ 3724 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 3725 /* disable ras on ALL IPs */ 3726 if (amdgpu_device_ip_need_full_reset(tmp_adev)) 3727 amdgpu_ras_suspend(tmp_adev); 3728 3729 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 3730 struct amdgpu_ring *ring = tmp_adev->rings[i]; 3731 3732 if (!ring || !ring->sched.thread) 3733 continue; 3734 3735 drm_sched_stop(&ring->sched, &job->base); 3736 } 3737 } 3738 3739 3740 /* 3741 * Must check guilty signal here since after this point all old 3742 * HW fences are force signaled. 3743 * 3744 * job->base holds a reference to parent fence 3745 */ 3746 if (job && job->base.s_fence->parent && 3747 dma_fence_is_signaled(job->base.s_fence->parent)) 3748 job_signaled = true; 3749 3750 if (!amdgpu_device_ip_need_full_reset(adev)) 3751 device_list_handle = &device_list; 3752 3753 if (job_signaled) { 3754 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 3755 goto skip_hw_reset; 3756 } 3757 3758 3759 /* Guilty job will be freed after this */ 3760 r = amdgpu_device_pre_asic_reset(adev, 3761 job, 3762 &need_full_reset); 3763 if (r) { 3764 /*TODO Should we stop ?*/ 3765 DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ", 3766 r, adev->ddev->unique); 3767 adev->asic_reset_res = r; 3768 } 3769 3770 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 3771 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 3772 3773 if (tmp_adev == adev) 3774 continue; 3775 3776 amdgpu_device_lock_adev(tmp_adev, false); 3777 r = amdgpu_device_pre_asic_reset(tmp_adev, 3778 NULL, 3779 &need_full_reset); 3780 /*TODO Should we stop ?*/ 3781 if (r) { 3782 DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ", 3783 r, tmp_adev->ddev->unique); 3784 tmp_adev->asic_reset_res = r; 3785 } 3786 } 3787 3788 /* Actual ASIC resets if needed.*/ 3789 /* TODO Implement XGMI hive reset logic for SRIOV */ 3790 if (amdgpu_sriov_vf(adev)) { 3791 r = amdgpu_device_reset_sriov(adev, job ?
false : true); 3792 if (r) 3793 adev->asic_reset_res = r; 3794 } else { 3795 r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset); 3796 if (r && r == -EAGAIN) 3797 goto retry; 3798 } 3799 3800 skip_hw_reset: 3801 3802 /* Post ASIC reset for all devs. */ 3803 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 3804 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 3805 struct amdgpu_ring *ring = tmp_adev->rings[i]; 3806 3807 if (!ring || !ring->sched.thread) 3808 continue; 3809 3810 /* No point to resubmit jobs if we didn't HW reset */ 3811 if (!tmp_adev->asic_reset_res && !job_signaled) 3812 drm_sched_resubmit_jobs(&ring->sched); 3813 3814 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); 3815 } 3816 3817 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) { 3818 drm_helper_resume_force_mode(tmp_adev->ddev); 3819 } 3820 3821 tmp_adev->asic_reset_res = 0; 3822 3823 if (r) { 3824 /* bad news, how to tell it to userspace? */ 3825 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&adev->gpu_reset_counter)); 3826 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 3827 } else { 3828 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&adev->gpu_reset_counter)); 3829 } 3830 3831 amdgpu_device_unlock_adev(tmp_adev); 3832 } 3833 3834 if (hive) 3835 mutex_unlock(&hive->reset_lock); 3836 3837 if (r) 3838 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 3839 return r; 3840 } 3841 3842 /** 3843 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot 3844 * 3845 * @adev: amdgpu_device pointer 3846 * 3847 * Fetches and stores in the driver the PCIE capabilities (gen speed 3848 * and lanes) of the slot the device is in. Handles APUs and 3849 * virtualized environments where PCIE config space may not be available.
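 *
 * The amdgpu_pcie_gen_cap and amdgpu_pcie_lane_cap module parameters, when
 * non-zero, override the probed values, and devices on a root bus (including
 * APUs) simply receive the AMDGPU_DEFAULT_PCIE_* masks.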
3850 */ 3851 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 3852 { 3853 struct pci_dev *pdev; 3854 enum pci_bus_speed speed_cap, platform_speed_cap; 3855 enum pcie_link_width platform_link_width; 3856 3857 if (amdgpu_pcie_gen_cap) 3858 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 3859 3860 if (amdgpu_pcie_lane_cap) 3861 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 3862 3863 /* covers APUs as well */ 3864 if (pci_is_root_bus(adev->pdev->bus)) { 3865 if (adev->pm.pcie_gen_mask == 0) 3866 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 3867 if (adev->pm.pcie_mlw_mask == 0) 3868 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 3869 return; 3870 } 3871 3872 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 3873 return; 3874 3875 pcie_bandwidth_available(adev->pdev, NULL, 3876 &platform_speed_cap, &platform_link_width); 3877 3878 if (adev->pm.pcie_gen_mask == 0) { 3879 /* asic caps */ 3880 pdev = adev->pdev; 3881 speed_cap = pcie_get_speed_cap(pdev); 3882 if (speed_cap == PCI_SPEED_UNKNOWN) { 3883 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 3884 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 3885 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 3886 } else { 3887 if (speed_cap == PCIE_SPEED_16_0GT) 3888 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 3889 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 3890 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 3891 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 3892 else if (speed_cap == PCIE_SPEED_8_0GT) 3893 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 3894 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 3895 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 3896 else if (speed_cap == PCIE_SPEED_5_0GT) 3897 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 3898 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 3899 else 3900 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 3901 } 3902 /* platform caps */ 3903 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 3904 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 3905 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 3906 } else { 3907 if (platform_speed_cap == PCIE_SPEED_16_0GT) 3908 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 3909 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 3910 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 3911 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 3912 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 3913 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 3914 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 3915 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 3916 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 3917 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 3918 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 3919 else 3920 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 3921 3922 } 3923 } 3924 if (adev->pm.pcie_mlw_mask == 0) { 3925 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 3926 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 3927 } else { 3928 switch (platform_link_width) { 3929 case PCIE_LNK_X32: 3930 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 3931 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 3932 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 3933 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 3934 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 3935 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 3936 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 3937 break; 3938 case PCIE_LNK_X16: 3939 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 3940 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 3941 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 
3942 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 3943 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 3944 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 3945 break; 3946 case PCIE_LNK_X12: 3947 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 3948 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 3949 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 3950 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 3951 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 3952 break; 3953 case PCIE_LNK_X8: 3954 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 3955 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 3956 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 3957 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 3958 break; 3959 case PCIE_LNK_X4: 3960 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 3961 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 3962 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 3963 break; 3964 case PCIE_LNK_X2: 3965 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 3966 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 3967 break; 3968 case PCIE_LNK_X1: 3969 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 3970 break; 3971 default: 3972 break; 3973 } 3974 } 3975 } 3976 } 3977 3978