/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/pci.h>
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000

static const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"NAVI10",
	"NAVI14",
	"LAST",
};

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs)
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = ddev->dev_private;
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
		amdgpu_device_get_pcie_replay_count, NULL);

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * amdgpu_device_is_px - Is the device a dGPU with HG/PX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with HG/PX power control,
 * otherwise return false.
 */
bool amdgpu_device_is_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = dev->dev_private;

	if (adev->flags & AMD_IS_PX)
		return true;
	return false;
}

/*
 * MMIO register access helper functions.
 */
/**
 * amdgpu_mm_rreg - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
			uint32_t acc_flags)
{
	uint32_t ret;

	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
		return amdgpu_virt_kiq_rreg(adev, reg);

	if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX))
		ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
	else {
		unsigned long flags;

		spin_lock_irqsave(&adev->mmio_idx_lock, flags);
		writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
		ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	}
	trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 *
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 *
 */
/**
 * amdgpu_mm_wreg8 - write to a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_mm_wreg - write to a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
		    uint32_t acc_flags)
{
	trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);

	if (adev->asic_type >= CHIP_VEGA10 && reg == 0) {
		adev->last_mm_index = v;
	}

	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
		return amdgpu_virt_kiq_wreg(adev, reg, v);

	if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX))
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	else {
		unsigned long flags;

		spin_lock_irqsave(&adev->mmio_idx_lock, flags);
		writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
		writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	}

	if (adev->asic_type >= CHIP_VEGA10 && reg == 1 && adev->last_mm_index == 0x5702C) {
		udelay(500);
	}
}

/**
 * amdgpu_io_rreg - read an IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 *
 * Returns the 32 bit value from the offset specified.
 */
u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
{
	if ((reg * 4) < adev->rio_mem_size)
		return ioread32(adev->rio_mem + (reg * 4));
	else {
		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
		return ioread32(adev->rio_mem + (mmMM_DATA * 4));
	}
}

/**
 * amdgpu_io_wreg - write to an IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
{
	if (adev->asic_type >= CHIP_VEGA10 && reg == 0) {
		adev->last_mm_index = v;
	}

	if ((reg * 4) < adev->rio_mem_size)
		iowrite32(v, adev->rio_mem + (reg * 4));
	else {
		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
		iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
	}

	if (adev->asic_type >= CHIP_VEGA10 && reg == 1 && adev->last_mm_index == 0x5702C) {
		udelay(500);
	}
}

/**
 * amdgpu_mm_rdoorbell - read a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (CIK).
 */
u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
{
	if (index < adev->doorbell.num_doorbells) {
		return readl(adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell - write a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (CIK).
 */
void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
{
	if (index < adev->doorbell.num_doorbells) {
		writel(v, adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
{
	if (index < adev->doorbell.num_doorbells) {
		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
{
	if (index < adev->doorbell.num_doorbells) {
		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
				       &adev->vram_scratch.robj,
				       &adev->vram_scratch.gpu_addr,
				       (void **)&adev->vram_scratch.ptr);
}

/**
 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with and/or masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/*
 * GPU doorbell aperture helpers function.
 */
/**
 * amdgpu_device_doorbell_init - Init doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Init doorbell driver information (CIK)
 * Returns 0 on success, error on failure.
 */
static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
{

	/* No doorbell on SI hardware generation */
	if (adev->asic_type < CHIP_BONAIRE) {
		adev->doorbell.base = 0;
		adev->doorbell.size = 0;
		adev->doorbell.num_doorbells = 0;
		adev->doorbell.ptr = NULL;
		return 0;
	}

	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
		return -EINVAL;

	amdgpu_asic_init_doorbell_index(adev);

	/* doorbell bar mapping */
	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
	adev->doorbell.size = pci_resource_len(adev->pdev, 2);

	adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
					     adev->doorbell_index.max_assignment + 1);
	if (adev->doorbell.num_doorbells == 0)
		return -EINVAL;

	/* For Vega, reserve and map two pages on doorbell BAR since SDMA
	 * paging queue doorbell use the second page. The
	 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
	 * doorbells are in the first page. So with paging queue enabled,
	 * the max num_doorbells should be increased by one page (0x400 in dwords).
	 */
	if (adev->asic_type >= CHIP_VEGA10)
		adev->doorbell.num_doorbells += 0x400;

	adev->doorbell.ptr = ioremap(adev->doorbell.base,
				     adev->doorbell.num_doorbells *
				     sizeof(u32));
	if (adev->doorbell.ptr == NULL)
		return -ENOMEM;

	return 0;
}

/**
 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down doorbell driver information (CIK)
 */
static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
{
	iounmap(adev->doorbell.ptr);
	adev->doorbell.ptr = NULL;
}



/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
	u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
	struct pci_bus *root;
	struct resource *res;
	unsigned i;
	u16 cmd;
	int r;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_device_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_device_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw needs post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if need or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old smc fw still need the driver to do a vPost, otherwise the gpu hangs.
		 * smc fw versions above 22.15 don't have this flaw, so we force
		 * vpost to be executed for smc versions below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;
			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @cookie: amdgpu_device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
{
	struct amdgpu_device *adev = cookie;
	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines number of bits in page table versus page directory,
 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
 * page table and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8) ? true : false;
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	int ret = 0;

	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	ret = amdgpu_device_get_job_timeout_settings(adev);
	if (ret) {
		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
		return ret;
	}

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	return ret;
}

/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes the
 * asic before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	if (amdgpu_device_is_px(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("amdgpu: switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		amdgpu_device_resume(dev, true, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
		drm_kms_helper_poll_enable(dev);
	} else {
		pr_info("amdgpu: switched off\n");
		drm_kms_helper_poll_disable(dev);
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true, true);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return dev->open_count == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u32 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if equal or greater
 * return 1 if smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		struct drm_device *ddev = adev->ddev;
		const char *pci_address_name = pci_name(ddev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	char fw_name[30];
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	switch (adev->asic_type) {
	case CHIP_TOPAZ:
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_VERDE:
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_OLAND:
	case CHIP_HAINAN:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_VEGA20:
	default:
		return 0;
	case CHIP_VEGA10:
		chip_name = "vega10";
		break;
	case CHIP_VEGA12:
		chip_name = "vega12";
		break;
	case CHIP_RAVEN:
		if (adev->rev_id >= 8)
			chip_name = "raven2";
		else if (adev->pdev->device == 0x15d8)
			chip_name = "picasso";
		else
			chip_name = "raven";
		break;
	case CHIP_NAVI10:
		chip_name = "navi10";
		break;
	case CHIP_NAVI14:
		chip_name = "navi14";
		break;
	}

	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
	err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
	if (err) {
		dev_err(adev->dev,
			"Failed to load gpu_info firmware \"%s\"\n",
			fw_name);
		goto out;
	}
	err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
	if (err) {
		dev_err(adev->dev,
			"Failed to validate gpu_info firmware \"%s\"\n",
			fw_name);
		goto out;
	}

	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
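	/* dump the gpu_info firmware header fields to the debug log */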
	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);

	switch (hdr->version_major) {
	case 1:
	{
		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
								le32_to_cpu(hdr->header.ucode_array_offset_bytes));

		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
		adev->gfx.config.max_texture_channel_caches =
			le32_to_cpu(gpu_info_fw->gc_num_tccs);
		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
		adev->gfx.config.double_offchip_lds_buf =
			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
		adev->gfx.cu_info.max_waves_per_simd =
			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
		adev->gfx.cu_info.max_scratch_slots_per_cu =
			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
		if (hdr->version_minor >= 1) {
			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->gfx.config.num_sc_per_sh =
				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
			adev->gfx.config.num_packer_per_sc =
				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
		}
#ifdef CONFIG_DRM_AMD_DC_DCN2_0
		if (hdr->version_minor == 2) {
			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
		}
#endif
		break;
	}
	default:
		dev_err(adev->dev,
			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
		err = -EINVAL;
		goto out;
	}
out:
	return err;
}

/**
 * amdgpu_device_ip_early_init - run early init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Early initialization pass for hardware IPs. The hardware IPs that make
 * up each asic are discovered and each IP's early_init callback is run. This
 * is the first stage in initializing the asic.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
{
	int i, r;

	amdgpu_device_enable_virtual_display(adev);

	switch (adev->asic_type) {
	case CHIP_TOPAZ:
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		if (adev->asic_type == CHIP_CARRIZO || adev->asic_type == CHIP_STONEY)
			adev->family = AMDGPU_FAMILY_CZ;
		else
			adev->family = AMDGPU_FAMILY_VI;

		r = vi_set_ip_blocks(adev);
		if (r)
			return r;
		break;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_VERDE:
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_OLAND:
	case CHIP_HAINAN:
		adev->family = AMDGPU_FAMILY_SI;
		r = si_set_ip_blocks(adev);
		if (r)
			return r;
		break;
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
		if ((adev->asic_type == CHIP_BONAIRE) || (adev->asic_type == CHIP_HAWAII))
			adev->family = AMDGPU_FAMILY_CI;
		else
			adev->family = AMDGPU_FAMILY_KV;

		r = cik_set_ip_blocks(adev);
		if (r)
			return r;
		break;
#endif
	case CHIP_VEGA10:
	case CHIP_VEGA12:
	case CHIP_VEGA20:
	case CHIP_RAVEN:
		if (adev->asic_type == CHIP_RAVEN)
			adev->family = AMDGPU_FAMILY_RV;
		else
			adev->family = AMDGPU_FAMILY_AI;

		r = soc15_set_ip_blocks(adev);
		if (r)
			return r;
		break;
	case CHIP_NAVI10:
	case CHIP_NAVI14:
		adev->family = AMDGPU_FAMILY_NV;

		r = nv_set_ip_blocks(adev);
		if (r)
			return r;
		break;
	default:
		/* FIXME: not supported yet */
		return -EINVAL;
	}

	r = amdgpu_device_parse_gpu_info_fw(adev);
	if (r)
		return r;

	amdgpu_amdkfd_device_probe(adev);

	if (amdgpu_sriov_vf(adev)) {
		r = amdgpu_virt_request_full_gpu(adev, true);
		if (r)
			return -EAGAIN;

		/* query the reg access mode at the very beginning */
		amdgpu_virt_init_reg_access_mode(adev);
	}

	adev->pm.pp_feature = amdgpu_pp_feature_mask;
	if (amdgpu_sriov_vf(adev))
		adev->pm.pp_feature &= ~PP_GFXOFF_MASK;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
			DRM_ERROR("disabled ip block: %d <%s>\n",
				  i, adev->ip_blocks[i].version->funcs->name);
			adev->ip_blocks[i].status.valid = false;
		} else {
			if (adev->ip_blocks[i].version->funcs->early_init) {
				r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
				if (r == -ENOENT) {
					adev->ip_blocks[i].status.valid = false;
				} else if (r) {
					DRM_ERROR("early_init of IP block <%s> failed %d\n",
						  adev->ip_blocks[i].version->funcs->name, r);
					return r;
				} else {
					adev->ip_blocks[i].status.valid = true;
				}
			} else {
				adev->ip_blocks[i].status.valid = true;
			}
		}
		/* get the vbios after the asic_funcs are set up */
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
			/* Read BIOS */
			if (!amdgpu_get_bios(adev))
				return -EINVAL;

			r = amdgpu_atombios_init(adev);
			if (r) {
				dev_err(adev->dev, "amdgpu_atombios_init failed\n");
				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
				return r;
			}
		}
	}

	adev->cg_flags &= amdgpu_cg_mask;
	adev->pg_flags &= amdgpu_pg_mask;

	return 0;
}

static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.sw)
			continue;
		if (adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
		    (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
			r = adev->ip_blocks[i].version->funcs->hw_init(adev);
			if (r) {
				DRM_ERROR("hw_init of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
			adev->ip_blocks[i].status.hw = true;
		}
	}

	return 0;
}

static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.sw)
			continue;
		if (adev->ip_blocks[i].status.hw)
			continue;
		r = adev->ip_blocks[i].version->funcs->hw_init(adev);
		if (r) {
			DRM_ERROR("hw_init of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
			return r;
		}
		adev->ip_blocks[i].status.hw = true;
	}

	return 0;
}

static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
{
	int r = 0;
	int i;
	uint32_t smu_version;

	if (adev->asic_type >= CHIP_VEGA10) {
		for (i = 0; i < adev->num_ip_blocks; i++) {
			if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
				if (adev->in_gpu_reset || adev->in_suspend) {
					if (amdgpu_sriov_vf(adev) && adev->in_gpu_reset)
						break; /* sriov gpu reset, psp need to do hw_init before IH because of hw limit */
					r = adev->ip_blocks[i].version->funcs->resume(adev);
					if (r) {
						DRM_ERROR("resume of IP block <%s> failed %d\n",
							  adev->ip_blocks[i].version->funcs->name, r);
						return r;
					}
				} else {
					r = adev->ip_blocks[i].version->funcs->hw_init(adev);
					if (r) {
						DRM_ERROR("hw_init of IP block <%s> failed %d\n",
							  adev->ip_blocks[i].version->funcs->name, r);
						return r;
					}
				}
				adev->ip_blocks[i].status.hw = true;
			}
		}
	}
	r = amdgpu_pm_load_smu_firmware(adev, &smu_version);

	return r;
}

/**
 * amdgpu_device_ip_init - run init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main initialization pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
 * are run. sw_init initializes the software state associated with each IP
 * and hw_init initializes the hardware associated with each IP.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_init(struct amdgpu_device *adev)
{
	int i, r;

	r = amdgpu_ras_init(adev);
	if (r)
		return r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
		if (r) {
			DRM_ERROR("sw_init of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
			goto init_failed;
		}
		adev->ip_blocks[i].status.sw = true;

		/* need to do gmc hw init early so we can allocate gpu mem */
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
			r = amdgpu_device_vram_scratch_init(adev);
			if (r) {
				DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
				goto init_failed;
			}
			r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
			if (r) {
				DRM_ERROR("hw_init %d failed %d\n", i, r);
				goto init_failed;
			}
			r = amdgpu_device_wb_init(adev);
			if (r) {
				DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
				goto init_failed;
			}
			adev->ip_blocks[i].status.hw = true;

			/* right after GMC hw init, we create CSA */
			if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
				r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
							       AMDGPU_GEM_DOMAIN_VRAM,
							       AMDGPU_CSA_SIZE);
				if (r) {
					DRM_ERROR("allocate CSA failed %d\n", r);
					goto init_failed;
				}
			}
		}
	}

	r = amdgpu_ib_pool_init(adev);
	if (r) {
		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
		goto init_failed;
	}

	r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete */
	if (r)
		goto init_failed;

	r = amdgpu_device_ip_hw_init_phase1(adev);
	if (r)
		goto init_failed;

	r = amdgpu_device_fw_loading(adev);
	if (r)
		goto init_failed;

	r = amdgpu_device_ip_hw_init_phase2(adev);
	if (r)
		goto init_failed;

	if (adev->gmc.xgmi.num_physical_nodes > 1)
		amdgpu_xgmi_add_device(adev);
	amdgpu_amdkfd_device_init(adev);

init_failed:
	if (amdgpu_sriov_vf(adev)) {
		if (!r)
			amdgpu_virt_init_data_exchange(adev);
		amdgpu_virt_release_full_gpu(adev, true);
	}

	return r;
}

/**
 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
 *
 * @adev: amdgpu_device pointer
 *
 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
 * this function before a GPU reset. If the value is retained after a
 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
 */
static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
{
	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
}

/**
 * amdgpu_device_check_vram_lost - check if vram is valid
 *
 * @adev: amdgpu_device pointer
 *
 * Checks the reset magic value written to the gart pointer in VRAM.
 * The driver calls this after a GPU reset to see if the contents of
 * VRAM is lost or not.
 * returns true if vram is lost, false if not.
 */
static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
{
	return !!memcmp(adev->gart.ptr, adev->reset_magic,
			AMDGPU_RESET_MAGIC_NUM);
}

/**
 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
 *
 * @adev: amdgpu_device pointer
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * set_clockgating_state callbacks are run.
 * For late init, this pass enables clockgating for hardware IPs; for fini or
 * suspend, it disables clockgating for hardware IPs.
 * Returns 0 on success, negative error code on failure.
 */

static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
				      enum amd_clockgating_state state)
{
	int i, j, r;

	if (amdgpu_emu_mode == 1)
		return 0;

	for (j = 0; j < adev->num_ip_blocks; j++) {
		i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
		if (!adev->ip_blocks[i].status.late_initialized)
			continue;
		/* skip CG for VCE/UVD, it's handled specially */
		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
		    adev->ip_blocks[i].version->funcs->set_clockgating_state) {
			/* enable clockgating to save power */
			r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
										     state);
			if (r) {
				DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
	}

	return 0;
}

static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
{
	int i, j, r;

	if (amdgpu_emu_mode == 1)
		return 0;

	for (j = 0; j < adev->num_ip_blocks; j++) {
		i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
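		/* only touch IP blocks that completed late init */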
		if (!adev->ip_blocks[i].status.late_initialized)
			continue;
		/* skip PG for VCE/UVD, it's handled specially */
		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
			/* enable powergating to save power */
			r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
										     state);
			if (r) {
				DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
	}
	return 0;
}

static int amdgpu_device_enable_mgpu_fan_boost(void)
{
	struct amdgpu_gpu_instance *gpu_ins;
	struct amdgpu_device *adev;
	int i, ret = 0;

	mutex_lock(&mgpu_info.mutex);

	/*
	 * MGPU fan boost feature should be enabled
	 * only when there are two or more dGPUs in
	 * the system
	 */
	if (mgpu_info.num_dgpu < 2)
		goto out;

	for (i = 0; i < mgpu_info.num_dgpu; i++) {
		gpu_ins = &(mgpu_info.gpu_ins[i]);
		adev = gpu_ins->adev;
		if (!(adev->flags & AMD_IS_APU) &&
		    !gpu_ins->mgpu_fan_enabled &&
		    adev->powerplay.pp_funcs &&
		    adev->powerplay.pp_funcs->enable_mgpu_fan_boost) {
			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
			if (ret)
				break;

			gpu_ins->mgpu_fan_enabled = 1;
		}
	}

out:
	mutex_unlock(&mgpu_info.mutex);

	return ret;
}

/**
 * amdgpu_device_ip_late_init - run late init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Late initialization pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the late_init callbacks are run.
 * late_init covers any special initialization that an IP requires
 * after all of the IPs have been initialized or something that needs to happen
 * late in the init process.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
{
	int i = 0, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_init) {
			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
			if (r) {
				DRM_ERROR("late_init of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
		adev->ip_blocks[i].status.late_initialized = true;
	}

	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

	amdgpu_device_fill_reset_magic(adev);

	r = amdgpu_device_enable_mgpu_fan_boost();
	if (r)
		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

	/* set to low pstate by default */
	amdgpu_xgmi_set_pstate(adev, 0);

	return 0;
}

/**
 * amdgpu_device_ip_fini - run fini for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main teardown pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
 * are run. hw_fini tears down the hardware associated with each IP
hw_fini tears down the hardware associated with each IP 1992 * and sw_fini tears down any software state associated with each IP. 1993 * Returns 0 on success, negative error code on failure. 1994 */ 1995 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 1996 { 1997 int i, r; 1998 1999 amdgpu_ras_pre_fini(adev); 2000 2001 if (adev->gmc.xgmi.num_physical_nodes > 1) 2002 amdgpu_xgmi_remove_device(adev); 2003 2004 amdgpu_amdkfd_device_fini(adev); 2005 2006 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2007 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2008 2009 /* need to disable SMC first */ 2010 for (i = 0; i < adev->num_ip_blocks; i++) { 2011 if (!adev->ip_blocks[i].status.hw) 2012 continue; 2013 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2014 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2015 /* XXX handle errors */ 2016 if (r) { 2017 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2018 adev->ip_blocks[i].version->funcs->name, r); 2019 } 2020 adev->ip_blocks[i].status.hw = false; 2021 break; 2022 } 2023 } 2024 2025 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2026 if (!adev->ip_blocks[i].status.hw) 2027 continue; 2028 2029 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2030 /* XXX handle errors */ 2031 if (r) { 2032 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2033 adev->ip_blocks[i].version->funcs->name, r); 2034 } 2035 2036 adev->ip_blocks[i].status.hw = false; 2037 } 2038 2039 2040 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2041 if (!adev->ip_blocks[i].status.sw) 2042 continue; 2043 2044 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2045 amdgpu_ucode_free_bo(adev); 2046 amdgpu_free_static_csa(&adev->virt.csa_obj); 2047 amdgpu_device_wb_fini(adev); 2048 amdgpu_device_vram_scratch_fini(adev); 2049 amdgpu_ib_pool_fini(adev); 2050 } 2051 2052 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2053 /* XXX handle errors */ 2054 if (r) { 2055 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2056 adev->ip_blocks[i].version->funcs->name, r); 2057 } 2058 adev->ip_blocks[i].status.sw = false; 2059 adev->ip_blocks[i].status.valid = false; 2060 } 2061 2062 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2063 if (!adev->ip_blocks[i].status.late_initialized) 2064 continue; 2065 if (adev->ip_blocks[i].version->funcs->late_fini) 2066 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2067 adev->ip_blocks[i].status.late_initialized = false; 2068 } 2069 2070 amdgpu_ras_fini(adev); 2071 2072 if (amdgpu_sriov_vf(adev)) 2073 if (amdgpu_virt_release_full_gpu(adev, false)) 2074 DRM_ERROR("failed to release exclusive mode on fini\n"); 2075 2076 return 0; 2077 } 2078 2079 /** 2080 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2081 * 2082 * @work: work_struct. 
2083 */ 2084 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2085 { 2086 struct amdgpu_device *adev = 2087 container_of(work, struct amdgpu_device, delayed_init_work.work); 2088 int r; 2089 2090 r = amdgpu_ib_ring_tests(adev); 2091 if (r) 2092 DRM_ERROR("ib ring test failed (%d).\n", r); 2093 } 2094 2095 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2096 { 2097 struct amdgpu_device *adev = 2098 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2099 2100 mutex_lock(&adev->gfx.gfx_off_mutex); 2101 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { 2102 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2103 adev->gfx.gfx_off_state = true; 2104 } 2105 mutex_unlock(&adev->gfx.gfx_off_mutex); 2106 } 2107 2108 /** 2109 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2110 * 2111 * @adev: amdgpu_device pointer 2112 * 2113 * Main suspend function for hardware IPs. The list of all the hardware 2114 * IPs that make up the asic is walked, clockgating is disabled and the 2115 * suspend callbacks are run. suspend puts the hardware and software state 2116 * in each IP into a state suitable for suspend. 2117 * Returns 0 on success, negative error code on failure. 2118 */ 2119 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2120 { 2121 int i, r; 2122 2123 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2124 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2125 2126 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2127 if (!adev->ip_blocks[i].status.valid) 2128 continue; 2129 /* displays are handled separately */ 2130 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) { 2131 /* XXX handle errors */ 2132 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2133 /* XXX handle errors */ 2134 if (r) { 2135 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2136 adev->ip_blocks[i].version->funcs->name, r); 2137 } 2138 } 2139 } 2140 2141 return 0; 2142 } 2143 2144 /** 2145 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2146 * 2147 * @adev: amdgpu_device pointer 2148 * 2149 * Main suspend function for hardware IPs. The list of all the hardware 2150 * IPs that make up the asic is walked, clockgating is disabled and the 2151 * suspend callbacks are run. suspend puts the hardware and software state 2152 * in each IP into a state suitable for suspend. 2153 * Returns 0 on success, negative error code on failure. 2154 */ 2155 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2156 { 2157 int i, r; 2158 2159 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2160 if (!adev->ip_blocks[i].status.valid) 2161 continue; 2162 /* displays are handled in phase1 */ 2163 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2164 continue; 2165 /* XXX handle errors */ 2166 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2167 /* XXX handle errors */ 2168 if (r) { 2169 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2170 adev->ip_blocks[i].version->funcs->name, r); 2171 } 2172 } 2173 2174 return 0; 2175 } 2176 2177 /** 2178 * amdgpu_device_ip_suspend - run suspend for hardware IPs 2179 * 2180 * @adev: amdgpu_device pointer 2181 * 2182 * Main suspend function for hardware IPs. The list of all the hardware 2183 * IPs that make up the asic is walked, clockgating is disabled and the 2184 * suspend callbacks are run. 
suspend puts the hardware and software state 2185 * in each IP into a state suitable for suspend. 2186 * Returns 0 on success, negative error code on failure. 2187 */ 2188 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 2189 { 2190 int r; 2191 2192 if (amdgpu_sriov_vf(adev)) 2193 amdgpu_virt_request_full_gpu(adev, false); 2194 2195 r = amdgpu_device_ip_suspend_phase1(adev); 2196 if (r) 2197 return r; 2198 r = amdgpu_device_ip_suspend_phase2(adev); 2199 2200 if (amdgpu_sriov_vf(adev)) 2201 amdgpu_virt_release_full_gpu(adev, false); 2202 2203 return r; 2204 } 2205 2206 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 2207 { 2208 int i, r; 2209 2210 static enum amd_ip_block_type ip_order[] = { 2211 AMD_IP_BLOCK_TYPE_GMC, 2212 AMD_IP_BLOCK_TYPE_COMMON, 2213 AMD_IP_BLOCK_TYPE_PSP, 2214 AMD_IP_BLOCK_TYPE_IH, 2215 }; 2216 2217 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2218 int j; 2219 struct amdgpu_ip_block *block; 2220 2221 for (j = 0; j < adev->num_ip_blocks; j++) { 2222 block = &adev->ip_blocks[j]; 2223 2224 if (block->version->type != ip_order[i] || 2225 !block->status.valid) 2226 continue; 2227 2228 r = block->version->funcs->hw_init(adev); 2229 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2230 if (r) 2231 return r; 2232 } 2233 } 2234 2235 return 0; 2236 } 2237 2238 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 2239 { 2240 int i, r; 2241 2242 static enum amd_ip_block_type ip_order[] = { 2243 AMD_IP_BLOCK_TYPE_SMC, 2244 AMD_IP_BLOCK_TYPE_DCE, 2245 AMD_IP_BLOCK_TYPE_GFX, 2246 AMD_IP_BLOCK_TYPE_SDMA, 2247 AMD_IP_BLOCK_TYPE_UVD, 2248 AMD_IP_BLOCK_TYPE_VCE 2249 }; 2250 2251 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2252 int j; 2253 struct amdgpu_ip_block *block; 2254 2255 for (j = 0; j < adev->num_ip_blocks; j++) { 2256 block = &adev->ip_blocks[j]; 2257 2258 if (block->version->type != ip_order[i] || 2259 !block->status.valid) 2260 continue; 2261 2262 r = block->version->funcs->hw_init(adev); 2263 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2264 if (r) 2265 return r; 2266 } 2267 } 2268 2269 return 0; 2270 } 2271 2272 /** 2273 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 2274 * 2275 * @adev: amdgpu_device pointer 2276 * 2277 * First resume function for hardware IPs. The list of all the hardware 2278 * IPs that make up the asic is walked and the resume callbacks are run for 2279 * COMMON, GMC, and IH. resume puts the hardware into a functional state 2280 * after a suspend and updates the software state as necessary. This 2281 * function is also used for restoring the GPU after a GPU reset. 2282 * Returns 0 on success, negative error code on failure. 
2283 */ 2284 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 2285 { 2286 int i, r; 2287 2288 for (i = 0; i < adev->num_ip_blocks; i++) { 2289 if (!adev->ip_blocks[i].status.valid) 2290 continue; 2291 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2292 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2293 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2294 r = adev->ip_blocks[i].version->funcs->resume(adev); 2295 if (r) { 2296 DRM_ERROR("resume of IP block <%s> failed %d\n", 2297 adev->ip_blocks[i].version->funcs->name, r); 2298 return r; 2299 } 2300 } 2301 } 2302 2303 return 0; 2304 } 2305 2306 /** 2307 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 2308 * 2309 * @adev: amdgpu_device pointer 2310 * 2311 * Second resume function for hardware IPs. The list of all the hardware 2312 * IPs that make up the asic is walked and the resume callbacks are run for 2313 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 2314 * functional state after a suspend and updates the software state as 2315 * necessary. This function is also used for restoring the GPU after a GPU 2316 * reset. 2317 * Returns 0 on success, negative error code on failure. 2318 */ 2319 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 2320 { 2321 int i, r; 2322 2323 for (i = 0; i < adev->num_ip_blocks; i++) { 2324 if (!adev->ip_blocks[i].status.valid) 2325 continue; 2326 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2327 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2328 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 2329 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 2330 continue; 2331 r = adev->ip_blocks[i].version->funcs->resume(adev); 2332 if (r) { 2333 DRM_ERROR("resume of IP block <%s> failed %d\n", 2334 adev->ip_blocks[i].version->funcs->name, r); 2335 return r; 2336 } 2337 } 2338 2339 return 0; 2340 } 2341 2342 /** 2343 * amdgpu_device_ip_resume - run resume for hardware IPs 2344 * 2345 * @adev: amdgpu_device pointer 2346 * 2347 * Main resume function for hardware IPs. The hardware IPs 2348 * are split into two resume functions because they are 2349 * also used in recovering from a GPU reset and some additional 2350 * steps need to be taken between them. In this case (S3/S4) they are 2351 * run sequentially. 2352 * Returns 0 on success, negative error code on failure. 2353 */ 2354 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 2355 { 2356 int r; 2357 2358 r = amdgpu_device_ip_resume_phase1(adev); 2359 if (r) 2360 return r; 2361 2362 r = amdgpu_device_fw_loading(adev); 2363 if (r) 2364 return r; 2365 2366 r = amdgpu_device_ip_resume_phase2(adev); 2367 2368 return r; 2369 } 2370 2371 /** 2372 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 2373 * 2374 * @adev: amdgpu_device pointer 2375 * 2376 * Query the VBIOS data tables to determine if the board supports SR-IOV.
2377 */ 2378 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 2379 { 2380 if (amdgpu_sriov_vf(adev)) { 2381 if (adev->is_atom_fw) { 2382 if (amdgpu_atomfirmware_gpu_supports_virtualization(adev)) 2383 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 2384 } else { 2385 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 2386 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 2387 } 2388 2389 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 2390 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 2391 } 2392 } 2393 2394 /** 2395 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 2396 * 2397 * @asic_type: AMD asic type 2398 * 2399 * Check if there is DC (new modesetting infrastructure) support for an asic. 2400 * Returns true if DC has support, false if not. 2401 */ 2402 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 2403 { 2404 switch (asic_type) { 2405 #if defined(CONFIG_DRM_AMD_DC) 2406 case CHIP_BONAIRE: 2407 case CHIP_KAVERI: 2408 case CHIP_KABINI: 2409 case CHIP_MULLINS: 2410 /* 2411 * We have systems in the wild with these ASICs that require 2412 * LVDS and VGA support which is not supported with DC. 2413 * 2414 * Fallback to the non-DC driver here by default so as not to 2415 * cause regressions. 2416 */ 2417 return amdgpu_dc > 0; 2418 case CHIP_HAWAII: 2419 case CHIP_CARRIZO: 2420 case CHIP_STONEY: 2421 case CHIP_POLARIS10: 2422 case CHIP_POLARIS11: 2423 case CHIP_POLARIS12: 2424 case CHIP_VEGAM: 2425 case CHIP_TONGA: 2426 case CHIP_FIJI: 2427 case CHIP_VEGA10: 2428 case CHIP_VEGA12: 2429 case CHIP_VEGA20: 2430 #if defined(CONFIG_DRM_AMD_DC_DCN1_0) 2431 case CHIP_RAVEN: 2432 #endif 2433 #if defined(CONFIG_DRM_AMD_DC_DCN2_0) 2434 case CHIP_NAVI10: 2435 case CHIP_NAVI14: 2436 #endif 2437 return amdgpu_dc != 0; 2438 #endif 2439 default: 2440 return false; 2441 } 2442 } 2443 2444 /** 2445 * amdgpu_device_has_dc_support - check if dc is supported 2446 * 2447 * @adev: amdgpu_device pointer 2448 * 2449 * Returns true for supported, false for not supported 2450 */ 2451 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 2452 { 2453 if (amdgpu_sriov_vf(adev)) 2454 return false; 2455 2456 return amdgpu_device_asic_has_dc_support(adev->asic_type); 2457 } 2458 2459 2460 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 2461 { 2462 struct amdgpu_device *adev = 2463 container_of(__work, struct amdgpu_device, xgmi_reset_work); 2464 2465 adev->asic_reset_res = amdgpu_asic_reset(adev); 2466 if (adev->asic_reset_res) 2467 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 2468 adev->asic_reset_res, adev->ddev->unique); 2469 } 2470 2471 2472 /** 2473 * amdgpu_device_init - initialize the driver 2474 * 2475 * @adev: amdgpu_device pointer 2476 * @ddev: drm dev pointer 2477 * @pdev: pci dev pointer 2478 * @flags: driver flags 2479 * 2480 * Initializes the driver info and hw (all asics). 2481 * Returns 0 for success or an error on failure. 2482 * Called at driver startup.
2483 */ 2484 int amdgpu_device_init(struct amdgpu_device *adev, 2485 struct drm_device *ddev, 2486 struct pci_dev *pdev, 2487 uint32_t flags) 2488 { 2489 int r, i; 2490 bool runtime = false; 2491 u32 max_MBps; 2492 2493 adev->shutdown = false; 2494 adev->dev = &pdev->dev; 2495 adev->ddev = ddev; 2496 adev->pdev = pdev; 2497 adev->flags = flags; 2498 adev->asic_type = flags & AMD_ASIC_MASK; 2499 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 2500 if (amdgpu_emu_mode == 1) 2501 adev->usec_timeout *= 2; 2502 adev->gmc.gart_size = 512 * 1024 * 1024; 2503 adev->accel_working = false; 2504 adev->num_rings = 0; 2505 adev->mman.buffer_funcs = NULL; 2506 adev->mman.buffer_funcs_ring = NULL; 2507 adev->vm_manager.vm_pte_funcs = NULL; 2508 adev->vm_manager.vm_pte_num_rqs = 0; 2509 adev->gmc.gmc_funcs = NULL; 2510 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 2511 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 2512 2513 adev->smc_rreg = &amdgpu_invalid_rreg; 2514 adev->smc_wreg = &amdgpu_invalid_wreg; 2515 adev->pcie_rreg = &amdgpu_invalid_rreg; 2516 adev->pcie_wreg = &amdgpu_invalid_wreg; 2517 adev->pciep_rreg = &amdgpu_invalid_rreg; 2518 adev->pciep_wreg = &amdgpu_invalid_wreg; 2519 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 2520 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 2521 adev->didt_rreg = &amdgpu_invalid_rreg; 2522 adev->didt_wreg = &amdgpu_invalid_wreg; 2523 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 2524 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 2525 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 2526 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 2527 2528 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 2529 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 2530 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 2531 2532 /* mutex initialization are all done here so we 2533 * can recall function without having locking issues */ 2534 atomic_set(&adev->irq.ih.lock, 0); 2535 mutex_init(&adev->firmware.mutex); 2536 mutex_init(&adev->pm.mutex); 2537 mutex_init(&adev->gfx.gpu_clock_mutex); 2538 mutex_init(&adev->srbm_mutex); 2539 mutex_init(&adev->gfx.pipe_reserve_mutex); 2540 mutex_init(&adev->gfx.gfx_off_mutex); 2541 mutex_init(&adev->grbm_idx_mutex); 2542 mutex_init(&adev->mn_lock); 2543 mutex_init(&adev->virt.vf_errors.lock); 2544 hash_init(adev->mn_hash); 2545 mutex_init(&adev->lock_reset); 2546 mutex_init(&adev->virt.dpm_mutex); 2547 mutex_init(&adev->psp.mutex); 2548 2549 r = amdgpu_device_check_arguments(adev); 2550 if (r) 2551 return r; 2552 2553 spin_lock_init(&adev->mmio_idx_lock); 2554 spin_lock_init(&adev->smc_idx_lock); 2555 spin_lock_init(&adev->pcie_idx_lock); 2556 spin_lock_init(&adev->uvd_ctx_idx_lock); 2557 spin_lock_init(&adev->didt_idx_lock); 2558 spin_lock_init(&adev->gc_cac_idx_lock); 2559 spin_lock_init(&adev->se_cac_idx_lock); 2560 spin_lock_init(&adev->audio_endpt_idx_lock); 2561 spin_lock_init(&adev->mm_stats.lock); 2562 2563 INIT_LIST_HEAD(&adev->shadow_list); 2564 mutex_init(&adev->shadow_list_lock); 2565 2566 INIT_LIST_HEAD(&adev->ring_lru_list); 2567 spin_lock_init(&adev->ring_lru_list_lock); 2568 2569 INIT_DELAYED_WORK(&adev->delayed_init_work, 2570 amdgpu_device_delayed_init_work_handler); 2571 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 2572 amdgpu_device_delay_enable_gfx_off); 2573 2574 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 2575 2576 adev->gfx.gfx_off_req_count = 1; 2577 adev->pm.ac_power = 
power_supply_is_system_supplied() > 0 ? true : false; 2578 2579 /* Registers mapping */ 2580 /* TODO: block userspace mapping of io register */ 2581 if (adev->asic_type >= CHIP_BONAIRE) { 2582 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 2583 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 2584 } else { 2585 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 2586 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 2587 } 2588 2589 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 2590 if (adev->rmmio == NULL) { 2591 return -ENOMEM; 2592 } 2593 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 2594 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 2595 2596 /* io port mapping */ 2597 for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { 2598 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) { 2599 adev->rio_mem_size = pci_resource_len(adev->pdev, i); 2600 adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size); 2601 break; 2602 } 2603 } 2604 if (adev->rio_mem == NULL) 2605 DRM_INFO("PCI I/O BAR is not found.\n"); 2606 2607 /* enable PCIE atomic ops */ 2608 r = pci_enable_atomic_ops_to_root(adev->pdev, 2609 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 2610 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 2611 if (r) { 2612 adev->have_atomics_support = false; 2613 DRM_INFO("PCIE atomic ops is not supported\n"); 2614 } else { 2615 adev->have_atomics_support = true; 2616 } 2617 2618 amdgpu_device_get_pcie_info(adev); 2619 2620 if (amdgpu_mcbp) 2621 DRM_INFO("MCBP is enabled\n"); 2622 2623 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) 2624 adev->enable_mes = true; 2625 2626 if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10) { 2627 r = amdgpu_discovery_init(adev); 2628 if (r) { 2629 dev_err(adev->dev, "amdgpu_discovery_init failed\n"); 2630 return r; 2631 } 2632 } 2633 2634 /* early init functions */ 2635 r = amdgpu_device_ip_early_init(adev); 2636 if (r) 2637 return r; 2638 2639 /* doorbell bar mapping and doorbell index init*/ 2640 amdgpu_device_doorbell_init(adev); 2641 2642 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 2643 /* this will fail for cards that aren't VGA class devices, just 2644 * ignore it */ 2645 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode); 2646 2647 if (amdgpu_device_is_px(ddev)) 2648 runtime = true; 2649 if (!pci_is_thunderbolt_attached(adev->pdev)) 2650 vga_switcheroo_register_client(adev->pdev, 2651 &amdgpu_switcheroo_ops, runtime); 2652 if (runtime) 2653 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 2654 2655 if (amdgpu_emu_mode == 1) { 2656 /* post the asic on emulation mode */ 2657 emu_soc_asic_init(adev); 2658 goto fence_driver_init; 2659 } 2660 2661 /* detect if we are with an SRIOV vbios */ 2662 amdgpu_device_detect_sriov_bios(adev); 2663 2664 /* check if we need to reset the asic 2665 * E.g., driver was not cleanly unloaded previously, etc. 
2666 */ 2667 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 2668 r = amdgpu_asic_reset(adev); 2669 if (r) { 2670 dev_err(adev->dev, "asic reset on init failed\n"); 2671 goto failed; 2672 } 2673 } 2674 2675 /* Post card if necessary */ 2676 if (amdgpu_device_need_post(adev)) { 2677 if (!adev->bios) { 2678 dev_err(adev->dev, "no vBIOS found\n"); 2679 r = -EINVAL; 2680 goto failed; 2681 } 2682 DRM_INFO("GPU posting now...\n"); 2683 r = amdgpu_atom_asic_init(adev->mode_info.atom_context); 2684 if (r) { 2685 dev_err(adev->dev, "gpu post error!\n"); 2686 goto failed; 2687 } 2688 } 2689 2690 if (adev->is_atom_fw) { 2691 /* Initialize clocks */ 2692 r = amdgpu_atomfirmware_get_clock_info(adev); 2693 if (r) { 2694 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 2695 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 2696 goto failed; 2697 } 2698 } else { 2699 /* Initialize clocks */ 2700 r = amdgpu_atombios_get_clock_info(adev); 2701 if (r) { 2702 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 2703 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 2704 goto failed; 2705 } 2706 /* init i2c buses */ 2707 if (!amdgpu_device_has_dc_support(adev)) 2708 amdgpu_atombios_i2c_init(adev); 2709 } 2710 2711 fence_driver_init: 2712 /* Fence driver */ 2713 r = amdgpu_fence_driver_init(adev); 2714 if (r) { 2715 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n"); 2716 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 2717 goto failed; 2718 } 2719 2720 /* init the mode config */ 2721 drm_mode_config_init(adev->ddev); 2722 2723 r = amdgpu_device_ip_init(adev); 2724 if (r) { 2725 /* failed in exclusive mode due to timeout */ 2726 if (amdgpu_sriov_vf(adev) && 2727 !amdgpu_sriov_runtime(adev) && 2728 amdgpu_virt_mmio_blocked(adev) && 2729 !amdgpu_virt_wait_reset(adev)) { 2730 dev_err(adev->dev, "VF exclusive mode timeout\n"); 2731 /* Don't send request since VF is inactive. */ 2732 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 2733 adev->virt.ops = NULL; 2734 r = -EAGAIN; 2735 goto failed; 2736 } 2737 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 2738 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 2739 if (amdgpu_virt_request_full_gpu(adev, false)) 2740 amdgpu_virt_release_full_gpu(adev, false); 2741 goto failed; 2742 } 2743 2744 adev->accel_working = true; 2745 2746 amdgpu_vm_check_compute_bug(adev); 2747 2748 /* Initialize the buffer migration limit. */ 2749 if (amdgpu_moverate >= 0) 2750 max_MBps = amdgpu_moverate; 2751 else 2752 max_MBps = 8; /* Allow 8 MB/s. */ 2753 /* Get a log2 for easy divisions. 
*/ 2754 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 2755 2756 amdgpu_fbdev_init(adev); 2757 2758 if (amdgpu_sriov_vf(adev) && amdgim_is_hwperf(adev)) 2759 amdgpu_pm_virt_sysfs_init(adev); 2760 2761 r = amdgpu_pm_sysfs_init(adev); 2762 if (r) 2763 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 2764 2765 r = amdgpu_ucode_sysfs_init(adev); 2766 if (r) 2767 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 2768 2769 r = amdgpu_debugfs_gem_init(adev); 2770 if (r) 2771 DRM_ERROR("registering gem debugfs failed (%d).\n", r); 2772 2773 r = amdgpu_debugfs_regs_init(adev); 2774 if (r) 2775 DRM_ERROR("registering register debugfs failed (%d).\n", r); 2776 2777 r = amdgpu_debugfs_firmware_init(adev); 2778 if (r) 2779 DRM_ERROR("registering firmware debugfs failed (%d).\n", r); 2780 2781 r = amdgpu_debugfs_init(adev); 2782 if (r) 2783 DRM_ERROR("Creating debugfs files failed (%d).\n", r); 2784 2785 if ((amdgpu_testing & 1)) { 2786 if (adev->accel_working) 2787 amdgpu_test_moves(adev); 2788 else 2789 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n"); 2790 } 2791 if (amdgpu_benchmarking) { 2792 if (adev->accel_working) 2793 amdgpu_benchmark(adev, amdgpu_benchmarking); 2794 else 2795 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); 2796 } 2797 2798 /* enable clockgating, etc. after ib tests, etc. since some blocks require 2799 * explicit gating rather than handling it automatically. 2800 */ 2801 r = amdgpu_device_ip_late_init(adev); 2802 if (r) { 2803 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 2804 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 2805 goto failed; 2806 } 2807 2808 /* must succeed. */ 2809 amdgpu_ras_resume(adev); 2810 2811 queue_delayed_work(system_wq, &adev->delayed_init_work, 2812 msecs_to_jiffies(AMDGPU_RESUME_MS)); 2813 2814 r = device_create_file(adev->dev, &dev_attr_pcie_replay_count); 2815 if (r) { 2816 dev_err(adev->dev, "Could not create pcie_replay_count"); 2817 return r; 2818 } 2819 2820 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 2821 r = amdgpu_pmu_init(adev); 2822 if (r) 2823 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 2824 2825 return 0; 2826 2827 failed: 2828 amdgpu_vf_error_trans_all(adev); 2829 if (runtime) 2830 vga_switcheroo_fini_domain_pm_ops(adev->dev); 2831 2832 return r; 2833 } 2834 2835 /** 2836 * amdgpu_device_fini - tear down the driver 2837 * 2838 * @adev: amdgpu_device pointer 2839 * 2840 * Tear down the driver info (all asics). 2841 * Called at driver shutdown. 
2842 */ 2843 void amdgpu_device_fini(struct amdgpu_device *adev) 2844 { 2845 int r; 2846 2847 DRM_INFO("amdgpu: finishing device.\n"); 2848 adev->shutdown = true; 2849 /* disable all interrupts */ 2850 amdgpu_irq_disable_all(adev); 2851 if (adev->mode_info.mode_config_initialized){ 2852 if (!amdgpu_device_has_dc_support(adev)) 2853 drm_helper_force_disable_all(adev->ddev); 2854 else 2855 drm_atomic_helper_shutdown(adev->ddev); 2856 } 2857 amdgpu_fence_driver_fini(adev); 2858 amdgpu_pm_sysfs_fini(adev); 2859 amdgpu_fbdev_fini(adev); 2860 r = amdgpu_device_ip_fini(adev); 2861 if (adev->firmware.gpu_info_fw) { 2862 release_firmware(adev->firmware.gpu_info_fw); 2863 adev->firmware.gpu_info_fw = NULL; 2864 } 2865 adev->accel_working = false; 2866 cancel_delayed_work_sync(&adev->delayed_init_work); 2867 /* free i2c buses */ 2868 if (!amdgpu_device_has_dc_support(adev)) 2869 amdgpu_i2c_fini(adev); 2870 2871 if (amdgpu_emu_mode != 1) 2872 amdgpu_atombios_fini(adev); 2873 2874 kfree(adev->bios); 2875 adev->bios = NULL; 2876 if (!pci_is_thunderbolt_attached(adev->pdev)) 2877 vga_switcheroo_unregister_client(adev->pdev); 2878 if (adev->flags & AMD_IS_PX) 2879 vga_switcheroo_fini_domain_pm_ops(adev->dev); 2880 vga_client_register(adev->pdev, NULL, NULL, NULL); 2881 if (adev->rio_mem) 2882 pci_iounmap(adev->pdev, adev->rio_mem); 2883 adev->rio_mem = NULL; 2884 iounmap(adev->rmmio); 2885 adev->rmmio = NULL; 2886 amdgpu_device_doorbell_fini(adev); 2887 if (amdgpu_sriov_vf(adev) && amdgim_is_hwperf(adev)) 2888 amdgpu_pm_virt_sysfs_fini(adev); 2889 2890 amdgpu_debugfs_regs_cleanup(adev); 2891 device_remove_file(adev->dev, &dev_attr_pcie_replay_count); 2892 amdgpu_ucode_sysfs_fini(adev); 2893 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 2894 amdgpu_pmu_fini(adev); 2895 amdgpu_debugfs_preempt_cleanup(adev); 2896 if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10) 2897 amdgpu_discovery_fini(adev); 2898 } 2899 2900 2901 /* 2902 * Suspend & resume. 2903 */ 2904 /** 2905 * amdgpu_device_suspend - initiate device suspend 2906 * 2907 * @dev: drm dev pointer 2908 * @suspend: suspend state 2909 * @fbcon : notify the fbdev of suspend 2910 * 2911 * Puts the hw in the suspend state (all asics). 2912 * Returns 0 for success or an error on failure. 2913 * Called at driver suspend. 
2914 */ 2915 int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon) 2916 { 2917 struct amdgpu_device *adev; 2918 struct drm_crtc *crtc; 2919 struct drm_connector *connector; 2920 int r; 2921 2922 if (dev == NULL || dev->dev_private == NULL) { 2923 return -ENODEV; 2924 } 2925 2926 adev = dev->dev_private; 2927 2928 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 2929 return 0; 2930 2931 adev->in_suspend = true; 2932 drm_kms_helper_poll_disable(dev); 2933 2934 if (fbcon) 2935 amdgpu_fbdev_set_suspend(adev, 1); 2936 2937 cancel_delayed_work_sync(&adev->delayed_init_work); 2938 2939 if (!amdgpu_device_has_dc_support(adev)) { 2940 /* turn off display hw */ 2941 drm_modeset_lock_all(dev); 2942 list_for_each_entry(connector, &dev->mode_config.connector_list, head) { 2943 drm_helper_connector_dpms(connector, DRM_MODE_DPMS_OFF); 2944 } 2945 drm_modeset_unlock_all(dev); 2946 /* unpin the front buffers and cursors */ 2947 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 2948 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 2949 struct drm_framebuffer *fb = crtc->primary->fb; 2950 struct amdgpu_bo *robj; 2951 2952 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 2953 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 2954 r = amdgpu_bo_reserve(aobj, true); 2955 if (r == 0) { 2956 amdgpu_bo_unpin(aobj); 2957 amdgpu_bo_unreserve(aobj); 2958 } 2959 } 2960 2961 if (fb == NULL || fb->obj[0] == NULL) { 2962 continue; 2963 } 2964 robj = gem_to_amdgpu_bo(fb->obj[0]); 2965 /* don't unpin kernel fb objects */ 2966 if (!amdgpu_fbdev_robj_is_fb(adev, robj)) { 2967 r = amdgpu_bo_reserve(robj, true); 2968 if (r == 0) { 2969 amdgpu_bo_unpin(robj); 2970 amdgpu_bo_unreserve(robj); 2971 } 2972 } 2973 } 2974 } 2975 2976 amdgpu_amdkfd_suspend(adev); 2977 2978 amdgpu_ras_suspend(adev); 2979 2980 r = amdgpu_device_ip_suspend_phase1(adev); 2981 2982 /* evict vram memory */ 2983 amdgpu_bo_evict_vram(adev); 2984 2985 amdgpu_fence_driver_suspend(adev); 2986 2987 r = amdgpu_device_ip_suspend_phase2(adev); 2988 2989 /* evict remaining vram memory 2990 * This second call to evict vram is to evict the gart page table 2991 * using the CPU. 2992 */ 2993 amdgpu_bo_evict_vram(adev); 2994 2995 pci_save_state(dev->pdev); 2996 if (suspend) { 2997 /* Shut down the device */ 2998 pci_disable_device(dev->pdev); 2999 pci_set_power_state(dev->pdev, PCI_D3hot); 3000 } else { 3001 r = amdgpu_asic_reset(adev); 3002 if (r) 3003 DRM_ERROR("amdgpu asic reset failed\n"); 3004 } 3005 3006 return 0; 3007 } 3008 3009 /** 3010 * amdgpu_device_resume - initiate device resume 3011 * 3012 * @dev: drm dev pointer 3013 * @resume: resume state 3014 * @fbcon : notify the fbdev of resume 3015 * 3016 * Bring the hw back to operating state (all asics). 3017 * Returns 0 for success or an error on failure. 3018 * Called at driver resume. 
3019 */ 3020 int amdgpu_device_resume(struct drm_device *dev, bool resume, bool fbcon) 3021 { 3022 struct drm_connector *connector; 3023 struct amdgpu_device *adev = dev->dev_private; 3024 struct drm_crtc *crtc; 3025 int r = 0; 3026 3027 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3028 return 0; 3029 3030 if (resume) { 3031 pci_set_power_state(dev->pdev, PCI_D0); 3032 pci_restore_state(dev->pdev); 3033 r = pci_enable_device(dev->pdev); 3034 if (r) 3035 return r; 3036 } 3037 3038 /* post card */ 3039 if (amdgpu_device_need_post(adev)) { 3040 r = amdgpu_atom_asic_init(adev->mode_info.atom_context); 3041 if (r) 3042 DRM_ERROR("amdgpu asic init failed\n"); 3043 } 3044 3045 r = amdgpu_device_ip_resume(adev); 3046 if (r) { 3047 DRM_ERROR("amdgpu_device_ip_resume failed (%d).\n", r); 3048 return r; 3049 } 3050 amdgpu_fence_driver_resume(adev); 3051 3052 3053 r = amdgpu_device_ip_late_init(adev); 3054 if (r) 3055 return r; 3056 3057 queue_delayed_work(system_wq, &adev->delayed_init_work, 3058 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3059 3060 if (!amdgpu_device_has_dc_support(adev)) { 3061 /* pin cursors */ 3062 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3063 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3064 3065 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3066 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3067 r = amdgpu_bo_reserve(aobj, true); 3068 if (r == 0) { 3069 r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM); 3070 if (r != 0) 3071 DRM_ERROR("Failed to pin cursor BO (%d)\n", r); 3072 amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj); 3073 amdgpu_bo_unreserve(aobj); 3074 } 3075 } 3076 } 3077 } 3078 r = amdgpu_amdkfd_resume(adev); 3079 if (r) 3080 return r; 3081 3082 /* Make sure IB tests flushed */ 3083 flush_delayed_work(&adev->delayed_init_work); 3084 3085 /* blat the mode back in */ 3086 if (fbcon) { 3087 if (!amdgpu_device_has_dc_support(adev)) { 3088 /* pre DCE11 */ 3089 drm_helper_resume_force_mode(dev); 3090 3091 /* turn on display hw */ 3092 drm_modeset_lock_all(dev); 3093 list_for_each_entry(connector, &dev->mode_config.connector_list, head) { 3094 drm_helper_connector_dpms(connector, DRM_MODE_DPMS_ON); 3095 } 3096 drm_modeset_unlock_all(dev); 3097 } 3098 amdgpu_fbdev_set_suspend(adev, 0); 3099 } 3100 3101 drm_kms_helper_poll_enable(dev); 3102 3103 amdgpu_ras_resume(adev); 3104 3105 /* 3106 * Most of the connector probing functions try to acquire runtime pm 3107 * refs to ensure that the GPU is powered on when connector polling is 3108 * performed. Since we're calling this from a runtime PM callback, 3109 * trying to acquire rpm refs will cause us to deadlock. 3110 * 3111 * Since we're guaranteed to be holding the rpm lock, it's safe to 3112 * temporarily disable the rpm helpers so this doesn't deadlock us. 3113 */ 3114 #ifdef CONFIG_PM 3115 dev->dev->power.disable_depth++; 3116 #endif 3117 if (!amdgpu_device_has_dc_support(adev)) 3118 drm_helper_hpd_irq_event(dev); 3119 else 3120 drm_kms_helper_hotplug_event(dev); 3121 #ifdef CONFIG_PM 3122 dev->dev->power.disable_depth--; 3123 #endif 3124 adev->in_suspend = false; 3125 3126 return 0; 3127 } 3128 3129 /** 3130 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 3131 * 3132 * @adev: amdgpu_device pointer 3133 * 3134 * The list of all the hardware IPs that make up the asic is walked and 3135 * the check_soft_reset callbacks are run. check_soft_reset determines 3136 * if the asic is still hung or not. 
3137 * Returns true if any of the IPs are still in a hung state, false if not. 3138 */ 3139 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 3140 { 3141 int i; 3142 bool asic_hang = false; 3143 3144 if (amdgpu_sriov_vf(adev)) 3145 return true; 3146 3147 if (amdgpu_asic_need_full_reset(adev)) 3148 return true; 3149 3150 for (i = 0; i < adev->num_ip_blocks; i++) { 3151 if (!adev->ip_blocks[i].status.valid) 3152 continue; 3153 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 3154 adev->ip_blocks[i].status.hang = 3155 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 3156 if (adev->ip_blocks[i].status.hang) { 3157 DRM_INFO("IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 3158 asic_hang = true; 3159 } 3160 } 3161 return asic_hang; 3162 } 3163 3164 /** 3165 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 3166 * 3167 * @adev: amdgpu_device pointer 3168 * 3169 * The list of all the hardware IPs that make up the asic is walked and the 3170 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 3171 * handles any IP specific hardware or software state changes that are 3172 * necessary for a soft reset to succeed. 3173 * Returns 0 on success, negative error code on failure. 3174 */ 3175 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 3176 { 3177 int i, r = 0; 3178 3179 for (i = 0; i < adev->num_ip_blocks; i++) { 3180 if (!adev->ip_blocks[i].status.valid) 3181 continue; 3182 if (adev->ip_blocks[i].status.hang && 3183 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 3184 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 3185 if (r) 3186 return r; 3187 } 3188 } 3189 3190 return 0; 3191 } 3192 3193 /** 3194 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 3195 * 3196 * @adev: amdgpu_device pointer 3197 * 3198 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 3199 * reset is necessary to recover. 3200 * Returns true if a full asic reset is required, false if not. 3201 */ 3202 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 3203 { 3204 int i; 3205 3206 if (amdgpu_asic_need_full_reset(adev)) 3207 return true; 3208 3209 for (i = 0; i < adev->num_ip_blocks; i++) { 3210 if (!adev->ip_blocks[i].status.valid) 3211 continue; 3212 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 3213 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 3214 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 3215 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 3216 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3217 if (adev->ip_blocks[i].status.hang) { 3218 DRM_INFO("Some block need full reset!\n"); 3219 return true; 3220 } 3221 } 3222 } 3223 return false; 3224 } 3225 3226 /** 3227 * amdgpu_device_ip_soft_reset - do a soft reset 3228 * 3229 * @adev: amdgpu_device pointer 3230 * 3231 * The list of all the hardware IPs that make up the asic is walked and the 3232 * soft_reset callbacks are run if the block is hung. soft_reset handles any 3233 * IP specific hardware or software state changes that are necessary to soft 3234 * reset the IP. 3235 * Returns 0 on success, negative error code on failure. 
3236 */ 3237 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 3238 { 3239 int i, r = 0; 3240 3241 for (i = 0; i < adev->num_ip_blocks; i++) { 3242 if (!adev->ip_blocks[i].status.valid) 3243 continue; 3244 if (adev->ip_blocks[i].status.hang && 3245 adev->ip_blocks[i].version->funcs->soft_reset) { 3246 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 3247 if (r) 3248 return r; 3249 } 3250 } 3251 3252 return 0; 3253 } 3254 3255 /** 3256 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 3257 * 3258 * @adev: amdgpu_device pointer 3259 * 3260 * The list of all the hardware IPs that make up the asic is walked and the 3261 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 3262 * handles any IP specific hardware or software state changes that are 3263 * necessary after the IP has been soft reset. 3264 * Returns 0 on success, negative error code on failure. 3265 */ 3266 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 3267 { 3268 int i, r = 0; 3269 3270 for (i = 0; i < adev->num_ip_blocks; i++) { 3271 if (!adev->ip_blocks[i].status.valid) 3272 continue; 3273 if (adev->ip_blocks[i].status.hang && 3274 adev->ip_blocks[i].version->funcs->post_soft_reset) 3275 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 3276 if (r) 3277 return r; 3278 } 3279 3280 return 0; 3281 } 3282 3283 /** 3284 * amdgpu_device_recover_vram - Recover some VRAM contents 3285 * 3286 * @adev: amdgpu_device pointer 3287 * 3288 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 3289 * restore things like GPUVM page tables after a GPU reset where 3290 * the contents of VRAM might be lost. 3291 * 3292 * Returns: 3293 * 0 on success, negative error code on failure. 3294 */ 3295 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 3296 { 3297 struct dma_fence *fence = NULL, *next = NULL; 3298 struct amdgpu_bo *shadow; 3299 long r = 1, tmo; 3300 3301 if (amdgpu_sriov_runtime(adev)) 3302 tmo = msecs_to_jiffies(8000); 3303 else 3304 tmo = msecs_to_jiffies(100); 3305 3306 DRM_INFO("recover vram bo from shadow start\n"); 3307 mutex_lock(&adev->shadow_list_lock); 3308 list_for_each_entry(shadow, &adev->shadow_list, shadow_list) { 3309 3310 /* No need to recover an evicted BO */ 3311 if (shadow->tbo.mem.mem_type != TTM_PL_TT || 3312 shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET || 3313 shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM) 3314 continue; 3315 3316 r = amdgpu_bo_restore_shadow(shadow, &next); 3317 if (r) 3318 break; 3319 3320 if (fence) { 3321 tmo = dma_fence_wait_timeout(fence, false, tmo); 3322 dma_fence_put(fence); 3323 fence = next; 3324 if (tmo == 0) { 3325 r = -ETIMEDOUT; 3326 break; 3327 } else if (tmo < 0) { 3328 r = tmo; 3329 break; 3330 } 3331 } else { 3332 fence = next; 3333 } 3334 } 3335 mutex_unlock(&adev->shadow_list_lock); 3336 3337 if (fence) 3338 tmo = dma_fence_wait_timeout(fence, false, tmo); 3339 dma_fence_put(fence); 3340 3341 if (r < 0 || tmo <= 0) { 3342 DRM_ERROR("recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 3343 return -EIO; 3344 } 3345 3346 DRM_INFO("recover vram bo from shadow done\n"); 3347 return 0; 3348 } 3349 3350 3351 /** 3352 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 3353 * 3354 * @adev: amdgpu device pointer 3355 * @from_hypervisor: request from hypervisor 3356 * 3357 * do VF FLR and reinitialize Asic 3358 * return 0 means succeeded otherwise failed 3359 */ 3360 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 
3361 bool from_hypervisor) 3362 { 3363 int r; 3364 3365 if (from_hypervisor) 3366 r = amdgpu_virt_request_full_gpu(adev, true); 3367 else 3368 r = amdgpu_virt_reset_gpu(adev); 3369 if (r) 3370 return r; 3371 3372 amdgpu_amdkfd_pre_reset(adev); 3373 3374 /* Resume IP prior to SMC */ 3375 r = amdgpu_device_ip_reinit_early_sriov(adev); 3376 if (r) 3377 goto error; 3378 3379 /* we need recover gart prior to run SMC/CP/SDMA resume */ 3380 amdgpu_gtt_mgr_recover(&adev->mman.bdev.man[TTM_PL_TT]); 3381 3382 r = amdgpu_device_fw_loading(adev); 3383 if (r) 3384 return r; 3385 3386 /* now we are okay to resume SMC/CP/SDMA */ 3387 r = amdgpu_device_ip_reinit_late_sriov(adev); 3388 if (r) 3389 goto error; 3390 3391 amdgpu_irq_gpu_reset_resume_helper(adev); 3392 r = amdgpu_ib_ring_tests(adev); 3393 amdgpu_amdkfd_post_reset(adev); 3394 3395 error: 3396 amdgpu_virt_init_data_exchange(adev); 3397 amdgpu_virt_release_full_gpu(adev, true); 3398 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 3399 atomic_inc(&adev->vram_lost_counter); 3400 r = amdgpu_device_recover_vram(adev); 3401 } 3402 3403 return r; 3404 } 3405 3406 /** 3407 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 3408 * 3409 * @adev: amdgpu device pointer 3410 * 3411 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 3412 * a hung GPU. 3413 */ 3414 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 3415 { 3416 if (!amdgpu_device_ip_check_soft_reset(adev)) { 3417 DRM_INFO("Timeout, but no hardware hang detected.\n"); 3418 return false; 3419 } 3420 3421 if (amdgpu_gpu_recovery == 0) 3422 goto disabled; 3423 3424 if (amdgpu_sriov_vf(adev)) 3425 return true; 3426 3427 if (amdgpu_gpu_recovery == -1) { 3428 switch (adev->asic_type) { 3429 case CHIP_BONAIRE: 3430 case CHIP_HAWAII: 3431 case CHIP_TOPAZ: 3432 case CHIP_TONGA: 3433 case CHIP_FIJI: 3434 case CHIP_POLARIS10: 3435 case CHIP_POLARIS11: 3436 case CHIP_POLARIS12: 3437 case CHIP_VEGAM: 3438 case CHIP_VEGA20: 3439 case CHIP_VEGA10: 3440 case CHIP_VEGA12: 3441 break; 3442 default: 3443 goto disabled; 3444 } 3445 } 3446 3447 return true; 3448 3449 disabled: 3450 DRM_INFO("GPU recovery disabled.\n"); 3451 return false; 3452 } 3453 3454 3455 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 3456 struct amdgpu_job *job, 3457 bool *need_full_reset_arg) 3458 { 3459 int i, r = 0; 3460 bool need_full_reset = *need_full_reset_arg; 3461 3462 /* block all schedulers and reset given job's ring */ 3463 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 3464 struct amdgpu_ring *ring = adev->rings[i]; 3465 3466 if (!ring || !ring->sched.thread) 3467 continue; 3468 3469 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 3470 amdgpu_fence_driver_force_completion(ring); 3471 } 3472 3473 if(job) 3474 drm_sched_increase_karma(&job->base); 3475 3476 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 3477 if (!amdgpu_sriov_vf(adev)) { 3478 3479 if (!need_full_reset) 3480 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 3481 3482 if (!need_full_reset) { 3483 amdgpu_device_ip_pre_soft_reset(adev); 3484 r = amdgpu_device_ip_soft_reset(adev); 3485 amdgpu_device_ip_post_soft_reset(adev); 3486 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 3487 DRM_INFO("soft reset failed, will fallback to full reset!\n"); 3488 need_full_reset = true; 3489 } 3490 } 3491 3492 if (need_full_reset) 3493 r = amdgpu_device_ip_suspend(adev); 3494 3495 *need_full_reset_arg = 
need_full_reset; 3496 } 3497 3498 return r; 3499 } 3500 3501 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, 3502 struct list_head *device_list_handle, 3503 bool *need_full_reset_arg) 3504 { 3505 struct amdgpu_device *tmp_adev = NULL; 3506 bool need_full_reset = *need_full_reset_arg, vram_lost = false; 3507 int r = 0; 3508 3509 /* 3510 * ASIC reset has to be done on all XGMI hive nodes ASAP 3511 * to allow proper link negotiation in FW (within 1 sec) 3512 */ 3513 if (need_full_reset) { 3514 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 3515 /* For XGMI run all resets in parallel to speed up the process */ 3516 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 3517 if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work)) 3518 r = -EALREADY; 3519 } else 3520 r = amdgpu_asic_reset(tmp_adev); 3521 3522 if (r) { 3523 DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s", 3524 r, tmp_adev->ddev->unique); 3525 break; 3526 } 3527 } 3528 3529 /* For XGMI wait for all PSP resets to complete before proceeding */ 3530 if (!r) { 3531 list_for_each_entry(tmp_adev, device_list_handle, 3532 gmc.xgmi.head) { 3533 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 3534 flush_work(&tmp_adev->xgmi_reset_work); 3535 r = tmp_adev->asic_reset_res; 3536 if (r) 3537 break; 3538 } 3539 } 3540 3541 list_for_each_entry(tmp_adev, device_list_handle, 3542 gmc.xgmi.head) { 3543 amdgpu_ras_reserve_bad_pages(tmp_adev); 3544 } 3545 } 3546 } 3547 3548 3549 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 3550 if (need_full_reset) { 3551 /* post card */ 3552 if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context)) 3553 DRM_WARN("asic atom init failed!"); 3554 3555 if (!r) { 3556 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 3557 r = amdgpu_device_ip_resume_phase1(tmp_adev); 3558 if (r) 3559 goto out; 3560 3561 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 3562 if (vram_lost) { 3563 DRM_INFO("VRAM is lost due to GPU reset!\n"); 3564 atomic_inc(&tmp_adev->vram_lost_counter); 3565 } 3566 3567 r = amdgpu_gtt_mgr_recover( 3568 &tmp_adev->mman.bdev.man[TTM_PL_TT]); 3569 if (r) 3570 goto out; 3571 3572 r = amdgpu_device_fw_loading(tmp_adev); 3573 if (r) 3574 return r; 3575 3576 r = amdgpu_device_ip_resume_phase2(tmp_adev); 3577 if (r) 3578 goto out; 3579 3580 if (vram_lost) 3581 amdgpu_device_fill_reset_magic(tmp_adev); 3582 3583 /* 3584 * Add this ASIC as tracked, as the reset has already 3585 * completed successfully. 3586 */ 3587 amdgpu_register_gpu_instance(tmp_adev); 3588 3589 r = amdgpu_device_ip_late_init(tmp_adev); 3590 if (r) 3591 goto out; 3592 3593 /* must succeed.
*/ 3594 amdgpu_ras_resume(tmp_adev); 3595 3596 /* Update PSP FW topology after reset */ 3597 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1) 3598 r = amdgpu_xgmi_update_topology(hive, tmp_adev); 3599 } 3600 } 3601 3602 3603 out: 3604 if (!r) { 3605 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 3606 r = amdgpu_ib_ring_tests(tmp_adev); 3607 if (r) { 3608 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 3609 r = amdgpu_device_ip_suspend(tmp_adev); 3610 need_full_reset = true; 3611 r = -EAGAIN; 3612 goto end; 3613 } 3614 } 3615 3616 if (!r) 3617 r = amdgpu_device_recover_vram(tmp_adev); 3618 else 3619 tmp_adev->asic_reset_res = r; 3620 } 3621 3622 end: 3623 *need_full_reset_arg = need_full_reset; 3624 return r; 3625 } 3626 3627 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock) 3628 { 3629 if (trylock) { 3630 if (!mutex_trylock(&adev->lock_reset)) 3631 return false; 3632 } else 3633 mutex_lock(&adev->lock_reset); 3634 3635 atomic_inc(&adev->gpu_reset_counter); 3636 adev->in_gpu_reset = 1; 3637 /* Block kfd: SRIOV would do it separately */ 3638 if (!amdgpu_sriov_vf(adev)) 3639 amdgpu_amdkfd_pre_reset(adev); 3640 3641 return true; 3642 } 3643 3644 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) 3645 { 3646 /*unlock kfd: SRIOV would do it separately */ 3647 if (!amdgpu_sriov_vf(adev)) 3648 amdgpu_amdkfd_post_reset(adev); 3649 amdgpu_vf_error_trans_all(adev); 3650 adev->in_gpu_reset = 0; 3651 mutex_unlock(&adev->lock_reset); 3652 } 3653 3654 3655 /** 3656 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 3657 * 3658 * @adev: amdgpu device pointer 3659 * @job: which job trigger hang 3660 * 3661 * Attempt to reset the GPU if it has hung (all asics). 3662 * Attempt to do soft-reset or full-reset and reinitialize Asic 3663 * Returns 0 for success or an error on failure. 3664 */ 3665 3666 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 3667 struct amdgpu_job *job) 3668 { 3669 struct list_head device_list, *device_list_handle = NULL; 3670 bool need_full_reset, job_signaled; 3671 struct amdgpu_hive_info *hive = NULL; 3672 struct amdgpu_device *tmp_adev = NULL; 3673 int i, r = 0; 3674 3675 need_full_reset = job_signaled = false; 3676 INIT_LIST_HEAD(&device_list); 3677 3678 dev_info(adev->dev, "GPU reset begin!\n"); 3679 3680 cancel_delayed_work_sync(&adev->delayed_init_work); 3681 3682 hive = amdgpu_get_xgmi_hive(adev, false); 3683 3684 /* 3685 * Here we trylock to avoid chain of resets executing from 3686 * either trigger by jobs on different adevs in XGMI hive or jobs on 3687 * different schedulers for same device while this TO handler is running. 3688 * We always reset all schedulers for device and all devices for XGMI 3689 * hive so that should take care of them too. 
3690 */ 3691 3692 if (hive && !mutex_trylock(&hive->reset_lock)) { 3693 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", 3694 job->base.id, hive->hive_id); 3695 return 0; 3696 } 3697 3698 /* Start with adev pre asic reset first for soft reset check.*/ 3699 if (!amdgpu_device_lock_adev(adev, !hive)) { 3700 DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress", 3701 job->base.id); 3702 return 0; 3703 } 3704 3705 /* Build list of devices to reset */ 3706 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3707 if (!hive) { 3708 amdgpu_device_unlock_adev(adev); 3709 return -ENODEV; 3710 } 3711 3712 /* 3713 * In case we are in XGMI hive mode, device reset is done for all the 3714 * nodes in the hive to retrain all XGMI links and hence the reset 3715 * sequence is executed in a loop on all nodes. 3716 */ 3717 device_list_handle = &hive->device_list; 3718 } else { 3719 list_add_tail(&adev->gmc.xgmi.head, &device_list); 3720 device_list_handle = &device_list; 3721 } 3722 3723 /* 3724 * Mark these ASICs to be reset as untracked first 3725 * and add them back after the reset has completed 3726 */ 3727 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) 3728 amdgpu_unregister_gpu_instance(tmp_adev); 3729 3730 /* block all schedulers and reset given job's ring */ 3731 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 3732 /* disable ras on ALL IPs */ 3733 if (amdgpu_device_ip_need_full_reset(tmp_adev)) 3734 amdgpu_ras_suspend(tmp_adev); 3735 3736 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 3737 struct amdgpu_ring *ring = tmp_adev->rings[i]; 3738 3739 if (!ring || !ring->sched.thread) 3740 continue; 3741 3742 drm_sched_stop(&ring->sched, &job->base); 3743 } 3744 } 3745 3746 3747 /* 3748 * Must check guilty signal here since after this point all old 3749 * HW fences are force signaled. 3750 * 3751 * job->base holds a reference to parent fence 3752 */ 3753 if (job && job->base.s_fence->parent && 3754 dma_fence_is_signaled(job->base.s_fence->parent)) 3755 job_signaled = true; 3756 3757 if (!amdgpu_device_ip_need_full_reset(adev)) 3758 device_list_handle = &device_list; 3759 3760 if (job_signaled) { 3761 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 3762 goto skip_hw_reset; 3763 } 3764 3765 3766 /* Guilty job will be freed after this*/ 3767 r = amdgpu_device_pre_asic_reset(adev, 3768 job, 3769 &need_full_reset); 3770 if (r) { 3771 /*TODO Should we stop ?*/ 3772 DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ", 3773 r, adev->ddev->unique); 3774 adev->asic_reset_res = r; 3775 } 3776 3777 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 3778 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 3779 3780 if (tmp_adev == adev) 3781 continue; 3782 3783 amdgpu_device_lock_adev(tmp_adev, false); 3784 r = amdgpu_device_pre_asic_reset(tmp_adev, 3785 NULL, 3786 &need_full_reset); 3787 /*TODO Should we stop ?*/ 3788 if (r) { 3789 DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ", 3790 r, tmp_adev->ddev->unique); 3791 tmp_adev->asic_reset_res = r; 3792 } 3793 } 3794 3795 /* Actual ASIC resets if needed.*/ 3796 /* TODO Implement XGMI hive reset logic for SRIOV */ 3797 if (amdgpu_sriov_vf(adev)) { 3798 r = amdgpu_device_reset_sriov(adev, job ?
false : true); 3799 if (r) 3800 adev->asic_reset_res = r; 3801 } else { 3802 r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset); 3803 if (r && r == -EAGAIN) 3804 goto retry; 3805 } 3806 3807 skip_hw_reset: 3808 3809 /* Post ASIC reset for all devs. */ 3810 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 3811 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 3812 struct amdgpu_ring *ring = tmp_adev->rings[i]; 3813 3814 if (!ring || !ring->sched.thread) 3815 continue; 3816 3817 /* No point to resubmit jobs if we didn't HW reset*/ 3818 if (!tmp_adev->asic_reset_res && !job_signaled) 3819 drm_sched_resubmit_jobs(&ring->sched); 3820 3821 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); 3822 } 3823 3824 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) { 3825 drm_helper_resume_force_mode(tmp_adev->ddev); 3826 } 3827 3828 tmp_adev->asic_reset_res = 0; 3829 3830 if (r) { 3831 /* bad news, how to tell it to userspace ? */ 3832 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&adev->gpu_reset_counter)); 3833 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 3834 } else { 3835 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&adev->gpu_reset_counter)); 3836 } 3837 3838 amdgpu_device_unlock_adev(tmp_adev); 3839 } 3840 3841 if (hive) 3842 mutex_unlock(&hive->reset_lock); 3843 3844 if (r) 3845 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 3846 return r; 3847 } 3848 3849 /** 3850 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot 3851 * 3852 * @adev: amdgpu_device pointer 3853 * 3854 * Fetches and stores in the driver the PCIE capabilities (gen speed 3855 * and lanes) of the slot the device is in. Handles APUs and 3856 * virtualized environments where PCIE config space may not be available.
3857 */ 3858 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 3859 { 3860 struct pci_dev *pdev; 3861 enum pci_bus_speed speed_cap, platform_speed_cap; 3862 enum pcie_link_width platform_link_width; 3863 3864 if (amdgpu_pcie_gen_cap) 3865 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 3866 3867 if (amdgpu_pcie_lane_cap) 3868 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 3869 3870 /* covers APUs as well */ 3871 if (pci_is_root_bus(adev->pdev->bus)) { 3872 if (adev->pm.pcie_gen_mask == 0) 3873 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 3874 if (adev->pm.pcie_mlw_mask == 0) 3875 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 3876 return; 3877 } 3878 3879 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 3880 return; 3881 3882 pcie_bandwidth_available(adev->pdev, NULL, 3883 &platform_speed_cap, &platform_link_width); 3884 3885 if (adev->pm.pcie_gen_mask == 0) { 3886 /* asic caps */ 3887 pdev = adev->pdev; 3888 speed_cap = pcie_get_speed_cap(pdev); 3889 if (speed_cap == PCI_SPEED_UNKNOWN) { 3890 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 3891 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 3892 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 3893 } else { 3894 if (speed_cap == PCIE_SPEED_16_0GT) 3895 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 3896 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 3897 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 3898 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 3899 else if (speed_cap == PCIE_SPEED_8_0GT) 3900 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 3901 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 3902 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 3903 else if (speed_cap == PCIE_SPEED_5_0GT) 3904 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 3905 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 3906 else 3907 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 3908 } 3909 /* platform caps */ 3910 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 3911 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 3912 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 3913 } else { 3914 if (platform_speed_cap == PCIE_SPEED_16_0GT) 3915 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 3916 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 3917 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 3918 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 3919 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 3920 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 3921 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 3922 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 3923 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 3924 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 3925 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 3926 else 3927 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 3928 3929 } 3930 } 3931 if (adev->pm.pcie_mlw_mask == 0) { 3932 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 3933 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 3934 } else { 3935 switch (platform_link_width) { 3936 case PCIE_LNK_X32: 3937 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 3938 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 3939 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 3940 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 3941 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 3942 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 3943 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 3944 break; 3945 case PCIE_LNK_X16: 3946 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 3947 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 3948 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 
3949 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 3950 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 3951 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 3952 break; 3953 case PCIE_LNK_X12: 3954 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 3955 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 3956 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 3957 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 3958 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 3959 break; 3960 case PCIE_LNK_X8: 3961 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 3962 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 3963 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 3964 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 3965 break; 3966 case PCIE_LNK_X4: 3967 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 3968 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 3969 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 3970 break; 3971 case PCIE_LNK_X2: 3972 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 3973 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 3974 break; 3975 case PCIE_LNK_X1: 3976 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 3977 break; 3978 default: 3979 break; 3980 } 3981 } 3982 } 3983 } 3984 3985