/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/pci.h>
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/sienna_cichlid_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navy_flounder_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"NAVI10",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"LAST",
};

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
		amdgpu_device_get_pcie_replay_count, NULL);

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * DOC: product_name
 *
 * The amdgpu driver provides a sysfs API for reporting the product name
 * for the device.
 * The file product_name is used for this and returns the product name
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_name(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
}

static DEVICE_ATTR(product_name, S_IRUGO,
		amdgpu_device_get_product_name, NULL);

/**
 * DOC: product_number
 *
 * The amdgpu driver provides a sysfs API for reporting the part number
 * for the device.
 * The file product_number is used for this and returns the part number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
}

static DEVICE_ATTR(product_number, S_IRUGO,
		amdgpu_device_get_product_number, NULL);

/**
 * DOC: serial_number
 *
 * The amdgpu driver provides a sysfs API for reporting the serial number
 * for the device.
 * The file serial_number is used for this and returns the serial number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_serial_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
}

static DEVICE_ATTR(serial_number, S_IRUGO,
		amdgpu_device_get_serial_number, NULL);
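/*
 * Illustrative sketch (not part of the driver): from user space the
 * attributes above appear under the PCI device's sysfs directory and can be
 * read like any other sysfs file.  The card index below is an assumption and
 * depends on the system:
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		char buf[64];
 *		FILE *f = fopen("/sys/class/drm/card0/device/pcie_replay_count", "r");
 *
 *		if (f && fgets(buf, sizeof(buf), f))
 *			printf("pcie replays: %s", buf);
 *		if (f)
 *			fclose(f);
 *		return 0;
 *	}
 *
 * product_name, product_number and serial_number can be read the same way on
 * cards that expose an FRU EEPROM.
 */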
/**
 * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with HG/PX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->flags & AMD_IS_PX)
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise return false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * VRAM access helper functions.
 *
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the size of @buf must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       uint32_t *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0;
	uint64_t last;


#ifdef CONFIG_64BIT
	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		void __iomem *addr = adev->mman.aper_base_kaddr + pos;
		size_t count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_asic_flush_hdp(adev, NULL);
		} else {
			amdgpu_asic_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

		if (count == size)
			return;

		pos += count;
		buf += count / 4;
		size -= count;
	}
#endif

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		uint32_t tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *buf++);
		else
			*buf++ = RREG32_NO_KIQ(mmMM_DATA);
	}
	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
}
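/*
 * Illustrative sketch of how a caller might use the helper above; the offset
 * and length are arbitrary assumptions.  VRAM inside the CPU visible aperture
 * is copied through aper_base_kaddr, anything beyond it goes through the
 * indirect MM_INDEX/MM_DATA window one dword at a time:
 *
 *	uint32_t data[4];
 *
 *	// read 16 bytes starting at VRAM offset 0x1000
 *	amdgpu_device_vram_access(adev, 0x1000, data, sizeof(data), false);
 *
 *	// write them back
 *	amdgpu_device_vram_access(adev, 0x1000, data, sizeof(data), true);
 */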
/*
 * MMIO register access helper functions.
 */
/**
 * amdgpu_mm_rreg - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
			uint32_t acc_flags)
{
	uint32_t ret;

	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) &&
	    down_read_trylock(&adev->reset_sem)) {
		ret = amdgpu_kiq_rreg(adev, reg);
		up_read(&adev->reset_sem);
		return ret;
	}

	if ((reg * 4) < adev->rmmio_size)
		ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
	else {
		unsigned long flags;

		spin_lock_irqsave(&adev->mmio_idx_lock, flags);
		writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
		ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	}

	trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 *
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 *
 */
/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

static inline void amdgpu_mm_wreg_mmio(struct amdgpu_device *adev,
				       uint32_t reg, uint32_t v,
				       uint32_t acc_flags)
{
	trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);

	if ((reg * 4) < adev->rmmio_size)
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	else {
		unsigned long flags;

		spin_lock_irqsave(&adev->mmio_idx_lock, flags);
		writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
		writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	}
}

/**
 * amdgpu_mm_wreg - write to a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
		    uint32_t acc_flags)
{
	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) &&
	    down_read_trylock(&adev->reset_sem)) {
		amdgpu_kiq_wreg(adev, reg, v);
		up_read(&adev->reset_sem);
		return;
	}

	amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
}

/*
 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range
 *
 * this function is invoked only for debugfs register access
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
			     uint32_t acc_flags)
{
	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {

		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
	}

	amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
}

/**
 * amdgpu_io_rreg - read an IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 *
 * Returns the 32 bit value from the offset specified.
 */
u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
{
	if ((reg * 4) < adev->rio_mem_size)
		return ioread32(adev->rio_mem + (reg * 4));
	else {
		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
		return ioread32(adev->rio_mem + (mmMM_DATA * 4));
	}
}

/**
 * amdgpu_io_wreg - write to an IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
{
	if ((reg * 4) < adev->rio_mem_size)
		iowrite32(v, adev->rio_mem + (reg * 4));
	else {
		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
		iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
	}
}

/**
 * amdgpu_mm_rdoorbell - read a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (CIK).
 */
u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
{
	if (index < adev->doorbell.num_doorbells) {
		return readl(adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell - write a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (CIK).
 */
void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
{
	if (index < adev->doorbell.num_doorbells) {
		writel(v, adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}
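/*
 * Illustrative sketch (not taken from this file): a ring typically notifies
 * the hardware of new work by writing its write pointer to its assigned
 * doorbell slot, e.g. from a ring's set_wptr callback:
 *
 *	if (ring->use_doorbell)
 *		amdgpu_mm_wdoorbell(adev, ring->doorbell_index,
 *				    lower_32_bits(ring->wptr));
 *
 * ring->use_doorbell, ring->doorbell_index and ring->wptr refer to fields of
 * struct amdgpu_ring; the exact callback this lives in varies per IP block.
 */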
/**
 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
{
	if (index < adev->doorbell.num_doorbells) {
		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
{
	if (index < adev->doorbell.num_doorbells) {
		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	amdgpu_asic_pre_asic_init(adev);

	return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}

/**
 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
				       &adev->vram_scratch.robj,
				       &adev->vram_scratch.gpu_addr,
				       (void **)&adev->vram_scratch.ptr);
}

/**
 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}
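/*
 * Illustrative sketch of a golden register list consumed by
 * amdgpu_device_program_register_sequence(); the register offsets and masks
 * below are made-up placeholders, the real tables live in the per-ASIC files:
 *
 *	static const u32 example_golden_settings[] =
 *	{
 *		// reg,        and_mask,   or_mask
 *		0x00009834, 0xffffffff, 0x00000002,
 *		0x00009838, 0x0000000f, 0x00000009,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev,
 *						example_golden_settings,
 *						ARRAY_SIZE(example_golden_settings));
 *
 * Each triplet is {reg, and_mask, or_mask}: the bits in and_mask are cleared
 * and or_mask is then OR'ed in (with and_mask == 0xffffffff the register is
 * simply overwritten with or_mask).
 */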
/*
 * GPU doorbell aperture helpers function.
 */
/**
 * amdgpu_device_doorbell_init - Init doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Init doorbell driver information (CIK)
 * Returns 0 on success, error on failure.
 */
static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
{

	/* No doorbell on SI hardware generation */
	if (adev->asic_type < CHIP_BONAIRE) {
		adev->doorbell.base = 0;
		adev->doorbell.size = 0;
		adev->doorbell.num_doorbells = 0;
		adev->doorbell.ptr = NULL;
		return 0;
	}

	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
		return -EINVAL;

	amdgpu_asic_init_doorbell_index(adev);

	/* doorbell bar mapping */
	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
	adev->doorbell.size = pci_resource_len(adev->pdev, 2);

	adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
					     adev->doorbell_index.max_assignment+1);
	if (adev->doorbell.num_doorbells == 0)
		return -EINVAL;

	/* For Vega, reserve and map two pages on the doorbell BAR since the SDMA
	 * paging queue doorbell uses the second page. The
	 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
	 * doorbells are in the first page, so with the paging queue enabled,
	 * num_doorbells is increased by one page (0x400 in dwords).
	 */
	if (adev->asic_type >= CHIP_VEGA10)
		adev->doorbell.num_doorbells += 0x400;

	adev->doorbell.ptr = ioremap(adev->doorbell.base,
				     adev->doorbell.num_doorbells *
				     sizeof(u32));
	if (adev->doorbell.ptr == NULL)
		return -ENOMEM;

	return 0;
}

/**
 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down doorbell driver information (CIK)
 */
static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
{
	iounmap(adev->doorbell.ptr);
	adev->doorbell.ptr = NULL;
}



/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or an -error on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
	u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
	struct pci_bus *root;
	struct resource *res;
	unsigned i;
	u16 cmd;
	int r;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_device_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_device_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if need or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In the whole GPU pass-through virtualization case, after a VM
		 * reboot some old SMC firmware still needs the driver to do a vPost,
		 * otherwise the GPU hangs. SMC firmware versions above 22.15 don't have
		 * this flaw, so we force vPost for SMC versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;
			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @cookie: amdgpu_device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
{
	struct amdgpu_device *adev = cookie;
	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines number of bits in page table versus page directory,
 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
 * page table and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}
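/*
 * For reference, the shift above sizes the pool in units of 256 MiB:
 * amdgpu_smu_memory_pool_size << 28 gives
 *
 *	1 -> 256 MiB,  2 -> 512 MiB,  4 -> 1 GiB,  8 -> 2 GiB
 *
 * with the dram_size_three_GB / dram_size_seven_GB checks above gating the
 * smaller and larger pool sizes against available system memory.
 */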
/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	amdgpu_gmc_tmz_set(adev);

	if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) {
		amdgpu_num_kcq = 8;
		dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid parameter provided by user\n");
	}

	return 0;
}

/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(dev->pdev, PCI_D0);
		pci_restore_state(dev->pdev);
		r = pci_enable_device(dev->pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
		drm_kms_helper_poll_enable(dev);
	} else {
		pr_info("switched off\n");
		drm_kms_helper_poll_disable(dev);
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
		pci_save_state(dev->pdev);
		/* Shut down the device */
		pci_disable_device(dev->pdev);
		pci_set_power_state(dev->pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}
/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u32 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if equal or greater
 * return 1 if smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}
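/*
 * Illustrative sketch: IP version checks are typically written against the
 * helper above, e.g. to take a code path only when the GFX block is at least
 * a given version (the block type and version numbers here are just example
 * values, not taken from this file):
 *
 *	if (amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX,
 *					       8, 1) == 0) {
 *		// GFX IP is version 8.1 or newer
 *	}
 */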
/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		struct drm_device *ddev = adev_to_drm(adev);
		const char *pci_address_name = pci_name(ddev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	char fw_name[40];
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	if (adev->mman.discovery_bin) {
		amdgpu_discovery_get_gfx_info(adev);

		/*
		 * FIXME: The bounding box is still needed by Navi12, so
		 * temporarily read it from gpu_info firmware. Should be dropped
		 * when DAL no longer needs it.
		 */
		if (adev->asic_type != CHIP_NAVI12)
			return 0;
	}

	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_VERDE:
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_OLAND:
	case CHIP_HAINAN:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TOPAZ:
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
	case CHIP_VEGA20:
	default:
		return 0;
	case CHIP_VEGA10:
		chip_name = "vega10";
		break;
	case CHIP_VEGA12:
		chip_name = "vega12";
		break;
	case CHIP_RAVEN:
		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
			chip_name = "raven2";
		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
			chip_name = "picasso";
		else
			chip_name = "raven";
		break;
	case CHIP_ARCTURUS:
		chip_name = "arcturus";
		break;
	case CHIP_RENOIR:
		chip_name = "renoir";
		break;
	case CHIP_NAVI10:
		chip_name = "navi10";
		break;
	case CHIP_NAVI14:
		chip_name = "navi14";
		break;
	case CHIP_NAVI12:
		chip_name = "navi12";
		break;
	case CHIP_SIENNA_CICHLID:
		chip_name = "sienna_cichlid";
		break;
	case CHIP_NAVY_FLOUNDER:
		chip_name = "navy_flounder";
		break;
	}

	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
	err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
	if (err) {
		dev_err(adev->dev,
			"Failed to load gpu_info firmware \"%s\"\n",
			fw_name);
		goto out;
	}
	err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
	if (err) {
		dev_err(adev->dev,
			"Failed to validate gpu_info firmware \"%s\"\n",
			fw_name);
		goto out;
	}

	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);

	switch (hdr->version_major) {
	case 1:
	{
		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
								le32_to_cpu(hdr->header.ucode_array_offset_bytes));

		/*
		 * Should be dropped when DAL no longer needs it.
		 */
		if (adev->asic_type == CHIP_NAVI12)
			goto parse_soc_bounding_box;

		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
		adev->gfx.config.max_texture_channel_caches =
			le32_to_cpu(gpu_info_fw->gc_num_tccs);
		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
		adev->gfx.config.double_offchip_lds_buf =
			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
		adev->gfx.cu_info.max_waves_per_simd =
			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
		adev->gfx.cu_info.max_scratch_slots_per_cu =
			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
		if (hdr->version_minor >= 1) {
			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->gfx.config.num_sc_per_sh =
				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
			adev->gfx.config.num_packer_per_sc =
				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
		}

parse_soc_bounding_box:
		/*
		 * soc bounding box info is not integrated in the discovery table,
		 * we always need to parse it from gpu info firmware if needed.
		 */
		if (hdr->version_minor == 2) {
			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
		}
		break;
	}
	default:
		dev_err(adev->dev,
			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
		err = -EINVAL;
		goto out;
	}
out:
	return err;
}

/**
 * amdgpu_device_ip_early_init - run early init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Early initialization pass for hardware IPs. The hardware IPs that make
 * up each asic are discovered, and each IP's early_init callback is run. This
 * is the first stage in initializing the asic.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
{
	int i, r;

	amdgpu_device_enable_virtual_display(adev);

	if (amdgpu_sriov_vf(adev)) {
		r = amdgpu_virt_request_full_gpu(adev, true);
		if (r)
			return r;
	}

	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_VERDE:
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_OLAND:
	case CHIP_HAINAN:
		adev->family = AMDGPU_FAMILY_SI;
		r = si_set_ip_blocks(adev);
		if (r)
			return r;
		break;
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
		if (adev->flags & AMD_IS_APU)
			adev->family = AMDGPU_FAMILY_KV;
		else
			adev->family = AMDGPU_FAMILY_CI;

		r = cik_set_ip_blocks(adev);
		if (r)
			return r;
		break;
#endif
	case CHIP_TOPAZ:
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		if (adev->flags & AMD_IS_APU)
			adev->family = AMDGPU_FAMILY_CZ;
		else
			adev->family = AMDGPU_FAMILY_VI;

		r = vi_set_ip_blocks(adev);
		if (r)
			return r;
		break;
	case CHIP_VEGA10:
	case CHIP_VEGA12:
	case CHIP_VEGA20:
	case CHIP_RAVEN:
	case CHIP_ARCTURUS:
	case CHIP_RENOIR:
		if (adev->flags & AMD_IS_APU)
			adev->family = AMDGPU_FAMILY_RV;
		else
			adev->family = AMDGPU_FAMILY_AI;

		r = soc15_set_ip_blocks(adev);
		if (r)
			return r;
		break;
	case CHIP_NAVI10:
	case CHIP_NAVI14:
	case CHIP_NAVI12:
	case CHIP_SIENNA_CICHLID:
	case CHIP_NAVY_FLOUNDER:
		adev->family = AMDGPU_FAMILY_NV;

		r = nv_set_ip_blocks(adev);
		if (r)
			return r;
		break;
	default:
		/* FIXME: not supported yet */
		return -EINVAL;
	}

	amdgpu_amdkfd_device_probe(adev);

	adev->pm.pp_feature = amdgpu_pp_feature_mask;
	if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
		adev->pm.pp_feature &= ~PP_GFXOFF_MASK;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
			DRM_ERROR("disabled ip block: %d <%s>\n",
				  i, adev->ip_blocks[i].version->funcs->name);
			adev->ip_blocks[i].status.valid = false;
		} else {
			if (adev->ip_blocks[i].version->funcs->early_init) {
				r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
				if (r == -ENOENT) {
					adev->ip_blocks[i].status.valid = false;
				} else if (r) {
					DRM_ERROR("early_init of IP block <%s> failed %d\n",
						  adev->ip_blocks[i].version->funcs->name, r);
					return r;
				} else {
					adev->ip_blocks[i].status.valid = true;
				}
			} else {
				adev->ip_blocks[i].status.valid = true;
			}
		}
		/* get the vbios after the asic_funcs are set up */
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
			r = amdgpu_device_parse_gpu_info_fw(adev);
			if (r)
				return r;

			/* Read BIOS */
			if (!amdgpu_get_bios(adev))
				return -EINVAL;

			r = amdgpu_atombios_init(adev);
			if (r) {
				dev_err(adev->dev, "amdgpu_atombios_init failed\n");
				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
				return r;
			}
		}
	}

	adev->cg_flags &= amdgpu_cg_mask;
adev->pg_flags &= amdgpu_pg_mask; 1904 1905 return 0; 1906 } 1907 1908 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 1909 { 1910 int i, r; 1911 1912 for (i = 0; i < adev->num_ip_blocks; i++) { 1913 if (!adev->ip_blocks[i].status.sw) 1914 continue; 1915 if (adev->ip_blocks[i].status.hw) 1916 continue; 1917 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 1918 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 1919 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 1920 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 1921 if (r) { 1922 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 1923 adev->ip_blocks[i].version->funcs->name, r); 1924 return r; 1925 } 1926 adev->ip_blocks[i].status.hw = true; 1927 } 1928 } 1929 1930 return 0; 1931 } 1932 1933 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 1934 { 1935 int i, r; 1936 1937 for (i = 0; i < adev->num_ip_blocks; i++) { 1938 if (!adev->ip_blocks[i].status.sw) 1939 continue; 1940 if (adev->ip_blocks[i].status.hw) 1941 continue; 1942 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 1943 if (r) { 1944 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 1945 adev->ip_blocks[i].version->funcs->name, r); 1946 return r; 1947 } 1948 adev->ip_blocks[i].status.hw = true; 1949 } 1950 1951 return 0; 1952 } 1953 1954 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 1955 { 1956 int r = 0; 1957 int i; 1958 uint32_t smu_version; 1959 1960 if (adev->asic_type >= CHIP_VEGA10) { 1961 for (i = 0; i < adev->num_ip_blocks; i++) { 1962 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 1963 continue; 1964 1965 /* no need to do the fw loading again if already done*/ 1966 if (adev->ip_blocks[i].status.hw == true) 1967 break; 1968 1969 if (amdgpu_in_reset(adev) || adev->in_suspend) { 1970 r = adev->ip_blocks[i].version->funcs->resume(adev); 1971 if (r) { 1972 DRM_ERROR("resume of IP block <%s> failed %d\n", 1973 adev->ip_blocks[i].version->funcs->name, r); 1974 return r; 1975 } 1976 } else { 1977 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 1978 if (r) { 1979 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 1980 adev->ip_blocks[i].version->funcs->name, r); 1981 return r; 1982 } 1983 } 1984 1985 adev->ip_blocks[i].status.hw = true; 1986 break; 1987 } 1988 } 1989 1990 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 1991 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 1992 1993 return r; 1994 } 1995 1996 /** 1997 * amdgpu_device_ip_init - run init for hardware IPs 1998 * 1999 * @adev: amdgpu_device pointer 2000 * 2001 * Main initialization pass for hardware IPs. The list of all the hardware 2002 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2003 * are run. sw_init initializes the software state associated with each IP 2004 * and hw_init initializes the hardware associated with each IP. 2005 * Returns 0 on success, negative error code on failure. 
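 * Note: GMC hw_init runs early (right after its sw_init) so VRAM can be
 * allocated for the other blocks; the remaining hw_init work is split into
 * phase1 and phase2 around firmware loading, as the code below shows.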
2006 */ 2007 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2008 { 2009 int i, r; 2010 2011 r = amdgpu_ras_init(adev); 2012 if (r) 2013 return r; 2014 2015 for (i = 0; i < adev->num_ip_blocks; i++) { 2016 if (!adev->ip_blocks[i].status.valid) 2017 continue; 2018 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2019 if (r) { 2020 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2021 adev->ip_blocks[i].version->funcs->name, r); 2022 goto init_failed; 2023 } 2024 adev->ip_blocks[i].status.sw = true; 2025 2026 /* need to do gmc hw init early so we can allocate gpu mem */ 2027 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2028 r = amdgpu_device_vram_scratch_init(adev); 2029 if (r) { 2030 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2031 goto init_failed; 2032 } 2033 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2034 if (r) { 2035 DRM_ERROR("hw_init %d failed %d\n", i, r); 2036 goto init_failed; 2037 } 2038 r = amdgpu_device_wb_init(adev); 2039 if (r) { 2040 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2041 goto init_failed; 2042 } 2043 adev->ip_blocks[i].status.hw = true; 2044 2045 /* right after GMC hw init, we create CSA */ 2046 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2047 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2048 AMDGPU_GEM_DOMAIN_VRAM, 2049 AMDGPU_CSA_SIZE); 2050 if (r) { 2051 DRM_ERROR("allocate CSA failed %d\n", r); 2052 goto init_failed; 2053 } 2054 } 2055 } 2056 } 2057 2058 if (amdgpu_sriov_vf(adev)) 2059 amdgpu_virt_init_data_exchange(adev); 2060 2061 r = amdgpu_ib_pool_init(adev); 2062 if (r) { 2063 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2064 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2065 goto init_failed; 2066 } 2067 2068 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init is complete */ 2069 if (r) 2070 goto init_failed; 2071 2072 r = amdgpu_device_ip_hw_init_phase1(adev); 2073 if (r) 2074 goto init_failed; 2075 2076 r = amdgpu_device_fw_loading(adev); 2077 if (r) 2078 goto init_failed; 2079 2080 r = amdgpu_device_ip_hw_init_phase2(adev); 2081 if (r) 2082 goto init_failed; 2083 2084 /* 2085 * retired pages will be loaded from eeprom and reserved here, 2086 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2087 * for some ASICs the RAS EEPROM code relies on the SMU being fully functional 2088 * for I2C communication, which is only true at this point. 2089 * 2090 * amdgpu_ras_recovery_init may fail, but the upper level only cares about 2091 * failures caused by a bad gpu state and stops the amdgpu init process 2092 * accordingly. For other failures, it will still release all 2093 * the resources and print an error message, rather than returning a 2094 * negative value to the upper level. 2095 * 2096 * Note: theoretically, this should be called before all vram allocations 2097 * to protect retired pages from being reused. 2098 */ 2099 r = amdgpu_ras_recovery_init(adev); 2100 if (r) 2101 goto init_failed; 2102 2103 if (adev->gmc.xgmi.num_physical_nodes > 1) 2104 amdgpu_xgmi_add_device(adev); 2105 amdgpu_amdkfd_device_init(adev); 2106 2107 amdgpu_fru_get_product_info(adev); 2108 2109 init_failed: 2110 if (amdgpu_sriov_vf(adev)) 2111 amdgpu_virt_release_full_gpu(adev, true); 2112 2113 return r; 2114 } 2115 2116 /** 2117 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2118 * 2119 * @adev: amdgpu_device pointer 2120 * 2121 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2122 * this function before a GPU reset.
If the value is retained after a 2123 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 2124 */ 2125 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2126 { 2127 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2128 } 2129 2130 /** 2131 * amdgpu_device_check_vram_lost - check if vram is valid 2132 * 2133 * @adev: amdgpu_device pointer 2134 * 2135 * Checks the reset magic value written to the gart pointer in VRAM. 2136 * The driver calls this after a GPU reset to see if the contents of 2137 * VRAM have been lost or not. 2138 * Returns true if vram is lost, false if not. 2139 */ 2140 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2141 { 2142 if (memcmp(adev->gart.ptr, adev->reset_magic, 2143 AMDGPU_RESET_MAGIC_NUM)) 2144 return true; 2145 2146 if (!amdgpu_in_reset(adev)) 2147 return false; 2148 2149 /* 2150 * For all ASICs with baco/mode1 reset, the VRAM is 2151 * always assumed to be lost. 2152 */ 2153 switch (amdgpu_asic_reset_method(adev)) { 2154 case AMD_RESET_METHOD_BACO: 2155 case AMD_RESET_METHOD_MODE1: 2156 return true; 2157 default: 2158 return false; 2159 } 2160 } 2161 2162 /** 2163 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2164 * 2165 * @adev: amdgpu_device pointer 2166 * @state: clockgating state (gate or ungate) 2167 * 2168 * The list of all the hardware IPs that make up the asic is walked and the 2169 * set_clockgating_state callbacks are run. 2170 * During the late init pass this enables clockgating for hardware IPs; 2171 * during fini or suspend it is used to disable clockgating again. 2172 * Returns 0 on success, negative error code on failure. 2173 */ 2174 2175 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2176 enum amd_clockgating_state state) 2177 { 2178 int i, j, r; 2179 2180 if (amdgpu_emu_mode == 1) 2181 return 0; 2182 2183 for (j = 0; j < adev->num_ip_blocks; j++) { 2184 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2185 if (!adev->ip_blocks[i].status.late_initialized) 2186 continue; 2187 /* skip CG for VCE/UVD, it's handled specially */ 2188 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2189 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2190 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2191 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2192 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2193 /* enable clockgating to save power */ 2194 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2195 state); 2196 if (r) { 2197 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2198 adev->ip_blocks[i].version->funcs->name, r); 2199 return r; 2200 } 2201 } 2202 } 2203 2204 return 0; 2205 } 2206 2207 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state) 2208 { 2209 int i, j, r; 2210 2211 if (amdgpu_emu_mode == 1) 2212 return 0; 2213 2214 for (j = 0; j < adev->num_ip_blocks; j++) { 2215 i = state == AMD_PG_STATE_GATE ?
j : adev->num_ip_blocks - j - 1; 2216 if (!adev->ip_blocks[i].status.late_initialized) 2217 continue; 2218 /* skip PG for VCE/UVD, it's handled specially */ 2219 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2220 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2221 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2222 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2223 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2224 /* enable powergating to save power */ 2225 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2226 state); 2227 if (r) { 2228 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2229 adev->ip_blocks[i].version->funcs->name, r); 2230 return r; 2231 } 2232 } 2233 } 2234 return 0; 2235 } 2236 2237 static int amdgpu_device_enable_mgpu_fan_boost(void) 2238 { 2239 struct amdgpu_gpu_instance *gpu_ins; 2240 struct amdgpu_device *adev; 2241 int i, ret = 0; 2242 2243 mutex_lock(&mgpu_info.mutex); 2244 2245 /* 2246 * MGPU fan boost feature should be enabled 2247 * only when there are two or more dGPUs in 2248 * the system 2249 */ 2250 if (mgpu_info.num_dgpu < 2) 2251 goto out; 2252 2253 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2254 gpu_ins = &(mgpu_info.gpu_ins[i]); 2255 adev = gpu_ins->adev; 2256 if (!(adev->flags & AMD_IS_APU) && 2257 !gpu_ins->mgpu_fan_enabled) { 2258 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2259 if (ret) 2260 break; 2261 2262 gpu_ins->mgpu_fan_enabled = 1; 2263 } 2264 } 2265 2266 out: 2267 mutex_unlock(&mgpu_info.mutex); 2268 2269 return ret; 2270 } 2271 2272 /** 2273 * amdgpu_device_ip_late_init - run late init for hardware IPs 2274 * 2275 * @adev: amdgpu_device pointer 2276 * 2277 * Late initialization pass for hardware IPs. The list of all the hardware 2278 * IPs that make up the asic is walked and the late_init callbacks are run. 2279 * late_init covers any special initialization that an IP requires 2280 * after all of the IPs have been initialized or something that needs to happen 2281 * late in the init process. 2282 * Returns 0 on success, negative error code on failure. 2283 */ 2284 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2285 { 2286 struct amdgpu_gpu_instance *gpu_instance; 2287 int i = 0, r; 2288 2289 for (i = 0; i < adev->num_ip_blocks; i++) { 2290 if (!adev->ip_blocks[i].status.hw) 2291 continue; 2292 if (adev->ip_blocks[i].version->funcs->late_init) { 2293 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2294 if (r) { 2295 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2296 adev->ip_blocks[i].version->funcs->name, r); 2297 return r; 2298 } 2299 } 2300 adev->ip_blocks[i].status.late_initialized = true; 2301 } 2302 2303 amdgpu_ras_set_error_query_ready(adev, true); 2304 2305 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2306 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2307 2308 amdgpu_device_fill_reset_magic(adev); 2309 2310 r = amdgpu_device_enable_mgpu_fan_boost(); 2311 if (r) 2312 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2313 2314 2315 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2316 mutex_lock(&mgpu_info.mutex); 2317 2318 /* 2319 * Reset device p-state to low as this was booted with high. 2320 * 2321 * This should be performed only after all devices from the same 2322 * hive get initialized. 2323 * 2324 * However, the number of devices in the hive is not known in advance, 2325 * as it is counted one by one during device initialization.
2326 * 2327 * So, we wait for all XGMI interlinked devices initialized. 2328 * This may bring some delays as those devices may come from 2329 * different hives. But that should be OK. 2330 */ 2331 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2332 for (i = 0; i < mgpu_info.num_gpu; i++) { 2333 gpu_instance = &(mgpu_info.gpu_ins[i]); 2334 if (gpu_instance->adev->flags & AMD_IS_APU) 2335 continue; 2336 2337 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2338 AMDGPU_XGMI_PSTATE_MIN); 2339 if (r) { 2340 DRM_ERROR("pstate setting failed (%d).\n", r); 2341 break; 2342 } 2343 } 2344 } 2345 2346 mutex_unlock(&mgpu_info.mutex); 2347 } 2348 2349 return 0; 2350 } 2351 2352 /** 2353 * amdgpu_device_ip_fini - run fini for hardware IPs 2354 * 2355 * @adev: amdgpu_device pointer 2356 * 2357 * Main teardown pass for hardware IPs. The list of all the hardware 2358 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2359 * are run. hw_fini tears down the hardware associated with each IP 2360 * and sw_fini tears down any software state associated with each IP. 2361 * Returns 0 on success, negative error code on failure. 2362 */ 2363 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2364 { 2365 int i, r; 2366 2367 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2368 amdgpu_virt_release_ras_err_handler_data(adev); 2369 2370 amdgpu_ras_pre_fini(adev); 2371 2372 if (adev->gmc.xgmi.num_physical_nodes > 1) 2373 amdgpu_xgmi_remove_device(adev); 2374 2375 amdgpu_amdkfd_device_fini(adev); 2376 2377 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2378 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2379 2380 /* need to disable SMC first */ 2381 for (i = 0; i < adev->num_ip_blocks; i++) { 2382 if (!adev->ip_blocks[i].status.hw) 2383 continue; 2384 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2385 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2386 /* XXX handle errors */ 2387 if (r) { 2388 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2389 adev->ip_blocks[i].version->funcs->name, r); 2390 } 2391 adev->ip_blocks[i].status.hw = false; 2392 break; 2393 } 2394 } 2395 2396 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2397 if (!adev->ip_blocks[i].status.hw) 2398 continue; 2399 2400 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2401 /* XXX handle errors */ 2402 if (r) { 2403 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2404 adev->ip_blocks[i].version->funcs->name, r); 2405 } 2406 2407 adev->ip_blocks[i].status.hw = false; 2408 } 2409 2410 2411 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2412 if (!adev->ip_blocks[i].status.sw) 2413 continue; 2414 2415 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2416 amdgpu_ucode_free_bo(adev); 2417 amdgpu_free_static_csa(&adev->virt.csa_obj); 2418 amdgpu_device_wb_fini(adev); 2419 amdgpu_device_vram_scratch_fini(adev); 2420 amdgpu_ib_pool_fini(adev); 2421 } 2422 2423 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2424 /* XXX handle errors */ 2425 if (r) { 2426 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2427 adev->ip_blocks[i].version->funcs->name, r); 2428 } 2429 adev->ip_blocks[i].status.sw = false; 2430 adev->ip_blocks[i].status.valid = false; 2431 } 2432 2433 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2434 if (!adev->ip_blocks[i].status.late_initialized) 2435 continue; 2436 if (adev->ip_blocks[i].version->funcs->late_fini) 2437 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2438 
adev->ip_blocks[i].status.late_initialized = false; 2439 } 2440 2441 amdgpu_ras_fini(adev); 2442 2443 if (amdgpu_sriov_vf(adev)) 2444 if (amdgpu_virt_release_full_gpu(adev, false)) 2445 DRM_ERROR("failed to release exclusive mode on fini\n"); 2446 2447 return 0; 2448 } 2449 2450 /** 2451 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2452 * 2453 * @work: work_struct. 2454 */ 2455 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2456 { 2457 struct amdgpu_device *adev = 2458 container_of(work, struct amdgpu_device, delayed_init_work.work); 2459 int r; 2460 2461 r = amdgpu_ib_ring_tests(adev); 2462 if (r) 2463 DRM_ERROR("ib ring test failed (%d).\n", r); 2464 } 2465 2466 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2467 { 2468 struct amdgpu_device *adev = 2469 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2470 2471 mutex_lock(&adev->gfx.gfx_off_mutex); 2472 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { 2473 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2474 adev->gfx.gfx_off_state = true; 2475 } 2476 mutex_unlock(&adev->gfx.gfx_off_mutex); 2477 } 2478 2479 /** 2480 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2481 * 2482 * @adev: amdgpu_device pointer 2483 * 2484 * Main suspend function for hardware IPs. The list of all the hardware 2485 * IPs that make up the asic is walked, clockgating is disabled and the 2486 * suspend callbacks are run. suspend puts the hardware and software state 2487 * in each IP into a state suitable for suspend. 2488 * Returns 0 on success, negative error code on failure. 2489 */ 2490 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2491 { 2492 int i, r; 2493 2494 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2495 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2496 2497 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2498 if (!adev->ip_blocks[i].status.valid) 2499 continue; 2500 2501 /* displays are handled separately */ 2502 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2503 continue; 2504 2505 /* XXX handle errors */ 2506 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2507 /* XXX handle errors */ 2508 if (r) { 2509 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2510 adev->ip_blocks[i].version->funcs->name, r); 2511 return r; 2512 } 2513 2514 adev->ip_blocks[i].status.hw = false; 2515 } 2516 2517 return 0; 2518 } 2519 2520 /** 2521 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2522 * 2523 * @adev: amdgpu_device pointer 2524 * 2525 * Main suspend function for hardware IPs. The list of all the hardware 2526 * IPs that make up the asic is walked, clockgating is disabled and the 2527 * suspend callbacks are run. suspend puts the hardware and software state 2528 * in each IP into a state suitable for suspend. 2529 * Returns 0 on success, negative error code on failure. 
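 * Note: display (DCE) blocks are suspended in phase 1; this phase covers all
 * remaining blocks and, on bare metal, also moves the SMC into the
 * appropriate mp1 state.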
2530 */ 2531 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2532 { 2533 int i, r; 2534 2535 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2536 if (!adev->ip_blocks[i].status.valid) 2537 continue; 2538 /* displays are handled in phase1 */ 2539 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2540 continue; 2541 /* PSP lost connection when err_event_athub occurs */ 2542 if (amdgpu_ras_intr_triggered() && 2543 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2544 adev->ip_blocks[i].status.hw = false; 2545 continue; 2546 } 2547 /* XXX handle errors */ 2548 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2549 /* XXX handle errors */ 2550 if (r) { 2551 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2552 adev->ip_blocks[i].version->funcs->name, r); 2553 } 2554 adev->ip_blocks[i].status.hw = false; 2555 /* handle putting the SMC in the appropriate state */ 2556 if(!amdgpu_sriov_vf(adev)){ 2557 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2558 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 2559 if (r) { 2560 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 2561 adev->mp1_state, r); 2562 return r; 2563 } 2564 } 2565 } 2566 adev->ip_blocks[i].status.hw = false; 2567 } 2568 2569 return 0; 2570 } 2571 2572 /** 2573 * amdgpu_device_ip_suspend - run suspend for hardware IPs 2574 * 2575 * @adev: amdgpu_device pointer 2576 * 2577 * Main suspend function for hardware IPs. The list of all the hardware 2578 * IPs that make up the asic is walked, clockgating is disabled and the 2579 * suspend callbacks are run. suspend puts the hardware and software state 2580 * in each IP into a state suitable for suspend. 2581 * Returns 0 on success, negative error code on failure. 2582 */ 2583 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 2584 { 2585 int r; 2586 2587 if (amdgpu_sriov_vf(adev)) 2588 amdgpu_virt_request_full_gpu(adev, false); 2589 2590 r = amdgpu_device_ip_suspend_phase1(adev); 2591 if (r) 2592 return r; 2593 r = amdgpu_device_ip_suspend_phase2(adev); 2594 2595 if (amdgpu_sriov_vf(adev)) 2596 amdgpu_virt_release_full_gpu(adev, false); 2597 2598 return r; 2599 } 2600 2601 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 2602 { 2603 int i, r; 2604 2605 static enum amd_ip_block_type ip_order[] = { 2606 AMD_IP_BLOCK_TYPE_GMC, 2607 AMD_IP_BLOCK_TYPE_COMMON, 2608 AMD_IP_BLOCK_TYPE_PSP, 2609 AMD_IP_BLOCK_TYPE_IH, 2610 }; 2611 2612 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2613 int j; 2614 struct amdgpu_ip_block *block; 2615 2616 block = &adev->ip_blocks[i]; 2617 block->status.hw = false; 2618 2619 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 2620 2621 if (block->version->type != ip_order[j] || 2622 !block->status.valid) 2623 continue; 2624 2625 r = block->version->funcs->hw_init(adev); 2626 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2627 if (r) 2628 return r; 2629 block->status.hw = true; 2630 } 2631 } 2632 2633 return 0; 2634 } 2635 2636 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 2637 { 2638 int i, r; 2639 2640 static enum amd_ip_block_type ip_order[] = { 2641 AMD_IP_BLOCK_TYPE_SMC, 2642 AMD_IP_BLOCK_TYPE_DCE, 2643 AMD_IP_BLOCK_TYPE_GFX, 2644 AMD_IP_BLOCK_TYPE_SDMA, 2645 AMD_IP_BLOCK_TYPE_UVD, 2646 AMD_IP_BLOCK_TYPE_VCE, 2647 AMD_IP_BLOCK_TYPE_VCN 2648 }; 2649 2650 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2651 int j; 2652 struct amdgpu_ip_block *block; 2653 2654 for (j = 0; j < adev->num_ip_blocks; j++) { 
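/* find the device IP block that matches this ip_order entry and bring it back up */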
2655 block = &adev->ip_blocks[j]; 2656 2657 if (block->version->type != ip_order[i] || 2658 !block->status.valid || 2659 block->status.hw) 2660 continue; 2661 2662 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 2663 r = block->version->funcs->resume(adev); 2664 else 2665 r = block->version->funcs->hw_init(adev); 2666 2667 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2668 if (r) 2669 return r; 2670 block->status.hw = true; 2671 } 2672 } 2673 2674 return 0; 2675 } 2676 2677 /** 2678 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 2679 * 2680 * @adev: amdgpu_device pointer 2681 * 2682 * First resume function for hardware IPs. The list of all the hardware 2683 * IPs that make up the asic is walked and the resume callbacks are run for 2684 * COMMON, GMC, and IH. resume puts the hardware into a functional state 2685 * after a suspend and updates the software state as necessary. This 2686 * function is also used for restoring the GPU after a GPU reset. 2687 * Returns 0 on success, negative error code on failure. 2688 */ 2689 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 2690 { 2691 int i, r; 2692 2693 for (i = 0; i < adev->num_ip_blocks; i++) { 2694 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2695 continue; 2696 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2697 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2698 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2699 2700 r = adev->ip_blocks[i].version->funcs->resume(adev); 2701 if (r) { 2702 DRM_ERROR("resume of IP block <%s> failed %d\n", 2703 adev->ip_blocks[i].version->funcs->name, r); 2704 return r; 2705 } 2706 adev->ip_blocks[i].status.hw = true; 2707 } 2708 } 2709 2710 return 0; 2711 } 2712 2713 /** 2714 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 2715 * 2716 * @adev: amdgpu_device pointer 2717 * 2718 * First resume function for hardware IPs. The list of all the hardware 2719 * IPs that make up the asic is walked and the resume callbacks are run for 2720 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 2721 * functional state after a suspend and updates the software state as 2722 * necessary. This function is also used for restoring the GPU after a GPU 2723 * reset. 2724 * Returns 0 on success, negative error code on failure. 2725 */ 2726 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 2727 { 2728 int i, r; 2729 2730 for (i = 0; i < adev->num_ip_blocks; i++) { 2731 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2732 continue; 2733 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2734 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2735 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 2736 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 2737 continue; 2738 r = adev->ip_blocks[i].version->funcs->resume(adev); 2739 if (r) { 2740 DRM_ERROR("resume of IP block <%s> failed %d\n", 2741 adev->ip_blocks[i].version->funcs->name, r); 2742 return r; 2743 } 2744 adev->ip_blocks[i].status.hw = true; 2745 } 2746 2747 return 0; 2748 } 2749 2750 /** 2751 * amdgpu_device_ip_resume - run resume for hardware IPs 2752 * 2753 * @adev: amdgpu_device pointer 2754 * 2755 * Main resume function for hardware IPs. 
The hardware IPs 2756 * are split into two resume functions because they are 2757 * also used in recovering from a GPU reset and some additional 2758 * steps need to be taken between them. In this case (S3/S4) they are 2759 * run sequentially. 2760 * Returns 0 on success, negative error code on failure. 2761 */ 2762 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 2763 { 2764 int r; 2765 2766 r = amdgpu_device_ip_resume_phase1(adev); 2767 if (r) 2768 return r; 2769 2770 r = amdgpu_device_fw_loading(adev); 2771 if (r) 2772 return r; 2773 2774 r = amdgpu_device_ip_resume_phase2(adev); 2775 2776 return r; 2777 } 2778 2779 /** 2780 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 2781 * 2782 * @adev: amdgpu_device pointer 2783 * 2784 * Query the VBIOS data tables to determine if the board supports SR-IOV. 2785 */ 2786 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 2787 { 2788 if (amdgpu_sriov_vf(adev)) { 2789 if (adev->is_atom_fw) { 2790 if (amdgpu_atomfirmware_gpu_supports_virtualization(adev)) 2791 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 2792 } else { 2793 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 2794 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 2795 } 2796 2797 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 2798 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 2799 } 2800 } 2801 2802 /** 2803 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 2804 * 2805 * @asic_type: AMD asic type 2806 * 2807 * Check if there is DC (new modesetting infrastructure) support for an asic. 2808 * Returns true if DC has support, false if not. 2809 */ 2810 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 2811 { 2812 switch (asic_type) { 2813 #if defined(CONFIG_DRM_AMD_DC) 2814 #if defined(CONFIG_DRM_AMD_DC_SI) 2815 case CHIP_TAHITI: 2816 case CHIP_PITCAIRN: 2817 case CHIP_VERDE: 2818 case CHIP_OLAND: 2819 #endif 2820 case CHIP_BONAIRE: 2821 case CHIP_KAVERI: 2822 case CHIP_KABINI: 2823 case CHIP_MULLINS: 2824 /* 2825 * We have systems in the wild with these ASICs that require 2826 * LVDS and VGA support which is not supported with DC. 2827 * 2828 * Fallback to the non-DC driver here by default so as not to 2829 * cause regressions.
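 * (amdgpu.dc=1 can still be used to opt in to DC on these parts, which is
 * why the check below is amdgpu_dc > 0 rather than amdgpu_dc != 0.)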
2830 */ 2831 return amdgpu_dc > 0; 2832 case CHIP_HAWAII: 2833 case CHIP_CARRIZO: 2834 case CHIP_STONEY: 2835 case CHIP_POLARIS10: 2836 case CHIP_POLARIS11: 2837 case CHIP_POLARIS12: 2838 case CHIP_VEGAM: 2839 case CHIP_TONGA: 2840 case CHIP_FIJI: 2841 case CHIP_VEGA10: 2842 case CHIP_VEGA12: 2843 case CHIP_VEGA20: 2844 #if defined(CONFIG_DRM_AMD_DC_DCN) 2845 case CHIP_RAVEN: 2846 case CHIP_NAVI10: 2847 case CHIP_NAVI14: 2848 case CHIP_NAVI12: 2849 case CHIP_RENOIR: 2850 #endif 2851 #if defined(CONFIG_DRM_AMD_DC_DCN3_0) 2852 case CHIP_SIENNA_CICHLID: 2853 case CHIP_NAVY_FLOUNDER: 2854 #endif 2855 return amdgpu_dc != 0; 2856 #endif 2857 default: 2858 if (amdgpu_dc > 0) 2859 DRM_INFO("Display Core has been requested via kernel parameter " 2860 "but isn't supported by ASIC, ignoring\n"); 2861 return false; 2862 } 2863 } 2864 2865 /** 2866 * amdgpu_device_has_dc_support - check if dc is supported 2867 * 2868 * @adev: amdgpu_device_pointer 2869 * 2870 * Returns true for supported, false for not supported 2871 */ 2872 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 2873 { 2874 if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display) 2875 return false; 2876 2877 return amdgpu_device_asic_has_dc_support(adev->asic_type); 2878 } 2879 2880 2881 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 2882 { 2883 struct amdgpu_device *adev = 2884 container_of(__work, struct amdgpu_device, xgmi_reset_work); 2885 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2886 2887 /* It's a bug to not have a hive within this function */ 2888 if (WARN_ON(!hive)) 2889 return; 2890 2891 /* 2892 * Use task barrier to synchronize all xgmi reset works across the 2893 * hive. task_barrier_enter and task_barrier_exit will block 2894 * until all the threads running the xgmi reset works reach 2895 * those points. task_barrier_full will do both blocks. 2896 */ 2897 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 2898 2899 task_barrier_enter(&hive->tb); 2900 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 2901 2902 if (adev->asic_reset_res) 2903 goto fail; 2904 2905 task_barrier_exit(&hive->tb); 2906 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 2907 2908 if (adev->asic_reset_res) 2909 goto fail; 2910 2911 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count) 2912 adev->mmhub.funcs->reset_ras_error_count(adev); 2913 } else { 2914 2915 task_barrier_full(&hive->tb); 2916 adev->asic_reset_res = amdgpu_asic_reset(adev); 2917 } 2918 2919 fail: 2920 if (adev->asic_reset_res) 2921 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 2922 adev->asic_reset_res, adev_to_drm(adev)->unique); 2923 amdgpu_put_xgmi_hive(hive); 2924 } 2925 2926 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 2927 { 2928 char *input = amdgpu_lockup_timeout; 2929 char *timeout_setting = NULL; 2930 int index = 0; 2931 long timeout; 2932 int ret = 0; 2933 2934 /* 2935 * By default timeout for non compute jobs is 10000. 2936 * And there is no timeout enforced on compute jobs. 2937 * In SR-IOV or passthrough mode, timeout for compute 2938 * jobs are 60000 by default. 
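 * For example, amdgpu.lockup_timeout=10000,60000,10000,10000 sets the gfx,
 * compute, sdma and video timeouts in that order; a value of 0 keeps the
 * default for that queue type and a negative value disables the timeout.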
2939 */ 2940 adev->gfx_timeout = msecs_to_jiffies(10000); 2941 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 2942 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 2943 adev->compute_timeout = msecs_to_jiffies(60000); 2944 else 2945 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; 2946 2947 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 2948 while ((timeout_setting = strsep(&input, ",")) && 2949 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 2950 ret = kstrtol(timeout_setting, 0, &timeout); 2951 if (ret) 2952 return ret; 2953 2954 if (timeout == 0) { 2955 index++; 2956 continue; 2957 } else if (timeout < 0) { 2958 timeout = MAX_SCHEDULE_TIMEOUT; 2959 } else { 2960 timeout = msecs_to_jiffies(timeout); 2961 } 2962 2963 switch (index++) { 2964 case 0: 2965 adev->gfx_timeout = timeout; 2966 break; 2967 case 1: 2968 adev->compute_timeout = timeout; 2969 break; 2970 case 2: 2971 adev->sdma_timeout = timeout; 2972 break; 2973 case 3: 2974 adev->video_timeout = timeout; 2975 break; 2976 default: 2977 break; 2978 } 2979 } 2980 /* 2981 * There is only one value specified and 2982 * it should apply to all non-compute jobs. 2983 */ 2984 if (index == 1) { 2985 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 2986 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 2987 adev->compute_timeout = adev->gfx_timeout; 2988 } 2989 } 2990 2991 return ret; 2992 } 2993 2994 static const struct attribute *amdgpu_dev_attributes[] = { 2995 &dev_attr_product_name.attr, 2996 &dev_attr_product_number.attr, 2997 &dev_attr_serial_number.attr, 2998 &dev_attr_pcie_replay_count.attr, 2999 NULL 3000 }; 3001 3002 /** 3003 * amdgpu_device_init - initialize the driver 3004 * 3005 * @adev: amdgpu_device pointer 3006 * @flags: driver flags 3007 * 3008 * Initializes the driver info and hw (all asics). 3009 * Returns 0 for success or an error on failure. 3010 * Called at driver startup. 
3011 */ 3012 int amdgpu_device_init(struct amdgpu_device *adev, 3013 uint32_t flags) 3014 { 3015 struct drm_device *ddev = adev_to_drm(adev); 3016 struct pci_dev *pdev = adev->pdev; 3017 int r, i; 3018 bool boco = false; 3019 u32 max_MBps; 3020 3021 adev->shutdown = false; 3022 adev->flags = flags; 3023 3024 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3025 adev->asic_type = amdgpu_force_asic_type; 3026 else 3027 adev->asic_type = flags & AMD_ASIC_MASK; 3028 3029 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3030 if (amdgpu_emu_mode == 1) 3031 adev->usec_timeout *= 10; 3032 adev->gmc.gart_size = 512 * 1024 * 1024; 3033 adev->accel_working = false; 3034 adev->num_rings = 0; 3035 adev->mman.buffer_funcs = NULL; 3036 adev->mman.buffer_funcs_ring = NULL; 3037 adev->vm_manager.vm_pte_funcs = NULL; 3038 adev->vm_manager.vm_pte_num_scheds = 0; 3039 adev->gmc.gmc_funcs = NULL; 3040 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3041 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3042 3043 adev->smc_rreg = &amdgpu_invalid_rreg; 3044 adev->smc_wreg = &amdgpu_invalid_wreg; 3045 adev->pcie_rreg = &amdgpu_invalid_rreg; 3046 adev->pcie_wreg = &amdgpu_invalid_wreg; 3047 adev->pciep_rreg = &amdgpu_invalid_rreg; 3048 adev->pciep_wreg = &amdgpu_invalid_wreg; 3049 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3050 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3051 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3052 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3053 adev->didt_rreg = &amdgpu_invalid_rreg; 3054 adev->didt_wreg = &amdgpu_invalid_wreg; 3055 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3056 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3057 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3058 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3059 3060 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3061 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3062 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3063 3064 /* mutex initialization are all done here so we 3065 * can recall function without having locking issues */ 3066 atomic_set(&adev->irq.ih.lock, 0); 3067 mutex_init(&adev->firmware.mutex); 3068 mutex_init(&adev->pm.mutex); 3069 mutex_init(&adev->gfx.gpu_clock_mutex); 3070 mutex_init(&adev->srbm_mutex); 3071 mutex_init(&adev->gfx.pipe_reserve_mutex); 3072 mutex_init(&adev->gfx.gfx_off_mutex); 3073 mutex_init(&adev->grbm_idx_mutex); 3074 mutex_init(&adev->mn_lock); 3075 mutex_init(&adev->virt.vf_errors.lock); 3076 hash_init(adev->mn_hash); 3077 atomic_set(&adev->in_gpu_reset, 0); 3078 init_rwsem(&adev->reset_sem); 3079 mutex_init(&adev->psp.mutex); 3080 mutex_init(&adev->notifier_lock); 3081 3082 r = amdgpu_device_check_arguments(adev); 3083 if (r) 3084 return r; 3085 3086 spin_lock_init(&adev->mmio_idx_lock); 3087 spin_lock_init(&adev->smc_idx_lock); 3088 spin_lock_init(&adev->pcie_idx_lock); 3089 spin_lock_init(&adev->uvd_ctx_idx_lock); 3090 spin_lock_init(&adev->didt_idx_lock); 3091 spin_lock_init(&adev->gc_cac_idx_lock); 3092 spin_lock_init(&adev->se_cac_idx_lock); 3093 spin_lock_init(&adev->audio_endpt_idx_lock); 3094 spin_lock_init(&adev->mm_stats.lock); 3095 3096 INIT_LIST_HEAD(&adev->shadow_list); 3097 mutex_init(&adev->shadow_list_lock); 3098 3099 INIT_DELAYED_WORK(&adev->delayed_init_work, 3100 amdgpu_device_delayed_init_work_handler); 3101 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3102 amdgpu_device_delay_enable_gfx_off); 3103 3104 
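/* xgmi_reset_work runs this device's part of an XGMI hive reset; the hive's task barrier (see amdgpu_device_xgmi_reset_func) keeps the devices in sync */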
INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3105 3106 adev->gfx.gfx_off_req_count = 1; 3107 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3108 3109 atomic_set(&adev->throttling_logging_enabled, 1); 3110 /* 3111 * If throttling continues, logging will be performed every minute 3112 * to avoid log flooding. "-1" is subtracted since the thermal 3113 * throttling interrupt comes every second. Thus, the total logging 3114 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 3115 * for throttling interrupt) = 60 seconds. 3116 */ 3117 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3118 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3119 3120 /* Registers mapping */ 3121 /* TODO: block userspace mapping of io register */ 3122 if (adev->asic_type >= CHIP_BONAIRE) { 3123 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3124 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3125 } else { 3126 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3127 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3128 } 3129 3130 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3131 if (adev->rmmio == NULL) { 3132 return -ENOMEM; 3133 } 3134 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3135 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3136 3137 /* io port mapping */ 3138 for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { 3139 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) { 3140 adev->rio_mem_size = pci_resource_len(adev->pdev, i); 3141 adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size); 3142 break; 3143 } 3144 } 3145 if (adev->rio_mem == NULL) 3146 DRM_INFO("PCI I/O BAR is not found.\n"); 3147 3148 /* enable PCIE atomic ops */ 3149 r = pci_enable_atomic_ops_to_root(adev->pdev, 3150 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3151 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3152 if (r) { 3153 adev->have_atomics_support = false; 3154 DRM_INFO("PCIE atomic ops is not supported\n"); 3155 } else { 3156 adev->have_atomics_support = true; 3157 } 3158 3159 amdgpu_device_get_pcie_info(adev); 3160 3161 if (amdgpu_mcbp) 3162 DRM_INFO("MCBP is enabled\n"); 3163 3164 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) 3165 adev->enable_mes = true; 3166 3167 /* detect hw virtualization here */ 3168 amdgpu_detect_virtualization(adev); 3169 3170 r = amdgpu_device_get_job_timeout_settings(adev); 3171 if (r) { 3172 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3173 return r; 3174 } 3175 3176 /* early init functions */ 3177 r = amdgpu_device_ip_early_init(adev); 3178 if (r) 3179 return r; 3180 3181 /* doorbell bar mapping and doorbell index init*/ 3182 amdgpu_device_doorbell_init(adev); 3183 3184 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3185 /* this will fail for cards that aren't VGA class devices, just 3186 * ignore it */ 3187 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode); 3188 3189 if (amdgpu_device_supports_boco(ddev)) 3190 boco = true; 3191 if (amdgpu_has_atpx() && 3192 (amdgpu_is_atpx_hybrid() || 3193 amdgpu_has_atpx_dgpu_power_cntl()) && 3194 !pci_is_thunderbolt_attached(adev->pdev)) 3195 vga_switcheroo_register_client(adev->pdev, 3196 &amdgpu_switcheroo_ops, boco); 3197 if (boco) 3198 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3199 3200 if (amdgpu_emu_mode == 1) { 3201 /* post the asic on emulation mode */ 3202 emu_soc_asic_init(adev); 3203 goto fence_driver_init; 3204 }
3205 3206 /* detect if we are with an SRIOV vbios */ 3207 amdgpu_device_detect_sriov_bios(adev); 3208 3209 /* check if we need to reset the asic 3210 * E.g., driver was not cleanly unloaded previously, etc. 3211 */ 3212 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3213 r = amdgpu_asic_reset(adev); 3214 if (r) { 3215 dev_err(adev->dev, "asic reset on init failed\n"); 3216 goto failed; 3217 } 3218 } 3219 3220 /* Post card if necessary */ 3221 if (amdgpu_device_need_post(adev)) { 3222 if (!adev->bios) { 3223 dev_err(adev->dev, "no vBIOS found\n"); 3224 r = -EINVAL; 3225 goto failed; 3226 } 3227 DRM_INFO("GPU posting now...\n"); 3228 r = amdgpu_device_asic_init(adev); 3229 if (r) { 3230 dev_err(adev->dev, "gpu post error!\n"); 3231 goto failed; 3232 } 3233 } 3234 3235 if (adev->is_atom_fw) { 3236 /* Initialize clocks */ 3237 r = amdgpu_atomfirmware_get_clock_info(adev); 3238 if (r) { 3239 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3240 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3241 goto failed; 3242 } 3243 } else { 3244 /* Initialize clocks */ 3245 r = amdgpu_atombios_get_clock_info(adev); 3246 if (r) { 3247 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3248 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3249 goto failed; 3250 } 3251 /* init i2c buses */ 3252 if (!amdgpu_device_has_dc_support(adev)) 3253 amdgpu_atombios_i2c_init(adev); 3254 } 3255 3256 fence_driver_init: 3257 /* Fence driver */ 3258 r = amdgpu_fence_driver_init(adev); 3259 if (r) { 3260 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n"); 3261 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3262 goto failed; 3263 } 3264 3265 /* init the mode config */ 3266 drm_mode_config_init(adev_to_drm(adev)); 3267 3268 r = amdgpu_device_ip_init(adev); 3269 if (r) { 3270 /* failed in exclusive mode due to timeout */ 3271 if (amdgpu_sriov_vf(adev) && 3272 !amdgpu_sriov_runtime(adev) && 3273 amdgpu_virt_mmio_blocked(adev) && 3274 !amdgpu_virt_wait_reset(adev)) { 3275 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3276 /* Don't send request since VF is inactive. */ 3277 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3278 adev->virt.ops = NULL; 3279 r = -EAGAIN; 3280 goto failed; 3281 } 3282 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3283 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3284 goto failed; 3285 } 3286 3287 dev_info(adev->dev, 3288 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3289 adev->gfx.config.max_shader_engines, 3290 adev->gfx.config.max_sh_per_se, 3291 adev->gfx.config.max_cu_per_sh, 3292 adev->gfx.cu_info.number); 3293 3294 adev->accel_working = true; 3295 3296 amdgpu_vm_check_compute_bug(adev); 3297 3298 /* Initialize the buffer migration limit. */ 3299 if (amdgpu_moverate >= 0) 3300 max_MBps = amdgpu_moverate; 3301 else 3302 max_MBps = 8; /* Allow 8 MB/s. */ 3303 /* Get a log2 for easy divisions. 
*/ 3304 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3305 3306 amdgpu_fbdev_init(adev); 3307 3308 r = amdgpu_pm_sysfs_init(adev); 3309 if (r) { 3310 adev->pm_sysfs_en = false; 3311 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3312 } else 3313 adev->pm_sysfs_en = true; 3314 3315 r = amdgpu_ucode_sysfs_init(adev); 3316 if (r) { 3317 adev->ucode_sysfs_en = false; 3318 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3319 } else 3320 adev->ucode_sysfs_en = true; 3321 3322 if ((amdgpu_testing & 1)) { 3323 if (adev->accel_working) 3324 amdgpu_test_moves(adev); 3325 else 3326 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n"); 3327 } 3328 if (amdgpu_benchmarking) { 3329 if (adev->accel_working) 3330 amdgpu_benchmark(adev, amdgpu_benchmarking); 3331 else 3332 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); 3333 } 3334 3335 /* 3336 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3337 * Otherwise the mgpu fan boost feature will be skipped because this 3338 * gpu instance would not yet be counted. 3339 */ 3340 amdgpu_register_gpu_instance(adev); 3341 3342 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3343 * explicit gating rather than handling it automatically. 3344 */ 3345 r = amdgpu_device_ip_late_init(adev); 3346 if (r) { 3347 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3348 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3349 goto failed; 3350 } 3351 3352 /* must succeed. */ 3353 amdgpu_ras_resume(adev); 3354 3355 queue_delayed_work(system_wq, &adev->delayed_init_work, 3356 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3357 3358 if (amdgpu_sriov_vf(adev)) 3359 flush_delayed_work(&adev->delayed_init_work); 3360 3361 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3362 if (r) { 3363 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3364 return r; 3365 } 3366 3367 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3368 r = amdgpu_pmu_init(adev); 3369 if (r) 3370 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3371 3372 return 0; 3373 3374 failed: 3375 amdgpu_vf_error_trans_all(adev); 3376 if (boco) 3377 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3378 3379 return r; 3380 } 3381 3382 /** 3383 * amdgpu_device_fini - tear down the driver 3384 * 3385 * @adev: amdgpu_device pointer 3386 * 3387 * Tear down the driver info (all asics). 3388 * Called at driver shutdown.
3389 */ 3390 void amdgpu_device_fini(struct amdgpu_device *adev) 3391 { 3392 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3393 flush_delayed_work(&adev->delayed_init_work); 3394 adev->shutdown = true; 3395 3396 /* make sure IB test finished before entering exclusive mode 3397 * to avoid preemption on IB test 3398 * */ 3399 if (amdgpu_sriov_vf(adev)) 3400 amdgpu_virt_request_full_gpu(adev, false); 3401 3402 /* disable all interrupts */ 3403 amdgpu_irq_disable_all(adev); 3404 if (adev->mode_info.mode_config_initialized){ 3405 if (!amdgpu_device_has_dc_support(adev)) 3406 drm_helper_force_disable_all(adev_to_drm(adev)); 3407 else 3408 drm_atomic_helper_shutdown(adev_to_drm(adev)); 3409 } 3410 amdgpu_fence_driver_fini(adev); 3411 if (adev->pm_sysfs_en) 3412 amdgpu_pm_sysfs_fini(adev); 3413 amdgpu_fbdev_fini(adev); 3414 amdgpu_device_ip_fini(adev); 3415 release_firmware(adev->firmware.gpu_info_fw); 3416 adev->firmware.gpu_info_fw = NULL; 3417 adev->accel_working = false; 3418 /* free i2c buses */ 3419 if (!amdgpu_device_has_dc_support(adev)) 3420 amdgpu_i2c_fini(adev); 3421 3422 if (amdgpu_emu_mode != 1) 3423 amdgpu_atombios_fini(adev); 3424 3425 kfree(adev->bios); 3426 adev->bios = NULL; 3427 if (amdgpu_has_atpx() && 3428 (amdgpu_is_atpx_hybrid() || 3429 amdgpu_has_atpx_dgpu_power_cntl()) && 3430 !pci_is_thunderbolt_attached(adev->pdev)) 3431 vga_switcheroo_unregister_client(adev->pdev); 3432 if (amdgpu_device_supports_boco(adev_to_drm(adev))) 3433 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3434 vga_client_register(adev->pdev, NULL, NULL, NULL); 3435 if (adev->rio_mem) 3436 pci_iounmap(adev->pdev, adev->rio_mem); 3437 adev->rio_mem = NULL; 3438 iounmap(adev->rmmio); 3439 adev->rmmio = NULL; 3440 amdgpu_device_doorbell_fini(adev); 3441 3442 if (adev->ucode_sysfs_en) 3443 amdgpu_ucode_sysfs_fini(adev); 3444 3445 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 3446 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3447 amdgpu_pmu_fini(adev); 3448 if (adev->mman.discovery_bin) 3449 amdgpu_discovery_fini(adev); 3450 } 3451 3452 3453 /* 3454 * Suspend & resume. 3455 */ 3456 /** 3457 * amdgpu_device_suspend - initiate device suspend 3458 * 3459 * @dev: drm dev pointer 3460 * @fbcon : notify the fbdev of suspend 3461 * 3462 * Puts the hw in the suspend state (all asics). 3463 * Returns 0 for success or an error on failure. 3464 * Called at driver suspend. 
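 * Suspend turns off the displays, unpins framebuffers and cursors, evicts
 * VRAM and runs the IP suspend callbacks in two phases.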
3465 */ 3466 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 3467 { 3468 struct amdgpu_device *adev; 3469 struct drm_crtc *crtc; 3470 struct drm_connector *connector; 3471 struct drm_connector_list_iter iter; 3472 int r; 3473 3474 adev = drm_to_adev(dev); 3475 3476 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3477 return 0; 3478 3479 adev->in_suspend = true; 3480 drm_kms_helper_poll_disable(dev); 3481 3482 if (fbcon) 3483 amdgpu_fbdev_set_suspend(adev, 1); 3484 3485 cancel_delayed_work_sync(&adev->delayed_init_work); 3486 3487 if (!amdgpu_device_has_dc_support(adev)) { 3488 /* turn off display hw */ 3489 drm_modeset_lock_all(dev); 3490 drm_connector_list_iter_begin(dev, &iter); 3491 drm_for_each_connector_iter(connector, &iter) 3492 drm_helper_connector_dpms(connector, 3493 DRM_MODE_DPMS_OFF); 3494 drm_connector_list_iter_end(&iter); 3495 drm_modeset_unlock_all(dev); 3496 /* unpin the front buffers and cursors */ 3497 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3498 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3499 struct drm_framebuffer *fb = crtc->primary->fb; 3500 struct amdgpu_bo *robj; 3501 3502 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3503 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3504 r = amdgpu_bo_reserve(aobj, true); 3505 if (r == 0) { 3506 amdgpu_bo_unpin(aobj); 3507 amdgpu_bo_unreserve(aobj); 3508 } 3509 } 3510 3511 if (fb == NULL || fb->obj[0] == NULL) { 3512 continue; 3513 } 3514 robj = gem_to_amdgpu_bo(fb->obj[0]); 3515 /* don't unpin kernel fb objects */ 3516 if (!amdgpu_fbdev_robj_is_fb(adev, robj)) { 3517 r = amdgpu_bo_reserve(robj, true); 3518 if (r == 0) { 3519 amdgpu_bo_unpin(robj); 3520 amdgpu_bo_unreserve(robj); 3521 } 3522 } 3523 } 3524 } 3525 3526 amdgpu_ras_suspend(adev); 3527 3528 r = amdgpu_device_ip_suspend_phase1(adev); 3529 3530 amdgpu_amdkfd_suspend(adev, !fbcon); 3531 3532 /* evict vram memory */ 3533 amdgpu_bo_evict_vram(adev); 3534 3535 amdgpu_fence_driver_suspend(adev); 3536 3537 r = amdgpu_device_ip_suspend_phase2(adev); 3538 3539 /* evict remaining vram memory 3540 * This second call to evict vram is to evict the gart page table 3541 * using the CPU. 3542 */ 3543 amdgpu_bo_evict_vram(adev); 3544 3545 return 0; 3546 } 3547 3548 /** 3549 * amdgpu_device_resume - initiate device resume 3550 * 3551 * @dev: drm dev pointer 3552 * @fbcon : notify the fbdev of resume 3553 * 3554 * Bring the hw back to operating state (all asics). 3555 * Returns 0 for success or an error on failure. 3556 * Called at driver resume. 
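 * Resume re-posts the asic if needed, resumes the hardware IPs, re-pins
 * cursor BOs, restores the display configuration and re-enables connector
 * polling.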
3557 */ 3558 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 3559 { 3560 struct drm_connector *connector; 3561 struct drm_connector_list_iter iter; 3562 struct amdgpu_device *adev = drm_to_adev(dev); 3563 struct drm_crtc *crtc; 3564 int r = 0; 3565 3566 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3567 return 0; 3568 3569 /* post card */ 3570 if (amdgpu_device_need_post(adev)) { 3571 r = amdgpu_device_asic_init(adev); 3572 if (r) 3573 dev_err(adev->dev, "amdgpu asic init failed\n"); 3574 } 3575 3576 r = amdgpu_device_ip_resume(adev); 3577 if (r) { 3578 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 3579 return r; 3580 } 3581 amdgpu_fence_driver_resume(adev); 3582 3583 3584 r = amdgpu_device_ip_late_init(adev); 3585 if (r) 3586 return r; 3587 3588 queue_delayed_work(system_wq, &adev->delayed_init_work, 3589 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3590 3591 if (!amdgpu_device_has_dc_support(adev)) { 3592 /* pin cursors */ 3593 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3594 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3595 3596 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3597 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3598 r = amdgpu_bo_reserve(aobj, true); 3599 if (r == 0) { 3600 r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM); 3601 if (r != 0) 3602 dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r); 3603 amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj); 3604 amdgpu_bo_unreserve(aobj); 3605 } 3606 } 3607 } 3608 } 3609 r = amdgpu_amdkfd_resume(adev, !fbcon); 3610 if (r) 3611 return r; 3612 3613 /* Make sure IB tests flushed */ 3614 flush_delayed_work(&adev->delayed_init_work); 3615 3616 /* blat the mode back in */ 3617 if (fbcon) { 3618 if (!amdgpu_device_has_dc_support(adev)) { 3619 /* pre DCE11 */ 3620 drm_helper_resume_force_mode(dev); 3621 3622 /* turn on display hw */ 3623 drm_modeset_lock_all(dev); 3624 3625 drm_connector_list_iter_begin(dev, &iter); 3626 drm_for_each_connector_iter(connector, &iter) 3627 drm_helper_connector_dpms(connector, 3628 DRM_MODE_DPMS_ON); 3629 drm_connector_list_iter_end(&iter); 3630 3631 drm_modeset_unlock_all(dev); 3632 } 3633 amdgpu_fbdev_set_suspend(adev, 0); 3634 } 3635 3636 drm_kms_helper_poll_enable(dev); 3637 3638 amdgpu_ras_resume(adev); 3639 3640 /* 3641 * Most of the connector probing functions try to acquire runtime pm 3642 * refs to ensure that the GPU is powered on when connector polling is 3643 * performed. Since we're calling this from a runtime PM callback, 3644 * trying to acquire rpm refs will cause us to deadlock. 3645 * 3646 * Since we're guaranteed to be holding the rpm lock, it's safe to 3647 * temporarily disable the rpm helpers so this doesn't deadlock us. 3648 */ 3649 #ifdef CONFIG_PM 3650 dev->dev->power.disable_depth++; 3651 #endif 3652 if (!amdgpu_device_has_dc_support(adev)) 3653 drm_helper_hpd_irq_event(dev); 3654 else 3655 drm_kms_helper_hotplug_event(dev); 3656 #ifdef CONFIG_PM 3657 dev->dev->power.disable_depth--; 3658 #endif 3659 adev->in_suspend = false; 3660 3661 return 0; 3662 } 3663 3664 /** 3665 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 3666 * 3667 * @adev: amdgpu_device pointer 3668 * 3669 * The list of all the hardware IPs that make up the asic is walked and 3670 * the check_soft_reset callbacks are run. check_soft_reset determines 3671 * if the asic is still hung or not. 3672 * Returns true if any of the IPs are still in a hung state, false if not. 
3673 */ 3674 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 3675 { 3676 int i; 3677 bool asic_hang = false; 3678 3679 if (amdgpu_sriov_vf(adev)) 3680 return true; 3681 3682 if (amdgpu_asic_need_full_reset(adev)) 3683 return true; 3684 3685 for (i = 0; i < adev->num_ip_blocks; i++) { 3686 if (!adev->ip_blocks[i].status.valid) 3687 continue; 3688 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 3689 adev->ip_blocks[i].status.hang = 3690 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 3691 if (adev->ip_blocks[i].status.hang) { 3692 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 3693 asic_hang = true; 3694 } 3695 } 3696 return asic_hang; 3697 } 3698 3699 /** 3700 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 3701 * 3702 * @adev: amdgpu_device pointer 3703 * 3704 * The list of all the hardware IPs that make up the asic is walked and the 3705 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 3706 * handles any IP specific hardware or software state changes that are 3707 * necessary for a soft reset to succeed. 3708 * Returns 0 on success, negative error code on failure. 3709 */ 3710 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 3711 { 3712 int i, r = 0; 3713 3714 for (i = 0; i < adev->num_ip_blocks; i++) { 3715 if (!adev->ip_blocks[i].status.valid) 3716 continue; 3717 if (adev->ip_blocks[i].status.hang && 3718 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 3719 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 3720 if (r) 3721 return r; 3722 } 3723 } 3724 3725 return 0; 3726 } 3727 3728 /** 3729 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 3730 * 3731 * @adev: amdgpu_device pointer 3732 * 3733 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 3734 * reset is necessary to recover. 3735 * Returns true if a full asic reset is required, false if not. 3736 */ 3737 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 3738 { 3739 int i; 3740 3741 if (amdgpu_asic_need_full_reset(adev)) 3742 return true; 3743 3744 for (i = 0; i < adev->num_ip_blocks; i++) { 3745 if (!adev->ip_blocks[i].status.valid) 3746 continue; 3747 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 3748 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 3749 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 3750 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 3751 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3752 if (adev->ip_blocks[i].status.hang) { 3753 dev_info(adev->dev, "Some block need full reset!\n"); 3754 return true; 3755 } 3756 } 3757 } 3758 return false; 3759 } 3760 3761 /** 3762 * amdgpu_device_ip_soft_reset - do a soft reset 3763 * 3764 * @adev: amdgpu_device pointer 3765 * 3766 * The list of all the hardware IPs that make up the asic is walked and the 3767 * soft_reset callbacks are run if the block is hung. soft_reset handles any 3768 * IP specific hardware or software state changes that are necessary to soft 3769 * reset the IP. 3770 * Returns 0 on success, negative error code on failure. 

/**
 * amdgpu_device_ip_soft_reset - do a soft reset
 *
 * @adev: amdgpu_device pointer
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * soft_reset callbacks are run if the block is hung. soft_reset handles any
 * IP specific hardware or software state changes that are necessary to soft
 * reset the IP.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
{
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].status.hang &&
		    adev->ip_blocks[i].version->funcs->soft_reset) {
			r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
			if (r)
				return r;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
 *
 * @adev: amdgpu_device pointer
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
 * handles any IP specific hardware or software state changes that are
 * necessary after the IP has been soft reset.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
{
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].status.hang &&
		    adev->ip_blocks[i].version->funcs->post_soft_reset)
			r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
		if (r)
			return r;
	}

	return 0;
}

/**
 * amdgpu_device_recover_vram - Recover some VRAM contents
 *
 * @adev: amdgpu_device pointer
 *
 * Restores the contents of VRAM buffers from the shadows in GTT. Used to
 * restore things like GPUVM page tables after a GPU reset where
 * the contents of VRAM might be lost.
 *
 * Returns:
 * 0 on success, negative error code on failure.
 */
static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
{
	struct dma_fence *fence = NULL, *next = NULL;
	struct amdgpu_bo *shadow;
	long r = 1, tmo;

	if (amdgpu_sriov_runtime(adev))
		tmo = msecs_to_jiffies(8000);
	else
		tmo = msecs_to_jiffies(100);

	dev_info(adev->dev, "recover vram bo from shadow start\n");
	mutex_lock(&adev->shadow_list_lock);
	list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {

		/* No need to recover an evicted BO */
		if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
		    shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
		    shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
			continue;

		r = amdgpu_bo_restore_shadow(shadow, &next);
		if (r)
			break;

		if (fence) {
			tmo = dma_fence_wait_timeout(fence, false, tmo);
			dma_fence_put(fence);
			fence = next;
			if (tmo == 0) {
				r = -ETIMEDOUT;
				break;
			} else if (tmo < 0) {
				r = tmo;
				break;
			}
		} else {
			fence = next;
		}
	}
	mutex_unlock(&adev->shadow_list_lock);

	if (fence)
		tmo = dma_fence_wait_timeout(fence, false, tmo);
	dma_fence_put(fence);

	if (r < 0 || tmo <= 0) {
		dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
		return -EIO;
	}

	dev_info(adev->dev, "recover vram bo from shadow done\n");
	return 0;
}
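
/*
 * Note on the loop above (added commentary): the shadow restore for buffer N
 * is issued before waiting on the fence of buffer N-1, so copies and waits
 * overlap.  Because dma_fence_wait_timeout() returns the remaining timeout,
 * tmo acts as an overall time budget for the whole recovery rather than a
 * per-buffer limit.
 */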

/**
 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
 *
 * @adev: amdgpu device pointer
 * @from_hypervisor: request from hypervisor
 *
 * Do a VF FLR and reinitialize the ASIC.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
				     bool from_hypervisor)
{
	int r;

	if (from_hypervisor)
		r = amdgpu_virt_request_full_gpu(adev, true);
	else
		r = amdgpu_virt_reset_gpu(adev);
	if (r)
		return r;

	amdgpu_amdkfd_pre_reset(adev);

	/* Resume IP prior to SMC */
	r = amdgpu_device_ip_reinit_early_sriov(adev);
	if (r)
		goto error;

	amdgpu_virt_init_data_exchange(adev);
	/* we need to recover the GART prior to running SMC/CP/SDMA resume */
	amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));

	r = amdgpu_device_fw_loading(adev);
	if (r)
		return r;

	/* now we are okay to resume SMC/CP/SDMA */
	r = amdgpu_device_ip_reinit_late_sriov(adev);
	if (r)
		goto error;

	amdgpu_irq_gpu_reset_resume_helper(adev);
	r = amdgpu_ib_ring_tests(adev);
	amdgpu_amdkfd_post_reset(adev);

error:
	amdgpu_virt_release_full_gpu(adev, true);
	if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
		amdgpu_inc_vram_lost(adev);
		r = amdgpu_device_recover_vram(adev);
	}

	return r;
}

/**
 * amdgpu_device_has_job_running - check if there is any job in mirror list
 *
 * @adev: amdgpu device pointer
 *
 * Checks whether any job is still pending on a scheduler mirror list.
 */
bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
{
	int i;
	struct drm_sched_job *job;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		spin_lock(&ring->sched.job_list_lock);
		job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
					       struct drm_sched_job, node);
		spin_unlock(&ring->sched.job_list_lock);
		if (job)
			return true;
	}
	return false;
}
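
/*
 * amdgpu_gpu_recovery is the module parameter consulted below: 0 disables
 * GPU recovery entirely, -1 (the default "auto" setting) enables it only
 * for the ASICs listed in the switch statement, and any other value enables
 * it unconditionally.  SR-IOV VFs always attempt recovery.
 */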

/**
 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
 *
 * @adev: amdgpu device pointer
 *
 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
 * a hung GPU.
 */
bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
{
	if (!amdgpu_device_ip_check_soft_reset(adev)) {
		dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
		return false;
	}

	if (amdgpu_gpu_recovery == 0)
		goto disabled;

	if (amdgpu_sriov_vf(adev))
		return true;

	if (amdgpu_gpu_recovery == -1) {
		switch (adev->asic_type) {
		case CHIP_BONAIRE:
		case CHIP_HAWAII:
		case CHIP_TOPAZ:
		case CHIP_TONGA:
		case CHIP_FIJI:
		case CHIP_POLARIS10:
		case CHIP_POLARIS11:
		case CHIP_POLARIS12:
		case CHIP_VEGAM:
		case CHIP_VEGA20:
		case CHIP_VEGA10:
		case CHIP_VEGA12:
		case CHIP_RAVEN:
		case CHIP_ARCTURUS:
		case CHIP_RENOIR:
		case CHIP_NAVI10:
		case CHIP_NAVI14:
		case CHIP_NAVI12:
		case CHIP_SIENNA_CICHLID:
			break;
		default:
			goto disabled;
		}
	}

	return true;

disabled:
	dev_info(adev->dev, "GPU recovery disabled.\n");
	return false;
}

static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
					struct amdgpu_job *job,
					bool *need_full_reset_arg)
{
	int i, r = 0;
	bool need_full_reset = *need_full_reset_arg;

	amdgpu_debugfs_wait_dump(adev);

	/* block all schedulers and reset given job's ring */
	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
		amdgpu_fence_driver_force_completion(ring);
	}

	if (job)
		drm_sched_increase_karma(&job->base);

	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
	if (!amdgpu_sriov_vf(adev)) {

		if (!need_full_reset)
			need_full_reset = amdgpu_device_ip_need_full_reset(adev);

		if (!need_full_reset) {
			amdgpu_device_ip_pre_soft_reset(adev);
			r = amdgpu_device_ip_soft_reset(adev);
			amdgpu_device_ip_post_soft_reset(adev);
			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
				dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
				need_full_reset = true;
			}
		}

		if (need_full_reset)
			r = amdgpu_device_ip_suspend(adev);

		*need_full_reset_arg = need_full_reset;
	}

	return r;
}
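
/*
 * amdgpu_do_asic_reset() below performs the actual ASIC reset for a single
 * device or for a whole XGMI hive.  For a hive the per-device resets are
 * queued on system_unbound_wq and then flushed, so that all nodes reset
 * close together and the firmware can renegotiate the XGMI links within
 * its roughly one second window (see the comment inside the function).
 */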

static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
				struct list_head *device_list_handle,
				bool *need_full_reset_arg)
{
	struct amdgpu_device *tmp_adev = NULL;
	bool need_full_reset = *need_full_reset_arg, vram_lost = false;
	int r = 0;

	/*
	 * ASIC reset has to be done on all XGMI hive nodes ASAP
	 * to allow proper links negotiation in FW (within 1 sec)
	 */
	if (need_full_reset) {
		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
			/* For XGMI run all resets in parallel to speed up the process */
			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
					r = -EALREADY;
			} else
				r = amdgpu_asic_reset(tmp_adev);

			if (r) {
				dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
					r, adev_to_drm(tmp_adev)->unique);
				break;
			}
		}

		/* For XGMI wait for all resets to complete before proceeding */
		if (!r) {
			list_for_each_entry(tmp_adev, device_list_handle,
					    gmc.xgmi.head) {
				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
					flush_work(&tmp_adev->xgmi_reset_work);
					r = tmp_adev->asic_reset_res;
					if (r)
						break;
				}
			}
		}
	}

	if (!r && amdgpu_ras_intr_triggered()) {
		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
			if (tmp_adev->mmhub.funcs &&
			    tmp_adev->mmhub.funcs->reset_ras_error_count)
				tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
		}

		amdgpu_ras_intr_cleared();
	}

	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		if (need_full_reset) {
			/* post card */
			if (amdgpu_device_asic_init(tmp_adev))
				dev_warn(tmp_adev->dev, "asic atom init failed!");

			if (!r) {
				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
				r = amdgpu_device_ip_resume_phase1(tmp_adev);
				if (r)
					goto out;

				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
				if (vram_lost) {
					DRM_INFO("VRAM is lost due to GPU reset!\n");
					amdgpu_inc_vram_lost(tmp_adev);
				}

				r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
				if (r)
					goto out;

				r = amdgpu_device_fw_loading(tmp_adev);
				if (r)
					return r;

				r = amdgpu_device_ip_resume_phase2(tmp_adev);
				if (r)
					goto out;

				if (vram_lost)
					amdgpu_device_fill_reset_magic(tmp_adev);

				/*
				 * Add this ASIC as tracked, as the reset has
				 * already completed successfully.
				 */
				amdgpu_register_gpu_instance(tmp_adev);

				r = amdgpu_device_ip_late_init(tmp_adev);
				if (r)
					goto out;

				amdgpu_fbdev_set_suspend(tmp_adev, 0);

				/*
				 * The GPU enters a bad state once the number of
				 * faulty pages detected by ECC reaches the
				 * threshold, and RAS recovery is scheduled next.
				 * Check here and break the recovery if the bad
				 * page threshold has indeed been exceeded, so the
				 * user is reminded to either retire this GPU or
				 * set a bigger bad_page_threshold the next time
				 * the driver is probed.
				 */
				if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
					/* must succeed. */
					amdgpu_ras_resume(tmp_adev);
				} else {
					r = -EINVAL;
					goto out;
				}

				/* Update PSP FW topology after reset */
				if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
					r = amdgpu_xgmi_update_topology(hive, tmp_adev);
			}
		}

out:
		if (!r) {
			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
			r = amdgpu_ib_ring_tests(tmp_adev);
			if (r) {
				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
				r = amdgpu_device_ip_suspend(tmp_adev);
				need_full_reset = true;
				r = -EAGAIN;
				goto end;
			}
		}

		if (!r)
			r = amdgpu_device_recover_vram(tmp_adev);
		else
			tmp_adev->asic_reset_res = r;
	}

end:
	*need_full_reset_arg = need_full_reset;
	return r;
}
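
/*
 * amdgpu_device_lock_adev()/amdgpu_device_unlock_adev() bracket the reset of
 * a single device: they take adev->reset_sem (nested against the hive lock
 * for XGMI configurations), bump the reset counter and set mp1_state so the
 * SMU knows which kind of reset is in flight.  The cmpxchg on in_gpu_reset
 * makes the lock attempt fail fast if a reset is already running.
 */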

static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
				    struct amdgpu_hive_info *hive)
{
	if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
		return false;

	if (hive) {
		down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
	} else {
		down_write(&adev->reset_sem);
	}

	atomic_inc(&adev->gpu_reset_counter);
	switch (amdgpu_asic_reset_method(adev)) {
	case AMD_RESET_METHOD_MODE1:
		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
		break;
	case AMD_RESET_METHOD_MODE2:
		adev->mp1_state = PP_MP1_STATE_RESET;
		break;
	default:
		adev->mp1_state = PP_MP1_STATE_NONE;
		break;
	}

	return true;
}

static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
{
	amdgpu_vf_error_trans_all(adev);
	adev->mp1_state = PP_MP1_STATE_NONE;
	atomic_set(&adev->in_gpu_reset, 0);
	up_write(&adev->reset_sem);
}

static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
{
	struct pci_dev *p = NULL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
					adev->pdev->bus->number, 1);
	if (p) {
		pm_runtime_enable(&(p->dev));
		pm_runtime_resume(&(p->dev));
	}
}

static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
{
	enum amd_reset_method reset_method;
	struct pci_dev *p = NULL;
	u64 expires;

	/*
	 * For now, only BACO and mode1 reset are confirmed
	 * to suffer from the audio issue if the audio device
	 * is not properly suspended.
	 */
	reset_method = amdgpu_asic_reset_method(adev);
	if ((reset_method != AMD_RESET_METHOD_BACO) &&
	    (reset_method != AMD_RESET_METHOD_MODE1))
		return -EINVAL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
					adev->pdev->bus->number, 1);
	if (!p)
		return -ENODEV;

	expires = pm_runtime_autosuspend_expiration(&(p->dev));
	if (!expires)
		/*
		 * If we cannot get the audio device autosuspend delay,
		 * a fixed 4s interval will be used.  Since 3s is the
		 * audio controller's default autosuspend delay setting,
		 * the 4s used here is guaranteed to cover it.
		 */
		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;

	while (!pm_runtime_status_suspended(&(p->dev))) {
		if (!pm_runtime_suspend(&(p->dev)))
			break;

		if (expires < ktime_get_mono_fast_ns()) {
			dev_warn(adev->dev, "failed to suspend display audio\n");
			/* TODO: abort the succeeding gpu reset? */
			return -ETIMEDOUT;
		}
	}

	pm_runtime_disable(&(p->dev));

	return 0;
}

/**
 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
 *
 * @adev: amdgpu device pointer
 * @job: which job triggered the hang
 *
 * Attempt to reset the GPU if it has hung (all asics).
 * Attempt to do a soft reset or a full reset and reinitialize the ASIC.
 * Returns 0 for success or an error on failure.
 */

int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
			      struct amdgpu_job *job)
{
	struct list_head device_list, *device_list_handle = NULL;
	bool need_full_reset = false;
	bool job_signaled = false;
	struct amdgpu_hive_info *hive = NULL;
	struct amdgpu_device *tmp_adev = NULL;
	int i, r = 0;
	bool need_emergency_restart = false;
	bool audio_suspended = false;

	/*
	 * Special case: RAS triggered and full reset isn't supported
	 */
	need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);

	/*
	 * Flush RAM to disk so that after reboot
	 * the user can read the log and see why the system rebooted.
	 */
	if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
		DRM_WARN("Emergency reboot.");

		ksys_sync_helper();
		emergency_restart();
	}

	dev_info(adev->dev, "GPU %s begin!\n",
		need_emergency_restart ? "jobs stop":"reset");

	/*
	 * Here we trylock to avoid a chain of resets executing, from either
	 * jobs triggering on different adevs in the XGMI hive or jobs on
	 * different schedulers for the same device, while this TO handler
	 * is running.  We always reset all schedulers for a device and all
	 * devices in the XGMI hive, so that should take care of them too.
	 */
	hive = amdgpu_get_xgmi_hive(adev);
	if (hive) {
		if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
			DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
				job ? job->base.id : -1, hive->hive_id);
			amdgpu_put_xgmi_hive(hive);
			return 0;
		}
		mutex_lock(&hive->hive_lock);
	}

	/*
	 * Build the list of devices to reset.
	 * In case we are in XGMI hive mode, resort the device list
	 * to put adev in the 1st position.
	 */
	INIT_LIST_HEAD(&device_list);
	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (!hive)
			return -ENODEV;
		if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
			list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
		device_list_handle = &hive->device_list;
	} else {
		list_add_tail(&adev->gmc.xgmi.head, &device_list);
		device_list_handle = &device_list;
	}

	/* block all schedulers and reset given job's ring */
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
			dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
				  job ? job->base.id : -1);
			r = 0;
			goto skip_recovery;
		}

		/*
		 * Try to put the audio codec into the suspend state
		 * before the gpu reset starts.
		 *
		 * Because the power domain of the graphics device is
		 * shared with the AZ power domain, we could otherwise
		 * change the audio hardware from behind the audio
		 * driver's back and trigger audio codec errors.
		 */
		if (!amdgpu_device_suspend_display_audio(tmp_adev))
			audio_suspended = true;

		amdgpu_ras_set_error_query_ready(tmp_adev, false);

		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);

		if (!amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_pre_reset(tmp_adev);

		/*
		 * Mark these ASICs to be reset as untracked first,
		 * and add them back after the reset completes.
		 */
		amdgpu_unregister_gpu_instance(tmp_adev);

		amdgpu_fbdev_set_suspend(tmp_adev, 1);

		/* disable ras on ALL IPs */
		if (!need_emergency_restart &&
		      amdgpu_device_ip_need_full_reset(tmp_adev))
			amdgpu_ras_suspend(tmp_adev);

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, job ? &job->base : NULL);

			if (need_emergency_restart)
				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
		}
	}

	if (need_emergency_restart)
		goto skip_sched_resume;

	/*
	 * Must check the guilty signal here since after this point all old
	 * HW fences are force signaled.
	 *
	 * job->base holds a reference to the parent fence
	 */
	if (job && job->base.s_fence->parent &&
	    dma_fence_is_signaled(job->base.s_fence->parent)) {
		job_signaled = true;
		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
		goto skip_hw_reset;
	}
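
	/*
	 * amdgpu_do_asic_reset() returns -EAGAIN when the post-reset IB tests
	 * fail; in that case need_full_reset has been forced on and the
	 * pre-reset step below is rerun before attempting the ASIC reset
	 * again.
	 */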

retry:	/* Rest of adevs pre asic reset from XGMI hive. */
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		r = amdgpu_device_pre_asic_reset(tmp_adev,
						 NULL,
						 &need_full_reset);
		/* TODO: should we stop? */
		if (r) {
			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
				  r, adev_to_drm(tmp_adev)->unique);
			tmp_adev->asic_reset_res = r;
		}
	}

	/* Actual ASIC resets if needed. */
	/* TODO: implement XGMI hive reset logic for SRIOV */
	if (amdgpu_sriov_vf(adev)) {
		r = amdgpu_device_reset_sriov(adev, job ? false : true);
		if (r)
			adev->asic_reset_res = r;
	} else {
		r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset);
		if (r && r == -EAGAIN)
			goto retry;
	}

skip_hw_reset:

	/* Post ASIC reset for all devs. */
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			/* No point in resubmitting jobs if we didn't HW reset */
			if (!tmp_adev->asic_reset_res && !job_signaled)
				drm_sched_resubmit_jobs(&ring->sched);

			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
		}

		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
		}

		tmp_adev->asic_reset_res = 0;

		if (r) {
			/* bad news, how to tell it to userspace ? */
			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
		}
	}

skip_sched_resume:
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		/* unlock kfd: SRIOV would do it separately */
		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);
		if (audio_suspended)
			amdgpu_device_resume_display_audio(tmp_adev);
		amdgpu_device_unlock_adev(tmp_adev);
	}

skip_recovery:
	if (hive) {
		atomic_set(&hive->in_reset, 0);
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}

	if (r)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
	return r;
}
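
/*
 * Note: amdgpu_device_gpu_recover() is not normally called directly by IP
 * code.  In this version of the driver the usual entry point is the
 * scheduler's job timeout handler (amdgpu_job_timedout() in amdgpu_job.c),
 * which passes the offending job so its karma can be increased, roughly:
 *
 *	amdgpu_device_gpu_recover(ring->adev, job);
 *
 * RAS error recovery can also trigger it, in which case job is NULL.
 */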

/**
 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIe capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIe config space may not be available.
 */
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
{
	struct pci_dev *pdev;
	enum pci_bus_speed speed_cap, platform_speed_cap;
	enum pcie_link_width platform_link_width;

	if (amdgpu_pcie_gen_cap)
		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;

	if (amdgpu_pcie_lane_cap)
		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;

	/* covers APUs as well */
	if (pci_is_root_bus(adev->pdev->bus)) {
		if (adev->pm.pcie_gen_mask == 0)
			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
		if (adev->pm.pcie_mlw_mask == 0)
			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
		return;
	}

	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
		return;

	pcie_bandwidth_available(adev->pdev, NULL,
				 &platform_speed_cap, &platform_link_width);

	if (adev->pm.pcie_gen_mask == 0) {
		/* asic caps */
		pdev = adev->pdev;
		speed_cap = pcie_get_speed_cap(pdev);
		if (speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
		} else {
			if (speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
		}
		/* platform caps */
		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
		} else {
			if (platform_speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;

		}
	}
	if (adev->pm.pcie_mlw_mask == 0) {
		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
		} else {
			switch (platform_link_width) {
			case PCIE_LNK_X32:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X16:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X12:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X8:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X4:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X2:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X1:
				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
				break;
			default:
				break;
			}
		}
	}
}

int amdgpu_device_baco_enter(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
		return -ENOTSUPP;

	if (ras && ras->supported)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

	return amdgpu_dpm_baco_enter(adev);
}

int amdgpu_device_baco_exit(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
		return -ENOTSUPP;

	ret = amdgpu_dpm_baco_exit(adev);
	if (ret)
		return ret;

	if (ras && ras->supported)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

	return 0;
}