1 /*
2 * Copyright 2008 Advanced Micro Devices, Inc.
3 * Copyright 2008 Red Hat Inc.
4 * Copyright 2009 Jerome Glisse.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors: Dave Airlie
25 * Alex Deucher
26 * Jerome Glisse
27 */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33 #include <linux/iommu.h>
34 #include <linux/pci.h>
35 #include <linux/devcoredump.h>
36 #include <generated/utsrelease.h>
37 #include <linux/pci-p2pdma.h>
38 #include <linux/apple-gmux.h>
39
40 #include <drm/drm_aperture.h>
41 #include <drm/drm_atomic_helper.h>
42 #include <drm/drm_crtc_helper.h>
43 #include <drm/drm_fb_helper.h>
44 #include <drm/drm_probe_helper.h>
45 #include <drm/amdgpu_drm.h>
46 #include <linux/device.h>
47 #include <linux/vgaarb.h>
48 #include <linux/vga_switcheroo.h>
49 #include <linux/efi.h>
50 #include "amdgpu.h"
51 #include "amdgpu_trace.h"
52 #include "amdgpu_i2c.h"
53 #include "atom.h"
54 #include "amdgpu_atombios.h"
55 #include "amdgpu_atomfirmware.h"
56 #include "amd_pcie.h"
57 #ifdef CONFIG_DRM_AMDGPU_SI
58 #include "si.h"
59 #endif
60 #ifdef CONFIG_DRM_AMDGPU_CIK
61 #include "cik.h"
62 #endif
63 #include "vi.h"
64 #include "soc15.h"
65 #include "nv.h"
66 #include "bif/bif_4_1_d.h"
67 #include <linux/firmware.h>
68 #include "amdgpu_vf_error.h"
69
70 #include "amdgpu_amdkfd.h"
71 #include "amdgpu_pm.h"
72
73 #include "amdgpu_xgmi.h"
74 #include "amdgpu_ras.h"
75 #include "amdgpu_pmu.h"
76 #include "amdgpu_fru_eeprom.h"
77 #include "amdgpu_reset.h"
78
79 #include <linux/suspend.h>
80 #include <drm/task_barrier.h>
81 #include <linux/pm_runtime.h>
82
83 #include <drm/drm_drv.h>
84
85 #if IS_ENABLED(CONFIG_X86)
86 #include <asm/intel-family.h>
87 #endif
88
89 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
90 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
91 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
92 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
93 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
94 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
95 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
96
97 #define AMDGPU_RESUME_MS 2000
98 #define AMDGPU_MAX_RETRY_LIMIT 2
99 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
100
101 static const struct drm_driver amdgpu_kms_driver;
102
103 const char *amdgpu_asic_name[] = {
104 "TAHITI",
105 "PITCAIRN",
106 "VERDE",
107 "OLAND",
108 "HAINAN",
109 "BONAIRE",
110 "KAVERI",
111 "KABINI",
112 "HAWAII",
113 "MULLINS",
114 "TOPAZ",
115 "TONGA",
116 "FIJI",
117 "CARRIZO",
118 "STONEY",
119 "POLARIS10",
120 "POLARIS11",
121 "POLARIS12",
122 "VEGAM",
123 "VEGA10",
124 "VEGA12",
125 "VEGA20",
126 "RAVEN",
127 "ARCTURUS",
128 "RENOIR",
129 "ALDEBARAN",
130 "NAVI10",
131 "CYAN_SKILLFISH",
132 "NAVI14",
133 "NAVI12",
134 "SIENNA_CICHLID",
135 "NAVY_FLOUNDER",
136 "VANGOGH",
137 "DIMGREY_CAVEFISH",
138 "BEIGE_GOBY",
139 "YELLOW_CARP",
140 "IP DISCOVERY",
141 "LAST",
142 };
143
144 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
145 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
146 void *data);
147
148 /**
149 * DOC: pcie_replay_count
150 *
151 * The amdgpu driver provides a sysfs API for reporting the total number
152 * of PCIe replays (NAKs).
153 * The file pcie_replay_count is used for this and returns the total
154 * number of replays as a sum of the NAKs generated and NAKs received.
155 */
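/*
 * Userspace reads this attribute through sysfs, for example (the card
 * index below is illustrative and depends on the system):
 *
 *	cat /sys/class/drm/cardN/device/pcie_replay_count
 */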
156
157 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
158 struct device_attribute *attr, char *buf)
159 {
160 struct drm_device *ddev = dev_get_drvdata(dev);
161 struct amdgpu_device *adev = drm_to_adev(ddev);
162 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
163
164 return sysfs_emit(buf, "%llu\n", cnt);
165 }
166
167 static DEVICE_ATTR(pcie_replay_count, 0444,
168 amdgpu_device_get_pcie_replay_count, NULL);
169
170 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
171
172
173 /**
174 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
175 *
176 * @dev: drm_device pointer
177 *
178 * Returns true if the device is a dGPU with ATPX power control,
179 * otherwise return false.
180 */
181 bool amdgpu_device_supports_px(struct drm_device *dev)
182 {
183 struct amdgpu_device *adev = drm_to_adev(dev);
184
185 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
186 return true;
187 return false;
188 }
189
190 /**
191 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
192 *
193 * @dev: drm_device pointer
194 *
195 * Returns true if the device is a dGPU with ACPI power control,
196 * otherwise return false.
197 */
198 bool amdgpu_device_supports_boco(struct drm_device *dev)
199 {
200 struct amdgpu_device *adev = drm_to_adev(dev);
201
202 if (adev->has_pr3 ||
203 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
204 return true;
205 return false;
206 }
207
208 /**
209 * amdgpu_device_supports_baco - Does the device support BACO
210 *
211 * @dev: drm_device pointer
212 *
213 * Returns true if the device supports BACO,
214 * otherwise return false.
215 */
216 bool amdgpu_device_supports_baco(struct drm_device *dev)
217 {
218 struct amdgpu_device *adev = drm_to_adev(dev);
219
220 return amdgpu_asic_supports_baco(adev);
221 }
222
223 /**
224 * amdgpu_device_supports_smart_shift - Is the device dGPU with
225 * smart shift support
226 *
227 * @dev: drm_device pointer
228 *
229 * Returns true if the device is a dGPU with Smart Shift support,
230 * otherwise returns false.
231 */
232 bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
233 {
234 return (amdgpu_device_supports_boco(dev) &&
235 amdgpu_acpi_is_power_shift_control_supported());
236 }
237
238 /*
239 * VRAM access helper functions
240 */
241
242 /**
243 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
244 *
245 * @adev: amdgpu_device pointer
246 * @pos: offset of the buffer in vram
247 * @buf: virtual address of the buffer in system memory
248 * @size: read/write size; the buffer at @buf must be at least @size bytes
249 * @write: true - write to vram, otherwise - read from vram
250 */
251 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
252 void *buf, size_t size, bool write)
253 {
254 unsigned long flags;
255 uint32_t hi = ~0, tmp = 0;
256 uint32_t *data = buf;
257 uint64_t last;
258 int idx;
259
260 if (!drm_dev_enter(adev_to_drm(adev), &idx))
261 return;
262
263 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
264
265 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
266 for (last = pos + size; pos < last; pos += 4) {
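/* MM_INDEX carries address bits [30:0] (bit 31 below selects the MM
 * aperture); the remaining high bits of the position are programmed
 * through MM_INDEX_HI.
 */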
267 tmp = pos >> 31;
268
269 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
270 if (tmp != hi) {
271 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
272 hi = tmp;
273 }
274 if (write)
275 WREG32_NO_KIQ(mmMM_DATA, *data++);
276 else
277 *data++ = RREG32_NO_KIQ(mmMM_DATA);
278 }
279
280 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
281 drm_dev_exit(idx);
282 }
283
284 /**
285 * amdgpu_device_aper_access - access vram by the vram aperture
286 *
287 * @adev: amdgpu_device pointer
288 * @pos: offset of the buffer in vram
289 * @buf: virtual address of the buffer in system memory
290 * @size: read/write size; the buffer at @buf must be at least @size bytes
291 * @write: true - write to vram, otherwise - read from vram
292 *
293 * The return value means how many bytes have been transferred.
294 */
295 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
296 void *buf, size_t size, bool write)
297 {
298 #ifdef CONFIG_64BIT
299 void __iomem *addr;
300 size_t count = 0;
301 uint64_t last;
302
303 if (!adev->mman.aper_base_kaddr)
304 return 0;
305
306 last = min(pos + size, adev->gmc.visible_vram_size);
307 if (last > pos) {
308 addr = adev->mman.aper_base_kaddr + pos;
309 count = last - pos;
310
311 if (write) {
312 memcpy_toio(addr, buf, count);
313 /* Make sure HDP write cache flush happens without any reordering
314 * after the system memory contents are sent over PCIe device
315 */
316 mb();
317 amdgpu_device_flush_hdp(adev, NULL);
318 } else {
319 amdgpu_device_invalidate_hdp(adev, NULL);
320 /* Make sure HDP read cache is invalidated before issuing a read
321 * to the PCIe device
322 */
323 mb();
324 memcpy_fromio(buf, addr, count);
325 }
326
327 }
328
329 return count;
330 #else
331 return 0;
332 #endif
333 }
334
335 /**
336 * amdgpu_device_vram_access - read/write a buffer in vram
337 *
338 * @adev: amdgpu_device pointer
339 * @pos: offset of the buffer in vram
340 * @buf: virtual address of the buffer in system memory
341 * @size: read/write size; the buffer at @buf must be at least @size bytes
342 * @write: true - write to vram, otherwise - read from vram
343 */
344 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
345 void *buf, size_t size, bool write)
346 {
347 size_t count;
348
349 /* try using the vram aperture to access vram first */
350 count = amdgpu_device_aper_access(adev, pos, buf, size, write);
351 size -= count;
352 if (size) {
353 /* use MM to access the rest of vram */
354 pos += count;
355 buf += count;
356 amdgpu_device_mm_access(adev, pos, buf, size, write);
357 }
358 }
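/*
 * Minimal usage sketch (illustrative only, the local names are made up):
 *
 *	u32 val;
 *
 *	amdgpu_device_vram_access(adev, offset, &val, sizeof(val), false);
 *
 * Note that @pos and @size must be dword aligned, since the
 * MM_INDEX/MM_DATA fallback in amdgpu_device_mm_access() enforces
 * 4-byte alignment.
 */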
359
360 /*
361 * register access helper functions.
362 */
363
364 /* Check if hw access should be skipped because of hotplug or device error */
365 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
366 {
367 if (adev->no_hw_access)
368 return true;
369
370 #ifdef CONFIG_LOCKDEP
371 /*
372 * This is a bit complicated to understand, so worth a comment. What we assert
373 * here is that the GPU reset is not running on another thread in parallel.
374 *
375 * For this we trylock the read side of the reset semaphore, if that succeeds
376 * we know that the reset is not running in parallel.
377 *
378 * If the trylock fails we assert that we are either already holding the read
379 * side of the lock or are the reset thread itself and hold the write side of
380 * the lock.
381 */
382 if (in_task()) {
383 if (down_read_trylock(&adev->reset_domain->sem))
384 up_read(&adev->reset_domain->sem);
385 else
386 lockdep_assert_held(&adev->reset_domain->sem);
387 }
388 #endif
389 return false;
390 }
391
392 /**
393 * amdgpu_device_rreg - read a memory mapped IO or indirect register
394 *
395 * @adev: amdgpu_device pointer
396 * @reg: dword aligned register offset
397 * @acc_flags: access flags which require special behavior
398 *
399 * Returns the 32 bit value from the offset specified.
400 */
401 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
402 uint32_t reg, uint32_t acc_flags)
403 {
404 uint32_t ret;
405
406 if (amdgpu_device_skip_hw_access(adev))
407 return 0;
408
409 if ((reg * 4) < adev->rmmio_size) {
410 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
411 amdgpu_sriov_runtime(adev) &&
412 down_read_trylock(&adev->reset_domain->sem)) {
413 ret = amdgpu_kiq_rreg(adev, reg);
414 up_read(&adev->reset_domain->sem);
415 } else {
416 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
417 }
418 } else {
419 ret = adev->pcie_rreg(adev, reg * 4);
420 }
421
422 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
423
424 return ret;
425 }
426
427 /*
428 * MMIO register read with bytes helper functions
429 * @offset: byte offset from MMIO start
430 */
431
432 /**
433 * amdgpu_mm_rreg8 - read a memory mapped IO register
434 *
435 * @adev: amdgpu_device pointer
436 * @offset: byte aligned register offset
437 *
438 * Returns the 8 bit value from the offset specified.
439 */
440 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
441 {
442 if (amdgpu_device_skip_hw_access(adev))
443 return 0;
444
445 if (offset < adev->rmmio_size)
446 return (readb(adev->rmmio + offset));
447 BUG();
448 }
449
450 /*
451 * MMIO register write with bytes helper functions
452 * @offset: byte offset from MMIO start
453 * @value: the value to be written to the register
454 */
455
456 /**
457 * amdgpu_mm_wreg8 - write a memory mapped IO register
458 *
459 * @adev: amdgpu_device pointer
460 * @offset: byte aligned register offset
461 * @value: 8 bit value to write
462 *
463 * Writes the value specified to the offset specified.
464 */
465 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
466 {
467 if (amdgpu_device_skip_hw_access(adev))
468 return;
469
470 if (offset < adev->rmmio_size)
471 writeb(value, adev->rmmio + offset);
472 else
473 BUG();
474 }
475
476 /**
477 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
478 *
479 * @adev: amdgpu_device pointer
480 * @reg: dword aligned register offset
481 * @v: 32 bit value to write to the register
482 * @acc_flags: access flags which require special behavior
483 *
484 * Writes the value specified to the offset specified.
485 */
486 void amdgpu_device_wreg(struct amdgpu_device *adev,
487 uint32_t reg, uint32_t v,
488 uint32_t acc_flags)
489 {
490 if (amdgpu_device_skip_hw_access(adev))
491 return;
492
493 if ((reg * 4) < adev->rmmio_size) {
494 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
495 amdgpu_sriov_runtime(adev) &&
496 down_read_trylock(&adev->reset_domain->sem)) {
497 amdgpu_kiq_wreg(adev, reg, v);
498 up_read(&adev->reset_domain->sem);
499 } else {
500 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
501 }
502 } else {
503 adev->pcie_wreg(adev, reg * 4, v);
504 }
505
506 trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
507 }
508
509 /**
510 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
511 *
512 * @adev: amdgpu_device pointer
513 * @reg: mmio/rlc register
514 * @v: value to write
* @xcc_id: xcc accelerated compute core id
515 *
516 * this function is invoked only for the debugfs register access
517 */
518 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
519 uint32_t reg, uint32_t v,
520 uint32_t xcc_id)
521 {
522 if (amdgpu_device_skip_hw_access(adev))
523 return;
524
525 if (amdgpu_sriov_fullaccess(adev) &&
526 adev->gfx.rlc.funcs &&
527 adev->gfx.rlc.funcs->is_rlcg_access_range) {
528 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
529 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
530 } else if ((reg * 4) >= adev->rmmio_size) {
531 adev->pcie_wreg(adev, reg * 4, v);
532 } else {
533 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
534 }
535 }
536
537 /**
538 * amdgpu_device_indirect_rreg - read an indirect register
539 *
540 * @adev: amdgpu_device pointer
541 * @reg_addr: indirect register address to read from
542 *
543 * Returns the value of indirect register @reg_addr
544 */
545 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
546 u32 reg_addr)
547 {
548 unsigned long flags, pcie_index, pcie_data;
549 void __iomem *pcie_index_offset;
550 void __iomem *pcie_data_offset;
551 u32 r;
552
553 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
554 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
555
556 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
557 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
558 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
559
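/* write the index, then read it back so the write is posted before
 * the data register is accessed
 */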
560 writel(reg_addr, pcie_index_offset);
561 readl(pcie_index_offset);
562 r = readl(pcie_data_offset);
563 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
564
565 return r;
566 }
567
568 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
569 u64 reg_addr)
570 {
571 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
572 u32 r;
573 void __iomem *pcie_index_offset;
574 void __iomem *pcie_index_hi_offset;
575 void __iomem *pcie_data_offset;
576
577 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
578 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
579 if (adev->nbio.funcs->get_pcie_index_hi_offset)
580 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
581 else
582 pcie_index_hi = 0;
583
584 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
585 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
586 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
587 if (pcie_index_hi != 0)
588 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
589 pcie_index_hi * 4;
590
591 writel(reg_addr, pcie_index_offset);
592 readl(pcie_index_offset);
593 if (pcie_index_hi != 0) {
594 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
595 readl(pcie_index_hi_offset);
596 }
597 r = readl(pcie_data_offset);
598
599 /* clear the high bits */
600 if (pcie_index_hi != 0) {
601 writel(0, pcie_index_hi_offset);
602 readl(pcie_index_hi_offset);
603 }
604
605 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
606
607 return r;
608 }
609
610 /**
611 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
612 *
613 * @adev: amdgpu_device pointer
614 * @reg_addr: indirect register address to read from
615 *
616 * Returns the value of indirect register @reg_addr
617 */
618 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
619 u32 reg_addr)
620 {
621 unsigned long flags, pcie_index, pcie_data;
622 void __iomem *pcie_index_offset;
623 void __iomem *pcie_data_offset;
624 u64 r;
625
626 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
627 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
628
629 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
630 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
631 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
632
633 /* read low 32 bits */
634 writel(reg_addr, pcie_index_offset);
635 readl(pcie_index_offset);
636 r = readl(pcie_data_offset);
637 /* read high 32 bits */
638 writel(reg_addr + 4, pcie_index_offset);
639 readl(pcie_index_offset);
640 r |= ((u64)readl(pcie_data_offset) << 32);
641 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
642
643 return r;
644 }
645
646 /**
647 * amdgpu_device_indirect_wreg - write an indirect register address
648 *
649 * @adev: amdgpu_device pointer
650 * @reg_addr: indirect register offset
651 * @reg_data: indirect register data
652 *
653 */
654 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
655 u32 reg_addr, u32 reg_data)
656 {
657 unsigned long flags, pcie_index, pcie_data;
658 void __iomem *pcie_index_offset;
659 void __iomem *pcie_data_offset;
660
661 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
662 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
663
664 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
665 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
666 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
667
668 writel(reg_addr, pcie_index_offset);
669 readl(pcie_index_offset);
670 writel(reg_data, pcie_data_offset);
671 readl(pcie_data_offset);
672 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
673 }
674
675 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
676 u64 reg_addr, u32 reg_data)
677 {
678 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
679 void __iomem *pcie_index_offset;
680 void __iomem *pcie_index_hi_offset;
681 void __iomem *pcie_data_offset;
682
683 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
684 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
685 if (adev->nbio.funcs->get_pcie_index_hi_offset)
686 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
687 else
688 pcie_index_hi = 0;
689
690 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
691 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
692 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
693 if (pcie_index_hi != 0)
694 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
695 pcie_index_hi * 4;
696
697 writel(reg_addr, pcie_index_offset);
698 readl(pcie_index_offset);
699 if (pcie_index_hi != 0) {
700 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
701 readl(pcie_index_hi_offset);
702 }
703 writel(reg_data, pcie_data_offset);
704 readl(pcie_data_offset);
705
706 /* clear the high bits */
707 if (pcie_index_hi != 0) {
708 writel(0, pcie_index_hi_offset);
709 readl(pcie_index_hi_offset);
710 }
711
712 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
713 }
714
715 /**
716 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
717 *
718 * @adev: amdgpu_device pointer
719 * @reg_addr: indirect register offset
720 * @reg_data: indirect register data
721 *
722 */
723 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
724 u32 reg_addr, u64 reg_data)
725 {
726 unsigned long flags, pcie_index, pcie_data;
727 void __iomem *pcie_index_offset;
728 void __iomem *pcie_data_offset;
729
730 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
731 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
732
733 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
734 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
735 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
736
737 /* write low 32 bits */
738 writel(reg_addr, pcie_index_offset);
739 readl(pcie_index_offset);
740 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
741 readl(pcie_data_offset);
742 /* write high 32 bits */
743 writel(reg_addr + 4, pcie_index_offset);
744 readl(pcie_index_offset);
745 writel((u32)(reg_data >> 32), pcie_data_offset);
746 readl(pcie_data_offset);
747 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
748 }
749
750 /**
751 * amdgpu_device_get_rev_id - query device rev_id
752 *
753 * @adev: amdgpu_device pointer
754 *
755 * Return device rev_id
756 */
757 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
758 {
759 return adev->nbio.funcs->get_rev_id(adev);
760 }
761
762 /**
763 * amdgpu_invalid_rreg - dummy reg read function
764 *
765 * @adev: amdgpu_device pointer
766 * @reg: offset of register
767 *
768 * Dummy register read function. Used for register blocks
769 * that certain asics don't have (all asics).
770 * Returns the value in the register.
771 */
772 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
773 {
774 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
775 BUG();
776 return 0;
777 }
778
779 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
780 {
781 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
782 BUG();
783 return 0;
784 }
785
786 /**
787 * amdgpu_invalid_wreg - dummy reg write function
788 *
789 * @adev: amdgpu_device pointer
790 * @reg: offset of register
791 * @v: value to write to the register
792 *
793 * Dummy register write function. Used for register blocks
794 * that certain asics don't have (all asics).
795 */
796 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
797 {
798 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
799 reg, v);
800 BUG();
801 }
802
803 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
804 {
805 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
806 reg, v);
807 BUG();
808 }
809
810 /**
811 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
812 *
813 * @adev: amdgpu_device pointer
814 * @reg: offset of register
815 *
816 * Dummy register read function. Used for register blocks
817 * that certain asics don't have (all asics).
818 * Returns the value in the register.
819 */
820 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
821 {
822 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
823 BUG();
824 return 0;
825 }
826
827 /**
828 * amdgpu_invalid_wreg64 - dummy reg write function
829 *
830 * @adev: amdgpu_device pointer
831 * @reg: offset of register
832 * @v: value to write to the register
833 *
834 * Dummy register write function. Used for register blocks
835 * that certain asics don't have (all asics).
836 */
837 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
838 {
839 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
840 reg, v);
841 BUG();
842 }
843
844 /**
845 * amdgpu_block_invalid_rreg - dummy reg read function
846 *
847 * @adev: amdgpu_device pointer
848 * @block: offset of instance
849 * @reg: offset of register
850 *
851 * Dummy register read function. Used for register blocks
852 * that certain asics don't have (all asics).
853 * Returns the value in the register.
854 */
855 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
856 uint32_t block, uint32_t reg)
857 {
858 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
859 reg, block);
860 BUG();
861 return 0;
862 }
863
864 /**
865 * amdgpu_block_invalid_wreg - dummy reg write function
866 *
867 * @adev: amdgpu_device pointer
868 * @block: offset of instance
869 * @reg: offset of register
870 * @v: value to write to the register
871 *
872 * Dummy register write function. Used for register blocks
873 * that certain asics don't have (all asics).
874 */
875 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
876 uint32_t block,
877 uint32_t reg, uint32_t v)
878 {
879 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
880 reg, block, v);
881 BUG();
882 }
883
884 /**
885 * amdgpu_device_asic_init - Wrapper for atom asic_init
886 *
887 * @adev: amdgpu_device pointer
888 *
889 * Does any asic specific work and then calls atom asic init.
890 */
891 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
892 {
893 int ret;
894
895 amdgpu_asic_pre_asic_init(adev);
896
897 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) ||
898 adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) {
899 amdgpu_psp_wait_for_bootloader(adev);
900 ret = amdgpu_atomfirmware_asic_init(adev, true);
901 return ret;
902 } else {
903 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
904 }
905
906 return 0;
907 }
908
909 /**
910 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
911 *
912 * @adev: amdgpu_device pointer
913 *
914 * Allocates a scratch page of VRAM for use by various things in the
915 * driver.
916 */
917 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
918 {
919 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
920 AMDGPU_GEM_DOMAIN_VRAM |
921 AMDGPU_GEM_DOMAIN_GTT,
922 &adev->mem_scratch.robj,
923 &adev->mem_scratch.gpu_addr,
924 (void **)&adev->mem_scratch.ptr);
925 }
926
927 /**
928 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
929 *
930 * @adev: amdgpu_device pointer
931 *
932 * Frees the VRAM scratch page.
933 */
934 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
935 {
936 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
937 }
938
939 /**
940 * amdgpu_device_program_register_sequence - program an array of registers.
941 *
942 * @adev: amdgpu_device pointer
943 * @registers: pointer to the register array
944 * @array_size: size of the register array
945 *
946 * Programs an array of registers with AND/OR masks.
947 * This is a helper for setting golden registers.
948 */
949 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
950 const u32 *registers,
951 const u32 array_size)
952 {
953 u32 tmp, reg, and_mask, or_mask;
954 int i;
955
956 if (array_size % 3)
957 return;
958
959 for (i = 0; i < array_size; i += 3) {
960 reg = registers[i + 0];
961 and_mask = registers[i + 1];
962 or_mask = registers[i + 2];
963
964 if (and_mask == 0xffffffff) {
965 tmp = or_mask;
966 } else {
967 tmp = RREG32(reg);
968 tmp &= ~and_mask;
969 if (adev->family >= AMDGPU_FAMILY_AI)
970 tmp |= (or_mask & and_mask);
971 else
972 tmp |= or_mask;
973 }
974 WREG32(reg, tmp);
975 }
976 }
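/*
 * The @registers array is consumed as {offset, and_mask, or_mask}
 * triplets. A sketch of a caller (register name and values are purely
 * illustrative):
 *
 *	static const u32 golden_settings[] = {
 *		mmSOME_REG, 0xffffffff, 0x00000001,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, golden_settings,
 *						ARRAY_SIZE(golden_settings));
 */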
977
978 /**
979 * amdgpu_device_pci_config_reset - reset the GPU
980 *
981 * @adev: amdgpu_device pointer
982 *
983 * Resets the GPU using the pci config reset sequence.
984 * Only applicable to asics prior to vega10.
985 */
986 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
987 {
988 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
989 }
990
991 /**
992 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
993 *
994 * @adev: amdgpu_device pointer
995 *
996 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
997 */
998 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
999 {
1000 return pci_reset_function(adev->pdev);
1001 }
1002
1003 /*
1004 * amdgpu_device_wb_*()
1005 * Writeback is the method by which the GPU updates special pages in memory
1006 * with the status of certain GPU events (fences, ring pointers,etc.).
1007 */
1008
1009 /**
1010 * amdgpu_device_wb_fini - Disable Writeback and free memory
1011 *
1012 * @adev: amdgpu_device pointer
1013 *
1014 * Disables Writeback and frees the Writeback memory (all asics).
1015 * Used at driver shutdown.
1016 */
1017 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1018 {
1019 if (adev->wb.wb_obj) {
1020 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1021 &adev->wb.gpu_addr,
1022 (void **)&adev->wb.wb);
1023 adev->wb.wb_obj = NULL;
1024 }
1025 }
1026
1027 /**
1028 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1029 *
1030 * @adev: amdgpu_device pointer
1031 *
1032 * Initializes writeback and allocates writeback memory (all asics).
1033 * Used at driver startup.
1034 * Returns 0 on success or a negative error code on failure.
1035 */
1036 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1037 {
1038 int r;
1039
1040 if (adev->wb.wb_obj == NULL) {
1041 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1042 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1043 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1044 &adev->wb.wb_obj, &adev->wb.gpu_addr,
1045 (void **)&adev->wb.wb);
1046 if (r) {
1047 dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1048 return r;
1049 }
1050
1051 adev->wb.num_wb = AMDGPU_MAX_WB;
1052 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1053
1054 /* clear wb memory */
1055 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1056 }
1057
1058 return 0;
1059 }
1060
1061 /**
1062 * amdgpu_device_wb_get - Allocate a wb entry
1063 *
1064 * @adev: amdgpu_device pointer
1065 * @wb: wb index
1066 *
1067 * Allocate a wb slot for use by the driver (all asics).
1068 * Returns 0 on success or -EINVAL on failure.
1069 */
1070 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1071 {
1072 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1073
1074 if (offset < adev->wb.num_wb) {
1075 __set_bit(offset, adev->wb.used);
1076 *wb = offset << 3; /* convert to dw offset */
1077 return 0;
1078 } else {
1079 return -EINVAL;
1080 }
1081 }
1082
1083 /**
1084 * amdgpu_device_wb_free - Free a wb entry
1085 *
1086 * @adev: amdgpu_device pointer
1087 * @wb: wb index
1088 *
1089 * Free a wb slot allocated for use by the driver (all asics)
1090 */
1091 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1092 {
1093 wb >>= 3;
1094 if (wb < adev->wb.num_wb)
1095 __clear_bit(wb, adev->wb.used);
1096 }
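/*
 * Typical get/use/free pattern (illustrative only). The returned value
 * is a dword offset into the writeback page:
 *
 *	u32 wb;
 *
 *	if (!amdgpu_device_wb_get(adev, &wb)) {
 *		adev->wb.wb[wb] = 0;
 *		... let the GPU write status into this slot ...
 *		amdgpu_device_wb_free(adev, wb);
 *	}
 */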
1097
1098 /**
1099 * amdgpu_device_resize_fb_bar - try to resize FB BAR
1100 *
1101 * @adev: amdgpu_device pointer
1102 *
1103 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1104 * to fail, but if any of the BARs is not accessible after the resize we abort
1105 * driver loading by returning -ENODEV.
1106 */
1107 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1108 {
1109 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1110 struct pci_bus *root;
1111 struct resource *res;
1112 unsigned int i;
1113 u16 cmd;
1114 int r;
1115
1116 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
1117 return 0;
1118
1119 /* Bypass for VF */
1120 if (amdgpu_sriov_vf(adev))
1121 return 0;
1122
1123 /* resizing on Dell G5 SE platforms causes problems with runtime pm */
1124 if ((amdgpu_runtime_pm != 0) &&
1125 adev->pdev->vendor == PCI_VENDOR_ID_ATI &&
1126 adev->pdev->device == 0x731f &&
1127 adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL)
1128 return 0;
1129
1130 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
1131 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
1132 DRM_WARN("System can't access extended configuration space, please check!\n");
1133
1134 /* skip if the bios has already enabled large BAR */
1135 if (adev->gmc.real_vram_size &&
1136 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1137 return 0;
1138
1139 /* Check if the root BUS has 64bit memory resources */
1140 root = adev->pdev->bus;
1141 while (root->parent)
1142 root = root->parent;
1143
1144 pci_bus_for_each_resource(root, res, i) {
1145 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1146 res->start > 0x100000000ull)
1147 break;
1148 }
1149
1150 /* Trying to resize is pointless without a root hub window above 4GB */
1151 if (!res)
1152 return 0;
1153
1154 /* Limit the BAR size to what is available */
1155 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1156 rbar_size);
1157
1158 /* Disable memory decoding while we change the BAR addresses and size */
1159 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1160 pci_write_config_word(adev->pdev, PCI_COMMAND,
1161 cmd & ~PCI_COMMAND_MEMORY);
1162
1163 /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1164 amdgpu_doorbell_fini(adev);
1165 if (adev->asic_type >= CHIP_BONAIRE)
1166 pci_release_resource(adev->pdev, 2);
1167
1168 pci_release_resource(adev->pdev, 0);
1169
1170 r = pci_resize_resource(adev->pdev, 0, rbar_size);
1171 if (r == -ENOSPC)
1172 DRM_INFO("Not enough PCI address space for a large BAR.");
1173 else if (r && r != -ENOTSUPP)
1174 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1175
1176 pci_assign_unassigned_bus_resources(adev->pdev->bus);
1177
1178 /* When the doorbell or fb BAR isn't available we have no chance of
1179 * using the device.
1180 */
1181 r = amdgpu_doorbell_init(adev);
1182 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1183 return -ENODEV;
1184
1185 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1186
1187 return 0;
1188 }
1189
1190 static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
1191 {
1192 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
1193 return false;
1194
1195 return true;
1196 }
1197
1198 /*
1199 * GPU helpers function.
1200 */
1201 /**
1202 * amdgpu_device_need_post - check if the hw need post or not
1203 *
1204 * @adev: amdgpu_device pointer
1205 *
1206 * Check if the asic has been initialized (all asics) at driver startup
1207 * or post is needed if hw reset is performed.
1208 * Returns true if post is needed, false if not.
1209 */
1210 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1211 {
1212 uint32_t reg;
1213
1214 if (amdgpu_sriov_vf(adev))
1215 return false;
1216
1217 if (!amdgpu_device_read_bios(adev))
1218 return false;
1219
1220 if (amdgpu_passthrough(adev)) {
1221 /* For Fiji: in the whole-GPU pass-through virtualization case, some old
1222 * SMC firmware still needs the driver to do a vPost after a VM reboot,
1223 * otherwise the GPU hangs. SMC firmware versions 22.15 and above do not
1224 * have this flaw, so force vPost only for SMC versions below 22.15.
1225 */
1226 if (adev->asic_type == CHIP_FIJI) {
1227 int err;
1228 uint32_t fw_ver;
1229
1230 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1231 /* force vPost if an error occurred */
1232 if (err)
1233 return true;
1234
1235 fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1236 release_firmware(adev->pm.fw);
1237 if (fw_ver < 0x00160e00)
1238 return true;
1239 }
1240 }
1241
1242 /* Don't post if we need to reset whole hive on init */
1243 if (adev->gmc.xgmi.pending_reset)
1244 return false;
1245
1246 if (adev->has_hw_reset) {
1247 adev->has_hw_reset = false;
1248 return true;
1249 }
1250
1251 /* bios scratch used on CIK+ */
1252 if (adev->asic_type >= CHIP_BONAIRE)
1253 return amdgpu_atombios_scratch_need_asic_init(adev);
1254
1255 /* check MEM_SIZE for older asics */
1256 reg = amdgpu_asic_get_config_memsize(adev);
1257
1258 if ((reg != 0) && (reg != 0xffffffff))
1259 return false;
1260
1261 return true;
1262 }
1263
1264 /*
1265 * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic
1266 * speed switching. Until we have confirmation from Intel that a specific host
1267 * supports it, it's safer that we keep it disabled for all.
1268 *
1269 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
1270 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
1271 */
1272 bool amdgpu_device_pcie_dynamic_switching_supported(void)
1273 {
1274 #if IS_ENABLED(CONFIG_X86)
1275 struct cpuinfo_x86 *c = &cpu_data(0);
1276
1277 if (c->x86_vendor == X86_VENDOR_INTEL)
1278 return false;
1279 #endif
1280 return true;
1281 }
1282
1283 /**
1284 * amdgpu_device_should_use_aspm - check if the device should program ASPM
1285 *
1286 * @adev: amdgpu_device pointer
1287 *
1288 * Confirm whether the module parameter and pcie bridge agree that ASPM should
1289 * be set for this device.
1290 *
1291 * Returns true if it should be used or false if not.
1292 */
1293 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1294 {
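/* amdgpu_aspm module parameter: -1 = auto (honor the current bridge
 * configuration), 0 = force off, 1 = force on
 */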
1295 switch (amdgpu_aspm) {
1296 case -1:
1297 break;
1298 case 0:
1299 return false;
1300 case 1:
1301 return true;
1302 default:
1303 return false;
1304 }
1305 return pcie_aspm_enabled(adev->pdev);
1306 }
1307
1308 bool amdgpu_device_aspm_support_quirk(void)
1309 {
1310 #if IS_ENABLED(CONFIG_X86)
1311 struct cpuinfo_x86 *c = &cpu_data(0);
1312
1313 return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE);
1314 #else
1315 return true;
1316 #endif
1317 }
1318
1319 /* if we get transitioned to only one device, take VGA back */
1320 /**
1321 * amdgpu_device_vga_set_decode - enable/disable vga decode
1322 *
1323 * @pdev: PCI device pointer
1324 * @state: enable/disable vga decode
1325 *
1326 * Enable/disable vga decode (all asics).
1327 * Returns VGA resource flags.
1328 */
1329 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1330 bool state)
1331 {
1332 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1333
1334 amdgpu_asic_set_vga_state(adev, state);
1335 if (state)
1336 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1337 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1338 else
1339 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1340 }
1341
1342 /**
1343 * amdgpu_device_check_block_size - validate the vm block size
1344 *
1345 * @adev: amdgpu_device pointer
1346 *
1347 * Validates the vm block size specified via module parameter.
1348 * The vm block size defines number of bits in page table versus page directory,
1349 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1350 * page table and the remaining bits are in the page directory.
1351 */
1352 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1353 {
1354 /* defines number of bits in page table versus page directory,
1355 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1356 * page table and the remaining bits are in the page directory
1357 */
1358 if (amdgpu_vm_block_size == -1)
1359 return;
1360
1361 if (amdgpu_vm_block_size < 9) {
1362 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1363 amdgpu_vm_block_size);
1364 amdgpu_vm_block_size = -1;
1365 }
1366 }
1367
1368 /**
1369 * amdgpu_device_check_vm_size - validate the vm size
1370 *
1371 * @adev: amdgpu_device pointer
1372 *
1373 * Validates the vm size in GB specified via module parameter.
1374 * The VM size is the size of the GPU virtual memory space in GB.
1375 */
1376 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1377 {
1378 /* no need to check the default value */
1379 if (amdgpu_vm_size == -1)
1380 return;
1381
1382 if (amdgpu_vm_size < 1) {
1383 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1384 amdgpu_vm_size);
1385 amdgpu_vm_size = -1;
1386 }
1387 }
1388
1389 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1390 {
1391 struct sysinfo si;
1392 bool is_os_64 = (sizeof(void *) == 8);
1393 uint64_t total_memory;
1394 uint64_t dram_size_seven_GB = 0x1B8000000;
1395 uint64_t dram_size_three_GB = 0xB8000000;
1396
1397 if (amdgpu_smu_memory_pool_size == 0)
1398 return;
1399
1400 if (!is_os_64) {
1401 DRM_WARN("Not 64-bit OS, feature not supported\n");
1402 goto def_value;
1403 }
1404 si_meminfo(&si);
1405 total_memory = (uint64_t)si.totalram * si.mem_unit;
1406
1407 if ((amdgpu_smu_memory_pool_size == 1) ||
1408 (amdgpu_smu_memory_pool_size == 2)) {
1409 if (total_memory < dram_size_three_GB)
1410 goto def_value1;
1411 } else if ((amdgpu_smu_memory_pool_size == 4) ||
1412 (amdgpu_smu_memory_pool_size == 8)) {
1413 if (total_memory < dram_size_seven_GB)
1414 goto def_value1;
1415 } else {
1416 DRM_WARN("Smu memory pool size not supported\n");
1417 goto def_value;
1418 }
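/* the module parameter is in units of 256MB, hence the shift by 28 */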
1419 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1420
1421 return;
1422
1423 def_value1:
1424 DRM_WARN("Not enough system memory\n");
1425 def_value:
1426 adev->pm.smu_prv_buffer_size = 0;
1427 }
1428
1429 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1430 {
1431 if (!(adev->flags & AMD_IS_APU) ||
1432 adev->asic_type < CHIP_RAVEN)
1433 return 0;
1434
1435 switch (adev->asic_type) {
1436 case CHIP_RAVEN:
1437 if (adev->pdev->device == 0x15dd)
1438 adev->apu_flags |= AMD_APU_IS_RAVEN;
1439 if (adev->pdev->device == 0x15d8)
1440 adev->apu_flags |= AMD_APU_IS_PICASSO;
1441 break;
1442 case CHIP_RENOIR:
1443 if ((adev->pdev->device == 0x1636) ||
1444 (adev->pdev->device == 0x164c))
1445 adev->apu_flags |= AMD_APU_IS_RENOIR;
1446 else
1447 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1448 break;
1449 case CHIP_VANGOGH:
1450 adev->apu_flags |= AMD_APU_IS_VANGOGH;
1451 break;
1452 case CHIP_YELLOW_CARP:
1453 break;
1454 case CHIP_CYAN_SKILLFISH:
1455 if ((adev->pdev->device == 0x13FE) ||
1456 (adev->pdev->device == 0x143F))
1457 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
1458 break;
1459 default:
1460 break;
1461 }
1462
1463 return 0;
1464 }
1465
1466 /**
1467 * amdgpu_device_check_arguments - validate module params
1468 *
1469 * @adev: amdgpu_device pointer
1470 *
1471 * Validates certain module parameters and updates
1472 * the associated values used by the driver (all asics).
1473 */
1474 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1475 {
1476 if (amdgpu_sched_jobs < 4) {
1477 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1478 amdgpu_sched_jobs);
1479 amdgpu_sched_jobs = 4;
1480 } else if (!is_power_of_2(amdgpu_sched_jobs)) {
1481 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1482 amdgpu_sched_jobs);
1483 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1484 }
1485
1486 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1487 /* gart size must be greater than or equal to 32M */
1488 dev_warn(adev->dev, "gart size (%d) too small\n",
1489 amdgpu_gart_size);
1490 amdgpu_gart_size = -1;
1491 }
1492
1493 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1494 /* gtt size must be greater than or equal to 32M */
1495 dev_warn(adev->dev, "gtt size (%d) too small\n",
1496 amdgpu_gtt_size);
1497 amdgpu_gtt_size = -1;
1498 }
1499
1500 /* valid range is between 4 and 9 inclusive */
1501 if (amdgpu_vm_fragment_size != -1 &&
1502 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1503 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1504 amdgpu_vm_fragment_size = -1;
1505 }
1506
1507 if (amdgpu_sched_hw_submission < 2) {
1508 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1509 amdgpu_sched_hw_submission);
1510 amdgpu_sched_hw_submission = 2;
1511 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1512 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1513 amdgpu_sched_hw_submission);
1514 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1515 }
1516
1517 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
1518 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
1519 amdgpu_reset_method = -1;
1520 }
1521
1522 amdgpu_device_check_smu_prv_buffer_size(adev);
1523
1524 amdgpu_device_check_vm_size(adev);
1525
1526 amdgpu_device_check_block_size(adev);
1527
1528 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1529
1530 return 0;
1531 }
1532
1533 /**
1534 * amdgpu_switcheroo_set_state - set switcheroo state
1535 *
1536 * @pdev: pci dev pointer
1537 * @state: vga_switcheroo state
1538 *
1539 * Callback for the switcheroo driver. Suspends or resumes
1540 * the asics before or after it is powered up using ACPI methods.
1541 */
1542 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1543 enum vga_switcheroo_state state)
1544 {
1545 struct drm_device *dev = pci_get_drvdata(pdev);
1546 int r;
1547
1548 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
1549 return;
1550
1551 if (state == VGA_SWITCHEROO_ON) {
1552 pr_info("switched on\n");
1553 /* don't suspend or resume card normally */
1554 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1555
1556 pci_set_power_state(pdev, PCI_D0);
1557 amdgpu_device_load_pci_state(pdev);
1558 r = pci_enable_device(pdev);
1559 if (r)
1560 DRM_WARN("pci_enable_device failed (%d)\n", r);
1561 amdgpu_device_resume(dev, true);
1562
1563 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1564 } else {
1565 pr_info("switched off\n");
1566 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1567 amdgpu_device_prepare(dev);
1568 amdgpu_device_suspend(dev, true);
1569 amdgpu_device_cache_pci_state(pdev);
1570 /* Shut down the device */
1571 pci_disable_device(pdev);
1572 pci_set_power_state(pdev, PCI_D3cold);
1573 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1574 }
1575 }
1576
1577 /**
1578 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1579 *
1580 * @pdev: pci dev pointer
1581 *
1582 * Callback for the switcheroo driver. Check if the switcheroo
1583 * state can be changed.
1584 * Returns true if the state can be changed, false if not.
1585 */
1586 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1587 {
1588 struct drm_device *dev = pci_get_drvdata(pdev);
1589
1590 /*
1591 * FIXME: open_count is protected by drm_global_mutex but that would lead to
1592 * locking inversion with the driver load path. And the access here is
1593 * completely racy anyway. So don't bother with locking for now.
1594 */
1595 return atomic_read(&dev->open_count) == 0;
1596 }
1597
1598 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1599 .set_gpu_state = amdgpu_switcheroo_set_state,
1600 .reprobe = NULL,
1601 .can_switch = amdgpu_switcheroo_can_switch,
1602 };
1603
1604 /**
1605 * amdgpu_device_ip_set_clockgating_state - set the CG state
1606 *
1607 * @dev: amdgpu_device pointer
1608 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1609 * @state: clockgating state (gate or ungate)
1610 *
1611 * Sets the requested clockgating state for all instances of
1612 * the hardware IP specified.
1613 * Returns the error code from the last instance.
1614 */
1615 int amdgpu_device_ip_set_clockgating_state(void *dev,
1616 enum amd_ip_block_type block_type,
1617 enum amd_clockgating_state state)
1618 {
1619 struct amdgpu_device *adev = dev;
1620 int i, r = 0;
1621
1622 for (i = 0; i < adev->num_ip_blocks; i++) {
1623 if (!adev->ip_blocks[i].status.valid)
1624 continue;
1625 if (adev->ip_blocks[i].version->type != block_type)
1626 continue;
1627 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1628 continue;
1629 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1630 (void *)adev, state);
1631 if (r)
1632 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1633 adev->ip_blocks[i].version->funcs->name, r);
1634 }
1635 return r;
1636 }
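/*
 * Illustrative call, gating clocks on every GFX IP instance (the enum
 * values come from amd_shared.h):
 *
 *	amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *					       AMD_CG_STATE_GATE);
 */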
1637
1638 /**
1639 * amdgpu_device_ip_set_powergating_state - set the PG state
1640 *
1641 * @dev: amdgpu_device pointer
1642 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1643 * @state: powergating state (gate or ungate)
1644 *
1645 * Sets the requested powergating state for all instances of
1646 * the hardware IP specified.
1647 * Returns the error code from the last instance.
1648 */
1649 int amdgpu_device_ip_set_powergating_state(void *dev,
1650 enum amd_ip_block_type block_type,
1651 enum amd_powergating_state state)
1652 {
1653 struct amdgpu_device *adev = dev;
1654 int i, r = 0;
1655
1656 for (i = 0; i < adev->num_ip_blocks; i++) {
1657 if (!adev->ip_blocks[i].status.valid)
1658 continue;
1659 if (adev->ip_blocks[i].version->type != block_type)
1660 continue;
1661 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1662 continue;
1663 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1664 (void *)adev, state);
1665 if (r)
1666 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1667 adev->ip_blocks[i].version->funcs->name, r);
1668 }
1669 return r;
1670 }
1671
1672 /**
1673 * amdgpu_device_ip_get_clockgating_state - get the CG state
1674 *
1675 * @adev: amdgpu_device pointer
1676 * @flags: clockgating feature flags
1677 *
1678 * Walks the list of IPs on the device and updates the clockgating
1679 * flags for each IP.
1680 * Updates @flags with the feature flags for each hardware IP where
1681 * clockgating is enabled.
1682 */
1683 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1684 u64 *flags)
1685 {
1686 int i;
1687
1688 for (i = 0; i < adev->num_ip_blocks; i++) {
1689 if (!adev->ip_blocks[i].status.valid)
1690 continue;
1691 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1692 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1693 }
1694 }
1695
1696 /**
1697 * amdgpu_device_ip_wait_for_idle - wait for idle
1698 *
1699 * @adev: amdgpu_device pointer
1700 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1701 *
1702 * Waits for the requested hardware IP to be idle.
1703 * Returns 0 for success or a negative error code on failure.
1704 */
1705 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1706 enum amd_ip_block_type block_type)
1707 {
1708 int i, r;
1709
1710 for (i = 0; i < adev->num_ip_blocks; i++) {
1711 if (!adev->ip_blocks[i].status.valid)
1712 continue;
1713 if (adev->ip_blocks[i].version->type == block_type) {
1714 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1715 if (r)
1716 return r;
1717 break;
1718 }
1719 }
1720 return 0;
1721
1722 }
1723
1724 /**
1725 * amdgpu_device_ip_is_idle - is the hardware IP idle
1726 *
1727 * @adev: amdgpu_device pointer
1728 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1729 *
1730 * Check if the hardware IP is idle or not.
1731 * Returns true if the IP is idle, false if not.
1732 */
1733 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1734 enum amd_ip_block_type block_type)
1735 {
1736 int i;
1737
1738 for (i = 0; i < adev->num_ip_blocks; i++) {
1739 if (!adev->ip_blocks[i].status.valid)
1740 continue;
1741 if (adev->ip_blocks[i].version->type == block_type)
1742 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1743 }
1744 return true;
1745
1746 }
1747
1748 /**
1749 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1750 *
1751 * @adev: amdgpu_device pointer
1752 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1753 *
1754 * Returns a pointer to the hardware IP block structure
1755 * if it exists for the asic, otherwise NULL.
1756 */
1757 struct amdgpu_ip_block *
1758 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1759 enum amd_ip_block_type type)
1760 {
1761 int i;
1762
1763 for (i = 0; i < adev->num_ip_blocks; i++)
1764 if (adev->ip_blocks[i].version->type == type)
1765 return &adev->ip_blocks[i];
1766
1767 return NULL;
1768 }
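
/*
 * Hypothetical usage example: look up an IP block to inspect its version
 * before choosing a code path (GMC is only an example here):
 *
 *	struct amdgpu_ip_block *ip =
 *		amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GMC);
 *	if (ip)
 *		DRM_DEBUG("GMC %u.%u\n", ip->version->major, ip->version->minor);
 */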
1769
1770 /**
1771 * amdgpu_device_ip_block_version_cmp
1772 *
1773 * @adev: amdgpu_device pointer
1774 * @type: enum amd_ip_block_type
1775 * @major: major version
1776 * @minor: minor version
1777 *
1778 * Return 0 if the installed IP block version is equal to or greater than the
1779 * requested version, or 1 if it is smaller or the ip_block doesn't exist.
1780 */
1781 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1782 enum amd_ip_block_type type,
1783 u32 major, u32 minor)
1784 {
1785 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1786
1787 if (ip_block && ((ip_block->version->major > major) ||
1788 ((ip_block->version->major == major) &&
1789 (ip_block->version->minor >= minor))))
1790 return 0;
1791
1792 return 1;
1793 }
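
/*
 * Hypothetical usage example; note the inverted return convention documented
 * above (0 means "at least this version", 1 means older or not present):
 *
 *	smc_7_1_or_newer =
 *		!amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_SMC, 7, 1);
 */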
1794
1795 /**
1796 * amdgpu_device_ip_block_add
1797 *
1798 * @adev: amdgpu_device pointer
1799 * @ip_block_version: pointer to the IP to add
1800 *
1801 * Adds the IP block driver information to the collection of IPs
1802 * on the asic.
1803 */
1804 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1805 const struct amdgpu_ip_block_version *ip_block_version)
1806 {
1807 if (!ip_block_version)
1808 return -EINVAL;
1809
1810 switch (ip_block_version->type) {
1811 case AMD_IP_BLOCK_TYPE_VCN:
1812 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
1813 return 0;
1814 break;
1815 case AMD_IP_BLOCK_TYPE_JPEG:
1816 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
1817 return 0;
1818 break;
1819 default:
1820 break;
1821 }
1822
1823 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1824 ip_block_version->funcs->name);
1825
1826 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1827
1828 return 0;
1829 }
1830
1831 /**
1832 * amdgpu_device_enable_virtual_display - enable virtual display feature
1833 *
1834 * @adev: amdgpu_device pointer
1835 *
1836 * Enables the virtual display feature if the user has enabled it via
1837 * the module parameter virtual_display. This feature provides a virtual
1838 * display hardware on headless boards or in virtualized environments.
1839 * This function parses and validates the configuration string specified by
1840 * the user and configures the virtual display configuration (number of
1841 * virtual connectors, crtcs, etc.) specified.
1842 */
1843 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1844 {
1845 adev->enable_virtual_display = false;
1846
1847 if (amdgpu_virtual_display) {
1848 const char *pci_address_name = pci_name(adev->pdev);
1849 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1850
1851 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1852 pciaddstr_tmp = pciaddstr;
1853 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1854 pciaddname = strsep(&pciaddname_tmp, ",");
1855 if (!strcmp("all", pciaddname)
1856 || !strcmp(pci_address_name, pciaddname)) {
1857 long num_crtc;
1858 int res = -1;
1859
1860 adev->enable_virtual_display = true;
1861
1862 if (pciaddname_tmp)
1863 res = kstrtol(pciaddname_tmp, 10,
1864 &num_crtc);
1865
1866 if (!res) {
1867 if (num_crtc < 1)
1868 num_crtc = 1;
1869 if (num_crtc > 6)
1870 num_crtc = 6;
1871 adev->mode_info.num_crtc = num_crtc;
1872 } else {
1873 adev->mode_info.num_crtc = 1;
1874 }
1875 break;
1876 }
1877 }
1878
1879 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1880 amdgpu_virtual_display, pci_address_name,
1881 adev->enable_virtual_display, adev->mode_info.num_crtc);
1882
1883 kfree(pciaddstr);
1884 }
1885 }
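
/*
 * Note on the string parsed above: amdgpu.virtual_display takes a
 * semicolon-separated list of entries, each either "all" or a PCI address
 * as printed by pci_name(), optionally followed by ",<num_crtc>" where the
 * crtc count is clamped to 1..6 and defaults to 1 when absent or
 * unparsable, e.g. (hypothetical address) "0000:03:00.0,2;all".
 */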
1886
1887 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
1888 {
1889 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
1890 adev->mode_info.num_crtc = 1;
1891 adev->enable_virtual_display = true;
1892 DRM_INFO("virtual_display:%d, num_crtc:%d\n",
1893 adev->enable_virtual_display, adev->mode_info.num_crtc);
1894 }
1895 }
1896
1897 /**
1898 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1899 *
1900 * @adev: amdgpu_device pointer
1901 *
1902 * Parses the asic configuration parameters specified in the gpu info
1903 * firmware and makes them available to the driver for use in configuring
1904 * the asic.
1905 * Returns 0 on success, -EINVAL on failure.
1906 */
1907 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1908 {
1909 const char *chip_name;
1910 char fw_name[40];
1911 int err;
1912 const struct gpu_info_firmware_header_v1_0 *hdr;
1913
1914 adev->firmware.gpu_info_fw = NULL;
1915
1916 if (adev->mman.discovery_bin)
1917 return 0;
1918
1919 switch (adev->asic_type) {
1920 default:
1921 return 0;
1922 case CHIP_VEGA10:
1923 chip_name = "vega10";
1924 break;
1925 case CHIP_VEGA12:
1926 chip_name = "vega12";
1927 break;
1928 case CHIP_RAVEN:
1929 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1930 chip_name = "raven2";
1931 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1932 chip_name = "picasso";
1933 else
1934 chip_name = "raven";
1935 break;
1936 case CHIP_ARCTURUS:
1937 chip_name = "arcturus";
1938 break;
1939 case CHIP_NAVI12:
1940 chip_name = "navi12";
1941 break;
1942 }
1943
1944 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1945 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name);
1946 if (err) {
1947 dev_err(adev->dev,
1948 "Failed to get gpu_info firmware \"%s\"\n",
1949 fw_name);
1950 goto out;
1951 }
1952
1953 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1954 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1955
1956 switch (hdr->version_major) {
1957 case 1:
1958 {
1959 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1960 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1961 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1962
1963 /*
1964 * Should be dropped when DAL no longer needs it.
1965 */
1966 if (adev->asic_type == CHIP_NAVI12)
1967 goto parse_soc_bounding_box;
1968
1969 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1970 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1971 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1972 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1973 adev->gfx.config.max_texture_channel_caches =
1974 le32_to_cpu(gpu_info_fw->gc_num_tccs);
1975 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1976 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1977 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1978 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1979 adev->gfx.config.double_offchip_lds_buf =
1980 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1981 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1982 adev->gfx.cu_info.max_waves_per_simd =
1983 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1984 adev->gfx.cu_info.max_scratch_slots_per_cu =
1985 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1986 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1987 if (hdr->version_minor >= 1) {
1988 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1989 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1990 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1991 adev->gfx.config.num_sc_per_sh =
1992 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1993 adev->gfx.config.num_packer_per_sc =
1994 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1995 }
1996
1997 parse_soc_bounding_box:
1998 /*
1999 * soc bounding box info is not integrated in the discovery table,
2000 * so we always need to parse it from the gpu info firmware when needed.
2001 */
2002 if (hdr->version_minor == 2) {
2003 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2004 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2005 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2006 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2007 }
2008 break;
2009 }
2010 default:
2011 dev_err(adev->dev,
2012 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2013 err = -EINVAL;
2014 goto out;
2015 }
2016 out:
2017 return err;
2018 }
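
/*
 * The firmware requested above follows the "amdgpu/<chip>_gpu_info.bin"
 * naming convention built by the snprintf() call; ASICs that carry an IP
 * discovery binary (adev->mman.discovery_bin) return early and never load
 * a gpu_info table at all.
 */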
2019
2020 /**
2021 * amdgpu_device_ip_early_init - run early init for hardware IPs
2022 *
2023 * @adev: amdgpu_device pointer
2024 *
2025 * Early initialization pass for hardware IPs. The hardware IPs that make
2026 * up each asic are discovered and each IP's early_init callback is run. This
2027 * is the first stage in initializing the asic.
2028 * Returns 0 on success, negative error code on failure.
2029 */
2030 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2031 {
2032 struct pci_dev *parent;
2033 int i, r;
2034 bool total;
2035
2036 amdgpu_device_enable_virtual_display(adev);
2037
2038 if (amdgpu_sriov_vf(adev)) {
2039 r = amdgpu_virt_request_full_gpu(adev, true);
2040 if (r)
2041 return r;
2042 }
2043
2044 switch (adev->asic_type) {
2045 #ifdef CONFIG_DRM_AMDGPU_SI
2046 case CHIP_VERDE:
2047 case CHIP_TAHITI:
2048 case CHIP_PITCAIRN:
2049 case CHIP_OLAND:
2050 case CHIP_HAINAN:
2051 adev->family = AMDGPU_FAMILY_SI;
2052 r = si_set_ip_blocks(adev);
2053 if (r)
2054 return r;
2055 break;
2056 #endif
2057 #ifdef CONFIG_DRM_AMDGPU_CIK
2058 case CHIP_BONAIRE:
2059 case CHIP_HAWAII:
2060 case CHIP_KAVERI:
2061 case CHIP_KABINI:
2062 case CHIP_MULLINS:
2063 if (adev->flags & AMD_IS_APU)
2064 adev->family = AMDGPU_FAMILY_KV;
2065 else
2066 adev->family = AMDGPU_FAMILY_CI;
2067
2068 r = cik_set_ip_blocks(adev);
2069 if (r)
2070 return r;
2071 break;
2072 #endif
2073 case CHIP_TOPAZ:
2074 case CHIP_TONGA:
2075 case CHIP_FIJI:
2076 case CHIP_POLARIS10:
2077 case CHIP_POLARIS11:
2078 case CHIP_POLARIS12:
2079 case CHIP_VEGAM:
2080 case CHIP_CARRIZO:
2081 case CHIP_STONEY:
2082 if (adev->flags & AMD_IS_APU)
2083 adev->family = AMDGPU_FAMILY_CZ;
2084 else
2085 adev->family = AMDGPU_FAMILY_VI;
2086
2087 r = vi_set_ip_blocks(adev);
2088 if (r)
2089 return r;
2090 break;
2091 default:
2092 r = amdgpu_discovery_set_ip_blocks(adev);
2093 if (r)
2094 return r;
2095 break;
2096 }
2097
2098 if (amdgpu_has_atpx() &&
2099 (amdgpu_is_atpx_hybrid() ||
2100 amdgpu_has_atpx_dgpu_power_cntl()) &&
2101 ((adev->flags & AMD_IS_APU) == 0) &&
2102 !dev_is_removable(&adev->pdev->dev))
2103 adev->flags |= AMD_IS_PX;
2104
2105 if (!(adev->flags & AMD_IS_APU)) {
2106 parent = pcie_find_root_port(adev->pdev);
2107 adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2108 }
2109
2110
2111 adev->pm.pp_feature = amdgpu_pp_feature_mask;
2112 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2113 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2114 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2115 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2116 if (!amdgpu_device_pcie_dynamic_switching_supported())
2117 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;
2118
2119 total = true;
2120 for (i = 0; i < adev->num_ip_blocks; i++) {
2121 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2122 DRM_WARN("disabled ip block: %d <%s>\n",
2123 i, adev->ip_blocks[i].version->funcs->name);
2124 adev->ip_blocks[i].status.valid = false;
2125 } else {
2126 if (adev->ip_blocks[i].version->funcs->early_init) {
2127 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2128 if (r == -ENOENT) {
2129 adev->ip_blocks[i].status.valid = false;
2130 } else if (r) {
2131 DRM_ERROR("early_init of IP block <%s> failed %d\n",
2132 adev->ip_blocks[i].version->funcs->name, r);
2133 total = false;
2134 } else {
2135 adev->ip_blocks[i].status.valid = true;
2136 }
2137 } else {
2138 adev->ip_blocks[i].status.valid = true;
2139 }
2140 }
2141 /* get the vbios after the asic_funcs are set up */
2142 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2143 r = amdgpu_device_parse_gpu_info_fw(adev);
2144 if (r)
2145 return r;
2146
2147 /* Read BIOS */
2148 if (amdgpu_device_read_bios(adev)) {
2149 if (!amdgpu_get_bios(adev))
2150 return -EINVAL;
2151
2152 r = amdgpu_atombios_init(adev);
2153 if (r) {
2154 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2155 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2156 return r;
2157 }
2158 }
2159
2160 /* get pf2vf msg info at its earliest time */
2161 if (amdgpu_sriov_vf(adev))
2162 amdgpu_virt_init_data_exchange(adev);
2163
2164 }
2165 }
2166 if (!total)
2167 return -ENODEV;
2168
2169 amdgpu_amdkfd_device_probe(adev);
2170 adev->cg_flags &= amdgpu_cg_mask;
2171 adev->pg_flags &= amdgpu_pg_mask;
2172
2173 return 0;
2174 }
2175
2176 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2177 {
2178 int i, r;
2179
2180 for (i = 0; i < adev->num_ip_blocks; i++) {
2181 if (!adev->ip_blocks[i].status.sw)
2182 continue;
2183 if (adev->ip_blocks[i].status.hw)
2184 continue;
2185 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2186 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2187 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2188 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2189 if (r) {
2190 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2191 adev->ip_blocks[i].version->funcs->name, r);
2192 return r;
2193 }
2194 adev->ip_blocks[i].status.hw = true;
2195 }
2196 }
2197
2198 return 0;
2199 }
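
/*
 * Hardware init is deliberately split in two: phase 1 above brings up only
 * the COMMON and IH blocks (plus PSP when running as an SR-IOV VF), and
 * phase 2 below initializes everything that is still pending.
 * amdgpu_device_ip_init() runs them in the order phase 1, then
 * amdgpu_device_fw_loading(), then phase 2.
 */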
2200
2201 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2202 {
2203 int i, r;
2204
2205 for (i = 0; i < adev->num_ip_blocks; i++) {
2206 if (!adev->ip_blocks[i].status.sw)
2207 continue;
2208 if (adev->ip_blocks[i].status.hw)
2209 continue;
2210 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2211 if (r) {
2212 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2213 adev->ip_blocks[i].version->funcs->name, r);
2214 return r;
2215 }
2216 adev->ip_blocks[i].status.hw = true;
2217 }
2218
2219 return 0;
2220 }
2221
2222 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2223 {
2224 int r = 0;
2225 int i;
2226 uint32_t smu_version;
2227
2228 if (adev->asic_type >= CHIP_VEGA10) {
2229 for (i = 0; i < adev->num_ip_blocks; i++) {
2230 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2231 continue;
2232
2233 if (!adev->ip_blocks[i].status.sw)
2234 continue;
2235
2236 /* no need to do the fw loading again if already done */
2237 if (adev->ip_blocks[i].status.hw == true)
2238 break;
2239
2240 if (amdgpu_in_reset(adev) || adev->in_suspend) {
2241 r = adev->ip_blocks[i].version->funcs->resume(adev);
2242 if (r) {
2243 DRM_ERROR("resume of IP block <%s> failed %d\n",
2244 adev->ip_blocks[i].version->funcs->name, r);
2245 return r;
2246 }
2247 } else {
2248 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2249 if (r) {
2250 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2251 adev->ip_blocks[i].version->funcs->name, r);
2252 return r;
2253 }
2254 }
2255
2256 adev->ip_blocks[i].status.hw = true;
2257 break;
2258 }
2259 }
2260
2261 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2262 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2263
2264 return r;
2265 }
2266
2267 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2268 {
2269 long timeout;
2270 int r, i;
2271
2272 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2273 struct amdgpu_ring *ring = adev->rings[i];
2274
2275 /* No need to setup the GPU scheduler for rings that don't need it */
2276 if (!ring || ring->no_scheduler)
2277 continue;
2278
2279 switch (ring->funcs->type) {
2280 case AMDGPU_RING_TYPE_GFX:
2281 timeout = adev->gfx_timeout;
2282 break;
2283 case AMDGPU_RING_TYPE_COMPUTE:
2284 timeout = adev->compute_timeout;
2285 break;
2286 case AMDGPU_RING_TYPE_SDMA:
2287 timeout = adev->sdma_timeout;
2288 break;
2289 default:
2290 timeout = adev->video_timeout;
2291 break;
2292 }
2293
2294 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
2295 ring->num_hw_submission, 0,
2296 timeout, adev->reset_domain->wq,
2297 ring->sched_score, ring->name,
2298 adev->dev);
2299 if (r) {
2300 DRM_ERROR("Failed to create scheduler on ring %s.\n",
2301 ring->name);
2302 return r;
2303 }
2304 }
2305
2306 amdgpu_xcp_update_partition_sched_list(adev);
2307
2308 return 0;
2309 }
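
/*
 * The per-ring timeouts chosen above (gfx/compute/sdma/video) are the values
 * derived from the amdgpu.lockup_timeout module parameter by
 * amdgpu_device_get_job_timeout_settings() later in this file.
 */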
2310
2311
2312 /**
2313 * amdgpu_device_ip_init - run init for hardware IPs
2314 *
2315 * @adev: amdgpu_device pointer
2316 *
2317 * Main initialization pass for hardware IPs. The list of all the hardware
2318 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2319 * are run. sw_init initializes the software state associated with each IP
2320 * and hw_init initializes the hardware associated with each IP.
2321 * Returns 0 on success, negative error code on failure.
2322 */
2323 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2324 {
2325 int i, r;
2326
2327 r = amdgpu_ras_init(adev);
2328 if (r)
2329 return r;
2330
2331 for (i = 0; i < adev->num_ip_blocks; i++) {
2332 if (!adev->ip_blocks[i].status.valid)
2333 continue;
2334 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2335 if (r) {
2336 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2337 adev->ip_blocks[i].version->funcs->name, r);
2338 goto init_failed;
2339 }
2340 adev->ip_blocks[i].status.sw = true;
2341
2342 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2343 /* need to do common hw init early so everything is set up for gmc */
2344 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2345 if (r) {
2346 DRM_ERROR("hw_init %d failed %d\n", i, r);
2347 goto init_failed;
2348 }
2349 adev->ip_blocks[i].status.hw = true;
2350 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2351 /* need to do gmc hw init early so we can allocate gpu mem */
2352 /* Try to reserve bad pages early */
2353 if (amdgpu_sriov_vf(adev))
2354 amdgpu_virt_exchange_data(adev);
2355
2356 r = amdgpu_device_mem_scratch_init(adev);
2357 if (r) {
2358 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
2359 goto init_failed;
2360 }
2361 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2362 if (r) {
2363 DRM_ERROR("hw_init %d failed %d\n", i, r);
2364 goto init_failed;
2365 }
2366 r = amdgpu_device_wb_init(adev);
2367 if (r) {
2368 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2369 goto init_failed;
2370 }
2371 adev->ip_blocks[i].status.hw = true;
2372
2373 /* right after GMC hw init, we create CSA */
2374 if (adev->gfx.mcbp) {
2375 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2376 AMDGPU_GEM_DOMAIN_VRAM |
2377 AMDGPU_GEM_DOMAIN_GTT,
2378 AMDGPU_CSA_SIZE);
2379 if (r) {
2380 DRM_ERROR("allocate CSA failed %d\n", r);
2381 goto init_failed;
2382 }
2383 }
2384 }
2385 }
2386
2387 if (amdgpu_sriov_vf(adev))
2388 amdgpu_virt_init_data_exchange(adev);
2389
2390 r = amdgpu_ib_pool_init(adev);
2391 if (r) {
2392 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2393 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2394 goto init_failed;
2395 }
2396
2397 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2398 if (r)
2399 goto init_failed;
2400
2401 r = amdgpu_device_ip_hw_init_phase1(adev);
2402 if (r)
2403 goto init_failed;
2404
2405 r = amdgpu_device_fw_loading(adev);
2406 if (r)
2407 goto init_failed;
2408
2409 r = amdgpu_device_ip_hw_init_phase2(adev);
2410 if (r)
2411 goto init_failed;
2412
2413 /*
2414 * retired pages will be loaded from eeprom and reserved here,
2415 * it should be called after amdgpu_device_ip_hw_init_phase2 since
2416 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
2417 * for I2C communication, which is only true at this point.
2418 *
2419 * amdgpu_ras_recovery_init may fail, but the upper layers only care about
2420 * the failure caused by a bad gpu situation and stop the amdgpu init
2421 * process accordingly. For other failure cases, it will still release all
2422 * the resources and print an error message, rather than returning a
2423 * negative value to the upper level.
2424 *
2425 * Note: theoretically, this should be called before all vram allocations
2426 * to protect retired pages from being handed out again.
2427 */
2428 r = amdgpu_ras_recovery_init(adev);
2429 if (r)
2430 goto init_failed;
2431
2432 /**
2433 * In case of XGMI grab extra reference for reset domain for this device
2434 */
2435 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2436 if (amdgpu_xgmi_add_device(adev) == 0) {
2437 if (!amdgpu_sriov_vf(adev)) {
2438 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2439
2440 if (WARN_ON(!hive)) {
2441 r = -ENOENT;
2442 goto init_failed;
2443 }
2444
2445 if (!hive->reset_domain ||
2446 !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
2447 r = -ENOENT;
2448 amdgpu_put_xgmi_hive(hive);
2449 goto init_failed;
2450 }
2451
2452 /* Drop the early temporary reset domain we created for device */
2453 amdgpu_reset_put_reset_domain(adev->reset_domain);
2454 adev->reset_domain = hive->reset_domain;
2455 amdgpu_put_xgmi_hive(hive);
2456 }
2457 }
2458 }
2459
2460 r = amdgpu_device_init_schedulers(adev);
2461 if (r)
2462 goto init_failed;
2463
2464 /* Don't init kfd if the whole hive needs to be reset during init */
2465 if (!adev->gmc.xgmi.pending_reset) {
2466 kgd2kfd_init_zone_device(adev);
2467 amdgpu_amdkfd_device_init(adev);
2468 }
2469
2470 amdgpu_fru_get_product_info(adev);
2471
2472 init_failed:
2473
2474 return r;
2475 }
2476
2477 /**
2478 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2479 *
2480 * @adev: amdgpu_device pointer
2481 *
2482 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2483 * this function before a GPU reset. If the value is retained after a
2484 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2485 */
2486 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2487 {
2488 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2489 }
2490
2491 /**
2492 * amdgpu_device_check_vram_lost - check if vram is valid
2493 *
2494 * @adev: amdgpu_device pointer
2495 *
2496 * Checks the reset magic value written to the gart pointer in VRAM.
2497 * The driver calls this after a GPU reset to see if the contents of
2498 * VRAM is lost or not.
2499 * returns true if vram is lost, false if not.
2500 */
2501 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2502 {
2503 if (memcmp(adev->gart.ptr, adev->reset_magic,
2504 AMDGPU_RESET_MAGIC_NUM))
2505 return true;
2506
2507 if (!amdgpu_in_reset(adev))
2508 return false;
2509
2510 /*
2511 * For all ASICs with baco/mode1 reset, the VRAM is
2512 * always assumed to be lost.
2513 */
2514 switch (amdgpu_asic_reset_method(adev)) {
2515 case AMD_RESET_METHOD_BACO:
2516 case AMD_RESET_METHOD_MODE1:
2517 return true;
2518 default:
2519 return false;
2520 }
2521 }
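
/*
 * Illustrative (hypothetical) use in a reset path: after the ASIC has been
 * reset and the IPs resumed, recovery code can consult this helper to decide
 * whether buffer contents need to be treated as lost:
 *
 *	if (amdgpu_device_check_vram_lost(adev))
 *		(invalidate or restore VRAM buffer contents)
 */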
2522
2523 /**
2524 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2525 *
2526 * @adev: amdgpu_device pointer
2527 * @state: clockgating state (gate or ungate)
2528 *
2529 * The list of all the hardware IPs that make up the asic is walked and the
2530 * set_clockgating_state callbacks are run.
2531 * During the late init pass this enables clockgating for the hardware IPs;
2532 * during the fini or suspend pass it disables clockgating.
2533 * Returns 0 on success, negative error code on failure.
2534 */
2535
2536 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2537 enum amd_clockgating_state state)
2538 {
2539 int i, j, r;
2540
2541 if (amdgpu_emu_mode == 1)
2542 return 0;
2543
2544 for (j = 0; j < adev->num_ip_blocks; j++) {
2545 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2546 if (!adev->ip_blocks[i].status.late_initialized)
2547 continue;
2548 /* skip CG for GFX, SDMA on S0ix */
2549 if (adev->in_s0ix &&
2550 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2551 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
2552 continue;
2553 /* skip CG for VCE/UVD, it's handled specially */
2554 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2555 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2556 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2557 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2558 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2559 /* enable clockgating to save power */
2560 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2561 state);
2562 if (r) {
2563 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2564 adev->ip_blocks[i].version->funcs->name, r);
2565 return r;
2566 }
2567 }
2568 }
2569
2570 return 0;
2571 }
2572
2573 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
2574 enum amd_powergating_state state)
2575 {
2576 int i, j, r;
2577
2578 if (amdgpu_emu_mode == 1)
2579 return 0;
2580
2581 for (j = 0; j < adev->num_ip_blocks; j++) {
2582 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2583 if (!adev->ip_blocks[i].status.late_initialized)
2584 continue;
2585 /* skip PG for GFX, SDMA on S0ix */
2586 if (adev->in_s0ix &&
2587 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2588 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
2589 continue;
2590 /* skip PG for VCE/UVD, it's handled specially */
2591 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2592 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2593 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2594 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2595 adev->ip_blocks[i].version->funcs->set_powergating_state) {
2596 /* enable powergating to save power */
2597 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2598 state);
2599 if (r) {
2600 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2601 adev->ip_blocks[i].version->funcs->name, r);
2602 return r;
2603 }
2604 }
2605 }
2606 return 0;
2607 }
2608
2609 static int amdgpu_device_enable_mgpu_fan_boost(void)
2610 {
2611 struct amdgpu_gpu_instance *gpu_ins;
2612 struct amdgpu_device *adev;
2613 int i, ret = 0;
2614
2615 mutex_lock(&mgpu_info.mutex);
2616
2617 /*
2618 * MGPU fan boost feature should be enabled
2619 * only when there are two or more dGPUs in
2620 * the system
2621 */
2622 if (mgpu_info.num_dgpu < 2)
2623 goto out;
2624
2625 for (i = 0; i < mgpu_info.num_dgpu; i++) {
2626 gpu_ins = &(mgpu_info.gpu_ins[i]);
2627 adev = gpu_ins->adev;
2628 if (!(adev->flags & AMD_IS_APU) &&
2629 !gpu_ins->mgpu_fan_enabled) {
2630 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2631 if (ret)
2632 break;
2633
2634 gpu_ins->mgpu_fan_enabled = 1;
2635 }
2636 }
2637
2638 out:
2639 mutex_unlock(&mgpu_info.mutex);
2640
2641 return ret;
2642 }
2643
2644 /**
2645 * amdgpu_device_ip_late_init - run late init for hardware IPs
2646 *
2647 * @adev: amdgpu_device pointer
2648 *
2649 * Late initialization pass for hardware IPs. The list of all the hardware
2650 * IPs that make up the asic is walked and the late_init callbacks are run.
2651 * late_init covers any special initialization that an IP requires
2652 * after all of them have been initialized or something that needs to happen
2653 * late in the init process.
2654 * Returns 0 on success, negative error code on failure.
2655 */
2656 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2657 {
2658 struct amdgpu_gpu_instance *gpu_instance;
2659 int i = 0, r;
2660
2661 for (i = 0; i < adev->num_ip_blocks; i++) {
2662 if (!adev->ip_blocks[i].status.hw)
2663 continue;
2664 if (adev->ip_blocks[i].version->funcs->late_init) {
2665 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2666 if (r) {
2667 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2668 adev->ip_blocks[i].version->funcs->name, r);
2669 return r;
2670 }
2671 }
2672 adev->ip_blocks[i].status.late_initialized = true;
2673 }
2674
2675 r = amdgpu_ras_late_init(adev);
2676 if (r) {
2677 DRM_ERROR("amdgpu_ras_late_init failed %d", r);
2678 return r;
2679 }
2680
2681 amdgpu_ras_set_error_query_ready(adev, true);
2682
2683 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2684 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2685
2686 amdgpu_device_fill_reset_magic(adev);
2687
2688 r = amdgpu_device_enable_mgpu_fan_boost();
2689 if (r)
2690 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2691
2692 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */
2693 if (amdgpu_passthrough(adev) &&
2694 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
2695 adev->asic_type == CHIP_ALDEBARAN))
2696 amdgpu_dpm_handle_passthrough_sbr(adev, true);
2697
2698 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2699 mutex_lock(&mgpu_info.mutex);
2700
2701 /*
2702 * Reset device p-state to low as this was booted with high.
2703 *
2704 * This should be performed only after all devices from the same
2705 * hive get initialized.
2706 *
2707 * However, the number of devices in a hive is not known in advance;
2708 * it is counted one by one as the devices are initialized.
2709 *
2710 * So, we wait for all XGMI interlinked devices initialized.
2711 * This may bring some delays as those devices may come from
2712 * different hives. But that should be OK.
2713 */
2714 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2715 for (i = 0; i < mgpu_info.num_gpu; i++) {
2716 gpu_instance = &(mgpu_info.gpu_ins[i]);
2717 if (gpu_instance->adev->flags & AMD_IS_APU)
2718 continue;
2719
2720 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2721 AMDGPU_XGMI_PSTATE_MIN);
2722 if (r) {
2723 DRM_ERROR("pstate setting failed (%d).\n", r);
2724 break;
2725 }
2726 }
2727 }
2728
2729 mutex_unlock(&mgpu_info.mutex);
2730 }
2731
2732 return 0;
2733 }
2734
2735 /**
2736 * amdgpu_device_smu_fini_early - smu hw_fini wrapper
2737 *
2738 * @adev: amdgpu_device pointer
2739 *
2740 * For ASICs that need to disable the SMC first
2741 */
2742 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
2743 {
2744 int i, r;
2745
2746 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))
2747 return;
2748
2749 for (i = 0; i < adev->num_ip_blocks; i++) {
2750 if (!adev->ip_blocks[i].status.hw)
2751 continue;
2752 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2753 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2754 /* XXX handle errors */
2755 if (r) {
2756 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2757 adev->ip_blocks[i].version->funcs->name, r);
2758 }
2759 adev->ip_blocks[i].status.hw = false;
2760 break;
2761 }
2762 }
2763 }
2764
2765 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
2766 {
2767 int i, r;
2768
2769 for (i = 0; i < adev->num_ip_blocks; i++) {
2770 if (!adev->ip_blocks[i].version->funcs->early_fini)
2771 continue;
2772
2773 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
2774 if (r) {
2775 DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
2776 adev->ip_blocks[i].version->funcs->name, r);
2777 }
2778 }
2779
2780 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2781 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2782
2783 amdgpu_amdkfd_suspend(adev, false);
2784
2785 /* Workaround for ASICs that need to disable the SMC first */
2786 amdgpu_device_smu_fini_early(adev);
2787
2788 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2789 if (!adev->ip_blocks[i].status.hw)
2790 continue;
2791
2792 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2793 /* XXX handle errors */
2794 if (r) {
2795 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2796 adev->ip_blocks[i].version->funcs->name, r);
2797 }
2798
2799 adev->ip_blocks[i].status.hw = false;
2800 }
2801
2802 if (amdgpu_sriov_vf(adev)) {
2803 if (amdgpu_virt_release_full_gpu(adev, false))
2804 DRM_ERROR("failed to release exclusive mode on fini\n");
2805 }
2806
2807 return 0;
2808 }
2809
2810 /**
2811 * amdgpu_device_ip_fini - run fini for hardware IPs
2812 *
2813 * @adev: amdgpu_device pointer
2814 *
2815 * Main teardown pass for hardware IPs. The list of all the hardware
2816 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2817 * are run. hw_fini tears down the hardware associated with each IP
2818 * and sw_fini tears down any software state associated with each IP.
2819 * Returns 0 on success, negative error code on failure.
2820 */
2821 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2822 {
2823 int i, r;
2824
2825 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2826 amdgpu_virt_release_ras_err_handler_data(adev);
2827
2828 if (adev->gmc.xgmi.num_physical_nodes > 1)
2829 amdgpu_xgmi_remove_device(adev);
2830
2831 amdgpu_amdkfd_device_fini_sw(adev);
2832
2833 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2834 if (!adev->ip_blocks[i].status.sw)
2835 continue;
2836
2837 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2838 amdgpu_ucode_free_bo(adev);
2839 amdgpu_free_static_csa(&adev->virt.csa_obj);
2840 amdgpu_device_wb_fini(adev);
2841 amdgpu_device_mem_scratch_fini(adev);
2842 amdgpu_ib_pool_fini(adev);
2843 }
2844
2845 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2846 /* XXX handle errors */
2847 if (r) {
2848 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2849 adev->ip_blocks[i].version->funcs->name, r);
2850 }
2851 adev->ip_blocks[i].status.sw = false;
2852 adev->ip_blocks[i].status.valid = false;
2853 }
2854
2855 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2856 if (!adev->ip_blocks[i].status.late_initialized)
2857 continue;
2858 if (adev->ip_blocks[i].version->funcs->late_fini)
2859 adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2860 adev->ip_blocks[i].status.late_initialized = false;
2861 }
2862
2863 amdgpu_ras_fini(adev);
2864
2865 return 0;
2866 }
2867
2868 /**
2869 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2870 *
2871 * @work: work_struct.
2872 */
2873 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2874 {
2875 struct amdgpu_device *adev =
2876 container_of(work, struct amdgpu_device, delayed_init_work.work);
2877 int r;
2878
2879 r = amdgpu_ib_ring_tests(adev);
2880 if (r)
2881 DRM_ERROR("ib ring test failed (%d).\n", r);
2882 }
2883
2884 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2885 {
2886 struct amdgpu_device *adev =
2887 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2888
2889 WARN_ON_ONCE(adev->gfx.gfx_off_state);
2890 WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
2891
2892 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2893 adev->gfx.gfx_off_state = true;
2894 }
2895
2896 /**
2897 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2898 *
2899 * @adev: amdgpu_device pointer
2900 *
2901 * Main suspend function for hardware IPs. The list of all the hardware
2902 * IPs that make up the asic is walked, clockgating is disabled and the
2903 * suspend callbacks are run. suspend puts the hardware and software state
2904 * in each IP into a state suitable for suspend.
2905 * Returns 0 on success, negative error code on failure.
2906 */
2907 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2908 {
2909 int i, r;
2910
2911 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2912 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2913
2914 /*
2915 * Per the PMFW team's suggestion, the driver needs to handle disabling
2916 * the gfxoff and df cstate features for gpu reset (e.g. Mode1Reset)
2917 * scenarios. Add the missing df cstate disablement here.
2918 */
2919 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
2920 dev_warn(adev->dev, "Failed to disallow df cstate");
2921
2922 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2923 if (!adev->ip_blocks[i].status.valid)
2924 continue;
2925
2926 /* displays are handled separately */
2927 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2928 continue;
2929
2930 /* XXX handle errors */
2931 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2932 /* XXX handle errors */
2933 if (r) {
2934 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2935 adev->ip_blocks[i].version->funcs->name, r);
2936 return r;
2937 }
2938
2939 adev->ip_blocks[i].status.hw = false;
2940 }
2941
2942 return 0;
2943 }
2944
2945 /**
2946 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2947 *
2948 * @adev: amdgpu_device pointer
2949 *
2950 * Main suspend function for hardware IPs. The list of all the hardware
2951 * IPs that make up the asic is walked, clockgating is disabled and the
2952 * suspend callbacks are run. suspend puts the hardware and software state
2953 * in each IP into a state suitable for suspend.
2954 * Returns 0 on success, negative error code on failure.
2955 */
2956 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2957 {
2958 int i, r;
2959
2960 if (adev->in_s0ix)
2961 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
2962
2963 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2964 if (!adev->ip_blocks[i].status.valid)
2965 continue;
2966 /* displays are handled in phase1 */
2967 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2968 continue;
2969 /* PSP lost connection when err_event_athub occurs */
2970 if (amdgpu_ras_intr_triggered() &&
2971 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2972 adev->ip_blocks[i].status.hw = false;
2973 continue;
2974 }
2975
2976 /* skip unnecessary suspend if we have not initialized them yet */
2977 if (adev->gmc.xgmi.pending_reset &&
2978 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2979 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
2980 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2981 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
2982 adev->ip_blocks[i].status.hw = false;
2983 continue;
2984 }
2985
2986 /* skip suspend of gfx/mes and psp for S0ix
2987 * gfx is in gfxoff state, so on resume it will exit gfxoff just
2988 * like at runtime. PSP is also part of the always on hardware
2989 * so no need to suspend it.
2990 */
2991 if (adev->in_s0ix &&
2992 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
2993 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2994 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
2995 continue;
2996
2997 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
2998 if (adev->in_s0ix &&
2999 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) &&
3000 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3001 continue;
3002
3003 /* swPSP provides the IMU and RLC FW binaries to TOS during cold-boot.
3004 * These are in TMR, hence are expected to be reused by PSP-TOS to reload
3005 * from this location and RLC Autoload automatically also gets loaded
3006 * from here based on PMFW -> PSP message during re-init sequence.
3007 * Therefore, the psp suspend & resume should be skipped to avoid destroying
3008 * the TMR and reloading the FWs again for IMU enabled APU ASICs.
3009 */
3010 if (amdgpu_in_reset(adev) &&
3011 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
3012 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3013 continue;
3014
3015 /* XXX handle errors */
3016 r = adev->ip_blocks[i].version->funcs->suspend(adev);
3017 /* XXX handle errors */
3018 if (r) {
3019 DRM_ERROR("suspend of IP block <%s> failed %d\n",
3020 adev->ip_blocks[i].version->funcs->name, r);
3021 }
3022 adev->ip_blocks[i].status.hw = false;
3023 /* handle putting the SMC in the appropriate state */
3024 if (!amdgpu_sriov_vf(adev)) {
3025 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3026 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3027 if (r) {
3028 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3029 adev->mp1_state, r);
3030 return r;
3031 }
3032 }
3033 }
3034 }
3035
3036 return 0;
3037 }
3038
3039 /**
3040 * amdgpu_device_ip_suspend - run suspend for hardware IPs
3041 *
3042 * @adev: amdgpu_device pointer
3043 *
3044 * Main suspend function for hardware IPs. The list of all the hardware
3045 * IPs that make up the asic is walked, clockgating is disabled and the
3046 * suspend callbacks are run. suspend puts the hardware and software state
3047 * in each IP into a state suitable for suspend.
3048 * Returns 0 on success, negative error code on failure.
3049 */
3050 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3051 {
3052 int r;
3053
3054 if (amdgpu_sriov_vf(adev)) {
3055 amdgpu_virt_fini_data_exchange(adev);
3056 amdgpu_virt_request_full_gpu(adev, false);
3057 }
3058
3059 r = amdgpu_device_ip_suspend_phase1(adev);
3060 if (r)
3061 return r;
3062 r = amdgpu_device_ip_suspend_phase2(adev);
3063
3064 if (amdgpu_sriov_vf(adev))
3065 amdgpu_virt_release_full_gpu(adev, false);
3066
3067 return r;
3068 }
3069
3070 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3071 {
3072 int i, r;
3073
3074 static enum amd_ip_block_type ip_order[] = {
3075 AMD_IP_BLOCK_TYPE_COMMON,
3076 AMD_IP_BLOCK_TYPE_GMC,
3077 AMD_IP_BLOCK_TYPE_PSP,
3078 AMD_IP_BLOCK_TYPE_IH,
3079 };
3080
3081 for (i = 0; i < adev->num_ip_blocks; i++) {
3082 int j;
3083 struct amdgpu_ip_block *block;
3084
3085 block = &adev->ip_blocks[i];
3086 block->status.hw = false;
3087
3088 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3089
3090 if (block->version->type != ip_order[j] ||
3091 !block->status.valid)
3092 continue;
3093
3094 r = block->version->funcs->hw_init(adev);
3095 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3096 if (r)
3097 return r;
3098 block->status.hw = true;
3099 }
3100 }
3101
3102 return 0;
3103 }
3104
3105 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3106 {
3107 int i, r;
3108
3109 static enum amd_ip_block_type ip_order[] = {
3110 AMD_IP_BLOCK_TYPE_SMC,
3111 AMD_IP_BLOCK_TYPE_DCE,
3112 AMD_IP_BLOCK_TYPE_GFX,
3113 AMD_IP_BLOCK_TYPE_SDMA,
3114 AMD_IP_BLOCK_TYPE_MES,
3115 AMD_IP_BLOCK_TYPE_UVD,
3116 AMD_IP_BLOCK_TYPE_VCE,
3117 AMD_IP_BLOCK_TYPE_VCN,
3118 AMD_IP_BLOCK_TYPE_JPEG
3119 };
3120
3121 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3122 int j;
3123 struct amdgpu_ip_block *block;
3124
3125 for (j = 0; j < adev->num_ip_blocks; j++) {
3126 block = &adev->ip_blocks[j];
3127
3128 if (block->version->type != ip_order[i] ||
3129 !block->status.valid ||
3130 block->status.hw)
3131 continue;
3132
3133 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
3134 r = block->version->funcs->resume(adev);
3135 else
3136 r = block->version->funcs->hw_init(adev);
3137
3138 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3139 if (r)
3140 return r;
3141 block->status.hw = true;
3142 }
3143 }
3144
3145 return 0;
3146 }
3147
3148 /**
3149 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3150 *
3151 * @adev: amdgpu_device pointer
3152 *
3153 * First resume function for hardware IPs. The list of all the hardware
3154 * IPs that make up the asic is walked and the resume callbacks are run for
3155 * COMMON, GMC, and IH. resume puts the hardware into a functional state
3156 * after a suspend and updates the software state as necessary. This
3157 * function is also used for restoring the GPU after a GPU reset.
3158 * Returns 0 on success, negative error code on failure.
3159 */
3160 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3161 {
3162 int i, r;
3163
3164 for (i = 0; i < adev->num_ip_blocks; i++) {
3165 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3166 continue;
3167 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3168 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3169 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3170 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3171
3172 r = adev->ip_blocks[i].version->funcs->resume(adev);
3173 if (r) {
3174 DRM_ERROR("resume of IP block <%s> failed %d\n",
3175 adev->ip_blocks[i].version->funcs->name, r);
3176 return r;
3177 }
3178 adev->ip_blocks[i].status.hw = true;
3179 }
3180 }
3181
3182 return 0;
3183 }
3184
3185 /**
3186 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3187 *
3188 * @adev: amdgpu_device pointer
3189 *
3190 * Second resume function for hardware IPs. The list of all the hardware
3191 * IPs that make up the asic is walked and the resume callbacks are run for
3192 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
3193 * functional state after a suspend and updates the software state as
3194 * necessary. This function is also used for restoring the GPU after a GPU
3195 * reset.
3196 * Returns 0 on success, negative error code on failure.
3197 */
3198 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3199 {
3200 int i, r;
3201
3202 for (i = 0; i < adev->num_ip_blocks; i++) {
3203 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3204 continue;
3205 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3206 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3207 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3208 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3209 continue;
3210 r = adev->ip_blocks[i].version->funcs->resume(adev);
3211 if (r) {
3212 DRM_ERROR("resume of IP block <%s> failed %d\n",
3213 adev->ip_blocks[i].version->funcs->name, r);
3214 return r;
3215 }
3216 adev->ip_blocks[i].status.hw = true;
3217 }
3218
3219 return 0;
3220 }
3221
3222 /**
3223 * amdgpu_device_ip_resume - run resume for hardware IPs
3224 *
3225 * @adev: amdgpu_device pointer
3226 *
3227 * Main resume function for hardware IPs. The hardware IPs
3228 * are split into two resume functions because they are
3229 * also used in recovering from a GPU reset and some additional
3230 * steps need to be taken between them. In this case (S3/S4) they are
3231 * run sequentially.
3232 * Returns 0 on success, negative error code on failure.
3233 */
3234 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3235 {
3236 int r;
3237
3238 r = amdgpu_device_ip_resume_phase1(adev);
3239 if (r)
3240 return r;
3241
3242 r = amdgpu_device_fw_loading(adev);
3243 if (r)
3244 return r;
3245
3246 r = amdgpu_device_ip_resume_phase2(adev);
3247
3248 return r;
3249 }
3250
3251 /**
3252 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3253 *
3254 * @adev: amdgpu_device pointer
3255 *
3256 * Query the VBIOS data tables to determine if the board supports SR-IOV.
3257 */
3258 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3259 {
3260 if (amdgpu_sriov_vf(adev)) {
3261 if (adev->is_atom_fw) {
3262 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3263 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3264 } else {
3265 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3266 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3267 }
3268
3269 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3270 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3271 }
3272 }
3273
3274 /**
3275 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3276 *
3277 * @asic_type: AMD asic type
3278 *
3279 * Check if there is DC (new modesetting infrastructure) support for an asic.
3280 * returns true if DC has support, false if not.
3281 */
3282 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3283 {
3284 switch (asic_type) {
3285 #ifdef CONFIG_DRM_AMDGPU_SI
3286 case CHIP_HAINAN:
3287 #endif
3288 case CHIP_TOPAZ:
3289 /* chips with no display hardware */
3290 return false;
3291 #if defined(CONFIG_DRM_AMD_DC)
3292 case CHIP_TAHITI:
3293 case CHIP_PITCAIRN:
3294 case CHIP_VERDE:
3295 case CHIP_OLAND:
3296 /*
3297 * We have systems in the wild with these ASICs that require
3298 * LVDS and VGA support which is not supported with DC.
3299 *
3300 * Fallback to the non-DC driver here by default so as not to
3301 * cause regressions.
3302 */
3303 #if defined(CONFIG_DRM_AMD_DC_SI)
3304 return amdgpu_dc > 0;
3305 #else
3306 return false;
3307 #endif
3308 case CHIP_BONAIRE:
3309 case CHIP_KAVERI:
3310 case CHIP_KABINI:
3311 case CHIP_MULLINS:
3312 /*
3313 * We have systems in the wild with these ASICs that require
3314 * VGA support which is not supported with DC.
3315 *
3316 * Fallback to the non-DC driver here by default so as not to
3317 * cause regressions.
3318 */
3319 return amdgpu_dc > 0;
3320 default:
3321 return amdgpu_dc != 0;
3322 #else
3323 default:
3324 if (amdgpu_dc > 0)
3325 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
3326 return false;
3327 #endif
3328 }
3329 }
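
/*
 * Summary of the amdgpu.dc handling above: on the listed legacy ASICs DC
 * must be explicitly requested (amdgpu_dc > 0); on all other ASICs it is
 * used unless explicitly disabled (amdgpu_dc != 0, the default being auto);
 * and in kernels built without CONFIG_DRM_AMD_DC a request for DC is
 * ignored with a one-time notice.
 */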
3330
3331 /**
3332 * amdgpu_device_has_dc_support - check if dc is supported
3333 *
3334 * @adev: amdgpu_device pointer
3335 *
3336 * Returns true for supported, false for not supported
3337 */
3338 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3339 {
3340 if (adev->enable_virtual_display ||
3341 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3342 return false;
3343
3344 return amdgpu_device_asic_has_dc_support(adev->asic_type);
3345 }
3346
3347 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3348 {
3349 struct amdgpu_device *adev =
3350 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3351 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3352
3353 /* It's a bug to not have a hive within this function */
3354 if (WARN_ON(!hive))
3355 return;
3356
3357 /*
3358 * Use task barrier to synchronize all xgmi reset works across the
3359 * hive. task_barrier_enter and task_barrier_exit will block
3360 * until all the threads running the xgmi reset works reach
3361 * those points. task_barrier_full will do both blocks.
3362 */
3363 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3364
3365 task_barrier_enter(&hive->tb);
3366 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3367
3368 if (adev->asic_reset_res)
3369 goto fail;
3370
3371 task_barrier_exit(&hive->tb);
3372 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3373
3374 if (adev->asic_reset_res)
3375 goto fail;
3376
3377 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops &&
3378 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
3379 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev);
3380 } else {
3381
3382 task_barrier_full(&hive->tb);
3383 adev->asic_reset_res = amdgpu_asic_reset(adev);
3384 }
3385
3386 fail:
3387 if (adev->asic_reset_res)
3388 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3389 adev->asic_reset_res, adev_to_drm(adev)->unique);
3390 amdgpu_put_xgmi_hive(hive);
3391 }
3392
3393 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3394 {
3395 char *input = amdgpu_lockup_timeout;
3396 char *timeout_setting = NULL;
3397 int index = 0;
3398 long timeout;
3399 int ret = 0;
3400
3401 /*
3402 * By default the timeout for non-compute jobs is 10000 ms
3403 * and 60000 ms for compute jobs.
3404 * In SR-IOV or passthrough mode, the timeout for compute
3405 * jobs is 60000 ms by default.
3406 */
3407 adev->gfx_timeout = msecs_to_jiffies(10000);
3408 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3409 if (amdgpu_sriov_vf(adev))
3410 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3411 msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3412 else
3413 adev->compute_timeout = msecs_to_jiffies(60000);
3414
3415 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3416 while ((timeout_setting = strsep(&input, ",")) &&
3417 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3418 ret = kstrtol(timeout_setting, 0, &timeout);
3419 if (ret)
3420 return ret;
3421
3422 if (timeout == 0) {
3423 index++;
3424 continue;
3425 } else if (timeout < 0) {
3426 timeout = MAX_SCHEDULE_TIMEOUT;
3427 dev_warn(adev->dev, "lockup timeout disabled");
3428 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
3429 } else {
3430 timeout = msecs_to_jiffies(timeout);
3431 }
3432
3433 switch (index++) {
3434 case 0:
3435 adev->gfx_timeout = timeout;
3436 break;
3437 case 1:
3438 adev->compute_timeout = timeout;
3439 break;
3440 case 2:
3441 adev->sdma_timeout = timeout;
3442 break;
3443 case 3:
3444 adev->video_timeout = timeout;
3445 break;
3446 default:
3447 break;
3448 }
3449 }
3450 /*
3451 * There is only one value specified and
3452 * it should apply to all non-compute jobs.
3453 */
3454 if (index == 1) {
3455 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3456 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3457 adev->compute_timeout = adev->gfx_timeout;
3458 }
3459 }
3460
3461 return ret;
3462 }
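
/*
 * Format accepted above for amdgpu.lockup_timeout: a comma-separated list of
 * up to four values in milliseconds, applied in the order
 * gfx,compute,sdma,video. A value of 0 keeps the built-in default, a
 * negative value disables the timeout (and taints the kernel), and a single
 * value is applied to all non-compute job types (and also to compute jobs
 * under SR-IOV or passthrough). For example, amdgpu.lockup_timeout=10000,60000
 * would set the gfx timeout to 10 seconds and the compute timeout to 60 seconds.
 */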
3463
3464 /**
3465 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
3466 *
3467 * @adev: amdgpu_device pointer
3468 *
3469 * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in passthrough mode
3470 */
3471 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
3472 {
3473 struct iommu_domain *domain;
3474
3475 domain = iommu_get_domain_for_dev(adev->dev);
3476 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
3477 adev->ram_is_direct_mapped = true;
3478 }
3479
3480 static const struct attribute *amdgpu_dev_attributes[] = {
3481 &dev_attr_pcie_replay_count.attr,
3482 NULL
3483 };
3484
3485 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
3486 {
3487 if (amdgpu_mcbp == 1)
3488 adev->gfx.mcbp = true;
3489 else if (amdgpu_mcbp == 0)
3490 adev->gfx.mcbp = false;
3491
3492 if (amdgpu_sriov_vf(adev))
3493 adev->gfx.mcbp = true;
3494
3495 if (adev->gfx.mcbp)
3496 DRM_INFO("MCBP is enabled\n");
3497 }
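
/*
 * Sketch of the policy implemented above: amdgpu.mcbp=1 forces mid command
 * buffer preemption on, amdgpu.mcbp=0 forces it off, any other value leaves
 * the previously determined setting untouched, and SR-IOV VFs always run
 * with MCBP enabled regardless of the parameter.
 */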
3498
3499 /**
3500 * amdgpu_device_init - initialize the driver
3501 *
3502 * @adev: amdgpu_device pointer
3503 * @flags: driver flags
3504 *
3505 * Initializes the driver info and hw (all asics).
3506 * Returns 0 for success or an error on failure.
3507 * Called at driver startup.
3508 */
3509 int amdgpu_device_init(struct amdgpu_device *adev,
3510 uint32_t flags)
3511 {
3512 struct drm_device *ddev = adev_to_drm(adev);
3513 struct pci_dev *pdev = adev->pdev;
3514 int r, i;
3515 bool px = false;
3516 u32 max_MBps;
3517 int tmp;
3518
3519 adev->shutdown = false;
3520 adev->flags = flags;
3521
3522 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3523 adev->asic_type = amdgpu_force_asic_type;
3524 else
3525 adev->asic_type = flags & AMD_ASIC_MASK;
3526
3527 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3528 if (amdgpu_emu_mode == 1)
3529 adev->usec_timeout *= 10;
3530 adev->gmc.gart_size = 512 * 1024 * 1024;
3531 adev->accel_working = false;
3532 adev->num_rings = 0;
3533 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
3534 adev->mman.buffer_funcs = NULL;
3535 adev->mman.buffer_funcs_ring = NULL;
3536 adev->vm_manager.vm_pte_funcs = NULL;
3537 adev->vm_manager.vm_pte_num_scheds = 0;
3538 adev->gmc.gmc_funcs = NULL;
3539 adev->harvest_ip_mask = 0x0;
3540 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3541 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3542
3543 adev->smc_rreg = &amdgpu_invalid_rreg;
3544 adev->smc_wreg = &amdgpu_invalid_wreg;
3545 adev->pcie_rreg = &amdgpu_invalid_rreg;
3546 adev->pcie_wreg = &amdgpu_invalid_wreg;
3547 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
3548 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
3549 adev->pciep_rreg = &amdgpu_invalid_rreg;
3550 adev->pciep_wreg = &amdgpu_invalid_wreg;
3551 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3552 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3553 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3554 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3555 adev->didt_rreg = &amdgpu_invalid_rreg;
3556 adev->didt_wreg = &amdgpu_invalid_wreg;
3557 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3558 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3559 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3560 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3561
3562 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3563 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3564 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3565
3566 	/* Mutex initialization is all done here so that functions
3567 	 * can be recalled later without locking issues.
3568 	 */
3569 mutex_init(&adev->firmware.mutex);
3570 mutex_init(&adev->pm.mutex);
3571 mutex_init(&adev->gfx.gpu_clock_mutex);
3572 mutex_init(&adev->srbm_mutex);
3573 mutex_init(&adev->gfx.pipe_reserve_mutex);
3574 mutex_init(&adev->gfx.gfx_off_mutex);
3575 mutex_init(&adev->gfx.partition_mutex);
3576 mutex_init(&adev->grbm_idx_mutex);
3577 mutex_init(&adev->mn_lock);
3578 mutex_init(&adev->virt.vf_errors.lock);
3579 mutex_init(&adev->virt.rlcg_reg_lock);
3580 hash_init(adev->mn_hash);
3581 mutex_init(&adev->psp.mutex);
3582 mutex_init(&adev->notifier_lock);
3583 mutex_init(&adev->pm.stable_pstate_ctx_lock);
3584 mutex_init(&adev->benchmark_mutex);
3585
3586 amdgpu_device_init_apu_flags(adev);
3587
3588 r = amdgpu_device_check_arguments(adev);
3589 if (r)
3590 return r;
3591
3592 spin_lock_init(&adev->mmio_idx_lock);
3593 spin_lock_init(&adev->smc_idx_lock);
3594 spin_lock_init(&adev->pcie_idx_lock);
3595 spin_lock_init(&adev->uvd_ctx_idx_lock);
3596 spin_lock_init(&adev->didt_idx_lock);
3597 spin_lock_init(&adev->gc_cac_idx_lock);
3598 spin_lock_init(&adev->se_cac_idx_lock);
3599 spin_lock_init(&adev->audio_endpt_idx_lock);
3600 spin_lock_init(&adev->mm_stats.lock);
3601
3602 INIT_LIST_HEAD(&adev->shadow_list);
3603 mutex_init(&adev->shadow_list_lock);
3604
3605 INIT_LIST_HEAD(&adev->reset_list);
3606
3607 INIT_LIST_HEAD(&adev->ras_list);
3608
3609 INIT_DELAYED_WORK(&adev->delayed_init_work,
3610 amdgpu_device_delayed_init_work_handler);
3611 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3612 amdgpu_device_delay_enable_gfx_off);
3613
3614 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3615
3616 adev->gfx.gfx_off_req_count = 1;
3617 adev->gfx.gfx_off_residency = 0;
3618 adev->gfx.gfx_off_entrycount = 0;
3619 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3620
3621 atomic_set(&adev->throttling_logging_enabled, 1);
3622 /*
3623 * If throttling continues, logging will be performed every minute
3624 * to avoid log flooding. "-1" is subtracted since the thermal
3625 * throttling interrupt comes every second. Thus, the total logging
3626 	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3627 * for throttling interrupt) = 60 seconds.
3628 */
3629 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3630 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3631
3632 /* Registers mapping */
3633 /* TODO: block userspace mapping of io register */
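	/* BONAIRE and newer expose the register aperture in BAR 5; older ASICs use BAR 2 */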
3634 if (adev->asic_type >= CHIP_BONAIRE) {
3635 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3636 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3637 } else {
3638 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3639 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3640 }
3641
3642 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
3643 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
3644
3645 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3646 if (!adev->rmmio)
3647 return -ENOMEM;
3648
3649 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3650 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);
3651
3652 /*
3653 	 * The reset domain needs to be present early, before the XGMI hive is
3654 	 * discovered (if any) and initialized, so that the reset semaphore and
3655 	 * in-GPU-reset flag can be used early during init and before calling RREG32.
3656 */
3657 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
3658 if (!adev->reset_domain)
3659 return -ENOMEM;
3660
3661 /* detect hw virtualization here */
3662 amdgpu_detect_virtualization(adev);
3663
3664 amdgpu_device_get_pcie_info(adev);
3665
3666 r = amdgpu_device_get_job_timeout_settings(adev);
3667 if (r) {
3668 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3669 return r;
3670 }
3671
3672 /* early init functions */
3673 r = amdgpu_device_ip_early_init(adev);
3674 if (r)
3675 return r;
3676
3677 amdgpu_device_set_mcbp(adev);
3678
3679 /* Get rid of things like offb */
3680 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);
3681 if (r)
3682 return r;
3683
3684 /* Enable TMZ based on IP_VERSION */
3685 amdgpu_gmc_tmz_set(adev);
3686
3687 amdgpu_gmc_noretry_set(adev);
3688 	/* Need to get xgmi info early to decide the reset behavior */
3689 if (adev->gmc.xgmi.supported) {
3690 r = adev->gfxhub.funcs->get_xgmi_info(adev);
3691 if (r)
3692 return r;
3693 }
3694
3695 /* enable PCIE atomic ops */
3696 if (amdgpu_sriov_vf(adev)) {
3697 if (adev->virt.fw_reserve.p_pf2vf)
3698 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
3699 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
3700 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3701 		/* APUs with gfx9 and newer don't rely on PCIe atomics; their
3702 		 * internal path natively supports atomics, so set have_atomics_support to true.
3703 		 */
3704 } else if ((adev->flags & AMD_IS_APU) &&
3705 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) {
3706 adev->have_atomics_support = true;
3707 } else {
3708 adev->have_atomics_support =
3709 !pci_enable_atomic_ops_to_root(adev->pdev,
3710 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3711 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3712 }
3713
3714 if (!adev->have_atomics_support)
3715 		dev_info(adev->dev, "PCIe atomic ops are not supported\n");
3716
3717 	/* doorbell bar mapping and doorbell index init */
3718 amdgpu_doorbell_init(adev);
3719
3720 if (amdgpu_emu_mode == 1) {
3721 /* post the asic on emulation mode */
3722 emu_soc_asic_init(adev);
3723 goto fence_driver_init;
3724 }
3725
3726 amdgpu_reset_init(adev);
3727
3728 /* detect if we are with an SRIOV vbios */
3729 if (adev->bios)
3730 amdgpu_device_detect_sriov_bios(adev);
3731
3732 /* check if we need to reset the asic
3733 * E.g., driver was not cleanly unloaded previously, etc.
3734 */
3735 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3736 if (adev->gmc.xgmi.num_physical_nodes) {
3737 dev_info(adev->dev, "Pending hive reset.\n");
3738 adev->gmc.xgmi.pending_reset = true;
3739 /* Only need to init necessary block for SMU to handle the reset */
3740 for (i = 0; i < adev->num_ip_blocks; i++) {
3741 if (!adev->ip_blocks[i].status.valid)
3742 continue;
3743 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3744 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3745 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3746 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
3747 DRM_DEBUG("IP %s disabled for hw_init.\n",
3748 adev->ip_blocks[i].version->funcs->name);
3749 adev->ip_blocks[i].status.hw = true;
3750 }
3751 }
3752 } else {
3753 tmp = amdgpu_reset_method;
3754 /* It should do a default reset when loading or reloading the driver,
3755 * regardless of the module parameter reset_method.
3756 */
3757 amdgpu_reset_method = AMD_RESET_METHOD_NONE;
3758 r = amdgpu_asic_reset(adev);
3759 amdgpu_reset_method = tmp;
3760 if (r) {
3761 dev_err(adev->dev, "asic reset on init failed\n");
3762 goto failed;
3763 }
3764 }
3765 }
3766
3767 /* Post card if necessary */
3768 if (amdgpu_device_need_post(adev)) {
3769 if (!adev->bios) {
3770 dev_err(adev->dev, "no vBIOS found\n");
3771 r = -EINVAL;
3772 goto failed;
3773 }
3774 DRM_INFO("GPU posting now...\n");
3775 r = amdgpu_device_asic_init(adev);
3776 if (r) {
3777 dev_err(adev->dev, "gpu post error!\n");
3778 goto failed;
3779 }
3780 }
3781
3782 if (adev->bios) {
3783 if (adev->is_atom_fw) {
3784 /* Initialize clocks */
3785 r = amdgpu_atomfirmware_get_clock_info(adev);
3786 if (r) {
3787 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3788 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3789 goto failed;
3790 }
3791 } else {
3792 /* Initialize clocks */
3793 r = amdgpu_atombios_get_clock_info(adev);
3794 if (r) {
3795 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3796 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3797 goto failed;
3798 }
3799 /* init i2c buses */
3800 if (!amdgpu_device_has_dc_support(adev))
3801 amdgpu_atombios_i2c_init(adev);
3802 }
3803 }
3804
3805 fence_driver_init:
3806 /* Fence driver */
3807 r = amdgpu_fence_driver_sw_init(adev);
3808 if (r) {
3809 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
3810 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3811 goto failed;
3812 }
3813
3814 /* init the mode config */
3815 drm_mode_config_init(adev_to_drm(adev));
3816
3817 r = amdgpu_device_ip_init(adev);
3818 if (r) {
3819 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3820 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3821 goto release_ras_con;
3822 }
3823
3824 amdgpu_fence_driver_hw_init(adev);
3825
3826 dev_info(adev->dev,
3827 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3828 adev->gfx.config.max_shader_engines,
3829 adev->gfx.config.max_sh_per_se,
3830 adev->gfx.config.max_cu_per_sh,
3831 adev->gfx.cu_info.number);
3832
3833 adev->accel_working = true;
3834
3835 amdgpu_vm_check_compute_bug(adev);
3836
3837 /* Initialize the buffer migration limit. */
3838 if (amdgpu_moverate >= 0)
3839 max_MBps = amdgpu_moverate;
3840 else
3841 max_MBps = 8; /* Allow 8 MB/s. */
3842 /* Get a log2 for easy divisions. */
3843 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3844
3845 r = amdgpu_atombios_sysfs_init(adev);
3846 if (r)
3847 drm_err(&adev->ddev,
3848 "registering atombios sysfs failed (%d).\n", r);
3849
3850 r = amdgpu_pm_sysfs_init(adev);
3851 if (r)
3852 DRM_ERROR("registering pm sysfs failed (%d).\n", r);
3853
3854 r = amdgpu_ucode_sysfs_init(adev);
3855 if (r) {
3856 adev->ucode_sysfs_en = false;
3857 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3858 } else
3859 adev->ucode_sysfs_en = true;
3860
3861 /*
3862 	 * Register the gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3863 	 * Otherwise the mgpu fan boost feature will be skipped because the
3864 	 * gpu instance count would come up short.
3865 */
3866 amdgpu_register_gpu_instance(adev);
3867
3868 /* enable clockgating, etc. after ib tests, etc. since some blocks require
3869 * explicit gating rather than handling it automatically.
3870 */
3871 if (!adev->gmc.xgmi.pending_reset) {
3872 r = amdgpu_device_ip_late_init(adev);
3873 if (r) {
3874 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3875 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3876 goto release_ras_con;
3877 }
3878 /* must succeed. */
3879 amdgpu_ras_resume(adev);
3880 queue_delayed_work(system_wq, &adev->delayed_init_work,
3881 msecs_to_jiffies(AMDGPU_RESUME_MS));
3882 }
3883
3884 if (amdgpu_sriov_vf(adev)) {
3885 amdgpu_virt_release_full_gpu(adev, true);
3886 flush_delayed_work(&adev->delayed_init_work);
3887 }
3888
3889 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3890 if (r)
3891 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3892
3893 amdgpu_fru_sysfs_init(adev);
3894
3895 if (IS_ENABLED(CONFIG_PERF_EVENTS))
3896 r = amdgpu_pmu_init(adev);
3897 if (r)
3898 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3899
3900 	/* Keep the stored PCI config space at hand for restore after a sudden PCI error */
3901 if (amdgpu_device_cache_pci_state(adev->pdev))
3902 pci_restore_state(pdev);
3903
3904 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3905 /* this will fail for cards that aren't VGA class devices, just
3906 * ignore it
3907 */
3908 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3909 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
3910
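	/*
	 * Register with vga_switcheroo when the device supports PX, or when an
	 * Apple gmux mux is present on a non-removable device.
	 */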
3911 px = amdgpu_device_supports_px(ddev);
3912
3913 if (px || (!dev_is_removable(&adev->pdev->dev) &&
3914 apple_gmux_detect(NULL, NULL)))
3915 vga_switcheroo_register_client(adev->pdev,
3916 &amdgpu_switcheroo_ops, px);
3917
3918 if (px)
3919 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3920
3921 if (adev->gmc.xgmi.pending_reset)
3922 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
3923 msecs_to_jiffies(AMDGPU_RESUME_MS));
3924
3925 amdgpu_device_check_iommu_direct_map(adev);
3926
3927 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier;
3928 r = register_pm_notifier(&adev->pm_nb);
3929 if (r)
3930 goto failed;
3931
3932 return 0;
3933
3934 release_ras_con:
3935 if (amdgpu_sriov_vf(adev))
3936 amdgpu_virt_release_full_gpu(adev, true);
3937
3938 /* failed in exclusive mode due to timeout */
3939 if (amdgpu_sriov_vf(adev) &&
3940 !amdgpu_sriov_runtime(adev) &&
3941 amdgpu_virt_mmio_blocked(adev) &&
3942 !amdgpu_virt_wait_reset(adev)) {
3943 dev_err(adev->dev, "VF exclusive mode timeout\n");
3944 /* Don't send request since VF is inactive. */
3945 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3946 adev->virt.ops = NULL;
3947 r = -EAGAIN;
3948 }
3949 amdgpu_release_ras_context(adev);
3950
3951 failed:
3952 amdgpu_vf_error_trans_all(adev);
3953
3954 return r;
3955 }
3956
3957 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
3958 {
3959
3960 /* Clear all CPU mappings pointing to this device */
3961 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
3962
3963 /* Unmap all mapped bars - Doorbell, registers and VRAM */
3964 amdgpu_doorbell_fini(adev);
3965
3966 iounmap(adev->rmmio);
3967 adev->rmmio = NULL;
3968 if (adev->mman.aper_base_kaddr)
3969 iounmap(adev->mman.aper_base_kaddr);
3970 adev->mman.aper_base_kaddr = NULL;
3971
3972 /* Memory manager related */
3973 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
3974 arch_phys_wc_del(adev->gmc.vram_mtrr);
3975 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
3976 }
3977 }
3978
3979 /**
3980 * amdgpu_device_fini_hw - tear down the driver
3981 *
3982 * @adev: amdgpu_device pointer
3983 *
3984 * Tear down the driver info (all asics).
3985 * Called at driver shutdown.
3986 */
3987 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
3988 {
3989 dev_info(adev->dev, "amdgpu: finishing device.\n");
3990 flush_delayed_work(&adev->delayed_init_work);
3991 adev->shutdown = true;
3992
3993 unregister_pm_notifier(&adev->pm_nb);
3994
3995 	/* make sure IB tests are finished before entering exclusive mode
3996 	 * to avoid preemption during an IB test
3997 */
3998 if (amdgpu_sriov_vf(adev)) {
3999 amdgpu_virt_request_full_gpu(adev, false);
4000 amdgpu_virt_fini_data_exchange(adev);
4001 }
4002
4003 /* disable all interrupts */
4004 amdgpu_irq_disable_all(adev);
4005 if (adev->mode_info.mode_config_initialized) {
4006 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
4007 drm_helper_force_disable_all(adev_to_drm(adev));
4008 else
4009 drm_atomic_helper_shutdown(adev_to_drm(adev));
4010 }
4011 amdgpu_fence_driver_hw_fini(adev);
4012
4013 if (adev->mman.initialized)
4014 drain_workqueue(adev->mman.bdev.wq);
4015
4016 if (adev->pm.sysfs_initialized)
4017 amdgpu_pm_sysfs_fini(adev);
4018 if (adev->ucode_sysfs_en)
4019 amdgpu_ucode_sysfs_fini(adev);
4020 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
4021 amdgpu_fru_sysfs_fini(adev);
4022
4023 	/* RAS features must be disabled before hw fini */
4024 amdgpu_ras_pre_fini(adev);
4025
4026 amdgpu_device_ip_fini_early(adev);
4027
4028 amdgpu_irq_fini_hw(adev);
4029
4030 if (adev->mman.initialized)
4031 ttm_device_clear_dma_mappings(&adev->mman.bdev);
4032
4033 amdgpu_gart_dummy_page_fini(adev);
4034
4035 if (drm_dev_is_unplugged(adev_to_drm(adev)))
4036 amdgpu_device_unmap_mmio(adev);
4037
4038 }
4039
4040 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4041 {
4042 int idx;
4043 bool px;
4044
4045 amdgpu_device_ip_fini(adev);
4046 amdgpu_fence_driver_sw_fini(adev);
4047 amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
4048 adev->accel_working = false;
4049 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
4050
4051 amdgpu_reset_fini(adev);
4052
4053 /* free i2c buses */
4054 if (!amdgpu_device_has_dc_support(adev))
4055 amdgpu_i2c_fini(adev);
4056
4057 if (amdgpu_emu_mode != 1)
4058 amdgpu_atombios_fini(adev);
4059
4060 kfree(adev->bios);
4061 adev->bios = NULL;
4062
4063 px = amdgpu_device_supports_px(adev_to_drm(adev));
4064
4065 if (px || (!dev_is_removable(&adev->pdev->dev) &&
4066 apple_gmux_detect(NULL, NULL)))
4067 vga_switcheroo_unregister_client(adev->pdev);
4068
4069 if (px)
4070 vga_switcheroo_fini_domain_pm_ops(adev->dev);
4071
4072 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4073 vga_client_unregister(adev->pdev);
4074
4075 if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4076
4077 iounmap(adev->rmmio);
4078 adev->rmmio = NULL;
4079 amdgpu_doorbell_fini(adev);
4080 drm_dev_exit(idx);
4081 }
4082
4083 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4084 amdgpu_pmu_fini(adev);
4085 if (adev->mman.discovery_bin)
4086 amdgpu_discovery_fini(adev);
4087
4088 amdgpu_reset_put_reset_domain(adev->reset_domain);
4089 adev->reset_domain = NULL;
4090
4091 kfree(adev->pci_state);
4092
4093 }
4094
4095 /**
4096 * amdgpu_device_evict_resources - evict device resources
4097 * @adev: amdgpu device object
4098 *
4099  * Evicts all ttm device resources (vram BOs, gart table) from the lru list
4100 * of the vram memory type. Mainly used for evicting device resources
4101 * at suspend time.
4102 *
4103 */
4104 static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
4105 {
4106 int ret;
4107
4108 /* No need to evict vram on APUs for suspend to ram or s2idle */
4109 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
4110 return 0;
4111
4112 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
4113 if (ret)
4114 DRM_WARN("evicting device resources failed\n");
4115 return ret;
4116 }
4117
4118 /*
4119 * Suspend & resume.
4120 */
4121 /**
4122 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events
4123 * @nb: notifier block
4124 * @mode: suspend mode
4125 * @data: data
4126 *
4127 * This function is called when the system is about to suspend or hibernate.
4128 * It is used to set the appropriate flags so that eviction can be optimized
4129 * in the pm prepare callback.
4130 */
4131 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
4132 void *data)
4133 {
4134 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb);
4135
4136 switch (mode) {
4137 case PM_HIBERNATION_PREPARE:
4138 adev->in_s4 = true;
4139 break;
4140 case PM_POST_HIBERNATION:
4141 adev->in_s4 = false;
4142 break;
4143 }
4144
4145 return NOTIFY_DONE;
4146 }
4147
4148 /**
4149 * amdgpu_device_prepare - prepare for device suspend
4150 *
4151 * @dev: drm dev pointer
4152 *
4153 * Prepare to put the hw in the suspend state (all asics).
4154 * Returns 0 for success or an error on failure.
4155 * Called at driver suspend.
4156 */
4157 int amdgpu_device_prepare(struct drm_device *dev)
4158 {
4159 struct amdgpu_device *adev = drm_to_adev(dev);
4160 int i, r;
4161
4162 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4163 return 0;
4164
4165 /* Evict the majority of BOs before starting suspend sequence */
4166 r = amdgpu_device_evict_resources(adev);
4167 if (r)
4168 return r;
4169
4170 flush_delayed_work(&adev->gfx.gfx_off_delay_work);
4171
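	/* Give each valid IP block a chance to run its optional prepare_suspend callback */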
4172 for (i = 0; i < adev->num_ip_blocks; i++) {
4173 if (!adev->ip_blocks[i].status.valid)
4174 continue;
4175 if (!adev->ip_blocks[i].version->funcs->prepare_suspend)
4176 continue;
4177 r = adev->ip_blocks[i].version->funcs->prepare_suspend((void *)adev);
4178 if (r)
4179 return r;
4180 }
4181
4182 return 0;
4183 }
4184
4185 /**
4186 * amdgpu_device_suspend - initiate device suspend
4187 *
4188 * @dev: drm dev pointer
4189  * @fbcon: notify the fbdev of suspend
4190 *
4191 * Puts the hw in the suspend state (all asics).
4192 * Returns 0 for success or an error on failure.
4193 * Called at driver suspend.
4194 */
4195 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
4196 {
4197 struct amdgpu_device *adev = drm_to_adev(dev);
4198 int r = 0;
4199
4200 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4201 return 0;
4202
4203 adev->in_suspend = true;
4204
4205 if (amdgpu_sriov_vf(adev)) {
4206 amdgpu_virt_fini_data_exchange(adev);
4207 r = amdgpu_virt_request_full_gpu(adev, false);
4208 if (r)
4209 return r;
4210 }
4211
4212 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4213 DRM_WARN("smart shift update failed\n");
4214
4215 if (fbcon)
4216 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
4217
4218 cancel_delayed_work_sync(&adev->delayed_init_work);
4219
4220 amdgpu_ras_suspend(adev);
4221
4222 amdgpu_device_ip_suspend_phase1(adev);
4223
4224 if (!adev->in_s0ix)
4225 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
4226
4227 r = amdgpu_device_evict_resources(adev);
4228 if (r)
4229 return r;
4230
4231 amdgpu_fence_driver_hw_fini(adev);
4232
4233 amdgpu_device_ip_suspend_phase2(adev);
4234
4235 if (amdgpu_sriov_vf(adev))
4236 amdgpu_virt_release_full_gpu(adev, false);
4237
4238 return 0;
4239 }
4240
4241 /**
4242 * amdgpu_device_resume - initiate device resume
4243 *
4244 * @dev: drm dev pointer
4245  * @fbcon: notify the fbdev of resume
4246 *
4247 * Bring the hw back to operating state (all asics).
4248 * Returns 0 for success or an error on failure.
4249 * Called at driver resume.
4250 */
4251 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
4252 {
4253 struct amdgpu_device *adev = drm_to_adev(dev);
4254 int r = 0;
4255
4256 if (amdgpu_sriov_vf(adev)) {
4257 r = amdgpu_virt_request_full_gpu(adev, true);
4258 if (r)
4259 return r;
4260 }
4261
4262 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4263 return 0;
4264
4265 if (adev->in_s0ix)
4266 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
4267
4268 /* post card */
4269 if (amdgpu_device_need_post(adev)) {
4270 r = amdgpu_device_asic_init(adev);
4271 if (r)
4272 dev_err(adev->dev, "amdgpu asic init failed\n");
4273 }
4274
4275 r = amdgpu_device_ip_resume(adev);
4276
4277 if (r) {
4278 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4279 goto exit;
4280 }
4281 amdgpu_fence_driver_hw_init(adev);
4282
4283 r = amdgpu_device_ip_late_init(adev);
4284 if (r)
4285 goto exit;
4286
4287 queue_delayed_work(system_wq, &adev->delayed_init_work,
4288 msecs_to_jiffies(AMDGPU_RESUME_MS));
4289
4290 if (!adev->in_s0ix) {
4291 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4292 if (r)
4293 goto exit;
4294 }
4295
4296 exit:
4297 if (amdgpu_sriov_vf(adev)) {
4298 amdgpu_virt_init_data_exchange(adev);
4299 amdgpu_virt_release_full_gpu(adev, true);
4300 }
4301
4302 if (r)
4303 return r;
4304
4305 /* Make sure IB tests flushed */
4306 flush_delayed_work(&adev->delayed_init_work);
4307
4308 if (fbcon)
4309 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
4310
4311 amdgpu_ras_resume(adev);
4312
4313 if (adev->mode_info.num_crtc) {
4314 /*
4315 * Most of the connector probing functions try to acquire runtime pm
4316 * refs to ensure that the GPU is powered on when connector polling is
4317 * performed. Since we're calling this from a runtime PM callback,
4318 * trying to acquire rpm refs will cause us to deadlock.
4319 *
4320 * Since we're guaranteed to be holding the rpm lock, it's safe to
4321 * temporarily disable the rpm helpers so this doesn't deadlock us.
4322 */
4323 #ifdef CONFIG_PM
4324 dev->dev->power.disable_depth++;
4325 #endif
4326 if (!adev->dc_enabled)
4327 drm_helper_hpd_irq_event(dev);
4328 else
4329 drm_kms_helper_hotplug_event(dev);
4330 #ifdef CONFIG_PM
4331 dev->dev->power.disable_depth--;
4332 #endif
4333 }
4334 adev->in_suspend = false;
4335
4336 if (adev->enable_mes)
4337 amdgpu_mes_self_test(adev);
4338
4339 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
4340 DRM_WARN("smart shift update failed\n");
4341
4342 return 0;
4343 }
4344
4345 /**
4346 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
4347 *
4348 * @adev: amdgpu_device pointer
4349 *
4350 * The list of all the hardware IPs that make up the asic is walked and
4351 * the check_soft_reset callbacks are run. check_soft_reset determines
4352 * if the asic is still hung or not.
4353 * Returns true if any of the IPs are still in a hung state, false if not.
4354 */
4355 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
4356 {
4357 int i;
4358 bool asic_hang = false;
4359
4360 if (amdgpu_sriov_vf(adev))
4361 return true;
4362
4363 if (amdgpu_asic_need_full_reset(adev))
4364 return true;
4365
4366 for (i = 0; i < adev->num_ip_blocks; i++) {
4367 if (!adev->ip_blocks[i].status.valid)
4368 continue;
4369 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
4370 adev->ip_blocks[i].status.hang =
4371 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
4372 if (adev->ip_blocks[i].status.hang) {
4373 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
4374 asic_hang = true;
4375 }
4376 }
4377 return asic_hang;
4378 }
4379
4380 /**
4381 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
4382 *
4383 * @adev: amdgpu_device pointer
4384 *
4385 * The list of all the hardware IPs that make up the asic is walked and the
4386 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
4387 * handles any IP specific hardware or software state changes that are
4388 * necessary for a soft reset to succeed.
4389 * Returns 0 on success, negative error code on failure.
4390 */
4391 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
4392 {
4393 int i, r = 0;
4394
4395 for (i = 0; i < adev->num_ip_blocks; i++) {
4396 if (!adev->ip_blocks[i].status.valid)
4397 continue;
4398 if (adev->ip_blocks[i].status.hang &&
4399 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
4400 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
4401 if (r)
4402 return r;
4403 }
4404 }
4405
4406 return 0;
4407 }
4408
4409 /**
4410 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
4411 *
4412 * @adev: amdgpu_device pointer
4413 *
4414 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
4415 * reset is necessary to recover.
4416 * Returns true if a full asic reset is required, false if not.
4417 */
4418 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
4419 {
4420 int i;
4421
4422 if (amdgpu_asic_need_full_reset(adev))
4423 return true;
4424
4425 for (i = 0; i < adev->num_ip_blocks; i++) {
4426 if (!adev->ip_blocks[i].status.valid)
4427 continue;
4428 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
4429 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
4430 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
4431 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
4432 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
4433 if (adev->ip_blocks[i].status.hang) {
4434 dev_info(adev->dev, "Some block need full reset!\n");
4435 return true;
4436 }
4437 }
4438 }
4439 return false;
4440 }
4441
4442 /**
4443 * amdgpu_device_ip_soft_reset - do a soft reset
4444 *
4445 * @adev: amdgpu_device pointer
4446 *
4447 * The list of all the hardware IPs that make up the asic is walked and the
4448 * soft_reset callbacks are run if the block is hung. soft_reset handles any
4449 * IP specific hardware or software state changes that are necessary to soft
4450 * reset the IP.
4451 * Returns 0 on success, negative error code on failure.
4452 */
4453 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
4454 {
4455 int i, r = 0;
4456
4457 for (i = 0; i < adev->num_ip_blocks; i++) {
4458 if (!adev->ip_blocks[i].status.valid)
4459 continue;
4460 if (adev->ip_blocks[i].status.hang &&
4461 adev->ip_blocks[i].version->funcs->soft_reset) {
4462 r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
4463 if (r)
4464 return r;
4465 }
4466 }
4467
4468 return 0;
4469 }
4470
4471 /**
4472 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
4473 *
4474 * @adev: amdgpu_device pointer
4475 *
4476 * The list of all the hardware IPs that make up the asic is walked and the
4477 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
4478 * handles any IP specific hardware or software state changes that are
4479 * necessary after the IP has been soft reset.
4480 * Returns 0 on success, negative error code on failure.
4481 */
4482 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
4483 {
4484 int i, r = 0;
4485
4486 for (i = 0; i < adev->num_ip_blocks; i++) {
4487 if (!adev->ip_blocks[i].status.valid)
4488 continue;
4489 if (adev->ip_blocks[i].status.hang &&
4490 adev->ip_blocks[i].version->funcs->post_soft_reset)
4491 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
4492 if (r)
4493 return r;
4494 }
4495
4496 return 0;
4497 }
4498
4499 /**
4500 * amdgpu_device_recover_vram - Recover some VRAM contents
4501 *
4502 * @adev: amdgpu_device pointer
4503 *
4504 * Restores the contents of VRAM buffers from the shadows in GTT. Used to
4505 * restore things like GPUVM page tables after a GPU reset where
4506 * the contents of VRAM might be lost.
4507 *
4508 * Returns:
4509 * 0 on success, negative error code on failure.
4510 */
4511 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4512 {
4513 struct dma_fence *fence = NULL, *next = NULL;
4514 struct amdgpu_bo *shadow;
4515 struct amdgpu_bo_vm *vmbo;
4516 long r = 1, tmo;
4517
4518 if (amdgpu_sriov_runtime(adev))
4519 tmo = msecs_to_jiffies(8000);
4520 else
4521 tmo = msecs_to_jiffies(100);
4522
4523 dev_info(adev->dev, "recover vram bo from shadow start\n");
4524 mutex_lock(&adev->shadow_list_lock);
4525 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
4526 /* If vm is compute context or adev is APU, shadow will be NULL */
4527 if (!vmbo->shadow)
4528 continue;
4529 shadow = vmbo->shadow;
4530
4531 /* No need to recover an evicted BO */
4532 if (!shadow->tbo.resource ||
4533 shadow->tbo.resource->mem_type != TTM_PL_TT ||
4534 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
4535 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
4536 continue;
4537
4538 r = amdgpu_bo_restore_shadow(shadow, &next);
4539 if (r)
4540 break;
4541
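		/*
		 * Pipeline the restores: wait for the previously issued copy
		 * while the next one is already queued, and keep the last
		 * fence so it can be waited on after the loop.
		 */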
4542 if (fence) {
4543 tmo = dma_fence_wait_timeout(fence, false, tmo);
4544 dma_fence_put(fence);
4545 fence = next;
4546 if (tmo == 0) {
4547 r = -ETIMEDOUT;
4548 break;
4549 } else if (tmo < 0) {
4550 r = tmo;
4551 break;
4552 }
4553 } else {
4554 fence = next;
4555 }
4556 }
4557 mutex_unlock(&adev->shadow_list_lock);
4558
4559 if (fence)
4560 tmo = dma_fence_wait_timeout(fence, false, tmo);
4561 dma_fence_put(fence);
4562
4563 if (r < 0 || tmo <= 0) {
4564 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4565 return -EIO;
4566 }
4567
4568 dev_info(adev->dev, "recover vram bo from shadow done\n");
4569 return 0;
4570 }
4571
4572
4573 /**
4574 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4575 *
4576 * @adev: amdgpu_device pointer
4577 * @from_hypervisor: request from hypervisor
4578 *
4579  * Do a VF FLR and reinitialize the ASIC.
4580  * Returns 0 if it succeeded, otherwise an error code.
4581 */
4582 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4583 bool from_hypervisor)
4584 {
4585 int r;
4586 struct amdgpu_hive_info *hive = NULL;
4587 int retry_limit = 0;
4588
4589 retry:
4590 amdgpu_amdkfd_pre_reset(adev);
4591
4592 amdgpu_device_stop_pending_resets(adev);
4593
4594 if (from_hypervisor)
4595 r = amdgpu_virt_request_full_gpu(adev, true);
4596 else
4597 r = amdgpu_virt_reset_gpu(adev);
4598 if (r)
4599 return r;
4600 amdgpu_irq_gpu_reset_resume_helper(adev);
4601
4602 	/* some SW cleanup that the VF needs to do before recovery */
4603 amdgpu_virt_post_reset(adev);
4604
4605 /* Resume IP prior to SMC */
4606 r = amdgpu_device_ip_reinit_early_sriov(adev);
4607 if (r)
4608 goto error;
4609
4610 amdgpu_virt_init_data_exchange(adev);
4611
4612 r = amdgpu_device_fw_loading(adev);
4613 if (r)
4614 return r;
4615
4616 /* now we are okay to resume SMC/CP/SDMA */
4617 r = amdgpu_device_ip_reinit_late_sriov(adev);
4618 if (r)
4619 goto error;
4620
4621 hive = amdgpu_get_xgmi_hive(adev);
4622 /* Update PSP FW topology after reset */
4623 if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
4624 r = amdgpu_xgmi_update_topology(hive, adev);
4625
4626 if (hive)
4627 amdgpu_put_xgmi_hive(hive);
4628
4629 if (!r) {
4630 r = amdgpu_ib_ring_tests(adev);
4631
4632 amdgpu_amdkfd_post_reset(adev);
4633 }
4634
4635 error:
4636 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4637 amdgpu_inc_vram_lost(adev);
4638 r = amdgpu_device_recover_vram(adev);
4639 }
4640 amdgpu_virt_release_full_gpu(adev, true);
4641
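	/* Retry the whole VF FLR sequence for retry-able errors, up to AMDGPU_MAX_RETRY_LIMIT attempts */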
4642 if (AMDGPU_RETRY_SRIOV_RESET(r)) {
4643 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) {
4644 retry_limit++;
4645 goto retry;
4646 } else
4647 DRM_ERROR("GPU reset retry is beyond the retry limit\n");
4648 }
4649
4650 return r;
4651 }
4652
4653 /**
4654  * amdgpu_device_has_job_running - check if there is any job in the pending list
4655 *
4656 * @adev: amdgpu_device pointer
4657 *
4658  * Check if there is any job in the pending list of any ring.
4659 */
4660 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4661 {
4662 int i;
4663 struct drm_sched_job *job;
4664
4665 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4666 struct amdgpu_ring *ring = adev->rings[i];
4667
4668 if (!ring || !ring->sched.thread)
4669 continue;
4670
4671 spin_lock(&ring->sched.job_list_lock);
4672 job = list_first_entry_or_null(&ring->sched.pending_list,
4673 struct drm_sched_job, list);
4674 spin_unlock(&ring->sched.job_list_lock);
4675 if (job)
4676 return true;
4677 }
4678 return false;
4679 }
4680
4681 /**
4682 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4683 *
4684 * @adev: amdgpu_device pointer
4685 *
4686 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4687 * a hung GPU.
4688 */
4689 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4690 {
4691
4692 if (amdgpu_gpu_recovery == 0)
4693 goto disabled;
4694
4695 /* Skip soft reset check in fatal error mode */
4696 if (!amdgpu_ras_is_poison_mode_supported(adev))
4697 return true;
4698
4699 if (amdgpu_sriov_vf(adev))
4700 return true;
4701
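	/* In auto mode (amdgpu_gpu_recovery == -1), keep recovery disabled on the older ASICs listed below */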
4702 if (amdgpu_gpu_recovery == -1) {
4703 switch (adev->asic_type) {
4704 #ifdef CONFIG_DRM_AMDGPU_SI
4705 case CHIP_VERDE:
4706 case CHIP_TAHITI:
4707 case CHIP_PITCAIRN:
4708 case CHIP_OLAND:
4709 case CHIP_HAINAN:
4710 #endif
4711 #ifdef CONFIG_DRM_AMDGPU_CIK
4712 case CHIP_KAVERI:
4713 case CHIP_KABINI:
4714 case CHIP_MULLINS:
4715 #endif
4716 case CHIP_CARRIZO:
4717 case CHIP_STONEY:
4718 case CHIP_CYAN_SKILLFISH:
4719 goto disabled;
4720 default:
4721 break;
4722 }
4723 }
4724
4725 return true;
4726
4727 disabled:
4728 dev_info(adev->dev, "GPU recovery disabled.\n");
4729 return false;
4730 }
4731
4732 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4733 {
4734 u32 i;
4735 int ret = 0;
4736
4737 amdgpu_atombios_scratch_regs_engine_hung(adev, true);
4738
4739 dev_info(adev->dev, "GPU mode1 reset\n");
4740
4741 /* Cache the state before bus master disable. The saved config space
4742 * values are used in other cases like restore after mode-2 reset.
4743 */
4744 amdgpu_device_cache_pci_state(adev->pdev);
4745
4746 /* disable BM */
4747 pci_clear_master(adev->pdev);
4748
4749 if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
4750 dev_info(adev->dev, "GPU smu mode1 reset\n");
4751 ret = amdgpu_dpm_mode1_reset(adev);
4752 } else {
4753 dev_info(adev->dev, "GPU psp mode1 reset\n");
4754 ret = psp_gpu_reset(adev);
4755 }
4756
4757 if (ret)
4758 goto mode1_reset_failed;
4759
4760 amdgpu_device_load_pci_state(adev->pdev);
4761 ret = amdgpu_psp_wait_for_bootloader(adev);
4762 if (ret)
4763 goto mode1_reset_failed;
4764
4765 /* wait for asic to come out of reset */
4766 for (i = 0; i < adev->usec_timeout; i++) {
4767 u32 memsize = adev->nbio.funcs->get_memsize(adev);
4768
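		/* the memsize register reads back as all 1s until the ASIC is accessible again */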
4769 if (memsize != 0xffffffff)
4770 break;
4771 udelay(1);
4772 }
4773
4774 if (i >= adev->usec_timeout) {
4775 ret = -ETIMEDOUT;
4776 goto mode1_reset_failed;
4777 }
4778
4779 amdgpu_atombios_scratch_regs_engine_hung(adev, false);
4780
4781 return 0;
4782
4783 mode1_reset_failed:
4784 dev_err(adev->dev, "GPU mode1 reset failed\n");
4785 return ret;
4786 }
4787
4788 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4789 struct amdgpu_reset_context *reset_context)
4790 {
4791 int i, r = 0;
4792 struct amdgpu_job *job = NULL;
4793 bool need_full_reset =
4794 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4795
4796 if (reset_context->reset_req_dev == adev)
4797 job = reset_context->job;
4798
4799 if (amdgpu_sriov_vf(adev)) {
4800 /* stop the data exchange thread */
4801 amdgpu_virt_fini_data_exchange(adev);
4802 }
4803
4804 amdgpu_fence_driver_isr_toggle(adev, true);
4805
4806 /* block all schedulers and reset given job's ring */
4807 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4808 struct amdgpu_ring *ring = adev->rings[i];
4809
4810 if (!ring || !ring->sched.thread)
4811 continue;
4812
4813 		/* Clear job fences from the fence driver to avoid force_completion
4814 		 * leaving NULL and vm flush fences in the fence driver
4815 */
4816 amdgpu_fence_driver_clear_job_fences(ring);
4817
4818 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4819 amdgpu_fence_driver_force_completion(ring);
4820 }
4821
4822 amdgpu_fence_driver_isr_toggle(adev, false);
4823
4824 if (job && job->vm)
4825 drm_sched_increase_karma(&job->base);
4826
4827 r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
4828 /* If reset handler not implemented, continue; otherwise return */
4829 if (r == -EOPNOTSUPP)
4830 r = 0;
4831 else
4832 return r;
4833
4834 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4835 if (!amdgpu_sriov_vf(adev)) {
4836
4837 if (!need_full_reset)
4838 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4839
4840 if (!need_full_reset && amdgpu_gpu_recovery &&
4841 amdgpu_device_ip_check_soft_reset(adev)) {
4842 amdgpu_device_ip_pre_soft_reset(adev);
4843 r = amdgpu_device_ip_soft_reset(adev);
4844 amdgpu_device_ip_post_soft_reset(adev);
4845 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4846 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4847 need_full_reset = true;
4848 }
4849 }
4850
4851 if (need_full_reset)
4852 r = amdgpu_device_ip_suspend(adev);
4853 if (need_full_reset)
4854 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4855 else
4856 clear_bit(AMDGPU_NEED_FULL_RESET,
4857 &reset_context->flags);
4858 }
4859
4860 return r;
4861 }
4862
4863 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
4864 {
4865 int i;
4866
4867 lockdep_assert_held(&adev->reset_domain->sem);
4868
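	/* Snapshot the configured reset dump registers so they can be reported in the devcoredump */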
4869 for (i = 0; i < adev->num_regs; i++) {
4870 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
4871 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i],
4872 adev->reset_dump_reg_value[i]);
4873 }
4874
4875 return 0;
4876 }
4877
4878 #ifdef CONFIG_DEV_COREDUMP
4879 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
4880 size_t count, void *data, size_t datalen)
4881 {
4882 struct drm_printer p;
4883 struct amdgpu_device *adev = data;
4884 struct drm_print_iterator iter;
4885 int i;
4886
4887 iter.data = buffer;
4888 iter.offset = 0;
4889 iter.start = offset;
4890 iter.remain = count;
4891
4892 p = drm_coredump_printer(&iter);
4893
4894 drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
4895 drm_printf(&p, "kernel: " UTS_RELEASE "\n");
4896 drm_printf(&p, "module: " KBUILD_MODNAME "\n");
4897 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec);
4898 if (adev->reset_task_info.pid)
4899 drm_printf(&p, "process_name: %s PID: %d\n",
4900 adev->reset_task_info.process_name,
4901 adev->reset_task_info.pid);
4902
4903 if (adev->reset_vram_lost)
4904 drm_printf(&p, "VRAM is lost due to GPU reset!\n");
4905 if (adev->num_regs) {
4906 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n");
4907
4908 for (i = 0; i < adev->num_regs; i++)
4909 drm_printf(&p, "0x%08x: 0x%08x\n",
4910 adev->reset_dump_reg_list[i],
4911 adev->reset_dump_reg_value[i]);
4912 }
4913
4914 return count - iter.remain;
4915 }
4916
4917 static void amdgpu_devcoredump_free(void *data)
4918 {
4919 }
4920
4921 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
4922 {
4923 struct drm_device *dev = adev_to_drm(adev);
4924
4925 ktime_get_ts64(&adev->reset_time);
4926 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_NOWAIT,
4927 amdgpu_devcoredump_read, amdgpu_devcoredump_free);
4928 }
4929 #endif
4930
4931 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
4932 struct amdgpu_reset_context *reset_context)
4933 {
4934 struct amdgpu_device *tmp_adev = NULL;
4935 bool need_full_reset, skip_hw_reset, vram_lost = false;
4936 int r = 0;
4937 bool gpu_reset_for_dev_remove = 0;
4938
4939 /* Try reset handler method first */
4940 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
4941 reset_list);
4942 amdgpu_reset_reg_dumps(tmp_adev);
4943
4944 reset_context->reset_device_list = device_list_handle;
4945 r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
4946 /* If reset handler not implemented, continue; otherwise return */
4947 if (r == -EOPNOTSUPP)
4948 r = 0;
4949 else
4950 return r;
4951
4952 /* Reset handler not implemented, use the default method */
4953 need_full_reset =
4954 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4955 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
4956
4957 gpu_reset_for_dev_remove =
4958 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
4959 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4960
4961 /*
4962 * ASIC reset has to be done on all XGMI hive nodes ASAP
4963 	 * to allow proper link negotiation in FW (within 1 sec)
4964 */
4965 if (!skip_hw_reset && need_full_reset) {
4966 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4967 /* For XGMI run all resets in parallel to speed up the process */
4968 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4969 tmp_adev->gmc.xgmi.pending_reset = false;
4970 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4971 r = -EALREADY;
4972 } else
4973 r = amdgpu_asic_reset(tmp_adev);
4974
4975 if (r) {
4976 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4977 r, adev_to_drm(tmp_adev)->unique);
4978 break;
4979 }
4980 }
4981
4982 /* For XGMI wait for all resets to complete before proceed */
4983 if (!r) {
4984 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4985 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4986 flush_work(&tmp_adev->xgmi_reset_work);
4987 r = tmp_adev->asic_reset_res;
4988 if (r)
4989 break;
4990 }
4991 }
4992 }
4993 }
4994
4995 if (!r && amdgpu_ras_intr_triggered()) {
4996 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4997 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops &&
4998 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
4999 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev);
5000 }
5001
5002 amdgpu_ras_intr_cleared();
5003 }
5004
5005 /* Since the mode1 reset affects base ip blocks, the
5006 * phase1 ip blocks need to be resumed. Otherwise there
5007 * will be a BIOS signature error and the psp bootloader
5008 * can't load kdb on the next amdgpu install.
5009 */
5010 if (gpu_reset_for_dev_remove) {
5011 list_for_each_entry(tmp_adev, device_list_handle, reset_list)
5012 amdgpu_device_ip_resume_phase1(tmp_adev);
5013
5014 goto end;
5015 }
5016
5017 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5018 if (need_full_reset) {
5019 /* post card */
5020 r = amdgpu_device_asic_init(tmp_adev);
5021 if (r) {
5022 dev_warn(tmp_adev->dev, "asic atom init failed!");
5023 } else {
5024 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
5025
5026 r = amdgpu_device_ip_resume_phase1(tmp_adev);
5027 if (r)
5028 goto out;
5029
5030 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
5031 #ifdef CONFIG_DEV_COREDUMP
5032 tmp_adev->reset_vram_lost = vram_lost;
5033 memset(&tmp_adev->reset_task_info, 0,
5034 sizeof(tmp_adev->reset_task_info));
5035 if (reset_context->job && reset_context->job->vm)
5036 tmp_adev->reset_task_info =
5037 reset_context->job->vm->task_info;
5038 amdgpu_reset_capture_coredumpm(tmp_adev);
5039 #endif
5040 if (vram_lost) {
5041 DRM_INFO("VRAM is lost due to GPU reset!\n");
5042 amdgpu_inc_vram_lost(tmp_adev);
5043 }
5044
5045 r = amdgpu_device_fw_loading(tmp_adev);
5046 if (r)
5047 return r;
5048
5049 r = amdgpu_device_ip_resume_phase2(tmp_adev);
5050 if (r)
5051 goto out;
5052
5053 if (vram_lost)
5054 amdgpu_device_fill_reset_magic(tmp_adev);
5055
5056 /*
5057 			 * Add this ASIC back as tracked since the reset has
5058 			 * already completed successfully.
5059 */
5060 amdgpu_register_gpu_instance(tmp_adev);
5061
5062 if (!reset_context->hive &&
5063 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5064 amdgpu_xgmi_add_device(tmp_adev);
5065
5066 r = amdgpu_device_ip_late_init(tmp_adev);
5067 if (r)
5068 goto out;
5069
5070 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);
5071
5072 /*
5073 			 * The GPU enters a bad state once the number of
5074 			 * faulty pages reported by ECC reaches the threshold,
5075 			 * and RAS recovery is scheduled next. So add a check
5076 			 * here to break recovery if the bad page threshold is
5077 			 * indeed exceeded, and remind the user to either
5078 			 * retire this GPU or set a bigger bad_page_threshold
5079 			 * value so that the check passes the next time the
5080 			 * driver is probed.
5081 */
5082 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
5083 /* must succeed. */
5084 amdgpu_ras_resume(tmp_adev);
5085 } else {
5086 r = -EINVAL;
5087 goto out;
5088 }
5089
5090 /* Update PSP FW topology after reset */
5091 if (reset_context->hive &&
5092 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5093 r = amdgpu_xgmi_update_topology(
5094 reset_context->hive, tmp_adev);
5095 }
5096 }
5097
5098 out:
5099 if (!r) {
5100 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
5101 r = amdgpu_ib_ring_tests(tmp_adev);
5102 if (r) {
5103 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
5104 need_full_reset = true;
5105 r = -EAGAIN;
5106 goto end;
5107 }
5108 }
5109
5110 if (!r)
5111 r = amdgpu_device_recover_vram(tmp_adev);
5112 else
5113 tmp_adev->asic_reset_res = r;
5114 }
5115
5116 end:
5117 if (need_full_reset)
5118 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5119 else
5120 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5121 return r;
5122 }
5123
5124 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
5125 {
5126
5127 switch (amdgpu_asic_reset_method(adev)) {
5128 case AMD_RESET_METHOD_MODE1:
5129 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
5130 break;
5131 case AMD_RESET_METHOD_MODE2:
5132 adev->mp1_state = PP_MP1_STATE_RESET;
5133 break;
5134 default:
5135 adev->mp1_state = PP_MP1_STATE_NONE;
5136 break;
5137 }
5138 }
5139
5140 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
5141 {
5142 amdgpu_vf_error_trans_all(adev);
5143 adev->mp1_state = PP_MP1_STATE_NONE;
5144 }
5145
5146 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
5147 {
5148 struct pci_dev *p = NULL;
5149
5150 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5151 adev->pdev->bus->number, 1);
5152 if (p) {
5153 pm_runtime_enable(&(p->dev));
5154 pm_runtime_resume(&(p->dev));
5155 }
5156
5157 pci_dev_put(p);
5158 }
5159
5160 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
5161 {
5162 enum amd_reset_method reset_method;
5163 struct pci_dev *p = NULL;
5164 u64 expires;
5165
5166 /*
5167 * For now, only BACO and mode1 reset are confirmed
5168 	 * to suffer the audio issue if the audio device is not properly suspended.
5169 */
5170 reset_method = amdgpu_asic_reset_method(adev);
5171 if ((reset_method != AMD_RESET_METHOD_BACO) &&
5172 (reset_method != AMD_RESET_METHOD_MODE1))
5173 return -EINVAL;
5174
5175 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5176 adev->pdev->bus->number, 1);
5177 if (!p)
5178 return -ENODEV;
5179
5180 expires = pm_runtime_autosuspend_expiration(&(p->dev));
5181 if (!expires)
5182 /*
5183 		 * If we cannot get the audio device autosuspend delay,
5184 		 * a fixed 4s interval is used. Since 3s is the audio
5185 		 * controller's default autosuspend delay setting, the 4s
5186 		 * used here is guaranteed to cover it.
5187 */
5188 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
5189
5190 while (!pm_runtime_status_suspended(&(p->dev))) {
5191 if (!pm_runtime_suspend(&(p->dev)))
5192 break;
5193
5194 if (expires < ktime_get_mono_fast_ns()) {
5195 dev_warn(adev->dev, "failed to suspend display audio\n");
5196 pci_dev_put(p);
5197 /* TODO: abort the succeeding gpu reset? */
5198 return -ETIMEDOUT;
5199 }
5200 }
5201
5202 pm_runtime_disable(&(p->dev));
5203
5204 pci_dev_put(p);
5205 return 0;
5206 }
5207
5208 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
5209 {
5210 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5211
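	/* Cancel any reset work that was queued before this recovery started */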
5212 #if defined(CONFIG_DEBUG_FS)
5213 if (!amdgpu_sriov_vf(adev))
5214 cancel_work(&adev->reset_work);
5215 #endif
5216
5217 if (adev->kfd.dev)
5218 cancel_work(&adev->kfd.reset_work);
5219
5220 if (amdgpu_sriov_vf(adev))
5221 cancel_work(&adev->virt.flr_work);
5222
5223 if (con && adev->ras_enabled)
5224 cancel_work(&con->recovery_work);
5225
5226 }
5227
5228 /**
5229 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
5230 *
5231 * @adev: amdgpu_device pointer
5232 * @job: which job trigger hang
5233 * @reset_context: amdgpu reset context pointer
5234 *
5235 * Attempt to reset the GPU if it has hung (all asics).
5236 * Attempt to do soft-reset or full-reset and reinitialize Asic
5237 * Returns 0 for success or an error on failure.
5238 */
5239
5240 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5241 struct amdgpu_job *job,
5242 struct amdgpu_reset_context *reset_context)
5243 {
5244 struct list_head device_list, *device_list_handle = NULL;
5245 bool job_signaled = false;
5246 struct amdgpu_hive_info *hive = NULL;
5247 struct amdgpu_device *tmp_adev = NULL;
5248 int i, r = 0;
5249 bool need_emergency_restart = false;
5250 bool audio_suspended = false;
5251 bool gpu_reset_for_dev_remove = false;
5252
5253 gpu_reset_for_dev_remove =
5254 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
5255 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5256
5257 /*
5258 * Special case: RAS triggered and full reset isn't supported
5259 */
5260 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5261
5262 /*
5263 * Flush RAM to disk so that after reboot
5264 * the user can read log and see why the system rebooted.
5265 */
5266 if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
5267 amdgpu_ras_get_context(adev)->reboot) {
5268 DRM_WARN("Emergency reboot.");
5269
5270 ksys_sync_helper();
5271 emergency_restart();
5272 }
5273
5274 dev_info(adev->dev, "GPU %s begin!\n",
5275 need_emergency_restart ? "jobs stop":"reset");
5276
5277 if (!amdgpu_sriov_vf(adev))
5278 hive = amdgpu_get_xgmi_hive(adev);
5279 if (hive)
5280 mutex_lock(&hive->hive_lock);
5281
5282 reset_context->job = job;
5283 reset_context->hive = hive;
5284 /*
5285 * Build list of devices to reset.
5286 * In case we are in XGMI hive mode, resort the device list
5287 * to put adev in the 1st position.
5288 */
5289 INIT_LIST_HEAD(&device_list);
5290 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
5291 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
5292 list_add_tail(&tmp_adev->reset_list, &device_list);
5293 if (gpu_reset_for_dev_remove && adev->shutdown)
5294 tmp_adev->shutdown = true;
5295 }
5296 if (!list_is_first(&adev->reset_list, &device_list))
5297 list_rotate_to_front(&adev->reset_list, &device_list);
5298 device_list_handle = &device_list;
5299 } else {
5300 list_add_tail(&adev->reset_list, &device_list);
5301 device_list_handle = &device_list;
5302 }
5303
5304 /* We need to lock reset domain only once both for XGMI and single device */
5305 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5306 reset_list);
5307 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
5308
5309 /* block all schedulers and reset given job's ring */
5310 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5311
5312 amdgpu_device_set_mp1_state(tmp_adev);
5313
5314 /*
5315 		 * Try to put the audio codec into the suspend state
5316 		 * before the gpu reset starts.
5317 		 *
5318 		 * The power domain of the graphics device is shared
5319 		 * with the AZ power domain. Without this, we may
5320 		 * change the audio hardware from behind the audio
5321 		 * driver's back, which would trigger audio codec
5322 		 * errors.
5323 */
5324 if (!amdgpu_device_suspend_display_audio(tmp_adev))
5325 audio_suspended = true;
5326
5327 amdgpu_ras_set_error_query_ready(tmp_adev, false);
5328
5329 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5330
5331 if (!amdgpu_sriov_vf(tmp_adev))
5332 amdgpu_amdkfd_pre_reset(tmp_adev);
5333
5334 /*
5335 * Mark these ASICs to be reset as untracked first,
5336 * and add them back after the reset has completed.
5337 */
5338 amdgpu_unregister_gpu_instance(tmp_adev);
5339
5340 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);
5341
5342 /* disable ras on ALL IPs */
5343 if (!need_emergency_restart &&
5344 amdgpu_device_ip_need_full_reset(tmp_adev))
5345 amdgpu_ras_suspend(tmp_adev);
5346
5347 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5348 struct amdgpu_ring *ring = tmp_adev->rings[i];
5349
5350 if (!ring || !ring->sched.thread)
5351 continue;
5352
5353 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5354
5355 if (need_emergency_restart)
5356 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5357 }
5358 atomic_inc(&tmp_adev->gpu_reset_counter);
5359 }
5360
5361 if (need_emergency_restart)
5362 goto skip_sched_resume;
5363
5364 /*
5365 * Must check guilty signal here since after this point all old
5366 * HW fences are force signaled.
5367 *
5368 * job->base holds a reference to parent fence
5369 */
5370 if (job && dma_fence_is_signaled(&job->hw_fence)) {
5371 job_signaled = true;
5372 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5373 goto skip_hw_reset;
5374 }
5375
5376 retry:	/* Pre-ASIC reset for the rest of the adevs in the XGMI hive. */
5377 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5378 if (gpu_reset_for_dev_remove) {
5379 /* Workaround for ASICs that need to disable the SMC first */
5380 amdgpu_device_smu_fini_early(tmp_adev);
5381 }
5382 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
5383 /* TODO: should we stop here on failure? */
5384 if (r) {
5385 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
5386 r, adev_to_drm(tmp_adev)->unique);
5387 tmp_adev->asic_reset_res = r;
5388 }
5389
5390 if (!amdgpu_sriov_vf(tmp_adev))
5391 /*
5392 * Drop all pending non-scheduler resets. Scheduler resets
5393 * were already dropped during drm_sched_stop.
5394 */
5395 amdgpu_device_stop_pending_resets(tmp_adev);
5396 }
5397
5398 /* Actual ASIC resets if needed. */
5399 /* Host driver will handle XGMI hive reset for SRIOV */
5400 if (amdgpu_sriov_vf(adev)) {
5401 r = amdgpu_device_reset_sriov(adev, job ? false : true);
5402 if (r)
5403 adev->asic_reset_res = r;
5404
5405 /* Aldebaran and gfx_11_0_3 support RAS in SRIOV, so RAS needs to be resumed during reset */
5406 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) ||
5407 adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3))
5408 amdgpu_ras_resume(adev);
5409 } else {
5410 r = amdgpu_do_asic_reset(device_list_handle, reset_context);
5411 if (r && r == -EAGAIN)
5412 goto retry;
5413
5414 if (!r && gpu_reset_for_dev_remove)
5415 goto recover_end;
5416 }
5417
5418 skip_hw_reset:
5419
5420 /* Post ASIC reset for all devs. */
5421 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5422
5423 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5424 struct amdgpu_ring *ring = tmp_adev->rings[i];
5425
5426 if (!ring || !ring->sched.thread)
5427 continue;
5428
5429 drm_sched_start(&ring->sched, true);
5430 }
5431
5432 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3))
5433 amdgpu_mes_self_test(tmp_adev);
5434
5435 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
5436 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
5437
5438 if (tmp_adev->asic_reset_res)
5439 r = tmp_adev->asic_reset_res;
5440
5441 tmp_adev->asic_reset_res = 0;
5442
5443 if (r) {
5444 /* Bad news: how do we report this to userspace? */
5445 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
5446 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5447 } else {
5448 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
5449 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5450 DRM_WARN("smart shift update failed\n");
5451 }
5452 }
5453
5454 skip_sched_resume:
5455 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5456 /* unlock kfd: SRIOV would do it separately */
5457 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
5458 amdgpu_amdkfd_post_reset(tmp_adev);
5459
5460 /* kfd_post_reset will do nothing if the kfd device is not initialized,
5461 * so bring kfd up here if it was not initialized before.
5462 */
5463 if (!adev->kfd.init_complete)
5464 amdgpu_amdkfd_device_init(adev);
5465
5466 if (audio_suspended)
5467 amdgpu_device_resume_display_audio(tmp_adev);
5468
5469 amdgpu_device_unset_mp1_state(tmp_adev);
5470
5471 amdgpu_ras_set_error_query_ready(tmp_adev, true);
5472 }
5473
5474 recover_end:
5475 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5476 reset_list);
5477 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
5478
5479 if (hive) {
5480 mutex_unlock(&hive->hive_lock);
5481 amdgpu_put_xgmi_hive(hive);
5482 }
5483
5484 if (r)
5485 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
5486
5487 atomic_set(&adev->reset_domain->reset_res, r);
5488 return r;
5489 }
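
/*
 * Illustrative sketch of the calling convention (the exact flags vary by
 * caller -- job timeout, RAS, hotplug -- so do not treat this as the one
 * canonical invocation):
 *
 *	struct amdgpu_reset_context reset_context;
 *
 *	memset(&reset_context, 0, sizeof(reset_context));
 *	reset_context.method = AMD_RESET_METHOD_NONE;
 *	reset_context.reset_req_dev = adev;
 *	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 *
 *	r = amdgpu_device_gpu_recover(adev, job, &reset_context);
 */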
5490
5491 /**
5492 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
5493 *
5494 * @adev: amdgpu_device pointer
5495 *
5496 * Fetches and stores in the driver the PCIe capabilities (gen speed
5497 * and lanes) of the slot the device is in. Handles APUs and
5498 * virtualized environments where PCIe config space may not be available.
5499 */
5500 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
5501 {
5502 struct pci_dev *pdev;
5503 enum pci_bus_speed speed_cap, platform_speed_cap;
5504 enum pcie_link_width platform_link_width;
5505
5506 if (amdgpu_pcie_gen_cap)
5507 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
5508
5509 if (amdgpu_pcie_lane_cap)
5510 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
5511
5512 /* covers APUs as well */
5513 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
5514 if (adev->pm.pcie_gen_mask == 0)
5515 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
5516 if (adev->pm.pcie_mlw_mask == 0)
5517 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
5518 return;
5519 }
5520
5521 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
5522 return;
5523
5524 pcie_bandwidth_available(adev->pdev, NULL,
5525 &platform_speed_cap, &platform_link_width);
5526
5527 if (adev->pm.pcie_gen_mask == 0) {
5528 /* asic caps */
5529 pdev = adev->pdev;
5530 speed_cap = pcie_get_speed_cap(pdev);
5531 if (speed_cap == PCI_SPEED_UNKNOWN) {
5532 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5533 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5534 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5535 } else {
5536 if (speed_cap == PCIE_SPEED_32_0GT)
5537 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5538 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5539 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5540 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5541 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
5542 else if (speed_cap == PCIE_SPEED_16_0GT)
5543 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5544 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5545 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5546 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
5547 else if (speed_cap == PCIE_SPEED_8_0GT)
5548 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5549 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5550 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5551 else if (speed_cap == PCIE_SPEED_5_0GT)
5552 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5553 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
5554 else
5555 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
5556 }
5557 /* platform caps */
5558 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5559 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5560 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5561 } else {
5562 if (platform_speed_cap == PCIE_SPEED_32_0GT)
5563 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5564 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5565 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5566 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5567 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
5568 else if (platform_speed_cap == PCIE_SPEED_16_0GT)
5569 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5570 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5571 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5572 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
5573 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5574 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5575 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5576 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
5577 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5578 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5579 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5580 else
5581 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
5582
5583 }
5584 }
5585 if (adev->pm.pcie_mlw_mask == 0) {
5586 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5587 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
5588 } else {
5589 switch (platform_link_width) {
5590 case PCIE_LNK_X32:
5591 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
5592 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5593 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5594 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5595 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5596 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5597 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5598 break;
5599 case PCIE_LNK_X16:
5600 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5601 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5602 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5603 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5604 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5605 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5606 break;
5607 case PCIE_LNK_X12:
5608 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5609 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5610 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5611 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5612 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5613 break;
5614 case PCIE_LNK_X8:
5615 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5616 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5617 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5618 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5619 break;
5620 case PCIE_LNK_X4:
5621 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5622 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5623 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5624 break;
5625 case PCIE_LNK_X2:
5626 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5627 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5628 break;
5629 case PCIE_LNK_X1:
5630 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
5631 break;
5632 default:
5633 break;
5634 }
5635 }
5636 }
5637 }
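
/*
 * The masks computed above are advisory: the power-management code later
 * intersects the ASIC and platform capabilities when choosing PCIe link
 * speeds/widths, and the amdgpu_pcie_gen_cap / amdgpu_pcie_lane_cap module
 * parameters checked at the top of this function allow overriding them.
 */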
5638
5639 /**
5640 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
5641 *
5642 * @adev: amdgpu_device pointer
5643 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
5644 *
5645 * Return true if @peer_adev can access (DMA) @adev through the PCIe
5646 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
5647 * @peer_adev.
5648 */
5649 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
5650 struct amdgpu_device *peer_adev)
5651 {
5652 #ifdef CONFIG_HSA_AMD_P2P
5653 uint64_t address_mask = peer_adev->dev->dma_mask ?
5654 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
5655 resource_size_t aper_limit =
5656 adev->gmc.aper_base + adev->gmc.aper_size - 1;
5657 bool p2p_access =
5658 !adev->gmc.xgmi.connected_to_cpu &&
5659 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
5660
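	/*
	 * Peer DMA through the BAR additionally requires that all of VRAM is
	 * CPU-visible (a "large BAR" configuration) and that the aperture
	 * lies entirely within the peer's DMA addressing range.
	 */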
5661 return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
5662 adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
5663 !(adev->gmc.aper_base & address_mask ||
5664 aper_limit & address_mask));
5665 #else
5666 return false;
5667 #endif
5668 }
5669
5670 int amdgpu_device_baco_enter(struct drm_device *dev)
5671 {
5672 struct amdgpu_device *adev = drm_to_adev(dev);
5673 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5674
5675 if (!amdgpu_device_supports_baco(dev))
5676 return -ENOTSUPP;
5677
5678 if (ras && adev->ras_enabled &&
5679 adev->nbio.funcs->enable_doorbell_interrupt)
5680 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5681
5682 return amdgpu_dpm_baco_enter(adev);
5683 }
5684
5685 int amdgpu_device_baco_exit(struct drm_device *dev)
5686 {
5687 struct amdgpu_device *adev = drm_to_adev(dev);
5688 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5689 int ret = 0;
5690
5691 if (!amdgpu_device_supports_baco(dev))
5692 return -ENOTSUPP;
5693
5694 ret = amdgpu_dpm_baco_exit(adev);
5695 if (ret)
5696 return ret;
5697
5698 if (ras && adev->ras_enabled &&
5699 adev->nbio.funcs->enable_doorbell_interrupt)
5700 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5701
5702 if (amdgpu_passthrough(adev) && adev->nbio.funcs &&
5703 adev->nbio.funcs->clear_doorbell_interrupt)
5704 adev->nbio.funcs->clear_doorbell_interrupt(adev);
5705
5706 return 0;
5707 }
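
/*
 * BACO (Bus Active, Chip Off) enter/exit are used as a matched pair,
 * typically around runtime power management of a dGPU. When RAS is enabled,
 * doorbell interrupts are disabled on entry and re-enabled on exit so the
 * transition does not leave them in an inconsistent state.
 */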
5708
5709 /**
5710 * amdgpu_pci_error_detected - Called when a PCI error is detected.
5711 * @pdev: PCI device struct
5712 * @state: PCI channel state
5713 *
5714 * Description: Called when a PCI error is detected.
5715 *
5716 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
5717 */
5718 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5719 {
5720 struct drm_device *dev = pci_get_drvdata(pdev);
5721 struct amdgpu_device *adev = drm_to_adev(dev);
5722 int i;
5723
5724 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5725
5726 if (adev->gmc.xgmi.num_physical_nodes > 1) {
5727 DRM_WARN("No support for XGMI hive yet...");
5728 return PCI_ERS_RESULT_DISCONNECT;
5729 }
5730
5731 adev->pci_channel_state = state;
5732
5733 switch (state) {
5734 case pci_channel_io_normal:
5735 return PCI_ERS_RESULT_CAN_RECOVER;
5736 /* Fatal error, prepare for slot reset */
5737 case pci_channel_io_frozen:
5738 /*
5739 * Locking adev->reset_domain->sem will prevent any external access
5740 * to GPU during PCI error recovery
5741 */
5742 amdgpu_device_lock_reset_domain(adev->reset_domain);
5743 amdgpu_device_set_mp1_state(adev);
5744
5745 /*
5746 * Block any work scheduling as we do for regular GPU reset
5747 * for the duration of the recovery
5748 */
5749 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5750 struct amdgpu_ring *ring = adev->rings[i];
5751
5752 if (!ring || !ring->sched.thread)
5753 continue;
5754
5755 drm_sched_stop(&ring->sched, NULL);
5756 }
5757 atomic_inc(&adev->gpu_reset_counter);
5758 return PCI_ERS_RESULT_NEED_RESET;
5759 case pci_channel_io_perm_failure:
5760 /* Permanent error, prepare for device removal */
5761 return PCI_ERS_RESULT_DISCONNECT;
5762 }
5763
5764 return PCI_ERS_RESULT_NEED_RESET;
5765 }
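
/*
 * The PCI error-recovery core drives these callbacks in order:
 * error_detected() first, then mmio_enabled() (after CAN_RECOVER) or
 * slot_reset() (after NEED_RESET), and finally resume() once recovery
 * has completed.
 */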
5766
5767 /**
5768 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5769 * @pdev: pointer to PCI device
5770 */
5771 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5772 {
5773
5774 DRM_INFO("PCI error: mmio enabled callback!!\n");
5775
5776 /* TODO - dump whatever for debugging purposes */
5777
5778 /* This is called only if amdgpu_pci_error_detected returns
5779 * PCI_ERS_RESULT_CAN_RECOVER. Reads/writes to the device still
5780 * work, so there is no need to reset the slot.
5781 */
5782
5783 return PCI_ERS_RESULT_RECOVERED;
5784 }
5785
5786 /**
5787 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5788 * @pdev: PCI device struct
5789 *
5790 * Description: This routine is called by the pci error recovery
5791 * code after the PCI slot has been reset, just before we
5792 * should resume normal operations.
5793 */
5794 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5795 {
5796 struct drm_device *dev = pci_get_drvdata(pdev);
5797 struct amdgpu_device *adev = drm_to_adev(dev);
5798 int r, i;
5799 struct amdgpu_reset_context reset_context;
5800 u32 memsize;
5801 struct list_head device_list;
5802
5803 DRM_INFO("PCI error: slot reset callback!!\n");
5804
5805 memset(&reset_context, 0, sizeof(reset_context));
5806
5807 INIT_LIST_HEAD(&device_list);
5808 list_add_tail(&adev->reset_list, &device_list);
5809
5810 /* wait for asic to come out of reset */
5811 msleep(500);
5812
5813 /* Restore PCI config space */
5814 amdgpu_device_load_pci_state(pdev);
5815
5816 /* confirm ASIC came out of reset */
5817 for (i = 0; i < adev->usec_timeout; i++) {
5818 memsize = amdgpu_asic_get_config_memsize(adev);
5819
5820 if (memsize != 0xffffffff)
5821 break;
5822 udelay(1);
5823 }
5824 if (memsize == 0xffffffff) {
5825 r = -ETIME;
5826 goto out;
5827 }
5828
5829 reset_context.method = AMD_RESET_METHOD_NONE;
5830 reset_context.reset_req_dev = adev;
5831 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5832 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
5833
5834 adev->no_hw_access = true;
5835 r = amdgpu_device_pre_asic_reset(adev, &reset_context);
5836 adev->no_hw_access = false;
5837 if (r)
5838 goto out;
5839
5840 r = amdgpu_do_asic_reset(&device_list, &reset_context);
5841
5842 out:
5843 if (!r) {
5844 if (amdgpu_device_cache_pci_state(adev->pdev))
5845 pci_restore_state(adev->pdev);
5846
5847 DRM_INFO("PCIe error recovery succeeded\n");
5848 } else {
5849 DRM_ERROR("PCIe error recovery failed, err:%d", r);
5850 amdgpu_device_unset_mp1_state(adev);
5851 amdgpu_device_unlock_reset_domain(adev->reset_domain);
5852 }
5853
5854 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5855 }
5856
5857 /**
5858 * amdgpu_pci_resume() - resume normal ops after PCI reset
5859 * @pdev: pointer to PCI device
5860 *
5861 * Called when the error recovery driver tells us that it's
5862 * OK to resume normal operation.
5863 */
5864 void amdgpu_pci_resume(struct pci_dev *pdev)
5865 {
5866 struct drm_device *dev = pci_get_drvdata(pdev);
5867 struct amdgpu_device *adev = drm_to_adev(dev);
5868 int i;
5869
5870
5871 DRM_INFO("PCI error: resume callback!!\n");
5872
5873 /* Only continue execution for the case of pci_channel_io_frozen */
5874 if (adev->pci_channel_state != pci_channel_io_frozen)
5875 return;
5876
5877 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5878 struct amdgpu_ring *ring = adev->rings[i];
5879
5880 if (!ring || !ring->sched.thread)
5881 continue;
5882
5883 drm_sched_start(&ring->sched, true);
5884 }
5885
5886 amdgpu_device_unset_mp1_state(adev);
5887 amdgpu_device_unlock_reset_domain(adev->reset_domain);
5888 }
5889
5890 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5891 {
5892 struct drm_device *dev = pci_get_drvdata(pdev);
5893 struct amdgpu_device *adev = drm_to_adev(dev);
5894 int r;
5895
5896 if (amdgpu_sriov_vf(adev))
5897 return false;
5898
5899 r = pci_save_state(pdev);
5900 if (!r) {
5901 kfree(adev->pci_state);
5902
5903 adev->pci_state = pci_store_saved_state(pdev);
5904
5905 if (!adev->pci_state) {
5906 DRM_ERROR("Failed to store PCI saved state");
5907 return false;
5908 }
5909 } else {
5910 DRM_WARN("Failed to save PCI state, err:%d\n", r);
5911 return false;
5912 }
5913
5914 return true;
5915 }
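
/*
 * amdgpu_device_load_pci_state() below is the restore counterpart: the
 * state cached here is what gets reapplied after a slot reset.
 */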
5916
5917 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5918 {
5919 struct drm_device *dev = pci_get_drvdata(pdev);
5920 struct amdgpu_device *adev = drm_to_adev(dev);
5921 int r;
5922
5923 if (!adev->pci_state)
5924 return false;
5925
5926 r = pci_load_saved_state(pdev, adev->pci_state);
5927
5928 if (!r) {
5929 pci_restore_state(pdev);
5930 } else {
5931 DRM_WARN("Failed to load PCI state, err:%d\n", r);
5932 return false;
5933 }
5934
5935 return true;
5936 }
5937
5938 void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
5939 struct amdgpu_ring *ring)
5940 {
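	/*
	 * An HDP flush is only needed when the CPU reaches VRAM through the
	 * PCIe BAR; APUs (outside of passthrough) and GPUs whose memory is
	 * directly connected to the CPU can skip it.
	 */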
5941 #ifdef CONFIG_X86_64
5942 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
5943 return;
5944 #endif
5945 if (adev->gmc.xgmi.connected_to_cpu)
5946 return;
5947
5948 if (ring && ring->funcs->emit_hdp_flush)
5949 amdgpu_ring_emit_hdp_flush(ring);
5950 else
5951 amdgpu_asic_flush_hdp(adev, ring);
5952 }
5953
5954 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
5955 struct amdgpu_ring *ring)
5956 {
5957 #ifdef CONFIG_X86_64
5958 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
5959 return;
5960 #endif
5961 if (adev->gmc.xgmi.connected_to_cpu)
5962 return;
5963
5964 amdgpu_asic_invalidate_hdp(adev, ring);
5965 }
5966
5967 int amdgpu_in_reset(struct amdgpu_device *adev)
5968 {
5969 return atomic_read(&adev->reset_domain->in_gpu_reset);
5970 }
5971
5972 /**
5973 * amdgpu_device_halt() - bring hardware to some kind of halt state
5974 *
5975 * @adev: amdgpu_device pointer
5976 *
5977 * Bring hardware to some kind of halt state so that no one can touch it
5978 * any more. This helps preserve the error context when an error occurs.
5979 * Compared to a simple hang, the system stays stable at least for SSH
5980 * access. Then it should be trivial to inspect the hardware state and
5981 * see what's going on. Implemented as follows:
5982 *
5983 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc.),
5984 * clears all CPU mappings to device, disallows remappings through page faults
5985 * 2. amdgpu_irq_disable_all() disables all interrupts
5986 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
5987 * 4. set adev->no_hw_access to avoid potential crashes after step 5
5988 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
5989 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
5990 * flush any in flight DMA operations
5991 */
5992 void amdgpu_device_halt(struct amdgpu_device *adev)
5993 {
5994 struct pci_dev *pdev = adev->pdev;
5995 struct drm_device *ddev = adev_to_drm(adev);
5996
5997 amdgpu_xcp_dev_unplug(adev);
5998 drm_dev_unplug(ddev);
5999
6000 amdgpu_irq_disable_all(adev);
6001
6002 amdgpu_fence_driver_hw_fini(adev);
6003
6004 adev->no_hw_access = true;
6005
6006 amdgpu_device_unmap_mmio(adev);
6007
6008 pci_disable_device(pdev);
6009 pci_wait_for_pending_transaction(pdev);
6010 }
6011
6012 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
6013 u32 reg)
6014 {
6015 unsigned long flags, address, data;
6016 u32 r;
6017
6018 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6019 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6020
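	/*
	 * Indirect access: under the pcie_idx_lock, write the (dword) register
	 * index to the index register, read it back to flush the posted write,
	 * then read the value from the data register.
	 */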
6021 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6022 WREG32(address, reg * 4);
6023 (void)RREG32(address);
6024 r = RREG32(data);
6025 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6026 return r;
6027 }
6028
6029 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
6030 u32 reg, u32 v)
6031 {
6032 unsigned long flags, address, data;
6033
6034 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6035 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6036
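	/*
	 * Same index/data scheme as the read path; the trailing read of the
	 * data register ensures the write has landed before the lock is
	 * dropped.
	 */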
6037 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6038 WREG32(address, reg * 4);
6039 (void)RREG32(address);
6040 WREG32(data, v);
6041 (void)RREG32(data);
6042 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6043 }
6044
6045 /**
6046 * amdgpu_device_switch_gang - switch to a new gang
6047 * @adev: amdgpu_device pointer
6048 * @gang: the gang to switch to
6049 *
6050 * Try to switch to a new gang.
6051 * Returns: NULL if we switched to the new gang or a reference to the current
6052 * gang leader.
6053 */
6054 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
6055 struct dma_fence *gang)
6056 {
6057 struct dma_fence *old = NULL;
6058
6059 dma_fence_get(gang);
6060 do {
6061 dma_fence_put(old);
6062 rcu_read_lock();
6063 old = dma_fence_get_rcu_safe(&adev->gang_submit);
6064 rcu_read_unlock();
6065
6066 if (old == gang)
6067 break;
6068
6069 if (!dma_fence_is_signaled(old)) {
6070 dma_fence_put(gang);
6071 return old;
6072 }
6073
6074 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
6075 old, gang) != old);
6076
6077 /*
6078 * Drop it once for the exchanged reference in adev and once for the
6079 * thread local reference acquired in amdgpu_device_get_gang().
6080 */
6081 dma_fence_put(old);
6082 dma_fence_put(old);
6083 return NULL;
6084 }
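
/*
 * Illustrative caller pattern (a sketch, not taken verbatim from the
 * submission code): a submitter that must not overlap with another gang
 * can keep retrying until the switch succeeds, e.g.
 *
 *	struct dma_fence *old;
 *
 *	while ((old = amdgpu_device_switch_gang(adev, gang_fence))) {
 *		dma_fence_wait(old, false);
 *		dma_fence_put(old);
 *	}
 *
 * Alternatively the returned fence can simply be treated as a scheduling
 * dependency of the new submission.
 */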
6085
6086 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
6087 {
6088 switch (adev->asic_type) {
6089 #ifdef CONFIG_DRM_AMDGPU_SI
6090 case CHIP_HAINAN:
6091 #endif
6092 case CHIP_TOPAZ:
6093 /* chips with no display hardware */
6094 return false;
6095 #ifdef CONFIG_DRM_AMDGPU_SI
6096 case CHIP_TAHITI:
6097 case CHIP_PITCAIRN:
6098 case CHIP_VERDE:
6099 case CHIP_OLAND:
6100 #endif
6101 #ifdef CONFIG_DRM_AMDGPU_CIK
6102 case CHIP_BONAIRE:
6103 case CHIP_HAWAII:
6104 case CHIP_KAVERI:
6105 case CHIP_KABINI:
6106 case CHIP_MULLINS:
6107 #endif
6108 case CHIP_TONGA:
6109 case CHIP_FIJI:
6110 case CHIP_POLARIS10:
6111 case CHIP_POLARIS11:
6112 case CHIP_POLARIS12:
6113 case CHIP_VEGAM:
6114 case CHIP_CARRIZO:
6115 case CHIP_STONEY:
6116 /* chips with display hardware */
6117 return true;
6118 default:
6119 /* IP discovery */
6120 if (!adev->ip_versions[DCE_HWIP][0] ||
6121 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
6122 return false;
6123 return true;
6124 }
6125 }
6126
6127 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
6128 uint32_t inst, uint32_t reg_addr, char reg_name[],
6129 uint32_t expected_value, uint32_t mask)
6130 {
6131 uint32_t ret = 0;
6132 uint32_t old_ = 0;
6133 uint32_t tmp_ = RREG32(reg_addr);
6134 uint32_t loop = adev->usec_timeout;
6135
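	/*
	 * Poll until (value & mask) == expected_value. The timeout counter is
	 * restarted whenever the register value changes, so only a register
	 * that stops making progress entirely will time out.
	 */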
6136 while ((tmp_ & (mask)) != (expected_value)) {
6137 if (old_ != tmp_) {
6138 loop = adev->usec_timeout;
6139 old_ = tmp_;
6140 } else
6141 udelay(1);
6142 tmp_ = RREG32(reg_addr);
6143 loop--;
6144 if (!loop) {
6145 DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08xn",
6146 inst, reg_name, (uint32_t)expected_value,
6147 (uint32_t)(tmp_ & (mask)));
6148 ret = -ETIMEDOUT;
6149 break;
6150 }
6151 }
6152 return ret;
6153 }
6154