1 /*
2 * Copyright 2008 Advanced Micro Devices, Inc.
3 * Copyright 2008 Red Hat Inc.
4 * Copyright 2009 Jerome Glisse.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors: Dave Airlie
25 * Alex Deucher
26 * Jerome Glisse
27 */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33 #include <linux/iommu.h>
34 #include <linux/pci.h>
35 #include <linux/devcoredump.h>
36 #include <generated/utsrelease.h>
37 #include <linux/pci-p2pdma.h>
38 #include <linux/apple-gmux.h>
39
40 #include <drm/drm_aperture.h>
41 #include <drm/drm_atomic_helper.h>
42 #include <drm/drm_crtc_helper.h>
43 #include <drm/drm_fb_helper.h>
44 #include <drm/drm_probe_helper.h>
45 #include <drm/amdgpu_drm.h>
46 #include <linux/device.h>
47 #include <linux/vgaarb.h>
48 #include <linux/vga_switcheroo.h>
49 #include <linux/efi.h>
50 #include "amdgpu.h"
51 #include "amdgpu_trace.h"
52 #include "amdgpu_i2c.h"
53 #include "atom.h"
54 #include "amdgpu_atombios.h"
55 #include "amdgpu_atomfirmware.h"
56 #include "amd_pcie.h"
57 #ifdef CONFIG_DRM_AMDGPU_SI
58 #include "si.h"
59 #endif
60 #ifdef CONFIG_DRM_AMDGPU_CIK
61 #include "cik.h"
62 #endif
63 #include "vi.h"
64 #include "soc15.h"
65 #include "nv.h"
66 #include "bif/bif_4_1_d.h"
67 #include <linux/firmware.h>
68 #include "amdgpu_vf_error.h"
69
70 #include "amdgpu_amdkfd.h"
71 #include "amdgpu_pm.h"
72
73 #include "amdgpu_xgmi.h"
74 #include "amdgpu_ras.h"
75 #include "amdgpu_pmu.h"
76 #include "amdgpu_fru_eeprom.h"
77 #include "amdgpu_reset.h"
78
79 #include <linux/suspend.h>
80 #include <drm/task_barrier.h>
81 #include <linux/pm_runtime.h>
82
83 #include <drm/drm_drv.h>
84
85 #if IS_ENABLED(CONFIG_X86)
86 #include <asm/intel-family.h>
87 #endif
88
89 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
90 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
91 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
92 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
93 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
94 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
95 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
96
97 #define AMDGPU_RESUME_MS 2000
98 #define AMDGPU_MAX_RETRY_LIMIT 2
99 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
100
101 static const struct drm_driver amdgpu_kms_driver;
102
103 const char *amdgpu_asic_name[] = {
104 "TAHITI",
105 "PITCAIRN",
106 "VERDE",
107 "OLAND",
108 "HAINAN",
109 "BONAIRE",
110 "KAVERI",
111 "KABINI",
112 "HAWAII",
113 "MULLINS",
114 "TOPAZ",
115 "TONGA",
116 "FIJI",
117 "CARRIZO",
118 "STONEY",
119 "POLARIS10",
120 "POLARIS11",
121 "POLARIS12",
122 "VEGAM",
123 "VEGA10",
124 "VEGA12",
125 "VEGA20",
126 "RAVEN",
127 "ARCTURUS",
128 "RENOIR",
129 "ALDEBARAN",
130 "NAVI10",
131 "CYAN_SKILLFISH",
132 "NAVI14",
133 "NAVI12",
134 "SIENNA_CICHLID",
135 "NAVY_FLOUNDER",
136 "VANGOGH",
137 "DIMGREY_CAVEFISH",
138 "BEIGE_GOBY",
139 "YELLOW_CARP",
140 "IP DISCOVERY",
141 "LAST",
142 };
143
144 /**
145 * DOC: pcie_replay_count
146 *
147 * The amdgpu driver provides a sysfs API for reporting the total number
148 * of PCIe replays (NAKs)
149 * The file pcie_replay_count is used for this and returns the total
150 * number of replays as a sum of the NAKs generated and NAKs received
151 */
152
153 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
154 struct device_attribute *attr, char *buf)
155 {
156 struct drm_device *ddev = dev_get_drvdata(dev);
157 struct amdgpu_device *adev = drm_to_adev(ddev);
158 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
159
160 return sysfs_emit(buf, "%llu\n", cnt);
161 }
162
163 static DEVICE_ATTR(pcie_replay_count, 0444,
164 amdgpu_device_get_pcie_replay_count, NULL);
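
/*
 * Illustrative only: a minimal user-space sketch of reading the attribute
 * described in the DOC block above. The sysfs path assumes the GPU is
 * exposed as card0; the actual path depends on the system.
 *
 *   unsigned long long replays;
 *   FILE *f = fopen("/sys/class/drm/card0/device/pcie_replay_count", "r");
 *
 *   if (f) {
 *           if (fscanf(f, "%llu", &replays) == 1)
 *                   printf("PCIe replay count: %llu\n", replays);
 *           fclose(f);
 *   }
 */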
165
166 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
167
168
169 /**
170 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
171 *
172 * @dev: drm_device pointer
173 *
174 * Returns true if the device is a dGPU with ATPX power control,
175 * otherwise return false.
176 */
177 bool amdgpu_device_supports_px(struct drm_device *dev)
178 {
179 struct amdgpu_device *adev = drm_to_adev(dev);
180
181 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
182 return true;
183 return false;
184 }
185
186 /**
187 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
188 *
189 * @dev: drm_device pointer
190 *
191 * Returns true if the device is a dGPU with ACPI power control,
192 * otherwise return false.
193 */
194 bool amdgpu_device_supports_boco(struct drm_device *dev)
195 {
196 struct amdgpu_device *adev = drm_to_adev(dev);
197
198 if (adev->has_pr3 ||
199 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
200 return true;
201 return false;
202 }
203
204 /**
205 * amdgpu_device_supports_baco - Does the device support BACO
206 *
207 * @dev: drm_device pointer
208 *
209 * Returns true if the device supports BACO,
210 * otherwise return false.
211 */
212 bool amdgpu_device_supports_baco(struct drm_device *dev)
213 {
214 struct amdgpu_device *adev = drm_to_adev(dev);
215
216 return amdgpu_asic_supports_baco(adev);
217 }
218
219 /**
220 * amdgpu_device_supports_smart_shift - Is the device dGPU with
221 * smart shift support
222 *
223 * @dev: drm_device pointer
224 *
225 * Returns true if the device is a dGPU with Smart Shift support,
226 * otherwise returns false.
227 */
228 bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
229 {
230 return (amdgpu_device_supports_boco(dev) &&
231 amdgpu_acpi_is_power_shift_control_supported());
232 }
233
234 /*
235 * VRAM access helper functions
236 */
237
238 /**
239 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
240 *
241 * @adev: amdgpu_device pointer
242 * @pos: offset of the buffer in vram
243 * @buf: virtual address of the buffer in system memory
244 * @size: read/write size; the size of @buf must be at least @size
245 * @write: true - write to vram, otherwise - read from vram
246 */
247 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
248 void *buf, size_t size, bool write)
249 {
250 unsigned long flags;
251 uint32_t hi = ~0, tmp = 0;
252 uint32_t *data = buf;
253 uint64_t last;
254 int idx;
255
256 if (!drm_dev_enter(adev_to_drm(adev), &idx))
257 return;
258
259 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
260
261 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
262 for (last = pos + size; pos < last; pos += 4) {
263 tmp = pos >> 31;
264
265 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
266 if (tmp != hi) {
267 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
268 hi = tmp;
269 }
270 if (write)
271 WREG32_NO_KIQ(mmMM_DATA, *data++);
272 else
273 *data++ = RREG32_NO_KIQ(mmMM_DATA);
274 }
275
276 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
277 drm_dev_exit(idx);
278 }
279
280 /**
281 * amdgpu_device_aper_access - access vram by the vram aperture
282 *
283 * @adev: amdgpu_device pointer
284 * @pos: offset of the buffer in vram
285 * @buf: virtual address of the buffer in system memory
286 * @size: read/write size; the size of @buf must be at least @size
287 * @write: true - write to vram, otherwise - read from vram
288 *
289 * Returns the number of bytes that have been transferred.
290 */
291 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
292 void *buf, size_t size, bool write)
293 {
294 #ifdef CONFIG_64BIT
295 void __iomem *addr;
296 size_t count = 0;
297 uint64_t last;
298
299 if (!adev->mman.aper_base_kaddr)
300 return 0;
301
302 last = min(pos + size, adev->gmc.visible_vram_size);
303 if (last > pos) {
304 addr = adev->mman.aper_base_kaddr + pos;
305 count = last - pos;
306
307 if (write) {
308 memcpy_toio(addr, buf, count);
309 /* Make sure HDP write cache flush happens without any reordering
310 * after the system memory contents are sent over PCIe device
311 */
312 mb();
313 amdgpu_device_flush_hdp(adev, NULL);
314 } else {
315 amdgpu_device_invalidate_hdp(adev, NULL);
316 /* Make sure HDP read cache is invalidated before issuing a read
317 * to the PCIe device
318 */
319 mb();
320 memcpy_fromio(buf, addr, count);
321 }
322
323 }
324
325 return count;
326 #else
327 return 0;
328 #endif
329 }
330
331 /**
332 * amdgpu_device_vram_access - read/write a buffer in vram
333 *
334 * @adev: amdgpu_device pointer
335 * @pos: offset of the buffer in vram
336 * @buf: virtual address of the buffer in system memory
337 * @size: read/write size; the size of @buf must be at least @size
338 * @write: true - write to vram, otherwise - read from vram
339 */
340 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
341 void *buf, size_t size, bool write)
342 {
343 size_t count;
344
345 /* try using the vram aperture to access vram first */
346 count = amdgpu_device_aper_access(adev, pos, buf, size, write);
347 size -= count;
348 if (size) {
349 /* use MM to access the rest of vram */
350 pos += count;
351 buf += count;
352 amdgpu_device_mm_access(adev, pos, buf, size, write);
353 }
354 }
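
/*
 * Illustrative only: a minimal sketch of reading a small, dword-aligned
 * chunk of VRAM into a stack buffer with the helper above. The offset and
 * buffer are arbitrary; @pos and @size should stay 4-byte aligned so the
 * MM_INDEX/MM_DATA fallback path does not hit its alignment BUG_ON().
 *
 *   uint32_t data[4];
 *
 *   amdgpu_device_vram_access(adev, 0x1000, data, sizeof(data), false);
 */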
355
356 /*
357 * register access helper functions.
358 */
359
360 /* Check if hw access should be skipped because of hotplug or device error */
361 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
362 {
363 if (adev->no_hw_access)
364 return true;
365
366 #ifdef CONFIG_LOCKDEP
367 /*
368 * This is a bit complicated to understand, so worth a comment. What we assert
369 * here is that the GPU reset is not running on another thread in parallel.
370 *
371 * For this we trylock the read side of the reset semaphore, if that succeeds
372 * we know that the reset is not running in parallel.
373 *
374 * If the trylock fails we assert that we are either already holding the read
375 * side of the lock or are the reset thread itself and hold the write side of
376 * the lock.
377 */
378 if (in_task()) {
379 if (down_read_trylock(&adev->reset_domain->sem))
380 up_read(&adev->reset_domain->sem);
381 else
382 lockdep_assert_held(&adev->reset_domain->sem);
383 }
384 #endif
385 return false;
386 }
387
388 /**
389 * amdgpu_device_rreg - read a memory mapped IO or indirect register
390 *
391 * @adev: amdgpu_device pointer
392 * @reg: dword aligned register offset
393 * @acc_flags: access flags which require special behavior
394 *
395 * Returns the 32 bit value from the offset specified.
396 */
397 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
398 uint32_t reg, uint32_t acc_flags)
399 {
400 uint32_t ret;
401
402 if (amdgpu_device_skip_hw_access(adev))
403 return 0;
404
405 if ((reg * 4) < adev->rmmio_size) {
406 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
407 amdgpu_sriov_runtime(adev) &&
408 down_read_trylock(&adev->reset_domain->sem)) {
409 ret = amdgpu_kiq_rreg(adev, reg);
410 up_read(&adev->reset_domain->sem);
411 } else {
412 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
413 }
414 } else {
415 ret = adev->pcie_rreg(adev, reg * 4);
416 }
417
418 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
419
420 return ret;
421 }
422
423 /*
424 * MMIO register read with bytes helper functions
425 * @offset: byte offset from MMIO start
426 */
427
428 /**
429 * amdgpu_mm_rreg8 - read a memory mapped IO register
430 *
431 * @adev: amdgpu_device pointer
432 * @offset: byte aligned register offset
433 *
434 * Returns the 8 bit value from the offset specified.
435 */
436 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
437 {
438 if (amdgpu_device_skip_hw_access(adev))
439 return 0;
440
441 if (offset < adev->rmmio_size)
442 return (readb(adev->rmmio + offset));
443 BUG();
444 }
445
446 /*
447 * MMIO register write with bytes helper functions
448 * @offset: byte offset from MMIO start
449 * @value: the value to be written to the register
450 */
451
452 /**
453 * amdgpu_mm_wreg8 - write a memory mapped IO register
454 *
455 * @adev: amdgpu_device pointer
456 * @offset: byte aligned register offset
457 * @value: 8 bit value to write
458 *
459 * Writes the value specified to the offset specified.
460 */
461 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
462 {
463 if (amdgpu_device_skip_hw_access(adev))
464 return;
465
466 if (offset < adev->rmmio_size)
467 writeb(value, adev->rmmio + offset);
468 else
469 BUG();
470 }
471
472 /**
473 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
474 *
475 * @adev: amdgpu_device pointer
476 * @reg: dword aligned register offset
477 * @v: 32 bit value to write to the register
478 * @acc_flags: access flags which require special behavior
479 *
480 * Writes the value specified to the offset specified.
481 */
482 void amdgpu_device_wreg(struct amdgpu_device *adev,
483 uint32_t reg, uint32_t v,
484 uint32_t acc_flags)
485 {
486 if (amdgpu_device_skip_hw_access(adev))
487 return;
488
489 if ((reg * 4) < adev->rmmio_size) {
490 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
491 amdgpu_sriov_runtime(adev) &&
492 down_read_trylock(&adev->reset_domain->sem)) {
493 amdgpu_kiq_wreg(adev, reg, v);
494 up_read(&adev->reset_domain->sem);
495 } else {
496 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
497 }
498 } else {
499 adev->pcie_wreg(adev, reg * 4, v);
500 }
501
502 trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
503 }
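
/*
 * Note: most code does not call amdgpu_device_rreg()/amdgpu_device_wreg()
 * directly but uses the RREG32()/WREG32() style macros from amdgpu.h, which
 * supply the adev in scope and default acc_flags. A hedged sketch of a
 * read-modify-write through those wrappers (reg_offset and enable_bit are
 * placeholders, not real register definitions):
 *
 *   uint32_t tmp;
 *
 *   tmp = RREG32(reg_offset);
 *   tmp |= enable_bit;
 *   WREG32(reg_offset, tmp);
 */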
504
505 /**
506 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
507 *
508 * @adev: amdgpu_device pointer
509 * @reg: mmio/rlc register
510 * @v: value to write
511 *
512 * This function is invoked only for debugfs register access.
513 */
514 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
515 uint32_t reg, uint32_t v,
516 uint32_t xcc_id)
517 {
518 if (amdgpu_device_skip_hw_access(adev))
519 return;
520
521 if (amdgpu_sriov_fullaccess(adev) &&
522 adev->gfx.rlc.funcs &&
523 adev->gfx.rlc.funcs->is_rlcg_access_range) {
524 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
525 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
526 } else if ((reg * 4) >= adev->rmmio_size) {
527 adev->pcie_wreg(adev, reg * 4, v);
528 } else {
529 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
530 }
531 }
532
533 /**
534 * amdgpu_device_indirect_rreg - read an indirect register
535 *
536 * @adev: amdgpu_device pointer
537 * @reg_addr: indirect register address to read from
538 *
539 * Returns the value of indirect register @reg_addr
540 */
541 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
542 u32 reg_addr)
543 {
544 unsigned long flags, pcie_index, pcie_data;
545 void __iomem *pcie_index_offset;
546 void __iomem *pcie_data_offset;
547 u32 r;
548
549 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
550 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
551
552 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
553 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
554 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
555
556 writel(reg_addr, pcie_index_offset);
557 readl(pcie_index_offset);
558 r = readl(pcie_data_offset);
559 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
560
561 return r;
562 }
563
564 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
565 u64 reg_addr)
566 {
567 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
568 u32 r;
569 void __iomem *pcie_index_offset;
570 void __iomem *pcie_index_hi_offset;
571 void __iomem *pcie_data_offset;
572
573 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
574 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
575 if (adev->nbio.funcs->get_pcie_index_hi_offset)
576 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
577 else
578 pcie_index_hi = 0;
579
580 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
581 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
582 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
583 if (pcie_index_hi != 0)
584 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
585 pcie_index_hi * 4;
586
587 writel(reg_addr, pcie_index_offset);
588 readl(pcie_index_offset);
589 if (pcie_index_hi != 0) {
590 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
591 readl(pcie_index_hi_offset);
592 }
593 r = readl(pcie_data_offset);
594
595 /* clear the high bits */
596 if (pcie_index_hi != 0) {
597 writel(0, pcie_index_hi_offset);
598 readl(pcie_index_hi_offset);
599 }
600
601 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
602
603 return r;
604 }
605
606 /**
607 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
608 *
609 * @adev: amdgpu_device pointer
610 * @reg_addr: indirect register address to read from
611 *
612 * Returns the value of indirect register @reg_addr
613 */
614 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
615 u32 reg_addr)
616 {
617 unsigned long flags, pcie_index, pcie_data;
618 void __iomem *pcie_index_offset;
619 void __iomem *pcie_data_offset;
620 u64 r;
621
622 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
623 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
624
625 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
626 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
627 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
628
629 /* read low 32 bits */
630 writel(reg_addr, pcie_index_offset);
631 readl(pcie_index_offset);
632 r = readl(pcie_data_offset);
633 /* read high 32 bits */
634 writel(reg_addr + 4, pcie_index_offset);
635 readl(pcie_index_offset);
636 r |= ((u64)readl(pcie_data_offset) << 32);
637 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
638
639 return r;
640 }
641
642 /**
643 * amdgpu_device_indirect_wreg - write an indirect register address
644 *
645 * @adev: amdgpu_device pointer
646 * @reg_addr: indirect register offset
647 * @reg_data: indirect register data
648 *
649 */
650 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
651 u32 reg_addr, u32 reg_data)
652 {
653 unsigned long flags, pcie_index, pcie_data;
654 void __iomem *pcie_index_offset;
655 void __iomem *pcie_data_offset;
656
657 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
658 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
659
660 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
661 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
662 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
663
664 writel(reg_addr, pcie_index_offset);
665 readl(pcie_index_offset);
666 writel(reg_data, pcie_data_offset);
667 readl(pcie_data_offset);
668 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
669 }
670
671 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
672 u64 reg_addr, u32 reg_data)
673 {
674 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
675 void __iomem *pcie_index_offset;
676 void __iomem *pcie_index_hi_offset;
677 void __iomem *pcie_data_offset;
678
679 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
680 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
681 if (adev->nbio.funcs->get_pcie_index_hi_offset)
682 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
683 else
684 pcie_index_hi = 0;
685
686 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
687 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
688 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
689 if (pcie_index_hi != 0)
690 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
691 pcie_index_hi * 4;
692
693 writel(reg_addr, pcie_index_offset);
694 readl(pcie_index_offset);
695 if (pcie_index_hi != 0) {
696 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
697 readl(pcie_index_hi_offset);
698 }
699 writel(reg_data, pcie_data_offset);
700 readl(pcie_data_offset);
701
702 /* clear the high bits */
703 if (pcie_index_hi != 0) {
704 writel(0, pcie_index_hi_offset);
705 readl(pcie_index_hi_offset);
706 }
707
708 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
709 }
710
711 /**
712 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
713 *
714 * @adev: amdgpu_device pointer
715 * @reg_addr: indirect register offset
716 * @reg_data: indirect register data
717 *
718 */
719 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
720 u32 reg_addr, u64 reg_data)
721 {
722 unsigned long flags, pcie_index, pcie_data;
723 void __iomem *pcie_index_offset;
724 void __iomem *pcie_data_offset;
725
726 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
727 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
728
729 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
730 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
731 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
732
733 /* write low 32 bits */
734 writel(reg_addr, pcie_index_offset);
735 readl(pcie_index_offset);
736 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
737 readl(pcie_data_offset);
738 /* write high 32 bits */
739 writel(reg_addr + 4, pcie_index_offset);
740 readl(pcie_index_offset);
741 writel((u32)(reg_data >> 32), pcie_data_offset);
742 readl(pcie_data_offset);
743 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
744 }
745
746 /**
747 * amdgpu_device_get_rev_id - query device rev_id
748 *
749 * @adev: amdgpu_device pointer
750 *
751 * Return device rev_id
752 */
753 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
754 {
755 return adev->nbio.funcs->get_rev_id(adev);
756 }
757
758 /**
759 * amdgpu_invalid_rreg - dummy reg read function
760 *
761 * @adev: amdgpu_device pointer
762 * @reg: offset of register
763 *
764 * Dummy register read function. Used for register blocks
765 * that certain asics don't have (all asics).
766 * Returns the value in the register.
767 */
768 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
769 {
770 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
771 BUG();
772 return 0;
773 }
774
775 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
776 {
777 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
778 BUG();
779 return 0;
780 }
781
782 /**
783 * amdgpu_invalid_wreg - dummy reg write function
784 *
785 * @adev: amdgpu_device pointer
786 * @reg: offset of register
787 * @v: value to write to the register
788 *
789 * Dummy register write function. Used for register blocks
790 * that certain asics don't have (all asics).
791 */
792 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
793 {
794 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
795 reg, v);
796 BUG();
797 }
798
799 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
800 {
801 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
802 reg, v);
803 BUG();
804 }
805
806 /**
807 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
808 *
809 * @adev: amdgpu_device pointer
810 * @reg: offset of register
811 *
812 * Dummy register read function. Used for register blocks
813 * that certain asics don't have (all asics).
814 * Returns the value in the register.
815 */
816 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
817 {
818 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
819 BUG();
820 return 0;
821 }
822
823 /**
824 * amdgpu_invalid_wreg64 - dummy reg write function
825 *
826 * @adev: amdgpu_device pointer
827 * @reg: offset of register
828 * @v: value to write to the register
829 *
830 * Dummy register write function. Used for register blocks
831 * that certain asics don't have (all asics).
832 */
833 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
834 {
835 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
836 reg, v);
837 BUG();
838 }
839
840 /**
841 * amdgpu_block_invalid_rreg - dummy reg read function
842 *
843 * @adev: amdgpu_device pointer
844 * @block: offset of instance
845 * @reg: offset of register
846 *
847 * Dummy register read function. Used for register blocks
848 * that certain asics don't have (all asics).
849 * Returns the value in the register.
850 */
851 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
852 uint32_t block, uint32_t reg)
853 {
854 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
855 reg, block);
856 BUG();
857 return 0;
858 }
859
860 /**
861 * amdgpu_block_invalid_wreg - dummy reg write function
862 *
863 * @adev: amdgpu_device pointer
864 * @block: offset of instance
865 * @reg: offset of register
866 * @v: value to write to the register
867 *
868 * Dummy register write function. Used for register blocks
869 * that certain asics don't have (all asics).
870 */
871 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
872 uint32_t block,
873 uint32_t reg, uint32_t v)
874 {
875 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
876 reg, block, v);
877 BUG();
878 }
879
880 /**
881 * amdgpu_device_asic_init - Wrapper for atom asic_init
882 *
883 * @adev: amdgpu_device pointer
884 *
885 * Does any asic specific work and then calls atom asic init.
886 */
887 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
888 {
889 int ret;
890
891 amdgpu_asic_pre_asic_init(adev);
892
893 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) ||
894 adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) {
895 amdgpu_psp_wait_for_bootloader(adev);
896 ret = amdgpu_atomfirmware_asic_init(adev, true);
897 return ret;
898 } else {
899 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
900 }
901
902 return 0;
903 }
904
905 /**
906 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
907 *
908 * @adev: amdgpu_device pointer
909 *
910 * Allocates a scratch page of VRAM for use by various things in the
911 * driver.
912 */
913 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
914 {
915 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
916 AMDGPU_GEM_DOMAIN_VRAM |
917 AMDGPU_GEM_DOMAIN_GTT,
918 &adev->mem_scratch.robj,
919 &adev->mem_scratch.gpu_addr,
920 (void **)&adev->mem_scratch.ptr);
921 }
922
923 /**
924 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
925 *
926 * @adev: amdgpu_device pointer
927 *
928 * Frees the VRAM scratch page.
929 */
930 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
931 {
932 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
933 }
934
935 /**
936 * amdgpu_device_program_register_sequence - program an array of registers.
937 *
938 * @adev: amdgpu_device pointer
939 * @registers: pointer to the register array
940 * @array_size: size of the register array
941 *
942 * Programs an array of registers with AND/OR masks.
943 * This is a helper for setting golden registers.
944 */
945 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
946 const u32 *registers,
947 const u32 array_size)
948 {
949 u32 tmp, reg, and_mask, or_mask;
950 int i;
951
952 if (array_size % 3)
953 return;
954
955 for (i = 0; i < array_size; i += 3) {
956 reg = registers[i + 0];
957 and_mask = registers[i + 1];
958 or_mask = registers[i + 2];
959
960 if (and_mask == 0xffffffff) {
961 tmp = or_mask;
962 } else {
963 tmp = RREG32(reg);
964 tmp &= ~and_mask;
965 if (adev->family >= AMDGPU_FAMILY_AI)
966 tmp |= (or_mask & and_mask);
967 else
968 tmp |= or_mask;
969 }
970 WREG32(reg, tmp);
971 }
972 }
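
/*
 * Illustrative only: the @registers array is consumed as {offset, and_mask,
 * or_mask} triplets, so a hypothetical golden-register table might look like
 * the sketch below (mmEXAMPLE_REG_* and the masks are made up, not real
 * golden settings):
 *
 *   static const u32 example_golden_regs[] = {
 *           mmEXAMPLE_REG_A, 0xffffffff, 0x00000001,    <- full replace
 *           mmEXAMPLE_REG_B, 0x0000ff00, 0x00003400,    <- masked update
 *   };
 *
 *   amdgpu_device_program_register_sequence(adev, example_golden_regs,
 *                                           ARRAY_SIZE(example_golden_regs));
 */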
973
974 /**
975 * amdgpu_device_pci_config_reset - reset the GPU
976 *
977 * @adev: amdgpu_device pointer
978 *
979 * Resets the GPU using the pci config reset sequence.
980 * Only applicable to asics prior to vega10.
981 */
982 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
983 {
984 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
985 }
986
987 /**
988 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
989 *
990 * @adev: amdgpu_device pointer
991 *
992 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
993 */
994 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
995 {
996 return pci_reset_function(adev->pdev);
997 }
998
999 /*
1000 * amdgpu_device_wb_*()
1001 * Writeback is the method by which the GPU updates special pages in memory
1002 * with the status of certain GPU events (fences, ring pointers,etc.).
1003 */
1004
1005 /**
1006 * amdgpu_device_wb_fini - Disable Writeback and free memory
1007 *
1008 * @adev: amdgpu_device pointer
1009 *
1010 * Disables Writeback and frees the Writeback memory (all asics).
1011 * Used at driver shutdown.
1012 */
1013 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1014 {
1015 if (adev->wb.wb_obj) {
1016 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1017 &adev->wb.gpu_addr,
1018 (void **)&adev->wb.wb);
1019 adev->wb.wb_obj = NULL;
1020 }
1021 }
1022
1023 /**
1024 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1025 *
1026 * @adev: amdgpu_device pointer
1027 *
1028 * Initializes writeback and allocates writeback memory (all asics).
1029 * Used at driver startup.
1030 * Returns 0 on success or a negative error code on failure.
1031 */
1032 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1033 {
1034 int r;
1035
1036 if (adev->wb.wb_obj == NULL) {
1037 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1038 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1039 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1040 &adev->wb.wb_obj, &adev->wb.gpu_addr,
1041 (void **)&adev->wb.wb);
1042 if (r) {
1043 dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1044 return r;
1045 }
1046
1047 adev->wb.num_wb = AMDGPU_MAX_WB;
1048 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1049
1050 /* clear wb memory */
1051 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1052 }
1053
1054 return 0;
1055 }
1056
1057 /**
1058 * amdgpu_device_wb_get - Allocate a wb entry
1059 *
1060 * @adev: amdgpu_device pointer
1061 * @wb: wb index
1062 *
1063 * Allocate a wb slot for use by the driver (all asics).
1064 * Returns 0 on success or -EINVAL on failure.
1065 */
1066 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1067 {
1068 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1069
1070 if (offset < adev->wb.num_wb) {
1071 __set_bit(offset, adev->wb.used);
1072 *wb = offset << 3; /* convert to dw offset */
1073 return 0;
1074 } else {
1075 return -EINVAL;
1076 }
1077 }
1078
1079 /**
1080 * amdgpu_device_wb_free - Free a wb entry
1081 *
1082 * @adev: amdgpu_device pointer
1083 * @wb: wb index
1084 *
1085 * Free a wb slot allocated for use by the driver (all asics)
1086 */
1087 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1088 {
1089 wb >>= 3;
1090 if (wb < adev->wb.num_wb)
1091 __clear_bit(wb, adev->wb.used);
1092 }
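
/*
 * Illustrative only: a minimal sketch of the writeback slot lifecycle with
 * the helpers above. A ring or IP block would normally keep the returned
 * dword offset and derive from it both the GPU address the hardware writes
 * to and the CPU location it polls:
 *
 *   u32 wb_idx;
 *
 *   if (!amdgpu_device_wb_get(adev, &wb_idx)) {
 *           u64 wb_gpu_addr = adev->wb.gpu_addr + (wb_idx * 4);
 *
 *           ... point the hardware at wb_gpu_addr and poll
 *               adev->wb.wb[wb_idx] from the CPU side ...
 *
 *           amdgpu_device_wb_free(adev, wb_idx);
 *   }
 */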
1093
1094 /**
1095 * amdgpu_device_resize_fb_bar - try to resize FB BAR
1096 *
1097 * @adev: amdgpu_device pointer
1098 *
1099 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1100 * to fail, but if any of the BARs is not accessible after the resize we abort
1101 * driver loading by returning -ENODEV.
1102 */
1103 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1104 {
1105 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1106 struct pci_bus *root;
1107 struct resource *res;
1108 unsigned int i;
1109 u16 cmd;
1110 int r;
1111
1112 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
1113 return 0;
1114
1115 /* Bypass for VF */
1116 if (amdgpu_sriov_vf(adev))
1117 return 0;
1118
1119 /* resizing on Dell G5 SE platforms causes problems with runtime pm */
1120 if ((amdgpu_runtime_pm != 0) &&
1121 adev->pdev->vendor == PCI_VENDOR_ID_ATI &&
1122 adev->pdev->device == 0x731f &&
1123 adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL)
1124 return 0;
1125
1126 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
1127 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
1128 DRM_WARN("System can't access extended configuration space,please check!!\n");
1129
1130 /* skip if the bios has already enabled large BAR */
1131 if (adev->gmc.real_vram_size &&
1132 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1133 return 0;
1134
1135 /* Check if the root BUS has 64bit memory resources */
1136 root = adev->pdev->bus;
1137 while (root->parent)
1138 root = root->parent;
1139
1140 pci_bus_for_each_resource(root, res, i) {
1141 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1142 res->start > 0x100000000ull)
1143 break;
1144 }
1145
1146 /* Trying to resize is pointless without a root hub window above 4GB */
1147 if (!res)
1148 return 0;
1149
1150 /* Limit the BAR size to what is available */
1151 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1152 rbar_size);
1153
1154 /* Disable memory decoding while we change the BAR addresses and size */
1155 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1156 pci_write_config_word(adev->pdev, PCI_COMMAND,
1157 cmd & ~PCI_COMMAND_MEMORY);
1158
1159 /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1160 amdgpu_doorbell_fini(adev);
1161 if (adev->asic_type >= CHIP_BONAIRE)
1162 pci_release_resource(adev->pdev, 2);
1163
1164 pci_release_resource(adev->pdev, 0);
1165
1166 r = pci_resize_resource(adev->pdev, 0, rbar_size);
1167 if (r == -ENOSPC)
1168 DRM_INFO("Not enough PCI address space for a large BAR.");
1169 else if (r && r != -ENOTSUPP)
1170 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1171
1172 pci_assign_unassigned_bus_resources(adev->pdev->bus);
1173
1174 /* When the doorbell or fb BAR isn't available we have no chance of
1175 * using the device.
1176 */
1177 r = amdgpu_doorbell_init(adev);
1178 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1179 return -ENODEV;
1180
1181 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1182
1183 return 0;
1184 }
1185
1186 static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
1187 {
1188 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
1189 return false;
1190
1191 return true;
1192 }
1193
1194 /*
1195 * GPU helpers function.
1196 */
1197 /**
1198 * amdgpu_device_need_post - check if the hw needs post or not
1199 *
1200 * @adev: amdgpu_device pointer
1201 *
1202 * Check if the asic requires posting: either it has not been initialized
1203 * at driver startup, or a hw reset has been performed (all asics).
1204 * Returns true if post is needed, false if not.
1205 */
1206 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1207 {
1208 uint32_t reg;
1209
1210 if (amdgpu_sriov_vf(adev))
1211 return false;
1212
1213 if (!amdgpu_device_read_bios(adev))
1214 return false;
1215
1216 if (amdgpu_passthrough(adev)) {
1217 /* For FIJI: in the whole-GPU pass-through virtualization case, after a VM
1218 * reboot some old SMC firmware still needs the driver to do vPost, otherwise
1219 * the GPU hangs. SMC firmware versions above 22.15 don't have this flaw, so
1220 * force vPost for SMC versions below 22.15.
1221 */
1222 if (adev->asic_type == CHIP_FIJI) {
1223 int err;
1224 uint32_t fw_ver;
1225
1226 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1227 /* force vPost if an error occurred */
1228 if (err)
1229 return true;
1230
1231 fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1232 release_firmware(adev->pm.fw);
1233 if (fw_ver < 0x00160e00)
1234 return true;
1235 }
1236 }
1237
1238 /* Don't post if we need to reset whole hive on init */
1239 if (adev->gmc.xgmi.pending_reset)
1240 return false;
1241
1242 if (adev->has_hw_reset) {
1243 adev->has_hw_reset = false;
1244 return true;
1245 }
1246
1247 /* bios scratch used on CIK+ */
1248 if (adev->asic_type >= CHIP_BONAIRE)
1249 return amdgpu_atombios_scratch_need_asic_init(adev);
1250
1251 /* check MEM_SIZE for older asics */
1252 reg = amdgpu_asic_get_config_memsize(adev);
1253
1254 if ((reg != 0) && (reg != 0xffffffff))
1255 return false;
1256
1257 return true;
1258 }
1259
1260 /*
1261 * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic
1262 * speed switching. Until we have confirmation from Intel that a specific host
1263 * supports it, it's safer that we keep it disabled for all.
1264 *
1265 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
1266 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
1267 */
1268 bool amdgpu_device_pcie_dynamic_switching_supported(void)
1269 {
1270 #if IS_ENABLED(CONFIG_X86)
1271 struct cpuinfo_x86 *c = &cpu_data(0);
1272
1273 if (c->x86_vendor == X86_VENDOR_INTEL)
1274 return false;
1275 #endif
1276 return true;
1277 }
1278
1279 /**
1280 * amdgpu_device_should_use_aspm - check if the device should program ASPM
1281 *
1282 * @adev: amdgpu_device pointer
1283 *
1284 * Confirm whether the module parameter and pcie bridge agree that ASPM should
1285 * be set for this device.
1286 *
1287 * Returns true if it should be used or false if not.
1288 */
1289 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1290 {
1291 switch (amdgpu_aspm) {
1292 case -1:
1293 break;
1294 case 0:
1295 return false;
1296 case 1:
1297 return true;
1298 default:
1299 return false;
1300 }
1301 return pcie_aspm_enabled(adev->pdev);
1302 }
1303
1304 bool amdgpu_device_aspm_support_quirk(void)
1305 {
1306 #if IS_ENABLED(CONFIG_X86)
1307 struct cpuinfo_x86 *c = &cpu_data(0);
1308
1309 return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE);
1310 #else
1311 return true;
1312 #endif
1313 }
1314
1315 /* if we get transitioned to only one device, take VGA back */
1316 /**
1317 * amdgpu_device_vga_set_decode - enable/disable vga decode
1318 *
1319 * @pdev: PCI device pointer
1320 * @state: enable/disable vga decode
1321 *
1322 * Enable/disable vga decode (all asics).
1323 * Returns VGA resource flags.
1324 */
1325 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1326 bool state)
1327 {
1328 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1329
1330 amdgpu_asic_set_vga_state(adev, state);
1331 if (state)
1332 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1333 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1334 else
1335 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1336 }
1337
1338 /**
1339 * amdgpu_device_check_block_size - validate the vm block size
1340 *
1341 * @adev: amdgpu_device pointer
1342 *
1343 * Validates the vm block size specified via module parameter.
1344 * The vm block size defines number of bits in page table versus page directory,
1345 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1346 * page table and the remaining bits are in the page directory.
1347 */
1348 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1349 {
1350 /* defines number of bits in page table versus page directory,
1351 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1352 * page table and the remaining bits are in the page directory
1353 */
1354 if (amdgpu_vm_block_size == -1)
1355 return;
1356
1357 if (amdgpu_vm_block_size < 9) {
1358 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1359 amdgpu_vm_block_size);
1360 amdgpu_vm_block_size = -1;
1361 }
1362 }
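
/*
 * Worked example for the check above (derived only from the comment's
 * 4KB-page assumption): with the minimum block size of 9 bits and a 12-bit
 * page offset, one page-table block covers 2^(9 + 12) bytes = 2MB of virtual
 * address space per page-directory entry; each extra bit doubles that.
 */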
1363
1364 /**
1365 * amdgpu_device_check_vm_size - validate the vm size
1366 *
1367 * @adev: amdgpu_device pointer
1368 *
1369 * Validates the vm size in GB specified via module parameter.
1370 * The VM size is the size of the GPU virtual memory space in GB.
1371 */
1372 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1373 {
1374 /* no need to check the default value */
1375 if (amdgpu_vm_size == -1)
1376 return;
1377
1378 if (amdgpu_vm_size < 1) {
1379 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1380 amdgpu_vm_size);
1381 amdgpu_vm_size = -1;
1382 }
1383 }
1384
1385 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1386 {
1387 struct sysinfo si;
1388 bool is_os_64 = (sizeof(void *) == 8);
1389 uint64_t total_memory;
1390 uint64_t dram_size_seven_GB = 0x1B8000000;
1391 uint64_t dram_size_three_GB = 0xB8000000;
1392
1393 if (amdgpu_smu_memory_pool_size == 0)
1394 return;
1395
1396 if (!is_os_64) {
1397 DRM_WARN("Not 64-bit OS, feature not supported\n");
1398 goto def_value;
1399 }
1400 si_meminfo(&si);
1401 total_memory = (uint64_t)si.totalram * si.mem_unit;
1402
1403 if ((amdgpu_smu_memory_pool_size == 1) ||
1404 (amdgpu_smu_memory_pool_size == 2)) {
1405 if (total_memory < dram_size_three_GB)
1406 goto def_value1;
1407 } else if ((amdgpu_smu_memory_pool_size == 4) ||
1408 (amdgpu_smu_memory_pool_size == 8)) {
1409 if (total_memory < dram_size_seven_GB)
1410 goto def_value1;
1411 } else {
1412 DRM_WARN("Smu memory pool size not supported\n");
1413 goto def_value;
1414 }
1415 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1416
1417 return;
1418
1419 def_value1:
1420 DRM_WARN("No enough system memory\n");
1421 def_value:
1422 adev->pm.smu_prv_buffer_size = 0;
1423 }
1424
1425 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1426 {
1427 if (!(adev->flags & AMD_IS_APU) ||
1428 adev->asic_type < CHIP_RAVEN)
1429 return 0;
1430
1431 switch (adev->asic_type) {
1432 case CHIP_RAVEN:
1433 if (adev->pdev->device == 0x15dd)
1434 adev->apu_flags |= AMD_APU_IS_RAVEN;
1435 if (adev->pdev->device == 0x15d8)
1436 adev->apu_flags |= AMD_APU_IS_PICASSO;
1437 break;
1438 case CHIP_RENOIR:
1439 if ((adev->pdev->device == 0x1636) ||
1440 (adev->pdev->device == 0x164c))
1441 adev->apu_flags |= AMD_APU_IS_RENOIR;
1442 else
1443 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1444 break;
1445 case CHIP_VANGOGH:
1446 adev->apu_flags |= AMD_APU_IS_VANGOGH;
1447 break;
1448 case CHIP_YELLOW_CARP:
1449 break;
1450 case CHIP_CYAN_SKILLFISH:
1451 if ((adev->pdev->device == 0x13FE) ||
1452 (adev->pdev->device == 0x143F))
1453 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
1454 break;
1455 default:
1456 break;
1457 }
1458
1459 return 0;
1460 }
1461
1462 /**
1463 * amdgpu_device_check_arguments - validate module params
1464 *
1465 * @adev: amdgpu_device pointer
1466 *
1467 * Validates certain module parameters and updates
1468 * the associated values used by the driver (all asics).
1469 */
1470 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1471 {
1472 if (amdgpu_sched_jobs < 4) {
1473 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1474 amdgpu_sched_jobs);
1475 amdgpu_sched_jobs = 4;
1476 } else if (!is_power_of_2(amdgpu_sched_jobs)) {
1477 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1478 amdgpu_sched_jobs);
1479 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1480 }
1481
1482 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1483 /* gart size must be greater or equal to 32M */
1484 dev_warn(adev->dev, "gart size (%d) too small\n",
1485 amdgpu_gart_size);
1486 amdgpu_gart_size = -1;
1487 }
1488
1489 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1490 /* gtt size must be greater or equal to 32M */
1491 dev_warn(adev->dev, "gtt size (%d) too small\n",
1492 amdgpu_gtt_size);
1493 amdgpu_gtt_size = -1;
1494 }
1495
1496 /* valid range is between 4 and 9 inclusive */
1497 if (amdgpu_vm_fragment_size != -1 &&
1498 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1499 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1500 amdgpu_vm_fragment_size = -1;
1501 }
1502
1503 if (amdgpu_sched_hw_submission < 2) {
1504 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1505 amdgpu_sched_hw_submission);
1506 amdgpu_sched_hw_submission = 2;
1507 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1508 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1509 amdgpu_sched_hw_submission);
1510 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1511 }
1512
1513 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
1514 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
1515 amdgpu_reset_method = -1;
1516 }
1517
1518 amdgpu_device_check_smu_prv_buffer_size(adev);
1519
1520 amdgpu_device_check_vm_size(adev);
1521
1522 amdgpu_device_check_block_size(adev);
1523
1524 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1525
1526 return 0;
1527 }
1528
1529 /**
1530 * amdgpu_switcheroo_set_state - set switcheroo state
1531 *
1532 * @pdev: pci dev pointer
1533 * @state: vga_switcheroo state
1534 *
1535 * Callback for the switcheroo driver. Suspends or resumes
1536 * the asics before or after it is powered up using ACPI methods.
1537 */
1538 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1539 enum vga_switcheroo_state state)
1540 {
1541 struct drm_device *dev = pci_get_drvdata(pdev);
1542 int r;
1543
1544 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
1545 return;
1546
1547 if (state == VGA_SWITCHEROO_ON) {
1548 pr_info("switched on\n");
1549 /* don't suspend or resume card normally */
1550 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1551
1552 pci_set_power_state(pdev, PCI_D0);
1553 amdgpu_device_load_pci_state(pdev);
1554 r = pci_enable_device(pdev);
1555 if (r)
1556 DRM_WARN("pci_enable_device failed (%d)\n", r);
1557 amdgpu_device_resume(dev, true);
1558
1559 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1560 } else {
1561 pr_info("switched off\n");
1562 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1563 amdgpu_device_prepare(dev);
1564 amdgpu_device_suspend(dev, true);
1565 amdgpu_device_cache_pci_state(pdev);
1566 /* Shut down the device */
1567 pci_disable_device(pdev);
1568 pci_set_power_state(pdev, PCI_D3cold);
1569 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1570 }
1571 }
1572
1573 /**
1574 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1575 *
1576 * @pdev: pci dev pointer
1577 *
1578 * Callback for the switcheroo driver. Checks whether the switcheroo
1579 * state can be changed.
1580 * Returns true if the state can be changed, false if not.
1581 */
1582 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1583 {
1584 struct drm_device *dev = pci_get_drvdata(pdev);
1585
1586 /*
1587 * FIXME: open_count is protected by drm_global_mutex but that would lead to
1588 * locking inversion with the driver load path. And the access here is
1589 * completely racy anyway. So don't bother with locking for now.
1590 */
1591 return atomic_read(&dev->open_count) == 0;
1592 }
1593
1594 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1595 .set_gpu_state = amdgpu_switcheroo_set_state,
1596 .reprobe = NULL,
1597 .can_switch = amdgpu_switcheroo_can_switch,
1598 };
1599
1600 /**
1601 * amdgpu_device_ip_set_clockgating_state - set the CG state
1602 *
1603 * @dev: amdgpu_device pointer
1604 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1605 * @state: clockgating state (gate or ungate)
1606 *
1607 * Sets the requested clockgating state for all instances of
1608 * the hardware IP specified.
1609 * Returns the error code from the last instance.
1610 */
1611 int amdgpu_device_ip_set_clockgating_state(void *dev,
1612 enum amd_ip_block_type block_type,
1613 enum amd_clockgating_state state)
1614 {
1615 struct amdgpu_device *adev = dev;
1616 int i, r = 0;
1617
1618 for (i = 0; i < adev->num_ip_blocks; i++) {
1619 if (!adev->ip_blocks[i].status.valid)
1620 continue;
1621 if (adev->ip_blocks[i].version->type != block_type)
1622 continue;
1623 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1624 continue;
1625 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1626 (void *)adev, state);
1627 if (r)
1628 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1629 adev->ip_blocks[i].version->funcs->name, r);
1630 }
1631 return r;
1632 }
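
/*
 * Illustrative only: a hedged sketch of a typical call, gating clocks for
 * one IP type (the block type chosen here is an arbitrary example):
 *
 *   r = amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *                                              AMD_CG_STATE_GATE);
 */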
1633
1634 /**
1635 * amdgpu_device_ip_set_powergating_state - set the PG state
1636 *
1637 * @dev: amdgpu_device pointer
1638 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1639 * @state: powergating state (gate or ungate)
1640 *
1641 * Sets the requested powergating state for all instances of
1642 * the hardware IP specified.
1643 * Returns the error code from the last instance.
1644 */
1645 int amdgpu_device_ip_set_powergating_state(void *dev,
1646 enum amd_ip_block_type block_type,
1647 enum amd_powergating_state state)
1648 {
1649 struct amdgpu_device *adev = dev;
1650 int i, r = 0;
1651
1652 for (i = 0; i < adev->num_ip_blocks; i++) {
1653 if (!adev->ip_blocks[i].status.valid)
1654 continue;
1655 if (adev->ip_blocks[i].version->type != block_type)
1656 continue;
1657 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1658 continue;
1659 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1660 (void *)adev, state);
1661 if (r)
1662 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1663 adev->ip_blocks[i].version->funcs->name, r);
1664 }
1665 return r;
1666 }
1667
1668 /**
1669 * amdgpu_device_ip_get_clockgating_state - get the CG state
1670 *
1671 * @adev: amdgpu_device pointer
1672 * @flags: clockgating feature flags
1673 *
1674 * Walks the list of IPs on the device and updates the clockgating
1675 * flags for each IP.
1676 * Updates @flags with the feature flags for each hardware IP where
1677 * clockgating is enabled.
1678 */
1679 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1680 u64 *flags)
1681 {
1682 int i;
1683
1684 for (i = 0; i < adev->num_ip_blocks; i++) {
1685 if (!adev->ip_blocks[i].status.valid)
1686 continue;
1687 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1688 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1689 }
1690 }
1691
1692 /**
1693 * amdgpu_device_ip_wait_for_idle - wait for idle
1694 *
1695 * @adev: amdgpu_device pointer
1696 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1697 *
1698 * Waits for the requested hardware IP to be idle.
1699 * Returns 0 for success or a negative error code on failure.
1700 */
1701 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1702 enum amd_ip_block_type block_type)
1703 {
1704 int i, r;
1705
1706 for (i = 0; i < adev->num_ip_blocks; i++) {
1707 if (!adev->ip_blocks[i].status.valid)
1708 continue;
1709 if (adev->ip_blocks[i].version->type == block_type) {
1710 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1711 if (r)
1712 return r;
1713 break;
1714 }
1715 }
1716 return 0;
1717
1718 }
1719
1720 /**
1721 * amdgpu_device_ip_is_idle - is the hardware IP idle
1722 *
1723 * @adev: amdgpu_device pointer
1724 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1725 *
1726 * Check if the hardware IP is idle or not.
1727 * Returns true if the IP is idle, false if not.
1728 */
1729 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1730 enum amd_ip_block_type block_type)
1731 {
1732 int i;
1733
1734 for (i = 0; i < adev->num_ip_blocks; i++) {
1735 if (!adev->ip_blocks[i].status.valid)
1736 continue;
1737 if (adev->ip_blocks[i].version->type == block_type)
1738 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1739 }
1740 return true;
1741
1742 }
1743
1744 /**
1745 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1746 *
1747 * @adev: amdgpu_device pointer
1748 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1749 *
1750 * Returns a pointer to the hardware IP block structure
1751 * if it exists for the asic, otherwise NULL.
1752 */
1753 struct amdgpu_ip_block *
1754 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1755 enum amd_ip_block_type type)
1756 {
1757 int i;
1758
1759 for (i = 0; i < adev->num_ip_blocks; i++)
1760 if (adev->ip_blocks[i].version->type == type)
1761 return &adev->ip_blocks[i];
1762
1763 return NULL;
1764 }
1765
1766 /**
1767 * amdgpu_device_ip_block_version_cmp
1768 *
1769 * @adev: amdgpu_device pointer
1770 * @type: enum amd_ip_block_type
1771 * @major: major version
1772 * @minor: minor version
1773 *
1774 * return 0 if equal or greater
1775 * return 1 if smaller or the ip_block doesn't exist
1776 */
1777 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1778 enum amd_ip_block_type type,
1779 u32 major, u32 minor)
1780 {
1781 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1782
1783 if (ip_block && ((ip_block->version->major > major) ||
1784 ((ip_block->version->major == major) &&
1785 (ip_block->version->minor >= minor))))
1786 return 0;
1787
1788 return 1;
1789 }
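/*
 * Illustrative usage sketch (the IP type and version numbers are hypothetical,
 * not taken from this file): gate a feature on a minimum IP block version
 * using the comparison helper above.
 *
 *	if (!amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX, 9, 1)) {
 *		// GFX block is version 9.1 or newer
 *	} else {
 *		// GFX block is older or not present
 *	}
 */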
1790
1791 /**
1792 * amdgpu_device_ip_block_add
1793 *
1794 * @adev: amdgpu_device pointer
1795 * @ip_block_version: pointer to the IP to add
1796 *
1797 * Adds the IP block driver information to the collection of IPs
1798 * on the asic.
1799 */
1800 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1801 const struct amdgpu_ip_block_version *ip_block_version)
1802 {
1803 if (!ip_block_version)
1804 return -EINVAL;
1805
1806 switch (ip_block_version->type) {
1807 case AMD_IP_BLOCK_TYPE_VCN:
1808 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
1809 return 0;
1810 break;
1811 case AMD_IP_BLOCK_TYPE_JPEG:
1812 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
1813 return 0;
1814 break;
1815 default:
1816 break;
1817 }
1818
1819 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1820 ip_block_version->funcs->name);
1821
1822 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1823
1824 return 0;
1825 }
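/*
 * Illustrative sketch of the calling pattern used by the per-asic setup
 * routines (the block names below are hypothetical; real ASICs register their
 * own common/GMC/IH/etc. ip_block_version structs in a fixed order):
 *
 *	r = amdgpu_device_ip_block_add(adev, &example_common_ip_block);
 *	if (r)
 *		return r;
 *	r = amdgpu_device_ip_block_add(adev, &example_gmc_ip_block);
 *	if (r)
 *		return r;
 *
 * Note that VCN/JPEG blocks harvested via adev->harvest_ip_mask are silently
 * skipped above, so a successful return does not always add a block.
 */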
1826
1827 /**
1828 * amdgpu_device_enable_virtual_display - enable virtual display feature
1829 *
1830 * @adev: amdgpu_device pointer
1831 *
1832 * Enables the virtual display feature if the user has enabled it via
1833 * the module parameter virtual_display. This feature provides a virtual
1834 * display hardware on headless boards or in virtualized environments.
1835 * This function parses and validates the configuration string specified by
1836 * the user and configures the virtual display configuration (number of
1837 * virtual connectors, crtcs, etc.) specified.
1838 */
1839 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1840 {
1841 adev->enable_virtual_display = false;
1842
1843 if (amdgpu_virtual_display) {
1844 const char *pci_address_name = pci_name(adev->pdev);
1845 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1846
1847 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1848 pciaddstr_tmp = pciaddstr;
1849 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1850 pciaddname = strsep(&pciaddname_tmp, ",");
1851 if (!strcmp("all", pciaddname)
1852 || !strcmp(pci_address_name, pciaddname)) {
1853 long num_crtc;
1854 int res = -1;
1855
1856 adev->enable_virtual_display = true;
1857
1858 if (pciaddname_tmp)
1859 res = kstrtol(pciaddname_tmp, 10,
1860 &num_crtc);
1861
1862 if (!res) {
1863 if (num_crtc < 1)
1864 num_crtc = 1;
1865 if (num_crtc > 6)
1866 num_crtc = 6;
1867 adev->mode_info.num_crtc = num_crtc;
1868 } else {
1869 adev->mode_info.num_crtc = 1;
1870 }
1871 break;
1872 }
1873 }
1874
1875 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1876 amdgpu_virtual_display, pci_address_name,
1877 adev->enable_virtual_display, adev->mode_info.num_crtc);
1878
1879 kfree(pciaddstr);
1880 }
1881 }
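/*
 * As parsed above, the virtual_display module parameter is a semicolon
 * separated list of "<pci address>[,<num_crtc>]" entries, where "all" matches
 * any device and num_crtc is clamped to 1..6 (defaulting to 1). The PCI
 * addresses below are hypothetical examples:
 *
 *	amdgpu.virtual_display=0000:03:00.0,2
 *	amdgpu.virtual_display=all,4
 *	amdgpu.virtual_display=0000:03:00.0,2;0000:04:00.0,1
 */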
1882
1883 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
1884 {
1885 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
1886 adev->mode_info.num_crtc = 1;
1887 adev->enable_virtual_display = true;
1888 DRM_INFO("virtual_display:%d, num_crtc:%d\n",
1889 adev->enable_virtual_display, adev->mode_info.num_crtc);
1890 }
1891 }
1892
1893 /**
1894 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1895 *
1896 * @adev: amdgpu_device pointer
1897 *
1898 * Parses the asic configuration parameters specified in the gpu info
1899 * firmware and makes them available to the driver for use in configuring
1900 * the asic.
1901 * Returns 0 on success, -EINVAL on failure.
1902 */
1903 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1904 {
1905 const char *chip_name;
1906 char fw_name[40];
1907 int err;
1908 const struct gpu_info_firmware_header_v1_0 *hdr;
1909
1910 adev->firmware.gpu_info_fw = NULL;
1911
1912 if (adev->mman.discovery_bin)
1913 return 0;
1914
1915 switch (adev->asic_type) {
1916 default:
1917 return 0;
1918 case CHIP_VEGA10:
1919 chip_name = "vega10";
1920 break;
1921 case CHIP_VEGA12:
1922 chip_name = "vega12";
1923 break;
1924 case CHIP_RAVEN:
1925 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1926 chip_name = "raven2";
1927 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1928 chip_name = "picasso";
1929 else
1930 chip_name = "raven";
1931 break;
1932 case CHIP_ARCTURUS:
1933 chip_name = "arcturus";
1934 break;
1935 case CHIP_NAVI12:
1936 chip_name = "navi12";
1937 break;
1938 }
1939
1940 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1941 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name);
1942 if (err) {
1943 dev_err(adev->dev,
1944 "Failed to get gpu_info firmware \"%s\"\n",
1945 fw_name);
1946 goto out;
1947 }
1948
1949 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1950 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1951
1952 switch (hdr->version_major) {
1953 case 1:
1954 {
1955 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1956 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1957 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1958
1959 /*
1960 * Should be dropped when DAL no longer needs it.
1961 */
1962 if (adev->asic_type == CHIP_NAVI12)
1963 goto parse_soc_bounding_box;
1964
1965 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1966 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1967 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1968 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1969 adev->gfx.config.max_texture_channel_caches =
1970 le32_to_cpu(gpu_info_fw->gc_num_tccs);
1971 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1972 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1973 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1974 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1975 adev->gfx.config.double_offchip_lds_buf =
1976 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1977 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1978 adev->gfx.cu_info.max_waves_per_simd =
1979 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1980 adev->gfx.cu_info.max_scratch_slots_per_cu =
1981 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1982 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1983 if (hdr->version_minor >= 1) {
1984 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1985 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1986 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1987 adev->gfx.config.num_sc_per_sh =
1988 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1989 adev->gfx.config.num_packer_per_sc =
1990 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1991 }
1992
1993 parse_soc_bounding_box:
1994 /*
1995 * soc bounding box info is not integrated in discovery table,
1996 * so we always need to parse it from gpu info firmware if needed.
1997 */
1998 if (hdr->version_minor == 2) {
1999 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2000 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2001 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2002 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2003 }
2004 break;
2005 }
2006 default:
2007 dev_err(adev->dev,
2008 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2009 err = -EINVAL;
2010 goto out;
2011 }
2012 out:
2013 return err;
2014 }
2015
2016 /**
2017 * amdgpu_device_ip_early_init - run early init for hardware IPs
2018 *
2019 * @adev: amdgpu_device pointer
2020 *
2021 * Early initialization pass for hardware IPs. The hardware IPs that make
2022 * up each asic are discovered and each IP's early_init callback is run. This
2023 * is the first stage in initializing the asic.
2024 * Returns 0 on success, negative error code on failure.
2025 */
2026 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2027 {
2028 struct pci_dev *parent;
2029 int i, r;
2030 bool total;
2031
2032 amdgpu_device_enable_virtual_display(adev);
2033
2034 if (amdgpu_sriov_vf(adev)) {
2035 r = amdgpu_virt_request_full_gpu(adev, true);
2036 if (r)
2037 return r;
2038 }
2039
2040 switch (adev->asic_type) {
2041 #ifdef CONFIG_DRM_AMDGPU_SI
2042 case CHIP_VERDE:
2043 case CHIP_TAHITI:
2044 case CHIP_PITCAIRN:
2045 case CHIP_OLAND:
2046 case CHIP_HAINAN:
2047 adev->family = AMDGPU_FAMILY_SI;
2048 r = si_set_ip_blocks(adev);
2049 if (r)
2050 return r;
2051 break;
2052 #endif
2053 #ifdef CONFIG_DRM_AMDGPU_CIK
2054 case CHIP_BONAIRE:
2055 case CHIP_HAWAII:
2056 case CHIP_KAVERI:
2057 case CHIP_KABINI:
2058 case CHIP_MULLINS:
2059 if (adev->flags & AMD_IS_APU)
2060 adev->family = AMDGPU_FAMILY_KV;
2061 else
2062 adev->family = AMDGPU_FAMILY_CI;
2063
2064 r = cik_set_ip_blocks(adev);
2065 if (r)
2066 return r;
2067 break;
2068 #endif
2069 case CHIP_TOPAZ:
2070 case CHIP_TONGA:
2071 case CHIP_FIJI:
2072 case CHIP_POLARIS10:
2073 case CHIP_POLARIS11:
2074 case CHIP_POLARIS12:
2075 case CHIP_VEGAM:
2076 case CHIP_CARRIZO:
2077 case CHIP_STONEY:
2078 if (adev->flags & AMD_IS_APU)
2079 adev->family = AMDGPU_FAMILY_CZ;
2080 else
2081 adev->family = AMDGPU_FAMILY_VI;
2082
2083 r = vi_set_ip_blocks(adev);
2084 if (r)
2085 return r;
2086 break;
2087 default:
2088 r = amdgpu_discovery_set_ip_blocks(adev);
2089 if (r)
2090 return r;
2091 break;
2092 }
2093
2094 if (amdgpu_has_atpx() &&
2095 (amdgpu_is_atpx_hybrid() ||
2096 amdgpu_has_atpx_dgpu_power_cntl()) &&
2097 ((adev->flags & AMD_IS_APU) == 0) &&
2098 !dev_is_removable(&adev->pdev->dev))
2099 adev->flags |= AMD_IS_PX;
2100
2101 if (!(adev->flags & AMD_IS_APU)) {
2102 parent = pcie_find_root_port(adev->pdev);
2103 adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2104 }
2105
2106
2107 adev->pm.pp_feature = amdgpu_pp_feature_mask;
2108 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2109 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2110 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2111 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2112 if (!amdgpu_device_pcie_dynamic_switching_supported())
2113 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;
2114
2115 total = true;
2116 for (i = 0; i < adev->num_ip_blocks; i++) {
2117 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2118 DRM_WARN("disabled ip block: %d <%s>\n",
2119 i, adev->ip_blocks[i].version->funcs->name);
2120 adev->ip_blocks[i].status.valid = false;
2121 } else {
2122 if (adev->ip_blocks[i].version->funcs->early_init) {
2123 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2124 if (r == -ENOENT) {
2125 adev->ip_blocks[i].status.valid = false;
2126 } else if (r) {
2127 DRM_ERROR("early_init of IP block <%s> failed %d\n",
2128 adev->ip_blocks[i].version->funcs->name, r);
2129 total = false;
2130 } else {
2131 adev->ip_blocks[i].status.valid = true;
2132 }
2133 } else {
2134 adev->ip_blocks[i].status.valid = true;
2135 }
2136 }
2137 /* get the vbios after the asic_funcs are set up */
2138 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2139 r = amdgpu_device_parse_gpu_info_fw(adev);
2140 if (r)
2141 return r;
2142
2143 /* Read BIOS */
2144 if (amdgpu_device_read_bios(adev)) {
2145 if (!amdgpu_get_bios(adev))
2146 return -EINVAL;
2147
2148 r = amdgpu_atombios_init(adev);
2149 if (r) {
2150 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2151 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2152 return r;
2153 }
2154 }
2155
2156 /* get pf2vf msg info at the earliest time */
2157 if (amdgpu_sriov_vf(adev))
2158 amdgpu_virt_init_data_exchange(adev);
2159
2160 }
2161 }
2162 if (!total)
2163 return -ENODEV;
2164
2165 amdgpu_amdkfd_device_probe(adev);
2166 adev->cg_flags &= amdgpu_cg_mask;
2167 adev->pg_flags &= amdgpu_pg_mask;
2168
2169 return 0;
2170 }
2171
2172 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2173 {
2174 int i, r;
2175
2176 for (i = 0; i < adev->num_ip_blocks; i++) {
2177 if (!adev->ip_blocks[i].status.sw)
2178 continue;
2179 if (adev->ip_blocks[i].status.hw)
2180 continue;
2181 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2182 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2183 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2184 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2185 if (r) {
2186 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2187 adev->ip_blocks[i].version->funcs->name, r);
2188 return r;
2189 }
2190 adev->ip_blocks[i].status.hw = true;
2191 }
2192 }
2193
2194 return 0;
2195 }
2196
2197 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2198 {
2199 int i, r;
2200
2201 for (i = 0; i < adev->num_ip_blocks; i++) {
2202 if (!adev->ip_blocks[i].status.sw)
2203 continue;
2204 if (adev->ip_blocks[i].status.hw)
2205 continue;
2206 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2207 if (r) {
2208 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2209 adev->ip_blocks[i].version->funcs->name, r);
2210 return r;
2211 }
2212 adev->ip_blocks[i].status.hw = true;
2213 }
2214
2215 return 0;
2216 }
2217
2218 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2219 {
2220 int r = 0;
2221 int i;
2222 uint32_t smu_version;
2223
2224 if (adev->asic_type >= CHIP_VEGA10) {
2225 for (i = 0; i < adev->num_ip_blocks; i++) {
2226 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2227 continue;
2228
2229 if (!adev->ip_blocks[i].status.sw)
2230 continue;
2231
2232 /* no need to do the fw loading again if already done */
2233 if (adev->ip_blocks[i].status.hw == true)
2234 break;
2235
2236 if (amdgpu_in_reset(adev) || adev->in_suspend) {
2237 r = adev->ip_blocks[i].version->funcs->resume(adev);
2238 if (r) {
2239 DRM_ERROR("resume of IP block <%s> failed %d\n",
2240 adev->ip_blocks[i].version->funcs->name, r);
2241 return r;
2242 }
2243 } else {
2244 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2245 if (r) {
2246 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2247 adev->ip_blocks[i].version->funcs->name, r);
2248 return r;
2249 }
2250 }
2251
2252 adev->ip_blocks[i].status.hw = true;
2253 break;
2254 }
2255 }
2256
2257 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2258 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2259
2260 return r;
2261 }
2262
2263 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2264 {
2265 long timeout;
2266 int r, i;
2267
2268 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2269 struct amdgpu_ring *ring = adev->rings[i];
2270
2271 /* No need to setup the GPU scheduler for rings that don't need it */
2272 if (!ring || ring->no_scheduler)
2273 continue;
2274
2275 switch (ring->funcs->type) {
2276 case AMDGPU_RING_TYPE_GFX:
2277 timeout = adev->gfx_timeout;
2278 break;
2279 case AMDGPU_RING_TYPE_COMPUTE:
2280 timeout = adev->compute_timeout;
2281 break;
2282 case AMDGPU_RING_TYPE_SDMA:
2283 timeout = adev->sdma_timeout;
2284 break;
2285 default:
2286 timeout = adev->video_timeout;
2287 break;
2288 }
2289
2290 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
2291 ring->num_hw_submission, 0,
2292 timeout, adev->reset_domain->wq,
2293 ring->sched_score, ring->name,
2294 adev->dev);
2295 if (r) {
2296 DRM_ERROR("Failed to create scheduler on ring %s.\n",
2297 ring->name);
2298 return r;
2299 }
2300 }
2301
2302 amdgpu_xcp_update_partition_sched_list(adev);
2303
2304 return 0;
2305 }
2306
2307
2308 /**
2309 * amdgpu_device_ip_init - run init for hardware IPs
2310 *
2311 * @adev: amdgpu_device pointer
2312 *
2313 * Main initialization pass for hardware IPs. The list of all the hardware
2314 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2315 * are run. sw_init initializes the software state associated with each IP
2316 * and hw_init initializes the hardware associated with each IP.
2317 * Returns 0 on success, negative error code on failure.
2318 */
2319 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2320 {
2321 int i, r;
2322
2323 r = amdgpu_ras_init(adev);
2324 if (r)
2325 return r;
2326
2327 for (i = 0; i < adev->num_ip_blocks; i++) {
2328 if (!adev->ip_blocks[i].status.valid)
2329 continue;
2330 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2331 if (r) {
2332 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2333 adev->ip_blocks[i].version->funcs->name, r);
2334 goto init_failed;
2335 }
2336 adev->ip_blocks[i].status.sw = true;
2337
2338 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2339 /* need to do common hw init early so everything is set up for gmc */
2340 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2341 if (r) {
2342 DRM_ERROR("hw_init %d failed %d\n", i, r);
2343 goto init_failed;
2344 }
2345 adev->ip_blocks[i].status.hw = true;
2346 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2347 /* need to do gmc hw init early so we can allocate gpu mem */
2348 /* Try to reserve bad pages early */
2349 if (amdgpu_sriov_vf(adev))
2350 amdgpu_virt_exchange_data(adev);
2351
2352 r = amdgpu_device_mem_scratch_init(adev);
2353 if (r) {
2354 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
2355 goto init_failed;
2356 }
2357 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2358 if (r) {
2359 DRM_ERROR("hw_init %d failed %d\n", i, r);
2360 goto init_failed;
2361 }
2362 r = amdgpu_device_wb_init(adev);
2363 if (r) {
2364 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2365 goto init_failed;
2366 }
2367 adev->ip_blocks[i].status.hw = true;
2368
2369 /* right after GMC hw init, we create CSA */
2370 if (adev->gfx.mcbp) {
2371 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2372 AMDGPU_GEM_DOMAIN_VRAM |
2373 AMDGPU_GEM_DOMAIN_GTT,
2374 AMDGPU_CSA_SIZE);
2375 if (r) {
2376 DRM_ERROR("allocate CSA failed %d\n", r);
2377 goto init_failed;
2378 }
2379 }
2380 }
2381 }
2382
2383 if (amdgpu_sriov_vf(adev))
2384 amdgpu_virt_init_data_exchange(adev);
2385
2386 r = amdgpu_ib_pool_init(adev);
2387 if (r) {
2388 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2389 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2390 goto init_failed;
2391 }
2392
2393 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init is complete */
2394 if (r)
2395 goto init_failed;
2396
2397 r = amdgpu_device_ip_hw_init_phase1(adev);
2398 if (r)
2399 goto init_failed;
2400
2401 r = amdgpu_device_fw_loading(adev);
2402 if (r)
2403 goto init_failed;
2404
2405 r = amdgpu_device_ip_hw_init_phase2(adev);
2406 if (r)
2407 goto init_failed;
2408
2409 /*
2410 * retired pages will be loaded from eeprom and reserved here;
2411 * it should be called after amdgpu_device_ip_hw_init_phase2 since
2412 * for some ASICs the RAS EEPROM code relies on the SMU being fully
2413 * functional for I2C communication, which is only true at this point.
2414 *
2415 * amdgpu_ras_recovery_init may fail, but the upper level only cares
2416 * about failures caused by a bad gpu situation and stops the amdgpu
2417 * init process accordingly. For other failures, it still releases all
2418 * the resources and prints an error message rather than returning a
2419 * negative value to the upper level.
2420 *
2421 * Note: theoretically, this should be called before all vram allocations
2422 * to protect retired pages from being reused.
2423 */
2424 r = amdgpu_ras_recovery_init(adev);
2425 if (r)
2426 goto init_failed;
2427
2428 /**
2429 * In case of XGMI grab extra reference for reset domain for this device
2430 */
2431 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2432 if (amdgpu_xgmi_add_device(adev) == 0) {
2433 if (!amdgpu_sriov_vf(adev)) {
2434 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2435
2436 if (WARN_ON(!hive)) {
2437 r = -ENOENT;
2438 goto init_failed;
2439 }
2440
2441 if (!hive->reset_domain ||
2442 !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
2443 r = -ENOENT;
2444 amdgpu_put_xgmi_hive(hive);
2445 goto init_failed;
2446 }
2447
2448 /* Drop the early temporary reset domain we created for device */
2449 amdgpu_reset_put_reset_domain(adev->reset_domain);
2450 adev->reset_domain = hive->reset_domain;
2451 amdgpu_put_xgmi_hive(hive);
2452 }
2453 }
2454 }
2455
2456 r = amdgpu_device_init_schedulers(adev);
2457 if (r)
2458 goto init_failed;
2459
2460 /* Don't init kfd if the whole hive needs to be reset during init */
2461 if (!adev->gmc.xgmi.pending_reset) {
2462 kgd2kfd_init_zone_device(adev);
2463 amdgpu_amdkfd_device_init(adev);
2464 }
2465
2466 amdgpu_fru_get_product_info(adev);
2467
2468 init_failed:
2469
2470 return r;
2471 }
2472
2473 /**
2474 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2475 *
2476 * @adev: amdgpu_device pointer
2477 *
2478 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2479 * this function before a GPU reset. If the value is retained after a
2480 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2481 */
2482 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2483 {
2484 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2485 }
2486
2487 /**
2488 * amdgpu_device_check_vram_lost - check if vram is valid
2489 *
2490 * @adev: amdgpu_device pointer
2491 *
2492 * Checks the reset magic value written to the gart pointer in VRAM.
2493 * The driver calls this after a GPU reset to see if the contents of
2494 * VRAM is lost or not.
2495 * returns true if vram is lost, false if not.
2496 */
2497 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2498 {
2499 if (memcmp(adev->gart.ptr, adev->reset_magic,
2500 AMDGPU_RESET_MAGIC_NUM))
2501 return true;
2502
2503 if (!amdgpu_in_reset(adev))
2504 return false;
2505
2506 /*
2507 * For all ASICs with baco/mode1 reset, the VRAM is
2508 * always assumed to be lost.
2509 */
2510 switch (amdgpu_asic_reset_method(adev)) {
2511 case AMD_RESET_METHOD_BACO:
2512 case AMD_RESET_METHOD_MODE1:
2513 return true;
2514 default:
2515 return false;
2516 }
2517 }
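/*
 * Sketch of how a reset/recovery path would typically consume this check
 * (simplified and illustrative, not the full recovery sequence): buffer
 * contents only need to be restored when VRAM did not survive the reset.
 *
 *	vram_lost = amdgpu_device_check_vram_lost(adev);
 *	if (vram_lost) {
 *		DRM_INFO("VRAM is lost due to GPU reset!\n");
 *		amdgpu_inc_vram_lost(adev);
 *	}
 */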
2518
2519 /**
2520 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2521 *
2522 * @adev: amdgpu_device pointer
2523 * @state: clockgating state (gate or ungate)
2524 *
2525 * The list of all the hardware IPs that make up the asic is walked and the
2526 * set_clockgating_state callbacks are run.
2527 * The late initialization pass enables clockgating for hardware IPs.
2528 * The fini or suspend passes disable clockgating for hardware IPs.
2529 * Returns 0 on success, negative error code on failure.
2530 */
2531
2532 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2533 enum amd_clockgating_state state)
2534 {
2535 int i, j, r;
2536
2537 if (amdgpu_emu_mode == 1)
2538 return 0;
2539
2540 for (j = 0; j < adev->num_ip_blocks; j++) {
2541 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2542 if (!adev->ip_blocks[i].status.late_initialized)
2543 continue;
2544 /* skip CG for GFX, SDMA on S0ix */
2545 if (adev->in_s0ix &&
2546 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2547 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
2548 continue;
2549 /* skip CG for VCE/UVD, it's handled specially */
2550 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2551 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2552 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2553 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2554 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2555 /* enable clockgating to save power */
2556 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2557 state);
2558 if (r) {
2559 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2560 adev->ip_blocks[i].version->funcs->name, r);
2561 return r;
2562 }
2563 }
2564 }
2565
2566 return 0;
2567 }
2568
2569 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
2570 enum amd_powergating_state state)
2571 {
2572 int i, j, r;
2573
2574 if (amdgpu_emu_mode == 1)
2575 return 0;
2576
2577 for (j = 0; j < adev->num_ip_blocks; j++) {
2578 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2579 if (!adev->ip_blocks[i].status.late_initialized)
2580 continue;
2581 /* skip PG for GFX, SDMA on S0ix */
2582 if (adev->in_s0ix &&
2583 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2584 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
2585 continue;
2586 /* skip PG for VCE/UVD, it's handled specially */
2587 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2588 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2589 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2590 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2591 adev->ip_blocks[i].version->funcs->set_powergating_state) {
2592 /* enable powergating to save power */
2593 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2594 state);
2595 if (r) {
2596 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2597 adev->ip_blocks[i].version->funcs->name, r);
2598 return r;
2599 }
2600 }
2601 }
2602 return 0;
2603 }
2604
2605 static int amdgpu_device_enable_mgpu_fan_boost(void)
2606 {
2607 struct amdgpu_gpu_instance *gpu_ins;
2608 struct amdgpu_device *adev;
2609 int i, ret = 0;
2610
2611 mutex_lock(&mgpu_info.mutex);
2612
2613 /*
2614 * MGPU fan boost feature should be enabled
2615 * only when there are two or more dGPUs in
2616 * the system
2617 */
2618 if (mgpu_info.num_dgpu < 2)
2619 goto out;
2620
2621 for (i = 0; i < mgpu_info.num_dgpu; i++) {
2622 gpu_ins = &(mgpu_info.gpu_ins[i]);
2623 adev = gpu_ins->adev;
2624 if (!(adev->flags & AMD_IS_APU) &&
2625 !gpu_ins->mgpu_fan_enabled) {
2626 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2627 if (ret)
2628 break;
2629
2630 gpu_ins->mgpu_fan_enabled = 1;
2631 }
2632 }
2633
2634 out:
2635 mutex_unlock(&mgpu_info.mutex);
2636
2637 return ret;
2638 }
2639
2640 /**
2641 * amdgpu_device_ip_late_init - run late init for hardware IPs
2642 *
2643 * @adev: amdgpu_device pointer
2644 *
2645 * Late initialization pass for hardware IPs. The list of all the hardware
2646 * IPs that make up the asic is walked and the late_init callbacks are run.
2647 * late_init covers any special initialization that an IP requires
2648 * after all of the IPs have been initialized or something that needs to happen
2649 * late in the init process.
2650 * Returns 0 on success, negative error code on failure.
2651 */
2652 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2653 {
2654 struct amdgpu_gpu_instance *gpu_instance;
2655 int i = 0, r;
2656
2657 for (i = 0; i < adev->num_ip_blocks; i++) {
2658 if (!adev->ip_blocks[i].status.hw)
2659 continue;
2660 if (adev->ip_blocks[i].version->funcs->late_init) {
2661 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2662 if (r) {
2663 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2664 adev->ip_blocks[i].version->funcs->name, r);
2665 return r;
2666 }
2667 }
2668 adev->ip_blocks[i].status.late_initialized = true;
2669 }
2670
2671 r = amdgpu_ras_late_init(adev);
2672 if (r) {
2673 DRM_ERROR("amdgpu_ras_late_init failed %d", r);
2674 return r;
2675 }
2676
2677 amdgpu_ras_set_error_query_ready(adev, true);
2678
2679 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2680 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2681
2682 amdgpu_device_fill_reset_magic(adev);
2683
2684 r = amdgpu_device_enable_mgpu_fan_boost();
2685 if (r)
2686 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2687
2688 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */
2689 if (amdgpu_passthrough(adev) &&
2690 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
2691 adev->asic_type == CHIP_ALDEBARAN))
2692 amdgpu_dpm_handle_passthrough_sbr(adev, true);
2693
2694 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2695 mutex_lock(&mgpu_info.mutex);
2696
2697 /*
2698 * Reset device p-state to low as this was booted with high.
2699 *
2700 * This should be performed only after all devices from the same
2701 * hive get initialized.
2702 *
2703 * However, the number of devices in the hive is not known in advance,
2704 * as it is counted one by one during device initialization.
2705 *
2706 * So, we wait for all XGMI interlinked devices initialized.
2707 * This may bring some delays as those devices may come from
2708 * different hives. But that should be OK.
2709 */
2710 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2711 for (i = 0; i < mgpu_info.num_gpu; i++) {
2712 gpu_instance = &(mgpu_info.gpu_ins[i]);
2713 if (gpu_instance->adev->flags & AMD_IS_APU)
2714 continue;
2715
2716 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2717 AMDGPU_XGMI_PSTATE_MIN);
2718 if (r) {
2719 DRM_ERROR("pstate setting failed (%d).\n", r);
2720 break;
2721 }
2722 }
2723 }
2724
2725 mutex_unlock(&mgpu_info.mutex);
2726 }
2727
2728 return 0;
2729 }
2730
2731 /**
2732 * amdgpu_device_smu_fini_early - smu hw_fini wrapper
2733 *
2734 * @adev: amdgpu_device pointer
2735 *
2736 * For ASICs that need to disable SMC first
2737 */
2738 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
2739 {
2740 int i, r;
2741
2742 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))
2743 return;
2744
2745 for (i = 0; i < adev->num_ip_blocks; i++) {
2746 if (!adev->ip_blocks[i].status.hw)
2747 continue;
2748 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2749 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2750 /* XXX handle errors */
2751 if (r) {
2752 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2753 adev->ip_blocks[i].version->funcs->name, r);
2754 }
2755 adev->ip_blocks[i].status.hw = false;
2756 break;
2757 }
2758 }
2759 }
2760
2761 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
2762 {
2763 int i, r;
2764
2765 for (i = 0; i < adev->num_ip_blocks; i++) {
2766 if (!adev->ip_blocks[i].version->funcs->early_fini)
2767 continue;
2768
2769 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
2770 if (r) {
2771 DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
2772 adev->ip_blocks[i].version->funcs->name, r);
2773 }
2774 }
2775
2776 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2777 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2778
2779 amdgpu_amdkfd_suspend(adev, false);
2780
2781 /* Workaround for ASICs that need to disable SMC first */
2782 amdgpu_device_smu_fini_early(adev);
2783
2784 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2785 if (!adev->ip_blocks[i].status.hw)
2786 continue;
2787
2788 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2789 /* XXX handle errors */
2790 if (r) {
2791 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2792 adev->ip_blocks[i].version->funcs->name, r);
2793 }
2794
2795 adev->ip_blocks[i].status.hw = false;
2796 }
2797
2798 if (amdgpu_sriov_vf(adev)) {
2799 if (amdgpu_virt_release_full_gpu(adev, false))
2800 DRM_ERROR("failed to release exclusive mode on fini\n");
2801 }
2802
2803 return 0;
2804 }
2805
2806 /**
2807 * amdgpu_device_ip_fini - run fini for hardware IPs
2808 *
2809 * @adev: amdgpu_device pointer
2810 *
2811 * Main teardown pass for hardware IPs. The list of all the hardware
2812 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2813 * are run. hw_fini tears down the hardware associated with each IP
2814 * and sw_fini tears down any software state associated with each IP.
2815 * Returns 0 on success, negative error code on failure.
2816 */
2817 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2818 {
2819 int i, r;
2820
2821 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2822 amdgpu_virt_release_ras_err_handler_data(adev);
2823
2824 if (adev->gmc.xgmi.num_physical_nodes > 1)
2825 amdgpu_xgmi_remove_device(adev);
2826
2827 amdgpu_amdkfd_device_fini_sw(adev);
2828
2829 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2830 if (!adev->ip_blocks[i].status.sw)
2831 continue;
2832
2833 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2834 amdgpu_ucode_free_bo(adev);
2835 amdgpu_free_static_csa(&adev->virt.csa_obj);
2836 amdgpu_device_wb_fini(adev);
2837 amdgpu_device_mem_scratch_fini(adev);
2838 amdgpu_ib_pool_fini(adev);
2839 }
2840
2841 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2842 /* XXX handle errors */
2843 if (r) {
2844 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2845 adev->ip_blocks[i].version->funcs->name, r);
2846 }
2847 adev->ip_blocks[i].status.sw = false;
2848 adev->ip_blocks[i].status.valid = false;
2849 }
2850
2851 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2852 if (!adev->ip_blocks[i].status.late_initialized)
2853 continue;
2854 if (adev->ip_blocks[i].version->funcs->late_fini)
2855 adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2856 adev->ip_blocks[i].status.late_initialized = false;
2857 }
2858
2859 amdgpu_ras_fini(adev);
2860
2861 return 0;
2862 }
2863
2864 /**
2865 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2866 *
2867 * @work: work_struct.
2868 */
2869 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2870 {
2871 struct amdgpu_device *adev =
2872 container_of(work, struct amdgpu_device, delayed_init_work.work);
2873 int r;
2874
2875 r = amdgpu_ib_ring_tests(adev);
2876 if (r)
2877 DRM_ERROR("ib ring test failed (%d).\n", r);
2878 }
2879
2880 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2881 {
2882 struct amdgpu_device *adev =
2883 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2884
2885 WARN_ON_ONCE(adev->gfx.gfx_off_state);
2886 WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
2887
2888 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2889 adev->gfx.gfx_off_state = true;
2890 }
2891
2892 /**
2893 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2894 *
2895 * @adev: amdgpu_device pointer
2896 *
2897 * Main suspend function for hardware IPs. The list of all the hardware
2898 * IPs that make up the asic is walked, clockgating is disabled and the
2899 * suspend callbacks are run. suspend puts the hardware and software state
2900 * in each IP into a state suitable for suspend.
2901 * Returns 0 on success, negative error code on failure.
2902 */
2903 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2904 {
2905 int i, r;
2906
2907 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2908 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2909
2910 /*
2911 * Per the PMFW team's suggestion, the driver needs to handle the
2912 * disablement of the gfxoff and df cstate features in the gpu reset
2913 * (e.g. Mode1Reset) scenario. Add the missing df cstate disablement here.
2914 */
2915 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
2916 dev_warn(adev->dev, "Failed to disallow df cstate");
2917
2918 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2919 if (!adev->ip_blocks[i].status.valid)
2920 continue;
2921
2922 /* displays are handled separately */
2923 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2924 continue;
2925
2926 /* XXX handle errors */
2927 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2928 /* XXX handle errors */
2929 if (r) {
2930 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2931 adev->ip_blocks[i].version->funcs->name, r);
2932 return r;
2933 }
2934
2935 adev->ip_blocks[i].status.hw = false;
2936 }
2937
2938 return 0;
2939 }
2940
2941 /**
2942 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2943 *
2944 * @adev: amdgpu_device pointer
2945 *
2946 * Main suspend function for hardware IPs. The list of all the hardware
2947 * IPs that make up the asic is walked, clockgating is disabled and the
2948 * suspend callbacks are run. suspend puts the hardware and software state
2949 * in each IP into a state suitable for suspend.
2950 * Returns 0 on success, negative error code on failure.
2951 */
2952 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2953 {
2954 int i, r;
2955
2956 if (adev->in_s0ix)
2957 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
2958
2959 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2960 if (!adev->ip_blocks[i].status.valid)
2961 continue;
2962 /* displays are handled in phase1 */
2963 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2964 continue;
2965 /* PSP lost connection when err_event_athub occurs */
2966 if (amdgpu_ras_intr_triggered() &&
2967 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2968 adev->ip_blocks[i].status.hw = false;
2969 continue;
2970 }
2971
2972 /* skip unnecessary suspend if we have not initialized them yet */
2973 if (adev->gmc.xgmi.pending_reset &&
2974 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2975 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
2976 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2977 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
2978 adev->ip_blocks[i].status.hw = false;
2979 continue;
2980 }
2981
2982 /* skip suspend of gfx/mes and psp for S0ix
2983 * gfx is in gfxoff state, so on resume it will exit gfxoff just
2984 * like at runtime. PSP is also part of the always on hardware
2985 * so no need to suspend it.
2986 */
2987 if (adev->in_s0ix &&
2988 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
2989 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2990 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
2991 continue;
2992
2993 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
2994 if (adev->in_s0ix &&
2995 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) &&
2996 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
2997 continue;
2998
2999 /* During cold boot, swPSP provides the IMU and RLC FW binaries to TOS.
3000 * These are in TMR, hence are expected to be reused by PSP-TOS to reload
3001 * from this location, and RLC Autoload also gets loaded automatically
3002 * from here based on the PMFW -> PSP message during the re-init sequence.
3003 * Therefore, the psp suspend & resume should be skipped to avoid destroying
3004 * the TMR and reloading FWs again for IMU enabled APU ASICs.
3005 */
3006 if (amdgpu_in_reset(adev) &&
3007 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
3008 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3009 continue;
3010
3011 /* XXX handle errors */
3012 r = adev->ip_blocks[i].version->funcs->suspend(adev);
3013 /* XXX handle errors */
3014 if (r) {
3015 DRM_ERROR("suspend of IP block <%s> failed %d\n",
3016 adev->ip_blocks[i].version->funcs->name, r);
3017 }
3018 adev->ip_blocks[i].status.hw = false;
3019 /* handle putting the SMC in the appropriate state */
3020 if (!amdgpu_sriov_vf(adev)) {
3021 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3022 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3023 if (r) {
3024 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3025 adev->mp1_state, r);
3026 return r;
3027 }
3028 }
3029 }
3030 }
3031
3032 return 0;
3033 }
3034
3035 /**
3036 * amdgpu_device_ip_suspend - run suspend for hardware IPs
3037 *
3038 * @adev: amdgpu_device pointer
3039 *
3040 * Main suspend function for hardware IPs. The list of all the hardware
3041 * IPs that make up the asic is walked, clockgating is disabled and the
3042 * suspend callbacks are run. suspend puts the hardware and software state
3043 * in each IP into a state suitable for suspend.
3044 * Returns 0 on success, negative error code on failure.
3045 */
3046 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3047 {
3048 int r;
3049
3050 if (amdgpu_sriov_vf(adev)) {
3051 amdgpu_virt_fini_data_exchange(adev);
3052 amdgpu_virt_request_full_gpu(adev, false);
3053 }
3054
3055 r = amdgpu_device_ip_suspend_phase1(adev);
3056 if (r)
3057 return r;
3058 r = amdgpu_device_ip_suspend_phase2(adev);
3059
3060 if (amdgpu_sriov_vf(adev))
3061 amdgpu_virt_release_full_gpu(adev, false);
3062
3063 return r;
3064 }
3065
3066 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3067 {
3068 int i, r;
3069
3070 static enum amd_ip_block_type ip_order[] = {
3071 AMD_IP_BLOCK_TYPE_COMMON,
3072 AMD_IP_BLOCK_TYPE_GMC,
3073 AMD_IP_BLOCK_TYPE_PSP,
3074 AMD_IP_BLOCK_TYPE_IH,
3075 };
3076
3077 for (i = 0; i < adev->num_ip_blocks; i++) {
3078 int j;
3079 struct amdgpu_ip_block *block;
3080
3081 block = &adev->ip_blocks[i];
3082 block->status.hw = false;
3083
3084 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3085
3086 if (block->version->type != ip_order[j] ||
3087 !block->status.valid)
3088 continue;
3089
3090 r = block->version->funcs->hw_init(adev);
3091 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3092 if (r)
3093 return r;
3094 block->status.hw = true;
3095 }
3096 }
3097
3098 return 0;
3099 }
3100
3101 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3102 {
3103 int i, r;
3104
3105 static enum amd_ip_block_type ip_order[] = {
3106 AMD_IP_BLOCK_TYPE_SMC,
3107 AMD_IP_BLOCK_TYPE_DCE,
3108 AMD_IP_BLOCK_TYPE_GFX,
3109 AMD_IP_BLOCK_TYPE_SDMA,
3110 AMD_IP_BLOCK_TYPE_MES,
3111 AMD_IP_BLOCK_TYPE_UVD,
3112 AMD_IP_BLOCK_TYPE_VCE,
3113 AMD_IP_BLOCK_TYPE_VCN,
3114 AMD_IP_BLOCK_TYPE_JPEG
3115 };
3116
3117 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3118 int j;
3119 struct amdgpu_ip_block *block;
3120
3121 for (j = 0; j < adev->num_ip_blocks; j++) {
3122 block = &adev->ip_blocks[j];
3123
3124 if (block->version->type != ip_order[i] ||
3125 !block->status.valid ||
3126 block->status.hw)
3127 continue;
3128
3129 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
3130 r = block->version->funcs->resume(adev);
3131 else
3132 r = block->version->funcs->hw_init(adev);
3133
3134 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3135 if (r)
3136 return r;
3137 block->status.hw = true;
3138 }
3139 }
3140
3141 return 0;
3142 }
3143
3144 /**
3145 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3146 *
3147 * @adev: amdgpu_device pointer
3148 *
3149 * First resume function for hardware IPs. The list of all the hardware
3150 * IPs that make up the asic is walked and the resume callbacks are run for
3151 * COMMON, GMC, and IH. resume puts the hardware into a functional state
3152 * after a suspend and updates the software state as necessary. This
3153 * function is also used for restoring the GPU after a GPU reset.
3154 * Returns 0 on success, negative error code on failure.
3155 */
3156 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3157 {
3158 int i, r;
3159
3160 for (i = 0; i < adev->num_ip_blocks; i++) {
3161 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3162 continue;
3163 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3164 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3165 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3166 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3167
3168 r = adev->ip_blocks[i].version->funcs->resume(adev);
3169 if (r) {
3170 DRM_ERROR("resume of IP block <%s> failed %d\n",
3171 adev->ip_blocks[i].version->funcs->name, r);
3172 return r;
3173 }
3174 adev->ip_blocks[i].status.hw = true;
3175 }
3176 }
3177
3178 return 0;
3179 }
3180
3181 /**
3182 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3183 *
3184 * @adev: amdgpu_device pointer
3185 *
3186 * Second resume function for hardware IPs. The list of all the hardware
3187 * IPs that make up the asic is walked and the resume callbacks are run for
3188 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
3189 * functional state after a suspend and updates the software state as
3190 * necessary. This function is also used for restoring the GPU after a GPU
3191 * reset.
3192 * Returns 0 on success, negative error code on failure.
3193 */
3194 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3195 {
3196 int i, r;
3197
3198 for (i = 0; i < adev->num_ip_blocks; i++) {
3199 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3200 continue;
3201 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3202 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3203 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3204 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3205 continue;
3206 r = adev->ip_blocks[i].version->funcs->resume(adev);
3207 if (r) {
3208 DRM_ERROR("resume of IP block <%s> failed %d\n",
3209 adev->ip_blocks[i].version->funcs->name, r);
3210 return r;
3211 }
3212 adev->ip_blocks[i].status.hw = true;
3213 }
3214
3215 return 0;
3216 }
3217
3218 /**
3219 * amdgpu_device_ip_resume - run resume for hardware IPs
3220 *
3221 * @adev: amdgpu_device pointer
3222 *
3223 * Main resume function for hardware IPs. The hardware IPs
3224 * are split into two resume functions because they are
3225 * also used in recovering from a GPU reset and some additional
3226 * steps need to be taken between them. In this case (S3/S4) they are
3227 * run sequentially.
3228 * Returns 0 on success, negative error code on failure.
3229 */
3230 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3231 {
3232 int r;
3233
3234 r = amdgpu_device_ip_resume_phase1(adev);
3235 if (r)
3236 return r;
3237
3238 r = amdgpu_device_fw_loading(adev);
3239 if (r)
3240 return r;
3241
3242 r = amdgpu_device_ip_resume_phase2(adev);
3243
3244 return r;
3245 }
3246
3247 /**
3248 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3249 *
3250 * @adev: amdgpu_device pointer
3251 *
3252 * Query the VBIOS data tables to determine if the board supports SR-IOV.
3253 */
3254 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3255 {
3256 if (amdgpu_sriov_vf(adev)) {
3257 if (adev->is_atom_fw) {
3258 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3259 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3260 } else {
3261 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3262 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3263 }
3264
3265 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3266 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3267 }
3268 }
3269
3270 /**
3271 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3272 *
3273 * @asic_type: AMD asic type
3274 *
3275 * Check if there is DC (new modesetting infrastructure) support for an asic.
3276 * returns true if DC has support, false if not.
3277 */
3278 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3279 {
3280 switch (asic_type) {
3281 #ifdef CONFIG_DRM_AMDGPU_SI
3282 case CHIP_HAINAN:
3283 #endif
3284 case CHIP_TOPAZ:
3285 /* chips with no display hardware */
3286 return false;
3287 #if defined(CONFIG_DRM_AMD_DC)
3288 case CHIP_TAHITI:
3289 case CHIP_PITCAIRN:
3290 case CHIP_VERDE:
3291 case CHIP_OLAND:
3292 /*
3293 * We have systems in the wild with these ASICs that require
3294 * LVDS and VGA support which is not supported with DC.
3295 *
3296 * Fallback to the non-DC driver here by default so as not to
3297 * cause regressions.
3298 */
3299 #if defined(CONFIG_DRM_AMD_DC_SI)
3300 return amdgpu_dc > 0;
3301 #else
3302 return false;
3303 #endif
3304 case CHIP_BONAIRE:
3305 case CHIP_KAVERI:
3306 case CHIP_KABINI:
3307 case CHIP_MULLINS:
3308 /*
3309 * We have systems in the wild with these ASICs that require
3310 * VGA support which is not supported with DC.
3311 *
3312 * Fallback to the non-DC driver here by default so as not to
3313 * cause regressions.
3314 */
3315 return amdgpu_dc > 0;
3316 default:
3317 return amdgpu_dc != 0;
3318 #else
3319 default:
3320 if (amdgpu_dc > 0)
3321 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
3322 return false;
3323 #endif
3324 }
3325 }
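/*
 * Practical effect of the logic above (illustrative): on the legacy LVDS/VGA
 * era ASICs listed, DC stays disabled unless explicitly requested, while on
 * newer ASICs it is enabled unless explicitly disabled, e.g.:
 *
 *	amdgpu.dc=1	opt in to DC on e.g. CHIP_BONAIRE/CHIP_KAVERI
 *	amdgpu.dc=0	force the non-DC display path on newer ASICs
 */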
3326
3327 /**
3328 * amdgpu_device_has_dc_support - check if dc is supported
3329 *
3330 * @adev: amdgpu_device pointer
3331 *
3332 * Returns true for supported, false for not supported
3333 */
3334 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3335 {
3336 if (adev->enable_virtual_display ||
3337 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3338 return false;
3339
3340 return amdgpu_device_asic_has_dc_support(adev->asic_type);
3341 }
3342
3343 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3344 {
3345 struct amdgpu_device *adev =
3346 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3347 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3348
3349 /* It's a bug to not have a hive within this function */
3350 if (WARN_ON(!hive))
3351 return;
3352
3353 /*
3354 * Use task barrier to synchronize all xgmi reset works across the
3355 * hive. task_barrier_enter and task_barrier_exit will block
3356 * until all the threads running the xgmi reset works reach
3357 * those points. task_barrier_full will do both blocks.
3358 */
3359 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3360
3361 task_barrier_enter(&hive->tb);
3362 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3363
3364 if (adev->asic_reset_res)
3365 goto fail;
3366
3367 task_barrier_exit(&hive->tb);
3368 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3369
3370 if (adev->asic_reset_res)
3371 goto fail;
3372
3373 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops &&
3374 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
3375 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev);
3376 } else {
3377
3378 task_barrier_full(&hive->tb);
3379 adev->asic_reset_res = amdgpu_asic_reset(adev);
3380 }
3381
3382 fail:
3383 if (adev->asic_reset_res)
3384 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3385 adev->asic_reset_res, adev_to_drm(adev)->unique);
3386 amdgpu_put_xgmi_hive(hive);
3387 }
3388
3389 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3390 {
3391 char *input = amdgpu_lockup_timeout;
3392 char *timeout_setting = NULL;
3393 int index = 0;
3394 long timeout;
3395 int ret = 0;
3396
3397 /*
3398 * By default the timeout for non-compute jobs is 10000 ms
3399 * and 60000 ms for compute jobs.
3400 * In SR-IOV or passthrough mode, the timeout for compute
3401 * jobs is 60000 ms by default.
3402 */
3403 adev->gfx_timeout = msecs_to_jiffies(10000);
3404 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3405 if (amdgpu_sriov_vf(adev))
3406 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3407 msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3408 else
3409 adev->compute_timeout = msecs_to_jiffies(60000);
3410
3411 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3412 while ((timeout_setting = strsep(&input, ",")) &&
3413 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3414 ret = kstrtol(timeout_setting, 0, &timeout);
3415 if (ret)
3416 return ret;
3417
3418 if (timeout == 0) {
3419 index++;
3420 continue;
3421 } else if (timeout < 0) {
3422 timeout = MAX_SCHEDULE_TIMEOUT;
3423 dev_warn(adev->dev, "lockup timeout disabled");
3424 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
3425 } else {
3426 timeout = msecs_to_jiffies(timeout);
3427 }
3428
3429 switch (index++) {
3430 case 0:
3431 adev->gfx_timeout = timeout;
3432 break;
3433 case 1:
3434 adev->compute_timeout = timeout;
3435 break;
3436 case 2:
3437 adev->sdma_timeout = timeout;
3438 break;
3439 case 3:
3440 adev->video_timeout = timeout;
3441 break;
3442 default:
3443 break;
3444 }
3445 }
3446 /*
3447 * There is only one value specified and
3448 * it should apply to all non-compute jobs.
3449 */
3450 if (index == 1) {
3451 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3452 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3453 adev->compute_timeout = adev->gfx_timeout;
3454 }
3455 }
3456
3457 return ret;
3458 }
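/*
 * The lockup_timeout parameter parsed above is a comma separated list of up
 * to four values, in the order gfx, compute, sdma, video (in milliseconds);
 * 0 keeps the default for that slot and a negative value disables the
 * timeout. A single value applies to all non-compute queues (and also to
 * compute under SR-IOV or passthrough). Illustrative examples:
 *
 *	amdgpu.lockup_timeout=10000,60000,10000,10000
 *	amdgpu.lockup_timeout=5000
 *	amdgpu.lockup_timeout=0,-1
 */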
3459
3460 /**
3461 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
3462 *
3463 * @adev: amdgpu_device pointer
3464 *
3465 * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in passthrough mode
3466 */
3467 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
3468 {
3469 struct iommu_domain *domain;
3470
3471 domain = iommu_get_domain_for_dev(adev->dev);
3472 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
3473 adev->ram_is_direct_mapped = true;
3474 }
3475
3476 static const struct attribute *amdgpu_dev_attributes[] = {
3477 &dev_attr_pcie_replay_count.attr,
3478 NULL
3479 };
3480
3481 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
3482 {
3483 if (amdgpu_mcbp == 1)
3484 adev->gfx.mcbp = true;
3485 else if (amdgpu_mcbp == 0)
3486 adev->gfx.mcbp = false;
3487
3488 if (amdgpu_sriov_vf(adev))
3489 adev->gfx.mcbp = true;
3490
3491 if (adev->gfx.mcbp)
3492 DRM_INFO("MCBP is enabled\n");
3493 }
3494
3495 /**
3496 * amdgpu_device_init - initialize the driver
3497 *
3498 * @adev: amdgpu_device pointer
3499 * @flags: driver flags
3500 *
3501 * Initializes the driver info and hw (all asics).
3502 * Returns 0 for success or an error on failure.
3503 * Called at driver startup.
3504 */
3505 int amdgpu_device_init(struct amdgpu_device *adev,
3506 uint32_t flags)
3507 {
3508 struct drm_device *ddev = adev_to_drm(adev);
3509 struct pci_dev *pdev = adev->pdev;
3510 int r, i;
3511 bool px = false;
3512 u32 max_MBps;
3513 int tmp;
3514
3515 adev->shutdown = false;
3516 adev->flags = flags;
3517
3518 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3519 adev->asic_type = amdgpu_force_asic_type;
3520 else
3521 adev->asic_type = flags & AMD_ASIC_MASK;
3522
3523 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3524 if (amdgpu_emu_mode == 1)
3525 adev->usec_timeout *= 10;
3526 adev->gmc.gart_size = 512 * 1024 * 1024;
3527 adev->accel_working = false;
3528 adev->num_rings = 0;
3529 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
3530 adev->mman.buffer_funcs = NULL;
3531 adev->mman.buffer_funcs_ring = NULL;
3532 adev->vm_manager.vm_pte_funcs = NULL;
3533 adev->vm_manager.vm_pte_num_scheds = 0;
3534 adev->gmc.gmc_funcs = NULL;
3535 adev->harvest_ip_mask = 0x0;
3536 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3537 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3538
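/*
 * Point all register accessors at the "invalid" stubs for now; the
 * real callbacks are installed later by the ASIC/IP specific init code.
 */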
3539 adev->smc_rreg = &amdgpu_invalid_rreg;
3540 adev->smc_wreg = &amdgpu_invalid_wreg;
3541 adev->pcie_rreg = &amdgpu_invalid_rreg;
3542 adev->pcie_wreg = &amdgpu_invalid_wreg;
3543 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
3544 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
3545 adev->pciep_rreg = &amdgpu_invalid_rreg;
3546 adev->pciep_wreg = &amdgpu_invalid_wreg;
3547 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3548 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3549 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3550 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3551 adev->didt_rreg = &amdgpu_invalid_rreg;
3552 adev->didt_wreg = &amdgpu_invalid_wreg;
3553 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3554 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3555 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3556 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3557
3558 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3559 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3560 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3561
3562 /* mutex initialization is all done here so we
3563 * can recall functions without having locking issues
3564 */
3565 mutex_init(&adev->firmware.mutex);
3566 mutex_init(&adev->pm.mutex);
3567 mutex_init(&adev->gfx.gpu_clock_mutex);
3568 mutex_init(&adev->srbm_mutex);
3569 mutex_init(&adev->gfx.pipe_reserve_mutex);
3570 mutex_init(&adev->gfx.gfx_off_mutex);
3571 mutex_init(&adev->gfx.partition_mutex);
3572 mutex_init(&adev->grbm_idx_mutex);
3573 mutex_init(&adev->mn_lock);
3574 mutex_init(&adev->virt.vf_errors.lock);
3575 mutex_init(&adev->virt.rlcg_reg_lock);
3576 hash_init(adev->mn_hash);
3577 mutex_init(&adev->psp.mutex);
3578 mutex_init(&adev->notifier_lock);
3579 mutex_init(&adev->pm.stable_pstate_ctx_lock);
3580 mutex_init(&adev->benchmark_mutex);
3581
3582 amdgpu_device_init_apu_flags(adev);
3583
3584 r = amdgpu_device_check_arguments(adev);
3585 if (r)
3586 return r;
3587
3588 spin_lock_init(&adev->mmio_idx_lock);
3589 spin_lock_init(&adev->smc_idx_lock);
3590 spin_lock_init(&adev->pcie_idx_lock);
3591 spin_lock_init(&adev->uvd_ctx_idx_lock);
3592 spin_lock_init(&adev->didt_idx_lock);
3593 spin_lock_init(&adev->gc_cac_idx_lock);
3594 spin_lock_init(&adev->se_cac_idx_lock);
3595 spin_lock_init(&adev->audio_endpt_idx_lock);
3596 spin_lock_init(&adev->mm_stats.lock);
3597
3598 INIT_LIST_HEAD(&adev->shadow_list);
3599 mutex_init(&adev->shadow_list_lock);
3600
3601 INIT_LIST_HEAD(&adev->reset_list);
3602
3603 INIT_LIST_HEAD(&adev->ras_list);
3604
3605 INIT_DELAYED_WORK(&adev->delayed_init_work,
3606 amdgpu_device_delayed_init_work_handler);
3607 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3608 amdgpu_device_delay_enable_gfx_off);
3609
3610 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3611
3612 adev->gfx.gfx_off_req_count = 1;
3613 adev->gfx.gfx_off_residency = 0;
3614 adev->gfx.gfx_off_entrycount = 0;
3615 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3616
3617 atomic_set(&adev->throttling_logging_enabled, 1);
3618 /*
3619 * If throttling continues, logging will be performed every minute
3620 * to avoid log flooding. "-1" is subtracted since the thermal
3621 * throttling interrupt comes every second. Thus, the total logging
3622 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3623 * for throttling interrupt) = 60 seconds.
3624 */
3625 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3626 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3627
3628 /* Registers mapping */
3629 /* TODO: block userspace mapping of io register */
3630 if (adev->asic_type >= CHIP_BONAIRE) {
3631 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3632 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3633 } else {
3634 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3635 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3636 }
3637
3638 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
3639 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
3640
3641 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3642 if (!adev->rmmio)
3643 return -ENOMEM;
3644
3645 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3646 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);
3647
3648 /*
3649 * The reset domain needs to be present early, before the XGMI hive is
3650 * discovered (if any) and initialized, to use the reset sem and in_gpu
3651 * reset flag early on during init and before calling RREG32.
3652 */
3653 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
3654 if (!adev->reset_domain)
3655 return -ENOMEM;
3656
3657 /* detect hw virtualization here */
3658 amdgpu_detect_virtualization(adev);
3659
3660 amdgpu_device_get_pcie_info(adev);
3661
3662 r = amdgpu_device_get_job_timeout_settings(adev);
3663 if (r) {
3664 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3665 return r;
3666 }
3667
3668 /* early init functions */
3669 r = amdgpu_device_ip_early_init(adev);
3670 if (r)
3671 return r;
3672
3673 amdgpu_device_set_mcbp(adev);
3674
3675 /* Get rid of things like offb */
3676 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);
3677 if (r)
3678 return r;
3679
3680 /* Enable TMZ based on IP_VERSION */
3681 amdgpu_gmc_tmz_set(adev);
3682
3683 amdgpu_gmc_noretry_set(adev);
3684 /* Need to get xgmi info early to decide the reset behavior */
3685 if (adev->gmc.xgmi.supported) {
3686 r = adev->gfxhub.funcs->get_xgmi_info(adev);
3687 if (r)
3688 return r;
3689 }
3690
3691 /* enable PCIE atomic ops */
3692 if (amdgpu_sriov_vf(adev)) {
3693 if (adev->virt.fw_reserve.p_pf2vf)
3694 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
3695 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
3696 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3697 /* APUs with gfx9 onwards don't rely on PCIe atomics; rather, the
3698 * internal path natively supports atomics, so set have_atomics_support to true.
3699 */
3700 } else if ((adev->flags & AMD_IS_APU) &&
3701 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) {
3702 adev->have_atomics_support = true;
3703 } else {
3704 adev->have_atomics_support =
3705 !pci_enable_atomic_ops_to_root(adev->pdev,
3706 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3707 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3708 }
3709
3710 if (!adev->have_atomics_support)
3711 dev_info(adev->dev, "PCIE atomic ops is not supported\n");
3712
3713 /* doorbell bar mapping and doorbell index init */
3714 amdgpu_doorbell_init(adev);
3715
3716 if (amdgpu_emu_mode == 1) {
3717 /* post the asic on emulation mode */
3718 emu_soc_asic_init(adev);
3719 goto fence_driver_init;
3720 }
3721
3722 amdgpu_reset_init(adev);
3723
3724 /* detect if we are with an SRIOV vbios */
3725 if (adev->bios)
3726 amdgpu_device_detect_sriov_bios(adev);
3727
3728 /* check if we need to reset the asic
3729 * E.g., driver was not cleanly unloaded previously, etc.
3730 */
3731 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3732 if (adev->gmc.xgmi.num_physical_nodes) {
3733 dev_info(adev->dev, "Pending hive reset.\n");
3734 adev->gmc.xgmi.pending_reset = true;
3735 /* Only need to init necessary block for SMU to handle the reset */
3736 for (i = 0; i < adev->num_ip_blocks; i++) {
3737 if (!adev->ip_blocks[i].status.valid)
3738 continue;
3739 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3740 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3741 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3742 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
3743 DRM_DEBUG("IP %s disabled for hw_init.\n",
3744 adev->ip_blocks[i].version->funcs->name);
3745 adev->ip_blocks[i].status.hw = true;
3746 }
3747 }
3748 } else {
3749 tmp = amdgpu_reset_method;
3750 /* It should do a default reset when loading or reloading the driver,
3751 * regardless of the module parameter reset_method.
3752 */
3753 amdgpu_reset_method = AMD_RESET_METHOD_NONE;
3754 r = amdgpu_asic_reset(adev);
3755 amdgpu_reset_method = tmp;
3756 if (r) {
3757 dev_err(adev->dev, "asic reset on init failed\n");
3758 goto failed;
3759 }
3760 }
3761 }
3762
3763 /* Post card if necessary */
3764 if (amdgpu_device_need_post(adev)) {
3765 if (!adev->bios) {
3766 dev_err(adev->dev, "no vBIOS found\n");
3767 r = -EINVAL;
3768 goto failed;
3769 }
3770 DRM_INFO("GPU posting now...\n");
3771 r = amdgpu_device_asic_init(adev);
3772 if (r) {
3773 dev_err(adev->dev, "gpu post error!\n");
3774 goto failed;
3775 }
3776 }
3777
3778 if (adev->bios) {
3779 if (adev->is_atom_fw) {
3780 /* Initialize clocks */
3781 r = amdgpu_atomfirmware_get_clock_info(adev);
3782 if (r) {
3783 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3784 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3785 goto failed;
3786 }
3787 } else {
3788 /* Initialize clocks */
3789 r = amdgpu_atombios_get_clock_info(adev);
3790 if (r) {
3791 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3792 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3793 goto failed;
3794 }
3795 /* init i2c buses */
3796 if (!amdgpu_device_has_dc_support(adev))
3797 amdgpu_atombios_i2c_init(adev);
3798 }
3799 }
3800
3801 fence_driver_init:
3802 /* Fence driver */
3803 r = amdgpu_fence_driver_sw_init(adev);
3804 if (r) {
3805 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
3806 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3807 goto failed;
3808 }
3809
3810 /* init the mode config */
3811 drm_mode_config_init(adev_to_drm(adev));
3812
3813 r = amdgpu_device_ip_init(adev);
3814 if (r) {
3815 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3816 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3817 goto release_ras_con;
3818 }
3819
3820 amdgpu_fence_driver_hw_init(adev);
3821
3822 dev_info(adev->dev,
3823 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3824 adev->gfx.config.max_shader_engines,
3825 adev->gfx.config.max_sh_per_se,
3826 adev->gfx.config.max_cu_per_sh,
3827 adev->gfx.cu_info.number);
3828
3829 adev->accel_working = true;
3830
3831 amdgpu_vm_check_compute_bug(adev);
3832
3833 /* Initialize the buffer migration limit. */
3834 if (amdgpu_moverate >= 0)
3835 max_MBps = amdgpu_moverate;
3836 else
3837 max_MBps = 8; /* Allow 8 MB/s. */
3838 /* Get a log2 for easy divisions. */
3839 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3840
3841 r = amdgpu_atombios_sysfs_init(adev);
3842 if (r)
3843 drm_err(&adev->ddev,
3844 "registering atombios sysfs failed (%d).\n", r);
3845
3846 r = amdgpu_pm_sysfs_init(adev);
3847 if (r)
3848 DRM_ERROR("registering pm sysfs failed (%d).\n", r);
3849
3850 r = amdgpu_ucode_sysfs_init(adev);
3851 if (r) {
3852 adev->ucode_sysfs_en = false;
3853 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3854 } else
3855 adev->ucode_sysfs_en = true;
3856
3857 /*
3858 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3859 * Otherwise the mgpu fan boost feature will be skipped because the
3860 * gpu instance count would be too low.
3861 */
3862 amdgpu_register_gpu_instance(adev);
3863
3864 /* enable clockgating, etc. after ib tests, etc. since some blocks require
3865 * explicit gating rather than handling it automatically.
3866 */
3867 if (!adev->gmc.xgmi.pending_reset) {
3868 r = amdgpu_device_ip_late_init(adev);
3869 if (r) {
3870 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3871 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3872 goto release_ras_con;
3873 }
3874 /* must succeed. */
3875 amdgpu_ras_resume(adev);
3876 queue_delayed_work(system_wq, &adev->delayed_init_work,
3877 msecs_to_jiffies(AMDGPU_RESUME_MS));
3878 }
3879
3880 if (amdgpu_sriov_vf(adev)) {
3881 amdgpu_virt_release_full_gpu(adev, true);
3882 flush_delayed_work(&adev->delayed_init_work);
3883 }
3884
3885 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3886 if (r)
3887 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3888
3889 amdgpu_fru_sysfs_init(adev);
3890
3891 if (IS_ENABLED(CONFIG_PERF_EVENTS)) {
3892 r = amdgpu_pmu_init(adev);
3893 if (r)
3894 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
}
3895
3896 /* Have stored pci confspace at hand for restore in sudden PCI error */
3897 if (amdgpu_device_cache_pci_state(adev->pdev))
3898 pci_restore_state(pdev);
3899
3900 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3901 /* this will fail for cards that aren't VGA class devices, just
3902 * ignore it
3903 */
3904 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3905 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
3906
3907 px = amdgpu_device_supports_px(ddev);
3908
3909 if (px || (!dev_is_removable(&adev->pdev->dev) &&
3910 apple_gmux_detect(NULL, NULL)))
3911 vga_switcheroo_register_client(adev->pdev,
3912 &amdgpu_switcheroo_ops, px);
3913
3914 if (px)
3915 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3916
3917 if (adev->gmc.xgmi.pending_reset)
3918 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
3919 msecs_to_jiffies(AMDGPU_RESUME_MS));
3920
3921 amdgpu_device_check_iommu_direct_map(adev);
3922
3923 return 0;
3924
3925 release_ras_con:
3926 if (amdgpu_sriov_vf(adev))
3927 amdgpu_virt_release_full_gpu(adev, true);
3928
3929 /* failed in exclusive mode due to timeout */
3930 if (amdgpu_sriov_vf(adev) &&
3931 !amdgpu_sriov_runtime(adev) &&
3932 amdgpu_virt_mmio_blocked(adev) &&
3933 !amdgpu_virt_wait_reset(adev)) {
3934 dev_err(adev->dev, "VF exclusive mode timeout\n");
3935 /* Don't send request since VF is inactive. */
3936 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3937 adev->virt.ops = NULL;
3938 r = -EAGAIN;
3939 }
3940 amdgpu_release_ras_context(adev);
3941
3942 failed:
3943 amdgpu_vf_error_trans_all(adev);
3944
3945 return r;
3946 }
3947
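/**
 * amdgpu_device_unmap_mmio - tear down CPU access to the device
 *
 * @adev: amdgpu_device pointer
 *
 * Removes all CPU mappings pointing at the device and unmaps the
 * doorbell, register and VRAM BARs. Used when the device is unplugged.
 */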
3948 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
3949 {
3950
3951 /* Clear all CPU mappings pointing to this device */
3952 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
3953
3954 /* Unmap all mapped bars - Doorbell, registers and VRAM */
3955 amdgpu_doorbell_fini(adev);
3956
3957 iounmap(adev->rmmio);
3958 adev->rmmio = NULL;
3959 if (adev->mman.aper_base_kaddr)
3960 iounmap(adev->mman.aper_base_kaddr);
3961 adev->mman.aper_base_kaddr = NULL;
3962
3963 /* Memory manager related */
3964 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
3965 arch_phys_wc_del(adev->gmc.vram_mtrr);
3966 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
3967 }
3968 }
3969
3970 /**
3971 * amdgpu_device_fini_hw - tear down the driver
3972 *
3973 * @adev: amdgpu_device pointer
3974 *
3975 * Tear down the driver info (all asics).
3976 * Called at driver shutdown.
3977 */
3978 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
3979 {
3980 dev_info(adev->dev, "amdgpu: finishing device.\n");
3981 flush_delayed_work(&adev->delayed_init_work);
3982 adev->shutdown = true;
3983
3984 /* make sure IB tests have finished before entering exclusive mode
3985 * to avoid preemption on IB tests
3986 */
3987 if (amdgpu_sriov_vf(adev)) {
3988 amdgpu_virt_request_full_gpu(adev, false);
3989 amdgpu_virt_fini_data_exchange(adev);
3990 }
3991
3992 /* disable all interrupts */
3993 amdgpu_irq_disable_all(adev);
3994 if (adev->mode_info.mode_config_initialized) {
3995 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
3996 drm_helper_force_disable_all(adev_to_drm(adev));
3997 else
3998 drm_atomic_helper_shutdown(adev_to_drm(adev));
3999 }
4000 amdgpu_fence_driver_hw_fini(adev);
4001
4002 if (adev->mman.initialized)
4003 drain_workqueue(adev->mman.bdev.wq);
4004
4005 if (adev->pm.sysfs_initialized)
4006 amdgpu_pm_sysfs_fini(adev);
4007 if (adev->ucode_sysfs_en)
4008 amdgpu_ucode_sysfs_fini(adev);
4009 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
4010 amdgpu_fru_sysfs_fini(adev);
4011
4012 /* RAS features must be disabled before hw fini */
4013 amdgpu_ras_pre_fini(adev);
4014
4015 amdgpu_device_ip_fini_early(adev);
4016
4017 amdgpu_irq_fini_hw(adev);
4018
4019 if (adev->mman.initialized)
4020 ttm_device_clear_dma_mappings(&adev->mman.bdev);
4021
4022 amdgpu_gart_dummy_page_fini(adev);
4023
4024 if (drm_dev_is_unplugged(adev_to_drm(adev)))
4025 amdgpu_device_unmap_mmio(adev);
4026
4027 }
4028
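/**
 * amdgpu_device_fini_sw - tear down the driver software state
 *
 * @adev: amdgpu_device pointer
 *
 * Tears down the remaining software state after amdgpu_device_fini_hw()
 * has shut down the hardware side. Called at driver shutdown.
 */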
4029 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4030 {
4031 int idx;
4032 bool px;
4033
4034 amdgpu_device_ip_fini(adev);
4035 amdgpu_fence_driver_sw_fini(adev);
4036 amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
4037 adev->accel_working = false;
4038 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
4039
4040 amdgpu_reset_fini(adev);
4041
4042 /* free i2c buses */
4043 if (!amdgpu_device_has_dc_support(adev))
4044 amdgpu_i2c_fini(adev);
4045
4046 if (amdgpu_emu_mode != 1)
4047 amdgpu_atombios_fini(adev);
4048
4049 kfree(adev->bios);
4050 adev->bios = NULL;
4051
4052 px = amdgpu_device_supports_px(adev_to_drm(adev));
4053
4054 if (px || (!dev_is_removable(&adev->pdev->dev) &&
4055 apple_gmux_detect(NULL, NULL)))
4056 vga_switcheroo_unregister_client(adev->pdev);
4057
4058 if (px)
4059 vga_switcheroo_fini_domain_pm_ops(adev->dev);
4060
4061 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4062 vga_client_unregister(adev->pdev);
4063
4064 if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4065
4066 iounmap(adev->rmmio);
4067 adev->rmmio = NULL;
4068 amdgpu_doorbell_fini(adev);
4069 drm_dev_exit(idx);
4070 }
4071
4072 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4073 amdgpu_pmu_fini(adev);
4074 if (adev->mman.discovery_bin)
4075 amdgpu_discovery_fini(adev);
4076
4077 amdgpu_reset_put_reset_domain(adev->reset_domain);
4078 adev->reset_domain = NULL;
4079
4080 kfree(adev->pci_state);
4081
4082 }
4083
4084 /**
4085 * amdgpu_device_evict_resources - evict device resources
4086 * @adev: amdgpu device object
4087 *
4088 * Evicts all ttm device resources (vram BOs, gart table) from the lru list
4089 * of the vram memory type. Mainly used for evicting device resources
4090 * at suspend time.
4091 *
4092 */
4093 static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
4094 {
4095 int ret;
4096
4097 /* No need to evict vram on APUs for suspend to ram or s2idle */
4098 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
4099 return 0;
4100
4101 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
4102 if (ret)
4103 DRM_WARN("evicting device resources failed\n");
4104 return ret;
4105 }
4106
4107 /*
4108 * Suspend & resume.
4109 */
4110 /**
4111 * amdgpu_device_prepare - prepare for device suspend
4112 *
4113 * @dev: drm dev pointer
4114 *
4115 * Prepare to put the hw in the suspend state (all asics).
4116 * Returns 0 for success or an error on failure.
4117 * Called at driver suspend.
4118 */
4119 int amdgpu_device_prepare(struct drm_device *dev)
4120 {
4121 struct amdgpu_device *adev = drm_to_adev(dev);
4122 int i, r;
4123
4124 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4125 return 0;
4126
4127 /* Evict the majority of BOs before starting suspend sequence */
4128 r = amdgpu_device_evict_resources(adev);
4129 if (r)
4130 return r;
4131
4132 flush_delayed_work(&adev->gfx.gfx_off_delay_work);
4133
4134 for (i = 0; i < adev->num_ip_blocks; i++) {
4135 if (!adev->ip_blocks[i].status.valid)
4136 continue;
4137 if (!adev->ip_blocks[i].version->funcs->prepare_suspend)
4138 continue;
4139 r = adev->ip_blocks[i].version->funcs->prepare_suspend((void *)adev);
4140 if (r)
4141 return r;
4142 }
4143
4144 return 0;
4145 }
4146
4147 /**
4148 * amdgpu_device_suspend - initiate device suspend
4149 *
4150 * @dev: drm dev pointer
4151 * @fbcon: notify the fbdev of suspend
4152 *
4153 * Puts the hw in the suspend state (all asics).
4154 * Returns 0 for success or an error on failure.
4155 * Called at driver suspend.
4156 */
4157 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
4158 {
4159 struct amdgpu_device *adev = drm_to_adev(dev);
4160 int r = 0;
4161
4162 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4163 return 0;
4164
4165 adev->in_suspend = true;
4166
4167 if (amdgpu_sriov_vf(adev)) {
4168 amdgpu_virt_fini_data_exchange(adev);
4169 r = amdgpu_virt_request_full_gpu(adev, false);
4170 if (r)
4171 return r;
4172 }
4173
4174 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4175 DRM_WARN("smart shift update failed\n");
4176
4177 if (fbcon)
4178 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
4179
4180 cancel_delayed_work_sync(&adev->delayed_init_work);
4181
4182 amdgpu_ras_suspend(adev);
4183
4184 amdgpu_device_ip_suspend_phase1(adev);
4185
4186 if (!adev->in_s0ix)
4187 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
4188
4189 r = amdgpu_device_evict_resources(adev);
4190 if (r)
4191 return r;
4192
4193 amdgpu_fence_driver_hw_fini(adev);
4194
4195 amdgpu_device_ip_suspend_phase2(adev);
4196
4197 if (amdgpu_sriov_vf(adev))
4198 amdgpu_virt_release_full_gpu(adev, false);
4199
4200 return 0;
4201 }
4202
4203 /**
4204 * amdgpu_device_resume - initiate device resume
4205 *
4206 * @dev: drm dev pointer
4207 * @fbcon: notify the fbdev of resume
4208 *
4209 * Bring the hw back to operating state (all asics).
4210 * Returns 0 for success or an error on failure.
4211 * Called at driver resume.
4212 */
4213 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
4214 {
4215 struct amdgpu_device *adev = drm_to_adev(dev);
4216 int r = 0;
4217
4218 if (amdgpu_sriov_vf(adev)) {
4219 r = amdgpu_virt_request_full_gpu(adev, true);
4220 if (r)
4221 return r;
4222 }
4223
4224 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4225 return 0;
4226
4227 if (adev->in_s0ix)
4228 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
4229
4230 /* post card */
4231 if (amdgpu_device_need_post(adev)) {
4232 r = amdgpu_device_asic_init(adev);
4233 if (r)
4234 dev_err(adev->dev, "amdgpu asic init failed\n");
4235 }
4236
4237 r = amdgpu_device_ip_resume(adev);
4238
4239 if (r) {
4240 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4241 goto exit;
4242 }
4243 amdgpu_fence_driver_hw_init(adev);
4244
4245 r = amdgpu_device_ip_late_init(adev);
4246 if (r)
4247 goto exit;
4248
4249 queue_delayed_work(system_wq, &adev->delayed_init_work,
4250 msecs_to_jiffies(AMDGPU_RESUME_MS));
4251
4252 if (!adev->in_s0ix) {
4253 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4254 if (r)
4255 goto exit;
4256 }
4257
4258 exit:
4259 if (amdgpu_sriov_vf(adev)) {
4260 amdgpu_virt_init_data_exchange(adev);
4261 amdgpu_virt_release_full_gpu(adev, true);
4262 }
4263
4264 if (r)
4265 return r;
4266
4267 /* Make sure IB tests flushed */
4268 flush_delayed_work(&adev->delayed_init_work);
4269
4270 if (fbcon)
4271 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
4272
4273 amdgpu_ras_resume(adev);
4274
4275 if (adev->mode_info.num_crtc) {
4276 /*
4277 * Most of the connector probing functions try to acquire runtime pm
4278 * refs to ensure that the GPU is powered on when connector polling is
4279 * performed. Since we're calling this from a runtime PM callback,
4280 * trying to acquire rpm refs will cause us to deadlock.
4281 *
4282 * Since we're guaranteed to be holding the rpm lock, it's safe to
4283 * temporarily disable the rpm helpers so this doesn't deadlock us.
4284 */
4285 #ifdef CONFIG_PM
4286 dev->dev->power.disable_depth++;
4287 #endif
4288 if (!adev->dc_enabled)
4289 drm_helper_hpd_irq_event(dev);
4290 else
4291 drm_kms_helper_hotplug_event(dev);
4292 #ifdef CONFIG_PM
4293 dev->dev->power.disable_depth--;
4294 #endif
4295 }
4296 adev->in_suspend = false;
4297
4298 if (adev->enable_mes)
4299 amdgpu_mes_self_test(adev);
4300
4301 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
4302 DRM_WARN("smart shift update failed\n");
4303
4304 return 0;
4305 }
4306
4307 /**
4308 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
4309 *
4310 * @adev: amdgpu_device pointer
4311 *
4312 * The list of all the hardware IPs that make up the asic is walked and
4313 * the check_soft_reset callbacks are run. check_soft_reset determines
4314 * if the asic is still hung or not.
4315 * Returns true if any of the IPs are still in a hung state, false if not.
4316 */
4317 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
4318 {
4319 int i;
4320 bool asic_hang = false;
4321
4322 if (amdgpu_sriov_vf(adev))
4323 return true;
4324
4325 if (amdgpu_asic_need_full_reset(adev))
4326 return true;
4327
4328 for (i = 0; i < adev->num_ip_blocks; i++) {
4329 if (!adev->ip_blocks[i].status.valid)
4330 continue;
4331 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
4332 adev->ip_blocks[i].status.hang =
4333 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
4334 if (adev->ip_blocks[i].status.hang) {
4335 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
4336 asic_hang = true;
4337 }
4338 }
4339 return asic_hang;
4340 }
4341
4342 /**
4343 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
4344 *
4345 * @adev: amdgpu_device pointer
4346 *
4347 * The list of all the hardware IPs that make up the asic is walked and the
4348 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
4349 * handles any IP specific hardware or software state changes that are
4350 * necessary for a soft reset to succeed.
4351 * Returns 0 on success, negative error code on failure.
4352 */
4353 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
4354 {
4355 int i, r = 0;
4356
4357 for (i = 0; i < adev->num_ip_blocks; i++) {
4358 if (!adev->ip_blocks[i].status.valid)
4359 continue;
4360 if (adev->ip_blocks[i].status.hang &&
4361 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
4362 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
4363 if (r)
4364 return r;
4365 }
4366 }
4367
4368 return 0;
4369 }
4370
4371 /**
4372 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
4373 *
4374 * @adev: amdgpu_device pointer
4375 *
4376 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
4377 * reset is necessary to recover.
4378 * Returns true if a full asic reset is required, false if not.
4379 */
4380 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
4381 {
4382 int i;
4383
4384 if (amdgpu_asic_need_full_reset(adev))
4385 return true;
4386
4387 for (i = 0; i < adev->num_ip_blocks; i++) {
4388 if (!adev->ip_blocks[i].status.valid)
4389 continue;
4390 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
4391 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
4392 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
4393 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
4394 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
4395 if (adev->ip_blocks[i].status.hang) {
4396 dev_info(adev->dev, "Some block need full reset!\n");
4397 return true;
4398 }
4399 }
4400 }
4401 return false;
4402 }
4403
4404 /**
4405 * amdgpu_device_ip_soft_reset - do a soft reset
4406 *
4407 * @adev: amdgpu_device pointer
4408 *
4409 * The list of all the hardware IPs that make up the asic is walked and the
4410 * soft_reset callbacks are run if the block is hung. soft_reset handles any
4411 * IP specific hardware or software state changes that are necessary to soft
4412 * reset the IP.
4413 * Returns 0 on success, negative error code on failure.
4414 */
4415 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
4416 {
4417 int i, r = 0;
4418
4419 for (i = 0; i < adev->num_ip_blocks; i++) {
4420 if (!adev->ip_blocks[i].status.valid)
4421 continue;
4422 if (adev->ip_blocks[i].status.hang &&
4423 adev->ip_blocks[i].version->funcs->soft_reset) {
4424 r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
4425 if (r)
4426 return r;
4427 }
4428 }
4429
4430 return 0;
4431 }
4432
4433 /**
4434 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
4435 *
4436 * @adev: amdgpu_device pointer
4437 *
4438 * The list of all the hardware IPs that make up the asic is walked and the
4439 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
4440 * handles any IP specific hardware or software state changes that are
4441 * necessary after the IP has been soft reset.
4442 * Returns 0 on success, negative error code on failure.
4443 */
4444 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
4445 {
4446 int i, r = 0;
4447
4448 for (i = 0; i < adev->num_ip_blocks; i++) {
4449 if (!adev->ip_blocks[i].status.valid)
4450 continue;
4451 if (adev->ip_blocks[i].status.hang &&
4452 adev->ip_blocks[i].version->funcs->post_soft_reset)
4453 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
4454 if (r)
4455 return r;
4456 }
4457
4458 return 0;
4459 }
4460
4461 /**
4462 * amdgpu_device_recover_vram - Recover some VRAM contents
4463 *
4464 * @adev: amdgpu_device pointer
4465 *
4466 * Restores the contents of VRAM buffers from the shadows in GTT. Used to
4467 * restore things like GPUVM page tables after a GPU reset where
4468 * the contents of VRAM might be lost.
4469 *
4470 * Returns:
4471 * 0 on success, negative error code on failure.
4472 */
4473 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4474 {
4475 struct dma_fence *fence = NULL, *next = NULL;
4476 struct amdgpu_bo *shadow;
4477 struct amdgpu_bo_vm *vmbo;
4478 long r = 1, tmo;
4479
4480 if (amdgpu_sriov_runtime(adev))
4481 tmo = msecs_to_jiffies(8000);
4482 else
4483 tmo = msecs_to_jiffies(100);
4484
4485 dev_info(adev->dev, "recover vram bo from shadow start\n");
4486 mutex_lock(&adev->shadow_list_lock);
4487 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
4488 /* If vm is compute context or adev is APU, shadow will be NULL */
4489 if (!vmbo->shadow)
4490 continue;
4491 shadow = vmbo->shadow;
4492
4493 /* No need to recover an evicted BO */
4494 if (!shadow->tbo.resource ||
4495 shadow->tbo.resource->mem_type != TTM_PL_TT ||
4496 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
4497 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
4498 continue;
4499
4500 r = amdgpu_bo_restore_shadow(shadow, &next);
4501 if (r)
4502 break;
4503
4504 if (fence) {
4505 tmo = dma_fence_wait_timeout(fence, false, tmo);
4506 dma_fence_put(fence);
4507 fence = next;
4508 if (tmo == 0) {
4509 r = -ETIMEDOUT;
4510 break;
4511 } else if (tmo < 0) {
4512 r = tmo;
4513 break;
4514 }
4515 } else {
4516 fence = next;
4517 }
4518 }
4519 mutex_unlock(&adev->shadow_list_lock);
4520
4521 if (fence)
4522 tmo = dma_fence_wait_timeout(fence, false, tmo);
4523 dma_fence_put(fence);
4524
4525 if (r < 0 || tmo <= 0) {
4526 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4527 return -EIO;
4528 }
4529
4530 dev_info(adev->dev, "recover vram bo from shadow done\n");
4531 return 0;
4532 }
4533
4534
4535 /**
4536 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4537 *
4538 * @adev: amdgpu_device pointer
4539 * @from_hypervisor: request from hypervisor
4540 *
4541 * Do a VF FLR and reinitialize the ASIC.
4542 * Returns 0 on success, otherwise an error code.
4543 */
4544 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4545 bool from_hypervisor)
4546 {
4547 int r;
4548 struct amdgpu_hive_info *hive = NULL;
4549 int retry_limit = 0;
4550
4551 retry:
4552 amdgpu_amdkfd_pre_reset(adev);
4553
4554 if (from_hypervisor)
4555 r = amdgpu_virt_request_full_gpu(adev, true);
4556 else
4557 r = amdgpu_virt_reset_gpu(adev);
4558 if (r)
4559 return r;
4560 amdgpu_irq_gpu_reset_resume_helper(adev);
4561
4562 /* some SW cleanup the VF needs to do before recovery */
4563 amdgpu_virt_post_reset(adev);
4564
4565 /* Resume IP prior to SMC */
4566 r = amdgpu_device_ip_reinit_early_sriov(adev);
4567 if (r)
4568 goto error;
4569
4570 amdgpu_virt_init_data_exchange(adev);
4571
4572 r = amdgpu_device_fw_loading(adev);
4573 if (r)
4574 return r;
4575
4576 /* now we are okay to resume SMC/CP/SDMA */
4577 r = amdgpu_device_ip_reinit_late_sriov(adev);
4578 if (r)
4579 goto error;
4580
4581 hive = amdgpu_get_xgmi_hive(adev);
4582 /* Update PSP FW topology after reset */
4583 if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
4584 r = amdgpu_xgmi_update_topology(hive, adev);
4585
4586 if (hive)
4587 amdgpu_put_xgmi_hive(hive);
4588
4589 if (!r) {
4590 r = amdgpu_ib_ring_tests(adev);
4591
4592 amdgpu_amdkfd_post_reset(adev);
4593 }
4594
4595 error:
4596 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4597 amdgpu_inc_vram_lost(adev);
4598 r = amdgpu_device_recover_vram(adev);
4599 }
4600 amdgpu_virt_release_full_gpu(adev, true);
4601
4602 if (AMDGPU_RETRY_SRIOV_RESET(r)) {
4603 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) {
4604 retry_limit++;
4605 goto retry;
4606 } else
4607 DRM_ERROR("GPU reset retry is beyond the retry limit\n");
4608 }
4609
4610 return r;
4611 }
4612
4613 /**
4614 * amdgpu_device_has_job_running - check if there is any job in the pending list
4615 *
4616 * @adev: amdgpu_device pointer
4617 *
4618 * Check if there is any job in the scheduler pending list on any ring.
4619 */
4620 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4621 {
4622 int i;
4623 struct drm_sched_job *job;
4624
4625 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4626 struct amdgpu_ring *ring = adev->rings[i];
4627
4628 if (!ring || !ring->sched.thread)
4629 continue;
4630
4631 spin_lock(&ring->sched.job_list_lock);
4632 job = list_first_entry_or_null(&ring->sched.pending_list,
4633 struct drm_sched_job, list);
4634 spin_unlock(&ring->sched.job_list_lock);
4635 if (job)
4636 return true;
4637 }
4638 return false;
4639 }
4640
4641 /**
4642 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4643 *
4644 * @adev: amdgpu_device pointer
4645 *
4646 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4647 * a hung GPU.
4648 */
4649 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4650 {
4651
4652 if (amdgpu_gpu_recovery == 0)
4653 goto disabled;
4654
4655 /* Skip soft reset check in fatal error mode */
4656 if (!amdgpu_ras_is_poison_mode_supported(adev))
4657 return true;
4658
4659 if (amdgpu_sriov_vf(adev))
4660 return true;
4661
4662 if (amdgpu_gpu_recovery == -1) {
4663 switch (adev->asic_type) {
4664 #ifdef CONFIG_DRM_AMDGPU_SI
4665 case CHIP_VERDE:
4666 case CHIP_TAHITI:
4667 case CHIP_PITCAIRN:
4668 case CHIP_OLAND:
4669 case CHIP_HAINAN:
4670 #endif
4671 #ifdef CONFIG_DRM_AMDGPU_CIK
4672 case CHIP_KAVERI:
4673 case CHIP_KABINI:
4674 case CHIP_MULLINS:
4675 #endif
4676 case CHIP_CARRIZO:
4677 case CHIP_STONEY:
4678 case CHIP_CYAN_SKILLFISH:
4679 goto disabled;
4680 default:
4681 break;
4682 }
4683 }
4684
4685 return true;
4686
4687 disabled:
4688 dev_info(adev->dev, "GPU recovery disabled.\n");
4689 return false;
4690 }
4691
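/**
 * amdgpu_device_mode1_reset - perform an ASIC mode1 reset
 *
 * @adev: amdgpu_device pointer
 *
 * Caches the PCI config space, disables bus mastering and triggers a
 * mode1 reset through the SMU if supported, otherwise through the PSP,
 * then waits for the ASIC to come back out of reset.
 * Returns 0 on success, negative error code on failure.
 */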
4692 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4693 {
4694 u32 i;
4695 int ret = 0;
4696
4697 amdgpu_atombios_scratch_regs_engine_hung(adev, true);
4698
4699 dev_info(adev->dev, "GPU mode1 reset\n");
4700
4701 /* Cache the state before bus master disable. The saved config space
4702 * values are used in other cases like restore after mode-2 reset.
4703 */
4704 amdgpu_device_cache_pci_state(adev->pdev);
4705
4706 /* disable BM */
4707 pci_clear_master(adev->pdev);
4708
4709 if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
4710 dev_info(adev->dev, "GPU smu mode1 reset\n");
4711 ret = amdgpu_dpm_mode1_reset(adev);
4712 } else {
4713 dev_info(adev->dev, "GPU psp mode1 reset\n");
4714 ret = psp_gpu_reset(adev);
4715 }
4716
4717 if (ret)
4718 goto mode1_reset_failed;
4719
4720 amdgpu_device_load_pci_state(adev->pdev);
4721 ret = amdgpu_psp_wait_for_bootloader(adev);
4722 if (ret)
4723 goto mode1_reset_failed;
4724
4725 /* wait for asic to come out of reset */
4726 for (i = 0; i < adev->usec_timeout; i++) {
4727 u32 memsize = adev->nbio.funcs->get_memsize(adev);
4728
4729 if (memsize != 0xffffffff)
4730 break;
4731 udelay(1);
4732 }
4733
4734 if (i >= adev->usec_timeout) {
4735 ret = -ETIMEDOUT;
4736 goto mode1_reset_failed;
4737 }
4738
4739 amdgpu_atombios_scratch_regs_engine_hung(adev, false);
4740
4741 return 0;
4742
4743 mode1_reset_failed:
4744 dev_err(adev->dev, "GPU mode1 reset failed\n");
4745 return ret;
4746 }
4747
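/**
 * amdgpu_device_pre_asic_reset - prepare a device for ASIC reset
 *
 * @adev: amdgpu_device pointer
 * @reset_context: amdgpu reset context pointer
 *
 * Stops the SR-IOV data exchange where applicable, clears and
 * force-completes the hardware fences, and on bare metal tries a soft
 * reset first before flagging whether a full reset is still needed in
 * @reset_context.
 * Returns 0 on success, negative error code on failure.
 */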
4748 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4749 struct amdgpu_reset_context *reset_context)
4750 {
4751 int i, r = 0;
4752 struct amdgpu_job *job = NULL;
4753 bool need_full_reset =
4754 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4755
4756 if (reset_context->reset_req_dev == adev)
4757 job = reset_context->job;
4758
4759 if (amdgpu_sriov_vf(adev)) {
4760 /* stop the data exchange thread */
4761 amdgpu_virt_fini_data_exchange(adev);
4762 }
4763
4764 amdgpu_fence_driver_isr_toggle(adev, true);
4765
4766 /* block all schedulers and reset given job's ring */
4767 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4768 struct amdgpu_ring *ring = adev->rings[i];
4769
4770 if (!ring || !ring->sched.thread)
4771 continue;
4772
4773 /* Clear the job fences from the fence drv to avoid force_completion
4774 * leaving NULL and vm flush fences in the fence drv
4775 */
4776 amdgpu_fence_driver_clear_job_fences(ring);
4777
4778 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4779 amdgpu_fence_driver_force_completion(ring);
4780 }
4781
4782 amdgpu_fence_driver_isr_toggle(adev, false);
4783
4784 if (job && job->vm)
4785 drm_sched_increase_karma(&job->base);
4786
4787 r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
4788 /* If reset handler not implemented, continue; otherwise return */
4789 if (r == -EOPNOTSUPP)
4790 r = 0;
4791 else
4792 return r;
4793
4794 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4795 if (!amdgpu_sriov_vf(adev)) {
4796
4797 if (!need_full_reset)
4798 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4799
4800 if (!need_full_reset && amdgpu_gpu_recovery &&
4801 amdgpu_device_ip_check_soft_reset(adev)) {
4802 amdgpu_device_ip_pre_soft_reset(adev);
4803 r = amdgpu_device_ip_soft_reset(adev);
4804 amdgpu_device_ip_post_soft_reset(adev);
4805 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4806 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4807 need_full_reset = true;
4808 }
4809 }
4810
4811 if (need_full_reset)
4812 r = amdgpu_device_ip_suspend(adev);
4813 if (need_full_reset)
4814 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4815 else
4816 clear_bit(AMDGPU_NEED_FULL_RESET,
4817 &reset_context->flags);
4818 }
4819
4820 return r;
4821 }
4822
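/*
 * Snapshot the registers in adev->reset_dump_reg_list so their values
 * can be included in the device coredump taken for this reset.
 */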
4823 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
4824 {
4825 int i;
4826
4827 lockdep_assert_held(&adev->reset_domain->sem);
4828
4829 for (i = 0; i < adev->num_regs; i++) {
4830 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
4831 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i],
4832 adev->reset_dump_reg_value[i]);
4833 }
4834
4835 return 0;
4836 }
4837
4838 #ifdef CONFIG_DEV_COREDUMP
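/*
 * devcoredump read callback: formats the reset metadata (kernel version,
 * offending process, VRAM-lost status and register dump) into the
 * coredump buffer via a drm coredump printer.
 */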
4839 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
4840 size_t count, void *data, size_t datalen)
4841 {
4842 struct drm_printer p;
4843 struct amdgpu_device *adev = data;
4844 struct drm_print_iterator iter;
4845 int i;
4846
4847 iter.data = buffer;
4848 iter.offset = 0;
4849 iter.start = offset;
4850 iter.remain = count;
4851
4852 p = drm_coredump_printer(&iter);
4853
4854 drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
4855 drm_printf(&p, "kernel: " UTS_RELEASE "\n");
4856 drm_printf(&p, "module: " KBUILD_MODNAME "\n");
4857 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec);
4858 if (adev->reset_task_info.pid)
4859 drm_printf(&p, "process_name: %s PID: %d\n",
4860 adev->reset_task_info.process_name,
4861 adev->reset_task_info.pid);
4862
4863 if (adev->reset_vram_lost)
4864 drm_printf(&p, "VRAM is lost due to GPU reset!\n");
4865 if (adev->num_regs) {
4866 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n");
4867
4868 for (i = 0; i < adev->num_regs; i++)
4869 drm_printf(&p, "0x%08x: 0x%08x\n",
4870 adev->reset_dump_reg_list[i],
4871 adev->reset_dump_reg_value[i]);
4872 }
4873
4874 return count - iter.remain;
4875 }
4876
4877 static void amdgpu_devcoredump_free(void *data)
4878 {
4879 }
4880
4881 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
4882 {
4883 struct drm_device *dev = adev_to_drm(adev);
4884
4885 ktime_get_ts64(&adev->reset_time);
4886 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_NOWAIT,
4887 amdgpu_devcoredump_read, amdgpu_devcoredump_free);
4888 }
4889 #endif
4890
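/**
 * amdgpu_do_asic_reset - reset and re-initialize a list of devices
 *
 * @device_list_handle: list of devices (an XGMI hive or a single device) to reset
 * @reset_context: amdgpu reset context pointer
 *
 * Tries the ASIC-specific reset handler first; if none is implemented,
 * performs the full resets (in parallel for XGMI hives), re-posts the
 * ASICs, resumes the IP blocks and recovers VRAM contents if needed.
 * Returns 0 on success, negative error code on failure.
 */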
4891 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
4892 struct amdgpu_reset_context *reset_context)
4893 {
4894 struct amdgpu_device *tmp_adev = NULL;
4895 bool need_full_reset, skip_hw_reset, vram_lost = false;
4896 int r = 0;
4897 bool gpu_reset_for_dev_remove = false;
4898
4899 /* Try reset handler method first */
4900 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
4901 reset_list);
4902 amdgpu_reset_reg_dumps(tmp_adev);
4903
4904 reset_context->reset_device_list = device_list_handle;
4905 r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
4906 /* If reset handler not implemented, continue; otherwise return */
4907 if (r == -EOPNOTSUPP)
4908 r = 0;
4909 else
4910 return r;
4911
4912 /* Reset handler not implemented, use the default method */
4913 need_full_reset =
4914 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4915 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
4916
4917 gpu_reset_for_dev_remove =
4918 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
4919 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4920
4921 /*
4922 * ASIC reset has to be done on all XGMI hive nodes ASAP
4923 * to allow proper link negotiation in FW (within 1 sec)
4924 */
4925 if (!skip_hw_reset && need_full_reset) {
4926 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4927 /* For XGMI run all resets in parallel to speed up the process */
4928 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4929 tmp_adev->gmc.xgmi.pending_reset = false;
4930 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4931 r = -EALREADY;
4932 } else
4933 r = amdgpu_asic_reset(tmp_adev);
4934
4935 if (r) {
4936 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4937 r, adev_to_drm(tmp_adev)->unique);
4938 break;
4939 }
4940 }
4941
4942 /* For XGMI wait for all resets to complete before proceeding */
4943 if (!r) {
4944 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4945 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4946 flush_work(&tmp_adev->xgmi_reset_work);
4947 r = tmp_adev->asic_reset_res;
4948 if (r)
4949 break;
4950 }
4951 }
4952 }
4953 }
4954
4955 if (!r && amdgpu_ras_intr_triggered()) {
4956 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4957 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops &&
4958 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
4959 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev);
4960 }
4961
4962 amdgpu_ras_intr_cleared();
4963 }
4964
4965 /* Since the mode1 reset affects base ip blocks, the
4966 * phase1 ip blocks need to be resumed. Otherwise there
4967 * will be a BIOS signature error and the psp bootloader
4968 * can't load kdb on the next amdgpu install.
4969 */
4970 if (gpu_reset_for_dev_remove) {
4971 list_for_each_entry(tmp_adev, device_list_handle, reset_list)
4972 amdgpu_device_ip_resume_phase1(tmp_adev);
4973
4974 goto end;
4975 }
4976
4977 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4978 if (need_full_reset) {
4979 /* post card */
4980 r = amdgpu_device_asic_init(tmp_adev);
4981 if (r) {
4982 dev_warn(tmp_adev->dev, "asic atom init failed!");
4983 } else {
4984 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4985
4986 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4987 if (r)
4988 goto out;
4989
4990 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4991 #ifdef CONFIG_DEV_COREDUMP
4992 tmp_adev->reset_vram_lost = vram_lost;
4993 memset(&tmp_adev->reset_task_info, 0,
4994 sizeof(tmp_adev->reset_task_info));
4995 if (reset_context->job && reset_context->job->vm)
4996 tmp_adev->reset_task_info =
4997 reset_context->job->vm->task_info;
4998 amdgpu_reset_capture_coredumpm(tmp_adev);
4999 #endif
5000 if (vram_lost) {
5001 DRM_INFO("VRAM is lost due to GPU reset!\n");
5002 amdgpu_inc_vram_lost(tmp_adev);
5003 }
5004
5005 r = amdgpu_device_fw_loading(tmp_adev);
5006 if (r)
5007 return r;
5008
5009 r = amdgpu_device_ip_resume_phase2(tmp_adev);
5010 if (r)
5011 goto out;
5012
5013 if (vram_lost)
5014 amdgpu_device_fill_reset_magic(tmp_adev);
5015
5016 /*
5017 * Add this ASIC as tracked, as the reset has already
5018 * completed successfully.
5019 */
5020 amdgpu_register_gpu_instance(tmp_adev);
5021
5022 if (!reset_context->hive &&
5023 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5024 amdgpu_xgmi_add_device(tmp_adev);
5025
5026 r = amdgpu_device_ip_late_init(tmp_adev);
5027 if (r)
5028 goto out;
5029
5030 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);
5031
5032 /*
5033 * The GPU enters a bad state once the number of faulty pages
5034 * detected by ECC reaches the threshold, and RAS
5035 * recovery is scheduled next. So add a check
5036 * here to break recovery if the bad page threshold is
5037 * indeed exceeded, and remind the user to
5038 * retire this GPU or set a bigger
5039 * bad_page_threshold value to fix this the next time
5040 * the driver is probed.
5041 */
5042 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
5043 /* must succeed. */
5044 amdgpu_ras_resume(tmp_adev);
5045 } else {
5046 r = -EINVAL;
5047 goto out;
5048 }
5049
5050 /* Update PSP FW topology after reset */
5051 if (reset_context->hive &&
5052 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5053 r = amdgpu_xgmi_update_topology(
5054 reset_context->hive, tmp_adev);
5055 }
5056 }
5057
5058 out:
5059 if (!r) {
5060 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
5061 r = amdgpu_ib_ring_tests(tmp_adev);
5062 if (r) {
5063 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
5064 need_full_reset = true;
5065 r = -EAGAIN;
5066 goto end;
5067 }
5068 }
5069
5070 if (!r)
5071 r = amdgpu_device_recover_vram(tmp_adev);
5072 else
5073 tmp_adev->asic_reset_res = r;
5074 }
5075
5076 end:
5077 if (need_full_reset)
5078 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5079 else
5080 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5081 return r;
5082 }
5083
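/* Select the MP1 state matching the reset method about to be used. */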
5084 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
5085 {
5086
5087 switch (amdgpu_asic_reset_method(adev)) {
5088 case AMD_RESET_METHOD_MODE1:
5089 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
5090 break;
5091 case AMD_RESET_METHOD_MODE2:
5092 adev->mp1_state = PP_MP1_STATE_RESET;
5093 break;
5094 default:
5095 adev->mp1_state = PP_MP1_STATE_NONE;
5096 break;
5097 }
5098 }
5099
5100 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
5101 {
5102 amdgpu_vf_error_trans_all(adev);
5103 adev->mp1_state = PP_MP1_STATE_NONE;
5104 }
5105
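/*
 * Re-enable runtime PM for the display audio device (function 1 on the
 * same bus/slot as the GPU) that was suspended before the reset.
 */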
5106 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
5107 {
5108 struct pci_dev *p = NULL;
5109
5110 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5111 adev->pdev->bus->number, 1);
5112 if (p) {
5113 pm_runtime_enable(&(p->dev));
5114 pm_runtime_resume(&(p->dev));
5115 }
5116
5117 pci_dev_put(p);
5118 }
5119
5120 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
5121 {
5122 enum amd_reset_method reset_method;
5123 struct pci_dev *p = NULL;
5124 u64 expires;
5125
5126 /*
5127 * For now, only BACO and mode1 reset are confirmed
5128 * to suffer the audio issue if the audio device is not properly suspended.
5129 */
5130 reset_method = amdgpu_asic_reset_method(adev);
5131 if ((reset_method != AMD_RESET_METHOD_BACO) &&
5132 (reset_method != AMD_RESET_METHOD_MODE1))
5133 return -EINVAL;
5134
5135 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5136 adev->pdev->bus->number, 1);
5137 if (!p)
5138 return -ENODEV;
5139
5140 expires = pm_runtime_autosuspend_expiration(&(p->dev));
5141 if (!expires)
5142 /*
5143 * If we cannot get the audio device autosuspend delay,
5144 * a fixed 4s interval will be used. Since 3s is
5145 * the audio controller's default autosuspend delay setting,
5146 * the 4s used here is guaranteed to cover that.
5147 */
5148 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
5149
5150 while (!pm_runtime_status_suspended(&(p->dev))) {
5151 if (!pm_runtime_suspend(&(p->dev)))
5152 break;
5153
5154 if (expires < ktime_get_mono_fast_ns()) {
5155 dev_warn(adev->dev, "failed to suspend display audio\n");
5156 pci_dev_put(p);
5157 /* TODO: abort the succeeding gpu reset? */
5158 return -ETIMEDOUT;
5159 }
5160 }
5161
5162 pm_runtime_disable(&(p->dev));
5163
5164 pci_dev_put(p);
5165 return 0;
5166 }
5167
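/*
 * Cancel any reset works still queued (debugfs reset, KFD, SR-IOV FLR
 * and RAS recovery) so they do not race with the reset in progress.
 */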
5168 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
5169 {
5170 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5171
5172 #if defined(CONFIG_DEBUG_FS)
5173 if (!amdgpu_sriov_vf(adev))
5174 cancel_work(&adev->reset_work);
5175 #endif
5176
5177 if (adev->kfd.dev)
5178 cancel_work(&adev->kfd.reset_work);
5179
5180 if (amdgpu_sriov_vf(adev))
5181 cancel_work(&adev->virt.flr_work);
5182
5183 if (con && adev->ras_enabled)
5184 cancel_work(&con->recovery_work);
5185
5186 }
5187
5188 /**
5189 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
5190 *
5191 * @adev: amdgpu_device pointer
5192 * @job: which job triggered the hang
5193 * @reset_context: amdgpu reset context pointer
5194 *
5195 * Attempt to reset the GPU if it has hung (all asics).
5196 * Attempt to do a soft reset or full reset and reinitialize the ASIC.
5197 * Returns 0 for success or an error on failure.
5198 */
5199
5200 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5201 struct amdgpu_job *job,
5202 struct amdgpu_reset_context *reset_context)
5203 {
5204 struct list_head device_list, *device_list_handle = NULL;
5205 bool job_signaled = false;
5206 struct amdgpu_hive_info *hive = NULL;
5207 struct amdgpu_device *tmp_adev = NULL;
5208 int i, r = 0;
5209 bool need_emergency_restart = false;
5210 bool audio_suspended = false;
5211 bool gpu_reset_for_dev_remove = false;
5212
5213 gpu_reset_for_dev_remove =
5214 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
5215 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5216
5217 /*
5218 * Special case: RAS triggered and full reset isn't supported
5219 */
5220 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5221
5222 /*
5223 * Flush RAM to disk so that after reboot
5224 * the user can read the log and see why the system rebooted.
5225 */
5226 if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
5227 amdgpu_ras_get_context(adev)->reboot) {
5228 DRM_WARN("Emergency reboot.");
5229
5230 ksys_sync_helper();
5231 emergency_restart();
5232 }
5233
5234 dev_info(adev->dev, "GPU %s begin!\n",
5235 need_emergency_restart ? "jobs stop":"reset");
5236
5237 if (!amdgpu_sriov_vf(adev))
5238 hive = amdgpu_get_xgmi_hive(adev);
5239 if (hive)
5240 mutex_lock(&hive->hive_lock);
5241
5242 reset_context->job = job;
5243 reset_context->hive = hive;
5244 /*
5245 * Build list of devices to reset.
5246 * In case we are in XGMI hive mode, re-sort the device list
5247 * to put adev in the 1st position.
5248 */
5249 INIT_LIST_HEAD(&device_list);
5250 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
5251 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
5252 list_add_tail(&tmp_adev->reset_list, &device_list);
5253 if (gpu_reset_for_dev_remove && adev->shutdown)
5254 tmp_adev->shutdown = true;
5255 }
5256 if (!list_is_first(&adev->reset_list, &device_list))
5257 list_rotate_to_front(&adev->reset_list, &device_list);
5258 device_list_handle = &device_list;
5259 } else {
5260 list_add_tail(&adev->reset_list, &device_list);
5261 device_list_handle = &device_list;
5262 }
5263
5264 /* We need to lock reset domain only once both for XGMI and single device */
5265 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5266 reset_list);
5267 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
5268
5269 /* block all schedulers and reset given job's ring */
5270 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5271
5272 amdgpu_device_set_mp1_state(tmp_adev);
5273
5274 /*
5275 * Try to put the audio codec into suspend state
5276 * before the gpu reset is started.
5277 *
5278 * The power domain of the graphics device is
5279 * shared with the AZ power domain. Without this,
5280 * we may change the audio hardware from behind
5281 * the audio driver's back. That will trigger
5282 * some audio codec errors.
5283 */
5284 if (!amdgpu_device_suspend_display_audio(tmp_adev))
5285 audio_suspended = true;
5286
5287 amdgpu_ras_set_error_query_ready(tmp_adev, false);
5288
5289 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5290
5291 if (!amdgpu_sriov_vf(tmp_adev))
5292 amdgpu_amdkfd_pre_reset(tmp_adev);
5293
5294 /*
5295 * Mark these ASICs to be reset as untracked first,
5296 * and add them back after the reset has completed.
5297 */
5298 amdgpu_unregister_gpu_instance(tmp_adev);

		drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);

		/* disable ras on ALL IPs */
		if (!need_emergency_restart &&
		    amdgpu_device_ip_need_full_reset(tmp_adev))
			amdgpu_ras_suspend(tmp_adev);

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, job ? &job->base : NULL);

			if (need_emergency_restart)
				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
		}
		atomic_inc(&tmp_adev->gpu_reset_counter);
	}

	if (need_emergency_restart)
		goto skip_sched_resume;

	/*
	 * Must check guilty signal here since after this point all old
	 * HW fences are force signaled.
	 *
	 * job->base holds a reference to parent fence
	 */
	if (job && dma_fence_is_signaled(&job->hw_fence)) {
		job_signaled = true;
		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
		goto skip_hw_reset;
	}

retry:	/* Rest of adevs pre asic reset from XGMI hive. */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		if (gpu_reset_for_dev_remove) {
			/* Workaround for ASICs that need to disable SMC first */
			amdgpu_device_smu_fini_early(tmp_adev);
		}
		r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
		/* TODO: Should we stop on the first failure? */
		if (r) {
			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
				 r, adev_to_drm(tmp_adev)->unique);
			tmp_adev->asic_reset_res = r;
		}

		/*
		 * Drop all pending non scheduler resets. Scheduler resets
		 * were already dropped during drm_sched_stop
		 */
		amdgpu_device_stop_pending_resets(tmp_adev);
	}

	/* Actual ASIC resets if needed. */
	/* Host driver will handle XGMI hive reset for SRIOV */
	if (amdgpu_sriov_vf(adev)) {
		r = amdgpu_device_reset_sriov(adev, job ? false : true);
		if (r)
			adev->asic_reset_res = r;

		/* Aldebaran and gfx_11_0_3 support RAS in SRIOV, so we need to resume RAS during reset */
		if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) ||
		    adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3))
			amdgpu_ras_resume(adev);
	} else {
		r = amdgpu_do_asic_reset(device_list_handle, reset_context);
		if (r && r == -EAGAIN)
			goto retry;

		if (!r && gpu_reset_for_dev_remove)
			goto recover_end;
	}

skip_hw_reset:

	/* Post ASIC reset for all devs. */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_start(&ring->sched, true);
		}

		if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3))
			amdgpu_mes_self_test(tmp_adev);

		if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));

		if (tmp_adev->asic_reset_res)
			r = tmp_adev->asic_reset_res;

		tmp_adev->asic_reset_res = 0;

		if (r) {
			/* bad news, how to tell it to userspace? */
			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
			if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
				DRM_WARN("smart shift update failed\n");
		}
	}

skip_sched_resume:
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		/* unlock kfd: SRIOV would do it separately */
		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);

		/* kfd_post_reset will do nothing if the kfd device is not initialized,
		 * so bring up kfd here if it was not initialized before.
		 */
		if (!adev->kfd.init_complete)
			amdgpu_amdkfd_device_init(adev);

		if (audio_suspended)
			amdgpu_device_resume_display_audio(tmp_adev);

		amdgpu_device_unset_mp1_state(tmp_adev);

		amdgpu_ras_set_error_query_ready(tmp_adev, true);
	}

recover_end:
	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
				    reset_list);
	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);

	if (hive) {
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}

	if (r)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);

	atomic_set(&adev->reset_domain->reset_res, r);
	return r;
}
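
/*
 * Illustrative sketch (an assumption about the callers, which live outside
 * this file): the recovery path above is typically entered from a job timeout
 * handler or a RAS interrupt. Assuming the function above is
 * amdgpu_device_gpu_recover(), such a caller would roughly do:
 *
 *	struct amdgpu_reset_context reset_context;
 *
 *	memset(&reset_context, 0, sizeof(reset_context));
 *	reset_context.method = AMD_RESET_METHOD_NONE;
 *	reset_context.reset_req_dev = adev;
 *	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 *
 *	r = amdgpu_device_gpu_recover(adev, job, &reset_context);
 *
 * where job may be NULL when no specific guilty job triggered the reset.
 */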

/**
 * amdgpu_device_get_pcie_info - fetch PCIE info about the PCIE slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
 */
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
{
	struct pci_dev *pdev;
	enum pci_bus_speed speed_cap, platform_speed_cap;
	enum pcie_link_width platform_link_width;

	if (amdgpu_pcie_gen_cap)
		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;

	if (amdgpu_pcie_lane_cap)
		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;

	/* covers APUs as well */
	if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
		if (adev->pm.pcie_gen_mask == 0)
			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
		if (adev->pm.pcie_mlw_mask == 0)
			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
		return;
	}

	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
		return;

	pcie_bandwidth_available(adev->pdev, NULL,
				 &platform_speed_cap, &platform_link_width);

	if (adev->pm.pcie_gen_mask == 0) {
		/* asic caps */
		pdev = adev->pdev;
		speed_cap = pcie_get_speed_cap(pdev);
		if (speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
		} else {
			if (speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
			else if (speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
		}
		/* platform caps */
		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
		} else {
			if (platform_speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
			else if (platform_speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;

		}
	}
	if (adev->pm.pcie_mlw_mask == 0) {
		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
		} else {
			switch (platform_link_width) {
			case PCIE_LNK_X32:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X16:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X12:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X8:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X4:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X2:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X1:
				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
				break;
			default:
				break;
			}
		}
	}
}
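
/*
 * Illustrative note (an assumption about the module-parameter plumbing, which
 * lives outside this file): amdgpu_pcie_gen_cap and amdgpu_pcie_lane_cap used
 * above are exposed as module parameters, so the detected capabilities can
 * typically be overridden at load time, e.g.:
 *
 *	modprobe amdgpu pcie_gen_cap=<gen_mask> pcie_lane_cap=<lane_mask>
 *
 * where the masks are interpreted with the same CAIL_*_LINK_SPEED_SUPPORT_*
 * and CAIL_PCIE_LINK_WIDTH_SUPPORT_* bit definitions used in this function.
 */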

/**
 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
 *
 * @adev: amdgpu_device pointer
 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
 *
 * Return true if @peer_adev can access (DMA) @adev through the PCIe
 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
 * @peer_adev.
 */
bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
				      struct amdgpu_device *peer_adev)
{
#ifdef CONFIG_HSA_AMD_P2P
	uint64_t address_mask = peer_adev->dev->dma_mask ?
		~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
	resource_size_t aper_limit =
		adev->gmc.aper_base + adev->gmc.aper_size - 1;
	bool p2p_access =
		!adev->gmc.xgmi.connected_to_cpu &&
		!(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);

	return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
		adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
		!(adev->gmc.aper_base & address_mask ||
		  aper_limit & address_mask));
#else
	return false;
#endif
}
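
/*
 * Illustrative sketch (not taken from this file): a caller deciding whether a
 * peer device may DMA directly into this device's VRAM would typically gate
 * the P2P path on this check, roughly:
 *
 *	if (amdgpu_device_is_peer_accessible(bo_adev, adev))
 *		... map the peer buffer through its visible VRAM BAR ...
 *	else
 *		... fall back to access through system memory (GTT) ...
 *
 * where bo_adev owns the VRAM and adev is the device that wants to access it.
 */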

int amdgpu_device_baco_enter(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

	return amdgpu_dpm_baco_enter(adev);
}

int amdgpu_device_baco_exit(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	ret = amdgpu_dpm_baco_exit(adev);
	if (ret)
		return ret;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

	if (amdgpu_passthrough(adev) && adev->nbio.funcs &&
	    adev->nbio.funcs->clear_doorbell_interrupt)
		adev->nbio.funcs->clear_doorbell_interrupt(adev);

	return 0;
}
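
/*
 * Illustrative note (an assumption about the runtime-PM side, which lives
 * outside this file): BACO enter/exit are used as a pair. A runtime-suspend
 * handler would typically call amdgpu_device_baco_enter(dev) after checking
 * amdgpu_device_supports_baco(dev), and the matching runtime-resume handler
 * would call amdgpu_device_baco_exit(dev) before touching the hardware again.
 */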

/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called when a PCI error is detected.
 *
 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		DRM_WARN("No support for XGMI hive yet...");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Locking adev->reset_domain->sem will prevent any external access
		 * to GPU during PCI error recovery
		 */
		amdgpu_device_lock_reset_domain(adev->reset_domain);
		amdgpu_device_set_mp1_state(adev);

		/*
		 * Block any work scheduling as we do for regular GPU reset
		 * for the duration of the recovery
		 */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, NULL);
		}
		atomic_inc(&adev->gpu_reset_counter);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{

	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/* This is called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, so there is no need to reset the slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the pci error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for asic to come out of reset */
	msleep(500);

	/* Restore PCI config space */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d", r);
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's
 * OK to resume normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		drm_sched_start(&ring->sched, true);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
}
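
/*
 * Illustrative sketch (an assumption about the registration side, which lives
 * outside this file): the four callbacks above are typically wired into the
 * PCI core through a struct pci_error_handlers referenced from the driver's
 * struct pci_driver (.err_handler), roughly:
 *
 *	static const struct pci_error_handlers amdgpu_pci_err_handler = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};
 *
 * The PCI core then drives error_detected -> (mmio_enabled or slot_reset) ->
 * resume, matching the pci_channel_state handling above.
 */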

bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (amdgpu_sriov_vf(adev))
		return false;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It helps to maintain the error context when an error occurs.
 * Compared to a simple hang, the system will keep stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc),
 *    clears all CPU mappings to the device, disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}
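
/*
 * Illustrative sketch (an assumption about the submission side, not taken from
 * this file): a caller switching gangs has to order its work after the
 * previous gang leader, roughly:
 *
 *	struct dma_fence *old;
 *
 *	old = amdgpu_device_switch_gang(adev, new_gang_fence);
 *	if (old) {
 *		... add "old" as a dependency of the new submission,
 *		    or dma_fence_wait(old, false) ...
 *		dma_fence_put(old);
 *	}
 *
 * A NULL return means the switch succeeded and no extra dependency is needed.
 */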

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!adev->ip_versions[DCE_HWIP][0] ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
				uint32_t inst, uint32_t reg_addr, char reg_name[],
				uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				  inst, reg_name, (uint32_t)expected_value,
				  (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}
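
/*
 * Illustrative sketch (hypothetical register names, not taken from this file):
 * an IP block would typically use the helper above to poll a status register
 * until a ready bit latches, e.g.:
 *
 *	r = amdgpu_device_wait_on_rreg(adev, 0, reg_offset, "MY_STATUS_REG",
 *				       MY_READY_BIT, MY_READY_BIT);
 *	if (r)
 *		return r;
 *
 * The helper restarts its timeout whenever the polled value changes, so it
 * only gives up (-ETIMEDOUT) when the register stops moving without reaching
 * the expected value.
 */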