1 /*
2  * Copyright 2008 Advanced Micro Devices, Inc.
3  * Copyright 2008 Red Hat Inc.
4  * Copyright 2009 Jerome Glisse.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22  * OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors: Dave Airlie
25  *          Alex Deucher
26  *          Jerome Glisse
27  */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33 
34 #include <drm/drm_atomic_helper.h>
35 #include <drm/drm_probe_helper.h>
36 #include <drm/amdgpu_drm.h>
37 #include <linux/vgaarb.h>
38 #include <linux/vga_switcheroo.h>
39 #include <linux/efi.h>
40 #include "amdgpu.h"
41 #include "amdgpu_trace.h"
42 #include "amdgpu_i2c.h"
43 #include "atom.h"
44 #include "amdgpu_atombios.h"
45 #include "amdgpu_atomfirmware.h"
46 #include "amd_pcie.h"
47 #ifdef CONFIG_DRM_AMDGPU_SI
48 #include "si.h"
49 #endif
50 #ifdef CONFIG_DRM_AMDGPU_CIK
51 #include "cik.h"
52 #endif
53 #include "vi.h"
54 #include "soc15.h"
55 #include "nv.h"
56 #include "bif/bif_4_1_d.h"
57 #include <linux/pci.h>
58 #include <linux/firmware.h>
59 #include "amdgpu_vf_error.h"
60 
61 #include "amdgpu_amdkfd.h"
62 #include "amdgpu_pm.h"
63 
64 #include "amdgpu_xgmi.h"
65 #include "amdgpu_ras.h"
66 #include "amdgpu_pmu.h"
67 #include "amdgpu_fru_eeprom.h"
68 #include "amdgpu_reset.h"
69 
70 #include <linux/suspend.h>
71 #include <drm/task_barrier.h>
72 #include <linux/pm_runtime.h>
73 
74 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
75 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
76 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
77 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
78 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
79 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
80 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
81 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
82 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
83 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
84 MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin");
85 
86 #define AMDGPU_RESUME_MS		2000
87 
88 const char *amdgpu_asic_name[] = {
89 	"TAHITI",
90 	"PITCAIRN",
91 	"VERDE",
92 	"OLAND",
93 	"HAINAN",
94 	"BONAIRE",
95 	"KAVERI",
96 	"KABINI",
97 	"HAWAII",
98 	"MULLINS",
99 	"TOPAZ",
100 	"TONGA",
101 	"FIJI",
102 	"CARRIZO",
103 	"STONEY",
104 	"POLARIS10",
105 	"POLARIS11",
106 	"POLARIS12",
107 	"VEGAM",
108 	"VEGA10",
109 	"VEGA12",
110 	"VEGA20",
111 	"RAVEN",
112 	"ARCTURUS",
113 	"RENOIR",
114 	"ALDEBARAN",
115 	"NAVI10",
116 	"NAVI14",
117 	"NAVI12",
118 	"SIENNA_CICHLID",
119 	"NAVY_FLOUNDER",
120 	"VANGOGH",
121 	"DIMGREY_CAVEFISH",
122 	"LAST",
123 };
124 
125 /**
126  * DOC: pcie_replay_count
127  *
128  * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
132  */
133 
134 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
135 		struct device_attribute *attr, char *buf)
136 {
137 	struct drm_device *ddev = dev_get_drvdata(dev);
138 	struct amdgpu_device *adev = drm_to_adev(ddev);
139 	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
140 
141 	return sysfs_emit(buf, "%llu\n", cnt);
142 }
143 
144 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
145 		amdgpu_device_get_pcie_replay_count, NULL);
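
/*
 * Example (illustrative): from user space this attribute is typically read
 * through sysfs, e.g.
 *
 *	cat /sys/class/drm/card0/device/pcie_replay_count
 *
 * assuming the GPU in question is exposed as card0.
 */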
146 
147 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
148 
149 /**
150  * DOC: product_name
151  *
152  * The amdgpu driver provides a sysfs API for reporting the product name
 * for the device.
 * The file product_name is used for this and returns the product name
155  * as returned from the FRU.
156  * NOTE: This is only available for certain server cards
157  */
158 
159 static ssize_t amdgpu_device_get_product_name(struct device *dev,
160 		struct device_attribute *attr, char *buf)
161 {
162 	struct drm_device *ddev = dev_get_drvdata(dev);
163 	struct amdgpu_device *adev = drm_to_adev(ddev);
164 
165 	return sysfs_emit(buf, "%s\n", adev->product_name);
166 }
167 
168 static DEVICE_ATTR(product_name, S_IRUGO,
169 		amdgpu_device_get_product_name, NULL);
170 
171 /**
172  * DOC: product_number
173  *
174  * The amdgpu driver provides a sysfs API for reporting the part number
 * for the device.
 * The file product_number is used for this and returns the part number
177  * as returned from the FRU.
178  * NOTE: This is only available for certain server cards
179  */
180 
181 static ssize_t amdgpu_device_get_product_number(struct device *dev,
182 		struct device_attribute *attr, char *buf)
183 {
184 	struct drm_device *ddev = dev_get_drvdata(dev);
185 	struct amdgpu_device *adev = drm_to_adev(ddev);
186 
187 	return sysfs_emit(buf, "%s\n", adev->product_number);
188 }
189 
190 static DEVICE_ATTR(product_number, S_IRUGO,
191 		amdgpu_device_get_product_number, NULL);
192 
193 /**
194  * DOC: serial_number
195  *
196  * The amdgpu driver provides a sysfs API for reporting the serial number
 * for the device.
198  * The file serial_number is used for this and returns the serial number
199  * as returned from the FRU.
200  * NOTE: This is only available for certain server cards
201  */
202 
203 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
204 		struct device_attribute *attr, char *buf)
205 {
206 	struct drm_device *ddev = dev_get_drvdata(dev);
207 	struct amdgpu_device *adev = drm_to_adev(ddev);
208 
209 	return sysfs_emit(buf, "%s\n", adev->serial);
210 }
211 
212 static DEVICE_ATTR(serial_number, S_IRUGO,
213 		amdgpu_device_get_serial_number, NULL);
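
/*
 * Example (illustrative): the FRU derived attributes above are typically read
 * through sysfs, e.g.
 *
 *	cat /sys/class/drm/card0/device/product_name
 *	cat /sys/class/drm/card0/device/product_number
 *	cat /sys/class/drm/card0/device/serial_number
 *
 * assuming the GPU is exposed as card0; they are only populated on boards
 * whose FRU data can be read.
 */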
214 
215 /**
216  * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
217  *
218  * @dev: drm_device pointer
219  *
220  * Returns true if the device is a dGPU with ATPX power control,
 * otherwise returns false.
222  */
223 bool amdgpu_device_supports_px(struct drm_device *dev)
224 {
225 	struct amdgpu_device *adev = drm_to_adev(dev);
226 
227 	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
228 		return true;
229 	return false;
230 }
231 
232 /**
233  * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
234  *
235  * @dev: drm_device pointer
236  *
237  * Returns true if the device is a dGPU with ACPI power control,
 * otherwise returns false.
239  */
240 bool amdgpu_device_supports_boco(struct drm_device *dev)
241 {
242 	struct amdgpu_device *adev = drm_to_adev(dev);
243 
244 	if (adev->has_pr3 ||
245 	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
246 		return true;
247 	return false;
248 }
249 
250 /**
251  * amdgpu_device_supports_baco - Does the device support BACO
252  *
253  * @dev: drm_device pointer
254  *
 * Returns true if the device supports BACO,
 * otherwise returns false.
257  */
258 bool amdgpu_device_supports_baco(struct drm_device *dev)
259 {
260 	struct amdgpu_device *adev = drm_to_adev(dev);
261 
262 	return amdgpu_asic_supports_baco(adev);
263 }
264 
265 /*
266  * VRAM access helper functions
267  */
268 
269 /**
270  * amdgpu_device_vram_access - read/write a buffer in vram
271  *
272  * @adev: amdgpu_device pointer
273  * @pos: offset of the buffer in vram
274  * @buf: virtual address of the buffer in system memory
 * @size: read/write size in bytes; the buffer at @buf must be at least @size bytes
276  * @write: true - write to vram, otherwise - read from vram
277  */
278 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
279 			       uint32_t *buf, size_t size, bool write)
280 {
281 	unsigned long flags;
282 	uint32_t hi = ~0;
283 	uint64_t last;
284 
285 
286 #ifdef CONFIG_64BIT
287 	last = min(pos + size, adev->gmc.visible_vram_size);
288 	if (last > pos) {
289 		void __iomem *addr = adev->mman.aper_base_kaddr + pos;
290 		size_t count = last - pos;
291 
292 		if (write) {
293 			memcpy_toio(addr, buf, count);
294 			mb();
295 			amdgpu_asic_flush_hdp(adev, NULL);
296 		} else {
297 			amdgpu_asic_invalidate_hdp(adev, NULL);
298 			mb();
299 			memcpy_fromio(buf, addr, count);
300 		}
301 
302 		if (count == size)
303 			return;
304 
305 		pos += count;
306 		buf += count / 4;
307 		size -= count;
308 	}
309 #endif
310 
311 	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
312 	for (last = pos + size; pos < last; pos += 4) {
313 		uint32_t tmp = pos >> 31;
314 
315 		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
316 		if (tmp != hi) {
317 			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
318 			hi = tmp;
319 		}
320 		if (write)
321 			WREG32_NO_KIQ(mmMM_DATA, *buf++);
322 		else
323 			*buf++ = RREG32_NO_KIQ(mmMM_DATA);
324 	}
325 	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
326 }
327 
328 /*
329  * register access helper functions.
330  */
331 
332 /* Check if hw access should be skipped because of hotplug or device error */
333 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
334 {
335 	if (adev->in_pci_err_recovery)
336 		return true;
337 
338 #ifdef CONFIG_LOCKDEP
339 	/*
340 	 * This is a bit complicated to understand, so worth a comment. What we assert
341 	 * here is that the GPU reset is not running on another thread in parallel.
342 	 *
	 * For this we trylock the read side of the reset semaphore; if that succeeds
	 * we know that the reset is not running in parallel.
345 	 *
346 	 * If the trylock fails we assert that we are either already holding the read
347 	 * side of the lock or are the reset thread itself and hold the write side of
348 	 * the lock.
349 	 */
350 	if (in_task()) {
351 		if (down_read_trylock(&adev->reset_sem))
352 			up_read(&adev->reset_sem);
353 		else
354 			lockdep_assert_held(&adev->reset_sem);
355 	}
356 #endif
357 	return false;
358 }
359 
360 /**
361  * amdgpu_device_rreg - read a memory mapped IO or indirect register
362  *
363  * @adev: amdgpu_device pointer
364  * @reg: dword aligned register offset
365  * @acc_flags: access flags which require special behavior
366  *
367  * Returns the 32 bit value from the offset specified.
368  */
369 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
370 			    uint32_t reg, uint32_t acc_flags)
371 {
372 	uint32_t ret;
373 
374 	if (amdgpu_device_skip_hw_access(adev))
375 		return 0;
376 
377 	if ((reg * 4) < adev->rmmio_size) {
378 		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
379 		    amdgpu_sriov_runtime(adev) &&
380 		    down_read_trylock(&adev->reset_sem)) {
381 			ret = amdgpu_kiq_rreg(adev, reg);
382 			up_read(&adev->reset_sem);
383 		} else {
384 			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
385 		}
386 	} else {
387 		ret = adev->pcie_rreg(adev, reg * 4);
388 	}
389 
390 	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
391 
392 	return ret;
393 }
394 
395 /*
396  * MMIO register read with bytes helper functions
397  * @offset:bytes offset from MMIO start
398  *
399 */
400 
401 /**
402  * amdgpu_mm_rreg8 - read a memory mapped IO register
403  *
404  * @adev: amdgpu_device pointer
405  * @offset: byte aligned register offset
406  *
407  * Returns the 8 bit value from the offset specified.
408  */
409 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
410 {
411 	if (amdgpu_device_skip_hw_access(adev))
412 		return 0;
413 
414 	if (offset < adev->rmmio_size)
415 		return (readb(adev->rmmio + offset));
416 	BUG();
417 }
418 
419 /*
420  * MMIO register write with bytes helper functions
421  * @offset:bytes offset from MMIO start
422  * @value: the value want to be written to the register
423  *
424 */
425 /**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
427  *
428  * @adev: amdgpu_device pointer
429  * @offset: byte aligned register offset
430  * @value: 8 bit value to write
431  *
432  * Writes the value specified to the offset specified.
433  */
434 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
435 {
436 	if (amdgpu_device_skip_hw_access(adev))
437 		return;
438 
439 	if (offset < adev->rmmio_size)
440 		writeb(value, adev->rmmio + offset);
441 	else
442 		BUG();
443 }
444 
445 /**
446  * amdgpu_device_wreg - write to a memory mapped IO or indirect register
447  *
448  * @adev: amdgpu_device pointer
449  * @reg: dword aligned register offset
450  * @v: 32 bit value to write to the register
451  * @acc_flags: access flags which require special behavior
452  *
453  * Writes the value specified to the offset specified.
454  */
455 void amdgpu_device_wreg(struct amdgpu_device *adev,
456 			uint32_t reg, uint32_t v,
457 			uint32_t acc_flags)
458 {
459 	if (amdgpu_device_skip_hw_access(adev))
460 		return;
461 
462 	if ((reg * 4) < adev->rmmio_size) {
463 		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
464 		    amdgpu_sriov_runtime(adev) &&
465 		    down_read_trylock(&adev->reset_sem)) {
466 			amdgpu_kiq_wreg(adev, reg, v);
467 			up_read(&adev->reset_sem);
468 		} else {
469 			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
470 		}
471 	} else {
472 		adev->pcie_wreg(adev, reg * 4, v);
473 	}
474 
475 	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
476 }
477 
478 /*
479  * amdgpu_mm_wreg_mmio_rlc -  write register either with mmio or with RLC path if in range
480  *
481  * this function is invoked only the debugfs register access
482  * */
483 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
484 			     uint32_t reg, uint32_t v)
485 {
486 	if (amdgpu_device_skip_hw_access(adev))
487 		return;
488 
489 	if (amdgpu_sriov_fullaccess(adev) &&
490 	    adev->gfx.rlc.funcs &&
491 	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
492 		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
493 			return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v, 0);
494 	} else {
495 		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
496 	}
497 }
498 
499 /**
500  * amdgpu_mm_rdoorbell - read a doorbell dword
501  *
502  * @adev: amdgpu_device pointer
503  * @index: doorbell index
504  *
505  * Returns the value in the doorbell aperture at the
506  * requested doorbell index (CIK).
507  */
508 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
509 {
510 	if (amdgpu_device_skip_hw_access(adev))
511 		return 0;
512 
513 	if (index < adev->doorbell.num_doorbells) {
514 		return readl(adev->doorbell.ptr + index);
515 	} else {
516 		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
517 		return 0;
518 	}
519 }
520 
521 /**
522  * amdgpu_mm_wdoorbell - write a doorbell dword
523  *
524  * @adev: amdgpu_device pointer
525  * @index: doorbell index
526  * @v: value to write
527  *
528  * Writes @v to the doorbell aperture at the
529  * requested doorbell index (CIK).
530  */
531 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
532 {
533 	if (amdgpu_device_skip_hw_access(adev))
534 		return;
535 
536 	if (index < adev->doorbell.num_doorbells) {
537 		writel(v, adev->doorbell.ptr + index);
538 	} else {
539 		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
540 	}
541 }
542 
543 /**
544  * amdgpu_mm_rdoorbell64 - read a doorbell Qword
545  *
546  * @adev: amdgpu_device pointer
547  * @index: doorbell index
548  *
549  * Returns the value in the doorbell aperture at the
550  * requested doorbell index (VEGA10+).
551  */
552 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
553 {
554 	if (amdgpu_device_skip_hw_access(adev))
555 		return 0;
556 
557 	if (index < adev->doorbell.num_doorbells) {
558 		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
559 	} else {
560 		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
561 		return 0;
562 	}
563 }
564 
565 /**
566  * amdgpu_mm_wdoorbell64 - write a doorbell Qword
567  *
568  * @adev: amdgpu_device pointer
569  * @index: doorbell index
570  * @v: value to write
571  *
572  * Writes @v to the doorbell aperture at the
573  * requested doorbell index (VEGA10+).
574  */
575 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
576 {
577 	if (amdgpu_device_skip_hw_access(adev))
578 		return;
579 
580 	if (index < adev->doorbell.num_doorbells) {
581 		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
582 	} else {
583 		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
584 	}
585 }
586 
587 /**
588  * amdgpu_device_indirect_rreg - read an indirect register
589  *
590  * @adev: amdgpu_device pointer
591  * @pcie_index: mmio register offset
592  * @pcie_data: mmio register offset
593  * @reg_addr: indirect register address to read from
594  *
595  * Returns the value of indirect register @reg_addr
596  */
597 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
598 				u32 pcie_index, u32 pcie_data,
599 				u32 reg_addr)
600 {
601 	unsigned long flags;
602 	u32 r;
603 	void __iomem *pcie_index_offset;
604 	void __iomem *pcie_data_offset;
605 
606 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
607 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
608 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
609 
610 	writel(reg_addr, pcie_index_offset);
611 	readl(pcie_index_offset);
612 	r = readl(pcie_data_offset);
613 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
614 
615 	return r;
616 }
617 
618 /**
619  * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
620  *
621  * @adev: amdgpu_device pointer
622  * @pcie_index: mmio register offset
623  * @pcie_data: mmio register offset
624  * @reg_addr: indirect register address to read from
625  *
626  * Returns the value of indirect register @reg_addr
627  */
628 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
629 				  u32 pcie_index, u32 pcie_data,
630 				  u32 reg_addr)
631 {
632 	unsigned long flags;
633 	u64 r;
634 	void __iomem *pcie_index_offset;
635 	void __iomem *pcie_data_offset;
636 
637 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
638 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
639 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
640 
641 	/* read low 32 bits */
642 	writel(reg_addr, pcie_index_offset);
643 	readl(pcie_index_offset);
644 	r = readl(pcie_data_offset);
645 	/* read high 32 bits */
646 	writel(reg_addr + 4, pcie_index_offset);
647 	readl(pcie_index_offset);
648 	r |= ((u64)readl(pcie_data_offset) << 32);
649 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
650 
651 	return r;
652 }
653 
654 /**
655  * amdgpu_device_indirect_wreg - write an indirect register address
656  *
657  * @adev: amdgpu_device pointer
658  * @pcie_index: mmio register offset
659  * @pcie_data: mmio register offset
660  * @reg_addr: indirect register offset
661  * @reg_data: indirect register data
662  *
663  */
664 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
665 				 u32 pcie_index, u32 pcie_data,
666 				 u32 reg_addr, u32 reg_data)
667 {
668 	unsigned long flags;
669 	void __iomem *pcie_index_offset;
670 	void __iomem *pcie_data_offset;
671 
672 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
673 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
674 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
675 
676 	writel(reg_addr, pcie_index_offset);
677 	readl(pcie_index_offset);
678 	writel(reg_data, pcie_data_offset);
679 	readl(pcie_data_offset);
680 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
681 }
682 
683 /**
684  * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
685  *
686  * @adev: amdgpu_device pointer
687  * @pcie_index: mmio register offset
688  * @pcie_data: mmio register offset
689  * @reg_addr: indirect register offset
690  * @reg_data: indirect register data
691  *
692  */
693 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
694 				   u32 pcie_index, u32 pcie_data,
695 				   u32 reg_addr, u64 reg_data)
696 {
697 	unsigned long flags;
698 	void __iomem *pcie_index_offset;
699 	void __iomem *pcie_data_offset;
700 
701 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
702 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
703 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
704 
705 	/* write low 32 bits */
706 	writel(reg_addr, pcie_index_offset);
707 	readl(pcie_index_offset);
708 	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
709 	readl(pcie_data_offset);
710 	/* write high 32 bits */
711 	writel(reg_addr + 4, pcie_index_offset);
712 	readl(pcie_index_offset);
713 	writel((u32)(reg_data >> 32), pcie_data_offset);
714 	readl(pcie_data_offset);
715 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
716 }
717 
718 /**
719  * amdgpu_invalid_rreg - dummy reg read function
720  *
721  * @adev: amdgpu_device pointer
722  * @reg: offset of register
723  *
724  * Dummy register read function.  Used for register blocks
725  * that certain asics don't have (all asics).
726  * Returns the value in the register.
727  */
728 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
729 {
730 	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
731 	BUG();
732 	return 0;
733 }
734 
735 /**
736  * amdgpu_invalid_wreg - dummy reg write function
737  *
738  * @adev: amdgpu_device pointer
739  * @reg: offset of register
740  * @v: value to write to the register
741  *
 * Dummy register write function.  Used for register blocks
743  * that certain asics don't have (all asics).
744  */
745 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
746 {
747 	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
748 		  reg, v);
749 	BUG();
750 }
751 
752 /**
753  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
754  *
755  * @adev: amdgpu_device pointer
756  * @reg: offset of register
757  *
758  * Dummy register read function.  Used for register blocks
759  * that certain asics don't have (all asics).
760  * Returns the value in the register.
761  */
762 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
763 {
764 	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
765 	BUG();
766 	return 0;
767 }
768 
769 /**
770  * amdgpu_invalid_wreg64 - dummy reg write function
771  *
772  * @adev: amdgpu_device pointer
773  * @reg: offset of register
774  * @v: value to write to the register
775  *
 * Dummy register write function.  Used for register blocks
777  * that certain asics don't have (all asics).
778  */
779 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
780 {
781 	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
782 		  reg, v);
783 	BUG();
784 }
785 
786 /**
787  * amdgpu_block_invalid_rreg - dummy reg read function
788  *
789  * @adev: amdgpu_device pointer
790  * @block: offset of instance
791  * @reg: offset of register
792  *
793  * Dummy register read function.  Used for register blocks
794  * that certain asics don't have (all asics).
795  * Returns the value in the register.
796  */
797 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
798 					  uint32_t block, uint32_t reg)
799 {
800 	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
801 		  reg, block);
802 	BUG();
803 	return 0;
804 }
805 
806 /**
807  * amdgpu_block_invalid_wreg - dummy reg write function
808  *
809  * @adev: amdgpu_device pointer
810  * @block: offset of instance
811  * @reg: offset of register
812  * @v: value to write to the register
813  *
 * Dummy register write function.  Used for register blocks
815  * that certain asics don't have (all asics).
816  */
817 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
818 				      uint32_t block,
819 				      uint32_t reg, uint32_t v)
820 {
821 	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
822 		  reg, block, v);
823 	BUG();
824 }
825 
826 /**
827  * amdgpu_device_asic_init - Wrapper for atom asic_init
828  *
829  * @adev: amdgpu_device pointer
830  *
831  * Does any asic specific work and then calls atom asic init.
832  */
833 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
834 {
835 	amdgpu_asic_pre_asic_init(adev);
836 
837 	return amdgpu_atom_asic_init(adev->mode_info.atom_context);
838 }
839 
840 /**
841  * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
842  *
843  * @adev: amdgpu_device pointer
844  *
845  * Allocates a scratch page of VRAM for use by various things in the
846  * driver.
847  */
848 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
849 {
850 	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
851 				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
852 				       &adev->vram_scratch.robj,
853 				       &adev->vram_scratch.gpu_addr,
854 				       (void **)&adev->vram_scratch.ptr);
855 }
856 
857 /**
858  * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
859  *
860  * @adev: amdgpu_device pointer
861  *
862  * Frees the VRAM scratch page.
863  */
864 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
865 {
866 	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
867 }
868 
869 /**
870  * amdgpu_device_program_register_sequence - program an array of registers.
871  *
872  * @adev: amdgpu_device pointer
873  * @registers: pointer to the register array
874  * @array_size: size of the register array
875  *
 * Programs an array of registers with AND and OR masks.
877  * This is a helper for setting golden registers.
878  */
879 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
880 					     const u32 *registers,
881 					     const u32 array_size)
882 {
883 	u32 tmp, reg, and_mask, or_mask;
884 	int i;
885 
886 	if (array_size % 3)
887 		return;
888 
	for (i = 0; i < array_size; i += 3) {
890 		reg = registers[i + 0];
891 		and_mask = registers[i + 1];
892 		or_mask = registers[i + 2];
893 
894 		if (and_mask == 0xffffffff) {
895 			tmp = or_mask;
896 		} else {
897 			tmp = RREG32(reg);
898 			tmp &= ~and_mask;
899 			if (adev->family >= AMDGPU_FAMILY_AI)
900 				tmp |= (or_mask & and_mask);
901 			else
902 				tmp |= or_mask;
903 		}
904 		WREG32(reg, tmp);
905 	}
906 }
907 
908 /**
909  * amdgpu_device_pci_config_reset - reset the GPU
910  *
911  * @adev: amdgpu_device pointer
912  *
913  * Resets the GPU using the pci config reset sequence.
914  * Only applicable to asics prior to vega10.
915  */
916 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
917 {
918 	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
919 }
920 
921 /**
922  * amdgpu_device_pci_reset - reset the GPU using generic PCI means
923  *
924  * @adev: amdgpu_device pointer
925  *
926  * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
927  */
928 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
929 {
930 	return pci_reset_function(adev->pdev);
931 }
932 
933 /*
934  * GPU doorbell aperture helpers function.
935  */
936 /**
937  * amdgpu_device_doorbell_init - Init doorbell driver information.
938  *
939  * @adev: amdgpu_device pointer
940  *
941  * Init doorbell driver information (CIK)
942  * Returns 0 on success, error on failure.
943  */
944 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
945 {
946 
947 	/* No doorbell on SI hardware generation */
948 	if (adev->asic_type < CHIP_BONAIRE) {
949 		adev->doorbell.base = 0;
950 		adev->doorbell.size = 0;
951 		adev->doorbell.num_doorbells = 0;
952 		adev->doorbell.ptr = NULL;
953 		return 0;
954 	}
955 
956 	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
957 		return -EINVAL;
958 
959 	amdgpu_asic_init_doorbell_index(adev);
960 
961 	/* doorbell bar mapping */
962 	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
963 	adev->doorbell.size = pci_resource_len(adev->pdev, 2);
964 
965 	adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
966 					     adev->doorbell_index.max_assignment+1);
967 	if (adev->doorbell.num_doorbells == 0)
968 		return -EINVAL;
969 
	/* For Vega, reserve and map two pages on the doorbell BAR since the
	 * SDMA paging queue doorbells use the second page. The
	 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
	 * doorbells are in the first page. So with the paging queue enabled,
	 * num_doorbells needs to grow by one extra page (0x400 dwords).
975 	 */
976 	if (adev->asic_type >= CHIP_VEGA10)
977 		adev->doorbell.num_doorbells += 0x400;
978 
979 	adev->doorbell.ptr = ioremap(adev->doorbell.base,
980 				     adev->doorbell.num_doorbells *
981 				     sizeof(u32));
982 	if (adev->doorbell.ptr == NULL)
983 		return -ENOMEM;
984 
985 	return 0;
986 }
987 
988 /**
989  * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
990  *
991  * @adev: amdgpu_device pointer
992  *
993  * Tear down doorbell driver information (CIK)
994  */
995 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
996 {
997 	iounmap(adev->doorbell.ptr);
998 	adev->doorbell.ptr = NULL;
999 }
1000 
1001 
1002 
1003 /*
1004  * amdgpu_device_wb_*()
1005  * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
1007  */
1008 
1009 /**
1010  * amdgpu_device_wb_fini - Disable Writeback and free memory
1011  *
1012  * @adev: amdgpu_device pointer
1013  *
1014  * Disables Writeback and frees the Writeback memory (all asics).
1015  * Used at driver shutdown.
1016  */
1017 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1018 {
1019 	if (adev->wb.wb_obj) {
1020 		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1021 				      &adev->wb.gpu_addr,
1022 				      (void **)&adev->wb.wb);
1023 		adev->wb.wb_obj = NULL;
1024 	}
1025 }
1026 
1027 /**
1028  * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
1029  *
1030  * @adev: amdgpu_device pointer
1031  *
1032  * Initializes writeback and allocates writeback memory (all asics).
1033  * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
1035  */
1036 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1037 {
1038 	int r;
1039 
1040 	if (adev->wb.wb_obj == NULL) {
1041 		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1042 		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1043 					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1044 					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
1045 					    (void **)&adev->wb.wb);
1046 		if (r) {
1047 			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1048 			return r;
1049 		}
1050 
1051 		adev->wb.num_wb = AMDGPU_MAX_WB;
1052 		memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1053 
1054 		/* clear wb memory */
1055 		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1056 	}
1057 
1058 	return 0;
1059 }
1060 
1061 /**
1062  * amdgpu_device_wb_get - Allocate a wb entry
1063  *
1064  * @adev: amdgpu_device pointer
1065  * @wb: wb index
1066  *
1067  * Allocate a wb slot for use by the driver (all asics).
1068  * Returns 0 on success or -EINVAL on failure.
1069  */
1070 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1071 {
1072 	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1073 
1074 	if (offset < adev->wb.num_wb) {
1075 		__set_bit(offset, adev->wb.used);
1076 		*wb = offset << 3; /* convert to dw offset */
1077 		return 0;
1078 	} else {
1079 		return -EINVAL;
1080 	}
1081 }
1082 
1083 /**
1084  * amdgpu_device_wb_free - Free a wb entry
1085  *
1086  * @adev: amdgpu_device pointer
1087  * @wb: wb index
1088  *
1089  * Free a wb slot allocated for use by the driver (all asics)
1090  */
1091 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1092 {
1093 	wb >>= 3;
1094 	if (wb < adev->wb.num_wb)
1095 		__clear_bit(wb, adev->wb.used);
1096 }
1097 
1098 /**
1099  * amdgpu_device_resize_fb_bar - try to resize FB BAR
1100  *
1101  * @adev: amdgpu_device pointer
1102  *
1103  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
1105  * driver loading by returning -ENODEV.
1106  */
1107 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1108 {
1109 	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1110 	struct pci_bus *root;
1111 	struct resource *res;
1112 	unsigned i;
1113 	u16 cmd;
1114 	int r;
1115 
1116 	/* Bypass for VF */
1117 	if (amdgpu_sriov_vf(adev))
1118 		return 0;
1119 
1120 	/* skip if the bios has already enabled large BAR */
1121 	if (adev->gmc.real_vram_size &&
1122 	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1123 		return 0;
1124 
1125 	/* Check if the root BUS has 64bit memory resources */
1126 	root = adev->pdev->bus;
1127 	while (root->parent)
1128 		root = root->parent;
1129 
1130 	pci_bus_for_each_resource(root, res, i) {
1131 		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1132 		    res->start > 0x100000000ull)
1133 			break;
1134 	}
1135 
1136 	/* Trying to resize is pointless without a root hub window above 4GB */
1137 	if (!res)
1138 		return 0;
1139 
1140 	/* Limit the BAR size to what is available */
1141 	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1142 			rbar_size);
1143 
1144 	/* Disable memory decoding while we change the BAR addresses and size */
1145 	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1146 	pci_write_config_word(adev->pdev, PCI_COMMAND,
1147 			      cmd & ~PCI_COMMAND_MEMORY);
1148 
1149 	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
1150 	amdgpu_device_doorbell_fini(adev);
1151 	if (adev->asic_type >= CHIP_BONAIRE)
1152 		pci_release_resource(adev->pdev, 2);
1153 
1154 	pci_release_resource(adev->pdev, 0);
1155 
1156 	r = pci_resize_resource(adev->pdev, 0, rbar_size);
1157 	if (r == -ENOSPC)
1158 		DRM_INFO("Not enough PCI address space for a large BAR.");
1159 	else if (r && r != -ENOTSUPP)
1160 		DRM_ERROR("Problem resizing BAR0 (%d).", r);
1161 
1162 	pci_assign_unassigned_bus_resources(adev->pdev->bus);
1163 
1164 	/* When the doorbell or fb BAR isn't available we have no chance of
1165 	 * using the device.
1166 	 */
1167 	r = amdgpu_device_doorbell_init(adev);
1168 	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1169 		return -ENODEV;
1170 
1171 	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1172 
1173 	return 0;
1174 }
1175 
1176 /*
1177  * GPU helpers function.
1178  */
1179 /**
1180  * amdgpu_device_need_post - check if the hw need post or not
1181  *
1182  * @adev: amdgpu_device pointer
1183  *
 * Check if the asic has been initialized (all asics) at driver startup,
 * or if post is needed because a hw reset was performed.
 * Returns true if post is needed, false if not.
1187  */
1188 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1189 {
1190 	uint32_t reg;
1191 
1192 	if (amdgpu_sriov_vf(adev))
1193 		return false;
1194 
1195 	if (amdgpu_passthrough(adev)) {
		/* For FIJI: in the whole GPU pass-through virtualization case, after a VM
		 * reboot some old SMC firmware still needs the driver to do vPost, otherwise
		 * the GPU hangs. SMC firmware versions above 22.15 do not have this flaw, so
		 * we force vPost only for SMC versions below 22.15.
1200 		 */
1201 		if (adev->asic_type == CHIP_FIJI) {
1202 			int err;
1203 			uint32_t fw_ver;
1204 			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if an error occurred */
1206 			if (err)
1207 				return true;
1208 
1209 			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1210 			if (fw_ver < 0x00160e00)
1211 				return true;
1212 		}
1213 	}
1214 
1215 	/* Don't post if we need to reset whole hive on init */
1216 	if (adev->gmc.xgmi.pending_reset)
1217 		return false;
1218 
1219 	if (adev->has_hw_reset) {
1220 		adev->has_hw_reset = false;
1221 		return true;
1222 	}
1223 
1224 	/* bios scratch used on CIK+ */
1225 	if (adev->asic_type >= CHIP_BONAIRE)
1226 		return amdgpu_atombios_scratch_need_asic_init(adev);
1227 
1228 	/* check MEM_SIZE for older asics */
1229 	reg = amdgpu_asic_get_config_memsize(adev);
1230 
1231 	if ((reg != 0) && (reg != 0xffffffff))
1232 		return false;
1233 
1234 	return true;
1235 }
1236 
1237 /* if we get transitioned to only one device, take VGA back */
1238 /**
1239  * amdgpu_device_vga_set_decode - enable/disable vga decode
1240  *
1241  * @cookie: amdgpu_device pointer
1242  * @state: enable/disable vga decode
1243  *
1244  * Enable/disable vga decode (all asics).
1245  * Returns VGA resource flags.
1246  */
1247 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
1248 {
1249 	struct amdgpu_device *adev = cookie;
1250 	amdgpu_asic_set_vga_state(adev, state);
1251 	if (state)
1252 		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1253 		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1254 	else
1255 		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1256 }
1257 
1258 /**
1259  * amdgpu_device_check_block_size - validate the vm block size
1260  *
1261  * @adev: amdgpu_device pointer
1262  *
1263  * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in the page table versus the
 * page directory. A page is 4KB, so we have a 12 bit offset; a minimum of 9
 * bits is used for the page table and the remaining bits for the page directory.
1267  */
1268 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1269 {
1270 	/* defines number of bits in page table versus page directory,
1271 	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1272 	 * page table and the remaining bits are in the page directory */
1273 	if (amdgpu_vm_block_size == -1)
1274 		return;
1275 
1276 	if (amdgpu_vm_block_size < 9) {
1277 		dev_warn(adev->dev, "VM page table size (%d) too small\n",
1278 			 amdgpu_vm_block_size);
1279 		amdgpu_vm_block_size = -1;
1280 	}
1281 }
1282 
1283 /**
1284  * amdgpu_device_check_vm_size - validate the vm size
1285  *
1286  * @adev: amdgpu_device pointer
1287  *
1288  * Validates the vm size in GB specified via module parameter.
1289  * The VM size is the size of the GPU virtual memory space in GB.
1290  */
1291 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1292 {
1293 	/* no need to check the default value */
1294 	if (amdgpu_vm_size == -1)
1295 		return;
1296 
1297 	if (amdgpu_vm_size < 1) {
1298 		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1299 			 amdgpu_vm_size);
1300 		amdgpu_vm_size = -1;
1301 	}
1302 }
1303 
1304 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1305 {
1306 	struct sysinfo si;
1307 	bool is_os_64 = (sizeof(void *) == 8);
1308 	uint64_t total_memory;
1309 	uint64_t dram_size_seven_GB = 0x1B8000000;
1310 	uint64_t dram_size_three_GB = 0xB8000000;
1311 
1312 	if (amdgpu_smu_memory_pool_size == 0)
1313 		return;
1314 
1315 	if (!is_os_64) {
1316 		DRM_WARN("Not 64-bit OS, feature not supported\n");
1317 		goto def_value;
1318 	}
1319 	si_meminfo(&si);
1320 	total_memory = (uint64_t)si.totalram * si.mem_unit;
1321 
1322 	if ((amdgpu_smu_memory_pool_size == 1) ||
1323 		(amdgpu_smu_memory_pool_size == 2)) {
1324 		if (total_memory < dram_size_three_GB)
1325 			goto def_value1;
1326 	} else if ((amdgpu_smu_memory_pool_size == 4) ||
1327 		(amdgpu_smu_memory_pool_size == 8)) {
1328 		if (total_memory < dram_size_seven_GB)
1329 			goto def_value1;
1330 	} else {
1331 		DRM_WARN("Smu memory pool size not supported\n");
1332 		goto def_value;
1333 	}
1334 	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1335 
1336 	return;
1337 
1338 def_value1:
	DRM_WARN("Not enough system memory\n");
1340 def_value:
1341 	adev->pm.smu_prv_buffer_size = 0;
1342 }
1343 
1344 /**
1345  * amdgpu_device_check_arguments - validate module params
1346  *
1347  * @adev: amdgpu_device pointer
1348  *
1349  * Validates certain module parameters and updates
1350  * the associated values used by the driver (all asics).
1351  */
1352 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1353 {
1354 	if (amdgpu_sched_jobs < 4) {
1355 		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1356 			 amdgpu_sched_jobs);
1357 		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
1359 		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1360 			 amdgpu_sched_jobs);
1361 		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1362 	}
1363 
1364 	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1365 		/* gart size must be greater or equal to 32M */
1366 		dev_warn(adev->dev, "gart size (%d) too small\n",
1367 			 amdgpu_gart_size);
1368 		amdgpu_gart_size = -1;
1369 	}
1370 
1371 	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1372 		/* gtt size must be greater or equal to 32M */
1373 		dev_warn(adev->dev, "gtt size (%d) too small\n",
1374 				 amdgpu_gtt_size);
1375 		amdgpu_gtt_size = -1;
1376 	}
1377 
1378 	/* valid range is between 4 and 9 inclusive */
1379 	if (amdgpu_vm_fragment_size != -1 &&
1380 	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1381 		dev_warn(adev->dev, "valid range is between 4 and 9\n");
1382 		amdgpu_vm_fragment_size = -1;
1383 	}
1384 
1385 	if (amdgpu_sched_hw_submission < 2) {
1386 		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1387 			 amdgpu_sched_hw_submission);
1388 		amdgpu_sched_hw_submission = 2;
1389 	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1390 		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1391 			 amdgpu_sched_hw_submission);
1392 		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1393 	}
1394 
1395 	amdgpu_device_check_smu_prv_buffer_size(adev);
1396 
1397 	amdgpu_device_check_vm_size(adev);
1398 
1399 	amdgpu_device_check_block_size(adev);
1400 
1401 	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1402 
1403 	amdgpu_gmc_tmz_set(adev);
1404 
1405 	amdgpu_gmc_noretry_set(adev);
1406 
1407 	return 0;
1408 }
1409 
1410 /**
1411  * amdgpu_switcheroo_set_state - set switcheroo state
1412  *
1413  * @pdev: pci dev pointer
1414  * @state: vga_switcheroo state
1415  *
 * Callback for the switcheroo driver.  Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
1418  */
1419 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1420 					enum vga_switcheroo_state state)
1421 {
1422 	struct drm_device *dev = pci_get_drvdata(pdev);
1423 	int r;
1424 
1425 	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
1426 		return;
1427 
1428 	if (state == VGA_SWITCHEROO_ON) {
1429 		pr_info("switched on\n");
1430 		/* don't suspend or resume card normally */
1431 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1432 
1433 		pci_set_power_state(pdev, PCI_D0);
1434 		amdgpu_device_load_pci_state(pdev);
1435 		r = pci_enable_device(pdev);
1436 		if (r)
1437 			DRM_WARN("pci_enable_device failed (%d)\n", r);
1438 		amdgpu_device_resume(dev, true);
1439 
1440 		dev->switch_power_state = DRM_SWITCH_POWER_ON;
1441 	} else {
1442 		pr_info("switched off\n");
1443 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1444 		amdgpu_device_suspend(dev, true);
1445 		amdgpu_device_cache_pci_state(pdev);
1446 		/* Shut down the device */
1447 		pci_disable_device(pdev);
1448 		pci_set_power_state(pdev, PCI_D3cold);
1449 		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1450 	}
1451 }
1452 
1453 /**
1454  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1455  *
1456  * @pdev: pci dev pointer
1457  *
 * Callback for the switcheroo driver.  Check if the switcheroo
1459  * state can be changed.
1460  * Returns true if the state can be changed, false if not.
1461  */
1462 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1463 {
1464 	struct drm_device *dev = pci_get_drvdata(pdev);
1465 
1466 	/*
1467 	* FIXME: open_count is protected by drm_global_mutex but that would lead to
1468 	* locking inversion with the driver load path. And the access here is
1469 	* completely racy anyway. So don't bother with locking for now.
1470 	*/
1471 	return atomic_read(&dev->open_count) == 0;
1472 }
1473 
1474 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1475 	.set_gpu_state = amdgpu_switcheroo_set_state,
1476 	.reprobe = NULL,
1477 	.can_switch = amdgpu_switcheroo_can_switch,
1478 };
1479 
1480 /**
1481  * amdgpu_device_ip_set_clockgating_state - set the CG state
1482  *
1483  * @dev: amdgpu_device pointer
1484  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1485  * @state: clockgating state (gate or ungate)
1486  *
1487  * Sets the requested clockgating state for all instances of
1488  * the hardware IP specified.
1489  * Returns the error code from the last instance.
1490  */
1491 int amdgpu_device_ip_set_clockgating_state(void *dev,
1492 					   enum amd_ip_block_type block_type,
1493 					   enum amd_clockgating_state state)
1494 {
1495 	struct amdgpu_device *adev = dev;
1496 	int i, r = 0;
1497 
1498 	for (i = 0; i < adev->num_ip_blocks; i++) {
1499 		if (!adev->ip_blocks[i].status.valid)
1500 			continue;
1501 		if (adev->ip_blocks[i].version->type != block_type)
1502 			continue;
1503 		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1504 			continue;
1505 		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1506 			(void *)adev, state);
1507 		if (r)
1508 			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1509 				  adev->ip_blocks[i].version->funcs->name, r);
1510 	}
1511 	return r;
1512 }
1513 
1514 /**
1515  * amdgpu_device_ip_set_powergating_state - set the PG state
1516  *
1517  * @dev: amdgpu_device pointer
1518  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1519  * @state: powergating state (gate or ungate)
1520  *
1521  * Sets the requested powergating state for all instances of
1522  * the hardware IP specified.
1523  * Returns the error code from the last instance.
1524  */
1525 int amdgpu_device_ip_set_powergating_state(void *dev,
1526 					   enum amd_ip_block_type block_type,
1527 					   enum amd_powergating_state state)
1528 {
1529 	struct amdgpu_device *adev = dev;
1530 	int i, r = 0;
1531 
1532 	for (i = 0; i < adev->num_ip_blocks; i++) {
1533 		if (!adev->ip_blocks[i].status.valid)
1534 			continue;
1535 		if (adev->ip_blocks[i].version->type != block_type)
1536 			continue;
1537 		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1538 			continue;
1539 		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1540 			(void *)adev, state);
1541 		if (r)
1542 			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1543 				  adev->ip_blocks[i].version->funcs->name, r);
1544 	}
1545 	return r;
1546 }
1547 
1548 /**
1549  * amdgpu_device_ip_get_clockgating_state - get the CG state
1550  *
1551  * @adev: amdgpu_device pointer
1552  * @flags: clockgating feature flags
1553  *
1554  * Walks the list of IPs on the device and updates the clockgating
1555  * flags for each IP.
1556  * Updates @flags with the feature flags for each hardware IP where
1557  * clockgating is enabled.
1558  */
1559 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1560 					    u32 *flags)
1561 {
1562 	int i;
1563 
1564 	for (i = 0; i < adev->num_ip_blocks; i++) {
1565 		if (!adev->ip_blocks[i].status.valid)
1566 			continue;
1567 		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1568 			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1569 	}
1570 }
1571 
1572 /**
1573  * amdgpu_device_ip_wait_for_idle - wait for idle
1574  *
1575  * @adev: amdgpu_device pointer
1576  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1577  *
 * Waits for the requested hardware IP to be idle.
1579  * Returns 0 for success or a negative error code on failure.
1580  */
1581 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1582 				   enum amd_ip_block_type block_type)
1583 {
1584 	int i, r;
1585 
1586 	for (i = 0; i < adev->num_ip_blocks; i++) {
1587 		if (!adev->ip_blocks[i].status.valid)
1588 			continue;
1589 		if (adev->ip_blocks[i].version->type == block_type) {
1590 			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1591 			if (r)
1592 				return r;
1593 			break;
1594 		}
1595 	}
1596 	return 0;
1597 
1598 }
1599 
1600 /**
1601  * amdgpu_device_ip_is_idle - is the hardware IP idle
1602  *
1603  * @adev: amdgpu_device pointer
1604  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1605  *
1606  * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
1608  */
1609 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1610 			      enum amd_ip_block_type block_type)
1611 {
1612 	int i;
1613 
1614 	for (i = 0; i < adev->num_ip_blocks; i++) {
1615 		if (!adev->ip_blocks[i].status.valid)
1616 			continue;
1617 		if (adev->ip_blocks[i].version->type == block_type)
1618 			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1619 	}
1620 	return true;
1621 
1622 }
1623 
1624 /**
1625  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1626  *
1627  * @adev: amdgpu_device pointer
1628  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1629  *
1630  * Returns a pointer to the hardware IP block structure
1631  * if it exists for the asic, otherwise NULL.
1632  */
1633 struct amdgpu_ip_block *
1634 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1635 			      enum amd_ip_block_type type)
1636 {
1637 	int i;
1638 
1639 	for (i = 0; i < adev->num_ip_blocks; i++)
1640 		if (adev->ip_blocks[i].version->type == type)
1641 			return &adev->ip_blocks[i];
1642 
1643 	return NULL;
1644 }
1645 
1646 /**
1647  * amdgpu_device_ip_block_version_cmp
1648  *
1649  * @adev: amdgpu_device pointer
1650  * @type: enum amd_ip_block_type
1651  * @major: major version
1652  * @minor: minor version
1653  *
 * Returns 0 if the IP block version is equal to or greater than @major.@minor,
 * or 1 if it is smaller or the ip_block doesn't exist.
1656  */
1657 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1658 				       enum amd_ip_block_type type,
1659 				       u32 major, u32 minor)
1660 {
1661 	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1662 
1663 	if (ip_block && ((ip_block->version->major > major) ||
1664 			((ip_block->version->major == major) &&
1665 			(ip_block->version->minor >= minor))))
1666 		return 0;
1667 
1668 	return 1;
1669 }
1670 
1671 /**
1672  * amdgpu_device_ip_block_add
1673  *
1674  * @adev: amdgpu_device pointer
1675  * @ip_block_version: pointer to the IP to add
1676  *
1677  * Adds the IP block driver information to the collection of IPs
1678  * on the asic.
1679  */
1680 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1681 			       const struct amdgpu_ip_block_version *ip_block_version)
1682 {
1683 	if (!ip_block_version)
1684 		return -EINVAL;
1685 
1686 	switch (ip_block_version->type) {
1687 	case AMD_IP_BLOCK_TYPE_VCN:
1688 		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
1689 			return 0;
1690 		break;
1691 	case AMD_IP_BLOCK_TYPE_JPEG:
1692 		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
1693 			return 0;
1694 		break;
1695 	default:
1696 		break;
1697 	}
1698 
1699 	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1700 		  ip_block_version->funcs->name);
1701 
1702 	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1703 
1704 	return 0;
1705 }
1706 
1707 /**
1708  * amdgpu_device_enable_virtual_display - enable virtual display feature
1709  *
1710  * @adev: amdgpu_device pointer
1711  *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display.  This feature provides virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
1718  */
1719 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1720 {
1721 	adev->enable_virtual_display = false;
1722 
1723 	if (amdgpu_virtual_display) {
1724 		const char *pci_address_name = pci_name(adev->pdev);
1725 		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1726 
1727 		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1728 		pciaddstr_tmp = pciaddstr;
1729 		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1730 			pciaddname = strsep(&pciaddname_tmp, ",");
1731 			if (!strcmp("all", pciaddname)
1732 			    || !strcmp(pci_address_name, pciaddname)) {
1733 				long num_crtc;
1734 				int res = -1;
1735 
1736 				adev->enable_virtual_display = true;
1737 
1738 				if (pciaddname_tmp)
1739 					res = kstrtol(pciaddname_tmp, 10,
1740 						      &num_crtc);
1741 
1742 				if (!res) {
1743 					if (num_crtc < 1)
1744 						num_crtc = 1;
1745 					if (num_crtc > 6)
1746 						num_crtc = 6;
1747 					adev->mode_info.num_crtc = num_crtc;
1748 				} else {
1749 					adev->mode_info.num_crtc = 1;
1750 				}
1751 				break;
1752 			}
1753 		}
1754 
1755 		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1756 			 amdgpu_virtual_display, pci_address_name,
1757 			 adev->enable_virtual_display, adev->mode_info.num_crtc);
1758 
1759 		kfree(pciaddstr);
1760 	}
1761 }
1762 
1763 /**
1764  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1765  *
1766  * @adev: amdgpu_device pointer
1767  *
1768  * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
1770  * the asic.
1771  * Returns 0 on success, -EINVAL on failure.
1772  */
1773 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1774 {
1775 	const char *chip_name;
1776 	char fw_name[40];
1777 	int err;
1778 	const struct gpu_info_firmware_header_v1_0 *hdr;
1779 
1780 	adev->firmware.gpu_info_fw = NULL;
1781 
1782 	if (adev->mman.discovery_bin) {
1783 		amdgpu_discovery_get_gfx_info(adev);
1784 
1785 		/*
1786 		 * FIXME: The bounding box is still needed by Navi12, so
		 * temporarily read it from gpu_info firmware. Should be dropped
1788 		 * when DAL no longer needs it.
1789 		 */
1790 		if (adev->asic_type != CHIP_NAVI12)
1791 			return 0;
1792 	}
1793 
1794 	switch (adev->asic_type) {
1795 #ifdef CONFIG_DRM_AMDGPU_SI
1796 	case CHIP_VERDE:
1797 	case CHIP_TAHITI:
1798 	case CHIP_PITCAIRN:
1799 	case CHIP_OLAND:
1800 	case CHIP_HAINAN:
1801 #endif
1802 #ifdef CONFIG_DRM_AMDGPU_CIK
1803 	case CHIP_BONAIRE:
1804 	case CHIP_HAWAII:
1805 	case CHIP_KAVERI:
1806 	case CHIP_KABINI:
1807 	case CHIP_MULLINS:
1808 #endif
1809 	case CHIP_TOPAZ:
1810 	case CHIP_TONGA:
1811 	case CHIP_FIJI:
1812 	case CHIP_POLARIS10:
1813 	case CHIP_POLARIS11:
1814 	case CHIP_POLARIS12:
1815 	case CHIP_VEGAM:
1816 	case CHIP_CARRIZO:
1817 	case CHIP_STONEY:
1818 	case CHIP_VEGA20:
1819 	case CHIP_ALDEBARAN:
1820 	case CHIP_SIENNA_CICHLID:
1821 	case CHIP_NAVY_FLOUNDER:
1822 	case CHIP_DIMGREY_CAVEFISH:
1823 	default:
1824 		return 0;
1825 	case CHIP_VEGA10:
1826 		chip_name = "vega10";
1827 		break;
1828 	case CHIP_VEGA12:
1829 		chip_name = "vega12";
1830 		break;
1831 	case CHIP_RAVEN:
1832 		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1833 			chip_name = "raven2";
1834 		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1835 			chip_name = "picasso";
1836 		else
1837 			chip_name = "raven";
1838 		break;
1839 	case CHIP_ARCTURUS:
1840 		chip_name = "arcturus";
1841 		break;
1842 	case CHIP_RENOIR:
1843 		if (adev->apu_flags & AMD_APU_IS_RENOIR)
1844 			chip_name = "renoir";
1845 		else
1846 			chip_name = "green_sardine";
1847 		break;
1848 	case CHIP_NAVI10:
1849 		chip_name = "navi10";
1850 		break;
1851 	case CHIP_NAVI14:
1852 		chip_name = "navi14";
1853 		break;
1854 	case CHIP_NAVI12:
1855 		chip_name = "navi12";
1856 		break;
1857 	case CHIP_VANGOGH:
1858 		chip_name = "vangogh";
1859 		break;
1860 	}
1861 
1862 	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1863 	err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1864 	if (err) {
1865 		dev_err(adev->dev,
1866 			"Failed to load gpu_info firmware \"%s\"\n",
1867 			fw_name);
1868 		goto out;
1869 	}
1870 	err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1871 	if (err) {
1872 		dev_err(adev->dev,
1873 			"Failed to validate gpu_info firmware \"%s\"\n",
1874 			fw_name);
1875 		goto out;
1876 	}
1877 
1878 	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1879 	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1880 
1881 	switch (hdr->version_major) {
1882 	case 1:
1883 	{
1884 		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1885 			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1886 								le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1887 
1888 		/*
1889 		 * Should be dropped when DAL no longer needs it.
1890 		 */
1891 		if (adev->asic_type == CHIP_NAVI12)
1892 			goto parse_soc_bounding_box;
1893 
1894 		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1895 		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1896 		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1897 		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1898 		adev->gfx.config.max_texture_channel_caches =
1899 			le32_to_cpu(gpu_info_fw->gc_num_tccs);
1900 		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1901 		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1902 		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1903 		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1904 		adev->gfx.config.double_offchip_lds_buf =
1905 			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1906 		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1907 		adev->gfx.cu_info.max_waves_per_simd =
1908 			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1909 		adev->gfx.cu_info.max_scratch_slots_per_cu =
1910 			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1911 		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1912 		if (hdr->version_minor >= 1) {
1913 			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1914 				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1915 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1916 			adev->gfx.config.num_sc_per_sh =
1917 				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1918 			adev->gfx.config.num_packer_per_sc =
1919 				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1920 		}
1921 
1922 parse_soc_bounding_box:
1923 		/*
1924 		 * soc bounding box info is not integrated in the discovery table, so
1925 		 * we always need to parse it from the gpu info firmware when needed.
1926 		 */
1927 		if (hdr->version_minor == 2) {
1928 			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1929 				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1930 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1931 			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1932 		}
1933 		break;
1934 	}
1935 	default:
1936 		dev_err(adev->dev,
1937 			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1938 		err = -EINVAL;
1939 		goto out;
1940 	}
1941 out:
1942 	return err;
1943 }
1944 
1945 /**
1946  * amdgpu_device_ip_early_init - run early init for hardware IPs
1947  *
1948  * @adev: amdgpu_device pointer
1949  *
1950  * Early initialization pass for hardware IPs.  The hardware IPs that make
1951  * up each asic are discovered and each IP's early_init callback is run.  This
1952  * is the first stage in initializing the asic.
1953  * Returns 0 on success, negative error code on failure.
1954  */
1955 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
1956 {
1957 	int i, r;
1958 
1959 	amdgpu_device_enable_virtual_display(adev);
1960 
1961 	if (amdgpu_sriov_vf(adev)) {
1962 		r = amdgpu_virt_request_full_gpu(adev, true);
1963 		if (r)
1964 			return r;
1965 	}
1966 
1967 	switch (adev->asic_type) {
1968 #ifdef CONFIG_DRM_AMDGPU_SI
1969 	case CHIP_VERDE:
1970 	case CHIP_TAHITI:
1971 	case CHIP_PITCAIRN:
1972 	case CHIP_OLAND:
1973 	case CHIP_HAINAN:
1974 		adev->family = AMDGPU_FAMILY_SI;
1975 		r = si_set_ip_blocks(adev);
1976 		if (r)
1977 			return r;
1978 		break;
1979 #endif
1980 #ifdef CONFIG_DRM_AMDGPU_CIK
1981 	case CHIP_BONAIRE:
1982 	case CHIP_HAWAII:
1983 	case CHIP_KAVERI:
1984 	case CHIP_KABINI:
1985 	case CHIP_MULLINS:
1986 		if (adev->flags & AMD_IS_APU)
1987 			adev->family = AMDGPU_FAMILY_KV;
1988 		else
1989 			adev->family = AMDGPU_FAMILY_CI;
1990 
1991 		r = cik_set_ip_blocks(adev);
1992 		if (r)
1993 			return r;
1994 		break;
1995 #endif
1996 	case CHIP_TOPAZ:
1997 	case CHIP_TONGA:
1998 	case CHIP_FIJI:
1999 	case CHIP_POLARIS10:
2000 	case CHIP_POLARIS11:
2001 	case CHIP_POLARIS12:
2002 	case CHIP_VEGAM:
2003 	case CHIP_CARRIZO:
2004 	case CHIP_STONEY:
2005 		if (adev->flags & AMD_IS_APU)
2006 			adev->family = AMDGPU_FAMILY_CZ;
2007 		else
2008 			adev->family = AMDGPU_FAMILY_VI;
2009 
2010 		r = vi_set_ip_blocks(adev);
2011 		if (r)
2012 			return r;
2013 		break;
2014 	case CHIP_VEGA10:
2015 	case CHIP_VEGA12:
2016 	case CHIP_VEGA20:
2017 	case CHIP_RAVEN:
2018 	case CHIP_ARCTURUS:
2019 	case CHIP_RENOIR:
2020 	case CHIP_ALDEBARAN:
2021 		if (adev->flags & AMD_IS_APU)
2022 			adev->family = AMDGPU_FAMILY_RV;
2023 		else
2024 			adev->family = AMDGPU_FAMILY_AI;
2025 
2026 		r = soc15_set_ip_blocks(adev);
2027 		if (r)
2028 			return r;
2029 		break;
2030 	case  CHIP_NAVI10:
2031 	case  CHIP_NAVI14:
2032 	case  CHIP_NAVI12:
2033 	case  CHIP_SIENNA_CICHLID:
2034 	case  CHIP_NAVY_FLOUNDER:
2035 	case  CHIP_DIMGREY_CAVEFISH:
2036 	case CHIP_VANGOGH:
2037 		if (adev->asic_type == CHIP_VANGOGH)
2038 			adev->family = AMDGPU_FAMILY_VGH;
2039 		else
2040 			adev->family = AMDGPU_FAMILY_NV;
2041 
2042 		r = nv_set_ip_blocks(adev);
2043 		if (r)
2044 			return r;
2045 		break;
2046 	default:
2047 		/* FIXME: not supported yet */
2048 		return -EINVAL;
2049 	}
2050 
2051 	amdgpu_amdkfd_device_probe(adev);
2052 
2053 	adev->pm.pp_feature = amdgpu_pp_feature_mask;
2054 	if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2055 		adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2056 	if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2057 		adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2058 
2059 	for (i = 0; i < adev->num_ip_blocks; i++) {
2060 		if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2061 			DRM_ERROR("disabled ip block: %d <%s>\n",
2062 				  i, adev->ip_blocks[i].version->funcs->name);
2063 			adev->ip_blocks[i].status.valid = false;
2064 		} else {
2065 			if (adev->ip_blocks[i].version->funcs->early_init) {
2066 				r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2067 				if (r == -ENOENT) {
2068 					adev->ip_blocks[i].status.valid = false;
2069 				} else if (r) {
2070 					DRM_ERROR("early_init of IP block <%s> failed %d\n",
2071 						  adev->ip_blocks[i].version->funcs->name, r);
2072 					return r;
2073 				} else {
2074 					adev->ip_blocks[i].status.valid = true;
2075 				}
2076 			} else {
2077 				adev->ip_blocks[i].status.valid = true;
2078 			}
2079 		}
2080 		/* get the vbios after the asic_funcs are set up */
2081 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2082 			r = amdgpu_device_parse_gpu_info_fw(adev);
2083 			if (r)
2084 				return r;
2085 
2086 			/* Read BIOS */
2087 			if (!amdgpu_get_bios(adev))
2088 				return -EINVAL;
2089 
2090 			r = amdgpu_atombios_init(adev);
2091 			if (r) {
2092 				dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2093 				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2094 				return r;
2095 			}
2096 
2097 			/* get pf2vf msg info at its earliest time */
2098 			if (amdgpu_sriov_vf(adev))
2099 				amdgpu_virt_init_data_exchange(adev);
2100 
2101 		}
2102 	}
2103 
2104 	adev->cg_flags &= amdgpu_cg_mask;
2105 	adev->pg_flags &= amdgpu_pg_mask;
2106 
2107 	return 0;
2108 }
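
/*
 * Example (illustrative sketch, assuming the standard amdgpu.ip_block_mask
 * module parameter): amdgpu_ip_block_mask checked in early init above is a
 * bitmask in which bit i corresponds to IP block index i, so e.g.
 *
 *   amdgpu.ip_block_mask=0xffffffdf
 *
 * clears bit 5 and marks IP block index 5 as invalid while leaving the
 * other blocks enabled.
 */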
2109 
2110 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2111 {
2112 	int i, r;
2113 
2114 	for (i = 0; i < adev->num_ip_blocks; i++) {
2115 		if (!adev->ip_blocks[i].status.sw)
2116 			continue;
2117 		if (adev->ip_blocks[i].status.hw)
2118 			continue;
2119 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2120 		    (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2121 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2122 			r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2123 			if (r) {
2124 				DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2125 					  adev->ip_blocks[i].version->funcs->name, r);
2126 				return r;
2127 			}
2128 			adev->ip_blocks[i].status.hw = true;
2129 		}
2130 	}
2131 
2132 	return 0;
2133 }
2134 
2135 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2136 {
2137 	int i, r;
2138 
2139 	for (i = 0; i < adev->num_ip_blocks; i++) {
2140 		if (!adev->ip_blocks[i].status.sw)
2141 			continue;
2142 		if (adev->ip_blocks[i].status.hw)
2143 			continue;
2144 		r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2145 		if (r) {
2146 			DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2147 				  adev->ip_blocks[i].version->funcs->name, r);
2148 			return r;
2149 		}
2150 		adev->ip_blocks[i].status.hw = true;
2151 	}
2152 
2153 	return 0;
2154 }
2155 
2156 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2157 {
2158 	int r = 0;
2159 	int i;
2160 	uint32_t smu_version;
2161 
2162 	if (adev->asic_type >= CHIP_VEGA10) {
2163 		for (i = 0; i < adev->num_ip_blocks; i++) {
2164 			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2165 				continue;
2166 
2167 			if (!adev->ip_blocks[i].status.sw)
2168 				continue;
2169 
2170 			/* no need to do the fw loading again if already done */
2171 			if (adev->ip_blocks[i].status.hw)
2172 				break;
2173 
2174 			if (amdgpu_in_reset(adev) || adev->in_suspend) {
2175 				r = adev->ip_blocks[i].version->funcs->resume(adev);
2176 				if (r) {
2177 					DRM_ERROR("resume of IP block <%s> failed %d\n",
2178 							  adev->ip_blocks[i].version->funcs->name, r);
2179 					return r;
2180 				}
2181 			} else {
2182 				r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2183 				if (r) {
2184 					DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2185 							  adev->ip_blocks[i].version->funcs->name, r);
2186 					return r;
2187 				}
2188 			}
2189 
2190 			adev->ip_blocks[i].status.hw = true;
2191 			break;
2192 		}
2193 	}
2194 
2195 	if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2196 		r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2197 
2198 	return r;
2199 }
2200 
2201 /**
2202  * amdgpu_device_ip_init - run init for hardware IPs
2203  *
2204  * @adev: amdgpu_device pointer
2205  *
2206  * Main initialization pass for hardware IPs.  The list of all the hardware
2207  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2208  * are run.  sw_init initializes the software state associated with each IP
2209  * and hw_init initializes the hardware associated with each IP.
2210  * Returns 0 on success, negative error code on failure.
2211  */
2212 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2213 {
2214 	int i, r;
2215 
2216 	r = amdgpu_ras_init(adev);
2217 	if (r)
2218 		return r;
2219 
2220 	for (i = 0; i < adev->num_ip_blocks; i++) {
2221 		if (!adev->ip_blocks[i].status.valid)
2222 			continue;
2223 		r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2224 		if (r) {
2225 			DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2226 				  adev->ip_blocks[i].version->funcs->name, r);
2227 			goto init_failed;
2228 		}
2229 		adev->ip_blocks[i].status.sw = true;
2230 
2231 		/* need to do gmc hw init early so we can allocate gpu mem */
2232 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2233 			r = amdgpu_device_vram_scratch_init(adev);
2234 			if (r) {
2235 				DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2236 				goto init_failed;
2237 			}
2238 			r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2239 			if (r) {
2240 				DRM_ERROR("hw_init %d failed %d\n", i, r);
2241 				goto init_failed;
2242 			}
2243 			r = amdgpu_device_wb_init(adev);
2244 			if (r) {
2245 				DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2246 				goto init_failed;
2247 			}
2248 			adev->ip_blocks[i].status.hw = true;
2249 
2250 			/* right after GMC hw init, we create CSA */
2251 			if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2252 				r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2253 								AMDGPU_GEM_DOMAIN_VRAM,
2254 								AMDGPU_CSA_SIZE);
2255 				if (r) {
2256 					DRM_ERROR("allocate CSA failed %d\n", r);
2257 					goto init_failed;
2258 				}
2259 			}
2260 		}
2261 	}
2262 
2263 	if (amdgpu_sriov_vf(adev))
2264 		amdgpu_virt_init_data_exchange(adev);
2265 
2266 	r = amdgpu_ib_pool_init(adev);
2267 	if (r) {
2268 		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2269 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2270 		goto init_failed;
2271 	}
2272 
2273 	r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2274 	if (r)
2275 		goto init_failed;
2276 
2277 	r = amdgpu_device_ip_hw_init_phase1(adev);
2278 	if (r)
2279 		goto init_failed;
2280 
2281 	r = amdgpu_device_fw_loading(adev);
2282 	if (r)
2283 		goto init_failed;
2284 
2285 	r = amdgpu_device_ip_hw_init_phase2(adev);
2286 	if (r)
2287 		goto init_failed;
2288 
2289 	/*
2290 	 * Retired pages will be loaded from eeprom and reserved here.
2291 	 * This must be called after amdgpu_device_ip_hw_init_phase2, since
2292 	 * for some ASICs the RAS EEPROM code relies on the SMU being fully
2293 	 * functional for I2C communication, which is only true at this point.
2294 	 *
2295 	 * amdgpu_ras_recovery_init may fail, but the caller only cares about
2296 	 * failures caused by a bad gpu state and stops the amdgpu init
2297 	 * process accordingly.  For other failures it still releases all
2298 	 * the resources and prints an error message, rather than returning a
2299 	 * negative value to the upper level.
2300 	 *
2301 	 * Note: theoretically, this should be called before all vram
2302 	 * allocations to protect retired pages from being reused.
2303 	 */
2304 	r = amdgpu_ras_recovery_init(adev);
2305 	if (r)
2306 		goto init_failed;
2307 
2308 	if (adev->gmc.xgmi.num_physical_nodes > 1)
2309 		amdgpu_xgmi_add_device(adev);
2310 
2311 	/* Don't init kfd if whole hive need to be reset during init */
2312 	if (!adev->gmc.xgmi.pending_reset)
2313 		amdgpu_amdkfd_device_init(adev);
2314 
2315 	amdgpu_fru_get_product_info(adev);
2316 
2317 init_failed:
2318 	if (amdgpu_sriov_vf(adev))
2319 		amdgpu_virt_release_full_gpu(adev, true);
2320 
2321 	return r;
2322 }
2323 
2324 /**
2325  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2326  *
2327  * @adev: amdgpu_device pointer
2328  *
2329  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2330  * this function before a GPU reset.  If the value is retained after a
2331  * GPU reset, VRAM has not been lost.  Some GPU resets may destroy VRAM contents.
2332  */
2333 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2334 {
2335 	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2336 }
2337 
2338 /**
2339  * amdgpu_device_check_vram_lost - check if vram is valid
2340  *
2341  * @adev: amdgpu_device pointer
2342  *
2343  * Checks the reset magic value written to the gart pointer in VRAM.
2344  * The driver calls this after a GPU reset to see if the contents of
2345  * VRAM have been lost or not.
2346  * Returns true if vram is lost, false if not.
2347  */
2348 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2349 {
2350 	if (memcmp(adev->gart.ptr, adev->reset_magic,
2351 			AMDGPU_RESET_MAGIC_NUM))
2352 		return true;
2353 
2354 	if (!amdgpu_in_reset(adev))
2355 		return false;
2356 
2357 	/*
2358 	 * For all ASICs with baco/mode1 reset, the VRAM is
2359 	 * always assumed to be lost.
2360 	 */
2361 	switch (amdgpu_asic_reset_method(adev)) {
2362 	case AMD_RESET_METHOD_BACO:
2363 	case AMD_RESET_METHOD_MODE1:
2364 		return true;
2365 	default:
2366 		return false;
2367 	}
2368 }
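
/*
 * Illustrative pairing (sketch): a reset path typically records the magic
 * value before the reset and compares it afterwards, roughly:
 *
 *   amdgpu_device_fill_reset_magic(adev);
 *   ...perform the ASIC reset...
 *   vram_lost = amdgpu_device_check_vram_lost(adev);
 *   if (vram_lost)
 *           ...re-post the ASIC and restore VRAM contents...
 */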
2369 
2370 /**
2371  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2372  *
2373  * @adev: amdgpu_device pointer
2374  * @state: clockgating state (gate or ungate)
2375  *
2376  * The list of all the hardware IPs that make up the asic is walked and the
2377  * set_clockgating_state callbacks are run.
2378  * During late init this is used to enable clockgating; during fini or
2379  * suspend it is used to disable clockgating.
2380  * Returns 0 on success, negative error code on failure.
2381  */
2382 
2383 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2384 			       enum amd_clockgating_state state)
2385 {
2386 	int i, j, r;
2387 
2388 	if (amdgpu_emu_mode == 1)
2389 		return 0;
2390 
2391 	for (j = 0; j < adev->num_ip_blocks; j++) {
2392 		i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2393 		if (!adev->ip_blocks[i].status.late_initialized)
2394 			continue;
2395 		/* skip CG for GFX on S0ix */
2396 		if (adev->in_s0ix &&
2397 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
2398 			continue;
2399 		/* skip CG for VCE/UVD, it's handled specially */
2400 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2401 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2402 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2403 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2404 		    adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2405 			/* enable clockgating to save power */
2406 			r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2407 										     state);
2408 			if (r) {
2409 				DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2410 					  adev->ip_blocks[i].version->funcs->name, r);
2411 				return r;
2412 			}
2413 		}
2414 	}
2415 
2416 	return 0;
2417 }
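
/*
 * Note (illustrative): the index mapping in amdgpu_device_set_cg_state()
 * above (and amdgpu_device_set_pg_state() below) walks the IP list forwards
 * when gating and backwards when ungating.  With four IP blocks, for
 * example, AMD_CG_STATE_GATE visits blocks 0, 1, 2, 3 while
 * AMD_CG_STATE_UNGATE visits 3, 2, 1, 0, so gating is torn down in the
 * reverse of the order in which it was applied.
 */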
2418 
2419 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
2420 			       enum amd_powergating_state state)
2421 {
2422 	int i, j, r;
2423 
2424 	if (amdgpu_emu_mode == 1)
2425 		return 0;
2426 
2427 	for (j = 0; j < adev->num_ip_blocks; j++) {
2428 		i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2429 		if (!adev->ip_blocks[i].status.late_initialized)
2430 			continue;
2431 		/* skip PG for GFX on S0ix */
2432 		if (adev->in_s0ix &&
2433 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
2434 			continue;
2435 		/* skip PG for VCE/UVD, it's handled specially */
2436 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2437 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2438 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2439 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2440 		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
2441 			/* enable powergating to save power */
2442 			r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2443 											state);
2444 			if (r) {
2445 				DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2446 					  adev->ip_blocks[i].version->funcs->name, r);
2447 				return r;
2448 			}
2449 		}
2450 	}
2451 	return 0;
2452 }
2453 
2454 static int amdgpu_device_enable_mgpu_fan_boost(void)
2455 {
2456 	struct amdgpu_gpu_instance *gpu_ins;
2457 	struct amdgpu_device *adev;
2458 	int i, ret = 0;
2459 
2460 	mutex_lock(&mgpu_info.mutex);
2461 
2462 	/*
2463 	 * MGPU fan boost feature should be enabled
2464 	 * only when there are two or more dGPUs in
2465 	 * the system
2466 	 */
2467 	if (mgpu_info.num_dgpu < 2)
2468 		goto out;
2469 
2470 	for (i = 0; i < mgpu_info.num_dgpu; i++) {
2471 		gpu_ins = &(mgpu_info.gpu_ins[i]);
2472 		adev = gpu_ins->adev;
2473 		if (!(adev->flags & AMD_IS_APU) &&
2474 		    !gpu_ins->mgpu_fan_enabled) {
2475 			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2476 			if (ret)
2477 				break;
2478 
2479 			gpu_ins->mgpu_fan_enabled = 1;
2480 		}
2481 	}
2482 
2483 out:
2484 	mutex_unlock(&mgpu_info.mutex);
2485 
2486 	return ret;
2487 }
2488 
2489 /**
2490  * amdgpu_device_ip_late_init - run late init for hardware IPs
2491  *
2492  * @adev: amdgpu_device pointer
2493  *
2494  * Late initialization pass for hardware IPs.  The list of all the hardware
2495  * IPs that make up the asic is walked and the late_init callbacks are run.
2496  * late_init covers any special initialization that an IP requires
2497  * after all of the IPs have been initialized or something that needs to happen
2498  * late in the init process.
2499  * Returns 0 on success, negative error code on failure.
2500  */
2501 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2502 {
2503 	struct amdgpu_gpu_instance *gpu_instance;
2504 	int i = 0, r;
2505 
2506 	for (i = 0; i < adev->num_ip_blocks; i++) {
2507 		if (!adev->ip_blocks[i].status.hw)
2508 			continue;
2509 		if (adev->ip_blocks[i].version->funcs->late_init) {
2510 			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2511 			if (r) {
2512 				DRM_ERROR("late_init of IP block <%s> failed %d\n",
2513 					  adev->ip_blocks[i].version->funcs->name, r);
2514 				return r;
2515 			}
2516 		}
2517 		adev->ip_blocks[i].status.late_initialized = true;
2518 	}
2519 
2520 	amdgpu_ras_set_error_query_ready(adev, true);
2521 
2522 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2523 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2524 
2525 	amdgpu_device_fill_reset_magic(adev);
2526 
2527 	r = amdgpu_device_enable_mgpu_fan_boost();
2528 	if (r)
2529 		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2530 
2531 	/* For XGMI + passthrough configuration on arcturus, enable light SBR */
2532 	if (adev->asic_type == CHIP_ARCTURUS &&
2533 	    amdgpu_passthrough(adev) &&
2534 	    adev->gmc.xgmi.num_physical_nodes > 1)
2535 		smu_set_light_sbr(&adev->smu, true);
2536 
2537 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
2538 		mutex_lock(&mgpu_info.mutex);
2539 
2540 		/*
2541 		 * Reset device p-state to low as this was booted with high.
2542 		 *
2543 		 * This should be performed only after all devices from the same
2544 		 * hive get initialized.
2545 		 *
2546 		 * However, it's not known in advance how many devices are in the
2547 		 * hive, as they are counted one by one during device initialization.
2548 		 *
2549 		 * So, we wait for all XGMI interlinked devices to be initialized.
2550 		 * This may bring some delay as those devices may come from
2551 		 * different hives.  But that should be OK.
2552 		 */
2553 		if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2554 			for (i = 0; i < mgpu_info.num_gpu; i++) {
2555 				gpu_instance = &(mgpu_info.gpu_ins[i]);
2556 				if (gpu_instance->adev->flags & AMD_IS_APU)
2557 					continue;
2558 
2559 				r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2560 						AMDGPU_XGMI_PSTATE_MIN);
2561 				if (r) {
2562 					DRM_ERROR("pstate setting failed (%d).\n", r);
2563 					break;
2564 				}
2565 			}
2566 		}
2567 
2568 		mutex_unlock(&mgpu_info.mutex);
2569 	}
2570 
2571 	return 0;
2572 }
2573 
2574 /**
2575  * amdgpu_device_ip_fini - run fini for hardware IPs
2576  *
2577  * @adev: amdgpu_device pointer
2578  *
2579  * Main teardown pass for hardware IPs.  The list of all the hardware
2580  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2581  * are run.  hw_fini tears down the hardware associated with each IP
2582  * and sw_fini tears down any software state associated with each IP.
2583  * Returns 0 on success, negative error code on failure.
2584  */
2585 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2586 {
2587 	int i, r;
2588 
2589 	if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2590 		amdgpu_virt_release_ras_err_handler_data(adev);
2591 
2592 	amdgpu_ras_pre_fini(adev);
2593 
2594 	if (adev->gmc.xgmi.num_physical_nodes > 1)
2595 		amdgpu_xgmi_remove_device(adev);
2596 
2597 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2598 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2599 
2600 	amdgpu_amdkfd_device_fini(adev);
2601 
2602 	/* need to disable SMC first */
2603 	for (i = 0; i < adev->num_ip_blocks; i++) {
2604 		if (!adev->ip_blocks[i].status.hw)
2605 			continue;
2606 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2607 			r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2608 			/* XXX handle errors */
2609 			if (r) {
2610 				DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2611 					  adev->ip_blocks[i].version->funcs->name, r);
2612 			}
2613 			adev->ip_blocks[i].status.hw = false;
2614 			break;
2615 		}
2616 	}
2617 
2618 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2619 		if (!adev->ip_blocks[i].status.hw)
2620 			continue;
2621 
2622 		r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2623 		/* XXX handle errors */
2624 		if (r) {
2625 			DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2626 				  adev->ip_blocks[i].version->funcs->name, r);
2627 		}
2628 
2629 		adev->ip_blocks[i].status.hw = false;
2630 	}
2631 
2632 
2633 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2634 		if (!adev->ip_blocks[i].status.sw)
2635 			continue;
2636 
2637 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2638 			amdgpu_ucode_free_bo(adev);
2639 			amdgpu_free_static_csa(&adev->virt.csa_obj);
2640 			amdgpu_device_wb_fini(adev);
2641 			amdgpu_device_vram_scratch_fini(adev);
2642 			amdgpu_ib_pool_fini(adev);
2643 		}
2644 
2645 		r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2646 		/* XXX handle errors */
2647 		if (r) {
2648 			DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2649 				  adev->ip_blocks[i].version->funcs->name, r);
2650 		}
2651 		adev->ip_blocks[i].status.sw = false;
2652 		adev->ip_blocks[i].status.valid = false;
2653 	}
2654 
2655 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2656 		if (!adev->ip_blocks[i].status.late_initialized)
2657 			continue;
2658 		if (adev->ip_blocks[i].version->funcs->late_fini)
2659 			adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2660 		adev->ip_blocks[i].status.late_initialized = false;
2661 	}
2662 
2663 	amdgpu_ras_fini(adev);
2664 
2665 	if (amdgpu_sriov_vf(adev))
2666 		if (amdgpu_virt_release_full_gpu(adev, false))
2667 			DRM_ERROR("failed to release exclusive mode on fini\n");
2668 
2669 	return 0;
2670 }
2671 
2672 /**
2673  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2674  *
2675  * @work: work_struct.
2676  */
2677 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2678 {
2679 	struct amdgpu_device *adev =
2680 		container_of(work, struct amdgpu_device, delayed_init_work.work);
2681 	int r;
2682 
2683 	r = amdgpu_ib_ring_tests(adev);
2684 	if (r)
2685 		DRM_ERROR("ib ring test failed (%d).\n", r);
2686 }
2687 
2688 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2689 {
2690 	struct amdgpu_device *adev =
2691 		container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2692 
2693 	mutex_lock(&adev->gfx.gfx_off_mutex);
2694 	if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
2695 		if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2696 			adev->gfx.gfx_off_state = true;
2697 	}
2698 	mutex_unlock(&adev->gfx.gfx_off_mutex);
2699 }
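
/*
 * Note (illustrative, assuming the amdgpu_gfx_off_ctrl() helper elsewhere
 * in this driver): GFXOFF is reference counted.  Callers that need the GFX
 * engine powered call amdgpu_gfx_off_ctrl(adev, false) to bump
 * gfx_off_req_count and amdgpu_gfx_off_ctrl(adev, true) to drop it; only
 * when the count returns to zero is this delayed work queued so the SMU
 * can power the GFX block down again.
 */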
2700 
2701 /**
2702  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2703  *
2704  * @adev: amdgpu_device pointer
2705  *
2706  * Main suspend function for hardware IPs.  The list of all the hardware
2707  * IPs that make up the asic is walked, clockgating is disabled and the
2708  * suspend callbacks are run.  suspend puts the hardware and software state
2709  * in each IP into a state suitable for suspend.
2710  * Returns 0 on success, negative error code on failure.
2711  */
2712 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2713 {
2714 	int i, r;
2715 
2716 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2717 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2718 
2719 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2720 		if (!adev->ip_blocks[i].status.valid)
2721 			continue;
2722 
2723 		/* displays are handled separately */
2724 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2725 			continue;
2726 
2727 		/* XXX handle errors */
2728 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
2729 		/* XXX handle errors */
2730 		if (r) {
2731 			DRM_ERROR("suspend of IP block <%s> failed %d\n",
2732 				  adev->ip_blocks[i].version->funcs->name, r);
2733 			return r;
2734 		}
2735 
2736 		adev->ip_blocks[i].status.hw = false;
2737 	}
2738 
2739 	return 0;
2740 }
2741 
2742 /**
2743  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2744  *
2745  * @adev: amdgpu_device pointer
2746  *
2747  * Main suspend function for hardware IPs.  The list of all the hardware
2748  * IPs that make up the asic is walked, clockgating is disabled and the
2749  * suspend callbacks are run.  suspend puts the hardware and software state
2750  * in each IP into a state suitable for suspend.
2751  * Returns 0 on success, negative error code on failure.
2752  */
2753 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2754 {
2755 	int i, r;
2756 
2757 	if (adev->in_s0ix)
2758 		amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry);
2759 
2760 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2761 		if (!adev->ip_blocks[i].status.valid)
2762 			continue;
2763 		/* displays are handled in phase1 */
2764 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2765 			continue;
2766 		/* PSP lost connection when err_event_athub occurs */
2767 		if (amdgpu_ras_intr_triggered() &&
2768 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2769 			adev->ip_blocks[i].status.hw = false;
2770 			continue;
2771 		}
2772 
2773 		/* skip unnecessary suspend if we have not initialized them yet */
2774 		if (adev->gmc.xgmi.pending_reset &&
2775 		    !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2776 		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
2777 		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2778 		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
2779 			adev->ip_blocks[i].status.hw = false;
2780 			continue;
2781 		}
2782 
2783 		/* skip suspend of gfx and psp for S0ix
2784 		 * gfx is in gfxoff state, so on resume it will exit gfxoff just
2785 		 * like at runtime. PSP is also part of the always on hardware
2786 		 * so no need to suspend it.
2787 		 */
2788 		if (adev->in_s0ix &&
2789 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
2790 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX))
2791 			continue;
2792 
2793 		/* XXX handle errors */
2794 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
2795 		/* XXX handle errors */
2796 		if (r) {
2797 			DRM_ERROR("suspend of IP block <%s> failed %d\n",
2798 				  adev->ip_blocks[i].version->funcs->name, r);
2799 		}
2800 		adev->ip_blocks[i].status.hw = false;
2801 		/* handle putting the SMC in the appropriate state */
2802 		if (!amdgpu_sriov_vf(adev)) {
2803 			if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2804 				r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2805 				if (r) {
2806 					DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2807 							adev->mp1_state, r);
2808 					return r;
2809 				}
2810 			}
2811 		}
2812 	}
2813 
2814 	return 0;
2815 }
2816 
2817 /**
2818  * amdgpu_device_ip_suspend - run suspend for hardware IPs
2819  *
2820  * @adev: amdgpu_device pointer
2821  *
2822  * Main suspend function for hardware IPs.  The list of all the hardware
2823  * IPs that make up the asic is walked, clockgating is disabled and the
2824  * suspend callbacks are run.  suspend puts the hardware and software state
2825  * in each IP into a state suitable for suspend.
2826  * Returns 0 on success, negative error code on failure.
2827  */
2828 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2829 {
2830 	int r;
2831 
2832 	if (amdgpu_sriov_vf(adev)) {
2833 		amdgpu_virt_fini_data_exchange(adev);
2834 		amdgpu_virt_request_full_gpu(adev, false);
2835 	}
2836 
2837 	r = amdgpu_device_ip_suspend_phase1(adev);
2838 	if (r)
2839 		return r;
2840 	r = amdgpu_device_ip_suspend_phase2(adev);
2841 
2842 	if (amdgpu_sriov_vf(adev))
2843 		amdgpu_virt_release_full_gpu(adev, false);
2844 
2845 	return r;
2846 }
2847 
2848 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
2849 {
2850 	int i, r;
2851 
2852 	static enum amd_ip_block_type ip_order[] = {
2853 		AMD_IP_BLOCK_TYPE_GMC,
2854 		AMD_IP_BLOCK_TYPE_COMMON,
2855 		AMD_IP_BLOCK_TYPE_PSP,
2856 		AMD_IP_BLOCK_TYPE_IH,
2857 	};
2858 
2859 	for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2860 		int j;
2861 		struct amdgpu_ip_block *block;
2862 
2863 		block = &adev->ip_blocks[i];
2864 		block->status.hw = false;
2865 
2866 		for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2867 
2868 			if (block->version->type != ip_order[j] ||
2869 				!block->status.valid)
2870 				continue;
2871 
2872 			r = block->version->funcs->hw_init(adev);
2873 			DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r ? "failed" : "succeeded");
2874 			if (r)
2875 				return r;
2876 			block->status.hw = true;
2877 		}
2878 	}
2879 
2880 	return 0;
2881 }
2882 
2883 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
2884 {
2885 	int i, r;
2886 
2887 	static enum amd_ip_block_type ip_order[] = {
2888 		AMD_IP_BLOCK_TYPE_SMC,
2889 		AMD_IP_BLOCK_TYPE_DCE,
2890 		AMD_IP_BLOCK_TYPE_GFX,
2891 		AMD_IP_BLOCK_TYPE_SDMA,
2892 		AMD_IP_BLOCK_TYPE_UVD,
2893 		AMD_IP_BLOCK_TYPE_VCE,
2894 		AMD_IP_BLOCK_TYPE_VCN
2895 	};
2896 
2897 	for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2898 		int j;
2899 		struct amdgpu_ip_block *block;
2900 
2901 		for (j = 0; j < adev->num_ip_blocks; j++) {
2902 			block = &adev->ip_blocks[j];
2903 
2904 			if (block->version->type != ip_order[i] ||
2905 				!block->status.valid ||
2906 				block->status.hw)
2907 				continue;
2908 
2909 			if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2910 				r = block->version->funcs->resume(adev);
2911 			else
2912 				r = block->version->funcs->hw_init(adev);
2913 
2914 			DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r ? "failed" : "succeeded");
2915 			if (r)
2916 				return r;
2917 			block->status.hw = true;
2918 		}
2919 	}
2920 
2921 	return 0;
2922 }
2923 
2924 /**
2925  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2926  *
2927  * @adev: amdgpu_device pointer
2928  *
2929  * First resume function for hardware IPs.  The list of all the hardware
2930  * IPs that make up the asic is walked and the resume callbacks are run for
2931  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
2932  * after a suspend and updates the software state as necessary.  This
2933  * function is also used for restoring the GPU after a GPU reset.
2934  * Returns 0 on success, negative error code on failure.
2935  */
2936 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2937 {
2938 	int i, r;
2939 
2940 	for (i = 0; i < adev->num_ip_blocks; i++) {
2941 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2942 			continue;
2943 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2944 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2945 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2946 
2947 			r = adev->ip_blocks[i].version->funcs->resume(adev);
2948 			if (r) {
2949 				DRM_ERROR("resume of IP block <%s> failed %d\n",
2950 					  adev->ip_blocks[i].version->funcs->name, r);
2951 				return r;
2952 			}
2953 			adev->ip_blocks[i].status.hw = true;
2954 		}
2955 	}
2956 
2957 	return 0;
2958 }
2959 
2960 /**
2961  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2962  *
2963  * @adev: amdgpu_device pointer
2964  *
2965  * Second resume function for hardware IPs.  The list of all the hardware
2966  * IPs that make up the asic is walked and the resume callbacks are run for
2967  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
2968  * functional state after a suspend and updates the software state as
2969  * necessary.  This function is also used for restoring the GPU after a GPU
2970  * reset.
2971  * Returns 0 on success, negative error code on failure.
2972  */
2973 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2974 {
2975 	int i, r;
2976 
2977 	for (i = 0; i < adev->num_ip_blocks; i++) {
2978 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2979 			continue;
2980 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2981 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2982 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2983 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2984 			continue;
2985 		r = adev->ip_blocks[i].version->funcs->resume(adev);
2986 		if (r) {
2987 			DRM_ERROR("resume of IP block <%s> failed %d\n",
2988 				  adev->ip_blocks[i].version->funcs->name, r);
2989 			return r;
2990 		}
2991 		adev->ip_blocks[i].status.hw = true;
2992 	}
2993 
2994 	return 0;
2995 }
2996 
2997 /**
2998  * amdgpu_device_ip_resume - run resume for hardware IPs
2999  *
3000  * @adev: amdgpu_device pointer
3001  *
3002  * Main resume function for hardware IPs.  The hardware IPs
3003  * are split into two resume functions because they are
3004  * also used in recovering from a GPU reset, where some additional
3005  * steps need to be taken between them.  In this case (S3/S4) they are
3006  * run sequentially.
3007  * Returns 0 on success, negative error code on failure.
3008  */
3009 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3010 {
3011 	int r;
3012 
3013 	r = amdgpu_device_ip_resume_phase1(adev);
3014 	if (r)
3015 		return r;
3016 
3017 	r = amdgpu_device_fw_loading(adev);
3018 	if (r)
3019 		return r;
3020 
3021 	r = amdgpu_device_ip_resume_phase2(adev);
3022 
3023 	return r;
3024 }
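
/*
 * Note (illustrative): firmware loading sits between the two resume phases
 * because phase 1 brings up COMMON, GMC and IH, which the PSP/SMU firmware
 * loading path depends on, while the remaining blocks resumed in phase 2
 * expect their microcode to already be loaded.
 */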
3025 
3026 /**
3027  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3028  *
3029  * @adev: amdgpu_device pointer
3030  *
3031  * Query the VBIOS data tables to determine if the board supports SR-IOV.
3032  */
3033 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3034 {
3035 	if (amdgpu_sriov_vf(adev)) {
3036 		if (adev->is_atom_fw) {
3037 			if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
3038 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3039 		} else {
3040 			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3041 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3042 		}
3043 
3044 		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3045 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3046 	}
3047 }
3048 
3049 /**
3050  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3051  *
3052  * @asic_type: AMD asic type
3053  *
3054  * Check if there is DC (new modesetting infrastructure) support for an asic.
3055  * returns true if DC has support, false if not.
3056  */
3057 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3058 {
3059 	switch (asic_type) {
3060 #if defined(CONFIG_DRM_AMD_DC)
3061 #if defined(CONFIG_DRM_AMD_DC_SI)
3062 	case CHIP_TAHITI:
3063 	case CHIP_PITCAIRN:
3064 	case CHIP_VERDE:
3065 	case CHIP_OLAND:
3066 #endif
3067 	case CHIP_BONAIRE:
3068 	case CHIP_KAVERI:
3069 	case CHIP_KABINI:
3070 	case CHIP_MULLINS:
3071 		/*
3072 		 * We have systems in the wild with these ASICs that require
3073 		 * LVDS and VGA support which is not supported with DC.
3074 		 *
3075 		 * Fall back to the non-DC driver here by default so as not to
3076 		 * cause regressions.
3077 		 */
3078 		return amdgpu_dc > 0;
3079 	case CHIP_HAWAII:
3080 	case CHIP_CARRIZO:
3081 	case CHIP_STONEY:
3082 	case CHIP_POLARIS10:
3083 	case CHIP_POLARIS11:
3084 	case CHIP_POLARIS12:
3085 	case CHIP_VEGAM:
3086 	case CHIP_TONGA:
3087 	case CHIP_FIJI:
3088 	case CHIP_VEGA10:
3089 	case CHIP_VEGA12:
3090 	case CHIP_VEGA20:
3091 #if defined(CONFIG_DRM_AMD_DC_DCN)
3092 	case CHIP_RAVEN:
3093 	case CHIP_NAVI10:
3094 	case CHIP_NAVI14:
3095 	case CHIP_NAVI12:
3096 	case CHIP_RENOIR:
3097 	case CHIP_SIENNA_CICHLID:
3098 	case CHIP_NAVY_FLOUNDER:
3099 	case CHIP_DIMGREY_CAVEFISH:
3100 	case CHIP_VANGOGH:
3101 #endif
3102 		return amdgpu_dc != 0;
3103 #endif
3104 	default:
3105 		if (amdgpu_dc > 0)
3106 			DRM_INFO_ONCE("Display Core has been requested via kernel parameter "
3107 					 "but isn't supported by ASIC, ignoring\n");
3108 		return false;
3109 	}
3110 }
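
/*
 * Note (illustrative, assuming the standard amdgpu.dc module parameter
 * semantics): amdgpu_dc defaults to -1 (auto).  Booting with amdgpu.dc=1
 * opts the legacy LVDS/VGA ASICs above into DC, amdgpu.dc=0 disables DC
 * even where it is supported, and the default leaves the per-ASIC choice
 * in the switch statement above in effect.
 */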
3111 
3112 /**
3113  * amdgpu_device_has_dc_support - check if dc is supported
3114  *
3115  * @adev: amdgpu_device pointer
3116  *
3117  * Returns true for supported, false for not supported
3118  */
3119 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3120 {
3121 	if (amdgpu_sriov_vf(adev) ||
3122 	    adev->enable_virtual_display ||
3123 	    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3124 		return false;
3125 
3126 	return amdgpu_device_asic_has_dc_support(adev->asic_type);
3127 }
3128 
3129 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3130 {
3131 	struct amdgpu_device *adev =
3132 		container_of(__work, struct amdgpu_device, xgmi_reset_work);
3133 	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3134 
3135 	/* It's a bug to not have a hive within this function */
3136 	if (WARN_ON(!hive))
3137 		return;
3138 
3139 	/*
3140 	 * Use task barrier to synchronize all xgmi reset works across the
3141 	 * hive. task_barrier_enter and task_barrier_exit will block
3142 	 * until all the threads running the xgmi reset works reach
3143 	 * those points. task_barrier_full will do both blocks.
3144 	 */
3145 	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3146 
3147 		task_barrier_enter(&hive->tb);
3148 		adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3149 
3150 		if (adev->asic_reset_res)
3151 			goto fail;
3152 
3153 		task_barrier_exit(&hive->tb);
3154 		adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3155 
3156 		if (adev->asic_reset_res)
3157 			goto fail;
3158 
3159 		if (adev->mmhub.ras_funcs &&
3160 		    adev->mmhub.ras_funcs->reset_ras_error_count)
3161 			adev->mmhub.ras_funcs->reset_ras_error_count(adev);
3162 	} else {
3163 
3164 		task_barrier_full(&hive->tb);
3165 		adev->asic_reset_res =  amdgpu_asic_reset(adev);
3166 	}
3167 
3168 fail:
3169 	if (adev->asic_reset_res)
3170 		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3171 			 adev->asic_reset_res, adev_to_drm(adev)->unique);
3172 	amdgpu_put_xgmi_hive(hive);
3173 }
3174 
3175 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3176 {
3177 	char *input = amdgpu_lockup_timeout;
3178 	char *timeout_setting = NULL;
3179 	int index = 0;
3180 	long timeout;
3181 	int ret = 0;
3182 
3183 	/*
3184 	 * By default, the timeout for non-compute jobs is 10000 ms,
3185 	 * and there is no timeout enforced on compute jobs.
3186 	 * In SR-IOV or passthrough mode, the timeout for compute
3187 	 * jobs is 60000 ms by default.
3188 	 */
3189 	adev->gfx_timeout = msecs_to_jiffies(10000);
3190 	adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3191 	if (amdgpu_sriov_vf(adev))
3192 		adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3193 					msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3194 	else if (amdgpu_passthrough(adev))
3195 		adev->compute_timeout =  msecs_to_jiffies(60000);
3196 	else
3197 		adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
3198 
3199 	if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3200 		while ((timeout_setting = strsep(&input, ",")) &&
3201 				strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3202 			ret = kstrtol(timeout_setting, 0, &timeout);
3203 			if (ret)
3204 				return ret;
3205 
3206 			if (timeout == 0) {
3207 				index++;
3208 				continue;
3209 			} else if (timeout < 0) {
3210 				timeout = MAX_SCHEDULE_TIMEOUT;
3211 			} else {
3212 				timeout = msecs_to_jiffies(timeout);
3213 			}
3214 
3215 			switch (index++) {
3216 			case 0:
3217 				adev->gfx_timeout = timeout;
3218 				break;
3219 			case 1:
3220 				adev->compute_timeout = timeout;
3221 				break;
3222 			case 2:
3223 				adev->sdma_timeout = timeout;
3224 				break;
3225 			case 3:
3226 				adev->video_timeout = timeout;
3227 				break;
3228 			default:
3229 				break;
3230 			}
3231 		}
3232 		/*
3233 		 * There is only one value specified and
3234 		 * it should apply to all non-compute jobs.
3235 		 */
3236 		if (index == 1) {
3237 			adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3238 			if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3239 				adev->compute_timeout = adev->gfx_timeout;
3240 		}
3241 	}
3242 
3243 	return ret;
3244 }
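
/*
 * Example (illustrative sketch of the format parsed above): the
 * amdgpu.lockup_timeout module parameter is a comma separated list of up
 * to four values in milliseconds, applied in the order gfx, compute,
 * sdma, video, e.g.:
 *
 *   amdgpu.lockup_timeout=10000,60000,10000,10000
 *
 * A value of 0 keeps the default for that slot and a negative value
 * disables the timeout entirely.  If only one value is given, it applies
 * to all non-compute jobs (and to compute jobs as well under SR-IOV or
 * passthrough).
 */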
3245 
3246 static const struct attribute *amdgpu_dev_attributes[] = {
3247 	&dev_attr_product_name.attr,
3248 	&dev_attr_product_number.attr,
3249 	&dev_attr_serial_number.attr,
3250 	&dev_attr_pcie_replay_count.attr,
3251 	NULL
3252 };
3253 
3254 
3255 /**
3256  * amdgpu_device_init - initialize the driver
3257  *
3258  * @adev: amdgpu_device pointer
3259  * @flags: driver flags
3260  *
3261  * Initializes the driver info and hw (all asics).
3262  * Returns 0 for success or an error on failure.
3263  * Called at driver startup.
3264  */
3265 int amdgpu_device_init(struct amdgpu_device *adev,
3266 		       uint32_t flags)
3267 {
3268 	struct drm_device *ddev = adev_to_drm(adev);
3269 	struct pci_dev *pdev = adev->pdev;
3270 	int r, i;
3271 	bool px = false;
3272 	u32 max_MBps;
3273 
3274 	adev->shutdown = false;
3275 	adev->flags = flags;
3276 
3277 	if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3278 		adev->asic_type = amdgpu_force_asic_type;
3279 	else
3280 		adev->asic_type = flags & AMD_ASIC_MASK;
3281 
3282 	adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3283 	if (amdgpu_emu_mode == 1)
3284 		adev->usec_timeout *= 10;
3285 	adev->gmc.gart_size = 512 * 1024 * 1024;
3286 	adev->accel_working = false;
3287 	adev->num_rings = 0;
3288 	adev->mman.buffer_funcs = NULL;
3289 	adev->mman.buffer_funcs_ring = NULL;
3290 	adev->vm_manager.vm_pte_funcs = NULL;
3291 	adev->vm_manager.vm_pte_num_scheds = 0;
3292 	adev->gmc.gmc_funcs = NULL;
3293 	adev->harvest_ip_mask = 0x0;
3294 	adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3295 	bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3296 
3297 	adev->smc_rreg = &amdgpu_invalid_rreg;
3298 	adev->smc_wreg = &amdgpu_invalid_wreg;
3299 	adev->pcie_rreg = &amdgpu_invalid_rreg;
3300 	adev->pcie_wreg = &amdgpu_invalid_wreg;
3301 	adev->pciep_rreg = &amdgpu_invalid_rreg;
3302 	adev->pciep_wreg = &amdgpu_invalid_wreg;
3303 	adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3304 	adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3305 	adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3306 	adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3307 	adev->didt_rreg = &amdgpu_invalid_rreg;
3308 	adev->didt_wreg = &amdgpu_invalid_wreg;
3309 	adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3310 	adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3311 	adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3312 	adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3313 
3314 	DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3315 		 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3316 		 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3317 
3318 	/* mutex initializations are all done here so we
3319 	 * can recall functions without having locking issues */
3320 	mutex_init(&adev->firmware.mutex);
3321 	mutex_init(&adev->pm.mutex);
3322 	mutex_init(&adev->gfx.gpu_clock_mutex);
3323 	mutex_init(&adev->srbm_mutex);
3324 	mutex_init(&adev->gfx.pipe_reserve_mutex);
3325 	mutex_init(&adev->gfx.gfx_off_mutex);
3326 	mutex_init(&adev->grbm_idx_mutex);
3327 	mutex_init(&adev->mn_lock);
3328 	mutex_init(&adev->virt.vf_errors.lock);
3329 	hash_init(adev->mn_hash);
3330 	atomic_set(&adev->in_gpu_reset, 0);
3331 	init_rwsem(&adev->reset_sem);
3332 	mutex_init(&adev->psp.mutex);
3333 	mutex_init(&adev->notifier_lock);
3334 
3335 	r = amdgpu_device_check_arguments(adev);
3336 	if (r)
3337 		return r;
3338 
3339 	spin_lock_init(&adev->mmio_idx_lock);
3340 	spin_lock_init(&adev->smc_idx_lock);
3341 	spin_lock_init(&adev->pcie_idx_lock);
3342 	spin_lock_init(&adev->uvd_ctx_idx_lock);
3343 	spin_lock_init(&adev->didt_idx_lock);
3344 	spin_lock_init(&adev->gc_cac_idx_lock);
3345 	spin_lock_init(&adev->se_cac_idx_lock);
3346 	spin_lock_init(&adev->audio_endpt_idx_lock);
3347 	spin_lock_init(&adev->mm_stats.lock);
3348 
3349 	INIT_LIST_HEAD(&adev->shadow_list);
3350 	mutex_init(&adev->shadow_list_lock);
3351 
3352 	INIT_LIST_HEAD(&adev->reset_list);
3353 
3354 	INIT_DELAYED_WORK(&adev->delayed_init_work,
3355 			  amdgpu_device_delayed_init_work_handler);
3356 	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3357 			  amdgpu_device_delay_enable_gfx_off);
3358 
3359 	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3360 
3361 	adev->gfx.gfx_off_req_count = 1;
3362 	adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3363 
3364 	atomic_set(&adev->throttling_logging_enabled, 1);
3365 	/*
3366 	 * If throttling continues, logging will be performed every minute
3367 	 * to avoid log flooding. "-1" is subtracted since the thermal
3368 	 * throttling interrupt comes every second. Thus, the total logging
3369 	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3370 	 * for throttling interrupt) = 60 seconds.
3371 	 */
3372 	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3373 	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3374 
3375 	/* Registers mapping */
3376 	/* TODO: block userspace mapping of io register */
3377 	if (adev->asic_type >= CHIP_BONAIRE) {
3378 		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3379 		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3380 	} else {
3381 		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3382 		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3383 	}
3384 
3385 	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3386 	if (!adev->rmmio)
3387 		return -ENOMEM;
3388 
3389 	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3390 	DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3391 
3392 	/* enable PCIE atomic ops */
3393 	r = pci_enable_atomic_ops_to_root(adev->pdev,
3394 					  PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3395 					  PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3396 	if (r) {
3397 		adev->have_atomics_support = false;
3398 		DRM_INFO("PCIE atomic ops are not supported\n");
3399 	} else {
3400 		adev->have_atomics_support = true;
3401 	}
3402 
3403 	amdgpu_device_get_pcie_info(adev);
3404 
3405 	if (amdgpu_mcbp)
3406 		DRM_INFO("MCBP is enabled\n");
3407 
3408 	if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3409 		adev->enable_mes = true;
3410 
3411 	/* detect hw virtualization here */
3412 	amdgpu_detect_virtualization(adev);
3413 
3414 	r = amdgpu_device_get_job_timeout_settings(adev);
3415 	if (r) {
3416 		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3417 		goto failed_unmap;
3418 	}
3419 
3420 	/* early init functions */
3421 	r = amdgpu_device_ip_early_init(adev);
3422 	if (r)
3423 		goto failed_unmap;
3424 
3425 	/* doorbell bar mapping and doorbell index init*/
3426 	amdgpu_device_doorbell_init(adev);
3427 
3428 	if (amdgpu_emu_mode == 1) {
3429 		/* post the asic on emulation mode */
3430 		emu_soc_asic_init(adev);
3431 		goto fence_driver_init;
3432 	}
3433 
3434 	amdgpu_reset_init(adev);
3435 
3436 	/* detect if we are with an SRIOV vbios */
3437 	amdgpu_device_detect_sriov_bios(adev);
3438 
3439 	/* check if we need to reset the asic
3440 	 *  E.g., driver was not cleanly unloaded previously, etc.
3441 	 */
3442 	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3443 		if (adev->gmc.xgmi.num_physical_nodes) {
3444 			dev_info(adev->dev, "Pending hive reset.\n");
3445 			adev->gmc.xgmi.pending_reset = true;
3446 			/* Only need to init necessary block for SMU to handle the reset */
3447 			for (i = 0; i < adev->num_ip_blocks; i++) {
3448 				if (!adev->ip_blocks[i].status.valid)
3449 					continue;
3450 				if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3451 				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3452 				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3453 				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
3454 					DRM_DEBUG("IP %s disabled for hw_init.\n",
3455 						adev->ip_blocks[i].version->funcs->name);
3456 					adev->ip_blocks[i].status.hw = true;
3457 				}
3458 			}
3459 		} else {
3460 			r = amdgpu_asic_reset(adev);
3461 			if (r) {
3462 				dev_err(adev->dev, "asic reset on init failed\n");
3463 				goto failed;
3464 			}
3465 		}
3466 	}
3467 
3468 	pci_enable_pcie_error_reporting(adev->pdev);
3469 
3470 	/* Post card if necessary */
3471 	if (amdgpu_device_need_post(adev)) {
3472 		if (!adev->bios) {
3473 			dev_err(adev->dev, "no vBIOS found\n");
3474 			r = -EINVAL;
3475 			goto failed;
3476 		}
3477 		DRM_INFO("GPU posting now...\n");
3478 		r = amdgpu_device_asic_init(adev);
3479 		if (r) {
3480 			dev_err(adev->dev, "gpu post error!\n");
3481 			goto failed;
3482 		}
3483 	}
3484 
3485 	if (adev->is_atom_fw) {
3486 		/* Initialize clocks */
3487 		r = amdgpu_atomfirmware_get_clock_info(adev);
3488 		if (r) {
3489 			dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3490 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3491 			goto failed;
3492 		}
3493 	} else {
3494 		/* Initialize clocks */
3495 		r = amdgpu_atombios_get_clock_info(adev);
3496 		if (r) {
3497 			dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3498 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3499 			goto failed;
3500 		}
3501 		/* init i2c buses */
3502 		if (!amdgpu_device_has_dc_support(adev))
3503 			amdgpu_atombios_i2c_init(adev);
3504 	}
3505 
3506 fence_driver_init:
3507 	/* Fence driver */
3508 	r = amdgpu_fence_driver_init(adev);
3509 	if (r) {
3510 		dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
3511 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3512 		goto failed;
3513 	}
3514 
3515 	/* init the mode config */
3516 	drm_mode_config_init(adev_to_drm(adev));
3517 
3518 	r = amdgpu_device_ip_init(adev);
3519 	if (r) {
3520 		/* failed in exclusive mode due to timeout */
3521 		if (amdgpu_sriov_vf(adev) &&
3522 		    !amdgpu_sriov_runtime(adev) &&
3523 		    amdgpu_virt_mmio_blocked(adev) &&
3524 		    !amdgpu_virt_wait_reset(adev)) {
3525 			dev_err(adev->dev, "VF exclusive mode timeout\n");
3526 			/* Don't send request since VF is inactive. */
3527 			adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3528 			adev->virt.ops = NULL;
3529 			r = -EAGAIN;
3530 			goto release_ras_con;
3531 		}
3532 		dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3533 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3534 		goto release_ras_con;
3535 	}
3536 
3537 	dev_info(adev->dev,
3538 		"SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3539 			adev->gfx.config.max_shader_engines,
3540 			adev->gfx.config.max_sh_per_se,
3541 			adev->gfx.config.max_cu_per_sh,
3542 			adev->gfx.cu_info.number);
3543 
3544 	adev->accel_working = true;
3545 
3546 	amdgpu_vm_check_compute_bug(adev);
3547 
3548 	/* Initialize the buffer migration limit. */
3549 	if (amdgpu_moverate >= 0)
3550 		max_MBps = amdgpu_moverate;
3551 	else
3552 		max_MBps = 8; /* Allow 8 MB/s. */
3553 	/* Get a log2 for easy divisions. */
3554 	adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3555 
3556 	amdgpu_fbdev_init(adev);
3557 
3558 	r = amdgpu_pm_sysfs_init(adev);
3559 	if (r) {
3560 		adev->pm_sysfs_en = false;
3561 		DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3562 	} else
3563 		adev->pm_sysfs_en = true;
3564 
3565 	r = amdgpu_ucode_sysfs_init(adev);
3566 	if (r) {
3567 		adev->ucode_sysfs_en = false;
3568 		DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3569 	} else
3570 		adev->ucode_sysfs_en = true;
3571 
3572 	if ((amdgpu_testing & 1)) {
3573 		if (adev->accel_working)
3574 			amdgpu_test_moves(adev);
3575 		else
3576 			DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3577 	}
3578 	if (amdgpu_benchmarking) {
3579 		if (adev->accel_working)
3580 			amdgpu_benchmark(adev, amdgpu_benchmarking);
3581 		else
3582 			DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3583 	}
3584 
3585 	/*
3586 	 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
	 * Otherwise the mgpu fan boost feature will be skipped since the
	 * gpu instance would not be counted yet.
3589 	 */
3590 	amdgpu_register_gpu_instance(adev);
3591 
3592 	/* enable clockgating, etc. after ib tests, etc. since some blocks require
3593 	 * explicit gating rather than handling it automatically.
3594 	 */
3595 	if (!adev->gmc.xgmi.pending_reset) {
3596 		r = amdgpu_device_ip_late_init(adev);
3597 		if (r) {
3598 			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3599 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3600 			goto release_ras_con;
3601 		}
3602 		/* must succeed. */
3603 		amdgpu_ras_resume(adev);
3604 		queue_delayed_work(system_wq, &adev->delayed_init_work,
3605 				   msecs_to_jiffies(AMDGPU_RESUME_MS));
3606 	}
3607 
3608 	if (amdgpu_sriov_vf(adev))
3609 		flush_delayed_work(&adev->delayed_init_work);
3610 
3611 	r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3612 	if (r)
3613 		dev_err(adev->dev, "Could not create amdgpu device attr\n");
3614 
	if (IS_ENABLED(CONFIG_PERF_EVENTS)) {
		r = amdgpu_pmu_init(adev);
		if (r)
			dev_err(adev->dev, "amdgpu_pmu_init failed\n");
	}
3619 
	/* Keep the stored PCI config space at hand for restore after a sudden PCI error */
3621 	if (amdgpu_device_cache_pci_state(adev->pdev))
3622 		pci_restore_state(pdev);
3623 
	/* if we have > 1 VGA cards, then disable the amdgpu VGA resources.
	 * This will fail for cards that aren't VGA class devices; just
	 * ignore it.
	 */
3627 	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3628 		vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3629 
3630 	if (amdgpu_device_supports_px(ddev)) {
3631 		px = true;
3632 		vga_switcheroo_register_client(adev->pdev,
3633 					       &amdgpu_switcheroo_ops, px);
3634 		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3635 	}
3636 
3637 	if (adev->gmc.xgmi.pending_reset)
3638 		queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
3639 				   msecs_to_jiffies(AMDGPU_RESUME_MS));
3640 
3641 	return 0;
3642 
3643 release_ras_con:
3644 	amdgpu_release_ras_context(adev);
3645 
3646 failed:
3647 	amdgpu_vf_error_trans_all(adev);
3648 
3649 failed_unmap:
3650 	iounmap(adev->rmmio);
3651 	adev->rmmio = NULL;
3652 
3653 	return r;
3654 }
3655 
3656 /**
3657  * amdgpu_device_fini - tear down the driver
3658  *
3659  * @adev: amdgpu_device pointer
3660  *
3661  * Tear down the driver info (all asics).
3662  * Called at driver shutdown.
3663  */
3664 void amdgpu_device_fini(struct amdgpu_device *adev)
3665 {
3666 	dev_info(adev->dev, "amdgpu: finishing device.\n");
3667 	flush_delayed_work(&adev->delayed_init_work);
3668 	ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
3669 	adev->shutdown = true;
3670 
3671 	kfree(adev->pci_state);
3672 
	/* make sure IB test finished before entering exclusive mode
	 * to avoid preemption on IB test
	 */
3676 	if (amdgpu_sriov_vf(adev)) {
3677 		amdgpu_virt_request_full_gpu(adev, false);
3678 		amdgpu_virt_fini_data_exchange(adev);
3679 	}
3680 
3681 	/* disable all interrupts */
3682 	amdgpu_irq_disable_all(adev);
	if (adev->mode_info.mode_config_initialized) {
3684 		if (!amdgpu_device_has_dc_support(adev))
3685 			drm_helper_force_disable_all(adev_to_drm(adev));
3686 		else
3687 			drm_atomic_helper_shutdown(adev_to_drm(adev));
3688 	}
3689 	amdgpu_fence_driver_fini(adev);
3690 	if (adev->pm_sysfs_en)
3691 		amdgpu_pm_sysfs_fini(adev);
3692 	amdgpu_fbdev_fini(adev);
3693 	amdgpu_device_ip_fini(adev);
3694 	release_firmware(adev->firmware.gpu_info_fw);
3695 	adev->firmware.gpu_info_fw = NULL;
3696 	adev->accel_working = false;
3697 
3698 	amdgpu_reset_fini(adev);
3699 
3700 	/* free i2c buses */
3701 	if (!amdgpu_device_has_dc_support(adev))
3702 		amdgpu_i2c_fini(adev);
3703 
3704 	if (amdgpu_emu_mode != 1)
3705 		amdgpu_atombios_fini(adev);
3706 
3707 	kfree(adev->bios);
3708 	adev->bios = NULL;
3709 	if (amdgpu_device_supports_px(adev_to_drm(adev))) {
3710 		vga_switcheroo_unregister_client(adev->pdev);
3711 		vga_switcheroo_fini_domain_pm_ops(adev->dev);
3712 	}
3713 	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3714 		vga_client_register(adev->pdev, NULL, NULL, NULL);
3715 	iounmap(adev->rmmio);
3716 	adev->rmmio = NULL;
3717 	amdgpu_device_doorbell_fini(adev);
3718 
3719 	if (adev->ucode_sysfs_en)
3720 		amdgpu_ucode_sysfs_fini(adev);
3721 
3722 	sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3723 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
3724 		amdgpu_pmu_fini(adev);
3725 	if (adev->mman.discovery_bin)
3726 		amdgpu_discovery_fini(adev);
3727 }
3728 
3729 
3730 /*
3731  * Suspend & resume.
3732  */
3733 /**
3734  * amdgpu_device_suspend - initiate device suspend
3735  *
3736  * @dev: drm dev pointer
 * @fbcon: notify the fbdev of suspend
3738  *
3739  * Puts the hw in the suspend state (all asics).
3740  * Returns 0 for success or an error on failure.
3741  * Called at driver suspend.
3742  */
3743 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
3744 {
3745 	struct amdgpu_device *adev = drm_to_adev(dev);
3746 	int r;
3747 
3748 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3749 		return 0;
3750 
3751 	adev->in_suspend = true;
3752 	drm_kms_helper_poll_disable(dev);
3753 
3754 	if (fbcon)
3755 		amdgpu_fbdev_set_suspend(adev, 1);
3756 
3757 	cancel_delayed_work_sync(&adev->delayed_init_work);
3758 
3759 	amdgpu_ras_suspend(adev);
3760 
3761 	r = amdgpu_device_ip_suspend_phase1(adev);
3762 
3763 	if (!adev->in_s0ix)
3764 		amdgpu_amdkfd_suspend(adev, adev->in_runpm);
3765 
3766 	/* evict vram memory */
3767 	amdgpu_bo_evict_vram(adev);
3768 
3769 	amdgpu_fence_driver_suspend(adev);
3770 
3771 	r = amdgpu_device_ip_suspend_phase2(adev);
3772 	/* evict remaining vram memory
3773 	 * This second call to evict vram is to evict the gart page table
3774 	 * using the CPU.
3775 	 */
3776 	amdgpu_bo_evict_vram(adev);
3777 
3778 	return 0;
3779 }
3780 
3781 /**
3782  * amdgpu_device_resume - initiate device resume
3783  *
3784  * @dev: drm dev pointer
 * @fbcon: notify the fbdev of resume
3786  *
3787  * Bring the hw back to operating state (all asics).
3788  * Returns 0 for success or an error on failure.
3789  * Called at driver resume.
3790  */
3791 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
3792 {
3793 	struct amdgpu_device *adev = drm_to_adev(dev);
3794 	int r = 0;
3795 
3796 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3797 		return 0;
3798 
3799 	if (adev->in_s0ix)
3800 		amdgpu_gfx_state_change_set(adev, sGpuChangeState_D0Entry);
3801 
3802 	/* post card */
3803 	if (amdgpu_device_need_post(adev)) {
3804 		r = amdgpu_device_asic_init(adev);
3805 		if (r)
3806 			dev_err(adev->dev, "amdgpu asic init failed\n");
3807 	}
3808 
3809 	r = amdgpu_device_ip_resume(adev);
3810 	if (r) {
3811 		dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
3812 		return r;
3813 	}
3814 	amdgpu_fence_driver_resume(adev);
3815 
3816 
3817 	r = amdgpu_device_ip_late_init(adev);
3818 	if (r)
3819 		return r;
3820 
3821 	queue_delayed_work(system_wq, &adev->delayed_init_work,
3822 			   msecs_to_jiffies(AMDGPU_RESUME_MS));
3823 
3824 	if (!adev->in_s0ix) {
3825 		r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
3826 		if (r)
3827 			return r;
3828 	}
3829 
3830 	/* Make sure IB tests flushed */
3831 	flush_delayed_work(&adev->delayed_init_work);
3832 
3833 	if (fbcon)
3834 		amdgpu_fbdev_set_suspend(adev, 0);
3835 
3836 	drm_kms_helper_poll_enable(dev);
3837 
3838 	amdgpu_ras_resume(adev);
3839 
3840 	/*
3841 	 * Most of the connector probing functions try to acquire runtime pm
3842 	 * refs to ensure that the GPU is powered on when connector polling is
3843 	 * performed. Since we're calling this from a runtime PM callback,
3844 	 * trying to acquire rpm refs will cause us to deadlock.
3845 	 *
3846 	 * Since we're guaranteed to be holding the rpm lock, it's safe to
3847 	 * temporarily disable the rpm helpers so this doesn't deadlock us.
3848 	 */
3849 #ifdef CONFIG_PM
3850 	dev->dev->power.disable_depth++;
3851 #endif
3852 	if (!amdgpu_device_has_dc_support(adev))
3853 		drm_helper_hpd_irq_event(dev);
3854 	else
3855 		drm_kms_helper_hotplug_event(dev);
3856 #ifdef CONFIG_PM
3857 	dev->dev->power.disable_depth--;
3858 #endif
3859 	adev->in_suspend = false;
3860 
3861 	return 0;
3862 }
3863 
3864 /**
3865  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3866  *
3867  * @adev: amdgpu_device pointer
3868  *
3869  * The list of all the hardware IPs that make up the asic is walked and
3870  * the check_soft_reset callbacks are run.  check_soft_reset determines
3871  * if the asic is still hung or not.
3872  * Returns true if any of the IPs are still in a hung state, false if not.
3873  */
3874 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
3875 {
3876 	int i;
3877 	bool asic_hang = false;
3878 
3879 	if (amdgpu_sriov_vf(adev))
3880 		return true;
3881 
3882 	if (amdgpu_asic_need_full_reset(adev))
3883 		return true;
3884 
3885 	for (i = 0; i < adev->num_ip_blocks; i++) {
3886 		if (!adev->ip_blocks[i].status.valid)
3887 			continue;
3888 		if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3889 			adev->ip_blocks[i].status.hang =
3890 				adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3891 		if (adev->ip_blocks[i].status.hang) {
3892 			dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
3893 			asic_hang = true;
3894 		}
3895 	}
3896 	return asic_hang;
3897 }
3898 
3899 /**
3900  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3901  *
3902  * @adev: amdgpu_device pointer
3903  *
3904  * The list of all the hardware IPs that make up the asic is walked and the
3905  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
3906  * handles any IP specific hardware or software state changes that are
3907  * necessary for a soft reset to succeed.
3908  * Returns 0 on success, negative error code on failure.
3909  */
3910 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
3911 {
3912 	int i, r = 0;
3913 
3914 	for (i = 0; i < adev->num_ip_blocks; i++) {
3915 		if (!adev->ip_blocks[i].status.valid)
3916 			continue;
3917 		if (adev->ip_blocks[i].status.hang &&
3918 		    adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3919 			r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
3920 			if (r)
3921 				return r;
3922 		}
3923 	}
3924 
3925 	return 0;
3926 }
3927 
3928 /**
3929  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3930  *
3931  * @adev: amdgpu_device pointer
3932  *
3933  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
3934  * reset is necessary to recover.
3935  * Returns true if a full asic reset is required, false if not.
3936  */
3937 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
3938 {
3939 	int i;
3940 
3941 	if (amdgpu_asic_need_full_reset(adev))
3942 		return true;
3943 
3944 	for (i = 0; i < adev->num_ip_blocks; i++) {
3945 		if (!adev->ip_blocks[i].status.valid)
3946 			continue;
3947 		if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3948 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3949 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
3950 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3951 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3952 			if (adev->ip_blocks[i].status.hang) {
3953 				dev_info(adev->dev, "Some block need full reset!\n");
3954 				return true;
3955 			}
3956 		}
3957 	}
3958 	return false;
3959 }
3960 
3961 /**
3962  * amdgpu_device_ip_soft_reset - do a soft reset
3963  *
3964  * @adev: amdgpu_device pointer
3965  *
3966  * The list of all the hardware IPs that make up the asic is walked and the
3967  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
3968  * IP specific hardware or software state changes that are necessary to soft
3969  * reset the IP.
3970  * Returns 0 on success, negative error code on failure.
3971  */
3972 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
3973 {
3974 	int i, r = 0;
3975 
3976 	for (i = 0; i < adev->num_ip_blocks; i++) {
3977 		if (!adev->ip_blocks[i].status.valid)
3978 			continue;
3979 		if (adev->ip_blocks[i].status.hang &&
3980 		    adev->ip_blocks[i].version->funcs->soft_reset) {
3981 			r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
3982 			if (r)
3983 				return r;
3984 		}
3985 	}
3986 
3987 	return 0;
3988 }
3989 
3990 /**
3991  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
3992  *
3993  * @adev: amdgpu_device pointer
3994  *
3995  * The list of all the hardware IPs that make up the asic is walked and the
3996  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
3997  * handles any IP specific hardware or software state changes that are
3998  * necessary after the IP has been soft reset.
3999  * Returns 0 on success, negative error code on failure.
4000  */
4001 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
4002 {
4003 	int i, r = 0;
4004 
4005 	for (i = 0; i < adev->num_ip_blocks; i++) {
4006 		if (!adev->ip_blocks[i].status.valid)
4007 			continue;
4008 		if (adev->ip_blocks[i].status.hang &&
4009 		    adev->ip_blocks[i].version->funcs->post_soft_reset)
4010 			r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
4011 		if (r)
4012 			return r;
4013 	}
4014 
4015 	return 0;
4016 }
4017 
4018 /**
4019  * amdgpu_device_recover_vram - Recover some VRAM contents
4020  *
4021  * @adev: amdgpu_device pointer
4022  *
4023  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
4024  * restore things like GPUVM page tables after a GPU reset where
4025  * the contents of VRAM might be lost.
4026  *
4027  * Returns:
4028  * 0 on success, negative error code on failure.
4029  */
4030 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4031 {
4032 	struct dma_fence *fence = NULL, *next = NULL;
4033 	struct amdgpu_bo *shadow;
4034 	long r = 1, tmo;
4035 
4036 	if (amdgpu_sriov_runtime(adev))
4037 		tmo = msecs_to_jiffies(8000);
4038 	else
4039 		tmo = msecs_to_jiffies(100);
4040 
4041 	dev_info(adev->dev, "recover vram bo from shadow start\n");
4042 	mutex_lock(&adev->shadow_list_lock);
4043 	list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
4044 
4045 		/* No need to recover an evicted BO */
4046 		if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
4047 		    shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
4048 		    shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
4049 			continue;
4050 
4051 		r = amdgpu_bo_restore_shadow(shadow, &next);
4052 		if (r)
4053 			break;
4054 
4055 		if (fence) {
4056 			tmo = dma_fence_wait_timeout(fence, false, tmo);
4057 			dma_fence_put(fence);
4058 			fence = next;
4059 			if (tmo == 0) {
4060 				r = -ETIMEDOUT;
4061 				break;
4062 			} else if (tmo < 0) {
4063 				r = tmo;
4064 				break;
4065 			}
4066 		} else {
4067 			fence = next;
4068 		}
4069 	}
4070 	mutex_unlock(&adev->shadow_list_lock);
4071 
4072 	if (fence)
4073 		tmo = dma_fence_wait_timeout(fence, false, tmo);
4074 	dma_fence_put(fence);
4075 
4076 	if (r < 0 || tmo <= 0) {
4077 		dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4078 		return -EIO;
4079 	}
4080 
4081 	dev_info(adev->dev, "recover vram bo from shadow done\n");
4082 	return 0;
4083 }
4084 
4085 
4086 /**
4087  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4088  *
4089  * @adev: amdgpu_device pointer
4090  * @from_hypervisor: request from hypervisor
4091  *
 * Do a VF FLR and reinitialize the ASIC.
 * Returns 0 on success, an error code otherwise.
4094  */
4095 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4096 				     bool from_hypervisor)
4097 {
4098 	int r;
4099 
4100 	if (from_hypervisor)
4101 		r = amdgpu_virt_request_full_gpu(adev, true);
4102 	else
4103 		r = amdgpu_virt_reset_gpu(adev);
4104 	if (r)
4105 		return r;
4106 
4107 	amdgpu_amdkfd_pre_reset(adev);
4108 
4109 	/* Resume IP prior to SMC */
4110 	r = amdgpu_device_ip_reinit_early_sriov(adev);
4111 	if (r)
4112 		goto error;
4113 
4114 	amdgpu_virt_init_data_exchange(adev);
	/* we need to recover the GART prior to running SMC/CP/SDMA resume */
4116 	amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
4117 
4118 	r = amdgpu_device_fw_loading(adev);
4119 	if (r)
4120 		return r;
4121 
4122 	/* now we are okay to resume SMC/CP/SDMA */
4123 	r = amdgpu_device_ip_reinit_late_sriov(adev);
4124 	if (r)
4125 		goto error;
4126 
4127 	amdgpu_irq_gpu_reset_resume_helper(adev);
4128 	r = amdgpu_ib_ring_tests(adev);
4129 	amdgpu_amdkfd_post_reset(adev);
4130 
4131 error:
4132 	if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4133 		amdgpu_inc_vram_lost(adev);
4134 		r = amdgpu_device_recover_vram(adev);
4135 	}
4136 	amdgpu_virt_release_full_gpu(adev, true);
4137 
4138 	return r;
4139 }
4140 
4141 /**
4142  * amdgpu_device_has_job_running - check if there is any job in mirror list
4143  *
4144  * @adev: amdgpu_device pointer
4145  *
4146  * check if there is any job in mirror list
4147  */
4148 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4149 {
4150 	int i;
4151 	struct drm_sched_job *job;
4152 
4153 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4154 		struct amdgpu_ring *ring = adev->rings[i];
4155 
4156 		if (!ring || !ring->sched.thread)
4157 			continue;
4158 
4159 		spin_lock(&ring->sched.job_list_lock);
4160 		job = list_first_entry_or_null(&ring->sched.pending_list,
4161 					       struct drm_sched_job, list);
4162 		spin_unlock(&ring->sched.job_list_lock);
4163 		if (job)
4164 			return true;
4165 	}
4166 	return false;
4167 }
4168 
4169 /**
4170  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4171  *
4172  * @adev: amdgpu_device pointer
4173  *
4174  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4175  * a hung GPU.
4176  */
4177 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4178 {
4179 	if (!amdgpu_device_ip_check_soft_reset(adev)) {
4180 		dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
4181 		return false;
4182 	}
4183 
4184 	if (amdgpu_gpu_recovery == 0)
4185 		goto disabled;
4186 
4187 	if (amdgpu_sriov_vf(adev))
4188 		return true;
4189 
4190 	if (amdgpu_gpu_recovery == -1) {
4191 		switch (adev->asic_type) {
4192 		case CHIP_BONAIRE:
4193 		case CHIP_HAWAII:
4194 		case CHIP_TOPAZ:
4195 		case CHIP_TONGA:
4196 		case CHIP_FIJI:
4197 		case CHIP_POLARIS10:
4198 		case CHIP_POLARIS11:
4199 		case CHIP_POLARIS12:
4200 		case CHIP_VEGAM:
4201 		case CHIP_VEGA20:
4202 		case CHIP_VEGA10:
4203 		case CHIP_VEGA12:
4204 		case CHIP_RAVEN:
4205 		case CHIP_ARCTURUS:
4206 		case CHIP_RENOIR:
4207 		case CHIP_NAVI10:
4208 		case CHIP_NAVI14:
4209 		case CHIP_NAVI12:
4210 		case CHIP_SIENNA_CICHLID:
4211 		case CHIP_NAVY_FLOUNDER:
4212 		case CHIP_DIMGREY_CAVEFISH:
4213 		case CHIP_VANGOGH:
4214 		case CHIP_ALDEBARAN:
4215 			break;
4216 		default:
4217 			goto disabled;
4218 		}
4219 	}
4220 
4221 	return true;
4222 
4223 disabled:
4224 		dev_info(adev->dev, "GPU recovery disabled.\n");
4225 		return false;
4226 }
4227 
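/**
 * amdgpu_device_mode1_reset - perform a mode1 reset of the whole ASIC
 *
 * @adev: amdgpu_device pointer
 *
 * Disables bus mastering, caches the PCI config space and triggers the
 * reset through the SMU or the PSP, whichever the hardware supports, then
 * restores the config space and waits for the ASIC to come back out of
 * reset.
 * Returns 0 on success, negative error code on failure.
 */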
4228 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4229 {
	u32 i;
	int ret = 0;

	amdgpu_atombios_scratch_regs_engine_hung(adev, true);

	dev_info(adev->dev, "GPU mode1 reset\n");

	/* disable BM */
	pci_clear_master(adev->pdev);

	amdgpu_device_cache_pci_state(adev->pdev);

	if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
		dev_info(adev->dev, "GPU smu mode1 reset\n");
		ret = amdgpu_dpm_mode1_reset(adev);
	} else {
		dev_info(adev->dev, "GPU psp mode1 reset\n");
		ret = psp_gpu_reset(adev);
	}

	if (ret)
		dev_err(adev->dev, "GPU mode1 reset failed\n");

	amdgpu_device_load_pci_state(adev->pdev);

	/* wait for asic to come out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		u32 memsize = adev->nbio.funcs->get_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}

	amdgpu_atombios_scratch_regs_engine_hung(adev, false);
	return ret;
4266 }
4267 
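/**
 * amdgpu_device_pre_asic_reset - prepare a device for ASIC reset
 *
 * @adev: amdgpu_device pointer
 * @reset_context: reset context describing the request
 *
 * Forces completion of the outstanding hardware fences, increases the
 * guilty job's karma and, on bare metal, tries a soft reset first. Sets
 * or clears AMDGPU_NEED_FULL_RESET in the reset context flags depending
 * on whether a full reset is still required.
 * Returns 0 on success, negative error code on failure.
 */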
4268 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4269 				 struct amdgpu_reset_context *reset_context)
4270 {
4271 	int i, r = 0;
4272 	struct amdgpu_job *job = NULL;
4273 	bool need_full_reset =
4274 		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4275 
4276 	if (reset_context->reset_req_dev == adev)
4277 		job = reset_context->job;
4278 
	/* no need to dump if the device is not in a good state during the probe period */
4280 	if (!adev->gmc.xgmi.pending_reset)
4281 		amdgpu_debugfs_wait_dump(adev);
4282 
4283 	if (amdgpu_sriov_vf(adev)) {
4284 		/* stop the data exchange thread */
4285 		amdgpu_virt_fini_data_exchange(adev);
4286 	}
4287 
4288 	/* block all schedulers and reset given job's ring */
4289 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4290 		struct amdgpu_ring *ring = adev->rings[i];
4291 
4292 		if (!ring || !ring->sched.thread)
4293 			continue;
4294 
4295 		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4296 		amdgpu_fence_driver_force_completion(ring);
4297 	}
4298 
	if (job)
4300 		drm_sched_increase_karma(&job->base);
4301 
4302 	r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
4303 	/* If reset handler not implemented, continue; otherwise return */
4304 	if (r == -ENOSYS)
4305 		r = 0;
4306 	else
4307 		return r;
4308 
4309 	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4310 	if (!amdgpu_sriov_vf(adev)) {
4311 
4312 		if (!need_full_reset)
4313 			need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4314 
4315 		if (!need_full_reset) {
4316 			amdgpu_device_ip_pre_soft_reset(adev);
4317 			r = amdgpu_device_ip_soft_reset(adev);
4318 			amdgpu_device_ip_post_soft_reset(adev);
4319 			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4320 				dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4321 				need_full_reset = true;
4322 			}
4323 		}
4324 
4325 		if (need_full_reset)
4326 			r = amdgpu_device_ip_suspend(adev);
4327 		if (need_full_reset)
4328 			set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4329 		else
4330 			clear_bit(AMDGPU_NEED_FULL_RESET,
4331 				  &reset_context->flags);
4332 	}
4333 
4334 	return r;
4335 }
4336 
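/**
 * amdgpu_do_asic_reset - reset and re-initialize a list of devices
 *
 * @device_list_handle: list of devices to reset (one device or an XGMI hive)
 * @reset_context: reset context describing the request
 *
 * Tries the ASIC specific reset handler first. If none is implemented, the
 * default path resets every device in the list (in parallel for XGMI
 * hives), re-posts the cards, resumes the IP blocks, runs the IB ring
 * tests and recovers VRAM contents from their shadows.
 * Returns 0 on success, negative error code on failure.
 */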
4337 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
4338 			 struct amdgpu_reset_context *reset_context)
4339 {
4340 	struct amdgpu_device *tmp_adev = NULL;
4341 	bool need_full_reset, skip_hw_reset, vram_lost = false;
4342 	int r = 0;
4343 
4344 	/* Try reset handler method first */
4345 	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
4346 				    reset_list);
4347 	r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
4348 	/* If reset handler not implemented, continue; otherwise return */
4349 	if (r == -ENOSYS)
4350 		r = 0;
4351 	else
4352 		return r;
4353 
4354 	/* Reset handler not implemented, use the default method */
4355 	need_full_reset =
4356 		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4357 	skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
4358 
4359 	/*
4360 	 * ASIC reset has to be done on all XGMI hive nodes ASAP
	 * to allow proper link negotiation in the FW (within 1 sec)
4362 	 */
4363 	if (!skip_hw_reset && need_full_reset) {
4364 		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4365 			/* For XGMI run all resets in parallel to speed up the process */
4366 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4367 				tmp_adev->gmc.xgmi.pending_reset = false;
4368 				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4369 					r = -EALREADY;
4370 			} else
4371 				r = amdgpu_asic_reset(tmp_adev);
4372 
4373 			if (r) {
4374 				dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4375 					 r, adev_to_drm(tmp_adev)->unique);
4376 				break;
4377 			}
4378 		}
4379 
4380 		/* For XGMI wait for all resets to complete before proceed */
4381 		if (!r) {
4382 			list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4383 				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4384 					flush_work(&tmp_adev->xgmi_reset_work);
4385 					r = tmp_adev->asic_reset_res;
4386 					if (r)
4387 						break;
4388 				}
4389 			}
4390 		}
4391 	}
4392 
4393 	if (!r && amdgpu_ras_intr_triggered()) {
4394 		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4395 			if (tmp_adev->mmhub.ras_funcs &&
4396 			    tmp_adev->mmhub.ras_funcs->reset_ras_error_count)
4397 				tmp_adev->mmhub.ras_funcs->reset_ras_error_count(tmp_adev);
4398 		}
4399 
4400 		amdgpu_ras_intr_cleared();
4401 	}
4402 
4403 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4404 		if (need_full_reset) {
4405 			/* post card */
4406 			r = amdgpu_device_asic_init(tmp_adev);
4407 			if (r) {
4408 				dev_warn(tmp_adev->dev, "asic atom init failed!");
4409 			} else {
4410 				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4411 				r = amdgpu_device_ip_resume_phase1(tmp_adev);
4412 				if (r)
4413 					goto out;
4414 
4415 				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4416 				if (vram_lost) {
4417 					DRM_INFO("VRAM is lost due to GPU reset!\n");
4418 					amdgpu_inc_vram_lost(tmp_adev);
4419 				}
4420 
4421 				r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
4422 				if (r)
4423 					goto out;
4424 
4425 				r = amdgpu_device_fw_loading(tmp_adev);
4426 				if (r)
4427 					return r;
4428 
4429 				r = amdgpu_device_ip_resume_phase2(tmp_adev);
4430 				if (r)
4431 					goto out;
4432 
4433 				if (vram_lost)
4434 					amdgpu_device_fill_reset_magic(tmp_adev);
4435 
4436 				/*
				 * Add this ASIC back as tracked since the reset
				 * already completed successfully.
4439 				 */
4440 				amdgpu_register_gpu_instance(tmp_adev);
4441 
4442 				if (!reset_context->hive &&
4443 				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4444 					amdgpu_xgmi_add_device(tmp_adev);
4445 
4446 				r = amdgpu_device_ip_late_init(tmp_adev);
4447 				if (r)
4448 					goto out;
4449 
4450 				amdgpu_fbdev_set_suspend(tmp_adev, 0);
4451 
4452 				/*
				 * The GPU enters a bad state once the number of
				 * faulty pages found by ECC reaches the threshold,
				 * and RAS recovery is scheduled next. So add a check
				 * here to break recovery if it indeed exceeds the
				 * bad page threshold, and remind the user to retire
				 * this GPU or set a bigger bad_page_threshold value
				 * to fix this when probing the driver again.
4461 				 */
4462 				if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
4463 					/* must succeed. */
4464 					amdgpu_ras_resume(tmp_adev);
4465 				} else {
4466 					r = -EINVAL;
4467 					goto out;
4468 				}
4469 
4470 				/* Update PSP FW topology after reset */
4471 				if (reset_context->hive &&
4472 				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4473 					r = amdgpu_xgmi_update_topology(
4474 						reset_context->hive, tmp_adev);
4475 			}
4476 		}
4477 
4478 out:
4479 		if (!r) {
4480 			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4481 			r = amdgpu_ib_ring_tests(tmp_adev);
4482 			if (r) {
4483 				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4484 				need_full_reset = true;
4485 				r = -EAGAIN;
4486 				goto end;
4487 			}
4488 		}
4489 
4490 		if (!r)
4491 			r = amdgpu_device_recover_vram(tmp_adev);
4492 		else
4493 			tmp_adev->asic_reset_res = r;
4494 	}
4495 
4496 end:
4497 	if (need_full_reset)
4498 		set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4499 	else
4500 		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4501 	return r;
4502 }
4503 
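/*
 * Take the per device reset lock and set the MP1 state required by the
 * chosen reset method. Returns false if a reset is already in progress
 * on this device.
 */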
4504 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4505 				struct amdgpu_hive_info *hive)
4506 {
4507 	if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4508 		return false;
4509 
4510 	if (hive) {
4511 		down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4512 	} else {
4513 		down_write(&adev->reset_sem);
4514 	}
4515 
4516 	switch (amdgpu_asic_reset_method(adev)) {
4517 	case AMD_RESET_METHOD_MODE1:
4518 		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4519 		break;
4520 	case AMD_RESET_METHOD_MODE2:
4521 		adev->mp1_state = PP_MP1_STATE_RESET;
4522 		break;
4523 	default:
4524 		adev->mp1_state = PP_MP1_STATE_NONE;
4525 		break;
4526 	}
4527 
4528 	return true;
4529 }
4530 
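/*
 * Flush pending VF errors, restore the MP1 state and drop the per device
 * reset lock taken by amdgpu_device_lock_adev().
 */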
4531 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4532 {
4533 	amdgpu_vf_error_trans_all(adev);
4534 	adev->mp1_state = PP_MP1_STATE_NONE;
4535 	atomic_set(&adev->in_gpu_reset, 0);
4536 	up_write(&adev->reset_sem);
4537 }
4538 
4539 /*
 * Lock a list of amdgpu devices in a hive safely. If the hive does not
 * have multiple nodes, this behaves the same as amdgpu_device_lock_adev.
 *
 * Unlock won't require a roll back.
4544  */
4545 static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
4546 {
4547 	struct amdgpu_device *tmp_adev = NULL;
4548 
4549 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
4550 		if (!hive) {
4551 			dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
4552 			return -ENODEV;
4553 		}
4554 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
4555 			if (!amdgpu_device_lock_adev(tmp_adev, hive))
4556 				goto roll_back;
4557 		}
4558 	} else if (!amdgpu_device_lock_adev(adev, hive))
4559 		return -EAGAIN;
4560 
4561 	return 0;
4562 roll_back:
4563 	if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
4564 		/*
		 * If the lock iteration broke in the middle of a hive, it may
		 * mean there is a race issue, or a hive device locked up
		 * independently. We may or may not be in trouble, so try to
		 * roll back the locks and print a warning.
4570 		 */
4571 		dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
4572 		list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
4573 			amdgpu_device_unlock_adev(tmp_adev);
4574 		}
4575 	}
4576 	return -EAGAIN;
4577 }
4578 
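/*
 * Re-enable runtime PM for the display audio function of the GPU (devfn 1
 * on the same bus) and resume it once the reset has completed.
 */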
4579 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4580 {
4581 	struct pci_dev *p = NULL;
4582 
4583 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4584 			adev->pdev->bus->number, 1);
4585 	if (p) {
4586 		pm_runtime_enable(&(p->dev));
4587 		pm_runtime_resume(&(p->dev));
4588 	}
4589 }
4590 
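/*
 * Put the display audio function of the GPU (devfn 1 on the same bus) into
 * runtime suspend and disable its runtime PM before a BACO or mode1 reset,
 * so the reset does not disturb the audio hardware behind the audio
 * driver's back. Returns 0 on success, negative error code otherwise.
 */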
4591 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4592 {
4593 	enum amd_reset_method reset_method;
4594 	struct pci_dev *p = NULL;
4595 	u64 expires;
4596 
4597 	/*
	 * For now, only BACO and mode1 reset are confirmed
	 * to suffer from the audio issue when not properly suspended.
4600 	 */
4601 	reset_method = amdgpu_asic_reset_method(adev);
4602 	if ((reset_method != AMD_RESET_METHOD_BACO) &&
4603 	     (reset_method != AMD_RESET_METHOD_MODE1))
4604 		return -EINVAL;
4605 
4606 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4607 			adev->pdev->bus->number, 1);
4608 	if (!p)
4609 		return -ENODEV;
4610 
4611 	expires = pm_runtime_autosuspend_expiration(&(p->dev));
4612 	if (!expires)
4613 		/*
		 * If we cannot get the audio device autosuspend delay,
		 * a fixed 4s interval will be used. Since 3s is the audio
		 * controller's default autosuspend delay, the 4s used here
		 * is guaranteed to cover it.
4618 		 */
4619 		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4620 
4621 	while (!pm_runtime_status_suspended(&(p->dev))) {
4622 		if (!pm_runtime_suspend(&(p->dev)))
4623 			break;
4624 
4625 		if (expires < ktime_get_mono_fast_ns()) {
4626 			dev_warn(adev->dev, "failed to suspend display audio\n");
4627 			/* TODO: abort the succeeding gpu reset? */
4628 			return -ETIMEDOUT;
4629 		}
4630 	}
4631 
4632 	pm_runtime_disable(&(p->dev));
4633 
4634 	return 0;
4635 }
4636 
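/**
 * amdgpu_device_recheck_guilty_jobs - find the job that really hung the GPU
 *
 * @adev: amdgpu_device pointer
 * @device_list_handle: list of devices taking part in the reset
 * @reset_context: reset context describing the request
 *
 * Resubmits the first pending job on each ring one at a time and waits for
 * its hardware fence. A job whose fence wait times out is marked as the
 * real guilty one and triggers another hardware reset; jobs that complete
 * are signaled and freed.
 */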
4637 void amdgpu_device_recheck_guilty_jobs(
4638 	struct amdgpu_device *adev, struct list_head *device_list_handle,
4639 	struct amdgpu_reset_context *reset_context)
4640 {
4641 	int i, r = 0;
4642 
4643 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4644 		struct amdgpu_ring *ring = adev->rings[i];
4645 		int ret = 0;
4646 		struct drm_sched_job *s_job;
4647 
4648 		if (!ring || !ring->sched.thread)
4649 			continue;
4650 
4651 		s_job = list_first_entry_or_null(&ring->sched.pending_list,
4652 				struct drm_sched_job, list);
4653 		if (s_job == NULL)
4654 			continue;
4655 
		/* clear the job's guilty flag and rely on the following step to decide the real one */
4657 		drm_sched_reset_karma(s_job);
4658 		drm_sched_resubmit_jobs_ext(&ring->sched, 1);
4659 
4660 		ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
4661 		if (ret == 0) { /* timeout */
4662 			DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
4663 						ring->sched.name, s_job->id);
4664 
4665 			/* set guilty */
4666 			drm_sched_increase_karma(s_job);
4667 retry:
4668 			/* do hw reset */
4669 			if (amdgpu_sriov_vf(adev)) {
4670 				amdgpu_virt_fini_data_exchange(adev);
4671 				r = amdgpu_device_reset_sriov(adev, false);
4672 				if (r)
4673 					adev->asic_reset_res = r;
4674 			} else {
4675 				clear_bit(AMDGPU_SKIP_HW_RESET,
4676 					  &reset_context->flags);
4677 				r = amdgpu_do_asic_reset(device_list_handle,
4678 							 reset_context);
4679 				if (r && r == -EAGAIN)
4680 					goto retry;
4681 			}
4682 
4683 			/*
4684 			 * add reset counter so that the following
4685 			 * resubmitted job could flush vmid
4686 			 */
4687 			atomic_inc(&adev->gpu_reset_counter);
4688 			continue;
4689 		}
4690 
4691 		/* got the hw fence, signal finished fence */
4692 		atomic_dec(ring->sched.score);
4693 		dma_fence_get(&s_job->s_fence->finished);
4694 		dma_fence_signal(&s_job->s_fence->finished);
4695 		dma_fence_put(&s_job->s_fence->finished);
4696 
4697 		/* remove node from list and free the job */
4698 		spin_lock(&ring->sched.job_list_lock);
4699 		list_del_init(&s_job->list);
4700 		spin_unlock(&ring->sched.job_list_lock);
4701 		ring->sched.ops->free_job(s_job);
4702 	}
4703 }
4704 
4705 /**
4706  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4707  *
4708  * @adev: amdgpu_device pointer
 * @job: which job triggered the hang
 *
 * Attempt to reset the GPU if it has hung (all asics).
 * Attempt a soft reset or a full reset and reinitialize the ASIC.
 * Returns 0 for success or an error on failure.
4714  */
4715 
4716 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4717 			      struct amdgpu_job *job)
4718 {
4719 	struct list_head device_list, *device_list_handle =  NULL;
4720 	bool job_signaled = false;
4721 	struct amdgpu_hive_info *hive = NULL;
4722 	struct amdgpu_device *tmp_adev = NULL;
4723 	int i, r = 0;
4724 	bool need_emergency_restart = false;
4725 	bool audio_suspended = false;
4726 	int tmp_vram_lost_counter;
4727 	struct amdgpu_reset_context reset_context;
4728 
4729 	memset(&reset_context, 0, sizeof(reset_context));
4730 
4731 	/*
4732 	 * Special case: RAS triggered and full reset isn't supported
4733 	 */
4734 	need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4735 
4736 	/*
4737 	 * Flush RAM to disk so that after reboot
	 * the user can read the log and see why the system rebooted.
4739 	 */
4740 	if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
4741 		DRM_WARN("Emergency reboot.");
4742 
4743 		ksys_sync_helper();
4744 		emergency_restart();
4745 	}
4746 
4747 	dev_info(adev->dev, "GPU %s begin!\n",
4748 		need_emergency_restart ? "jobs stop":"reset");
4749 
4750 	/*
	 * Here we trylock to avoid a chain of resets executing from either a
	 * trigger by jobs on different adevs in an XGMI hive or jobs on
	 * different schedulers for the same device while this TO handler is
	 * running. We always reset all schedulers for a device and all devices
	 * in an XGMI hive, so that should take care of them too.
4756 	 */
4757 	hive = amdgpu_get_xgmi_hive(adev);
4758 	if (hive) {
4759 		if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
4760 			DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4761 				job ? job->base.id : -1, hive->hive_id);
4762 			amdgpu_put_xgmi_hive(hive);
4763 			if (job)
4764 				drm_sched_increase_karma(&job->base);
4765 			return 0;
4766 		}
4767 		mutex_lock(&hive->hive_lock);
4768 	}
4769 
4770 	reset_context.method = AMD_RESET_METHOD_NONE;
4771 	reset_context.reset_req_dev = adev;
4772 	reset_context.job = job;
4773 	reset_context.hive = hive;
4774 	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
4775 
4776 	/*
	 * Lock the device before we try to operate on the linked list.
	 * If we didn't get the device lock, don't touch the linked list
	 * since others may be iterating it.
4780 	 */
4781 	r = amdgpu_device_lock_hive_adev(adev, hive);
4782 	if (r) {
4783 		dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
4784 					job ? job->base.id : -1);
4785 
		/* even though we skipped this reset, we still need to mark the job as guilty */
4787 		if (job)
4788 			drm_sched_increase_karma(&job->base);
4789 		goto skip_recovery;
4790 	}
4791 
4792 	/*
4793 	 * Build list of devices to reset.
4794 	 * In case we are in XGMI hive mode, resort the device list
4795 	 * to put adev in the 1st position.
4796 	 */
4797 	INIT_LIST_HEAD(&device_list);
4798 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
4799 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
4800 			list_add_tail(&tmp_adev->reset_list, &device_list);
4801 		if (!list_is_first(&adev->reset_list, &device_list))
4802 			list_rotate_to_front(&adev->reset_list, &device_list);
4803 		device_list_handle = &device_list;
4804 	} else {
4805 		list_add_tail(&adev->reset_list, &device_list);
4806 		device_list_handle = &device_list;
4807 	}
4808 
4809 	/* block all schedulers and reset given job's ring */
4810 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4811 		/*
		 * Try to put the audio codec into the suspend state
		 * before the gpu reset starts.
		 *
		 * The power domain of the graphics device is shared
		 * with the AZ power domain. Without this, we may
		 * change the audio hardware from behind the audio
		 * driver's back, which will trigger audio codec
		 * errors.
4820 		 */
4821 		if (!amdgpu_device_suspend_display_audio(tmp_adev))
4822 			audio_suspended = true;
4823 
4824 		amdgpu_ras_set_error_query_ready(tmp_adev, false);
4825 
4826 		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4827 
4828 		if (!amdgpu_sriov_vf(tmp_adev))
4829 			amdgpu_amdkfd_pre_reset(tmp_adev);
4830 
4831 		/*
		 * Mark the ASICs to be reset as untracked first,
		 * and add them back after the reset completes.
4834 		 */
4835 		amdgpu_unregister_gpu_instance(tmp_adev);
4836 
4837 		amdgpu_fbdev_set_suspend(tmp_adev, 1);
4838 
4839 		/* disable ras on ALL IPs */
4840 		if (!need_emergency_restart &&
4841 		      amdgpu_device_ip_need_full_reset(tmp_adev))
4842 			amdgpu_ras_suspend(tmp_adev);
4843 
4844 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4845 			struct amdgpu_ring *ring = tmp_adev->rings[i];
4846 
4847 			if (!ring || !ring->sched.thread)
4848 				continue;
4849 
4850 			drm_sched_stop(&ring->sched, job ? &job->base : NULL);
4851 
4852 			if (need_emergency_restart)
4853 				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
4854 		}
4855 		atomic_inc(&tmp_adev->gpu_reset_counter);
4856 	}
4857 
4858 	if (need_emergency_restart)
4859 		goto skip_sched_resume;
4860 
4861 	/*
4862 	 * Must check guilty signal here since after this point all old
4863 	 * HW fences are force signaled.
4864 	 *
4865 	 * job->base holds a reference to parent fence
4866 	 */
4867 	if (job && job->base.s_fence->parent &&
4868 	    dma_fence_is_signaled(job->base.s_fence->parent)) {
4869 		job_signaled = true;
4870 		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4871 		goto skip_hw_reset;
4872 	}
4873 
4874 retry:	/* Rest of adevs pre asic reset from XGMI hive. */
4875 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4876 		r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context);
		/* TODO: Should we stop? */
4878 		if (r) {
4879 			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4880 				  r, adev_to_drm(tmp_adev)->unique);
4881 			tmp_adev->asic_reset_res = r;
4882 		}
4883 	}
4884 
4885 	tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
	/* Actual ASIC resets if needed. */
4887 	/* TODO Implement XGMI hive reset logic for SRIOV */
4888 	if (amdgpu_sriov_vf(adev)) {
4889 		r = amdgpu_device_reset_sriov(adev, job ? false : true);
4890 		if (r)
4891 			adev->asic_reset_res = r;
4892 	} else {
4893 		r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
4894 		if (r && r == -EAGAIN)
4895 			goto retry;
4896 	}
4897 
4898 skip_hw_reset:
4899 
	/* Post ASIC reset for all devs. */
4901 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4902 
4903 		/*
		 * Sometimes a later bad compute job can block a good gfx job as the
		 * gfx and compute rings share internal GC HW. We add an additional
		 * guilty-job recheck step to find the real guilty job: it synchronously
		 * resubmits and waits for the first job to be signaled. If that wait
		 * times out, we identify it as the real guilty job.
4909 		 */
4910 		if (amdgpu_gpu_recovery == 2 &&
4911 			!(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
4912 			amdgpu_device_recheck_guilty_jobs(
4913 				tmp_adev, device_list_handle, &reset_context);
4914 
4915 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4916 			struct amdgpu_ring *ring = tmp_adev->rings[i];
4917 
4918 			if (!ring || !ring->sched.thread)
4919 				continue;
4920 
			/* No point in resubmitting jobs if we didn't HW reset */
4922 			if (!tmp_adev->asic_reset_res && !job_signaled)
4923 				drm_sched_resubmit_jobs(&ring->sched);
4924 
4925 			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4926 		}
4927 
4928 		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4929 			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
4930 		}
4931 
4932 		tmp_adev->asic_reset_res = 0;
4933 
4934 		if (r) {
			/* bad news, how to tell it to userspace? */
4936 			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
4937 			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4938 		} else {
4939 			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
4940 		}
4941 	}
4942 
4943 skip_sched_resume:
4944 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4945 		/* unlock kfd: SRIOV would do it separately */
4946 		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);
4948 
		/* kfd_post_reset will do nothing if the kfd device is not initialized;
		 * we need to bring up kfd here if it has not been initialized before
		 */
		if (!tmp_adev->kfd.init_complete)
			amdgpu_amdkfd_device_init(tmp_adev);
4954 
4955 		if (audio_suspended)
4956 			amdgpu_device_resume_display_audio(tmp_adev);
4957 		amdgpu_device_unlock_adev(tmp_adev);
4958 	}
4959 
4960 skip_recovery:
4961 	if (hive) {
4962 		atomic_set(&hive->in_reset, 0);
4963 		mutex_unlock(&hive->hive_lock);
4964 		amdgpu_put_xgmi_hive(hive);
4965 	}
4966 
4967 	if (r && r != -EAGAIN)
4968 		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
4969 	return r;
4970 }
4971 
4972 /**
 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
4980  */
4981 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
4982 {
4983 	struct pci_dev *pdev;
4984 	enum pci_bus_speed speed_cap, platform_speed_cap;
4985 	enum pcie_link_width platform_link_width;
4986 
4987 	if (amdgpu_pcie_gen_cap)
4988 		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
4989 
4990 	if (amdgpu_pcie_lane_cap)
4991 		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
4992 
4993 	/* covers APUs as well */
4994 	if (pci_is_root_bus(adev->pdev->bus)) {
4995 		if (adev->pm.pcie_gen_mask == 0)
4996 			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
4997 		if (adev->pm.pcie_mlw_mask == 0)
4998 			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
4999 		return;
5000 	}
5001 
5002 	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
5003 		return;
5004 
5005 	pcie_bandwidth_available(adev->pdev, NULL,
5006 				 &platform_speed_cap, &platform_link_width);
5007 
5008 	if (adev->pm.pcie_gen_mask == 0) {
5009 		/* asic caps */
5010 		pdev = adev->pdev;
5011 		speed_cap = pcie_get_speed_cap(pdev);
5012 		if (speed_cap == PCI_SPEED_UNKNOWN) {
5013 			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5014 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5015 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5016 		} else {
5017 			if (speed_cap == PCIE_SPEED_32_0GT)
5018 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5019 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5020 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5021 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5022 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
5023 			else if (speed_cap == PCIE_SPEED_16_0GT)
5024 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5025 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5026 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5027 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
5028 			else if (speed_cap == PCIE_SPEED_8_0GT)
5029 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5030 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5031 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5032 			else if (speed_cap == PCIE_SPEED_5_0GT)
5033 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5034 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
5035 			else
5036 				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
5037 		}
5038 		/* platform caps */
5039 		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5040 			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5041 						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5042 		} else {
5043 			if (platform_speed_cap == PCIE_SPEED_32_0GT)
5044 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5045 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5046 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5047 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5048 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
5049 			else if (platform_speed_cap == PCIE_SPEED_16_0GT)
5050 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5051 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5052 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5053 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
5054 			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5055 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5056 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5057 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
5058 			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5059 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5060 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5061 			else
5062 				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
5063 
5064 		}
5065 	}
5066 	if (adev->pm.pcie_mlw_mask == 0) {
5067 		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5068 			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
5069 		} else {
5070 			switch (platform_link_width) {
5071 			case PCIE_LNK_X32:
5072 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
5073 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5074 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5075 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5076 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5077 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5078 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5079 				break;
5080 			case PCIE_LNK_X16:
5081 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5082 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5083 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5084 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5085 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5086 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5087 				break;
5088 			case PCIE_LNK_X12:
5089 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5090 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5091 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5092 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5093 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5094 				break;
5095 			case PCIE_LNK_X8:
5096 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5097 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5098 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5099 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5100 				break;
5101 			case PCIE_LNK_X4:
5102 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5103 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5104 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5105 				break;
5106 			case PCIE_LNK_X2:
5107 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5108 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5109 				break;
5110 			case PCIE_LNK_X1:
5111 				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
5112 				break;
5113 			default:
5114 				break;
5115 			}
5116 		}
5117 	}
5118 }
5119 
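/**
 * amdgpu_device_baco_enter - put the ASIC into the BACO state
 *
 * @dev: drm dev pointer
 *
 * Disables the doorbell interrupt when RAS is supported and asks the DPM
 * code to enter BACO (Bus Active, Chip Off).
 * Returns 0 on success, -ENOTSUPP if BACO is not supported, or another
 * negative error code on failure.
 */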
5120 int amdgpu_device_baco_enter(struct drm_device *dev)
5121 {
5122 	struct amdgpu_device *adev = drm_to_adev(dev);
5123 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5124 
5125 	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
5126 		return -ENOTSUPP;
5127 
5128 	if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
5129 		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5130 
5131 	return amdgpu_dpm_baco_enter(adev);
5132 }
5133 
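/**
 * amdgpu_device_baco_exit - bring the ASIC back out of the BACO state
 *
 * @dev: drm dev pointer
 *
 * Asks the DPM code to exit BACO and re-enables the doorbell interrupt
 * when RAS is supported.
 * Returns 0 on success, negative error code on failure.
 */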
5134 int amdgpu_device_baco_exit(struct drm_device *dev)
5135 {
5136 	struct amdgpu_device *adev = drm_to_adev(dev);
5137 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5138 	int ret = 0;
5139 
5140 	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
5141 		return -ENOTSUPP;
5142 
5143 	ret = amdgpu_dpm_baco_exit(adev);
5144 	if (ret)
5145 		return ret;
5146 
5147 	if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
5148 		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5149 
5150 	return 0;
5151 }
5152 
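/*
 * Cancel the outstanding scheduler timeout (TDR) work on every ring and
 * wait for it to finish.
 */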
5153 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
5154 {
5155 	int i;
5156 
5157 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5158 		struct amdgpu_ring *ring = adev->rings[i];
5159 
5160 		if (!ring || !ring->sched.thread)
5161 			continue;
5162 
5163 		cancel_delayed_work_sync(&ring->sched.work_tdr);
5164 	}
5165 }
5166 
5167 /**
5168  * amdgpu_pci_error_detected - Called when a PCI error is detected.
5169  * @pdev: PCI device struct
5170  * @state: PCI channel state
5171  *
5172  * Description: Called when a PCI error is detected.
5173  *
5174  * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
5175  */
5176 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5177 {
5178 	struct drm_device *dev = pci_get_drvdata(pdev);
5179 	struct amdgpu_device *adev = drm_to_adev(dev);
5180 	int i;
5181 
5182 	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5183 
5184 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
5185 		DRM_WARN("No support for XGMI hive yet...");
5186 		return PCI_ERS_RESULT_DISCONNECT;
5187 	}
5188 
5189 	switch (state) {
5190 	case pci_channel_io_normal:
5191 		return PCI_ERS_RESULT_CAN_RECOVER;
5192 	/* Fatal error, prepare for slot reset */
5193 	case pci_channel_io_frozen:
5194 		/*
		 * Cancel and wait for all TDRs in progress if we fail to
		 * set adev->in_gpu_reset in amdgpu_device_lock_adev.
		 *
		 * Locking adev->reset_sem will prevent any external access
		 * to the GPU during PCI error recovery.
5200 		 */
5201 		while (!amdgpu_device_lock_adev(adev, NULL))
5202 			amdgpu_cancel_all_tdr(adev);
5203 
5204 		/*
5205 		 * Block any work scheduling as we do for regular GPU reset
5206 		 * for the duration of the recovery
5207 		 */
5208 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5209 			struct amdgpu_ring *ring = adev->rings[i];
5210 
5211 			if (!ring || !ring->sched.thread)
5212 				continue;
5213 
5214 			drm_sched_stop(&ring->sched, NULL);
5215 		}
5216 		atomic_inc(&adev->gpu_reset_counter);
5217 		return PCI_ERS_RESULT_NEED_RESET;
5218 	case pci_channel_io_perm_failure:
5219 		/* Permanent error, prepare for device removal */
5220 		return PCI_ERS_RESULT_DISCONNECT;
5221 	}
5222 
5223 	return PCI_ERS_RESULT_NEED_RESET;
5224 }
5225 
5226 /**
5227  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5228  * @pdev: pointer to PCI device
5229  */
5230 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5231 {
5232 
5233 	DRM_INFO("PCI error: mmio enabled callback!!\n");
5234 
5235 	/* TODO - dump whatever for debugging purposes */
5236 
	/* This is called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, so there is no need to reset the slot.
	 */
5241 
5242 	return PCI_ERS_RESULT_RECOVERED;
5243 }
5244 
5245 /**
5246  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5247  * @pdev: PCI device struct
5248  *
5249  * Description: This routine is called by the pci error recovery
5250  * code after the PCI slot has been reset, just before we
5251  * should resume normal operations.
5252  */
5253 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5254 {
5255 	struct drm_device *dev = pci_get_drvdata(pdev);
5256 	struct amdgpu_device *adev = drm_to_adev(dev);
5257 	int r, i;
5258 	struct amdgpu_reset_context reset_context;
5259 	u32 memsize;
5260 	struct list_head device_list;
5261 
5262 	DRM_INFO("PCI error: slot reset callback!!\n");
5263 
5264 	memset(&reset_context, 0, sizeof(reset_context));
5265 
5266 	INIT_LIST_HEAD(&device_list);
5267 	list_add_tail(&adev->reset_list, &device_list);
5268 
5269 	/* wait for asic to come out of reset */
5270 	msleep(500);
5271 
5272 	/* Restore PCI confspace */
5273 	amdgpu_device_load_pci_state(pdev);
5274 
	/* confirm ASIC came out of reset */
5276 	for (i = 0; i < adev->usec_timeout; i++) {
5277 		memsize = amdgpu_asic_get_config_memsize(adev);
5278 
5279 		if (memsize != 0xffffffff)
5280 			break;
5281 		udelay(1);
5282 	}
5283 	if (memsize == 0xffffffff) {
5284 		r = -ETIME;
5285 		goto out;
5286 	}
5287 
5288 	reset_context.method = AMD_RESET_METHOD_NONE;
5289 	reset_context.reset_req_dev = adev;
5290 	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5291 	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
5292 
5293 	adev->in_pci_err_recovery = true;
5294 	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
5295 	adev->in_pci_err_recovery = false;
5296 	if (r)
5297 		goto out;
5298 
5299 	r = amdgpu_do_asic_reset(&device_list, &reset_context);
5300 
5301 out:
5302 	if (!r) {
5303 		if (amdgpu_device_cache_pci_state(adev->pdev))
5304 			pci_restore_state(adev->pdev);
5305 
5306 		DRM_INFO("PCIe error recovery succeeded\n");
5307 	} else {
5308 		DRM_ERROR("PCIe error recovery failed, err:%d", r);
5309 		amdgpu_device_unlock_adev(adev);
5310 	}
5311 
5312 	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5313 }
5314 
5315 /**
5316  * amdgpu_pci_resume() - resume normal ops after PCI reset
5317  * @pdev: pointer to PCI device
5318  *
5319  * Called when the error recovery driver tells us that its
5320  * OK to resume normal operation.
5321  */
5322 void amdgpu_pci_resume(struct pci_dev *pdev)
5323 {
5324 	struct drm_device *dev = pci_get_drvdata(pdev);
5325 	struct amdgpu_device *adev = drm_to_adev(dev);
5326 	int i;
5327 
5328 
5329 	DRM_INFO("PCI error: resume callback!!\n");
5330 
5331 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5332 		struct amdgpu_ring *ring = adev->rings[i];
5333 
5334 		if (!ring || !ring->sched.thread)
5335 			continue;
5336 
5337 
5338 		drm_sched_resubmit_jobs(&ring->sched);
5339 		drm_sched_start(&ring->sched, true);
5340 	}
5341 
5342 	amdgpu_device_unlock_adev(adev);
5343 }
5344 
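/**
 * amdgpu_device_cache_pci_state - cache the PCI config space of the device
 *
 * @pdev: PCI device struct
 *
 * Saves the current PCI config space and keeps a copy in adev->pci_state
 * so it can be restored after a sudden PCI error or an ASIC reset.
 * Returns true on success, false otherwise.
 */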
5345 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5346 {
5347 	struct drm_device *dev = pci_get_drvdata(pdev);
5348 	struct amdgpu_device *adev = drm_to_adev(dev);
5349 	int r;
5350 
5351 	r = pci_save_state(pdev);
5352 	if (!r) {
5353 		kfree(adev->pci_state);
5354 
5355 		adev->pci_state = pci_store_saved_state(pdev);
5356 
5357 		if (!adev->pci_state) {
5358 			DRM_ERROR("Failed to store PCI saved state");
5359 			return false;
5360 		}
5361 	} else {
5362 		DRM_WARN("Failed to save PCI state, err:%d\n", r);
5363 		return false;
5364 	}
5365 
5366 	return true;
5367 }
5368 
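/**
 * amdgpu_device_load_pci_state - restore the cached PCI config space
 *
 * @pdev: PCI device struct
 *
 * Restores the PCI config space previously cached by
 * amdgpu_device_cache_pci_state().
 * Returns true on success, false otherwise.
 */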
5369 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5370 {
5371 	struct drm_device *dev = pci_get_drvdata(pdev);
5372 	struct amdgpu_device *adev = drm_to_adev(dev);
5373 	int r;
5374 
5375 	if (!adev->pci_state)
5376 		return false;
5377 
5378 	r = pci_load_saved_state(pdev, adev->pci_state);
5379 
5380 	if (!r) {
5381 		pci_restore_state(pdev);
5382 	} else {
5383 		DRM_WARN("Failed to load PCI state, err:%d\n", r);
5384 		return false;
5385 	}
5386 
5387 	return true;
5388 }
5389 
5390 
5391