1 /*
2  * Copyright 2008 Advanced Micro Devices, Inc.
3  * Copyright 2008 Red Hat Inc.
4  * Copyright 2009 Jerome Glisse.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22  * OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors: Dave Airlie
25  *          Alex Deucher
26  *          Jerome Glisse
27  */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33 
34 #include <drm/drm_atomic_helper.h>
35 #include <drm/drm_probe_helper.h>
36 #include <drm/amdgpu_drm.h>
37 #include <linux/vgaarb.h>
38 #include <linux/vga_switcheroo.h>
39 #include <linux/efi.h>
40 #include "amdgpu.h"
41 #include "amdgpu_trace.h"
42 #include "amdgpu_i2c.h"
43 #include "atom.h"
44 #include "amdgpu_atombios.h"
45 #include "amdgpu_atomfirmware.h"
46 #include "amd_pcie.h"
47 #ifdef CONFIG_DRM_AMDGPU_SI
48 #include "si.h"
49 #endif
50 #ifdef CONFIG_DRM_AMDGPU_CIK
51 #include "cik.h"
52 #endif
53 #include "vi.h"
54 #include "soc15.h"
55 #include "nv.h"
56 #include "bif/bif_4_1_d.h"
57 #include <linux/pci.h>
58 #include <linux/firmware.h>
59 #include "amdgpu_vf_error.h"
60 
61 #include "amdgpu_amdkfd.h"
62 #include "amdgpu_pm.h"
63 
64 #include "amdgpu_xgmi.h"
65 #include "amdgpu_ras.h"
66 #include "amdgpu_pmu.h"
67 #include "amdgpu_fru_eeprom.h"
68 #include "amdgpu_reset.h"
69 
70 #include <linux/suspend.h>
71 #include <drm/task_barrier.h>
72 #include <linux/pm_runtime.h>
73 
74 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
75 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
76 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
77 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
78 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
79 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
80 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
81 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
82 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
83 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
84 MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin");
85 
86 #define AMDGPU_RESUME_MS		2000
87 
88 const char *amdgpu_asic_name[] = {
89 	"TAHITI",
90 	"PITCAIRN",
91 	"VERDE",
92 	"OLAND",
93 	"HAINAN",
94 	"BONAIRE",
95 	"KAVERI",
96 	"KABINI",
97 	"HAWAII",
98 	"MULLINS",
99 	"TOPAZ",
100 	"TONGA",
101 	"FIJI",
102 	"CARRIZO",
103 	"STONEY",
104 	"POLARIS10",
105 	"POLARIS11",
106 	"POLARIS12",
107 	"VEGAM",
108 	"VEGA10",
109 	"VEGA12",
110 	"VEGA20",
111 	"RAVEN",
112 	"ARCTURUS",
113 	"RENOIR",
114 	"ALDEBARAN",
115 	"NAVI10",
116 	"NAVI14",
117 	"NAVI12",
118 	"SIENNA_CICHLID",
119 	"NAVY_FLOUNDER",
120 	"VANGOGH",
121 	"DIMGREY_CAVEFISH",
122 	"LAST",
123 };
124 
125 /**
126  * DOC: pcie_replay_count
127  *
128  * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as the sum of the NAKs generated and NAKs received.
132  */
133 
134 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
135 		struct device_attribute *attr, char *buf)
136 {
137 	struct drm_device *ddev = dev_get_drvdata(dev);
138 	struct amdgpu_device *adev = drm_to_adev(ddev);
139 	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
140 
141 	return sysfs_emit(buf, "%llu\n", cnt);
142 }
143 
144 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
145 		amdgpu_device_get_pcie_replay_count, NULL);
146 
147 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
148 
149 /**
150  * DOC: product_name
151  *
152  * The amdgpu driver provides a sysfs API for reporting the product name
 * for the device.
 * The file product_name is used for this and returns the product name
155  * as returned from the FRU.
156  * NOTE: This is only available for certain server cards
157  */
158 
159 static ssize_t amdgpu_device_get_product_name(struct device *dev,
160 		struct device_attribute *attr, char *buf)
161 {
162 	struct drm_device *ddev = dev_get_drvdata(dev);
163 	struct amdgpu_device *adev = drm_to_adev(ddev);
164 
165 	return sysfs_emit(buf, "%s\n", adev->product_name);
166 }
167 
168 static DEVICE_ATTR(product_name, S_IRUGO,
169 		amdgpu_device_get_product_name, NULL);
170 
171 /**
172  * DOC: product_number
173  *
174  * The amdgpu driver provides a sysfs API for reporting the part number
 * for the device.
 * The file product_number is used for this and returns the part number
177  * as returned from the FRU.
178  * NOTE: This is only available for certain server cards
179  */
180 
181 static ssize_t amdgpu_device_get_product_number(struct device *dev,
182 		struct device_attribute *attr, char *buf)
183 {
184 	struct drm_device *ddev = dev_get_drvdata(dev);
185 	struct amdgpu_device *adev = drm_to_adev(ddev);
186 
187 	return sysfs_emit(buf, "%s\n", adev->product_number);
188 }
189 
190 static DEVICE_ATTR(product_number, S_IRUGO,
191 		amdgpu_device_get_product_number, NULL);
192 
193 /**
194  * DOC: serial_number
195  *
196  * The amdgpu driver provides a sysfs API for reporting the serial number
 * for the device.
198  * The file serial_number is used for this and returns the serial number
199  * as returned from the FRU.
200  * NOTE: This is only available for certain server cards
201  */
202 
203 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
204 		struct device_attribute *attr, char *buf)
205 {
206 	struct drm_device *ddev = dev_get_drvdata(dev);
207 	struct amdgpu_device *adev = drm_to_adev(ddev);
208 
209 	return sysfs_emit(buf, "%s\n", adev->serial);
210 }
211 
212 static DEVICE_ATTR(serial_number, S_IRUGO,
213 		amdgpu_device_get_serial_number, NULL);
214 
215 /**
216  * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
217  *
218  * @dev: drm_device pointer
219  *
220  * Returns true if the device is a dGPU with ATPX power control,
 * otherwise returns false.
222  */
223 bool amdgpu_device_supports_px(struct drm_device *dev)
224 {
225 	struct amdgpu_device *adev = drm_to_adev(dev);
226 
227 	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
228 		return true;
229 	return false;
230 }
231 
232 /**
233  * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
234  *
235  * @dev: drm_device pointer
236  *
237  * Returns true if the device is a dGPU with ACPI power control,
 * otherwise returns false.
239  */
240 bool amdgpu_device_supports_boco(struct drm_device *dev)
241 {
242 	struct amdgpu_device *adev = drm_to_adev(dev);
243 
244 	if (adev->has_pr3 ||
245 	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
246 		return true;
247 	return false;
248 }
249 
250 /**
251  * amdgpu_device_supports_baco - Does the device support BACO
252  *
253  * @dev: drm_device pointer
254  *
 * Returns true if the device supports BACO,
 * otherwise returns false.
257  */
258 bool amdgpu_device_supports_baco(struct drm_device *dev)
259 {
260 	struct amdgpu_device *adev = drm_to_adev(dev);
261 
262 	return amdgpu_asic_supports_baco(adev);
263 }
264 
265 /*
266  * VRAM access helper functions
267  */
268 
269 /**
270  * amdgpu_device_vram_access - read/write a buffer in vram
271  *
272  * @adev: amdgpu_device pointer
273  * @pos: offset of the buffer in vram
274  * @buf: virtual address of the buffer in system memory
 * @size: read/write size in bytes; the buffer at @buf must be at least @size bytes
276  * @write: true - write to vram, otherwise - read from vram
277  */
278 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
279 			       uint32_t *buf, size_t size, bool write)
280 {
281 	unsigned long flags;
282 	uint32_t hi = ~0;
283 	uint64_t last;
284 
285 
286 #ifdef CONFIG_64BIT
287 	last = min(pos + size, adev->gmc.visible_vram_size);
288 	if (last > pos) {
289 		void __iomem *addr = adev->mman.aper_base_kaddr + pos;
290 		size_t count = last - pos;
291 
292 		if (write) {
293 			memcpy_toio(addr, buf, count);
294 			mb();
295 			amdgpu_asic_flush_hdp(adev, NULL);
296 		} else {
297 			amdgpu_asic_invalidate_hdp(adev, NULL);
298 			mb();
299 			memcpy_fromio(buf, addr, count);
300 		}
301 
302 		if (count == size)
303 			return;
304 
305 		pos += count;
306 		buf += count / 4;
307 		size -= count;
308 	}
309 #endif
310 
311 	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
312 	for (last = pos + size; pos < last; pos += 4) {
313 		uint32_t tmp = pos >> 31;
314 
315 		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
316 		if (tmp != hi) {
317 			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
318 			hi = tmp;
319 		}
320 		if (write)
321 			WREG32_NO_KIQ(mmMM_DATA, *buf++);
322 		else
323 			*buf++ = RREG32_NO_KIQ(mmMM_DATA);
324 	}
325 	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
326 }
327 
328 /*
329  * register access helper functions.
330  */
331 
332 /* Check if hw access should be skipped because of hotplug or device error */
333 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
334 {
335 	if (adev->in_pci_err_recovery)
336 		return true;
337 
338 #ifdef CONFIG_LOCKDEP
339 	/*
340 	 * This is a bit complicated to understand, so worth a comment. What we assert
341 	 * here is that the GPU reset is not running on another thread in parallel.
342 	 *
	 * For this we trylock the read side of the reset semaphore; if that succeeds
	 * we know that the reset is not running in parallel.
345 	 *
346 	 * If the trylock fails we assert that we are either already holding the read
347 	 * side of the lock or are the reset thread itself and hold the write side of
348 	 * the lock.
349 	 */
350 	if (in_task()) {
351 		if (down_read_trylock(&adev->reset_sem))
352 			up_read(&adev->reset_sem);
353 		else
354 			lockdep_assert_held(&adev->reset_sem);
355 	}
356 #endif
357 	return false;
358 }
359 
360 /**
361  * amdgpu_device_rreg - read a memory mapped IO or indirect register
362  *
363  * @adev: amdgpu_device pointer
364  * @reg: dword aligned register offset
365  * @acc_flags: access flags which require special behavior
366  *
367  * Returns the 32 bit value from the offset specified.
368  */
369 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
370 			    uint32_t reg, uint32_t acc_flags)
371 {
372 	uint32_t ret;
373 
374 	if (amdgpu_device_skip_hw_access(adev))
375 		return 0;
376 
377 	if ((reg * 4) < adev->rmmio_size) {
378 		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
379 		    amdgpu_sriov_runtime(adev) &&
380 		    down_read_trylock(&adev->reset_sem)) {
381 			ret = amdgpu_kiq_rreg(adev, reg);
382 			up_read(&adev->reset_sem);
383 		} else {
384 			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
385 		}
386 	} else {
387 		ret = adev->pcie_rreg(adev, reg * 4);
388 	}
389 
390 	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
391 
392 	return ret;
393 }
394 
395 /*
396  * MMIO register read with bytes helper functions
397  * @offset:bytes offset from MMIO start
398  *
399 */
400 
401 /**
402  * amdgpu_mm_rreg8 - read a memory mapped IO register
403  *
404  * @adev: amdgpu_device pointer
405  * @offset: byte aligned register offset
406  *
407  * Returns the 8 bit value from the offset specified.
408  */
409 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
410 {
411 	if (amdgpu_device_skip_hw_access(adev))
412 		return 0;
413 
414 	if (offset < adev->rmmio_size)
415 		return (readb(adev->rmmio + offset));
416 	BUG();
417 }
418 
419 /*
420  * MMIO register write with bytes helper functions
421  * @offset:bytes offset from MMIO start
422  * @value: the value want to be written to the register
423  *
424 */
425 /**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
427  *
428  * @adev: amdgpu_device pointer
429  * @offset: byte aligned register offset
430  * @value: 8 bit value to write
431  *
432  * Writes the value specified to the offset specified.
433  */
434 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
435 {
436 	if (amdgpu_device_skip_hw_access(adev))
437 		return;
438 
439 	if (offset < adev->rmmio_size)
440 		writeb(value, adev->rmmio + offset);
441 	else
442 		BUG();
443 }
444 
445 /**
446  * amdgpu_device_wreg - write to a memory mapped IO or indirect register
447  *
448  * @adev: amdgpu_device pointer
449  * @reg: dword aligned register offset
450  * @v: 32 bit value to write to the register
451  * @acc_flags: access flags which require special behavior
452  *
453  * Writes the value specified to the offset specified.
454  */
455 void amdgpu_device_wreg(struct amdgpu_device *adev,
456 			uint32_t reg, uint32_t v,
457 			uint32_t acc_flags)
458 {
459 	if (amdgpu_device_skip_hw_access(adev))
460 		return;
461 
462 	if ((reg * 4) < adev->rmmio_size) {
463 		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
464 		    amdgpu_sriov_runtime(adev) &&
465 		    down_read_trylock(&adev->reset_sem)) {
466 			amdgpu_kiq_wreg(adev, reg, v);
467 			up_read(&adev->reset_sem);
468 		} else {
469 			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
470 		}
471 	} else {
472 		adev->pcie_wreg(adev, reg * 4, v);
473 	}
474 
475 	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
476 }
477 
478 /*
479  * amdgpu_mm_wreg_mmio_rlc -  write register either with mmio or with RLC path if in range
480  *
481  * this function is invoked only the debugfs register access
482  * */
483 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
484 			     uint32_t reg, uint32_t v)
485 {
486 	if (amdgpu_device_skip_hw_access(adev))
487 		return;
488 
489 	if (amdgpu_sriov_fullaccess(adev) &&
490 	    adev->gfx.rlc.funcs &&
491 	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
492 		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
493 			return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v, 0);
494 	} else {
495 		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
496 	}
497 }
498 
499 /**
500  * amdgpu_mm_rdoorbell - read a doorbell dword
501  *
502  * @adev: amdgpu_device pointer
503  * @index: doorbell index
504  *
505  * Returns the value in the doorbell aperture at the
506  * requested doorbell index (CIK).
507  */
508 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
509 {
510 	if (amdgpu_device_skip_hw_access(adev))
511 		return 0;
512 
513 	if (index < adev->doorbell.num_doorbells) {
514 		return readl(adev->doorbell.ptr + index);
515 	} else {
516 		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
517 		return 0;
518 	}
519 }
520 
521 /**
522  * amdgpu_mm_wdoorbell - write a doorbell dword
523  *
524  * @adev: amdgpu_device pointer
525  * @index: doorbell index
526  * @v: value to write
527  *
528  * Writes @v to the doorbell aperture at the
529  * requested doorbell index (CIK).
530  */
531 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
532 {
533 	if (amdgpu_device_skip_hw_access(adev))
534 		return;
535 
536 	if (index < adev->doorbell.num_doorbells) {
537 		writel(v, adev->doorbell.ptr + index);
538 	} else {
539 		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
540 	}
541 }
542 
543 /**
544  * amdgpu_mm_rdoorbell64 - read a doorbell Qword
545  *
546  * @adev: amdgpu_device pointer
547  * @index: doorbell index
548  *
549  * Returns the value in the doorbell aperture at the
550  * requested doorbell index (VEGA10+).
551  */
552 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
553 {
554 	if (amdgpu_device_skip_hw_access(adev))
555 		return 0;
556 
557 	if (index < adev->doorbell.num_doorbells) {
558 		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
559 	} else {
560 		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
561 		return 0;
562 	}
563 }
564 
565 /**
566  * amdgpu_mm_wdoorbell64 - write a doorbell Qword
567  *
568  * @adev: amdgpu_device pointer
569  * @index: doorbell index
570  * @v: value to write
571  *
572  * Writes @v to the doorbell aperture at the
573  * requested doorbell index (VEGA10+).
574  */
575 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
576 {
577 	if (amdgpu_device_skip_hw_access(adev))
578 		return;
579 
580 	if (index < adev->doorbell.num_doorbells) {
581 		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
582 	} else {
583 		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
584 	}
585 }
586 
587 /**
588  * amdgpu_device_indirect_rreg - read an indirect register
589  *
590  * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset of the index register
 * @pcie_data: mmio register offset of the data register
593  * @reg_addr: indirect register address to read from
594  *
595  * Returns the value of indirect register @reg_addr
596  */
597 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
598 				u32 pcie_index, u32 pcie_data,
599 				u32 reg_addr)
600 {
601 	unsigned long flags;
602 	u32 r;
603 	void __iomem *pcie_index_offset;
604 	void __iomem *pcie_data_offset;
605 
606 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
607 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
608 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
609 
610 	writel(reg_addr, pcie_index_offset);
611 	readl(pcie_index_offset);
612 	r = readl(pcie_data_offset);
613 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
614 
615 	return r;
616 }
617 
618 /**
 * amdgpu_device_indirect_rreg64 - read a 64 bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset of the index register
 * @pcie_data: mmio register offset of the data register
624  * @reg_addr: indirect register address to read from
625  *
626  * Returns the value of indirect register @reg_addr
627  */
628 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
629 				  u32 pcie_index, u32 pcie_data,
630 				  u32 reg_addr)
631 {
632 	unsigned long flags;
633 	u64 r;
634 	void __iomem *pcie_index_offset;
635 	void __iomem *pcie_data_offset;
636 
637 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
638 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
639 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
640 
641 	/* read low 32 bits */
642 	writel(reg_addr, pcie_index_offset);
643 	readl(pcie_index_offset);
644 	r = readl(pcie_data_offset);
645 	/* read high 32 bits */
646 	writel(reg_addr + 4, pcie_index_offset);
647 	readl(pcie_index_offset);
648 	r |= ((u64)readl(pcie_data_offset) << 32);
649 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
650 
651 	return r;
652 }
653 
654 /**
 * amdgpu_device_indirect_wreg - write an indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset of the index register
 * @pcie_data: mmio register offset of the data register
660  * @reg_addr: indirect register offset
661  * @reg_data: indirect register data
662  *
663  */
664 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
665 				 u32 pcie_index, u32 pcie_data,
666 				 u32 reg_addr, u32 reg_data)
667 {
668 	unsigned long flags;
669 	void __iomem *pcie_index_offset;
670 	void __iomem *pcie_data_offset;
671 
672 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
673 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
674 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
675 
676 	writel(reg_addr, pcie_index_offset);
677 	readl(pcie_index_offset);
678 	writel(reg_data, pcie_data_offset);
679 	readl(pcie_data_offset);
680 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
681 }
682 
683 /**
 * amdgpu_device_indirect_wreg64 - write a 64 bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset of the index register
 * @pcie_data: mmio register offset of the data register
689  * @reg_addr: indirect register offset
690  * @reg_data: indirect register data
691  *
692  */
693 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
694 				   u32 pcie_index, u32 pcie_data,
695 				   u32 reg_addr, u64 reg_data)
696 {
697 	unsigned long flags;
698 	void __iomem *pcie_index_offset;
699 	void __iomem *pcie_data_offset;
700 
701 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
702 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
703 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
704 
705 	/* write low 32 bits */
706 	writel(reg_addr, pcie_index_offset);
707 	readl(pcie_index_offset);
708 	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
709 	readl(pcie_data_offset);
710 	/* write high 32 bits */
711 	writel(reg_addr + 4, pcie_index_offset);
712 	readl(pcie_index_offset);
713 	writel((u32)(reg_data >> 32), pcie_data_offset);
714 	readl(pcie_data_offset);
715 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
716 }
717 
718 /**
719  * amdgpu_invalid_rreg - dummy reg read function
720  *
721  * @adev: amdgpu_device pointer
722  * @reg: offset of register
723  *
724  * Dummy register read function.  Used for register blocks
725  * that certain asics don't have (all asics).
726  * Returns the value in the register.
727  */
728 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
729 {
730 	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
731 	BUG();
732 	return 0;
733 }
734 
735 /**
736  * amdgpu_invalid_wreg - dummy reg write function
737  *
738  * @adev: amdgpu_device pointer
739  * @reg: offset of register
740  * @v: value to write to the register
741  *
 * Dummy register write function.  Used for register blocks
743  * that certain asics don't have (all asics).
744  */
745 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
746 {
747 	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
748 		  reg, v);
749 	BUG();
750 }
751 
752 /**
753  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
754  *
755  * @adev: amdgpu_device pointer
756  * @reg: offset of register
757  *
758  * Dummy register read function.  Used for register blocks
759  * that certain asics don't have (all asics).
760  * Returns the value in the register.
761  */
762 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
763 {
764 	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
765 	BUG();
766 	return 0;
767 }
768 
769 /**
770  * amdgpu_invalid_wreg64 - dummy reg write function
771  *
772  * @adev: amdgpu_device pointer
773  * @reg: offset of register
774  * @v: value to write to the register
775  *
 * Dummy register write function.  Used for register blocks
777  * that certain asics don't have (all asics).
778  */
779 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
780 {
781 	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
782 		  reg, v);
783 	BUG();
784 }
785 
786 /**
787  * amdgpu_block_invalid_rreg - dummy reg read function
788  *
789  * @adev: amdgpu_device pointer
790  * @block: offset of instance
791  * @reg: offset of register
792  *
793  * Dummy register read function.  Used for register blocks
794  * that certain asics don't have (all asics).
795  * Returns the value in the register.
796  */
797 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
798 					  uint32_t block, uint32_t reg)
799 {
800 	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
801 		  reg, block);
802 	BUG();
803 	return 0;
804 }
805 
806 /**
807  * amdgpu_block_invalid_wreg - dummy reg write function
808  *
809  * @adev: amdgpu_device pointer
810  * @block: offset of instance
811  * @reg: offset of register
812  * @v: value to write to the register
813  *
 * Dummy register write function.  Used for register blocks
815  * that certain asics don't have (all asics).
816  */
817 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
818 				      uint32_t block,
819 				      uint32_t reg, uint32_t v)
820 {
821 	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
822 		  reg, block, v);
823 	BUG();
824 }
825 
826 /**
827  * amdgpu_device_asic_init - Wrapper for atom asic_init
828  *
829  * @adev: amdgpu_device pointer
830  *
831  * Does any asic specific work and then calls atom asic init.
832  */
833 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
834 {
835 	amdgpu_asic_pre_asic_init(adev);
836 
837 	return amdgpu_atom_asic_init(adev->mode_info.atom_context);
838 }
839 
840 /**
841  * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
842  *
843  * @adev: amdgpu_device pointer
844  *
845  * Allocates a scratch page of VRAM for use by various things in the
846  * driver.
847  */
848 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
849 {
850 	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
851 				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
852 				       &adev->vram_scratch.robj,
853 				       &adev->vram_scratch.gpu_addr,
854 				       (void **)&adev->vram_scratch.ptr);
855 }
856 
857 /**
858  * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
859  *
860  * @adev: amdgpu_device pointer
861  *
862  * Frees the VRAM scratch page.
863  */
864 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
865 {
866 	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
867 }
868 
869 /**
870  * amdgpu_device_program_register_sequence - program an array of registers.
871  *
872  * @adev: amdgpu_device pointer
873  * @registers: pointer to the register array
874  * @array_size: size of the register array
875  *
 * Programs an array of registers with AND and OR masks.
877  * This is a helper for setting golden registers.
878  */
879 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
880 					     const u32 *registers,
881 					     const u32 array_size)
882 {
883 	u32 tmp, reg, and_mask, or_mask;
884 	int i;
885 
886 	if (array_size % 3)
887 		return;
888 
	for (i = 0; i < array_size; i += 3) {
890 		reg = registers[i + 0];
891 		and_mask = registers[i + 1];
892 		or_mask = registers[i + 2];
893 
894 		if (and_mask == 0xffffffff) {
895 			tmp = or_mask;
896 		} else {
897 			tmp = RREG32(reg);
898 			tmp &= ~and_mask;
899 			if (adev->family >= AMDGPU_FAMILY_AI)
900 				tmp |= (or_mask & and_mask);
901 			else
902 				tmp |= or_mask;
903 		}
904 		WREG32(reg, tmp);
905 	}
906 }
907 
908 /**
909  * amdgpu_device_pci_config_reset - reset the GPU
910  *
911  * @adev: amdgpu_device pointer
912  *
913  * Resets the GPU using the pci config reset sequence.
914  * Only applicable to asics prior to vega10.
915  */
916 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
917 {
918 	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
919 }
920 
921 /**
922  * amdgpu_device_pci_reset - reset the GPU using generic PCI means
923  *
924  * @adev: amdgpu_device pointer
925  *
926  * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
927  */
928 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
929 {
930 	return pci_reset_function(adev->pdev);
931 }
932 
933 /*
934  * GPU doorbell aperture helpers function.
935  */
936 /**
937  * amdgpu_device_doorbell_init - Init doorbell driver information.
938  *
939  * @adev: amdgpu_device pointer
940  *
941  * Init doorbell driver information (CIK)
942  * Returns 0 on success, error on failure.
943  */
944 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
945 {
946 
947 	/* No doorbell on SI hardware generation */
948 	if (adev->asic_type < CHIP_BONAIRE) {
949 		adev->doorbell.base = 0;
950 		adev->doorbell.size = 0;
951 		adev->doorbell.num_doorbells = 0;
952 		adev->doorbell.ptr = NULL;
953 		return 0;
954 	}
955 
956 	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
957 		return -EINVAL;
958 
959 	amdgpu_asic_init_doorbell_index(adev);
960 
961 	/* doorbell bar mapping */
962 	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
963 	adev->doorbell.size = pci_resource_len(adev->pdev, 2);
964 
965 	adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
966 					     adev->doorbell_index.max_assignment+1);
967 	if (adev->doorbell.num_doorbells == 0)
968 		return -EINVAL;
969 
	/* For Vega, reserve and map two pages on the doorbell BAR since the SDMA
	 * paging queue doorbell uses the second page. The
	 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
	 * doorbells are in the first page. So with the paging queue enabled,
	 * num_doorbells must be extended by one page (0x400 dwords).
	 */
976 	if (adev->asic_type >= CHIP_VEGA10)
977 		adev->doorbell.num_doorbells += 0x400;
978 
979 	adev->doorbell.ptr = ioremap(adev->doorbell.base,
980 				     adev->doorbell.num_doorbells *
981 				     sizeof(u32));
982 	if (adev->doorbell.ptr == NULL)
983 		return -ENOMEM;
984 
985 	return 0;
986 }
987 
988 /**
989  * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
990  *
991  * @adev: amdgpu_device pointer
992  *
993  * Tear down doorbell driver information (CIK)
994  */
995 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
996 {
997 	iounmap(adev->doorbell.ptr);
998 	adev->doorbell.ptr = NULL;
999 }
1000 
1001 
1002 
1003 /*
1004  * amdgpu_device_wb_*()
1005  * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
1007  */
1008 
1009 /**
1010  * amdgpu_device_wb_fini - Disable Writeback and free memory
1011  *
1012  * @adev: amdgpu_device pointer
1013  *
1014  * Disables Writeback and frees the Writeback memory (all asics).
1015  * Used at driver shutdown.
1016  */
1017 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1018 {
1019 	if (adev->wb.wb_obj) {
1020 		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1021 				      &adev->wb.gpu_addr,
1022 				      (void **)&adev->wb.wb);
1023 		adev->wb.wb_obj = NULL;
1024 	}
1025 }
1026 
1027 /**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1029  *
1030  * @adev: amdgpu_device pointer
1031  *
1032  * Initializes writeback and allocates writeback memory (all asics).
1033  * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
1035  */
1036 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1037 {
1038 	int r;
1039 
1040 	if (adev->wb.wb_obj == NULL) {
1041 		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1042 		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1043 					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1044 					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
1045 					    (void **)&adev->wb.wb);
1046 		if (r) {
1047 			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1048 			return r;
1049 		}
1050 
1051 		adev->wb.num_wb = AMDGPU_MAX_WB;
1052 		memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1053 
1054 		/* clear wb memory */
1055 		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1056 	}
1057 
1058 	return 0;
1059 }
1060 
1061 /**
1062  * amdgpu_device_wb_get - Allocate a wb entry
1063  *
1064  * @adev: amdgpu_device pointer
1065  * @wb: wb index
1066  *
1067  * Allocate a wb slot for use by the driver (all asics).
1068  * Returns 0 on success or -EINVAL on failure.
1069  */
1070 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1071 {
1072 	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1073 
1074 	if (offset < adev->wb.num_wb) {
1075 		__set_bit(offset, adev->wb.used);
1076 		*wb = offset << 3; /* convert to dw offset */
1077 		return 0;
1078 	} else {
1079 		return -EINVAL;
1080 	}
1081 }
1082 
1083 /**
1084  * amdgpu_device_wb_free - Free a wb entry
1085  *
1086  * @adev: amdgpu_device pointer
1087  * @wb: wb index
1088  *
1089  * Free a wb slot allocated for use by the driver (all asics)
1090  */
1091 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1092 {
1093 	wb >>= 3;
1094 	if (wb < adev->wb.num_wb)
1095 		__clear_bit(wb, adev->wb.used);
1096 }
1097 
1098 /**
1099  * amdgpu_device_resize_fb_bar - try to resize FB BAR
1100  *
1101  * @adev: amdgpu_device pointer
1102  *
1103  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
1105  * driver loading by returning -ENODEV.
1106  */
1107 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1108 {
1109 	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1110 	struct pci_bus *root;
1111 	struct resource *res;
1112 	unsigned i;
1113 	u16 cmd;
1114 	int r;
1115 
1116 	/* Bypass for VF */
1117 	if (amdgpu_sriov_vf(adev))
1118 		return 0;
1119 
1120 	/* skip if the bios has already enabled large BAR */
1121 	if (adev->gmc.real_vram_size &&
1122 	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1123 		return 0;
1124 
1125 	/* Check if the root BUS has 64bit memory resources */
1126 	root = adev->pdev->bus;
1127 	while (root->parent)
1128 		root = root->parent;
1129 
1130 	pci_bus_for_each_resource(root, res, i) {
1131 		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1132 		    res->start > 0x100000000ull)
1133 			break;
1134 	}
1135 
1136 	/* Trying to resize is pointless without a root hub window above 4GB */
1137 	if (!res)
1138 		return 0;
1139 
1140 	/* Limit the BAR size to what is available */
1141 	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1142 			rbar_size);
1143 
1144 	/* Disable memory decoding while we change the BAR addresses and size */
1145 	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1146 	pci_write_config_word(adev->pdev, PCI_COMMAND,
1147 			      cmd & ~PCI_COMMAND_MEMORY);
1148 
1149 	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
1150 	amdgpu_device_doorbell_fini(adev);
1151 	if (adev->asic_type >= CHIP_BONAIRE)
1152 		pci_release_resource(adev->pdev, 2);
1153 
1154 	pci_release_resource(adev->pdev, 0);
1155 
1156 	r = pci_resize_resource(adev->pdev, 0, rbar_size);
1157 	if (r == -ENOSPC)
1158 		DRM_INFO("Not enough PCI address space for a large BAR.");
1159 	else if (r && r != -ENOTSUPP)
1160 		DRM_ERROR("Problem resizing BAR0 (%d).", r);
1161 
1162 	pci_assign_unassigned_bus_resources(adev->pdev->bus);
1163 
1164 	/* When the doorbell or fb BAR isn't available we have no chance of
1165 	 * using the device.
1166 	 */
1167 	r = amdgpu_device_doorbell_init(adev);
1168 	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1169 		return -ENODEV;
1170 
1171 	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1172 
1173 	return 0;
1174 }
1175 
1176 /*
1177  * GPU helpers function.
1178  */
1179 /**
 * amdgpu_device_need_post - check if the hw needs post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or if post is needed after a hw reset is performed.
 * Returns true if post is needed, false if not.
1187  */
1188 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1189 {
1190 	uint32_t reg;
1191 
1192 	if (amdgpu_sriov_vf(adev))
1193 		return false;
1194 
1195 	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In the whole-GPU pass-through virtualization case, after a VM
		 * reboot some old SMC firmware still needs the driver to do vPost, otherwise
		 * the GPU hangs. SMC firmware versions above 22.15 don't have this flaw, so
		 * we force vPost for SMC versions below 22.15.
1200 		 */
1201 		if (adev->asic_type == CHIP_FIJI) {
1202 			int err;
1203 			uint32_t fw_ver;
1204 			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if an error occurred */
1206 			if (err)
1207 				return true;
1208 
1209 			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1210 			if (fw_ver < 0x00160e00)
1211 				return true;
1212 		}
1213 	}
1214 
1215 	/* Don't post if we need to reset whole hive on init */
1216 	if (adev->gmc.xgmi.pending_reset)
1217 		return false;
1218 
1219 	if (adev->has_hw_reset) {
1220 		adev->has_hw_reset = false;
1221 		return true;
1222 	}
1223 
1224 	/* bios scratch used on CIK+ */
1225 	if (adev->asic_type >= CHIP_BONAIRE)
1226 		return amdgpu_atombios_scratch_need_asic_init(adev);
1227 
1228 	/* check MEM_SIZE for older asics */
1229 	reg = amdgpu_asic_get_config_memsize(adev);
1230 
1231 	if ((reg != 0) && (reg != 0xffffffff))
1232 		return false;
1233 
1234 	return true;
1235 }
1236 
1237 /* if we get transitioned to only one device, take VGA back */
1238 /**
1239  * amdgpu_device_vga_set_decode - enable/disable vga decode
1240  *
1241  * @cookie: amdgpu_device pointer
1242  * @state: enable/disable vga decode
1243  *
1244  * Enable/disable vga decode (all asics).
1245  * Returns VGA resource flags.
1246  */
1247 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
1248 {
1249 	struct amdgpu_device *adev = cookie;
1250 	amdgpu_asic_set_vga_state(adev, state);
1251 	if (state)
1252 		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1253 		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1254 	else
1255 		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1256 }
1257 
1258 /**
1259  * amdgpu_device_check_block_size - validate the vm block size
1260  *
1261  * @adev: amdgpu_device pointer
1262  *
1263  * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in the page table versus the page directory,
1265  * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1266  * page table and the remaining bits are in the page directory.
1267  */
1268 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1269 {
1270 	/* defines number of bits in page table versus page directory,
1271 	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1272 	 * page table and the remaining bits are in the page directory */
1273 	if (amdgpu_vm_block_size == -1)
1274 		return;
1275 
1276 	if (amdgpu_vm_block_size < 9) {
1277 		dev_warn(adev->dev, "VM page table size (%d) too small\n",
1278 			 amdgpu_vm_block_size);
1279 		amdgpu_vm_block_size = -1;
1280 	}
1281 }
1282 
1283 /**
1284  * amdgpu_device_check_vm_size - validate the vm size
1285  *
1286  * @adev: amdgpu_device pointer
1287  *
1288  * Validates the vm size in GB specified via module parameter.
1289  * The VM size is the size of the GPU virtual memory space in GB.
1290  */
1291 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1292 {
1293 	/* no need to check the default value */
1294 	if (amdgpu_vm_size == -1)
1295 		return;
1296 
1297 	if (amdgpu_vm_size < 1) {
1298 		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1299 			 amdgpu_vm_size);
1300 		amdgpu_vm_size = -1;
1301 	}
1302 }
1303 
1304 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1305 {
1306 	struct sysinfo si;
1307 	bool is_os_64 = (sizeof(void *) == 8);
1308 	uint64_t total_memory;
1309 	uint64_t dram_size_seven_GB = 0x1B8000000;
1310 	uint64_t dram_size_three_GB = 0xB8000000;
1311 
1312 	if (amdgpu_smu_memory_pool_size == 0)
1313 		return;
1314 
1315 	if (!is_os_64) {
1316 		DRM_WARN("Not 64-bit OS, feature not supported\n");
1317 		goto def_value;
1318 	}
1319 	si_meminfo(&si);
1320 	total_memory = (uint64_t)si.totalram * si.mem_unit;
1321 
1322 	if ((amdgpu_smu_memory_pool_size == 1) ||
1323 		(amdgpu_smu_memory_pool_size == 2)) {
1324 		if (total_memory < dram_size_three_GB)
1325 			goto def_value1;
1326 	} else if ((amdgpu_smu_memory_pool_size == 4) ||
1327 		(amdgpu_smu_memory_pool_size == 8)) {
1328 		if (total_memory < dram_size_seven_GB)
1329 			goto def_value1;
1330 	} else {
1331 		DRM_WARN("Smu memory pool size not supported\n");
1332 		goto def_value;
1333 	}
1334 	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1335 
1336 	return;
1337 
1338 def_value1:
	DRM_WARN("Not enough system memory\n");
1340 def_value:
1341 	adev->pm.smu_prv_buffer_size = 0;
1342 }
1343 
1344 /**
1345  * amdgpu_device_check_arguments - validate module params
1346  *
1347  * @adev: amdgpu_device pointer
1348  *
1349  * Validates certain module parameters and updates
1350  * the associated values used by the driver (all asics).
1351  */
1352 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1353 {
1354 	if (amdgpu_sched_jobs < 4) {
1355 		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1356 			 amdgpu_sched_jobs);
1357 		amdgpu_sched_jobs = 4;
1358 	} else if (!is_power_of_2(amdgpu_sched_jobs)){
1359 		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1360 			 amdgpu_sched_jobs);
1361 		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1362 	}
1363 
1364 	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1365 		/* gart size must be greater or equal to 32M */
1366 		dev_warn(adev->dev, "gart size (%d) too small\n",
1367 			 amdgpu_gart_size);
1368 		amdgpu_gart_size = -1;
1369 	}
1370 
1371 	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1372 		/* gtt size must be greater or equal to 32M */
1373 		dev_warn(adev->dev, "gtt size (%d) too small\n",
1374 				 amdgpu_gtt_size);
1375 		amdgpu_gtt_size = -1;
1376 	}
1377 
1378 	/* valid range is between 4 and 9 inclusive */
1379 	if (amdgpu_vm_fragment_size != -1 &&
1380 	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1381 		dev_warn(adev->dev, "valid range is between 4 and 9\n");
1382 		amdgpu_vm_fragment_size = -1;
1383 	}
1384 
1385 	if (amdgpu_sched_hw_submission < 2) {
1386 		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1387 			 amdgpu_sched_hw_submission);
1388 		amdgpu_sched_hw_submission = 2;
1389 	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1390 		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1391 			 amdgpu_sched_hw_submission);
1392 		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1393 	}
1394 
1395 	amdgpu_device_check_smu_prv_buffer_size(adev);
1396 
1397 	amdgpu_device_check_vm_size(adev);
1398 
1399 	amdgpu_device_check_block_size(adev);
1400 
1401 	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1402 
1403 	amdgpu_gmc_tmz_set(adev);
1404 
1405 	amdgpu_gmc_noretry_set(adev);
1406 
1407 	return 0;
1408 }
1409 
1410 /**
1411  * amdgpu_switcheroo_set_state - set switcheroo state
1412  *
1413  * @pdev: pci dev pointer
1414  * @state: vga_switcheroo state
1415  *
 * Callback for the switcheroo driver.  Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
1418  */
1419 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1420 					enum vga_switcheroo_state state)
1421 {
1422 	struct drm_device *dev = pci_get_drvdata(pdev);
1423 	int r;
1424 
1425 	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
1426 		return;
1427 
1428 	if (state == VGA_SWITCHEROO_ON) {
1429 		pr_info("switched on\n");
1430 		/* don't suspend or resume card normally */
1431 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1432 
1433 		pci_set_power_state(pdev, PCI_D0);
1434 		amdgpu_device_load_pci_state(pdev);
1435 		r = pci_enable_device(pdev);
1436 		if (r)
1437 			DRM_WARN("pci_enable_device failed (%d)\n", r);
1438 		amdgpu_device_resume(dev, true);
1439 
1440 		dev->switch_power_state = DRM_SWITCH_POWER_ON;
1441 	} else {
1442 		pr_info("switched off\n");
1443 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1444 		amdgpu_device_suspend(dev, true);
1445 		amdgpu_device_cache_pci_state(pdev);
1446 		/* Shut down the device */
1447 		pci_disable_device(pdev);
1448 		pci_set_power_state(pdev, PCI_D3cold);
1449 		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1450 	}
1451 }
1452 
1453 /**
1454  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1455  *
1456  * @pdev: pci dev pointer
1457  *
 * Callback for the switcheroo driver.  Check if the switcheroo
1459  * state can be changed.
1460  * Returns true if the state can be changed, false if not.
1461  */
1462 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1463 {
1464 	struct drm_device *dev = pci_get_drvdata(pdev);
1465 
1466 	/*
1467 	* FIXME: open_count is protected by drm_global_mutex but that would lead to
1468 	* locking inversion with the driver load path. And the access here is
1469 	* completely racy anyway. So don't bother with locking for now.
1470 	*/
1471 	return atomic_read(&dev->open_count) == 0;
1472 }
1473 
1474 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1475 	.set_gpu_state = amdgpu_switcheroo_set_state,
1476 	.reprobe = NULL,
1477 	.can_switch = amdgpu_switcheroo_can_switch,
1478 };
1479 
1480 /**
1481  * amdgpu_device_ip_set_clockgating_state - set the CG state
1482  *
1483  * @dev: amdgpu_device pointer
1484  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1485  * @state: clockgating state (gate or ungate)
1486  *
1487  * Sets the requested clockgating state for all instances of
1488  * the hardware IP specified.
1489  * Returns the error code from the last instance.
1490  */
1491 int amdgpu_device_ip_set_clockgating_state(void *dev,
1492 					   enum amd_ip_block_type block_type,
1493 					   enum amd_clockgating_state state)
1494 {
1495 	struct amdgpu_device *adev = dev;
1496 	int i, r = 0;
1497 
1498 	for (i = 0; i < adev->num_ip_blocks; i++) {
1499 		if (!adev->ip_blocks[i].status.valid)
1500 			continue;
1501 		if (adev->ip_blocks[i].version->type != block_type)
1502 			continue;
1503 		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1504 			continue;
1505 		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1506 			(void *)adev, state);
1507 		if (r)
1508 			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1509 				  adev->ip_blocks[i].version->funcs->name, r);
1510 	}
1511 	return r;
1512 }
1513 
1514 /**
1515  * amdgpu_device_ip_set_powergating_state - set the PG state
1516  *
1517  * @dev: amdgpu_device pointer
1518  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1519  * @state: powergating state (gate or ungate)
1520  *
1521  * Sets the requested powergating state for all instances of
1522  * the hardware IP specified.
1523  * Returns the error code from the last instance.
1524  */
1525 int amdgpu_device_ip_set_powergating_state(void *dev,
1526 					   enum amd_ip_block_type block_type,
1527 					   enum amd_powergating_state state)
1528 {
1529 	struct amdgpu_device *adev = dev;
1530 	int i, r = 0;
1531 
1532 	for (i = 0; i < adev->num_ip_blocks; i++) {
1533 		if (!adev->ip_blocks[i].status.valid)
1534 			continue;
1535 		if (adev->ip_blocks[i].version->type != block_type)
1536 			continue;
1537 		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1538 			continue;
1539 		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1540 			(void *)adev, state);
1541 		if (r)
1542 			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1543 				  adev->ip_blocks[i].version->funcs->name, r);
1544 	}
1545 	return r;
1546 }
1547 
1548 /**
1549  * amdgpu_device_ip_get_clockgating_state - get the CG state
1550  *
1551  * @adev: amdgpu_device pointer
1552  * @flags: clockgating feature flags
1553  *
1554  * Walks the list of IPs on the device and updates the clockgating
1555  * flags for each IP.
1556  * Updates @flags with the feature flags for each hardware IP where
1557  * clockgating is enabled.
1558  */
1559 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1560 					    u32 *flags)
1561 {
1562 	int i;
1563 
1564 	for (i = 0; i < adev->num_ip_blocks; i++) {
1565 		if (!adev->ip_blocks[i].status.valid)
1566 			continue;
1567 		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1568 			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1569 	}
1570 }
1571 
1572 /**
1573  * amdgpu_device_ip_wait_for_idle - wait for idle
1574  *
1575  * @adev: amdgpu_device pointer
1576  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1577  *
 * Waits for the requested hardware IP to be idle.
1579  * Returns 0 for success or a negative error code on failure.
1580  */
1581 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1582 				   enum amd_ip_block_type block_type)
1583 {
1584 	int i, r;
1585 
1586 	for (i = 0; i < adev->num_ip_blocks; i++) {
1587 		if (!adev->ip_blocks[i].status.valid)
1588 			continue;
1589 		if (adev->ip_blocks[i].version->type == block_type) {
1590 			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1591 			if (r)
1592 				return r;
1593 			break;
1594 		}
1595 	}
1596 	return 0;
1597 
1598 }
1599 
1600 /**
1601  * amdgpu_device_ip_is_idle - is the hardware IP idle
1602  *
1603  * @adev: amdgpu_device pointer
1604  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1605  *
1606  * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
1608  */
1609 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1610 			      enum amd_ip_block_type block_type)
1611 {
1612 	int i;
1613 
1614 	for (i = 0; i < adev->num_ip_blocks; i++) {
1615 		if (!adev->ip_blocks[i].status.valid)
1616 			continue;
1617 		if (adev->ip_blocks[i].version->type == block_type)
1618 			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1619 	}
1620 	return true;
1621 
1622 }
1623 
1624 /**
1625  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1626  *
1627  * @adev: amdgpu_device pointer
1628  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1629  *
1630  * Returns a pointer to the hardware IP block structure
1631  * if it exists for the asic, otherwise NULL.
1632  */
1633 struct amdgpu_ip_block *
1634 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1635 			      enum amd_ip_block_type type)
1636 {
1637 	int i;
1638 
1639 	for (i = 0; i < adev->num_ip_blocks; i++)
1640 		if (adev->ip_blocks[i].version->type == type)
1641 			return &adev->ip_blocks[i];
1642 
1643 	return NULL;
1644 }
1645 
1646 /**
1647  * amdgpu_device_ip_block_version_cmp
1648  *
1649  * @adev: amdgpu_device pointer
1650  * @type: enum amd_ip_block_type
1651  * @major: major version
1652  * @minor: minor version
1653  *
 * Returns 0 if the IP block version is equal to or greater than the
 * requested version, 1 if it is smaller or the IP block doesn't exist.
1656  */
1657 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1658 				       enum amd_ip_block_type type,
1659 				       u32 major, u32 minor)
1660 {
1661 	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1662 
1663 	if (ip_block && ((ip_block->version->major > major) ||
1664 			((ip_block->version->major == major) &&
1665 			(ip_block->version->minor >= minor))))
1666 		return 0;
1667 
1668 	return 1;
1669 }
1670 
1671 /**
1672  * amdgpu_device_ip_block_add
1673  *
1674  * @adev: amdgpu_device pointer
1675  * @ip_block_version: pointer to the IP to add
1676  *
1677  * Adds the IP block driver information to the collection of IPs
1678  * on the asic.
1679  */
1680 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1681 			       const struct amdgpu_ip_block_version *ip_block_version)
1682 {
1683 	if (!ip_block_version)
1684 		return -EINVAL;
1685 
1686 	switch (ip_block_version->type) {
1687 	case AMD_IP_BLOCK_TYPE_VCN:
1688 		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
1689 			return 0;
1690 		break;
1691 	case AMD_IP_BLOCK_TYPE_JPEG:
1692 		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
1693 			return 0;
1694 		break;
1695 	default:
1696 		break;
1697 	}
1698 
1699 	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1700 		  ip_block_version->funcs->name);
1701 
1702 	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1703 
1704 	return 0;
1705 }
1706 
1707 /**
1708  * amdgpu_device_enable_virtual_display - enable virtual display feature
1709  *
1710  * @adev: amdgpu_device pointer
1711  *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display.  This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
1718  */
1719 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1720 {
1721 	adev->enable_virtual_display = false;
1722 
1723 	if (amdgpu_virtual_display) {
1724 		const char *pci_address_name = pci_name(adev->pdev);
1725 		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1726 
1727 		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1728 		pciaddstr_tmp = pciaddstr;
1729 		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1730 			pciaddname = strsep(&pciaddname_tmp, ",");
1731 			if (!strcmp("all", pciaddname)
1732 			    || !strcmp(pci_address_name, pciaddname)) {
1733 				long num_crtc;
1734 				int res = -1;
1735 
1736 				adev->enable_virtual_display = true;
1737 
1738 				if (pciaddname_tmp)
1739 					res = kstrtol(pciaddname_tmp, 10,
1740 						      &num_crtc);
1741 
1742 				if (!res) {
1743 					if (num_crtc < 1)
1744 						num_crtc = 1;
1745 					if (num_crtc > 6)
1746 						num_crtc = 6;
1747 					adev->mode_info.num_crtc = num_crtc;
1748 				} else {
1749 					adev->mode_info.num_crtc = 1;
1750 				}
1751 				break;
1752 			}
1753 		}
1754 
1755 		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1756 			 amdgpu_virtual_display, pci_address_name,
1757 			 adev->enable_virtual_display, adev->mode_info.num_crtc);
1758 
1759 		kfree(pciaddstr);
1760 	}
1761 }
1762 
1763 /**
1764  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1765  *
1766  * @adev: amdgpu_device pointer
1767  *
1768  * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
1770  * the asic.
1771  * Returns 0 on success, -EINVAL on failure.
1772  */
1773 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1774 {
1775 	const char *chip_name;
1776 	char fw_name[40];
1777 	int err;
1778 	const struct gpu_info_firmware_header_v1_0 *hdr;
1779 
1780 	adev->firmware.gpu_info_fw = NULL;
1781 
1782 	if (adev->mman.discovery_bin) {
1783 		amdgpu_discovery_get_gfx_info(adev);
1784 
1785 		/*
1786 		 * FIXME: The bounding box is still needed by Navi12, so
		 * temporarily read it from gpu_info firmware. Should be dropped
1788 		 * when DAL no longer needs it.
1789 		 */
1790 		if (adev->asic_type != CHIP_NAVI12)
1791 			return 0;
1792 	}
1793 
1794 	switch (adev->asic_type) {
1795 #ifdef CONFIG_DRM_AMDGPU_SI
1796 	case CHIP_VERDE:
1797 	case CHIP_TAHITI:
1798 	case CHIP_PITCAIRN:
1799 	case CHIP_OLAND:
1800 	case CHIP_HAINAN:
1801 #endif
1802 #ifdef CONFIG_DRM_AMDGPU_CIK
1803 	case CHIP_BONAIRE:
1804 	case CHIP_HAWAII:
1805 	case CHIP_KAVERI:
1806 	case CHIP_KABINI:
1807 	case CHIP_MULLINS:
1808 #endif
1809 	case CHIP_TOPAZ:
1810 	case CHIP_TONGA:
1811 	case CHIP_FIJI:
1812 	case CHIP_POLARIS10:
1813 	case CHIP_POLARIS11:
1814 	case CHIP_POLARIS12:
1815 	case CHIP_VEGAM:
1816 	case CHIP_CARRIZO:
1817 	case CHIP_STONEY:
1818 	case CHIP_VEGA20:
1819 	case CHIP_ALDEBARAN:
1820 	case CHIP_SIENNA_CICHLID:
1821 	case CHIP_NAVY_FLOUNDER:
1822 	case CHIP_DIMGREY_CAVEFISH:
1823 	default:
1824 		return 0;
1825 	case CHIP_VEGA10:
1826 		chip_name = "vega10";
1827 		break;
1828 	case CHIP_VEGA12:
1829 		chip_name = "vega12";
1830 		break;
1831 	case CHIP_RAVEN:
1832 		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1833 			chip_name = "raven2";
1834 		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1835 			chip_name = "picasso";
1836 		else
1837 			chip_name = "raven";
1838 		break;
1839 	case CHIP_ARCTURUS:
1840 		chip_name = "arcturus";
1841 		break;
1842 	case CHIP_RENOIR:
1843 		if (adev->apu_flags & AMD_APU_IS_RENOIR)
1844 			chip_name = "renoir";
1845 		else
1846 			chip_name = "green_sardine";
1847 		break;
1848 	case CHIP_NAVI10:
1849 		chip_name = "navi10";
1850 		break;
1851 	case CHIP_NAVI14:
1852 		chip_name = "navi14";
1853 		break;
1854 	case CHIP_NAVI12:
1855 		chip_name = "navi12";
1856 		break;
1857 	case CHIP_VANGOGH:
1858 		chip_name = "vangogh";
1859 		break;
1860 	}
1861 
1862 	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1863 	err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1864 	if (err) {
1865 		dev_err(adev->dev,
1866 			"Failed to load gpu_info firmware \"%s\"\n",
1867 			fw_name);
1868 		goto out;
1869 	}
1870 	err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1871 	if (err) {
1872 		dev_err(adev->dev,
1873 			"Failed to validate gpu_info firmware \"%s\"\n",
1874 			fw_name);
1875 		goto out;
1876 	}
1877 
1878 	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1879 	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1880 
1881 	switch (hdr->version_major) {
1882 	case 1:
1883 	{
1884 		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1885 			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1886 								le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1887 
1888 		/*
1889 		 * Should be dropped when DAL no longer needs it.
1890 		 */
1891 		if (adev->asic_type == CHIP_NAVI12)
1892 			goto parse_soc_bounding_box;
1893 
1894 		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1895 		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1896 		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1897 		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1898 		adev->gfx.config.max_texture_channel_caches =
1899 			le32_to_cpu(gpu_info_fw->gc_num_tccs);
1900 		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1901 		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1902 		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1903 		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1904 		adev->gfx.config.double_offchip_lds_buf =
1905 			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1906 		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1907 		adev->gfx.cu_info.max_waves_per_simd =
1908 			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1909 		adev->gfx.cu_info.max_scratch_slots_per_cu =
1910 			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1911 		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1912 		if (hdr->version_minor >= 1) {
1913 			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1914 				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1915 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1916 			adev->gfx.config.num_sc_per_sh =
1917 				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1918 			adev->gfx.config.num_packer_per_sc =
1919 				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1920 		}
1921 
1922 parse_soc_bounding_box:
1923 		/*
1924 		 * SOC bounding box info is not integrated into the discovery table,
1925 		 * so we always need to parse it from the gpu_info firmware when needed.
1926 		 */
1927 		if (hdr->version_minor == 2) {
1928 			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1929 				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1930 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1931 			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1932 		}
1933 		break;
1934 	}
1935 	default:
1936 		dev_err(adev->dev,
1937 			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1938 		err = -EINVAL;
1939 		goto out;
1940 	}
1941 out:
1942 	return err;
1943 }
1944 
1945 /**
1946  * amdgpu_device_ip_early_init - run early init for hardware IPs
1947  *
1948  * @adev: amdgpu_device pointer
1949  *
1950  * Early initialization pass for hardware IPs.  The hardware IPs that make
1951  * up each asic are discovered and each IP's early_init callback is run.  This
1952  * is the first stage in initializing the asic.
1953  * Returns 0 on success, negative error code on failure.
1954  */
1955 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
1956 {
1957 	int i, r;
1958 
1959 	amdgpu_device_enable_virtual_display(adev);
1960 
1961 	if (amdgpu_sriov_vf(adev)) {
1962 		r = amdgpu_virt_request_full_gpu(adev, true);
1963 		if (r)
1964 			return r;
1965 	}
1966 
1967 	switch (adev->asic_type) {
1968 #ifdef CONFIG_DRM_AMDGPU_SI
1969 	case CHIP_VERDE:
1970 	case CHIP_TAHITI:
1971 	case CHIP_PITCAIRN:
1972 	case CHIP_OLAND:
1973 	case CHIP_HAINAN:
1974 		adev->family = AMDGPU_FAMILY_SI;
1975 		r = si_set_ip_blocks(adev);
1976 		if (r)
1977 			return r;
1978 		break;
1979 #endif
1980 #ifdef CONFIG_DRM_AMDGPU_CIK
1981 	case CHIP_BONAIRE:
1982 	case CHIP_HAWAII:
1983 	case CHIP_KAVERI:
1984 	case CHIP_KABINI:
1985 	case CHIP_MULLINS:
1986 		if (adev->flags & AMD_IS_APU)
1987 			adev->family = AMDGPU_FAMILY_KV;
1988 		else
1989 			adev->family = AMDGPU_FAMILY_CI;
1990 
1991 		r = cik_set_ip_blocks(adev);
1992 		if (r)
1993 			return r;
1994 		break;
1995 #endif
1996 	case CHIP_TOPAZ:
1997 	case CHIP_TONGA:
1998 	case CHIP_FIJI:
1999 	case CHIP_POLARIS10:
2000 	case CHIP_POLARIS11:
2001 	case CHIP_POLARIS12:
2002 	case CHIP_VEGAM:
2003 	case CHIP_CARRIZO:
2004 	case CHIP_STONEY:
2005 		if (adev->flags & AMD_IS_APU)
2006 			adev->family = AMDGPU_FAMILY_CZ;
2007 		else
2008 			adev->family = AMDGPU_FAMILY_VI;
2009 
2010 		r = vi_set_ip_blocks(adev);
2011 		if (r)
2012 			return r;
2013 		break;
2014 	case CHIP_VEGA10:
2015 	case CHIP_VEGA12:
2016 	case CHIP_VEGA20:
2017 	case CHIP_RAVEN:
2018 	case CHIP_ARCTURUS:
2019 	case CHIP_RENOIR:
2020 	case CHIP_ALDEBARAN:
2021 		if (adev->flags & AMD_IS_APU)
2022 			adev->family = AMDGPU_FAMILY_RV;
2023 		else
2024 			adev->family = AMDGPU_FAMILY_AI;
2025 
2026 		r = soc15_set_ip_blocks(adev);
2027 		if (r)
2028 			return r;
2029 		break;
2030 	case CHIP_NAVI10:
2031 	case CHIP_NAVI14:
2032 	case CHIP_NAVI12:
2033 	case CHIP_SIENNA_CICHLID:
2034 	case CHIP_NAVY_FLOUNDER:
2035 	case CHIP_DIMGREY_CAVEFISH:
2036 	case CHIP_VANGOGH:
2037 		if (adev->asic_type == CHIP_VANGOGH)
2038 			adev->family = AMDGPU_FAMILY_VGH;
2039 		else
2040 			adev->family = AMDGPU_FAMILY_NV;
2041 
2042 		r = nv_set_ip_blocks(adev);
2043 		if (r)
2044 			return r;
2045 		break;
2046 	default:
2047 		/* FIXME: not supported yet */
2048 		return -EINVAL;
2049 	}
2050 
2051 	amdgpu_amdkfd_device_probe(adev);
2052 
2053 	adev->pm.pp_feature = amdgpu_pp_feature_mask;
2054 	if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2055 		adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2056 	if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2057 		adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2058 
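	/*
	 * amdgpu_ip_block_mask is a debug module parameter: a cleared bit
	 * marks the corresponding IP block invalid so it is skipped by the
	 * later init stages.
	 */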
2059 	for (i = 0; i < adev->num_ip_blocks; i++) {
2060 		if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2061 			DRM_ERROR("disabled ip block: %d <%s>\n",
2062 				  i, adev->ip_blocks[i].version->funcs->name);
2063 			adev->ip_blocks[i].status.valid = false;
2064 		} else {
2065 			if (adev->ip_blocks[i].version->funcs->early_init) {
2066 				r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2067 				if (r == -ENOENT) {
2068 					adev->ip_blocks[i].status.valid = false;
2069 				} else if (r) {
2070 					DRM_ERROR("early_init of IP block <%s> failed %d\n",
2071 						  adev->ip_blocks[i].version->funcs->name, r);
2072 					return r;
2073 				} else {
2074 					adev->ip_blocks[i].status.valid = true;
2075 				}
2076 			} else {
2077 				adev->ip_blocks[i].status.valid = true;
2078 			}
2079 		}
2080 		/* get the vbios after the asic_funcs are set up */
2081 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2082 			r = amdgpu_device_parse_gpu_info_fw(adev);
2083 			if (r)
2084 				return r;
2085 
2086 			/* Read BIOS */
2087 			if (!amdgpu_get_bios(adev))
2088 				return -EINVAL;
2089 
2090 			r = amdgpu_atombios_init(adev);
2091 			if (r) {
2092 				dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2093 				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2094 				return r;
2095 			}
2096 
2097 			/* get pf2vf msg info at its earliest time */
2098 			if (amdgpu_sriov_vf(adev))
2099 				amdgpu_virt_init_data_exchange(adev);
2100 
2101 		}
2102 	}
2103 
2104 	adev->cg_flags &= amdgpu_cg_mask;
2105 	adev->pg_flags &= amdgpu_pg_mask;
2106 
2107 	return 0;
2108 }
2109 
2110 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2111 {
2112 	int i, r;
2113 
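	/*
	 * Phase 1 only brings up the COMMON and IH blocks (plus PSP when
	 * running as an SR-IOV VF); everything else is deferred to phase 2,
	 * which runs after firmware loading.
	 */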
2114 	for (i = 0; i < adev->num_ip_blocks; i++) {
2115 		if (!adev->ip_blocks[i].status.sw)
2116 			continue;
2117 		if (adev->ip_blocks[i].status.hw)
2118 			continue;
2119 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2120 		    (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2121 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2122 			r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2123 			if (r) {
2124 				DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2125 					  adev->ip_blocks[i].version->funcs->name, r);
2126 				return r;
2127 			}
2128 			adev->ip_blocks[i].status.hw = true;
2129 		}
2130 	}
2131 
2132 	return 0;
2133 }
2134 
2135 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2136 {
2137 	int i, r;
2138 
2139 	for (i = 0; i < adev->num_ip_blocks; i++) {
2140 		if (!adev->ip_blocks[i].status.sw)
2141 			continue;
2142 		if (adev->ip_blocks[i].status.hw)
2143 			continue;
2144 		r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2145 		if (r) {
2146 			DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2147 				  adev->ip_blocks[i].version->funcs->name, r);
2148 			return r;
2149 		}
2150 		adev->ip_blocks[i].status.hw = true;
2151 	}
2152 
2153 	return 0;
2154 }
2155 
2156 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2157 {
2158 	int r = 0;
2159 	int i;
2160 	uint32_t smu_version;
2161 
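	/*
	 * On VEGA10 and newer, most IP firmware is loaded through PSP, so the
	 * PSP block is brought up (or resumed) here, between hw_init phase 1
	 * and phase 2.
	 */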
2162 	if (adev->asic_type >= CHIP_VEGA10) {
2163 		for (i = 0; i < adev->num_ip_blocks; i++) {
2164 			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2165 				continue;
2166 
2167 			if (!adev->ip_blocks[i].status.sw)
2168 				continue;
2169 
2170 			/* no need to do the fw loading again if already done */
2171 			if (adev->ip_blocks[i].status.hw == true)
2172 				break;
2173 
2174 			if (amdgpu_in_reset(adev) || adev->in_suspend) {
2175 				r = adev->ip_blocks[i].version->funcs->resume(adev);
2176 				if (r) {
2177 					DRM_ERROR("resume of IP block <%s> failed %d\n",
2178 							  adev->ip_blocks[i].version->funcs->name, r);
2179 					return r;
2180 				}
2181 			} else {
2182 				r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2183 				if (r) {
2184 					DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2185 							  adev->ip_blocks[i].version->funcs->name, r);
2186 					return r;
2187 				}
2188 			}
2189 
2190 			adev->ip_blocks[i].status.hw = true;
2191 			break;
2192 		}
2193 	}
2194 
2195 	if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2196 		r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2197 
2198 	return r;
2199 }
2200 
2201 /**
2202  * amdgpu_device_ip_init - run init for hardware IPs
2203  *
2204  * @adev: amdgpu_device pointer
2205  *
2206  * Main initialization pass for hardware IPs.  The list of all the hardware
2207  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2208  * are run.  sw_init initializes the software state associated with each IP
2209  * and hw_init initializes the hardware associated with each IP.
2210  * Returns 0 on success, negative error code on failure.
2211  */
2212 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2213 {
2214 	int i, r;
2215 
2216 	r = amdgpu_ras_init(adev);
2217 	if (r)
2218 		return r;
2219 
2220 	for (i = 0; i < adev->num_ip_blocks; i++) {
2221 		if (!adev->ip_blocks[i].status.valid)
2222 			continue;
2223 		r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2224 		if (r) {
2225 			DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2226 				  adev->ip_blocks[i].version->funcs->name, r);
2227 			goto init_failed;
2228 		}
2229 		adev->ip_blocks[i].status.sw = true;
2230 
2231 		/* need to do gmc hw init early so we can allocate gpu mem */
2232 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2233 			r = amdgpu_device_vram_scratch_init(adev);
2234 			if (r) {
2235 				DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2236 				goto init_failed;
2237 			}
2238 			r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2239 			if (r) {
2240 				DRM_ERROR("hw_init %d failed %d\n", i, r);
2241 				goto init_failed;
2242 			}
2243 			r = amdgpu_device_wb_init(adev);
2244 			if (r) {
2245 				DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2246 				goto init_failed;
2247 			}
2248 			adev->ip_blocks[i].status.hw = true;
2249 
2250 			/* right after GMC hw init, we create CSA */
2251 			if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2252 				r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2253 								AMDGPU_GEM_DOMAIN_VRAM,
2254 								AMDGPU_CSA_SIZE);
2255 				if (r) {
2256 					DRM_ERROR("allocate CSA failed %d\n", r);
2257 					goto init_failed;
2258 				}
2259 			}
2260 		}
2261 	}
2262 
2263 	if (amdgpu_sriov_vf(adev))
2264 		amdgpu_virt_init_data_exchange(adev);
2265 
2266 	r = amdgpu_ib_pool_init(adev);
2267 	if (r) {
2268 		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2269 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2270 		goto init_failed;
2271 	}
2272 
2273 	r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2274 	if (r)
2275 		goto init_failed;
2276 
2277 	r = amdgpu_device_ip_hw_init_phase1(adev);
2278 	if (r)
2279 		goto init_failed;
2280 
2281 	r = amdgpu_device_fw_loading(adev);
2282 	if (r)
2283 		goto init_failed;
2284 
2285 	r = amdgpu_device_ip_hw_init_phase2(adev);
2286 	if (r)
2287 		goto init_failed;
2288 
2289 	/*
2290 	 * retired pages will be loaded from eeprom and reserved here;
2291 	 * this should be called after amdgpu_device_ip_hw_init_phase2 since
2292 	 * for some ASICs the RAS EEPROM code relies on the SMU being fully
2293 	 * functional for I2C communication, which is only true at this point.
2294 	 *
2295 	 * amdgpu_ras_recovery_init may fail, but the upper layers only care
2296 	 * about failures caused by a bad GPU state and stop the amdgpu init
2297 	 * process accordingly. For other failures, it still releases all the
2298 	 * resources and prints an error message rather than returning a
2299 	 * negative value to the upper level.
2300 	 *
2301 	 * Note: theoretically, this should be called before all vram allocations
2302 	 * to protect retired pages from being abused.
2303 	 */
2304 	r = amdgpu_ras_recovery_init(adev);
2305 	if (r)
2306 		goto init_failed;
2307 
2308 	if (adev->gmc.xgmi.num_physical_nodes > 1)
2309 		amdgpu_xgmi_add_device(adev);
2310 
2311 	/* Don't init kfd if the whole hive needs to be reset during init */
2312 	if (!adev->gmc.xgmi.pending_reset)
2313 		amdgpu_amdkfd_device_init(adev);
2314 
2315 	amdgpu_fru_get_product_info(adev);
2316 
2317 init_failed:
2318 	if (amdgpu_sriov_vf(adev))
2319 		amdgpu_virt_release_full_gpu(adev, true);
2320 
2321 	return r;
2322 }
2323 
2324 /**
2325  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2326  *
2327  * @adev: amdgpu_device pointer
2328  *
2329  * Caches the reset magic value from the gart pointer in VRAM.  The driver calls
2330  * this function before a GPU reset.  If the value is retained after a
2331  * GPU reset, VRAM has not been lost.  Some GPU resets may destroy VRAM contents.
2332  */
2333 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2334 {
2335 	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2336 }
2337 
2338 /**
2339  * amdgpu_device_check_vram_lost - check if vram is valid
2340  *
2341  * @adev: amdgpu_device pointer
2342  *
2343  * Checks the reset magic value written to the gart pointer in VRAM.
2344  * The driver calls this after a GPU reset to see if the contents of
2345  * VRAM have been lost or not.
2346  * returns true if vram is lost, false if not.
2347  */
2348 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2349 {
2350 	if (memcmp(adev->gart.ptr, adev->reset_magic,
2351 			AMDGPU_RESET_MAGIC_NUM))
2352 		return true;
2353 
2354 	if (!amdgpu_in_reset(adev))
2355 		return false;
2356 
2357 	/*
2358 	 * For all ASICs with baco/mode1 reset, the VRAM is
2359 	 * always assumed to be lost.
2360 	 */
2361 	switch (amdgpu_asic_reset_method(adev)) {
2362 	case AMD_RESET_METHOD_BACO:
2363 	case AMD_RESET_METHOD_MODE1:
2364 		return true;
2365 	default:
2366 		return false;
2367 	}
2368 }
2369 
2370 /**
2371  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2372  *
2373  * @adev: amdgpu_device pointer
2374  * @state: clockgating state (gate or ungate)
2375  *
2376  * The list of all the hardware IPs that make up the asic is walked and the
2377  * set_clockgating_state callbacks are run.
2378  * Late initialization pass enabling clockgating for hardware IPs.
2379  * Fini or suspend, pass disabling clockgating for hardware IPs.
2380  * Returns 0 on success, negative error code on failure.
2381  */
2382 
2383 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2384 			       enum amd_clockgating_state state)
2385 {
2386 	int i, j, r;
2387 
2388 	if (amdgpu_emu_mode == 1)
2389 		return 0;
2390 
2391 	for (j = 0; j < adev->num_ip_blocks; j++) {
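		/* gate in the normal IP order, ungate in reverse order */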
2392 		i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2393 		if (!adev->ip_blocks[i].status.late_initialized)
2394 			continue;
2395 		/* skip CG for GFX on S0ix */
2396 		if (adev->in_s0ix &&
2397 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
2398 			continue;
2399 		/* skip CG for UVD/VCE/VCN/JPEG, they are handled specially */
2400 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2401 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2402 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2403 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2404 		    adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2405 			/* enable clockgating to save power */
2406 			r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2407 										     state);
2408 			if (r) {
2409 				DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2410 					  adev->ip_blocks[i].version->funcs->name, r);
2411 				return r;
2412 			}
2413 		}
2414 	}
2415 
2416 	return 0;
2417 }
2418 
2419 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
2420 			       enum amd_powergating_state state)
2421 {
2422 	int i, j, r;
2423 
2424 	if (amdgpu_emu_mode == 1)
2425 		return 0;
2426 
2427 	for (j = 0; j < adev->num_ip_blocks; j++) {
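		/* gate in the normal IP order, ungate in reverse order */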
2428 		i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2429 		if (!adev->ip_blocks[i].status.late_initialized)
2430 			continue;
2431 		/* skip PG for GFX on S0ix */
2432 		if (adev->in_s0ix &&
2433 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
2434 			continue;
2435 		/* skip PG for UVD/VCE/VCN/JPEG, they are handled specially */
2436 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2437 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2438 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2439 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2440 		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
2441 			/* enable powergating to save power */
2442 			r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2443 											state);
2444 			if (r) {
2445 				DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2446 					  adev->ip_blocks[i].version->funcs->name, r);
2447 				return r;
2448 			}
2449 		}
2450 	}
2451 	return 0;
2452 }
2453 
2454 static int amdgpu_device_enable_mgpu_fan_boost(void)
2455 {
2456 	struct amdgpu_gpu_instance *gpu_ins;
2457 	struct amdgpu_device *adev;
2458 	int i, ret = 0;
2459 
2460 	mutex_lock(&mgpu_info.mutex);
2461 
2462 	/*
2463 	 * MGPU fan boost feature should be enabled
2464 	 * only when there are two or more dGPUs in
2465 	 * the system
2466 	 */
2467 	if (mgpu_info.num_dgpu < 2)
2468 		goto out;
2469 
2470 	for (i = 0; i < mgpu_info.num_dgpu; i++) {
2471 		gpu_ins = &(mgpu_info.gpu_ins[i]);
2472 		adev = gpu_ins->adev;
2473 		if (!(adev->flags & AMD_IS_APU) &&
2474 		    !gpu_ins->mgpu_fan_enabled) {
2475 			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2476 			if (ret)
2477 				break;
2478 
2479 			gpu_ins->mgpu_fan_enabled = 1;
2480 		}
2481 	}
2482 
2483 out:
2484 	mutex_unlock(&mgpu_info.mutex);
2485 
2486 	return ret;
2487 }
2488 
2489 /**
2490  * amdgpu_device_ip_late_init - run late init for hardware IPs
2491  *
2492  * @adev: amdgpu_device pointer
2493  *
2494  * Late initialization pass for hardware IPs.  The list of all the hardware
2495  * IPs that make up the asic is walked and the late_init callbacks are run.
2496  * late_init covers any special initialization that an IP requires
2497  * after all of the IPs have been initialized or something that needs to happen
2498  * late in the init process.
2499  * Returns 0 on success, negative error code on failure.
2500  */
2501 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2502 {
2503 	struct amdgpu_gpu_instance *gpu_instance;
2504 	int i = 0, r;
2505 
2506 	for (i = 0; i < adev->num_ip_blocks; i++) {
2507 		if (!adev->ip_blocks[i].status.hw)
2508 			continue;
2509 		if (adev->ip_blocks[i].version->funcs->late_init) {
2510 			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2511 			if (r) {
2512 				DRM_ERROR("late_init of IP block <%s> failed %d\n",
2513 					  adev->ip_blocks[i].version->funcs->name, r);
2514 				return r;
2515 			}
2516 		}
2517 		adev->ip_blocks[i].status.late_initialized = true;
2518 	}
2519 
2520 	amdgpu_ras_set_error_query_ready(adev, true);
2521 
2522 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2523 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2524 
2525 	amdgpu_device_fill_reset_magic(adev);
2526 
2527 	r = amdgpu_device_enable_mgpu_fan_boost();
2528 	if (r)
2529 		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2530 
2531 	/* For XGMI + passthrough configuration on arcturus, enable light SBR */
2532 	if (adev->asic_type == CHIP_ARCTURUS &&
2533 	    amdgpu_passthrough(adev) &&
2534 	    adev->gmc.xgmi.num_physical_nodes > 1)
2535 		smu_set_light_sbr(&adev->smu, true);
2536 
2537 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
2538 		mutex_lock(&mgpu_info.mutex);
2539 
2540 		/*
2541 		 * Reset the device p-state to low, as this was booted with high.
2542 		 *
2543 		 * This should be performed only after all devices from the same
2544 		 * hive have been initialized.
2545 		 *
2546 		 * However, the number of devices in a hive is not known in
2547 		 * advance; it is counted one by one as devices are initialized.
2548 		 *
2549 		 * So we wait for all XGMI interlinked devices to be initialized.
2550 		 * This may add some delay, as those devices may come from
2551 		 * different hives, but that should be OK.
2552 		 */
2553 		if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2554 			for (i = 0; i < mgpu_info.num_gpu; i++) {
2555 				gpu_instance = &(mgpu_info.gpu_ins[i]);
2556 				if (gpu_instance->adev->flags & AMD_IS_APU)
2557 					continue;
2558 
2559 				r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2560 						AMDGPU_XGMI_PSTATE_MIN);
2561 				if (r) {
2562 					DRM_ERROR("pstate setting failed (%d).\n", r);
2563 					break;
2564 				}
2565 			}
2566 		}
2567 
2568 		mutex_unlock(&mgpu_info.mutex);
2569 	}
2570 
2571 	return 0;
2572 }
2573 
2574 /**
2575  * amdgpu_device_ip_fini - run fini for hardware IPs
2576  *
2577  * @adev: amdgpu_device pointer
2578  *
2579  * Main teardown pass for hardware IPs.  The list of all the hardware
2580  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2581  * are run.  hw_fini tears down the hardware associated with each IP
2582  * and sw_fini tears down any software state associated with each IP.
2583  * Returns 0 on success, negative error code on failure.
2584  */
2585 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2586 {
2587 	int i, r;
2588 
2589 	if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2590 		amdgpu_virt_release_ras_err_handler_data(adev);
2591 
2592 	amdgpu_ras_pre_fini(adev);
2593 
2594 	if (adev->gmc.xgmi.num_physical_nodes > 1)
2595 		amdgpu_xgmi_remove_device(adev);
2596 
2597 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2598 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2599 
2600 	amdgpu_amdkfd_device_fini(adev);
2601 
2602 	/* need to disable SMC first */
2603 	for (i = 0; i < adev->num_ip_blocks; i++) {
2604 		if (!adev->ip_blocks[i].status.hw)
2605 			continue;
2606 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2607 			r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2608 			/* XXX handle errors */
2609 			if (r) {
2610 				DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2611 					  adev->ip_blocks[i].version->funcs->name, r);
2612 			}
2613 			adev->ip_blocks[i].status.hw = false;
2614 			break;
2615 		}
2616 	}
2617 
2618 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2619 		if (!adev->ip_blocks[i].status.hw)
2620 			continue;
2621 
2622 		r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2623 		/* XXX handle errors */
2624 		if (r) {
2625 			DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2626 				  adev->ip_blocks[i].version->funcs->name, r);
2627 		}
2628 
2629 		adev->ip_blocks[i].status.hw = false;
2630 	}
2631 
2632 
2633 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2634 		if (!adev->ip_blocks[i].status.sw)
2635 			continue;
2636 
2637 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2638 			amdgpu_ucode_free_bo(adev);
2639 			amdgpu_free_static_csa(&adev->virt.csa_obj);
2640 			amdgpu_device_wb_fini(adev);
2641 			amdgpu_device_vram_scratch_fini(adev);
2642 			amdgpu_ib_pool_fini(adev);
2643 		}
2644 
2645 		r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2646 		/* XXX handle errors */
2647 		if (r) {
2648 			DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2649 				  adev->ip_blocks[i].version->funcs->name, r);
2650 		}
2651 		adev->ip_blocks[i].status.sw = false;
2652 		adev->ip_blocks[i].status.valid = false;
2653 	}
2654 
2655 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2656 		if (!adev->ip_blocks[i].status.late_initialized)
2657 			continue;
2658 		if (adev->ip_blocks[i].version->funcs->late_fini)
2659 			adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2660 		adev->ip_blocks[i].status.late_initialized = false;
2661 	}
2662 
2663 	amdgpu_ras_fini(adev);
2664 
2665 	if (amdgpu_sriov_vf(adev))
2666 		if (amdgpu_virt_release_full_gpu(adev, false))
2667 			DRM_ERROR("failed to release exclusive mode on fini\n");
2668 
2669 	return 0;
2670 }
2671 
2672 /**
2673  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2674  *
2675  * @work: work_struct.
2676  */
2677 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2678 {
2679 	struct amdgpu_device *adev =
2680 		container_of(work, struct amdgpu_device, delayed_init_work.work);
2681 	int r;
2682 
2683 	r = amdgpu_ib_ring_tests(adev);
2684 	if (r)
2685 		DRM_ERROR("ib ring test failed (%d).\n", r);
2686 }
2687 
2688 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2689 {
2690 	struct amdgpu_device *adev =
2691 		container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2692 
2693 	mutex_lock(&adev->gfx.gfx_off_mutex);
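	/* only enter GFXOFF if no requests are holding it off and it is not already enabled */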
2694 	if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
2695 		if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2696 			adev->gfx.gfx_off_state = true;
2697 	}
2698 	mutex_unlock(&adev->gfx.gfx_off_mutex);
2699 }
2700 
2701 /**
2702  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2703  *
2704  * @adev: amdgpu_device pointer
2705  *
2706  * Main suspend function for hardware IPs.  The list of all the hardware
2707  * IPs that make up the asic is walked, clockgating is disabled and the
2708  * suspend callbacks are run.  suspend puts the hardware and software state
2709  * in each IP into a state suitable for suspend.
2710  * Returns 0 on success, negative error code on failure.
2711  */
2712 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2713 {
2714 	int i, r;
2715 
2716 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2717 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2718 
2719 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2720 		if (!adev->ip_blocks[i].status.valid)
2721 			continue;
2722 
2723 		/* displays are handled separately */
2724 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2725 			continue;
2726 
2727 		/* XXX handle errors */
2728 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
2729 		/* XXX handle errors */
2730 		if (r) {
2731 			DRM_ERROR("suspend of IP block <%s> failed %d\n",
2732 				  adev->ip_blocks[i].version->funcs->name, r);
2733 			return r;
2734 		}
2735 
2736 		adev->ip_blocks[i].status.hw = false;
2737 	}
2738 
2739 	return 0;
2740 }
2741 
2742 /**
2743  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2744  *
2745  * @adev: amdgpu_device pointer
2746  *
2747  * Main suspend function for hardware IPs.  The list of all the hardware
2748  * IPs that make up the asic is walked, clockgating is disabled and the
2749  * suspend callbacks are run.  suspend puts the hardware and software state
2750  * in each IP into a state suitable for suspend.
2751  * Returns 0 on success, negative error code on failure.
2752  */
2753 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2754 {
2755 	int i, r;
2756 
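	/* for S0ix, signal the D3 entry so the gfx state change can be handled appropriately */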
2757 	if (adev->in_s0ix)
2758 		amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry);
2759 
2760 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2761 		if (!adev->ip_blocks[i].status.valid)
2762 			continue;
2763 		/* displays are handled in phase1 */
2764 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2765 			continue;
2766 		/* PSP lost connection when err_event_athub occurs */
2767 		if (amdgpu_ras_intr_triggered() &&
2768 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2769 			adev->ip_blocks[i].status.hw = false;
2770 			continue;
2771 		}
2772 
2773 		/* skip unnecessary suspend if we have not initialized them yet */
2774 		if (adev->gmc.xgmi.pending_reset &&
2775 		    !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2776 		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
2777 		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2778 		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
2779 			adev->ip_blocks[i].status.hw = false;
2780 			continue;
2781 		}
2782 
2783 		/* skip suspend of gfx and psp for S0ix:
2784 		 * gfx is in gfxoff state, so on resume it will exit gfxoff just
2785 		 * like at runtime. PSP is also part of the always-on hardware,
2786 		 * so there is no need to suspend it.
2787 		 */
2788 		if (adev->in_s0ix &&
2789 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
2790 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX))
2791 			continue;
2792 
2793 		/* XXX handle errors */
2794 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
2795 		/* XXX handle errors */
2796 		if (r) {
2797 			DRM_ERROR("suspend of IP block <%s> failed %d\n",
2798 				  adev->ip_blocks[i].version->funcs->name, r);
2799 		}
2800 		adev->ip_blocks[i].status.hw = false;
2801 		/* handle putting the SMC in the appropriate state */
2802 		if (!amdgpu_sriov_vf(adev)) {
2803 			if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2804 				r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2805 				if (r) {
2806 					DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2807 							adev->mp1_state, r);
2808 					return r;
2809 				}
2810 			}
2811 		}
2812 	}
2813 
2814 	return 0;
2815 }
2816 
2817 /**
2818  * amdgpu_device_ip_suspend - run suspend for hardware IPs
2819  *
2820  * @adev: amdgpu_device pointer
2821  *
2822  * Main suspend function for hardware IPs.  The list of all the hardware
2823  * IPs that make up the asic is walked, clockgating is disabled and the
2824  * suspend callbacks are run.  suspend puts the hardware and software state
2825  * in each IP into a state suitable for suspend.
2826  * Returns 0 on success, negative error code on failure.
2827  */
2828 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2829 {
2830 	int r;
2831 
2832 	if (amdgpu_sriov_vf(adev)) {
2833 		amdgpu_virt_fini_data_exchange(adev);
2834 		amdgpu_virt_request_full_gpu(adev, false);
2835 	}
2836 
2837 	r = amdgpu_device_ip_suspend_phase1(adev);
2838 	if (r)
2839 		return r;
2840 	r = amdgpu_device_ip_suspend_phase2(adev);
2841 
2842 	if (amdgpu_sriov_vf(adev))
2843 		amdgpu_virt_release_full_gpu(adev, false);
2844 
2845 	return r;
2846 }
2847 
2848 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
2849 {
2850 	int i, r;
2851 
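	/* after an SR-IOV triggered reset, these blocks are re-initialized first, in this fixed order */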
2852 	static enum amd_ip_block_type ip_order[] = {
2853 		AMD_IP_BLOCK_TYPE_GMC,
2854 		AMD_IP_BLOCK_TYPE_COMMON,
2855 		AMD_IP_BLOCK_TYPE_PSP,
2856 		AMD_IP_BLOCK_TYPE_IH,
2857 	};
2858 
2859 	for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2860 		int j;
2861 		struct amdgpu_ip_block *block;
2862 
2863 		block = &adev->ip_blocks[i];
2864 		block->status.hw = false;
2865 
2866 		for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2867 
2868 			if (block->version->type != ip_order[j] ||
2869 				!block->status.valid)
2870 				continue;
2871 
2872 			r = block->version->funcs->hw_init(adev);
2873 			DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2874 			if (r)
2875 				return r;
2876 			block->status.hw = true;
2877 		}
2878 	}
2879 
2880 	return 0;
2881 }
2882 
2883 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
2884 {
2885 	int i, r;
2886 
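	/* the remaining blocks are re-initialized afterwards, in this fixed order */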
2887 	static enum amd_ip_block_type ip_order[] = {
2888 		AMD_IP_BLOCK_TYPE_SMC,
2889 		AMD_IP_BLOCK_TYPE_DCE,
2890 		AMD_IP_BLOCK_TYPE_GFX,
2891 		AMD_IP_BLOCK_TYPE_SDMA,
2892 		AMD_IP_BLOCK_TYPE_UVD,
2893 		AMD_IP_BLOCK_TYPE_VCE,
2894 		AMD_IP_BLOCK_TYPE_VCN
2895 	};
2896 
2897 	for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2898 		int j;
2899 		struct amdgpu_ip_block *block;
2900 
2901 		for (j = 0; j < adev->num_ip_blocks; j++) {
2902 			block = &adev->ip_blocks[j];
2903 
2904 			if (block->version->type != ip_order[i] ||
2905 				!block->status.valid ||
2906 				block->status.hw)
2907 				continue;
2908 
2909 			if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2910 				r = block->version->funcs->resume(adev);
2911 			else
2912 				r = block->version->funcs->hw_init(adev);
2913 
2914 			DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2915 			if (r)
2916 				return r;
2917 			block->status.hw = true;
2918 		}
2919 	}
2920 
2921 	return 0;
2922 }
2923 
2924 /**
2925  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2926  *
2927  * @adev: amdgpu_device pointer
2928  *
2929  * First resume function for hardware IPs.  The list of all the hardware
2930  * IPs that make up the asic is walked and the resume callbacks are run for
2931  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
2932  * after a suspend and updates the software state as necessary.  This
2933  * function is also used for restoring the GPU after a GPU reset.
2934  * Returns 0 on success, negative error code on failure.
2935  */
2936 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2937 {
2938 	int i, r;
2939 
2940 	for (i = 0; i < adev->num_ip_blocks; i++) {
2941 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2942 			continue;
2943 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2944 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2945 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2946 
2947 			r = adev->ip_blocks[i].version->funcs->resume(adev);
2948 			if (r) {
2949 				DRM_ERROR("resume of IP block <%s> failed %d\n",
2950 					  adev->ip_blocks[i].version->funcs->name, r);
2951 				return r;
2952 			}
2953 			adev->ip_blocks[i].status.hw = true;
2954 		}
2955 	}
2956 
2957 	return 0;
2958 }
2959 
2960 /**
2961  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2962  *
2963  * @adev: amdgpu_device pointer
2964  *
2965  * Second resume function for hardware IPs.  The list of all the hardware
2966  * IPs that make up the asic is walked and the resume callbacks are run for
2967  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
2968  * functional state after a suspend and updates the software state as
2969  * necessary.  This function is also used for restoring the GPU after a GPU
2970  * reset.
2971  * Returns 0 on success, negative error code on failure.
2972  */
2973 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2974 {
2975 	int i, r;
2976 
2977 	for (i = 0; i < adev->num_ip_blocks; i++) {
2978 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2979 			continue;
2980 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2981 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2982 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2983 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2984 			continue;
2985 		r = adev->ip_blocks[i].version->funcs->resume(adev);
2986 		if (r) {
2987 			DRM_ERROR("resume of IP block <%s> failed %d\n",
2988 				  adev->ip_blocks[i].version->funcs->name, r);
2989 			return r;
2990 		}
2991 		adev->ip_blocks[i].status.hw = true;
2992 	}
2993 
2994 	return 0;
2995 }
2996 
2997 /**
2998  * amdgpu_device_ip_resume - run resume for hardware IPs
2999  *
3000  * @adev: amdgpu_device pointer
3001  *
3002  * Main resume function for hardware IPs.  The hardware IPs
3003  * are split into two resume functions because they are
3004  * also used in recovering from a GPU reset and some additional
3005  * steps need to be taken between them.  In this case (S3/S4) they are
3006  * run sequentially.
3007  * Returns 0 on success, negative error code on failure.
3008  */
3009 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3010 {
3011 	int r;
3012 
3013 	r = amdgpu_device_ip_resume_phase1(adev);
3014 	if (r)
3015 		return r;
3016 
3017 	r = amdgpu_device_fw_loading(adev);
3018 	if (r)
3019 		return r;
3020 
3021 	r = amdgpu_device_ip_resume_phase2(adev);
3022 
3023 	return r;
3024 }
3025 
3026 /**
3027  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3028  *
3029  * @adev: amdgpu_device pointer
3030  *
3031  * Query the VBIOS data tables to determine if the board supports SR-IOV.
3032  */
3033 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3034 {
3035 	if (amdgpu_sriov_vf(adev)) {
3036 		if (adev->is_atom_fw) {
3037 			if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
3038 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3039 		} else {
3040 			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3041 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3042 		}
3043 
3044 		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3045 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3046 	}
3047 }
3048 
3049 /**
3050  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3051  *
3052  * @asic_type: AMD asic type
3053  *
3054  * Check if there is DC (the new modesetting infrastructure) support for an asic.
3055  * returns true if DC has support, false if not.
3056  */
3057 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3058 {
3059 	switch (asic_type) {
3060 #if defined(CONFIG_DRM_AMD_DC)
3061 #if defined(CONFIG_DRM_AMD_DC_SI)
3062 	case CHIP_TAHITI:
3063 	case CHIP_PITCAIRN:
3064 	case CHIP_VERDE:
3065 	case CHIP_OLAND:
3066 #endif
3067 	case CHIP_BONAIRE:
3068 	case CHIP_KAVERI:
3069 	case CHIP_KABINI:
3070 	case CHIP_MULLINS:
3071 		/*
3072 		 * We have systems in the wild with these ASICs that require
3073 		 * LVDS and VGA support which is not supported with DC.
3074 		 *
3075 		 * Fallback to the non-DC driver here by default so as not to
3076 		 * cause regressions.
3077 		 */
3078 		return amdgpu_dc > 0;
3079 	case CHIP_HAWAII:
3080 	case CHIP_CARRIZO:
3081 	case CHIP_STONEY:
3082 	case CHIP_POLARIS10:
3083 	case CHIP_POLARIS11:
3084 	case CHIP_POLARIS12:
3085 	case CHIP_VEGAM:
3086 	case CHIP_TONGA:
3087 	case CHIP_FIJI:
3088 	case CHIP_VEGA10:
3089 	case CHIP_VEGA12:
3090 	case CHIP_VEGA20:
3091 #if defined(CONFIG_DRM_AMD_DC_DCN)
3092 	case CHIP_RAVEN:
3093 	case CHIP_NAVI10:
3094 	case CHIP_NAVI14:
3095 	case CHIP_NAVI12:
3096 	case CHIP_RENOIR:
3097 	case CHIP_SIENNA_CICHLID:
3098 	case CHIP_NAVY_FLOUNDER:
3099 	case CHIP_DIMGREY_CAVEFISH:
3100 	case CHIP_VANGOGH:
3101 #endif
3102 		return amdgpu_dc != 0;
3103 #endif
3104 	default:
3105 		if (amdgpu_dc > 0)
3106 			DRM_INFO_ONCE("Display Core has been requested via kernel parameter "
3107 					 "but isn't supported by ASIC, ignoring\n");
3108 		return false;
3109 	}
3110 }
3111 
3112 /**
3113  * amdgpu_device_has_dc_support - check if dc is supported
3114  *
3115  * @adev: amdgpu_device pointer
3116  *
3117  * Returns true for supported, false for not supported
3118  */
3119 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3120 {
3121 	if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display)
3122 		return false;
3123 
3124 	return amdgpu_device_asic_has_dc_support(adev->asic_type);
3125 }
3126 
3127 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3128 {
3129 	struct amdgpu_device *adev =
3130 		container_of(__work, struct amdgpu_device, xgmi_reset_work);
3131 	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3132 
3133 	/* It's a bug to not have a hive within this function */
3134 	if (WARN_ON(!hive))
3135 		return;
3136 
3137 	/*
3138 	 * Use task barrier to synchronize all xgmi reset works across the
3139 	 * hive. task_barrier_enter and task_barrier_exit will block
3140 	 * until all the threads running the xgmi reset works reach
3141 	 * those points. task_barrier_full will do both blocks.
3142 	 */
3143 	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3144 
3145 		task_barrier_enter(&hive->tb);
3146 		adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3147 
3148 		if (adev->asic_reset_res)
3149 			goto fail;
3150 
3151 		task_barrier_exit(&hive->tb);
3152 		adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3153 
3154 		if (adev->asic_reset_res)
3155 			goto fail;
3156 
3157 		if (adev->mmhub.ras_funcs &&
3158 		    adev->mmhub.ras_funcs->reset_ras_error_count)
3159 			adev->mmhub.ras_funcs->reset_ras_error_count(adev);
3160 	} else {
3161 
3162 		task_barrier_full(&hive->tb);
3163 		adev->asic_reset_res =  amdgpu_asic_reset(adev);
3164 	}
3165 
3166 fail:
3167 	if (adev->asic_reset_res)
3168 		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3169 			 adev->asic_reset_res, adev_to_drm(adev)->unique);
3170 	amdgpu_put_xgmi_hive(hive);
3171 }
3172 
3173 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3174 {
3175 	char *input = amdgpu_lockup_timeout;
3176 	char *timeout_setting = NULL;
3177 	int index = 0;
3178 	long timeout;
3179 	int ret = 0;
3180 
3181 	/*
3182 	 * By default the timeout for non-compute jobs is 10000 ms
3183 	 * and there is no timeout enforced on compute jobs.
3184 	 * In SR-IOV or passthrough mode, the default timeout for
3185 	 * compute jobs is 60000 ms.
3186 	 */
3187 	adev->gfx_timeout = msecs_to_jiffies(10000);
3188 	adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3189 	if (amdgpu_sriov_vf(adev))
3190 		adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3191 					msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3192 	else if (amdgpu_passthrough(adev))
3193 		adev->compute_timeout =  msecs_to_jiffies(60000);
3194 	else
3195 		adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
3196 
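	/*
	 * amdgpu.lockup_timeout is a comma separated list of up to four values
	 * (in ms) applied, in order, to gfx, compute, sdma and video jobs;
	 * 0 keeps the default and a negative value disables the timeout.
	 */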
3197 	if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3198 		while ((timeout_setting = strsep(&input, ",")) &&
3199 				strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3200 			ret = kstrtol(timeout_setting, 0, &timeout);
3201 			if (ret)
3202 				return ret;
3203 
3204 			if (timeout == 0) {
3205 				index++;
3206 				continue;
3207 			} else if (timeout < 0) {
3208 				timeout = MAX_SCHEDULE_TIMEOUT;
3209 			} else {
3210 				timeout = msecs_to_jiffies(timeout);
3211 			}
3212 
3213 			switch (index++) {
3214 			case 0:
3215 				adev->gfx_timeout = timeout;
3216 				break;
3217 			case 1:
3218 				adev->compute_timeout = timeout;
3219 				break;
3220 			case 2:
3221 				adev->sdma_timeout = timeout;
3222 				break;
3223 			case 3:
3224 				adev->video_timeout = timeout;
3225 				break;
3226 			default:
3227 				break;
3228 			}
3229 		}
3230 		/*
3231 		 * There is only one value specified and
3232 		 * it should apply to all non-compute jobs.
3233 		 */
3234 		if (index == 1) {
3235 			adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3236 			if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3237 				adev->compute_timeout = adev->gfx_timeout;
3238 		}
3239 	}
3240 
3241 	return ret;
3242 }
3243 
3244 static const struct attribute *amdgpu_dev_attributes[] = {
3245 	&dev_attr_product_name.attr,
3246 	&dev_attr_product_number.attr,
3247 	&dev_attr_serial_number.attr,
3248 	&dev_attr_pcie_replay_count.attr,
3249 	NULL
3250 };
3251 
3252 
3253 /**
3254  * amdgpu_device_init - initialize the driver
3255  *
3256  * @adev: amdgpu_device pointer
3257  * @flags: driver flags
3258  *
3259  * Initializes the driver info and hw (all asics).
3260  * Returns 0 for success or an error on failure.
3261  * Called at driver startup.
3262  */
3263 int amdgpu_device_init(struct amdgpu_device *adev,
3264 		       uint32_t flags)
3265 {
3266 	struct drm_device *ddev = adev_to_drm(adev);
3267 	struct pci_dev *pdev = adev->pdev;
3268 	int r, i;
3269 	bool px = false;
3270 	u32 max_MBps;
3271 
3272 	adev->shutdown = false;
3273 	adev->flags = flags;
3274 
3275 	if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3276 		adev->asic_type = amdgpu_force_asic_type;
3277 	else
3278 		adev->asic_type = flags & AMD_ASIC_MASK;
3279 
3280 	adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3281 	if (amdgpu_emu_mode == 1)
3282 		adev->usec_timeout *= 10;
3283 	adev->gmc.gart_size = 512 * 1024 * 1024;
3284 	adev->accel_working = false;
3285 	adev->num_rings = 0;
3286 	adev->mman.buffer_funcs = NULL;
3287 	adev->mman.buffer_funcs_ring = NULL;
3288 	adev->vm_manager.vm_pte_funcs = NULL;
3289 	adev->vm_manager.vm_pte_num_scheds = 0;
3290 	adev->gmc.gmc_funcs = NULL;
3291 	adev->harvest_ip_mask = 0x0;
3292 	adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3293 	bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3294 
3295 	adev->smc_rreg = &amdgpu_invalid_rreg;
3296 	adev->smc_wreg = &amdgpu_invalid_wreg;
3297 	adev->pcie_rreg = &amdgpu_invalid_rreg;
3298 	adev->pcie_wreg = &amdgpu_invalid_wreg;
3299 	adev->pciep_rreg = &amdgpu_invalid_rreg;
3300 	adev->pciep_wreg = &amdgpu_invalid_wreg;
3301 	adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3302 	adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3303 	adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3304 	adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3305 	adev->didt_rreg = &amdgpu_invalid_rreg;
3306 	adev->didt_wreg = &amdgpu_invalid_wreg;
3307 	adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3308 	adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3309 	adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3310 	adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3311 
3312 	DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3313 		 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3314 		 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3315 
3316 	/* mutex initializations are all done here so we
3317 	 * can recall functions without having locking issues */
3318 	mutex_init(&adev->firmware.mutex);
3319 	mutex_init(&adev->pm.mutex);
3320 	mutex_init(&adev->gfx.gpu_clock_mutex);
3321 	mutex_init(&adev->srbm_mutex);
3322 	mutex_init(&adev->gfx.pipe_reserve_mutex);
3323 	mutex_init(&adev->gfx.gfx_off_mutex);
3324 	mutex_init(&adev->grbm_idx_mutex);
3325 	mutex_init(&adev->mn_lock);
3326 	mutex_init(&adev->virt.vf_errors.lock);
3327 	hash_init(adev->mn_hash);
3328 	atomic_set(&adev->in_gpu_reset, 0);
3329 	init_rwsem(&adev->reset_sem);
3330 	mutex_init(&adev->psp.mutex);
3331 	mutex_init(&adev->notifier_lock);
3332 
3333 	r = amdgpu_device_check_arguments(adev);
3334 	if (r)
3335 		return r;
3336 
3337 	spin_lock_init(&adev->mmio_idx_lock);
3338 	spin_lock_init(&adev->smc_idx_lock);
3339 	spin_lock_init(&adev->pcie_idx_lock);
3340 	spin_lock_init(&adev->uvd_ctx_idx_lock);
3341 	spin_lock_init(&adev->didt_idx_lock);
3342 	spin_lock_init(&adev->gc_cac_idx_lock);
3343 	spin_lock_init(&adev->se_cac_idx_lock);
3344 	spin_lock_init(&adev->audio_endpt_idx_lock);
3345 	spin_lock_init(&adev->mm_stats.lock);
3346 
3347 	INIT_LIST_HEAD(&adev->shadow_list);
3348 	mutex_init(&adev->shadow_list_lock);
3349 
3350 	INIT_LIST_HEAD(&adev->reset_list);
3351 
3352 	INIT_DELAYED_WORK(&adev->delayed_init_work,
3353 			  amdgpu_device_delayed_init_work_handler);
3354 	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3355 			  amdgpu_device_delay_enable_gfx_off);
3356 
3357 	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3358 
3359 	adev->gfx.gfx_off_req_count = 1;
3360 	adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3361 
3362 	atomic_set(&adev->throttling_logging_enabled, 1);
3363 	/*
3364 	 * If throttling continues, logging will be performed every minute
3365 	 * to avoid log flooding. "-1" is subtracted since the thermal
3366 	 * throttling interrupt comes every second. Thus, the total logging
3367 	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3368 	 * for throttling interrupt) = 60 seconds.
3369 	 */
3370 	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3371 	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3372 
3373 	/* Registers mapping */
3374 	/* TODO: block userspace mapping of io register */
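	/* register MMIO lives in BAR 5 on CIK and newer ASICs, BAR 2 on older ones */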
3375 	if (adev->asic_type >= CHIP_BONAIRE) {
3376 		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3377 		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3378 	} else {
3379 		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3380 		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3381 	}
3382 
3383 	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3384 	if (adev->rmmio == NULL) {
3385 		return -ENOMEM;
3386 	}
3387 	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3388 	DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3389 
3390 	/* enable PCIE atomic ops */
3391 	r = pci_enable_atomic_ops_to_root(adev->pdev,
3392 					  PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3393 					  PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3394 	if (r) {
3395 		adev->have_atomics_support = false;
3396 		DRM_INFO("PCIE atomic ops are not supported\n");
3397 	} else {
3398 		adev->have_atomics_support = true;
3399 	}
3400 
3401 	amdgpu_device_get_pcie_info(adev);
3402 
3403 	if (amdgpu_mcbp)
3404 		DRM_INFO("MCBP is enabled\n");
3405 
3406 	if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3407 		adev->enable_mes = true;
3408 
3409 	/* detect hw virtualization here */
3410 	amdgpu_detect_virtualization(adev);
3411 
3412 	r = amdgpu_device_get_job_timeout_settings(adev);
3413 	if (r) {
3414 		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3415 		goto failed_unmap;
3416 	}
3417 
3418 	/* early init functions */
3419 	r = amdgpu_device_ip_early_init(adev);
3420 	if (r)
3421 		goto failed_unmap;
3422 
3423 	/* doorbell bar mapping and doorbell index init*/
3424 	amdgpu_device_doorbell_init(adev);
3425 
3426 	if (amdgpu_emu_mode == 1) {
3427 		/* post the asic on emulation mode */
3428 		emu_soc_asic_init(adev);
3429 		goto fence_driver_init;
3430 	}
3431 
3432 	amdgpu_reset_init(adev);
3433 
3434 	/* detect if we are with an SRIOV vbios */
3435 	amdgpu_device_detect_sriov_bios(adev);
3436 
3437 	/* check if we need to reset the asic
3438 	 *  E.g., driver was not cleanly unloaded previously, etc.
3439 	 */
3440 	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3441 		if (adev->gmc.xgmi.num_physical_nodes) {
3442 			dev_info(adev->dev, "Pending hive reset.\n");
3443 			adev->gmc.xgmi.pending_reset = true;
3444 			/* Only the blocks needed for the SMU to handle the reset require init */
3445 			for (i = 0; i < adev->num_ip_blocks; i++) {
3446 				if (!adev->ip_blocks[i].status.valid)
3447 					continue;
3448 				if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3449 				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3450 				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3451 				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
3452 					DRM_DEBUG("IP %s disabled for hw_init.\n",
3453 						adev->ip_blocks[i].version->funcs->name);
3454 					adev->ip_blocks[i].status.hw = true;
3455 				}
3456 			}
3457 		} else {
3458 			r = amdgpu_asic_reset(adev);
3459 			if (r) {
3460 				dev_err(adev->dev, "asic reset on init failed\n");
3461 				goto failed;
3462 			}
3463 		}
3464 	}
3465 
3466 	pci_enable_pcie_error_reporting(adev->pdev);
3467 
3468 	/* Post card if necessary */
3469 	if (amdgpu_device_need_post(adev)) {
3470 		if (!adev->bios) {
3471 			dev_err(adev->dev, "no vBIOS found\n");
3472 			r = -EINVAL;
3473 			goto failed;
3474 		}
3475 		DRM_INFO("GPU posting now...\n");
3476 		r = amdgpu_device_asic_init(adev);
3477 		if (r) {
3478 			dev_err(adev->dev, "gpu post error!\n");
3479 			goto failed;
3480 		}
3481 	}
3482 
3483 	if (adev->is_atom_fw) {
3484 		/* Initialize clocks */
3485 		r = amdgpu_atomfirmware_get_clock_info(adev);
3486 		if (r) {
3487 			dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3488 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3489 			goto failed;
3490 		}
3491 	} else {
3492 		/* Initialize clocks */
3493 		r = amdgpu_atombios_get_clock_info(adev);
3494 		if (r) {
3495 			dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3496 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3497 			goto failed;
3498 		}
3499 		/* init i2c buses */
3500 		if (!amdgpu_device_has_dc_support(adev))
3501 			amdgpu_atombios_i2c_init(adev);
3502 	}
3503 
3504 fence_driver_init:
3505 	/* Fence driver */
3506 	r = amdgpu_fence_driver_init(adev);
3507 	if (r) {
3508 		dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
3509 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3510 		goto failed;
3511 	}
3512 
3513 	/* init the mode config */
3514 	drm_mode_config_init(adev_to_drm(adev));
3515 
3516 	r = amdgpu_device_ip_init(adev);
3517 	if (r) {
3518 		/* failed in exclusive mode due to timeout */
3519 		if (amdgpu_sriov_vf(adev) &&
3520 		    !amdgpu_sriov_runtime(adev) &&
3521 		    amdgpu_virt_mmio_blocked(adev) &&
3522 		    !amdgpu_virt_wait_reset(adev)) {
3523 			dev_err(adev->dev, "VF exclusive mode timeout\n");
3524 			/* Don't send request since VF is inactive. */
3525 			adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3526 			adev->virt.ops = NULL;
3527 			r = -EAGAIN;
3528 			goto release_ras_con;
3529 		}
3530 		dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3531 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3532 		goto release_ras_con;
3533 	}
3534 
3535 	dev_info(adev->dev,
3536 		"SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3537 			adev->gfx.config.max_shader_engines,
3538 			adev->gfx.config.max_sh_per_se,
3539 			adev->gfx.config.max_cu_per_sh,
3540 			adev->gfx.cu_info.number);
3541 
3542 	adev->accel_working = true;
3543 
3544 	amdgpu_vm_check_compute_bug(adev);
3545 
3546 	/* Initialize the buffer migration limit. */
3547 	if (amdgpu_moverate >= 0)
3548 		max_MBps = amdgpu_moverate;
3549 	else
3550 		max_MBps = 8; /* Allow 8 MB/s. */
3551 	/* Get a log2 for easy divisions. */
3552 	adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3553 
3554 	amdgpu_fbdev_init(adev);
3555 
3556 	r = amdgpu_pm_sysfs_init(adev);
3557 	if (r) {
3558 		adev->pm_sysfs_en = false;
3559 		DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3560 	} else
3561 		adev->pm_sysfs_en = true;
3562 
3563 	r = amdgpu_ucode_sysfs_init(adev);
3564 	if (r) {
3565 		adev->ucode_sysfs_en = false;
3566 		DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3567 	} else
3568 		adev->ucode_sysfs_en = true;
3569 
3570 	if ((amdgpu_testing & 1)) {
3571 		if (adev->accel_working)
3572 			amdgpu_test_moves(adev);
3573 		else
3574 			DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3575 	}
3576 	if (amdgpu_benchmarking) {
3577 		if (adev->accel_working)
3578 			amdgpu_benchmark(adev, amdgpu_benchmarking);
3579 		else
3580 			DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3581 	}
3582 
3583 	/*
3584 	 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
	 * Otherwise the mgpu fan boost feature will be skipped because the
	 * gpu instance count would come up short.
3587 	 */
3588 	amdgpu_register_gpu_instance(adev);
3589 
3590 	/* enable clockgating, etc. after ib tests, etc. since some blocks require
3591 	 * explicit gating rather than handling it automatically.
3592 	 */
3593 	if (!adev->gmc.xgmi.pending_reset) {
3594 		r = amdgpu_device_ip_late_init(adev);
3595 		if (r) {
3596 			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3597 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3598 			goto release_ras_con;
3599 		}
3600 		/* must succeed. */
3601 		amdgpu_ras_resume(adev);
3602 		queue_delayed_work(system_wq, &adev->delayed_init_work,
3603 				   msecs_to_jiffies(AMDGPU_RESUME_MS));
3604 	}
3605 
3606 	if (amdgpu_sriov_vf(adev))
3607 		flush_delayed_work(&adev->delayed_init_work);
3608 
3609 	r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3610 	if (r)
3611 		dev_err(adev->dev, "Could not create amdgpu device attr\n");
3612 
	if (IS_ENABLED(CONFIG_PERF_EVENTS)) {
		r = amdgpu_pmu_init(adev);
		if (r)
			dev_err(adev->dev, "amdgpu_pmu_init failed\n");
	}
3617 
	/* Keep the cached PCI config space at hand so it can be restored after a sudden PCI error */
3619 	if (amdgpu_device_cache_pci_state(adev->pdev))
3620 		pci_restore_state(pdev);
3621 
	/* if we have more than one VGA card, then disable the amdgpu VGA resources */
3623 	/* this will fail for cards that aren't VGA class devices, just
3624 	 * ignore it */
3625 	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3626 		vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3627 
3628 	if (amdgpu_device_supports_px(ddev)) {
3629 		px = true;
3630 		vga_switcheroo_register_client(adev->pdev,
3631 					       &amdgpu_switcheroo_ops, px);
3632 		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3633 	}
3634 
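	/* Defer the pending XGMI hive reset to the shared mgpu delayed-reset
	 * worker, so it runs once the remaining hive members have had a chance
	 * to probe.
	 */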
3635 	if (adev->gmc.xgmi.pending_reset)
3636 		queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
3637 				   msecs_to_jiffies(AMDGPU_RESUME_MS));
3638 
3639 	return 0;
3640 
3641 release_ras_con:
3642 	amdgpu_release_ras_context(adev);
3643 
3644 failed:
3645 	amdgpu_vf_error_trans_all(adev);
3646 
3647 failed_unmap:
3648 	iounmap(adev->rmmio);
3649 	adev->rmmio = NULL;
3650 
3651 	return r;
3652 }
3653 
3654 /**
3655  * amdgpu_device_fini - tear down the driver
3656  *
3657  * @adev: amdgpu_device pointer
3658  *
3659  * Tear down the driver info (all asics).
3660  * Called at driver shutdown.
3661  */
3662 void amdgpu_device_fini(struct amdgpu_device *adev)
3663 {
3664 	dev_info(adev->dev, "amdgpu: finishing device.\n");
3665 	flush_delayed_work(&adev->delayed_init_work);
3666 	ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
3667 	adev->shutdown = true;
3668 
3669 	kfree(adev->pci_state);
3670 
	/* make sure IB tests have finished before entering exclusive mode
	 * to avoid preemption during the IB tests
	 */
3674 	if (amdgpu_sriov_vf(adev)) {
3675 		amdgpu_virt_request_full_gpu(adev, false);
3676 		amdgpu_virt_fini_data_exchange(adev);
3677 	}
3678 
3679 	/* disable all interrupts */
3680 	amdgpu_irq_disable_all(adev);
	if (adev->mode_info.mode_config_initialized) {
3682 		if (!amdgpu_device_has_dc_support(adev))
3683 			drm_helper_force_disable_all(adev_to_drm(adev));
3684 		else
3685 			drm_atomic_helper_shutdown(adev_to_drm(adev));
3686 	}
3687 	amdgpu_fence_driver_fini(adev);
3688 	if (adev->pm_sysfs_en)
3689 		amdgpu_pm_sysfs_fini(adev);
3690 	amdgpu_fbdev_fini(adev);
3691 	amdgpu_device_ip_fini(adev);
3692 	release_firmware(adev->firmware.gpu_info_fw);
3693 	adev->firmware.gpu_info_fw = NULL;
3694 	adev->accel_working = false;
3695 
3696 	amdgpu_reset_fini(adev);
3697 
3698 	/* free i2c buses */
3699 	if (!amdgpu_device_has_dc_support(adev))
3700 		amdgpu_i2c_fini(adev);
3701 
3702 	if (amdgpu_emu_mode != 1)
3703 		amdgpu_atombios_fini(adev);
3704 
3705 	kfree(adev->bios);
3706 	adev->bios = NULL;
3707 	if (amdgpu_device_supports_px(adev_to_drm(adev))) {
3708 		vga_switcheroo_unregister_client(adev->pdev);
3709 		vga_switcheroo_fini_domain_pm_ops(adev->dev);
3710 	}
3711 	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3712 		vga_client_register(adev->pdev, NULL, NULL, NULL);
3713 	iounmap(adev->rmmio);
3714 	adev->rmmio = NULL;
3715 	amdgpu_device_doorbell_fini(adev);
3716 
3717 	if (adev->ucode_sysfs_en)
3718 		amdgpu_ucode_sysfs_fini(adev);
3719 
3720 	sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3721 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
3722 		amdgpu_pmu_fini(adev);
3723 	if (adev->mman.discovery_bin)
3724 		amdgpu_discovery_fini(adev);
3725 }
3726 
3727 
3728 /*
3729  * Suspend & resume.
3730  */
3731 /**
3732  * amdgpu_device_suspend - initiate device suspend
3733  *
3734  * @dev: drm dev pointer
 * @fbcon: notify the fbdev of suspend
3736  *
3737  * Puts the hw in the suspend state (all asics).
3738  * Returns 0 for success or an error on failure.
3739  * Called at driver suspend.
3740  */
3741 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
3742 {
3743 	struct amdgpu_device *adev = drm_to_adev(dev);
3744 	int r;
3745 
3746 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3747 		return 0;
3748 
3749 	adev->in_suspend = true;
3750 	drm_kms_helper_poll_disable(dev);
3751 
3752 	if (fbcon)
3753 		amdgpu_fbdev_set_suspend(adev, 1);
3754 
3755 	cancel_delayed_work_sync(&adev->delayed_init_work);
3756 
3757 	amdgpu_ras_suspend(adev);
3758 
3759 	r = amdgpu_device_ip_suspend_phase1(adev);
3760 
3761 	if (!adev->in_s0ix)
3762 		amdgpu_amdkfd_suspend(adev, adev->in_runpm);
3763 
3764 	/* evict vram memory */
3765 	amdgpu_bo_evict_vram(adev);
3766 
3767 	amdgpu_fence_driver_suspend(adev);
3768 
3769 	r = amdgpu_device_ip_suspend_phase2(adev);
3770 	/* evict remaining vram memory
3771 	 * This second call to evict vram is to evict the gart page table
3772 	 * using the CPU.
3773 	 */
3774 	amdgpu_bo_evict_vram(adev);
3775 
3776 	return 0;
3777 }
3778 
3779 /**
3780  * amdgpu_device_resume - initiate device resume
3781  *
3782  * @dev: drm dev pointer
 * @fbcon: notify the fbdev of resume
3784  *
3785  * Bring the hw back to operating state (all asics).
3786  * Returns 0 for success or an error on failure.
3787  * Called at driver resume.
3788  */
3789 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
3790 {
3791 	struct amdgpu_device *adev = drm_to_adev(dev);
3792 	int r = 0;
3793 
3794 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3795 		return 0;
3796 
3797 	if (adev->in_s0ix)
3798 		amdgpu_gfx_state_change_set(adev, sGpuChangeState_D0Entry);
3799 
3800 	/* post card */
3801 	if (amdgpu_device_need_post(adev)) {
3802 		r = amdgpu_device_asic_init(adev);
3803 		if (r)
3804 			dev_err(adev->dev, "amdgpu asic init failed\n");
3805 	}
3806 
3807 	r = amdgpu_device_ip_resume(adev);
3808 	if (r) {
3809 		dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
3810 		return r;
3811 	}
3812 	amdgpu_fence_driver_resume(adev);
3813 
3815 	r = amdgpu_device_ip_late_init(adev);
3816 	if (r)
3817 		return r;
3818 
3819 	queue_delayed_work(system_wq, &adev->delayed_init_work,
3820 			   msecs_to_jiffies(AMDGPU_RESUME_MS));
3821 
3822 	if (!adev->in_s0ix) {
3823 		r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
3824 		if (r)
3825 			return r;
3826 	}
3827 
3828 	/* Make sure IB tests flushed */
3829 	flush_delayed_work(&adev->delayed_init_work);
3830 
3831 	if (fbcon)
3832 		amdgpu_fbdev_set_suspend(adev, 0);
3833 
3834 	drm_kms_helper_poll_enable(dev);
3835 
3836 	amdgpu_ras_resume(adev);
3837 
3838 	/*
3839 	 * Most of the connector probing functions try to acquire runtime pm
3840 	 * refs to ensure that the GPU is powered on when connector polling is
3841 	 * performed. Since we're calling this from a runtime PM callback,
3842 	 * trying to acquire rpm refs will cause us to deadlock.
3843 	 *
3844 	 * Since we're guaranteed to be holding the rpm lock, it's safe to
3845 	 * temporarily disable the rpm helpers so this doesn't deadlock us.
3846 	 */
3847 #ifdef CONFIG_PM
3848 	dev->dev->power.disable_depth++;
3849 #endif
3850 	if (!amdgpu_device_has_dc_support(adev))
3851 		drm_helper_hpd_irq_event(dev);
3852 	else
3853 		drm_kms_helper_hotplug_event(dev);
3854 #ifdef CONFIG_PM
3855 	dev->dev->power.disable_depth--;
3856 #endif
3857 	adev->in_suspend = false;
3858 
3859 	return 0;
3860 }
3861 
3862 /**
3863  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3864  *
3865  * @adev: amdgpu_device pointer
3866  *
3867  * The list of all the hardware IPs that make up the asic is walked and
3868  * the check_soft_reset callbacks are run.  check_soft_reset determines
3869  * if the asic is still hung or not.
3870  * Returns true if any of the IPs are still in a hung state, false if not.
3871  */
3872 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
3873 {
3874 	int i;
3875 	bool asic_hang = false;
3876 
3877 	if (amdgpu_sriov_vf(adev))
3878 		return true;
3879 
3880 	if (amdgpu_asic_need_full_reset(adev))
3881 		return true;
3882 
3883 	for (i = 0; i < adev->num_ip_blocks; i++) {
3884 		if (!adev->ip_blocks[i].status.valid)
3885 			continue;
3886 		if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3887 			adev->ip_blocks[i].status.hang =
3888 				adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3889 		if (adev->ip_blocks[i].status.hang) {
3890 			dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
3891 			asic_hang = true;
3892 		}
3893 	}
3894 	return asic_hang;
3895 }
3896 
3897 /**
3898  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3899  *
3900  * @adev: amdgpu_device pointer
3901  *
3902  * The list of all the hardware IPs that make up the asic is walked and the
3903  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
3904  * handles any IP specific hardware or software state changes that are
3905  * necessary for a soft reset to succeed.
3906  * Returns 0 on success, negative error code on failure.
3907  */
3908 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
3909 {
3910 	int i, r = 0;
3911 
3912 	for (i = 0; i < adev->num_ip_blocks; i++) {
3913 		if (!adev->ip_blocks[i].status.valid)
3914 			continue;
3915 		if (adev->ip_blocks[i].status.hang &&
3916 		    adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3917 			r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
3918 			if (r)
3919 				return r;
3920 		}
3921 	}
3922 
3923 	return 0;
3924 }
3925 
3926 /**
3927  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3928  *
3929  * @adev: amdgpu_device pointer
3930  *
3931  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
3932  * reset is necessary to recover.
3933  * Returns true if a full asic reset is required, false if not.
3934  */
3935 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
3936 {
3937 	int i;
3938 
3939 	if (amdgpu_asic_need_full_reset(adev))
3940 		return true;
3941 
3942 	for (i = 0; i < adev->num_ip_blocks; i++) {
3943 		if (!adev->ip_blocks[i].status.valid)
3944 			continue;
3945 		if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3946 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3947 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
3948 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3949 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3950 			if (adev->ip_blocks[i].status.hang) {
3951 				dev_info(adev->dev, "Some block need full reset!\n");
3952 				return true;
3953 			}
3954 		}
3955 	}
3956 	return false;
3957 }
3958 
3959 /**
3960  * amdgpu_device_ip_soft_reset - do a soft reset
3961  *
3962  * @adev: amdgpu_device pointer
3963  *
3964  * The list of all the hardware IPs that make up the asic is walked and the
3965  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
3966  * IP specific hardware or software state changes that are necessary to soft
3967  * reset the IP.
3968  * Returns 0 on success, negative error code on failure.
3969  */
3970 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
3971 {
3972 	int i, r = 0;
3973 
3974 	for (i = 0; i < adev->num_ip_blocks; i++) {
3975 		if (!adev->ip_blocks[i].status.valid)
3976 			continue;
3977 		if (adev->ip_blocks[i].status.hang &&
3978 		    adev->ip_blocks[i].version->funcs->soft_reset) {
3979 			r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
3980 			if (r)
3981 				return r;
3982 		}
3983 	}
3984 
3985 	return 0;
3986 }
3987 
3988 /**
3989  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
3990  *
3991  * @adev: amdgpu_device pointer
3992  *
3993  * The list of all the hardware IPs that make up the asic is walked and the
3994  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
3995  * handles any IP specific hardware or software state changes that are
3996  * necessary after the IP has been soft reset.
3997  * Returns 0 on success, negative error code on failure.
3998  */
3999 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
4000 {
4001 	int i, r = 0;
4002 
4003 	for (i = 0; i < adev->num_ip_blocks; i++) {
4004 		if (!adev->ip_blocks[i].status.valid)
4005 			continue;
4006 		if (adev->ip_blocks[i].status.hang &&
4007 		    adev->ip_blocks[i].version->funcs->post_soft_reset)
4008 			r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
4009 		if (r)
4010 			return r;
4011 	}
4012 
4013 	return 0;
4014 }
4015 
4016 /**
4017  * amdgpu_device_recover_vram - Recover some VRAM contents
4018  *
4019  * @adev: amdgpu_device pointer
4020  *
4021  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
4022  * restore things like GPUVM page tables after a GPU reset where
4023  * the contents of VRAM might be lost.
4024  *
4025  * Returns:
4026  * 0 on success, negative error code on failure.
4027  */
4028 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4029 {
4030 	struct dma_fence *fence = NULL, *next = NULL;
4031 	struct amdgpu_bo *shadow;
4032 	long r = 1, tmo;
4033 
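	/* Allow a much longer per-fence wait under SR-IOV runtime, where the VF
	 * can lose the GPU to world switches for extended periods.
	 */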
4034 	if (amdgpu_sriov_runtime(adev))
4035 		tmo = msecs_to_jiffies(8000);
4036 	else
4037 		tmo = msecs_to_jiffies(100);
4038 
4039 	dev_info(adev->dev, "recover vram bo from shadow start\n");
4040 	mutex_lock(&adev->shadow_list_lock);
4041 	list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
4042 
4043 		/* No need to recover an evicted BO */
4044 		if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
4045 		    shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
4046 		    shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
4047 			continue;
4048 
4049 		r = amdgpu_bo_restore_shadow(shadow, &next);
4050 		if (r)
4051 			break;
4052 
4053 		if (fence) {
4054 			tmo = dma_fence_wait_timeout(fence, false, tmo);
4055 			dma_fence_put(fence);
4056 			fence = next;
4057 			if (tmo == 0) {
4058 				r = -ETIMEDOUT;
4059 				break;
4060 			} else if (tmo < 0) {
4061 				r = tmo;
4062 				break;
4063 			}
4064 		} else {
4065 			fence = next;
4066 		}
4067 	}
4068 	mutex_unlock(&adev->shadow_list_lock);
4069 
4070 	if (fence)
4071 		tmo = dma_fence_wait_timeout(fence, false, tmo);
4072 	dma_fence_put(fence);
4073 
4074 	if (r < 0 || tmo <= 0) {
4075 		dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4076 		return -EIO;
4077 	}
4078 
4079 	dev_info(adev->dev, "recover vram bo from shadow done\n");
4080 	return 0;
4081 }
4082 
4083 
4084 /**
4085  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4086  *
4087  * @adev: amdgpu_device pointer
4088  * @from_hypervisor: request from hypervisor
4089  *
 * Do VF FLR and reinitialize the ASIC.
 * Returns 0 on success, negative error code on failure.
4092  */
4093 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4094 				     bool from_hypervisor)
4095 {
4096 	int r;
4097 
4098 	if (from_hypervisor)
4099 		r = amdgpu_virt_request_full_gpu(adev, true);
4100 	else
4101 		r = amdgpu_virt_reset_gpu(adev);
4102 	if (r)
4103 		return r;
4104 
4105 	amdgpu_amdkfd_pre_reset(adev);
4106 
4107 	/* Resume IP prior to SMC */
4108 	r = amdgpu_device_ip_reinit_early_sriov(adev);
4109 	if (r)
4110 		goto error;
4111 
4112 	amdgpu_virt_init_data_exchange(adev);
	/* we need to recover the gart prior to running SMC/CP/SDMA resume */
4114 	amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
4115 
4116 	r = amdgpu_device_fw_loading(adev);
4117 	if (r)
4118 		return r;
4119 
4120 	/* now we are okay to resume SMC/CP/SDMA */
4121 	r = amdgpu_device_ip_reinit_late_sriov(adev);
4122 	if (r)
4123 		goto error;
4124 
4125 	amdgpu_irq_gpu_reset_resume_helper(adev);
4126 	r = amdgpu_ib_ring_tests(adev);
4127 	amdgpu_amdkfd_post_reset(adev);
4128 
4129 error:
4130 	if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4131 		amdgpu_inc_vram_lost(adev);
4132 		r = amdgpu_device_recover_vram(adev);
4133 	}
4134 	amdgpu_virt_release_full_gpu(adev, true);
4135 
4136 	return r;
4137 }
4138 
4139 /**
4140  * amdgpu_device_has_job_running - check if there is any job in mirror list
4141  *
4142  * @adev: amdgpu_device pointer
4143  *
4144  * check if there is any job in mirror list
4145  */
4146 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4147 {
4148 	int i;
4149 	struct drm_sched_job *job;
4150 
4151 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4152 		struct amdgpu_ring *ring = adev->rings[i];
4153 
4154 		if (!ring || !ring->sched.thread)
4155 			continue;
4156 
4157 		spin_lock(&ring->sched.job_list_lock);
4158 		job = list_first_entry_or_null(&ring->sched.pending_list,
4159 					       struct drm_sched_job, list);
4160 		spin_unlock(&ring->sched.job_list_lock);
4161 		if (job)
4162 			return true;
4163 	}
4164 	return false;
4165 }
4166 
4167 /**
4168  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4169  *
4170  * @adev: amdgpu_device pointer
4171  *
4172  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4173  * a hung GPU.
4174  */
4175 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4176 {
4177 	if (!amdgpu_device_ip_check_soft_reset(adev)) {
4178 		dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
4179 		return false;
4180 	}
4181 
4182 	if (amdgpu_gpu_recovery == 0)
4183 		goto disabled;
4184 
4185 	if (amdgpu_sriov_vf(adev))
4186 		return true;
4187 
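	/* With the default setting (auto), GPU recovery is only enabled on the
	 * ASICs listed below.
	 */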
4188 	if (amdgpu_gpu_recovery == -1) {
4189 		switch (adev->asic_type) {
4190 		case CHIP_BONAIRE:
4191 		case CHIP_HAWAII:
4192 		case CHIP_TOPAZ:
4193 		case CHIP_TONGA:
4194 		case CHIP_FIJI:
4195 		case CHIP_POLARIS10:
4196 		case CHIP_POLARIS11:
4197 		case CHIP_POLARIS12:
4198 		case CHIP_VEGAM:
4199 		case CHIP_VEGA20:
4200 		case CHIP_VEGA10:
4201 		case CHIP_VEGA12:
4202 		case CHIP_RAVEN:
4203 		case CHIP_ARCTURUS:
4204 		case CHIP_RENOIR:
4205 		case CHIP_NAVI10:
4206 		case CHIP_NAVI14:
4207 		case CHIP_NAVI12:
4208 		case CHIP_SIENNA_CICHLID:
4209 		case CHIP_NAVY_FLOUNDER:
4210 		case CHIP_DIMGREY_CAVEFISH:
4211 		case CHIP_VANGOGH:
4212 		case CHIP_ALDEBARAN:
4213 			break;
4214 		default:
4215 			goto disabled;
4216 		}
4217 	}
4218 
4219 	return true;
4220 
4221 disabled:
	dev_info(adev->dev, "GPU recovery disabled.\n");
	return false;
4224 }
4225 
4226 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4227 {
	u32 i;
	int ret = 0;

	amdgpu_atombios_scratch_regs_engine_hung(adev, true);

	dev_info(adev->dev, "GPU mode1 reset\n");

	/* disable BM */
	pci_clear_master(adev->pdev);

	amdgpu_device_cache_pci_state(adev->pdev);

	if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
		dev_info(adev->dev, "GPU smu mode1 reset\n");
		ret = amdgpu_dpm_mode1_reset(adev);
	} else {
		dev_info(adev->dev, "GPU psp mode1 reset\n");
		ret = psp_gpu_reset(adev);
	}

	if (ret)
		dev_err(adev->dev, "GPU mode1 reset failed\n");

	amdgpu_device_load_pci_state(adev->pdev);

	/* wait for asic to come out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		u32 memsize = adev->nbio.funcs->get_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}

	amdgpu_atombios_scratch_regs_engine_hung(adev, false);
	return ret;
4264 }
4265 
4266 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4267 				 struct amdgpu_reset_context *reset_context)
4268 {
4269 	int i, r = 0;
4270 	struct amdgpu_job *job = NULL;
4271 	bool need_full_reset =
4272 		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4273 
4274 	if (reset_context->reset_req_dev == adev)
4275 		job = reset_context->job;
4276 
4277 	/* no need to dump if device is not in good state during probe period */
4278 	if (!adev->gmc.xgmi.pending_reset)
4279 		amdgpu_debugfs_wait_dump(adev);
4280 
4281 	if (amdgpu_sriov_vf(adev)) {
4282 		/* stop the data exchange thread */
4283 		amdgpu_virt_fini_data_exchange(adev);
4284 	}
4285 
4286 	/* block all schedulers and reset given job's ring */
4287 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4288 		struct amdgpu_ring *ring = adev->rings[i];
4289 
4290 		if (!ring || !ring->sched.thread)
4291 			continue;
4292 
4293 		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4294 		amdgpu_fence_driver_force_completion(ring);
4295 	}
4296 
	if (job)
4298 		drm_sched_increase_karma(&job->base);
4299 
4300 	r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
4301 	/* If reset handler not implemented, continue; otherwise return */
4302 	if (r == -ENOSYS)
4303 		r = 0;
4304 	else
4305 		return r;
4306 
4307 	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4308 	if (!amdgpu_sriov_vf(adev)) {
4309 
4310 		if (!need_full_reset)
4311 			need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4312 
4313 		if (!need_full_reset) {
4314 			amdgpu_device_ip_pre_soft_reset(adev);
4315 			r = amdgpu_device_ip_soft_reset(adev);
4316 			amdgpu_device_ip_post_soft_reset(adev);
4317 			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4318 				dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4319 				need_full_reset = true;
4320 			}
4321 		}
4322 
4323 		if (need_full_reset)
4324 			r = amdgpu_device_ip_suspend(adev);
4325 		if (need_full_reset)
4326 			set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4327 		else
4328 			clear_bit(AMDGPU_NEED_FULL_RESET,
4329 				  &reset_context->flags);
4330 	}
4331 
4332 	return r;
4333 }
4334 
4335 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
4336 			 struct amdgpu_reset_context *reset_context)
4337 {
4338 	struct amdgpu_device *tmp_adev = NULL;
4339 	bool need_full_reset, skip_hw_reset, vram_lost = false;
4340 	int r = 0;
4341 
4342 	/* Try reset handler method first */
4343 	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
4344 				    reset_list);
4345 	r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
4346 	/* If reset handler not implemented, continue; otherwise return */
4347 	if (r == -ENOSYS)
4348 		r = 0;
4349 	else
4350 		return r;
4351 
4352 	/* Reset handler not implemented, use the default method */
4353 	need_full_reset =
4354 		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4355 	skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
4356 
4357 	/*
4358 	 * ASIC reset has to be done on all XGMI hive nodes ASAP
	 * to allow proper link negotiation in FW (within 1 sec)
4360 	 */
4361 	if (!skip_hw_reset && need_full_reset) {
4362 		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4363 			/* For XGMI run all resets in parallel to speed up the process */
4364 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4365 				tmp_adev->gmc.xgmi.pending_reset = false;
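				/* queue_work() returns false if the work was already
				 * queued, i.e. a reset for this node is already pending.
				 */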
4366 				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4367 					r = -EALREADY;
4368 			} else
4369 				r = amdgpu_asic_reset(tmp_adev);
4370 
4371 			if (r) {
4372 				dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4373 					 r, adev_to_drm(tmp_adev)->unique);
4374 				break;
4375 			}
4376 		}
4377 
4378 		/* For XGMI wait for all resets to complete before proceed */
4379 		if (!r) {
4380 			list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4381 				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4382 					flush_work(&tmp_adev->xgmi_reset_work);
4383 					r = tmp_adev->asic_reset_res;
4384 					if (r)
4385 						break;
4386 				}
4387 			}
4388 		}
4389 	}
4390 
4391 	if (!r && amdgpu_ras_intr_triggered()) {
4392 		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4393 			if (tmp_adev->mmhub.ras_funcs &&
4394 			    tmp_adev->mmhub.ras_funcs->reset_ras_error_count)
4395 				tmp_adev->mmhub.ras_funcs->reset_ras_error_count(tmp_adev);
4396 		}
4397 
4398 		amdgpu_ras_intr_cleared();
4399 	}
4400 
4401 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4402 		if (need_full_reset) {
4403 			/* post card */
4404 			r = amdgpu_device_asic_init(tmp_adev);
4405 			if (r) {
4406 				dev_warn(tmp_adev->dev, "asic atom init failed!");
4407 			} else {
4408 				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4409 				r = amdgpu_device_ip_resume_phase1(tmp_adev);
4410 				if (r)
4411 					goto out;
4412 
4413 				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4414 				if (vram_lost) {
4415 					DRM_INFO("VRAM is lost due to GPU reset!\n");
4416 					amdgpu_inc_vram_lost(tmp_adev);
4417 				}
4418 
4419 				r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
4420 				if (r)
4421 					goto out;
4422 
4423 				r = amdgpu_device_fw_loading(tmp_adev);
4424 				if (r)
4425 					return r;
4426 
4427 				r = amdgpu_device_ip_resume_phase2(tmp_adev);
4428 				if (r)
4429 					goto out;
4430 
4431 				if (vram_lost)
4432 					amdgpu_device_fill_reset_magic(tmp_adev);
4433 
4434 				/*
				 * Add this ASIC back as tracked since the reset
				 * has already completed successfully.
4437 				 */
4438 				amdgpu_register_gpu_instance(tmp_adev);
4439 
4440 				if (!reset_context->hive &&
4441 				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4442 					amdgpu_xgmi_add_device(tmp_adev);
4443 
4444 				r = amdgpu_device_ip_late_init(tmp_adev);
4445 				if (r)
4446 					goto out;
4447 
4448 				amdgpu_fbdev_set_suspend(tmp_adev, 0);
4449 
4450 				/*
				 * The GPU enters a bad state once the number of
				 * faulty pages recorded by ECC reaches the
				 * threshold, and RAS recovery is scheduled next.
				 * So add a check here to abort recovery if the
				 * bad page threshold has indeed been exceeded,
				 * and remind the user to either retire this GPU
				 * or set a bigger bad_page_threshold value the
				 * next time the driver is probed.
4459 				 */
4460 				if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
4461 					/* must succeed. */
4462 					amdgpu_ras_resume(tmp_adev);
4463 				} else {
4464 					r = -EINVAL;
4465 					goto out;
4466 				}
4467 
4468 				/* Update PSP FW topology after reset */
4469 				if (reset_context->hive &&
4470 				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4471 					r = amdgpu_xgmi_update_topology(
4472 						reset_context->hive, tmp_adev);
4473 			}
4474 		}
4475 
4476 out:
4477 		if (!r) {
4478 			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4479 			r = amdgpu_ib_ring_tests(tmp_adev);
4480 			if (r) {
4481 				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4482 				need_full_reset = true;
4483 				r = -EAGAIN;
4484 				goto end;
4485 			}
4486 		}
4487 
4488 		if (!r)
4489 			r = amdgpu_device_recover_vram(tmp_adev);
4490 		else
4491 			tmp_adev->asic_reset_res = r;
4492 	}
4493 
4494 end:
4495 	if (need_full_reset)
4496 		set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4497 	else
4498 		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4499 	return r;
4500 }
4501 
4502 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4503 				struct amdgpu_hive_info *hive)
4504 {
4505 	if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4506 		return false;
4507 
4508 	if (hive) {
4509 		down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4510 	} else {
4511 		down_write(&adev->reset_sem);
4512 	}
4513 
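	/* Note which reset method is about to be used so the matching MP1 (SMU
	 * firmware) state can be requested before the reset.
	 */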
4514 	switch (amdgpu_asic_reset_method(adev)) {
4515 	case AMD_RESET_METHOD_MODE1:
4516 		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4517 		break;
4518 	case AMD_RESET_METHOD_MODE2:
4519 		adev->mp1_state = PP_MP1_STATE_RESET;
4520 		break;
4521 	default:
4522 		adev->mp1_state = PP_MP1_STATE_NONE;
4523 		break;
4524 	}
4525 
4526 	return true;
4527 }
4528 
4529 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4530 {
4531 	amdgpu_vf_error_trans_all(adev);
4532 	adev->mp1_state = PP_MP1_STATE_NONE;
4533 	atomic_set(&adev->in_gpu_reset, 0);
4534 	up_write(&adev->reset_sem);
4535 }
4536 
4537 /*
 * Lock a list of amdgpu devices in a hive safely. If the device is not part
 * of a hive with multiple nodes, this behaves like amdgpu_device_lock_adev.
 *
 * Unlock won't require a roll back.
4542  */
4543 static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
4544 {
4545 	struct amdgpu_device *tmp_adev = NULL;
4546 
4547 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
4548 		if (!hive) {
4549 			dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
4550 			return -ENODEV;
4551 		}
4552 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
4553 			if (!amdgpu_device_lock_adev(tmp_adev, hive))
4554 				goto roll_back;
4555 		}
4556 	} else if (!amdgpu_device_lock_adev(adev, hive))
4557 		return -EAGAIN;
4558 
4559 	return 0;
4560 roll_back:
4561 	if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
4562 		/*
		 * If the locking iteration broke off in the middle of a hive,
		 * it may mean there is a race issue, or that a hive device
		 * locked up independently. We may or may not be in trouble,
		 * so roll back the locks taken so far and give out a warning.
4568 		 */
4569 		dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
4570 		list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
4571 			amdgpu_device_unlock_adev(tmp_adev);
4572 		}
4573 	}
4574 	return -EAGAIN;
4575 }
4576 
4577 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4578 {
4579 	struct pci_dev *p = NULL;
4580 
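	/* On GPUs with display audio, the HDA controller is exposed as
	 * function 1 of the same PCI device.
	 */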
4581 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4582 			adev->pdev->bus->number, 1);
4583 	if (p) {
4584 		pm_runtime_enable(&(p->dev));
4585 		pm_runtime_resume(&(p->dev));
4586 	}
4587 }
4588 
4589 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4590 {
4591 	enum amd_reset_method reset_method;
4592 	struct pci_dev *p = NULL;
4593 	u64 expires;
4594 
4595 	/*
	 * For now, only BACO and mode1 reset are confirmed to
	 * suffer from the audio issue if not properly suspended.
4598 	 */
4599 	reset_method = amdgpu_asic_reset_method(adev);
4600 	if ((reset_method != AMD_RESET_METHOD_BACO) &&
4601 	     (reset_method != AMD_RESET_METHOD_MODE1))
4602 		return -EINVAL;
4603 
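	/* The display audio (HDA) controller, if present, is function 1 of the
	 * GPU's PCI device.
	 */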
4604 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4605 			adev->pdev->bus->number, 1);
4606 	if (!p)
4607 		return -ENODEV;
4608 
4609 	expires = pm_runtime_autosuspend_expiration(&(p->dev));
4610 	if (!expires)
4611 		/*
4612 		 * If we cannot get the audio device autosuspend delay,
		 * a fixed 4s interval is used. Since 3s is the audio
		 * controller's default autosuspend delay setting, the 4s
		 * used here is guaranteed to cover it.
4616 		 */
4617 		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4618 
4619 	while (!pm_runtime_status_suspended(&(p->dev))) {
4620 		if (!pm_runtime_suspend(&(p->dev)))
4621 			break;
4622 
4623 		if (expires < ktime_get_mono_fast_ns()) {
4624 			dev_warn(adev->dev, "failed to suspend display audio\n");
4625 			/* TODO: abort the succeeding gpu reset? */
4626 			return -ETIMEDOUT;
4627 		}
4628 	}
4629 
4630 	pm_runtime_disable(&(p->dev));
4631 
4632 	return 0;
4633 }
4634 
4635 void amdgpu_device_recheck_guilty_jobs(
4636 	struct amdgpu_device *adev, struct list_head *device_list_handle,
4637 	struct amdgpu_reset_context *reset_context)
4638 {
4639 	int i, r = 0;
4640 
4641 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4642 		struct amdgpu_ring *ring = adev->rings[i];
4643 		int ret = 0;
4644 		struct drm_sched_job *s_job;
4645 
4646 		if (!ring || !ring->sched.thread)
4647 			continue;
4648 
4649 		s_job = list_first_entry_or_null(&ring->sched.pending_list,
4650 				struct drm_sched_job, list);
4651 		if (s_job == NULL)
4652 			continue;
4653 
		/* clear the job's guilty status and rely on the following step to decide the real one */
4655 		drm_sched_reset_karma(s_job);
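		/* Resubmit only the first pending job on this ring so its hw
		 * fence can be waited on in isolation below.
		 */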
4656 		drm_sched_resubmit_jobs_ext(&ring->sched, 1);
4657 
4658 		ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
4659 		if (ret == 0) { /* timeout */
4660 			DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
4661 						ring->sched.name, s_job->id);
4662 
4663 			/* set guilty */
4664 			drm_sched_increase_karma(s_job);
4665 retry:
4666 			/* do hw reset */
4667 			if (amdgpu_sriov_vf(adev)) {
4668 				amdgpu_virt_fini_data_exchange(adev);
4669 				r = amdgpu_device_reset_sriov(adev, false);
4670 				if (r)
4671 					adev->asic_reset_res = r;
4672 			} else {
4673 				clear_bit(AMDGPU_SKIP_HW_RESET,
4674 					  &reset_context->flags);
4675 				r = amdgpu_do_asic_reset(device_list_handle,
4676 							 reset_context);
4677 				if (r && r == -EAGAIN)
4678 					goto retry;
4679 			}
4680 
4681 			/*
4682 			 * add reset counter so that the following
4683 			 * resubmitted job could flush vmid
4684 			 */
4685 			atomic_inc(&adev->gpu_reset_counter);
4686 			continue;
4687 		}
4688 
4689 		/* got the hw fence, signal finished fence */
4690 		atomic_dec(ring->sched.score);
4691 		dma_fence_get(&s_job->s_fence->finished);
4692 		dma_fence_signal(&s_job->s_fence->finished);
4693 		dma_fence_put(&s_job->s_fence->finished);
4694 
4695 		/* remove node from list and free the job */
4696 		spin_lock(&ring->sched.job_list_lock);
4697 		list_del_init(&s_job->list);
4698 		spin_unlock(&ring->sched.job_list_lock);
4699 		ring->sched.ops->free_job(s_job);
4700 	}
4701 }
4702 
4703 /**
4704  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4705  *
4706  * @adev: amdgpu_device pointer
 * @job: which job triggered the hang
 *
 * Attempt to reset the GPU if it has hung (all asics).
 * Attempt to do a soft reset or a full reset and reinitialize the ASIC.
4711  * Returns 0 for success or an error on failure.
4712  */
4713 
4714 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4715 			      struct amdgpu_job *job)
4716 {
4717 	struct list_head device_list, *device_list_handle =  NULL;
4718 	bool job_signaled = false;
4719 	struct amdgpu_hive_info *hive = NULL;
4720 	struct amdgpu_device *tmp_adev = NULL;
4721 	int i, r = 0;
4722 	bool need_emergency_restart = false;
4723 	bool audio_suspended = false;
4724 	int tmp_vram_lost_counter;
4725 	struct amdgpu_reset_context reset_context;
4726 
4727 	memset(&reset_context, 0, sizeof(reset_context));
4728 
4729 	/*
4730 	 * Special case: RAS triggered and full reset isn't supported
4731 	 */
4732 	need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4733 
4734 	/*
4735 	 * Flush RAM to disk so that after reboot
4736 	 * the user can read log and see why the system rebooted.
4737 	 */
4738 	if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
4739 		DRM_WARN("Emergency reboot.");
4740 
4741 		ksys_sync_helper();
4742 		emergency_restart();
4743 	}
4744 
4745 	dev_info(adev->dev, "GPU %s begin!\n",
4746 		need_emergency_restart ? "jobs stop":"reset");
4747 
4748 	/*
	 * Here we trylock to avoid a chain of resets executing from
	 * either a trigger by jobs on different adevs in an XGMI hive or jobs on
	 * different schedulers for the same device while this TO handler is running.
	 * We always reset all schedulers for a device and all devices for an XGMI
	 * hive, so that should take care of them too.
4754 	 */
4755 	hive = amdgpu_get_xgmi_hive(adev);
4756 	if (hive) {
4757 		if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
4758 			DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4759 				job ? job->base.id : -1, hive->hive_id);
4760 			amdgpu_put_xgmi_hive(hive);
4761 			if (job)
4762 				drm_sched_increase_karma(&job->base);
4763 			return 0;
4764 		}
4765 		mutex_lock(&hive->hive_lock);
4766 	}
4767 
4768 	reset_context.method = AMD_RESET_METHOD_NONE;
4769 	reset_context.reset_req_dev = adev;
4770 	reset_context.job = job;
4771 	reset_context.hive = hive;
4772 	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
4773 
4774 	/*
	 * Lock the device before we try to operate on the linked list.
	 * If we didn't get the device lock, don't touch the linked list since
	 * others may be iterating over it.
4778 	 */
4779 	r = amdgpu_device_lock_hive_adev(adev, hive);
4780 	if (r) {
4781 		dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
4782 					job ? job->base.id : -1);
4783 
4784 		/* even we skipped this reset, still need to set the job to guilty */
4785 		if (job)
4786 			drm_sched_increase_karma(&job->base);
4787 		goto skip_recovery;
4788 	}
4789 
4790 	/*
4791 	 * Build list of devices to reset.
4792 	 * In case we are in XGMI hive mode, resort the device list
4793 	 * to put adev in the 1st position.
4794 	 */
4795 	INIT_LIST_HEAD(&device_list);
4796 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
4797 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
4798 			list_add_tail(&tmp_adev->reset_list, &device_list);
4799 		if (!list_is_first(&adev->reset_list, &device_list))
4800 			list_rotate_to_front(&adev->reset_list, &device_list);
4801 		device_list_handle = &device_list;
4802 	} else {
4803 		list_add_tail(&adev->reset_list, &device_list);
4804 		device_list_handle = &device_list;
4805 	}
4806 
4807 	/* block all schedulers and reset given job's ring */
4808 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4809 		/*
4810 		 * Try to put the audio codec into suspend state
		 * before the gpu reset starts.
		 *
		 * The power domain of the graphics device is shared
		 * with the AZ (audio) power domain. Without this, we
		 * may change the audio hardware from behind the audio
		 * driver's back, which would trigger audio codec
		 * errors.
4818 		 */
4819 		if (!amdgpu_device_suspend_display_audio(tmp_adev))
4820 			audio_suspended = true;
4821 
4822 		amdgpu_ras_set_error_query_ready(tmp_adev, false);
4823 
4824 		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4825 
4826 		if (!amdgpu_sriov_vf(tmp_adev))
4827 			amdgpu_amdkfd_pre_reset(tmp_adev);
4828 
4829 		/*
		 * Mark the ASICs to be reset as untracked first,
		 * and add them back after the reset has completed.
4832 		 */
4833 		amdgpu_unregister_gpu_instance(tmp_adev);
4834 
4835 		amdgpu_fbdev_set_suspend(tmp_adev, 1);
4836 
4837 		/* disable ras on ALL IPs */
4838 		if (!need_emergency_restart &&
4839 		      amdgpu_device_ip_need_full_reset(tmp_adev))
4840 			amdgpu_ras_suspend(tmp_adev);
4841 
4842 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4843 			struct amdgpu_ring *ring = tmp_adev->rings[i];
4844 
4845 			if (!ring || !ring->sched.thread)
4846 				continue;
4847 
4848 			drm_sched_stop(&ring->sched, job ? &job->base : NULL);
4849 
4850 			if (need_emergency_restart)
4851 				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
4852 		}
4853 		atomic_inc(&tmp_adev->gpu_reset_counter);
4854 	}
4855 
4856 	if (need_emergency_restart)
4857 		goto skip_sched_resume;
4858 
4859 	/*
4860 	 * Must check guilty signal here since after this point all old
4861 	 * HW fences are force signaled.
4862 	 *
4863 	 * job->base holds a reference to parent fence
4864 	 */
4865 	if (job && job->base.s_fence->parent &&
4866 	    dma_fence_is_signaled(job->base.s_fence->parent)) {
4867 		job_signaled = true;
4868 		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4869 		goto skip_hw_reset;
4870 	}
4871 
4872 retry:	/* Rest of adevs pre asic reset from XGMI hive. */
4873 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4874 		r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context);
		/* TODO: Should we stop? */
4876 		if (r) {
4877 			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4878 				  r, adev_to_drm(tmp_adev)->unique);
4879 			tmp_adev->asic_reset_res = r;
4880 		}
4881 	}
4882 
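	/* Snapshot the VRAM-lost counter so we can tell below whether this
	 * reset lost VRAM contents (the guilty-job recheck is skipped if it did).
	 */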
4883 	tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
4884 	/* Actual ASIC resets if needed.*/
4885 	/* TODO Implement XGMI hive reset logic for SRIOV */
4886 	if (amdgpu_sriov_vf(adev)) {
4887 		r = amdgpu_device_reset_sriov(adev, job ? false : true);
4888 		if (r)
4889 			adev->asic_reset_res = r;
4890 	} else {
4891 		r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
4892 		if (r && r == -EAGAIN)
4893 			goto retry;
4894 	}
4895 
4896 skip_hw_reset:
4897 
4898 	/* Post ASIC reset for all devs .*/
4899 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4900 
4901 		/*
		 * Sometimes a later bad compute job can block a good gfx job because
		 * the gfx and compute rings share internal GC hardware. We add an
		 * additional guilty-job recheck step to find the real guilty job: it
		 * synchronously resubmits the first pending job and waits for it to
		 * signal. If the wait times out, that job is the real guilty one.
4907 		 */
4908 		if (amdgpu_gpu_recovery == 2 &&
4909 			!(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
4910 			amdgpu_device_recheck_guilty_jobs(
4911 				tmp_adev, device_list_handle, &reset_context);
4912 
4913 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4914 			struct amdgpu_ring *ring = tmp_adev->rings[i];
4915 
4916 			if (!ring || !ring->sched.thread)
4917 				continue;
4918 
			/* No point in resubmitting jobs if we didn't do a HW reset */
4920 			if (!tmp_adev->asic_reset_res && !job_signaled)
4921 				drm_sched_resubmit_jobs(&ring->sched);
4922 
4923 			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4924 		}
4925 
4926 		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4927 			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
4928 		}
4929 
4930 		tmp_adev->asic_reset_res = 0;
4931 
4932 		if (r) {
4933 			/* bad news, how to tell it to userspace ? */
4934 			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
4935 			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4936 		} else {
4937 			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
4938 		}
4939 	}
4940 
4941 skip_sched_resume:
4942 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4943 		/* unlock kfd: SRIOV would do it separately */
4944 		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);
4946 
		/* kfd_post_reset will do nothing if the kfd device is not initialized,
		 * so bring up kfd here if it wasn't initialized before
4949 		 */
4950 		if (!adev->kfd.init_complete)
4951 			amdgpu_amdkfd_device_init(adev);
4952 
4953 		if (audio_suspended)
4954 			amdgpu_device_resume_display_audio(tmp_adev);
4955 		amdgpu_device_unlock_adev(tmp_adev);
4956 	}
4957 
4958 skip_recovery:
4959 	if (hive) {
4960 		atomic_set(&hive->in_reset, 0);
4961 		mutex_unlock(&hive->hive_lock);
4962 		amdgpu_put_xgmi_hive(hive);
4963 	}
4964 
4965 	if (r && r != -EAGAIN)
4966 		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
4967 	return r;
4968 }
4969 
4970 /**
 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
4972  *
4973  * @adev: amdgpu_device pointer
4974  *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
4976  * and lanes) of the slot the device is in. Handles APUs and
4977  * virtualized environments where PCIE config space may not be available.
4978  */
4979 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
4980 {
4981 	struct pci_dev *pdev;
4982 	enum pci_bus_speed speed_cap, platform_speed_cap;
4983 	enum pcie_link_width platform_link_width;
4984 
4985 	if (amdgpu_pcie_gen_cap)
4986 		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
4987 
4988 	if (amdgpu_pcie_lane_cap)
4989 		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
4990 
4991 	/* covers APUs as well */
4992 	if (pci_is_root_bus(adev->pdev->bus)) {
4993 		if (adev->pm.pcie_gen_mask == 0)
4994 			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
4995 		if (adev->pm.pcie_mlw_mask == 0)
4996 			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
4997 		return;
4998 	}
4999 
5000 	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
5001 		return;
5002 
5003 	pcie_bandwidth_available(adev->pdev, NULL,
5004 				 &platform_speed_cap, &platform_link_width);
5005 
5006 	if (adev->pm.pcie_gen_mask == 0) {
5007 		/* asic caps */
5008 		pdev = adev->pdev;
5009 		speed_cap = pcie_get_speed_cap(pdev);
5010 		if (speed_cap == PCI_SPEED_UNKNOWN) {
5011 			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5012 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5013 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5014 		} else {
5015 			if (speed_cap == PCIE_SPEED_32_0GT)
5016 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5017 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5018 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5019 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5020 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
5021 			else if (speed_cap == PCIE_SPEED_16_0GT)
5022 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5023 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5024 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5025 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
5026 			else if (speed_cap == PCIE_SPEED_8_0GT)
5027 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5028 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5029 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5030 			else if (speed_cap == PCIE_SPEED_5_0GT)
5031 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5032 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
5033 			else
5034 				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
5035 		}
5036 		/* platform caps */
5037 		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5038 			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5039 						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5040 		} else {
5041 			if (platform_speed_cap == PCIE_SPEED_32_0GT)
5042 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5043 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5044 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5045 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5046 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
5047 			else if (platform_speed_cap == PCIE_SPEED_16_0GT)
5048 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5049 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5050 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5051 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
5052 			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5053 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5054 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5055 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
5056 			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5057 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5058 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5059 			else
5060 				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
5061 
5062 		}
5063 	}
5064 	if (adev->pm.pcie_mlw_mask == 0) {
5065 		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5066 			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
5067 		} else {
5068 			switch (platform_link_width) {
5069 			case PCIE_LNK_X32:
5070 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
5071 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5072 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5073 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5074 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5075 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5076 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5077 				break;
5078 			case PCIE_LNK_X16:
5079 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5080 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5081 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5082 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5083 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5084 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5085 				break;
5086 			case PCIE_LNK_X12:
5087 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5088 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5089 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5090 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5091 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5092 				break;
5093 			case PCIE_LNK_X8:
5094 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5095 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5096 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5097 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5098 				break;
5099 			case PCIE_LNK_X4:
5100 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5101 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5102 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5103 				break;
5104 			case PCIE_LNK_X2:
5105 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5106 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5107 				break;
5108 			case PCIE_LNK_X1:
5109 				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
5110 				break;
5111 			default:
5112 				break;
5113 			}
5114 		}
5115 	}
5116 }
5117 
5118 int amdgpu_device_baco_enter(struct drm_device *dev)
5119 {
5120 	struct amdgpu_device *adev = drm_to_adev(dev);
5121 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5122 
5123 	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
5124 		return -ENOTSUPP;
5125 
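	/* RAS-capable parts keep doorbell interrupts disabled while in BACO;
	 * they are re-enabled again in amdgpu_device_baco_exit().
	 */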
5126 	if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
5127 		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5128 
5129 	return amdgpu_dpm_baco_enter(adev);
5130 }
5131 
5132 int amdgpu_device_baco_exit(struct drm_device *dev)
5133 {
5134 	struct amdgpu_device *adev = drm_to_adev(dev);
5135 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5136 	int ret = 0;
5137 
5138 	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
5139 		return -ENOTSUPP;
5140 
5141 	ret = amdgpu_dpm_baco_exit(adev);
5142 	if (ret)
5143 		return ret;
5144 
5145 	if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
5146 		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5147 
5148 	return 0;
5149 }
5150 
5151 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
5152 {
5153 	int i;
5154 
5155 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5156 		struct amdgpu_ring *ring = adev->rings[i];
5157 
5158 		if (!ring || !ring->sched.thread)
5159 			continue;
5160 
5161 		cancel_delayed_work_sync(&ring->sched.work_tdr);
5162 	}
5163 }
5164 
5165 /**
5166  * amdgpu_pci_error_detected - Called when a PCI error is detected.
5167  * @pdev: PCI device struct
5168  * @state: PCI channel state
5169  *
5170  * Description: Called when a PCI error is detected.
5171  *
5172  * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
5173  */
5174 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5175 {
5176 	struct drm_device *dev = pci_get_drvdata(pdev);
5177 	struct amdgpu_device *adev = drm_to_adev(dev);
5178 	int i;
5179 
5180 	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5181 
5182 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
5183 		DRM_WARN("No support for XGMI hive yet...");
5184 		return PCI_ERS_RESULT_DISCONNECT;
5185 	}
5186 
5187 	switch (state) {
5188 	case pci_channel_io_normal:
5189 		return PCI_ERS_RESULT_CAN_RECOVER;
5190 	/* Fatal error, prepare for slot reset */
5191 	case pci_channel_io_frozen:
5192 		/*
5193 		 * Cancel and wait for all TDRs in progress if failing to
5194 		 * set  adev->in_gpu_reset in amdgpu_device_lock_adev
5195 		 *
5196 		 * Locking adev->reset_sem will prevent any external access
5197 		 * to GPU during PCI error recovery
5198 		 */
5199 		while (!amdgpu_device_lock_adev(adev, NULL))
5200 			amdgpu_cancel_all_tdr(adev);
5201 
5202 		/*
5203 		 * Block any work scheduling as we do for regular GPU reset
5204 		 * for the duration of the recovery
5205 		 */
5206 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5207 			struct amdgpu_ring *ring = adev->rings[i];
5208 
5209 			if (!ring || !ring->sched.thread)
5210 				continue;
5211 
5212 			drm_sched_stop(&ring->sched, NULL);
5213 		}
5214 		atomic_inc(&adev->gpu_reset_counter);
5215 		return PCI_ERS_RESULT_NEED_RESET;
5216 	case pci_channel_io_perm_failure:
5217 		/* Permanent error, prepare for device removal */
5218 		return PCI_ERS_RESULT_DISCONNECT;
5219 	}
5220 
5221 	return PCI_ERS_RESULT_NEED_RESET;
5222 }
5223 
5224 /**
5225  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5226  * @pdev: pointer to PCI device
5227  */
5228 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5229 {
5231 	DRM_INFO("PCI error: mmio enabled callback!!\n");
5232 
5233 	/* TODO - dump whatever for debugging purposes */
5234 
	/* This is called only if amdgpu_pci_error_detected returns
5236 	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
5237 	 * works, no need to reset slot.
5238 	 */
5239 
5240 	return PCI_ERS_RESULT_RECOVERED;
5241 }
5242 
5243 /**
5244  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5245  * @pdev: PCI device struct
5246  *
5247  * Description: This routine is called by the pci error recovery
5248  * code after the PCI slot has been reset, just before we
5249  * should resume normal operations.
5250  */
5251 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5252 {
5253 	struct drm_device *dev = pci_get_drvdata(pdev);
5254 	struct amdgpu_device *adev = drm_to_adev(dev);
5255 	int r, i;
5256 	struct amdgpu_reset_context reset_context;
5257 	u32 memsize;
5258 	struct list_head device_list;
5259 
5260 	DRM_INFO("PCI error: slot reset callback!!\n");
5261 
5262 	memset(&reset_context, 0, sizeof(reset_context));
5263 
5264 	INIT_LIST_HEAD(&device_list);
5265 	list_add_tail(&adev->reset_list, &device_list);
5266 
5267 	/* wait for asic to come out of reset */
5268 	msleep(500);
5269 
5270 	/* Restore PCI confspace */
5271 	amdgpu_device_load_pci_state(pdev);
5272 
5273 	/* confirm  ASIC came out of reset */
5274 	for (i = 0; i < adev->usec_timeout; i++) {
5275 		memsize = amdgpu_asic_get_config_memsize(adev);
5276 
5277 		if (memsize != 0xffffffff)
5278 			break;
5279 		udelay(1);
5280 	}
5281 	if (memsize == 0xffffffff) {
5282 		r = -ETIME;
5283 		goto out;
5284 	}
5285 
5286 	reset_context.method = AMD_RESET_METHOD_NONE;
5287 	reset_context.reset_req_dev = adev;
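	/* The slot reset already reset the adapter, so request a full
	 * re-initialization but skip another HW reset.
	 */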
5288 	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5289 	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
5290 
5291 	adev->in_pci_err_recovery = true;
5292 	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
5293 	adev->in_pci_err_recovery = false;
5294 	if (r)
5295 		goto out;
5296 
5297 	r = amdgpu_do_asic_reset(&device_list, &reset_context);
5298 
5299 out:
5300 	if (!r) {
5301 		if (amdgpu_device_cache_pci_state(adev->pdev))
5302 			pci_restore_state(adev->pdev);
5303 
5304 		DRM_INFO("PCIe error recovery succeeded\n");
5305 	} else {
5306 		DRM_ERROR("PCIe error recovery failed, err:%d", r);
5307 		amdgpu_device_unlock_adev(adev);
5308 	}
5309 
5310 	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5311 }
5312 
5313 /**
5314  * amdgpu_pci_resume() - resume normal ops after PCI reset
5315  * @pdev: pointer to PCI device
5316  *
5317  * Called when the error recovery driver tells us that its
5318  * OK to resume normal operation.
5319  */
5320 void amdgpu_pci_resume(struct pci_dev *pdev)
5321 {
5322 	struct drm_device *dev = pci_get_drvdata(pdev);
5323 	struct amdgpu_device *adev = drm_to_adev(dev);
5324 	int i;
5325 
5327 	DRM_INFO("PCI error: resume callback!!\n");
5328 
5329 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5330 		struct amdgpu_ring *ring = adev->rings[i];
5331 
5332 		if (!ring || !ring->sched.thread)
5333 			continue;
5334 
5336 		drm_sched_resubmit_jobs(&ring->sched);
5337 		drm_sched_start(&ring->sched, true);
5338 	}
5339 
5340 	amdgpu_device_unlock_adev(adev);
5341 }
5342 
5343 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5344 {
5345 	struct drm_device *dev = pci_get_drvdata(pdev);
5346 	struct amdgpu_device *adev = drm_to_adev(dev);
5347 	int r;
5348 
5349 	r = pci_save_state(pdev);
5350 	if (!r) {
5351 		kfree(adev->pci_state);
5352 
5353 		adev->pci_state = pci_store_saved_state(pdev);
5354 
5355 		if (!adev->pci_state) {
5356 			DRM_ERROR("Failed to store PCI saved state");
5357 			return false;
5358 		}
5359 	} else {
5360 		DRM_WARN("Failed to save PCI state, err:%d\n", r);
5361 		return false;
5362 	}
5363 
5364 	return true;
5365 }
5366 
5367 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5368 {
5369 	struct drm_device *dev = pci_get_drvdata(pdev);
5370 	struct amdgpu_device *adev = drm_to_adev(dev);
5371 	int r;
5372 
5373 	if (!adev->pci_state)
5374 		return false;
5375 
5376 	r = pci_load_saved_state(pdev, adev->pci_state);
5377 
5378 	if (!r) {
5379 		pci_restore_state(pdev);
5380 	} else {
5381 		DRM_WARN("Failed to load PCI state, err:%d\n", r);
5382 		return false;
5383 	}
5384 
5385 	return true;
5386 }
5387 
5388 
5389