xref: /openbmc/linux/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c (revision f97cee494dc92395a668445bcd24d34c89f4ff8c)
1 /*
2  * Copyright 2008 Advanced Micro Devices, Inc.
3  * Copyright 2008 Red Hat Inc.
4  * Copyright 2009 Jerome Glisse.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22  * OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors: Dave Airlie
25  *          Alex Deucher
26  *          Jerome Glisse
27  */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33 
34 #include <drm/drm_atomic_helper.h>
35 #include <drm/drm_probe_helper.h>
36 #include <drm/amdgpu_drm.h>
37 #include <linux/vgaarb.h>
38 #include <linux/vga_switcheroo.h>
39 #include <linux/efi.h>
40 #include "amdgpu.h"
41 #include "amdgpu_trace.h"
42 #include "amdgpu_i2c.h"
43 #include "atom.h"
44 #include "amdgpu_atombios.h"
45 #include "amdgpu_atomfirmware.h"
46 #include "amd_pcie.h"
47 #ifdef CONFIG_DRM_AMDGPU_SI
48 #include "si.h"
49 #endif
50 #ifdef CONFIG_DRM_AMDGPU_CIK
51 #include "cik.h"
52 #endif
53 #include "vi.h"
54 #include "soc15.h"
55 #include "nv.h"
56 #include "bif/bif_4_1_d.h"
57 #include <linux/pci.h>
58 #include <linux/firmware.h>
59 #include "amdgpu_vf_error.h"
60 
61 #include "amdgpu_amdkfd.h"
62 #include "amdgpu_pm.h"
63 
64 #include "amdgpu_xgmi.h"
65 #include "amdgpu_ras.h"
66 #include "amdgpu_pmu.h"
67 #include "amdgpu_fru_eeprom.h"
68 
69 #include <linux/suspend.h>
70 #include <drm/task_barrier.h>
71 #include <linux/pm_runtime.h>
72 
73 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
74 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
75 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
76 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
77 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
78 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
79 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
80 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
81 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
82 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
83 
84 #define AMDGPU_RESUME_MS		2000
85 
86 const char *amdgpu_asic_name[] = {
87 	"TAHITI",
88 	"PITCAIRN",
89 	"VERDE",
90 	"OLAND",
91 	"HAINAN",
92 	"BONAIRE",
93 	"KAVERI",
94 	"KABINI",
95 	"HAWAII",
96 	"MULLINS",
97 	"TOPAZ",
98 	"TONGA",
99 	"FIJI",
100 	"CARRIZO",
101 	"STONEY",
102 	"POLARIS10",
103 	"POLARIS11",
104 	"POLARIS12",
105 	"VEGAM",
106 	"VEGA10",
107 	"VEGA12",
108 	"VEGA20",
109 	"RAVEN",
110 	"ARCTURUS",
111 	"RENOIR",
112 	"NAVI10",
113 	"NAVI14",
114 	"NAVI12",
115 	"SIENNA_CICHLID",
116 	"NAVY_FLOUNDER",
117 	"LAST",
118 };
119 
120 /**
121  * DOC: pcie_replay_count
122  *
123  * The amdgpu driver provides a sysfs API for reporting the total number
124  * of PCIe replays (NAKs).
125  * The file pcie_replay_count is used for this and returns the total
126  * number of replays as a sum of the NAKs generated and NAKs received.
127  */
128 
129 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
130 		struct device_attribute *attr, char *buf)
131 {
132 	struct drm_device *ddev = dev_get_drvdata(dev);
133 	struct amdgpu_device *adev = ddev->dev_private;
134 	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
135 
136 	return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
137 }
138 
139 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
140 		amdgpu_device_get_pcie_replay_count, NULL);
141 
142 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
143 
144 /**
145  * DOC: product_name
146  *
147  * The amdgpu driver provides a sysfs API for reporting the product name
148  * for the device.
149  * The file product_name is used for this and returns the product name
150  * as returned from the FRU.
151  * NOTE: This is only available for certain server cards
152  */
153 
154 static ssize_t amdgpu_device_get_product_name(struct device *dev,
155 		struct device_attribute *attr, char *buf)
156 {
157 	struct drm_device *ddev = dev_get_drvdata(dev);
158 	struct amdgpu_device *adev = ddev->dev_private;
159 
160 	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
161 }
162 
163 static DEVICE_ATTR(product_name, S_IRUGO,
164 		amdgpu_device_get_product_name, NULL);
165 
166 /**
167  * DOC: product_number
168  *
169  * The amdgpu driver provides a sysfs API for reporting the part number
170  * for the device.
171  * The file product_number is used for this and returns the part number
172  * as returned from the FRU.
173  * NOTE: This is only available for certain server cards
174  */
175 
176 static ssize_t amdgpu_device_get_product_number(struct device *dev,
177 		struct device_attribute *attr, char *buf)
178 {
179 	struct drm_device *ddev = dev_get_drvdata(dev);
180 	struct amdgpu_device *adev = ddev->dev_private;
181 
182 	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
183 }
184 
185 static DEVICE_ATTR(product_number, S_IRUGO,
186 		amdgpu_device_get_product_number, NULL);
187 
188 /**
189  * DOC: serial_number
190  *
191  * The amdgpu driver provides a sysfs API for reporting the serial number
192  * for the device.
193  * The file serial_number is used for this and returns the serial number
194  * as returned from the FRU.
195  * NOTE: This is only available for certain server cards
196  */
197 
198 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
199 		struct device_attribute *attr, char *buf)
200 {
201 	struct drm_device *ddev = dev_get_drvdata(dev);
202 	struct amdgpu_device *adev = ddev->dev_private;
203 
204 	return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
205 }
206 
207 static DEVICE_ATTR(serial_number, S_IRUGO,
208 		amdgpu_device_get_serial_number, NULL);
209 
210 /**
211  * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
212  *
213  * @dev: drm_device pointer
214  *
215  * Returns true if the device is a dGPU with HG/PX power control,
216  * otherwise returns false.
217  */
218 bool amdgpu_device_supports_boco(struct drm_device *dev)
219 {
220 	struct amdgpu_device *adev = dev->dev_private;
221 
222 	if (adev->flags & AMD_IS_PX)
223 		return true;
224 	return false;
225 }
226 
227 /**
228  * amdgpu_device_supports_baco - Does the device support BACO
229  *
230  * @dev: drm_device pointer
231  *
232  * Returns true if the device supports BACO,
233  * otherwise returns false.
234  */
235 bool amdgpu_device_supports_baco(struct drm_device *dev)
236 {
237 	struct amdgpu_device *adev = dev->dev_private;
238 
239 	return amdgpu_asic_supports_baco(adev);
240 }
241 
242 /**
243  * VRAM access helper functions.
244  *
245  * amdgpu_device_vram_access - read/write a buffer in vram
246  *
247  * @adev: amdgpu_device pointer
248  * @pos: offset of the buffer in vram
249  * @buf: virtual address of the buffer in system memory
250  * @size: read/write size in bytes; the buffer at @buf must hold at least @size bytes
251  * @write: true - write to vram, otherwise - read from vram
252  */
253 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
254 			       uint32_t *buf, size_t size, bool write)
255 {
256 	unsigned long flags;
257 	uint32_t hi = ~0;
258 	uint64_t last;
259 
260 
261 #ifdef CONFIG_64BIT
262 	last = min(pos + size, adev->gmc.visible_vram_size);
263 	if (last > pos) {
264 		void __iomem *addr = adev->mman.aper_base_kaddr + pos;
265 		size_t count = last - pos;
266 
267 		if (write) {
268 			memcpy_toio(addr, buf, count);
269 			mb();
270 			amdgpu_asic_flush_hdp(adev, NULL);
271 		} else {
272 			amdgpu_asic_invalidate_hdp(adev, NULL);
273 			mb();
274 			memcpy_fromio(buf, addr, count);
275 		}
276 
277 		if (count == size)
278 			return;
279 
280 		pos += count;
281 		buf += count / 4;
282 		size -= count;
283 	}
284 #endif
285 
286 	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
287 	for (last = pos + size; pos < last; pos += 4) {
288 		uint32_t tmp = pos >> 31;
289 
290 		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
291 		if (tmp != hi) {
292 			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
293 			hi = tmp;
294 		}
295 		if (write)
296 			WREG32_NO_KIQ(mmMM_DATA, *buf++);
297 		else
298 			*buf++ = RREG32_NO_KIQ(mmMM_DATA);
299 	}
300 	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
301 }
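
/*
 * A minimal usage sketch (illustrative only, not part of the driver):
 * read the first 16 bytes of VRAM into a stack buffer and then write the
 * same data back.  @pos and @size are in bytes and are assumed to be
 * dword aligned here.
 *
 *	uint32_t data[4];
 *
 *	amdgpu_device_vram_access(adev, 0, data, sizeof(data), false);
 *	amdgpu_device_vram_access(adev, 0, data, sizeof(data), true);
 */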
302 
303 /*
304  * MMIO register access helper functions.
305  */
306 /**
307  * amdgpu_mm_rreg - read a memory mapped IO register
308  *
309  * @adev: amdgpu_device pointer
310  * @reg: dword aligned register offset
311  * @acc_flags: access flags which require special behavior
312  *
313  * Returns the 32 bit value from the offset specified.
314  */
315 uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
316 			uint32_t acc_flags)
317 {
318 	uint32_t ret;
319 
320 	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
321 		return amdgpu_kiq_rreg(adev, reg);
322 
323 	if ((reg * 4) < adev->rmmio_size)
324 		ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
325 	else {
326 		unsigned long flags;
327 
328 		spin_lock_irqsave(&adev->mmio_idx_lock, flags);
329 		writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
330 		ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
331 		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
332 	}
333 	trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
334 	return ret;
335 }
336 
337 /*
338  * MMIO register read with byte offset helper functions
339  * @offset: byte offset from the MMIO start
340  *
341  */
342 
343 /**
344  * amdgpu_mm_rreg8 - read a memory mapped IO register
345  *
346  * @adev: amdgpu_device pointer
347  * @offset: byte aligned register offset
348  *
349  * Returns the 8 bit value from the offset specified.
350  */
351 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) {
352 	if (offset < adev->rmmio_size)
353 		return (readb(adev->rmmio + offset));
354 	BUG();
355 }
356 
357 /*
358  * MMIO register write with byte offset helper functions
359  * @offset: byte offset from the MMIO start
360  * @value: the value to be written to the register
361  *
362  */
363 /**
364  * amdgpu_mm_wreg8 - write a memory mapped IO register
365  *
366  * @adev: amdgpu_device pointer
367  * @offset: byte aligned register offset
368  * @value: 8 bit value to write
369  *
370  * Writes the value specified to the offset specified.
371  */
372 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) {
373 	if (offset < adev->rmmio_size)
374 		writeb(value, adev->rmmio + offset);
375 	else
376 		BUG();
377 }
378 
379 static inline void amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags)
380 {
381 	trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);
382 
383 	if ((reg * 4) < adev->rmmio_size)
384 		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
385 	else {
386 		unsigned long flags;
387 
388 		spin_lock_irqsave(&adev->mmio_idx_lock, flags);
389 		writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
390 		writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
391 		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
392 	}
393 }
394 
395 /**
396  * amdgpu_mm_wreg - write to a memory mapped IO register
397  *
398  * @adev: amdgpu_device pointer
399  * @reg: dword aligned register offset
400  * @v: 32 bit value to write to the register
401  * @acc_flags: access flags which require special behavior
402  *
403  * Writes the value specified to the offset specified.
404  */
405 void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
406 		    uint32_t acc_flags)
407 {
408 	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
409 		return amdgpu_kiq_wreg(adev, reg, v);
410 
411 	amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
412 }
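
/*
 * A minimal usage sketch (illustrative only): a read-modify-write of a
 * dword register through these helpers.  "reg_offset" and "bit_mask" are
 * placeholders, not real register definitions.
 *
 *	uint32_t tmp;
 *
 *	tmp = amdgpu_mm_rreg(adev, reg_offset, 0);
 *	tmp |= bit_mask;
 *	amdgpu_mm_wreg(adev, reg_offset, tmp, 0);
 *
 * Driver code normally goes through the RREG32()/WREG32() style macros,
 * which wrap these functions.
 */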
413 
414 /*
415  * amdgpu_mm_wreg_mmio_rlc - write a register either via MMIO or via the RLC path if it is in range
416  *
417  * This function is invoked only for debugfs register access.
418  */
419 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
420 		    uint32_t acc_flags)
421 {
422 	if (amdgpu_sriov_fullaccess(adev) &&
423 		adev->gfx.rlc.funcs &&
424 		adev->gfx.rlc.funcs->is_rlcg_access_range) {
425 
426 		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
427 			return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
428 	}
429 
430 	amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
431 }
432 
433 /**
434  * amdgpu_io_rreg - read an IO register
435  *
436  * @adev: amdgpu_device pointer
437  * @reg: dword aligned register offset
438  *
439  * Returns the 32 bit value from the offset specified.
440  */
441 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
442 {
443 	if ((reg * 4) < adev->rio_mem_size)
444 		return ioread32(adev->rio_mem + (reg * 4));
445 	else {
446 		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
447 		return ioread32(adev->rio_mem + (mmMM_DATA * 4));
448 	}
449 }
450 
451 /**
452  * amdgpu_io_wreg - write to an IO register
453  *
454  * @adev: amdgpu_device pointer
455  * @reg: dword aligned register offset
456  * @v: 32 bit value to write to the register
457  *
458  * Writes the value specified to the offset specified.
459  */
460 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
461 {
462 	if ((reg * 4) < adev->rio_mem_size)
463 		iowrite32(v, adev->rio_mem + (reg * 4));
464 	else {
465 		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
466 		iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
467 	}
468 }
469 
470 /**
471  * amdgpu_mm_rdoorbell - read a doorbell dword
472  *
473  * @adev: amdgpu_device pointer
474  * @index: doorbell index
475  *
476  * Returns the value in the doorbell aperture at the
477  * requested doorbell index (CIK).
478  */
479 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
480 {
481 	if (index < adev->doorbell.num_doorbells) {
482 		return readl(adev->doorbell.ptr + index);
483 	} else {
484 		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
485 		return 0;
486 	}
487 }
488 
489 /**
490  * amdgpu_mm_wdoorbell - write a doorbell dword
491  *
492  * @adev: amdgpu_device pointer
493  * @index: doorbell index
494  * @v: value to write
495  *
496  * Writes @v to the doorbell aperture at the
497  * requested doorbell index (CIK).
498  */
499 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
500 {
501 	if (index < adev->doorbell.num_doorbells) {
502 		writel(v, adev->doorbell.ptr + index);
503 	} else {
504 		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
505 	}
506 }
507 
508 /**
509  * amdgpu_mm_rdoorbell64 - read a doorbell Qword
510  *
511  * @adev: amdgpu_device pointer
512  * @index: doorbell index
513  *
514  * Returns the value in the doorbell aperture at the
515  * requested doorbell index (VEGA10+).
516  */
517 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
518 {
519 	if (index < adev->doorbell.num_doorbells) {
520 		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
521 	} else {
522 		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
523 		return 0;
524 	}
525 }
526 
527 /**
528  * amdgpu_mm_wdoorbell64 - write a doorbell Qword
529  *
530  * @adev: amdgpu_device pointer
531  * @index: doorbell index
532  * @v: value to write
533  *
534  * Writes @v to the doorbell aperture at the
535  * requested doorbell index (VEGA10+).
536  */
537 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
538 {
539 	if (index < adev->doorbell.num_doorbells) {
540 		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
541 	} else {
542 		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
543 	}
544 }
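
/*
 * A minimal usage sketch (illustrative only), assuming a struct
 * amdgpu_ring with a valid doorbell assignment: ring the doorbell so the
 * GPU picks up the new write pointer.
 *
 *	if (ring->use_doorbell)
 *		amdgpu_mm_wdoorbell64(adev, ring->doorbell_index, ring->wptr);
 *
 * Pre-VEGA10 parts use the 32-bit amdgpu_mm_wdoorbell() the same way.
 */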
545 
546 /**
547  * amdgpu_invalid_rreg - dummy reg read function
548  *
549  * @adev: amdgpu device pointer
550  * @reg: offset of register
551  *
552  * Dummy register read function.  Used for register blocks
553  * that certain asics don't have (all asics).
554  * Returns the value in the register.
555  */
556 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
557 {
558 	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
559 	BUG();
560 	return 0;
561 }
562 
563 /**
564  * amdgpu_invalid_wreg - dummy reg write function
565  *
566  * @adev: amdgpu device pointer
567  * @reg: offset of register
568  * @v: value to write to the register
569  *
570  * Dummy register write function.  Used for register blocks
571  * that certain asics don't have (all asics).
572  */
573 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
574 {
575 	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
576 		  reg, v);
577 	BUG();
578 }
579 
580 /**
581  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
582  *
583  * @adev: amdgpu device pointer
584  * @reg: offset of register
585  *
586  * Dummy register read function.  Used for register blocks
587  * that certain asics don't have (all asics).
588  * Returns the value in the register.
589  */
590 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
591 {
592 	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
593 	BUG();
594 	return 0;
595 }
596 
597 /**
598  * amdgpu_invalid_wreg64 - dummy reg write function
599  *
600  * @adev: amdgpu device pointer
601  * @reg: offset of register
602  * @v: value to write to the register
603  *
604  * Dummy register write function.  Used for register blocks
605  * that certain asics don't have (all asics).
606  */
607 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
608 {
609 	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
610 		  reg, v);
611 	BUG();
612 }
613 
614 /**
615  * amdgpu_block_invalid_rreg - dummy reg read function
616  *
617  * @adev: amdgpu device pointer
618  * @block: offset of instance
619  * @reg: offset of register
620  *
621  * Dummy register read function.  Used for register blocks
622  * that certain asics don't have (all asics).
623  * Returns the value in the register.
624  */
625 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
626 					  uint32_t block, uint32_t reg)
627 {
628 	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
629 		  reg, block);
630 	BUG();
631 	return 0;
632 }
633 
634 /**
635  * amdgpu_block_invalid_wreg - dummy reg write function
636  *
637  * @adev: amdgpu device pointer
638  * @block: offset of instance
639  * @reg: offset of register
640  * @v: value to write to the register
641  *
642  * Dummy register write function.  Used for register blocks
643  * that certain asics don't have (all asics).
644  */
645 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
646 				      uint32_t block,
647 				      uint32_t reg, uint32_t v)
648 {
649 	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
650 		  reg, block, v);
651 	BUG();
652 }
653 
654 /**
655  * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
656  *
657  * @adev: amdgpu device pointer
658  *
659  * Allocates a scratch page of VRAM for use by various things in the
660  * driver.
661  */
662 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
663 {
664 	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
665 				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
666 				       &adev->vram_scratch.robj,
667 				       &adev->vram_scratch.gpu_addr,
668 				       (void **)&adev->vram_scratch.ptr);
669 }
670 
671 /**
672  * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
673  *
674  * @adev: amdgpu device pointer
675  *
676  * Frees the VRAM scratch page.
677  */
678 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
679 {
680 	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
681 }
682 
683 /**
684  * amdgpu_device_program_register_sequence - program an array of registers.
685  *
686  * @adev: amdgpu_device pointer
687  * @registers: pointer to the register array
688  * @array_size: size of the register array
689  *
690  * Programs an array of registers with AND and OR masks.
691  * This is a helper for setting golden registers.
692  */
693 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
694 					     const u32 *registers,
695 					     const u32 array_size)
696 {
697 	u32 tmp, reg, and_mask, or_mask;
698 	int i;
699 
700 	if (array_size % 3)
701 		return;
702 
703 	for (i = 0; i < array_size; i += 3) {
704 		reg = registers[i + 0];
705 		and_mask = registers[i + 1];
706 		or_mask = registers[i + 2];
707 
708 		if (and_mask == 0xffffffff) {
709 			tmp = or_mask;
710 		} else {
711 			tmp = RREG32(reg);
712 			tmp &= ~and_mask;
713 			if (adev->family >= AMDGPU_FAMILY_AI)
714 				tmp |= (or_mask & and_mask);
715 			else
716 				tmp |= or_mask;
717 		}
718 		WREG32(reg, tmp);
719 	}
720 }
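
/*
 * A minimal usage sketch (illustrative only): golden settings are passed
 * as {register, and_mask, or_mask} triplets.  The register names and
 * masks below are placeholders, not real golden values.
 *
 *	static const u32 example_golden_settings[] = {
 *		mmEXAMPLE_REG_A, 0xffffffff, 0x00000001,
 *		mmEXAMPLE_REG_B, 0x0000ff00, 0x00001200,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *						ARRAY_SIZE(example_golden_settings));
 */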
721 
722 /**
723  * amdgpu_device_pci_config_reset - reset the GPU
724  *
725  * @adev: amdgpu_device pointer
726  *
727  * Resets the GPU using the pci config reset sequence.
728  * Only applicable to asics prior to vega10.
729  */
730 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
731 {
732 	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
733 }
734 
735 /*
736  * GPU doorbell aperture helpers function.
737  */
738 /**
739  * amdgpu_device_doorbell_init - Init doorbell driver information.
740  *
741  * @adev: amdgpu_device pointer
742  *
743  * Init doorbell driver information (CIK)
744  * Returns 0 on success, error on failure.
745  */
746 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
747 {
748 
749 	/* No doorbell on SI hardware generation */
750 	if (adev->asic_type < CHIP_BONAIRE) {
751 		adev->doorbell.base = 0;
752 		adev->doorbell.size = 0;
753 		adev->doorbell.num_doorbells = 0;
754 		adev->doorbell.ptr = NULL;
755 		return 0;
756 	}
757 
758 	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
759 		return -EINVAL;
760 
761 	amdgpu_asic_init_doorbell_index(adev);
762 
763 	/* doorbell bar mapping */
764 	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
765 	adev->doorbell.size = pci_resource_len(adev->pdev, 2);
766 
767 	adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
768 					     adev->doorbell_index.max_assignment+1);
769 	if (adev->doorbell.num_doorbells == 0)
770 		return -EINVAL;
771 
772 	/* For Vega, reserve and map two pages on doorbell BAR since SDMA
773 	 * paging queue doorbells use the second page. The
774 	 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
775 	 * doorbells are in the first page. So with paging queue enabled,
776 	 * the max num_doorbells is extended by one page (0x400 dwords).
777 	 */
778 	if (adev->asic_type >= CHIP_VEGA10)
779 		adev->doorbell.num_doorbells += 0x400;
780 
781 	adev->doorbell.ptr = ioremap(adev->doorbell.base,
782 				     adev->doorbell.num_doorbells *
783 				     sizeof(u32));
784 	if (adev->doorbell.ptr == NULL)
785 		return -ENOMEM;
786 
787 	return 0;
788 }
789 
790 /**
791  * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
792  *
793  * @adev: amdgpu_device pointer
794  *
795  * Tear down doorbell driver information (CIK)
796  */
797 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
798 {
799 	iounmap(adev->doorbell.ptr);
800 	adev->doorbell.ptr = NULL;
801 }
802 
803 
804 
805 /*
806  * amdgpu_device_wb_*()
807  * Writeback is the method by which the GPU updates special pages in memory
808  * with the status of certain GPU events (fences, ring pointers, etc.).
809  */
810 
811 /**
812  * amdgpu_device_wb_fini - Disable Writeback and free memory
813  *
814  * @adev: amdgpu_device pointer
815  *
816  * Disables Writeback and frees the Writeback memory (all asics).
817  * Used at driver shutdown.
818  */
819 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
820 {
821 	if (adev->wb.wb_obj) {
822 		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
823 				      &adev->wb.gpu_addr,
824 				      (void **)&adev->wb.wb);
825 		adev->wb.wb_obj = NULL;
826 	}
827 }
828 
829 /**
830  * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
831  *
832  * @adev: amdgpu_device pointer
833  *
834  * Initializes writeback and allocates writeback memory (all asics).
835  * Used at driver startup.
836  * Returns 0 on success or a negative error code on failure.
837  */
838 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
839 {
840 	int r;
841 
842 	if (adev->wb.wb_obj == NULL) {
843 		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
844 		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
845 					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
846 					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
847 					    (void **)&adev->wb.wb);
848 		if (r) {
849 			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
850 			return r;
851 		}
852 
853 		adev->wb.num_wb = AMDGPU_MAX_WB;
854 		memset(&adev->wb.used, 0, sizeof(adev->wb.used));
855 
856 		/* clear wb memory */
857 		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
858 	}
859 
860 	return 0;
861 }
862 
863 /**
864  * amdgpu_device_wb_get - Allocate a wb entry
865  *
866  * @adev: amdgpu_device pointer
867  * @wb: wb index
868  *
869  * Allocate a wb slot for use by the driver (all asics).
870  * Returns 0 on success or -EINVAL on failure.
871  */
872 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
873 {
874 	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
875 
876 	if (offset < adev->wb.num_wb) {
877 		__set_bit(offset, adev->wb.used);
878 		*wb = offset << 3; /* convert to dw offset */
879 		return 0;
880 	} else {
881 		return -EINVAL;
882 	}
883 }
884 
885 /**
886  * amdgpu_device_wb_free - Free a wb entry
887  *
888  * @adev: amdgpu_device pointer
889  * @wb: wb index
890  *
891  * Free a wb slot allocated for use by the driver (all asics)
892  */
893 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
894 {
895 	wb >>= 3;
896 	if (wb < adev->wb.num_wb)
897 		__clear_bit(wb, adev->wb.used);
898 }
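
/*
 * A minimal usage sketch (illustrative only): allocate a writeback slot,
 * let the GPU report a value into it, then release it.  The returned
 * index is a dword offset into adev->wb.wb, and the GPU address of the
 * slot is adev->wb.gpu_addr + index * 4.
 *
 *	u32 wb_index, value;
 *
 *	if (amdgpu_device_wb_get(adev, &wb_index))
 *		return -ENOMEM;
 *	... hand adev->wb.gpu_addr + wb_index * 4 to the GPU as a
 *	    fence/rptr/wptr report address ...
 *	value = adev->wb.wb[wb_index];
 *	amdgpu_device_wb_free(adev, wb_index);
 */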
899 
900 /**
901  * amdgpu_device_resize_fb_bar - try to resize FB BAR
902  *
903  * @adev: amdgpu_device pointer
904  *
905  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
906  * to fail, but if any of the BARs is not accessible after the resize we abort
907  * driver loading by returning -ENODEV.
908  */
909 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
910 {
911 	u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
912 	u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
913 	struct pci_bus *root;
914 	struct resource *res;
915 	unsigned i;
916 	u16 cmd;
917 	int r;
918 
919 	/* Bypass for VF */
920 	if (amdgpu_sriov_vf(adev))
921 		return 0;
922 
923 	/* skip if the bios has already enabled large BAR */
924 	if (adev->gmc.real_vram_size &&
925 	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
926 		return 0;
927 
928 	/* Check if the root BUS has 64bit memory resources */
929 	root = adev->pdev->bus;
930 	while (root->parent)
931 		root = root->parent;
932 
933 	pci_bus_for_each_resource(root, res, i) {
934 		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
935 		    res->start > 0x100000000ull)
936 			break;
937 	}
938 
939 	/* Trying to resize is pointless without a root hub window above 4GB */
940 	if (!res)
941 		return 0;
942 
943 	/* Disable memory decoding while we change the BAR addresses and size */
944 	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
945 	pci_write_config_word(adev->pdev, PCI_COMMAND,
946 			      cmd & ~PCI_COMMAND_MEMORY);
947 
948 	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
949 	amdgpu_device_doorbell_fini(adev);
950 	if (adev->asic_type >= CHIP_BONAIRE)
951 		pci_release_resource(adev->pdev, 2);
952 
953 	pci_release_resource(adev->pdev, 0);
954 
955 	r = pci_resize_resource(adev->pdev, 0, rbar_size);
956 	if (r == -ENOSPC)
957 		DRM_INFO("Not enough PCI address space for a large BAR.");
958 	else if (r && r != -ENOTSUPP)
959 		DRM_ERROR("Problem resizing BAR0 (%d).", r);
960 
961 	pci_assign_unassigned_bus_resources(adev->pdev->bus);
962 
963 	/* When the doorbell or fb BAR isn't available we have no chance of
964 	 * using the device.
965 	 */
966 	r = amdgpu_device_doorbell_init(adev);
967 	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
968 		return -ENODEV;
969 
970 	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
971 
972 	return 0;
973 }
974 
975 /*
976  * GPU helpers function.
977  */
978 /**
979  * amdgpu_device_need_post - check if the hw need post or not
980  *
981  * @adev: amdgpu_device pointer
982  *
983  * Check if the asic needs to be posted (all asics), either at driver
984  * startup or after a hw reset.
985  * Returns true if posting is needed, false if not.
986  */
987 bool amdgpu_device_need_post(struct amdgpu_device *adev)
988 {
989 	uint32_t reg;
990 
991 	if (amdgpu_sriov_vf(adev))
992 		return false;
993 
994 	if (amdgpu_passthrough(adev)) {
995 		/* for FIJI: in the whole-GPU pass-through virtualization case, after a VM
996 		 * reboot some old SMC firmware still needs the driver to do a vPost,
997 		 * otherwise the GPU hangs.  SMC firmware versions above 22.15 do not have
998 		 * this flaw, so force a vPost only for SMC versions below 22.15.
999 		 */
1000 		if (adev->asic_type == CHIP_FIJI) {
1001 			int err;
1002 			uint32_t fw_ver;
1003 			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1004 			/* force vPost if an error occurred */
1005 			if (err)
1006 				return true;
1007 
1008 			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1009 			if (fw_ver < 0x00160e00)
1010 				return true;
1011 		}
1012 	}
1013 
1014 	if (adev->has_hw_reset) {
1015 		adev->has_hw_reset = false;
1016 		return true;
1017 	}
1018 
1019 	/* bios scratch used on CIK+ */
1020 	if (adev->asic_type >= CHIP_BONAIRE)
1021 		return amdgpu_atombios_scratch_need_asic_init(adev);
1022 
1023 	/* check MEM_SIZE for older asics */
1024 	reg = amdgpu_asic_get_config_memsize(adev);
1025 
1026 	if ((reg != 0) && (reg != 0xffffffff))
1027 		return false;
1028 
1029 	return true;
1030 }
1031 
1032 /* if we get transitioned to only one device, take VGA back */
1033 /**
1034  * amdgpu_device_vga_set_decode - enable/disable vga decode
1035  *
1036  * @cookie: amdgpu_device pointer
1037  * @state: enable/disable vga decode
1038  *
1039  * Enable/disable vga decode (all asics).
1040  * Returns VGA resource flags.
1041  */
1042 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
1043 {
1044 	struct amdgpu_device *adev = cookie;
1045 	amdgpu_asic_set_vga_state(adev, state);
1046 	if (state)
1047 		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1048 		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1049 	else
1050 		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1051 }
1052 
1053 /**
1054  * amdgpu_device_check_block_size - validate the vm block size
1055  *
1056  * @adev: amdgpu_device pointer
1057  *
1058  * Validates the vm block size specified via module parameter.
1059  * The vm block size defines number of bits in page table versus page directory,
1060  * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1061  * page table and the remaining bits are in the page directory.
1062  */
1063 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1064 {
1065 	/* defines number of bits in page table versus page directory,
1066 	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1067 	 * page table and the remaining bits are in the page directory */
1068 	if (amdgpu_vm_block_size == -1)
1069 		return;
1070 
1071 	if (amdgpu_vm_block_size < 9) {
1072 		dev_warn(adev->dev, "VM page table size (%d) too small\n",
1073 			 amdgpu_vm_block_size);
1074 		amdgpu_vm_block_size = -1;
1075 	}
1076 }
1077 
1078 /**
1079  * amdgpu_device_check_vm_size - validate the vm size
1080  *
1081  * @adev: amdgpu_device pointer
1082  *
1083  * Validates the vm size in GB specified via module parameter.
1084  * The VM size is the size of the GPU virtual memory space in GB.
1085  */
1086 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1087 {
1088 	/* no need to check the default value */
1089 	if (amdgpu_vm_size == -1)
1090 		return;
1091 
1092 	if (amdgpu_vm_size < 1) {
1093 		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1094 			 amdgpu_vm_size);
1095 		amdgpu_vm_size = -1;
1096 	}
1097 }
1098 
1099 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1100 {
1101 	struct sysinfo si;
1102 	bool is_os_64 = (sizeof(void *) == 8);
1103 	uint64_t total_memory;
1104 	uint64_t dram_size_seven_GB = 0x1B8000000;
1105 	uint64_t dram_size_three_GB = 0xB8000000;
1106 
1107 	if (amdgpu_smu_memory_pool_size == 0)
1108 		return;
1109 
1110 	if (!is_os_64) {
1111 		DRM_WARN("Not 64-bit OS, feature not supported\n");
1112 		goto def_value;
1113 	}
1114 	si_meminfo(&si);
1115 	total_memory = (uint64_t)si.totalram * si.mem_unit;
1116 
1117 	if ((amdgpu_smu_memory_pool_size == 1) ||
1118 		(amdgpu_smu_memory_pool_size == 2)) {
1119 		if (total_memory < dram_size_three_GB)
1120 			goto def_value1;
1121 	} else if ((amdgpu_smu_memory_pool_size == 4) ||
1122 		(amdgpu_smu_memory_pool_size == 8)) {
1123 		if (total_memory < dram_size_seven_GB)
1124 			goto def_value1;
1125 	} else {
1126 		DRM_WARN("Smu memory pool size not supported\n");
1127 		goto def_value;
1128 	}
1129 	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1130 
1131 	return;
1132 
1133 def_value1:
1134 	DRM_WARN("Not enough system memory\n");
1135 def_value:
1136 	adev->pm.smu_prv_buffer_size = 0;
1137 }
1138 
1139 /**
1140  * amdgpu_device_check_arguments - validate module params
1141  *
1142  * @adev: amdgpu_device pointer
1143  *
1144  * Validates certain module parameters and updates
1145  * the associated values used by the driver (all asics).
1146  */
1147 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1148 {
1149 	if (amdgpu_sched_jobs < 4) {
1150 		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1151 			 amdgpu_sched_jobs);
1152 		amdgpu_sched_jobs = 4;
1153 	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
1154 		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1155 			 amdgpu_sched_jobs);
1156 		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1157 	}
1158 
1159 	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1160 		/* gart size must be greater or equal to 32M */
1161 		dev_warn(adev->dev, "gart size (%d) too small\n",
1162 			 amdgpu_gart_size);
1163 		amdgpu_gart_size = -1;
1164 	}
1165 
1166 	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1167 		/* gtt size must be greater or equal to 32M */
1168 		dev_warn(adev->dev, "gtt size (%d) too small\n",
1169 				 amdgpu_gtt_size);
1170 		amdgpu_gtt_size = -1;
1171 	}
1172 
1173 	/* valid range is between 4 and 9 inclusive */
1174 	if (amdgpu_vm_fragment_size != -1 &&
1175 	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1176 		dev_warn(adev->dev, "valid range is between 4 and 9\n");
1177 		amdgpu_vm_fragment_size = -1;
1178 	}
1179 
1180 	if (amdgpu_sched_hw_submission < 2) {
1181 		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1182 			 amdgpu_sched_hw_submission);
1183 		amdgpu_sched_hw_submission = 2;
1184 	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1185 		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1186 			 amdgpu_sched_hw_submission);
1187 		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1188 	}
1189 
1190 	amdgpu_device_check_smu_prv_buffer_size(adev);
1191 
1192 	amdgpu_device_check_vm_size(adev);
1193 
1194 	amdgpu_device_check_block_size(adev);
1195 
1196 	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1197 
1198 	amdgpu_gmc_tmz_set(adev);
1199 
1200 	return 0;
1201 }
1202 
1203 /**
1204  * amdgpu_switcheroo_set_state - set switcheroo state
1205  *
1206  * @pdev: pci dev pointer
1207  * @state: vga_switcheroo state
1208  *
1209  * Callback for the switcheroo driver.  Suspends or resumes
1210  * the asic before or after it is powered up using ACPI methods.
1211  */
1212 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switcheroo_state state)
1213 {
1214 	struct drm_device *dev = pci_get_drvdata(pdev);
1215 	int r;
1216 
1217 	if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
1218 		return;
1219 
1220 	if (state == VGA_SWITCHEROO_ON) {
1221 		pr_info("switched on\n");
1222 		/* don't suspend or resume card normally */
1223 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1224 
1225 		pci_set_power_state(dev->pdev, PCI_D0);
1226 		pci_restore_state(dev->pdev);
1227 		r = pci_enable_device(dev->pdev);
1228 		if (r)
1229 			DRM_WARN("pci_enable_device failed (%d)\n", r);
1230 		amdgpu_device_resume(dev, true);
1231 
1232 		dev->switch_power_state = DRM_SWITCH_POWER_ON;
1233 		drm_kms_helper_poll_enable(dev);
1234 	} else {
1235 		pr_info("switched off\n");
1236 		drm_kms_helper_poll_disable(dev);
1237 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1238 		amdgpu_device_suspend(dev, true);
1239 		pci_save_state(dev->pdev);
1240 		/* Shut down the device */
1241 		pci_disable_device(dev->pdev);
1242 		pci_set_power_state(dev->pdev, PCI_D3cold);
1243 		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1244 	}
1245 }
1246 
1247 /**
1248  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1249  *
1250  * @pdev: pci dev pointer
1251  *
1252  * Callback for the switcheroo driver.  Check if the switcheroo
1253  * state can be changed.
1254  * Returns true if the state can be changed, false if not.
1255  */
1256 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1257 {
1258 	struct drm_device *dev = pci_get_drvdata(pdev);
1259 
1260 	/*
1261 	* FIXME: open_count is protected by drm_global_mutex but that would lead to
1262 	* locking inversion with the driver load path. And the access here is
1263 	* completely racy anyway. So don't bother with locking for now.
1264 	*/
1265 	return atomic_read(&dev->open_count) == 0;
1266 }
1267 
1268 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1269 	.set_gpu_state = amdgpu_switcheroo_set_state,
1270 	.reprobe = NULL,
1271 	.can_switch = amdgpu_switcheroo_can_switch,
1272 };
1273 
1274 /**
1275  * amdgpu_device_ip_set_clockgating_state - set the CG state
1276  *
1277  * @dev: amdgpu_device pointer
1278  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1279  * @state: clockgating state (gate or ungate)
1280  *
1281  * Sets the requested clockgating state for all instances of
1282  * the hardware IP specified.
1283  * Returns the error code from the last instance.
1284  */
1285 int amdgpu_device_ip_set_clockgating_state(void *dev,
1286 					   enum amd_ip_block_type block_type,
1287 					   enum amd_clockgating_state state)
1288 {
1289 	struct amdgpu_device *adev = dev;
1290 	int i, r = 0;
1291 
1292 	for (i = 0; i < adev->num_ip_blocks; i++) {
1293 		if (!adev->ip_blocks[i].status.valid)
1294 			continue;
1295 		if (adev->ip_blocks[i].version->type != block_type)
1296 			continue;
1297 		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1298 			continue;
1299 		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1300 			(void *)adev, state);
1301 		if (r)
1302 			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1303 				  adev->ip_blocks[i].version->funcs->name, r);
1304 	}
1305 	return r;
1306 }
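
/*
 * A minimal usage sketch (illustrative only): gate the clocks of all GFX
 * IP instances; AMD_CG_STATE_UNGATE reverses it.
 *
 *	r = amdgpu_device_ip_set_clockgating_state(adev,
 *						   AMD_IP_BLOCK_TYPE_GFX,
 *						   AMD_CG_STATE_GATE);
 *	if (r)
 *		dev_warn(adev->dev, "failed to gate GFX clocks (%d)\n", r);
 */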
1307 
1308 /**
1309  * amdgpu_device_ip_set_powergating_state - set the PG state
1310  *
1311  * @dev: amdgpu_device pointer
1312  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1313  * @state: powergating state (gate or ungate)
1314  *
1315  * Sets the requested powergating state for all instances of
1316  * the hardware IP specified.
1317  * Returns the error code from the last instance.
1318  */
1319 int amdgpu_device_ip_set_powergating_state(void *dev,
1320 					   enum amd_ip_block_type block_type,
1321 					   enum amd_powergating_state state)
1322 {
1323 	struct amdgpu_device *adev = dev;
1324 	int i, r = 0;
1325 
1326 	for (i = 0; i < adev->num_ip_blocks; i++) {
1327 		if (!adev->ip_blocks[i].status.valid)
1328 			continue;
1329 		if (adev->ip_blocks[i].version->type != block_type)
1330 			continue;
1331 		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1332 			continue;
1333 		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1334 			(void *)adev, state);
1335 		if (r)
1336 			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1337 				  adev->ip_blocks[i].version->funcs->name, r);
1338 	}
1339 	return r;
1340 }
1341 
1342 /**
1343  * amdgpu_device_ip_get_clockgating_state - get the CG state
1344  *
1345  * @adev: amdgpu_device pointer
1346  * @flags: clockgating feature flags
1347  *
1348  * Walks the list of IPs on the device and updates the clockgating
1349  * flags for each IP.
1350  * Updates @flags with the feature flags for each hardware IP where
1351  * clockgating is enabled.
1352  */
1353 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1354 					    u32 *flags)
1355 {
1356 	int i;
1357 
1358 	for (i = 0; i < adev->num_ip_blocks; i++) {
1359 		if (!adev->ip_blocks[i].status.valid)
1360 			continue;
1361 		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1362 			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1363 	}
1364 }
1365 
1366 /**
1367  * amdgpu_device_ip_wait_for_idle - wait for idle
1368  *
1369  * @adev: amdgpu_device pointer
1370  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1371  *
1372  * Waits for the requested hardware IP to be idle.
1373  * Returns 0 for success or a negative error code on failure.
1374  */
1375 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1376 				   enum amd_ip_block_type block_type)
1377 {
1378 	int i, r;
1379 
1380 	for (i = 0; i < adev->num_ip_blocks; i++) {
1381 		if (!adev->ip_blocks[i].status.valid)
1382 			continue;
1383 		if (adev->ip_blocks[i].version->type == block_type) {
1384 			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1385 			if (r)
1386 				return r;
1387 			break;
1388 		}
1389 	}
1390 	return 0;
1391 
1392 }
1393 
1394 /**
1395  * amdgpu_device_ip_is_idle - is the hardware IP idle
1396  *
1397  * @adev: amdgpu_device pointer
1398  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1399  *
1400  * Check if the hardware IP is idle or not.
1401  * Returns true if the IP is idle, false if not.
1402  */
1403 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1404 			      enum amd_ip_block_type block_type)
1405 {
1406 	int i;
1407 
1408 	for (i = 0; i < adev->num_ip_blocks; i++) {
1409 		if (!adev->ip_blocks[i].status.valid)
1410 			continue;
1411 		if (adev->ip_blocks[i].version->type == block_type)
1412 			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1413 	}
1414 	return true;
1415 
1416 }
1417 
1418 /**
1419  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1420  *
1421  * @adev: amdgpu_device pointer
1422  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1423  *
1424  * Returns a pointer to the hardware IP block structure
1425  * if it exists for the asic, otherwise NULL.
1426  */
1427 struct amdgpu_ip_block *
1428 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1429 			      enum amd_ip_block_type type)
1430 {
1431 	int i;
1432 
1433 	for (i = 0; i < adev->num_ip_blocks; i++)
1434 		if (adev->ip_blocks[i].version->type == type)
1435 			return &adev->ip_blocks[i];
1436 
1437 	return NULL;
1438 }
1439 
1440 /**
1441  * amdgpu_device_ip_block_version_cmp
1442  *
1443  * @adev: amdgpu_device pointer
1444  * @type: enum amd_ip_block_type
1445  * @major: major version
1446  * @minor: minor version
1447  *
1448  * Returns 0 if the IP block's version is equal to or greater than the
1449  * requested version, 1 if it is smaller or the ip_block doesn't exist.
1450  */
1451 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1452 				       enum amd_ip_block_type type,
1453 				       u32 major, u32 minor)
1454 {
1455 	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1456 
1457 	if (ip_block && ((ip_block->version->major > major) ||
1458 			((ip_block->version->major == major) &&
1459 			(ip_block->version->minor >= minor))))
1460 		return 0;
1461 
1462 	return 1;
1463 }
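
/*
 * A minimal usage sketch (illustrative only): guard a feature that needs
 * at least GFX IP v9.0.
 *
 *	if (amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX,
 *					       9, 0) == 0) {
 *		... GFX IP is v9.0 or newer ...
 *	}
 */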
1464 
1465 /**
1466  * amdgpu_device_ip_block_add
1467  *
1468  * @adev: amdgpu_device pointer
1469  * @ip_block_version: pointer to the IP to add
1470  *
1471  * Adds the IP block driver information to the collection of IPs
1472  * on the asic.
1473  */
1474 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1475 			       const struct amdgpu_ip_block_version *ip_block_version)
1476 {
1477 	if (!ip_block_version)
1478 		return -EINVAL;
1479 
1480 	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1481 		  ip_block_version->funcs->name);
1482 
1483 	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1484 
1485 	return 0;
1486 }
1487 
1488 /**
1489  * amdgpu_device_enable_virtual_display - enable virtual display feature
1490  *
1491  * @adev: amdgpu_device pointer
1492  *
1493  * Enables the virtual display feature if the user has enabled it via
1494  * the module parameter virtual_display.  This feature provides a virtual
1495  * display hardware on headless boards or in virtualized environments.
1496  * This function parses and validates the configuration string specified by
1497  * the user and configures the virtual display configuration (number of
1498  * virtual connectors, crtcs, etc.) specified.
1499  */
1500 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1501 {
1502 	adev->enable_virtual_display = false;
1503 
1504 	if (amdgpu_virtual_display) {
1505 		struct drm_device *ddev = adev->ddev;
1506 		const char *pci_address_name = pci_name(ddev->pdev);
1507 		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1508 
1509 		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1510 		pciaddstr_tmp = pciaddstr;
1511 		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1512 			pciaddname = strsep(&pciaddname_tmp, ",");
1513 			if (!strcmp("all", pciaddname)
1514 			    || !strcmp(pci_address_name, pciaddname)) {
1515 				long num_crtc;
1516 				int res = -1;
1517 
1518 				adev->enable_virtual_display = true;
1519 
1520 				if (pciaddname_tmp)
1521 					res = kstrtol(pciaddname_tmp, 10,
1522 						      &num_crtc);
1523 
1524 				if (!res) {
1525 					if (num_crtc < 1)
1526 						num_crtc = 1;
1527 					if (num_crtc > 6)
1528 						num_crtc = 6;
1529 					adev->mode_info.num_crtc = num_crtc;
1530 				} else {
1531 					adev->mode_info.num_crtc = 1;
1532 				}
1533 				break;
1534 			}
1535 		}
1536 
1537 		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1538 			 amdgpu_virtual_display, pci_address_name,
1539 			 adev->enable_virtual_display, adev->mode_info.num_crtc);
1540 
1541 		kfree(pciaddstr);
1542 	}
1543 }
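
/*
 * A hedged example of the module parameter format parsed above (the PCI
 * address is a placeholder, not a real device):
 *
 *	amdgpu.virtual_display=0000:26:00.0,2
 *
 * enables two virtual crtcs on that device, "all,1" enables one virtual
 * crtc on every amdgpu device, and entries for several devices are
 * separated with ';'.
 */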
1544 
1545 /**
1546  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1547  *
1548  * @adev: amdgpu_device pointer
1549  *
1550  * Parses the asic configuration parameters specified in the gpu info
1551  * firmware and makes them available to the driver for use in configuring
1552  * the asic.
1553  * Returns 0 on success, -EINVAL on failure.
1554  */
1555 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1556 {
1557 	const char *chip_name;
1558 	char fw_name[40];
1559 	int err;
1560 	const struct gpu_info_firmware_header_v1_0 *hdr;
1561 
1562 	adev->firmware.gpu_info_fw = NULL;
1563 
1564 	if (adev->discovery_bin) {
1565 		amdgpu_discovery_get_gfx_info(adev);
1566 
1567 		/*
1568 		 * FIXME: The bounding box is still needed by Navi12, so
1569 		 * temporarily read it from gpu_info firmware. Should be dropped
1570 		 * when DAL no longer needs it.
1571 		 */
1572 		if (adev->asic_type != CHIP_NAVI12)
1573 			return 0;
1574 	}
1575 
1576 	switch (adev->asic_type) {
1577 #ifdef CONFIG_DRM_AMDGPU_SI
1578 	case CHIP_VERDE:
1579 	case CHIP_TAHITI:
1580 	case CHIP_PITCAIRN:
1581 	case CHIP_OLAND:
1582 	case CHIP_HAINAN:
1583 #endif
1584 #ifdef CONFIG_DRM_AMDGPU_CIK
1585 	case CHIP_BONAIRE:
1586 	case CHIP_HAWAII:
1587 	case CHIP_KAVERI:
1588 	case CHIP_KABINI:
1589 	case CHIP_MULLINS:
1590 #endif
1591 	case CHIP_TOPAZ:
1592 	case CHIP_TONGA:
1593 	case CHIP_FIJI:
1594 	case CHIP_POLARIS10:
1595 	case CHIP_POLARIS11:
1596 	case CHIP_POLARIS12:
1597 	case CHIP_VEGAM:
1598 	case CHIP_CARRIZO:
1599 	case CHIP_STONEY:
1600 	case CHIP_VEGA20:
1601 	case CHIP_SIENNA_CICHLID:
1602 	case CHIP_NAVY_FLOUNDER:
1603 	default:
1604 		return 0;
1605 	case CHIP_VEGA10:
1606 		chip_name = "vega10";
1607 		break;
1608 	case CHIP_VEGA12:
1609 		chip_name = "vega12";
1610 		break;
1611 	case CHIP_RAVEN:
1612 		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1613 			chip_name = "raven2";
1614 		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1615 			chip_name = "picasso";
1616 		else
1617 			chip_name = "raven";
1618 		break;
1619 	case CHIP_ARCTURUS:
1620 		chip_name = "arcturus";
1621 		break;
1622 	case CHIP_RENOIR:
1623 		chip_name = "renoir";
1624 		break;
1625 	case CHIP_NAVI10:
1626 		chip_name = "navi10";
1627 		break;
1628 	case CHIP_NAVI14:
1629 		chip_name = "navi14";
1630 		break;
1631 	case CHIP_NAVI12:
1632 		chip_name = "navi12";
1633 		break;
1634 	}
1635 
1636 	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1637 	err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1638 	if (err) {
1639 		dev_err(adev->dev,
1640 			"Failed to load gpu_info firmware \"%s\"\n",
1641 			fw_name);
1642 		goto out;
1643 	}
1644 	err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1645 	if (err) {
1646 		dev_err(adev->dev,
1647 			"Failed to validate gpu_info firmware \"%s\"\n",
1648 			fw_name);
1649 		goto out;
1650 	}
1651 
1652 	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1653 	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1654 
1655 	switch (hdr->version_major) {
1656 	case 1:
1657 	{
1658 		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1659 			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1660 								le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1661 
1662 		/*
1663 		 * Should be dropped when DAL no longer needs it.
1664 		 */
1665 		if (adev->asic_type == CHIP_NAVI12)
1666 			goto parse_soc_bounding_box;
1667 
1668 		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1669 		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1670 		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1671 		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1672 		adev->gfx.config.max_texture_channel_caches =
1673 			le32_to_cpu(gpu_info_fw->gc_num_tccs);
1674 		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1675 		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1676 		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1677 		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1678 		adev->gfx.config.double_offchip_lds_buf =
1679 			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1680 		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1681 		adev->gfx.cu_info.max_waves_per_simd =
1682 			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1683 		adev->gfx.cu_info.max_scratch_slots_per_cu =
1684 			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1685 		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1686 		if (hdr->version_minor >= 1) {
1687 			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1688 				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1689 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1690 			adev->gfx.config.num_sc_per_sh =
1691 				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1692 			adev->gfx.config.num_packer_per_sc =
1693 				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1694 		}
1695 
1696 parse_soc_bounding_box:
1697 		/*
1698 		 * soc bounding box info is not integrated in the discovery table,
1699 		 * we always need to parse it from gpu info firmware if needed.
1700 		 */
1701 		if (hdr->version_minor == 2) {
1702 			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1703 				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1704 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1705 			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1706 		}
1707 		break;
1708 	}
1709 	default:
1710 		dev_err(adev->dev,
1711 			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1712 		err = -EINVAL;
1713 		goto out;
1714 	}
1715 out:
1716 	return err;
1717 }
1718 
1719 /**
1720  * amdgpu_device_ip_early_init - run early init for hardware IPs
1721  *
1722  * @adev: amdgpu_device pointer
1723  *
1724  * Early initialization pass for hardware IPs.  The hardware IPs that make
1725  * up each asic are discovered and each IP's early_init callback is run.  This
1726  * is the first stage in initializing the asic.
1727  * Returns 0 on success, negative error code on failure.
1728  */
1729 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
1730 {
1731 	int i, r;
1732 
1733 	amdgpu_device_enable_virtual_display(adev);
1734 
1735 	if (amdgpu_sriov_vf(adev)) {
1736 		r = amdgpu_virt_request_full_gpu(adev, true);
1737 		if (r)
1738 			return r;
1739 	}
1740 
1741 	switch (adev->asic_type) {
1742 #ifdef CONFIG_DRM_AMDGPU_SI
1743 	case CHIP_VERDE:
1744 	case CHIP_TAHITI:
1745 	case CHIP_PITCAIRN:
1746 	case CHIP_OLAND:
1747 	case CHIP_HAINAN:
1748 		adev->family = AMDGPU_FAMILY_SI;
1749 		r = si_set_ip_blocks(adev);
1750 		if (r)
1751 			return r;
1752 		break;
1753 #endif
1754 #ifdef CONFIG_DRM_AMDGPU_CIK
1755 	case CHIP_BONAIRE:
1756 	case CHIP_HAWAII:
1757 	case CHIP_KAVERI:
1758 	case CHIP_KABINI:
1759 	case CHIP_MULLINS:
1760 		if (adev->flags & AMD_IS_APU)
1761 			adev->family = AMDGPU_FAMILY_KV;
1762 		else
1763 			adev->family = AMDGPU_FAMILY_CI;
1764 
1765 		r = cik_set_ip_blocks(adev);
1766 		if (r)
1767 			return r;
1768 		break;
1769 #endif
1770 	case CHIP_TOPAZ:
1771 	case CHIP_TONGA:
1772 	case CHIP_FIJI:
1773 	case CHIP_POLARIS10:
1774 	case CHIP_POLARIS11:
1775 	case CHIP_POLARIS12:
1776 	case CHIP_VEGAM:
1777 	case CHIP_CARRIZO:
1778 	case CHIP_STONEY:
1779 		if (adev->flags & AMD_IS_APU)
1780 			adev->family = AMDGPU_FAMILY_CZ;
1781 		else
1782 			adev->family = AMDGPU_FAMILY_VI;
1783 
1784 		r = vi_set_ip_blocks(adev);
1785 		if (r)
1786 			return r;
1787 		break;
1788 	case CHIP_VEGA10:
1789 	case CHIP_VEGA12:
1790 	case CHIP_VEGA20:
1791 	case CHIP_RAVEN:
1792 	case CHIP_ARCTURUS:
1793 	case CHIP_RENOIR:
1794 		if (adev->flags & AMD_IS_APU)
1795 			adev->family = AMDGPU_FAMILY_RV;
1796 		else
1797 			adev->family = AMDGPU_FAMILY_AI;
1798 
1799 		r = soc15_set_ip_blocks(adev);
1800 		if (r)
1801 			return r;
1802 		break;
1803 	case  CHIP_NAVI10:
1804 	case  CHIP_NAVI14:
1805 	case  CHIP_NAVI12:
1806 	case  CHIP_SIENNA_CICHLID:
1807 	case  CHIP_NAVY_FLOUNDER:
1808 		adev->family = AMDGPU_FAMILY_NV;
1809 
1810 		r = nv_set_ip_blocks(adev);
1811 		if (r)
1812 			return r;
1813 		break;
1814 	default:
1815 		/* FIXME: not supported yet */
1816 		return -EINVAL;
1817 	}
1818 
1819 	amdgpu_amdkfd_device_probe(adev);
1820 
1821 	adev->pm.pp_feature = amdgpu_pp_feature_mask;
1822 	if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
1823 		adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
1824 
1825 	for (i = 0; i < adev->num_ip_blocks; i++) {
1826 		if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
1827 			DRM_ERROR("disabled ip block: %d <%s>\n",
1828 				  i, adev->ip_blocks[i].version->funcs->name);
1829 			adev->ip_blocks[i].status.valid = false;
1830 		} else {
1831 			if (adev->ip_blocks[i].version->funcs->early_init) {
1832 				r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
1833 				if (r == -ENOENT) {
1834 					adev->ip_blocks[i].status.valid = false;
1835 				} else if (r) {
1836 					DRM_ERROR("early_init of IP block <%s> failed %d\n",
1837 						  adev->ip_blocks[i].version->funcs->name, r);
1838 					return r;
1839 				} else {
1840 					adev->ip_blocks[i].status.valid = true;
1841 				}
1842 			} else {
1843 				adev->ip_blocks[i].status.valid = true;
1844 			}
1845 		}
1846 		/* get the vbios after the asic_funcs are set up */
1847 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
1848 			r = amdgpu_device_parse_gpu_info_fw(adev);
1849 			if (r)
1850 				return r;
1851 
1852 			/* Read BIOS */
1853 			if (!amdgpu_get_bios(adev))
1854 				return -EINVAL;
1855 
1856 			r = amdgpu_atombios_init(adev);
1857 			if (r) {
1858 				dev_err(adev->dev, "amdgpu_atombios_init failed\n");
1859 				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
1860 				return r;
1861 			}
1862 		}
1863 	}
1864 
1865 	adev->cg_flags &= amdgpu_cg_mask;
1866 	adev->pg_flags &= amdgpu_pg_mask;
1867 
1868 	return 0;
1869 }
1870 
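/**
 * amdgpu_device_ip_hw_init_phase1 - run hw_init for early hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * hw_init is run for the COMMON and IH blocks (and for PSP when running
 * under SR-IOV) before firmware loading; the remaining blocks are handled
 * in phase 2.
 * Returns 0 on success, negative error code on failure.
 */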
1871 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
1872 {
1873 	int i, r;
1874 
1875 	for (i = 0; i < adev->num_ip_blocks; i++) {
1876 		if (!adev->ip_blocks[i].status.sw)
1877 			continue;
1878 		if (adev->ip_blocks[i].status.hw)
1879 			continue;
1880 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
1881 		    (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
1882 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
1883 			r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1884 			if (r) {
1885 				DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1886 					  adev->ip_blocks[i].version->funcs->name, r);
1887 				return r;
1888 			}
1889 			adev->ip_blocks[i].status.hw = true;
1890 		}
1891 	}
1892 
1893 	return 0;
1894 }
1895 
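/**
 * amdgpu_device_ip_hw_init_phase2 - run hw_init for remaining hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * hw_init is run for every IP block that was not already brought up in
 * phase 1, after the required firmware has been loaded.
 * Returns 0 on success, negative error code on failure.
 */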
1896 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
1897 {
1898 	int i, r;
1899 
1900 	for (i = 0; i < adev->num_ip_blocks; i++) {
1901 		if (!adev->ip_blocks[i].status.sw)
1902 			continue;
1903 		if (adev->ip_blocks[i].status.hw)
1904 			continue;
1905 		r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1906 		if (r) {
1907 			DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1908 				  adev->ip_blocks[i].version->funcs->name, r);
1909 			return r;
1910 		}
1911 		adev->ip_blocks[i].status.hw = true;
1912 	}
1913 
1914 	return 0;
1915 }
1916 
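/**
 * amdgpu_device_fw_loading - load the firmware needed by the hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * On VEGA10 and newer ASICs the PSP block is brought up first (or resumed
 * during reset/suspend) so that it can load firmware for the other blocks,
 * then the SMU firmware is loaded where applicable.
 * Returns 0 on success, negative error code on failure.
 */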
1917 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
1918 {
1919 	int r = 0;
1920 	int i;
1921 	uint32_t smu_version;
1922 
1923 	if (adev->asic_type >= CHIP_VEGA10) {
1924 		for (i = 0; i < adev->num_ip_blocks; i++) {
1925 			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
1926 				continue;
1927 
1928 			/* no need to do the fw loading again if already done */
1929 			if (adev->ip_blocks[i].status.hw == true)
1930 				break;
1931 
1932 			if (adev->in_gpu_reset || adev->in_suspend) {
1933 				r = adev->ip_blocks[i].version->funcs->resume(adev);
1934 				if (r) {
1935 					DRM_ERROR("resume of IP block <%s> failed %d\n",
1936 							  adev->ip_blocks[i].version->funcs->name, r);
1937 					return r;
1938 				}
1939 			} else {
1940 				r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1941 				if (r) {
1942 					DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1943 							  adev->ip_blocks[i].version->funcs->name, r);
1944 					return r;
1945 				}
1946 			}
1947 
1948 			adev->ip_blocks[i].status.hw = true;
1949 			break;
1950 		}
1951 	}
1952 
1953 	if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
1954 		r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
1955 
1956 	return r;
1957 }
1958 
1959 /**
1960  * amdgpu_device_ip_init - run init for hardware IPs
1961  *
1962  * @adev: amdgpu_device pointer
1963  *
1964  * Main initialization pass for hardware IPs.  The list of all the hardware
1965  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
1966  * are run.  sw_init initializes the software state associated with each IP
1967  * and hw_init initializes the hardware associated with each IP.
1968  * Returns 0 on success, negative error code on failure.
1969  */
1970 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
1971 {
1972 	int i, r;
1973 
1974 	r = amdgpu_ras_init(adev);
1975 	if (r)
1976 		return r;
1977 
1978 	for (i = 0; i < adev->num_ip_blocks; i++) {
1979 		if (!adev->ip_blocks[i].status.valid)
1980 			continue;
1981 		r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
1982 		if (r) {
1983 			DRM_ERROR("sw_init of IP block <%s> failed %d\n",
1984 				  adev->ip_blocks[i].version->funcs->name, r);
1985 			goto init_failed;
1986 		}
1987 		adev->ip_blocks[i].status.sw = true;
1988 
1989 		/* need to do gmc hw init early so we can allocate gpu mem */
1990 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
1991 			r = amdgpu_device_vram_scratch_init(adev);
1992 			if (r) {
1993 				DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
1994 				goto init_failed;
1995 			}
1996 			r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
1997 			if (r) {
1998 				DRM_ERROR("hw_init %d failed %d\n", i, r);
1999 				goto init_failed;
2000 			}
2001 			r = amdgpu_device_wb_init(adev);
2002 			if (r) {
2003 				DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2004 				goto init_failed;
2005 			}
2006 			adev->ip_blocks[i].status.hw = true;
2007 
2008 			/* right after GMC hw init, we create CSA */
2009 			if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2010 				r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2011 								AMDGPU_GEM_DOMAIN_VRAM,
2012 								AMDGPU_CSA_SIZE);
2013 				if (r) {
2014 					DRM_ERROR("allocate CSA failed %d\n", r);
2015 					goto init_failed;
2016 				}
2017 			}
2018 		}
2019 	}
2020 
2021 	if (amdgpu_sriov_vf(adev))
2022 		amdgpu_virt_init_data_exchange(adev);
2023 
2024 	r = amdgpu_ib_pool_init(adev);
2025 	if (r) {
2026 		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2027 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2028 		goto init_failed;
2029 	}
2030 
2031 	r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init is complete */
2032 	if (r)
2033 		goto init_failed;
2034 
2035 	r = amdgpu_device_ip_hw_init_phase1(adev);
2036 	if (r)
2037 		goto init_failed;
2038 
2039 	r = amdgpu_device_fw_loading(adev);
2040 	if (r)
2041 		goto init_failed;
2042 
2043 	r = amdgpu_device_ip_hw_init_phase2(adev);
2044 	if (r)
2045 		goto init_failed;
2046 
2047 	/*
2048 	 * retired pages will be loaded from eeprom and reserved here;
2049 	 * this must be called after amdgpu_device_ip_hw_init_phase2 since
2050 	 * for some ASICs the RAS EEPROM code relies on the SMU being fully
2051 	 * functional for I2C communication, which is only true at this point.
2052 	 * recovery_init may fail, but it can free all resources allocated by
2053 	 * itself and its failure should not stop the amdgpu init process.
2054 	 *
2055 	 * Note: theoretically, this should be called before all vram allocations
2056 	 * to keep retired pages from being allocated again
2057 	 */
2058 	amdgpu_ras_recovery_init(adev);
2059 
2060 	if (adev->gmc.xgmi.num_physical_nodes > 1)
2061 		amdgpu_xgmi_add_device(adev);
2062 	amdgpu_amdkfd_device_init(adev);
2063 
2064 	amdgpu_fru_get_product_info(adev);
2065 
2066 init_failed:
2067 	if (amdgpu_sriov_vf(adev))
2068 		amdgpu_virt_release_full_gpu(adev, true);
2069 
2070 	return r;
2071 }
2072 
2073 /**
2074  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2075  *
2076  * @adev: amdgpu_device pointer
2077  *
2078  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2079  * this function before a GPU reset.  If the value is retained after a
2080  * GPU reset, VRAM has not been lost.  Some GPU resets may destroy VRAM contents.
2081  */
2082 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2083 {
2084 	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2085 }
2086 
2087 /**
2088  * amdgpu_device_check_vram_lost - check if vram is valid
2089  *
2090  * @adev: amdgpu_device pointer
2091  *
2092  * Checks the reset magic value written to the gart pointer in VRAM.
2093  * The driver calls this after a GPU reset to see if the contents of
2094  * VRAM have been lost or not.
2095  * Returns true if vram is lost, false if not.
2096  */
2097 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2098 {
2099 	if (memcmp(adev->gart.ptr, adev->reset_magic,
2100 			AMDGPU_RESET_MAGIC_NUM))
2101 		return true;
2102 
2103 	if (!adev->in_gpu_reset)
2104 		return false;
2105 
2106 	/*
2107 	 * For all ASICs with baco/mode1 reset, the VRAM is
2108 	 * always assumed to be lost.
2109 	 */
2110 	switch (amdgpu_asic_reset_method(adev)) {
2111 	case AMD_RESET_METHOD_BACO:
2112 	case AMD_RESET_METHOD_MODE1:
2113 		return true;
2114 	default:
2115 		return false;
2116 	}
2117 }
2118 
2119 /**
2120  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2121  *
2122  * @adev: amdgpu_device pointer
2123  * @state: clockgating state (gate or ungate)
2124  *
2125  * The list of all the hardware IPs that make up the asic is walked and the
2126  * set_clockgating_state callbacks are run.
2127  * During the late initialization pass this enables clockgating for hardware IPs;
2128  * during fini or suspend it disables clockgating again.
2129  * Returns 0 on success, negative error code on failure.
2130  */
2131 
2132 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2133 						enum amd_clockgating_state state)
2134 {
2135 	int i, j, r;
2136 
2137 	if (amdgpu_emu_mode == 1)
2138 		return 0;
2139 
2140 	for (j = 0; j < adev->num_ip_blocks; j++) {
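		/* walk the blocks in order when gating, in reverse order when ungating */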
2141 		i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2142 		if (!adev->ip_blocks[i].status.late_initialized)
2143 			continue;
2144 		/* skip CG for UVD/VCE/VCN/JPEG, it's handled specially */
2145 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2146 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2147 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2148 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2149 		    adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2150 			/* enable clockgating to save power */
2151 			r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2152 										     state);
2153 			if (r) {
2154 				DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2155 					  adev->ip_blocks[i].version->funcs->name, r);
2156 				return r;
2157 			}
2158 		}
2159 	}
2160 
2161 	return 0;
2162 }
2163 
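/**
 * amdgpu_device_set_pg_state - set powergating for amdgpu device
 *
 * @adev: amdgpu_device pointer
 * @state: powergating state (gate or ungate)
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * set_powergating_state callbacks are run: gating on late init, ungating on
 * fini or suspend.
 * Returns 0 on success, negative error code on failure.
 */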
2164 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2165 {
2166 	int i, j, r;
2167 
2168 	if (amdgpu_emu_mode == 1)
2169 		return 0;
2170 
2171 	for (j = 0; j < adev->num_ip_blocks; j++) {
2172 		i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2173 		if (!adev->ip_blocks[i].status.late_initialized)
2174 			continue;
2175 		/* skip PG for UVD/VCE/VCN/JPEG, it's handled specially */
2176 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2177 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2178 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2179 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2180 		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
2181 			/* enable powergating to save power */
2182 			r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2183 											state);
2184 			if (r) {
2185 				DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2186 					  adev->ip_blocks[i].version->funcs->name, r);
2187 				return r;
2188 			}
2189 		}
2190 	}
2191 	return 0;
2192 }
2193 
2194 static int amdgpu_device_enable_mgpu_fan_boost(void)
2195 {
2196 	struct amdgpu_gpu_instance *gpu_ins;
2197 	struct amdgpu_device *adev;
2198 	int i, ret = 0;
2199 
2200 	mutex_lock(&mgpu_info.mutex);
2201 
2202 	/*
2203 	 * MGPU fan boost feature should be enabled
2204 	 * only when there are two or more dGPUs in
2205 	 * the system
2206 	 */
2207 	if (mgpu_info.num_dgpu < 2)
2208 		goto out;
2209 
2210 	for (i = 0; i < mgpu_info.num_dgpu; i++) {
2211 		gpu_ins = &(mgpu_info.gpu_ins[i]);
2212 		adev = gpu_ins->adev;
2213 		if (!(adev->flags & AMD_IS_APU) &&
2214 		    !gpu_ins->mgpu_fan_enabled &&
2215 		    adev->powerplay.pp_funcs &&
2216 		    adev->powerplay.pp_funcs->enable_mgpu_fan_boost) {
2217 			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2218 			if (ret)
2219 				break;
2220 
2221 			gpu_ins->mgpu_fan_enabled = 1;
2222 		}
2223 	}
2224 
2225 out:
2226 	mutex_unlock(&mgpu_info.mutex);
2227 
2228 	return ret;
2229 }
2230 
2231 /**
2232  * amdgpu_device_ip_late_init - run late init for hardware IPs
2233  *
2234  * @adev: amdgpu_device pointer
2235  *
2236  * Late initialization pass for hardware IPs.  The list of all the hardware
2237  * IPs that make up the asic is walked and the late_init callbacks are run.
2238  * late_init covers any special initialization that an IP requires
2239  * after all of the IP blocks have been initialized or anything that needs to happen
2240  * late in the init process.
2241  * Returns 0 on success, negative error code on failure.
2242  */
2243 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2244 {
2245 	struct amdgpu_gpu_instance *gpu_instance;
2246 	int i = 0, r;
2247 
2248 	for (i = 0; i < adev->num_ip_blocks; i++) {
2249 		if (!adev->ip_blocks[i].status.hw)
2250 			continue;
2251 		if (adev->ip_blocks[i].version->funcs->late_init) {
2252 			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2253 			if (r) {
2254 				DRM_ERROR("late_init of IP block <%s> failed %d\n",
2255 					  adev->ip_blocks[i].version->funcs->name, r);
2256 				return r;
2257 			}
2258 		}
2259 		adev->ip_blocks[i].status.late_initialized = true;
2260 	}
2261 
2262 	amdgpu_ras_set_error_query_ready(adev, true);
2263 
2264 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2265 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2266 
2267 	amdgpu_device_fill_reset_magic(adev);
2268 
2269 	r = amdgpu_device_enable_mgpu_fan_boost();
2270 	if (r)
2271 		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2272 
2273 
2274 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
2275 		mutex_lock(&mgpu_info.mutex);
2276 
2277 		/*
2278 		 * Reset device p-state to low as this was booted with high.
2279 		 *
2280 		 * This should be performed only after all devices from the same
2281 		 * hive get initialized.
2282 		 *
2283 		 * However, the number of devices in a hive is not known in
2284 		 * advance; it is counted one by one as the devices initialize.
2285 		 *
2286 		 * So we wait until all XGMI interlinked devices are initialized.
2287 		 * This may add some delay since those devices may come from
2288 		 * different hives, but that should be OK.
2289 		 */
2290 		if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2291 			for (i = 0; i < mgpu_info.num_gpu; i++) {
2292 				gpu_instance = &(mgpu_info.gpu_ins[i]);
2293 				if (gpu_instance->adev->flags & AMD_IS_APU)
2294 					continue;
2295 
2296 				r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2297 						AMDGPU_XGMI_PSTATE_MIN);
2298 				if (r) {
2299 					DRM_ERROR("pstate setting failed (%d).\n", r);
2300 					break;
2301 				}
2302 			}
2303 		}
2304 
2305 		mutex_unlock(&mgpu_info.mutex);
2306 	}
2307 
2308 	return 0;
2309 }
2310 
2311 /**
2312  * amdgpu_device_ip_fini - run fini for hardware IPs
2313  *
2314  * @adev: amdgpu_device pointer
2315  *
2316  * Main teardown pass for hardware IPs.  The list of all the hardware
2317  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2318  * are run.  hw_fini tears down the hardware associated with each IP
2319  * and sw_fini tears down any software state associated with each IP.
2320  * Returns 0 on success, negative error code on failure.
2321  */
2322 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2323 {
2324 	int i, r;
2325 
2326 	if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2327 		amdgpu_virt_release_ras_err_handler_data(adev);
2328 
2329 	amdgpu_ras_pre_fini(adev);
2330 
2331 	if (adev->gmc.xgmi.num_physical_nodes > 1)
2332 		amdgpu_xgmi_remove_device(adev);
2333 
2334 	amdgpu_amdkfd_device_fini(adev);
2335 
2336 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2337 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2338 
2339 	/* need to disable SMC first */
2340 	for (i = 0; i < adev->num_ip_blocks; i++) {
2341 		if (!adev->ip_blocks[i].status.hw)
2342 			continue;
2343 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2344 			r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2345 			/* XXX handle errors */
2346 			if (r) {
2347 				DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2348 					  adev->ip_blocks[i].version->funcs->name, r);
2349 			}
2350 			adev->ip_blocks[i].status.hw = false;
2351 			break;
2352 		}
2353 	}
2354 
2355 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2356 		if (!adev->ip_blocks[i].status.hw)
2357 			continue;
2358 
2359 		r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2360 		/* XXX handle errors */
2361 		if (r) {
2362 			DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2363 				  adev->ip_blocks[i].version->funcs->name, r);
2364 		}
2365 
2366 		adev->ip_blocks[i].status.hw = false;
2367 	}
2368 
2369 
2370 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2371 		if (!adev->ip_blocks[i].status.sw)
2372 			continue;
2373 
2374 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2375 			amdgpu_ucode_free_bo(adev);
2376 			amdgpu_free_static_csa(&adev->virt.csa_obj);
2377 			amdgpu_device_wb_fini(adev);
2378 			amdgpu_device_vram_scratch_fini(adev);
2379 			amdgpu_ib_pool_fini(adev);
2380 		}
2381 
2382 		r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2383 		/* XXX handle errors */
2384 		if (r) {
2385 			DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2386 				  adev->ip_blocks[i].version->funcs->name, r);
2387 		}
2388 		adev->ip_blocks[i].status.sw = false;
2389 		adev->ip_blocks[i].status.valid = false;
2390 	}
2391 
2392 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2393 		if (!adev->ip_blocks[i].status.late_initialized)
2394 			continue;
2395 		if (adev->ip_blocks[i].version->funcs->late_fini)
2396 			adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2397 		adev->ip_blocks[i].status.late_initialized = false;
2398 	}
2399 
2400 	amdgpu_ras_fini(adev);
2401 
2402 	if (amdgpu_sriov_vf(adev))
2403 		if (amdgpu_virt_release_full_gpu(adev, false))
2404 			DRM_ERROR("failed to release exclusive mode on fini\n");
2405 
2406 	return 0;
2407 }
2408 
2409 /**
2410  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2411  *
2412  * @work: work_struct.
2413  */
2414 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2415 {
2416 	struct amdgpu_device *adev =
2417 		container_of(work, struct amdgpu_device, delayed_init_work.work);
2418 	int r;
2419 
2420 	r = amdgpu_ib_ring_tests(adev);
2421 	if (r)
2422 		DRM_ERROR("ib ring test failed (%d).\n", r);
2423 }
2424 
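/**
 * amdgpu_device_delay_enable_gfx_off - delayed work handler for enabling GFXOFF
 *
 * @work: work_struct.
 *
 * Asks the SMU to power-gate the GFX block once no GFXOFF disable requests
 * remain outstanding.
 */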
2425 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2426 {
2427 	struct amdgpu_device *adev =
2428 		container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2429 
2430 	mutex_lock(&adev->gfx.gfx_off_mutex);
2431 	if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
2432 		if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2433 			adev->gfx.gfx_off_state = true;
2434 	}
2435 	mutex_unlock(&adev->gfx.gfx_off_mutex);
2436 }
2437 
2438 /**
2439  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2440  *
2441  * @adev: amdgpu_device pointer
2442  *
2443  * Main suspend function for hardware IPs.  The list of all the hardware
2444  * IPs that make up the asic is walked, clockgating is disabled and the
2445  * suspend callbacks are run.  suspend puts the hardware and software state
2446  * in each IP into a state suitable for suspend.
2447  * Returns 0 on success, negative error code on failure.
2448  */
2449 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2450 {
2451 	int i, r;
2452 
2453 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2454 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2455 
2456 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2457 		if (!adev->ip_blocks[i].status.valid)
2458 			continue;
2459 
2460 		/* displays are handled separately */
2461 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2462 			continue;
2463 
2465 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
2466 		/* XXX handle errors */
2467 		if (r) {
2468 			DRM_ERROR("suspend of IP block <%s> failed %d\n",
2469 				  adev->ip_blocks[i].version->funcs->name, r);
2470 			return r;
2471 		}
2472 
2473 		adev->ip_blocks[i].status.hw = false;
2474 	}
2475 
2476 	return 0;
2477 }
2478 
2479 /**
2480  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2481  *
2482  * @adev: amdgpu_device pointer
2483  *
2484  * Main suspend function for hardware IPs.  The list of all the hardware
2485  * IPs that make up the asic is walked, clockgating is disabled and the
2486  * suspend callbacks are run.  suspend puts the hardware and software state
2487  * in each IP into a state suitable for suspend.
2488  * Returns 0 on success, negative error code on failure.
2489  */
2490 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2491 {
2492 	int i, r;
2493 
2494 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2495 		if (!adev->ip_blocks[i].status.valid)
2496 			continue;
2497 		/* displays are handled in phase1 */
2498 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2499 			continue;
2500 		/* PSP lost connection when err_event_athub occurs */
2501 		if (amdgpu_ras_intr_triggered() &&
2502 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2503 			adev->ip_blocks[i].status.hw = false;
2504 			continue;
2505 		}
2507 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
2508 		/* XXX handle errors */
2509 		if (r) {
2510 			DRM_ERROR("suspend of IP block <%s> failed %d\n",
2511 				  adev->ip_blocks[i].version->funcs->name, r);
2512 		}
2513 		adev->ip_blocks[i].status.hw = false;
2514 		/* handle putting the SMC in the appropriate state */
2515 		if (!amdgpu_sriov_vf(adev)) {
2516 			if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2517 				r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2518 				if (r) {
2519 					DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2520 							adev->mp1_state, r);
2521 					return r;
2522 				}
2523 			}
2524 		}
2525 		adev->ip_blocks[i].status.hw = false;
2526 	}
2527 
2528 	return 0;
2529 }
2530 
2531 /**
2532  * amdgpu_device_ip_suspend - run suspend for hardware IPs
2533  *
2534  * @adev: amdgpu_device pointer
2535  *
2536  * Main suspend function for hardware IPs.  The list of all the hardware
2537  * IPs that make up the asic is walked, clockgating is disabled and the
2538  * suspend callbacks are run.  suspend puts the hardware and software state
2539  * in each IP into a state suitable for suspend.
2540  * Returns 0 on success, negative error code on failure.
2541  */
2542 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2543 {
2544 	int r;
2545 
2546 	if (amdgpu_sriov_vf(adev))
2547 		amdgpu_virt_request_full_gpu(adev, false);
2548 
2549 	r = amdgpu_device_ip_suspend_phase1(adev);
2550 	if (r)
2551 		return r;
2552 	r = amdgpu_device_ip_suspend_phase2(adev);
2553 
2554 	if (amdgpu_sriov_vf(adev))
2555 		amdgpu_virt_release_full_gpu(adev, false);
2556 
2557 	return r;
2558 }
2559 
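/**
 * amdgpu_device_ip_reinit_early_sriov - re-init first-stage IPs for SR-IOV
 *
 * @adev: amdgpu_device pointer
 *
 * After a function level reset of a virtual function, re-run hw_init for the
 * GMC, COMMON, PSP and IH blocks in that fixed order; the remaining blocks
 * are re-initialized by the late stage.
 * Returns 0 on success, negative error code on failure.
 */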
2560 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
2561 {
2562 	int i, r;
2563 
2564 	static enum amd_ip_block_type ip_order[] = {
2565 		AMD_IP_BLOCK_TYPE_GMC,
2566 		AMD_IP_BLOCK_TYPE_COMMON,
2567 		AMD_IP_BLOCK_TYPE_PSP,
2568 		AMD_IP_BLOCK_TYPE_IH,
2569 	};
2570 
2571 	for (i = 0; i < adev->num_ip_blocks; i++)
2572 		adev->ip_blocks[i].status.hw = false;
2573 
2574 	for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2575 		int j;
2576 		struct amdgpu_ip_block *block;
2577 
2578 		for (j = 0; j < adev->num_ip_blocks; j++) {
2579 			block = &adev->ip_blocks[j];
2580 
2581 			if (block->version->type != ip_order[i] ||
2582 				!block->status.valid)
2583 				continue;
2584 
2585 			r = block->version->funcs->hw_init(adev);
2586 			DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2587 			if (r)
2588 				return r;
2589 			block->status.hw = true;
2590 		}
2591 	}
2592 
2593 	return 0;
2594 }
2595 
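/**
 * amdgpu_device_ip_reinit_late_sriov - re-init remaining IPs for SR-IOV
 *
 * @adev: amdgpu_device pointer
 *
 * After a function level reset of a virtual function, bring up the SMC, DCE,
 * GFX, SDMA, UVD, VCE and VCN blocks in that fixed order (SMC is resumed,
 * the others are re-initialized with hw_init).
 * Returns 0 on success, negative error code on failure.
 */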
2596 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
2597 {
2598 	int i, r;
2599 
2600 	static enum amd_ip_block_type ip_order[] = {
2601 		AMD_IP_BLOCK_TYPE_SMC,
2602 		AMD_IP_BLOCK_TYPE_DCE,
2603 		AMD_IP_BLOCK_TYPE_GFX,
2604 		AMD_IP_BLOCK_TYPE_SDMA,
2605 		AMD_IP_BLOCK_TYPE_UVD,
2606 		AMD_IP_BLOCK_TYPE_VCE,
2607 		AMD_IP_BLOCK_TYPE_VCN
2608 	};
2609 
2610 	for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2611 		int j;
2612 		struct amdgpu_ip_block *block;
2613 
2614 		for (j = 0; j < adev->num_ip_blocks; j++) {
2615 			block = &adev->ip_blocks[j];
2616 
2617 			if (block->version->type != ip_order[i] ||
2618 				!block->status.valid ||
2619 				block->status.hw)
2620 				continue;
2621 
2622 			if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2623 				r = block->version->funcs->resume(adev);
2624 			else
2625 				r = block->version->funcs->hw_init(adev);
2626 
2627 			DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2628 			if (r)
2629 				return r;
2630 			block->status.hw = true;
2631 		}
2632 	}
2633 
2634 	return 0;
2635 }
2636 
2637 /**
2638  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2639  *
2640  * @adev: amdgpu_device pointer
2641  *
2642  * First resume function for hardware IPs.  The list of all the hardware
2643  * IPs that make up the asic is walked and the resume callbacks are run for
2644  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
2645  * after a suspend and updates the software state as necessary.  This
2646  * function is also used for restoring the GPU after a GPU reset.
2647  * Returns 0 on success, negative error code on failure.
2648  */
2649 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2650 {
2651 	int i, r;
2652 
2653 	for (i = 0; i < adev->num_ip_blocks; i++) {
2654 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2655 			continue;
2656 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2657 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2658 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2659 
2660 			r = adev->ip_blocks[i].version->funcs->resume(adev);
2661 			if (r) {
2662 				DRM_ERROR("resume of IP block <%s> failed %d\n",
2663 					  adev->ip_blocks[i].version->funcs->name, r);
2664 				return r;
2665 			}
2666 			adev->ip_blocks[i].status.hw = true;
2667 		}
2668 	}
2669 
2670 	return 0;
2671 }
2672 
2673 /**
2674  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2675  *
2676  * @adev: amdgpu_device pointer
2677  *
2678  * Second resume function for hardware IPs.  The list of all the hardware
2679  * IPs that make up the asic is walked and the resume callbacks are run for
2680  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
2681  * functional state after a suspend and updates the software state as
2682  * necessary.  This function is also used for restoring the GPU after a GPU
2683  * reset.
2684  * Returns 0 on success, negative error code on failure.
2685  */
2686 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2687 {
2688 	int i, r;
2689 
2690 	for (i = 0; i < adev->num_ip_blocks; i++) {
2691 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2692 			continue;
2693 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2694 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2695 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2696 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2697 			continue;
2698 		r = adev->ip_blocks[i].version->funcs->resume(adev);
2699 		if (r) {
2700 			DRM_ERROR("resume of IP block <%s> failed %d\n",
2701 				  adev->ip_blocks[i].version->funcs->name, r);
2702 			return r;
2703 		}
2704 		adev->ip_blocks[i].status.hw = true;
2705 	}
2706 
2707 	return 0;
2708 }
2709 
2710 /**
2711  * amdgpu_device_ip_resume - run resume for hardware IPs
2712  *
2713  * @adev: amdgpu_device pointer
2714  *
2715  * Main resume function for hardware IPs.  The hardware IPs
2716  * are split into two resume functions because they are
2717  * are split into two resume functions because they are
2718  * also used in recovering from a GPU reset and some additional
2719  * steps need to be taken between them.  In this case (S3/S4) they are
2720  * Returns 0 on success, negative error code on failure.
2721  */
2722 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
2723 {
2724 	int r;
2725 
2726 	r = amdgpu_device_ip_resume_phase1(adev);
2727 	if (r)
2728 		return r;
2729 
2730 	r = amdgpu_device_fw_loading(adev);
2731 	if (r)
2732 		return r;
2733 
2734 	r = amdgpu_device_ip_resume_phase2(adev);
2735 
2736 	return r;
2737 }
2738 
2739 /**
2740  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2741  *
2742  * @adev: amdgpu_device pointer
2743  *
2744  * Query the VBIOS data tables to determine if the board supports SR-IOV.
2745  */
2746 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
2747 {
2748 	if (amdgpu_sriov_vf(adev)) {
2749 		if (adev->is_atom_fw) {
2750 			if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2751 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2752 		} else {
2753 			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2754 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2755 		}
2756 
2757 		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2758 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
2759 	}
2760 }
2761 
2762 /**
2763  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2764  *
2765  * @asic_type: AMD asic type
2766  *
2767  * Check if there is DC (new modesetting infrastructure) support for an asic.
2768  * Returns true if DC has support, false if not.
2769  */
2770 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
2771 {
2772 	switch (asic_type) {
2773 #if defined(CONFIG_DRM_AMD_DC)
2774 	case CHIP_BONAIRE:
2775 	case CHIP_KAVERI:
2776 	case CHIP_KABINI:
2777 	case CHIP_MULLINS:
2778 		/*
2779 		 * We have systems in the wild with these ASICs that require
2780 		 * LVDS and VGA support which is not supported with DC.
2781 		 *
2782 		 * Fallback to the non-DC driver here by default so as not to
2783 		 * cause regressions.
2784 		 */
2785 		return amdgpu_dc > 0;
2786 	case CHIP_HAWAII:
2787 	case CHIP_CARRIZO:
2788 	case CHIP_STONEY:
2789 	case CHIP_POLARIS10:
2790 	case CHIP_POLARIS11:
2791 	case CHIP_POLARIS12:
2792 	case CHIP_VEGAM:
2793 	case CHIP_TONGA:
2794 	case CHIP_FIJI:
2795 	case CHIP_VEGA10:
2796 	case CHIP_VEGA12:
2797 	case CHIP_VEGA20:
2798 #if defined(CONFIG_DRM_AMD_DC_DCN)
2799 	case CHIP_RAVEN:
2800 	case CHIP_NAVI10:
2801 	case CHIP_NAVI14:
2802 	case CHIP_NAVI12:
2803 	case CHIP_RENOIR:
2804 #endif
2805 #if defined(CONFIG_DRM_AMD_DC_DCN3_0)
2806 	case CHIP_SIENNA_CICHLID:
2807 	case CHIP_NAVY_FLOUNDER:
2808 #endif
2809 		return amdgpu_dc != 0;
2810 #endif
2811 	default:
2812 		if (amdgpu_dc > 0)
2813 			DRM_INFO("Display Core has been requested via kernel parameter "
2814 					 "but isn't supported by ASIC, ignoring\n");
2815 		return false;
2816 	}
2817 }
2818 
2819 /**
2820  * amdgpu_device_has_dc_support - check if dc is supported
2821  *
2822  * @adev: amdgpu_device pointer
2823  *
2824  * Returns true for supported, false for not supported
2825  */
2826 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
2827 {
2828 	if (amdgpu_sriov_vf(adev))
2829 		return false;
2830 
2831 	return amdgpu_device_asic_has_dc_support(adev->asic_type);
2832 }
2833 
2834 
2835 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
2836 {
2837 	struct amdgpu_device *adev =
2838 		container_of(__work, struct amdgpu_device, xgmi_reset_work);
2839 	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
2840 
2841 	/* It's a bug to not have a hive within this function */
2842 	if (WARN_ON(!hive))
2843 		return;
2844 
2845 	/*
2846 	 * Use task barrier to synchronize all xgmi reset works across the
2847 	 * hive. task_barrier_enter and task_barrier_exit will block
2848 	 * until all the threads running the xgmi reset works reach
2849 	 * those points. task_barrier_full will do both blocks.
2850 	 */
2851 	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
2852 
2853 		task_barrier_enter(&hive->tb);
2854 		adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev);
2855 
2856 		if (adev->asic_reset_res)
2857 			goto fail;
2858 
2859 		task_barrier_exit(&hive->tb);
2860 		adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev);
2861 
2862 		if (adev->asic_reset_res)
2863 			goto fail;
2864 
2865 		if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
2866 			adev->mmhub.funcs->reset_ras_error_count(adev);
2867 	} else {
2868 
2869 		task_barrier_full(&hive->tb);
2870 		adev->asic_reset_res =  amdgpu_asic_reset(adev);
2871 	}
2872 
2873 fail:
2874 	if (adev->asic_reset_res)
2875 		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
2876 			 adev->asic_reset_res, adev->ddev->unique);
2877 }
2878 
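/**
 * amdgpu_device_get_job_timeout_settings - set per-queue job timeouts
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the amdgpu.lockup_timeout module parameter and programs the gfx,
 * compute, sdma and video job timeouts in that order.  As an illustrative
 * example, "10000,60000,10000,10000" would give every queue a 10 second
 * timeout except compute, which would get 60 seconds; 0 keeps the default
 * and a negative value disables the timeout for that queue.
 * Returns 0 on success, or a negative error code if a value is malformed.
 */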
2879 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
2880 {
2881 	char *input = amdgpu_lockup_timeout;
2882 	char *timeout_setting = NULL;
2883 	int index = 0;
2884 	long timeout;
2885 	int ret = 0;
2886 
2887 	/*
2888 	 * By default the timeout for non-compute jobs is 10000 ms
2889 	 * and there is no timeout enforced on compute jobs.
2890 	 * In SR-IOV or passthrough mode, the default timeout for
2891 	 * compute jobs is 60000 ms.
2892 	 */
2893 	adev->gfx_timeout = msecs_to_jiffies(10000);
2894 	adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
2895 	if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
2896 		adev->compute_timeout =  msecs_to_jiffies(60000);
2897 	else
2898 		adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
2899 
2900 	if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
2901 		while ((timeout_setting = strsep(&input, ",")) &&
2902 				strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
2903 			ret = kstrtol(timeout_setting, 0, &timeout);
2904 			if (ret)
2905 				return ret;
2906 
2907 			if (timeout == 0) {
2908 				index++;
2909 				continue;
2910 			} else if (timeout < 0) {
2911 				timeout = MAX_SCHEDULE_TIMEOUT;
2912 			} else {
2913 				timeout = msecs_to_jiffies(timeout);
2914 			}
2915 
2916 			switch (index++) {
2917 			case 0:
2918 				adev->gfx_timeout = timeout;
2919 				break;
2920 			case 1:
2921 				adev->compute_timeout = timeout;
2922 				break;
2923 			case 2:
2924 				adev->sdma_timeout = timeout;
2925 				break;
2926 			case 3:
2927 				adev->video_timeout = timeout;
2928 				break;
2929 			default:
2930 				break;
2931 			}
2932 		}
2933 		/*
2934 		 * When only one value was specified, it applies
2935 		 * to all non-compute jobs.
2936 		 */
2937 		if (index == 1) {
2938 			adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
2939 			if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
2940 				adev->compute_timeout = adev->gfx_timeout;
2941 		}
2942 	}
2943 
2944 	return ret;
2945 }
2946 
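/* sysfs attributes exposed for each amdgpu device: product identification and
 * the PCIe replay count
 */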
2947 static const struct attribute *amdgpu_dev_attributes[] = {
2948 	&dev_attr_product_name.attr,
2949 	&dev_attr_product_number.attr,
2950 	&dev_attr_serial_number.attr,
2951 	&dev_attr_pcie_replay_count.attr,
2952 	NULL
2953 };
2954 
2955 /**
2956  * amdgpu_device_init - initialize the driver
2957  *
2958  * @adev: amdgpu_device pointer
2959  * @ddev: drm dev pointer
2960  * @pdev: pci dev pointer
2961  * @flags: driver flags
2962  *
2963  * Initializes the driver info and hw (all asics).
2964  * Returns 0 for success or an error on failure.
2965  * Called at driver startup.
2966  */
2967 int amdgpu_device_init(struct amdgpu_device *adev,
2968 		       struct drm_device *ddev,
2969 		       struct pci_dev *pdev,
2970 		       uint32_t flags)
2971 {
2972 	int r, i;
2973 	bool boco = false;
2974 	u32 max_MBps;
2975 
2976 	adev->shutdown = false;
2977 	adev->dev = &pdev->dev;
2978 	adev->ddev = ddev;
2979 	adev->pdev = pdev;
2980 	adev->flags = flags;
2981 
2982 	if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
2983 		adev->asic_type = amdgpu_force_asic_type;
2984 	else
2985 		adev->asic_type = flags & AMD_ASIC_MASK;
2986 
2987 	adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
2988 	if (amdgpu_emu_mode == 1)
2989 		adev->usec_timeout *= 10;
2990 	adev->gmc.gart_size = 512 * 1024 * 1024;
2991 	adev->accel_working = false;
2992 	adev->num_rings = 0;
2993 	adev->mman.buffer_funcs = NULL;
2994 	adev->mman.buffer_funcs_ring = NULL;
2995 	adev->vm_manager.vm_pte_funcs = NULL;
2996 	adev->vm_manager.vm_pte_num_scheds = 0;
2997 	adev->gmc.gmc_funcs = NULL;
2998 	adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
2999 	bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3000 
3001 	adev->smc_rreg = &amdgpu_invalid_rreg;
3002 	adev->smc_wreg = &amdgpu_invalid_wreg;
3003 	adev->pcie_rreg = &amdgpu_invalid_rreg;
3004 	adev->pcie_wreg = &amdgpu_invalid_wreg;
3005 	adev->pciep_rreg = &amdgpu_invalid_rreg;
3006 	adev->pciep_wreg = &amdgpu_invalid_wreg;
3007 	adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3008 	adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3009 	adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3010 	adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3011 	adev->didt_rreg = &amdgpu_invalid_rreg;
3012 	adev->didt_wreg = &amdgpu_invalid_wreg;
3013 	adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3014 	adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3015 	adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3016 	adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3017 
3018 	DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3019 		 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3020 		 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3021 
3022 	/* mutex initializations are all done here so we
3023 	 * can recall functions without locking issues */
3024 	atomic_set(&adev->irq.ih.lock, 0);
3025 	mutex_init(&adev->firmware.mutex);
3026 	mutex_init(&adev->pm.mutex);
3027 	mutex_init(&adev->gfx.gpu_clock_mutex);
3028 	mutex_init(&adev->srbm_mutex);
3029 	mutex_init(&adev->gfx.pipe_reserve_mutex);
3030 	mutex_init(&adev->gfx.gfx_off_mutex);
3031 	mutex_init(&adev->grbm_idx_mutex);
3032 	mutex_init(&adev->mn_lock);
3033 	mutex_init(&adev->virt.vf_errors.lock);
3034 	hash_init(adev->mn_hash);
3035 	mutex_init(&adev->lock_reset);
3036 	mutex_init(&adev->psp.mutex);
3037 	mutex_init(&adev->notifier_lock);
3038 
3039 	r = amdgpu_device_check_arguments(adev);
3040 	if (r)
3041 		return r;
3042 
3043 	spin_lock_init(&adev->mmio_idx_lock);
3044 	spin_lock_init(&adev->smc_idx_lock);
3045 	spin_lock_init(&adev->pcie_idx_lock);
3046 	spin_lock_init(&adev->uvd_ctx_idx_lock);
3047 	spin_lock_init(&adev->didt_idx_lock);
3048 	spin_lock_init(&adev->gc_cac_idx_lock);
3049 	spin_lock_init(&adev->se_cac_idx_lock);
3050 	spin_lock_init(&adev->audio_endpt_idx_lock);
3051 	spin_lock_init(&adev->mm_stats.lock);
3052 
3053 	INIT_LIST_HEAD(&adev->shadow_list);
3054 	mutex_init(&adev->shadow_list_lock);
3055 
3056 	INIT_DELAYED_WORK(&adev->delayed_init_work,
3057 			  amdgpu_device_delayed_init_work_handler);
3058 	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3059 			  amdgpu_device_delay_enable_gfx_off);
3060 
3061 	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3062 
3063 	adev->gfx.gfx_off_req_count = 1;
3064 	adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3065 
3066 	atomic_set(&adev->throttling_logging_enabled, 1);
3067 	/*
3068 	 * If throttling continues, logging will be performed every minute
3069 	 * to avoid log flooding. "-1" is subtracted since the thermal
3070 	 * throttling interrupt comes every second. Thus, the total logging
3071 	 * interval is 59 seconds (ratelimited printk interval) + 1 second (waiting
3072 	 * for the throttling interrupt) = 60 seconds.
3073 	 */
3074 	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3075 	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3076 
3077 	/* Registers mapping */
3078 	/* TODO: block userspace mapping of io register */
3079 	if (adev->asic_type >= CHIP_BONAIRE) {
3080 		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3081 		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3082 	} else {
3083 		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3084 		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3085 	}
3086 
3087 	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3088 	if (adev->rmmio == NULL) {
3089 		return -ENOMEM;
3090 	}
3091 	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3092 	DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3093 
3094 	/* io port mapping */
3095 	for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
3096 		if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
3097 			adev->rio_mem_size = pci_resource_len(adev->pdev, i);
3098 			adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
3099 			break;
3100 		}
3101 	}
3102 	if (adev->rio_mem == NULL)
3103 		DRM_INFO("PCI I/O BAR is not found.\n");
3104 
3105 	/* enable PCIE atomic ops */
3106 	r = pci_enable_atomic_ops_to_root(adev->pdev,
3107 					  PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3108 					  PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3109 	if (r) {
3110 		adev->have_atomics_support = false;
3111 		DRM_INFO("PCIE atomic ops are not supported\n");
3112 	} else {
3113 		adev->have_atomics_support = true;
3114 	}
3115 
3116 	amdgpu_device_get_pcie_info(adev);
3117 
3118 	if (amdgpu_mcbp)
3119 		DRM_INFO("MCBP is enabled\n");
3120 
3121 	if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3122 		adev->enable_mes = true;
3123 
3124 	/* detect hw virtualization here */
3125 	amdgpu_detect_virtualization(adev);
3126 
3127 	r = amdgpu_device_get_job_timeout_settings(adev);
3128 	if (r) {
3129 		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3130 		return r;
3131 	}
3132 
3133 	/* early init functions */
3134 	r = amdgpu_device_ip_early_init(adev);
3135 	if (r)
3136 		return r;
3137 
3138 	/* doorbell bar mapping and doorbell index init */
3139 	amdgpu_device_doorbell_init(adev);
3140 
3141 	/* if we have more than one VGA card, then disable the amdgpu VGA resources */
3142 	/* this will fail for cards that aren't VGA class devices, just
3143 	 * ignore it */
3144 	vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3145 
3146 	if (amdgpu_device_supports_boco(ddev))
3147 		boco = true;
3148 	if (amdgpu_has_atpx() &&
3149 	    (amdgpu_is_atpx_hybrid() ||
3150 	     amdgpu_has_atpx_dgpu_power_cntl()) &&
3151 	    !pci_is_thunderbolt_attached(adev->pdev))
3152 		vga_switcheroo_register_client(adev->pdev,
3153 					       &amdgpu_switcheroo_ops, boco);
3154 	if (boco)
3155 		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3156 
3157 	if (amdgpu_emu_mode == 1) {
3158 		/* post the asic in emulation mode */
3159 		emu_soc_asic_init(adev);
3160 		goto fence_driver_init;
3161 	}
3162 
3163 	/* detect if we have an SR-IOV vbios */
3164 	amdgpu_device_detect_sriov_bios(adev);
3165 
3166 	/* check if we need to reset the asic
3167 	 *  E.g., driver was not cleanly unloaded previously, etc.
3168 	 */
3169 	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3170 		r = amdgpu_asic_reset(adev);
3171 		if (r) {
3172 			dev_err(adev->dev, "asic reset on init failed\n");
3173 			goto failed;
3174 		}
3175 	}
3176 
3177 	/* Post card if necessary */
3178 	if (amdgpu_device_need_post(adev)) {
3179 		if (!adev->bios) {
3180 			dev_err(adev->dev, "no vBIOS found\n");
3181 			r = -EINVAL;
3182 			goto failed;
3183 		}
3184 		DRM_INFO("GPU posting now...\n");
3185 		r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
3186 		if (r) {
3187 			dev_err(adev->dev, "gpu post error!\n");
3188 			goto failed;
3189 		}
3190 	}
3191 
3192 	if (adev->is_atom_fw) {
3193 		/* Initialize clocks */
3194 		r = amdgpu_atomfirmware_get_clock_info(adev);
3195 		if (r) {
3196 			dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3197 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3198 			goto failed;
3199 		}
3200 	} else {
3201 		/* Initialize clocks */
3202 		r = amdgpu_atombios_get_clock_info(adev);
3203 		if (r) {
3204 			dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3205 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3206 			goto failed;
3207 		}
3208 		/* init i2c buses */
3209 		if (!amdgpu_device_has_dc_support(adev))
3210 			amdgpu_atombios_i2c_init(adev);
3211 	}
3212 
3213 fence_driver_init:
3214 	/* Fence driver */
3215 	r = amdgpu_fence_driver_init(adev);
3216 	if (r) {
3217 		dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
3218 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3219 		goto failed;
3220 	}
3221 
3222 	/* init the mode config */
3223 	drm_mode_config_init(adev->ddev);
3224 
3225 	r = amdgpu_device_ip_init(adev);
3226 	if (r) {
3227 		/* failed in exclusive mode due to timeout */
3228 		if (amdgpu_sriov_vf(adev) &&
3229 		    !amdgpu_sriov_runtime(adev) &&
3230 		    amdgpu_virt_mmio_blocked(adev) &&
3231 		    !amdgpu_virt_wait_reset(adev)) {
3232 			dev_err(adev->dev, "VF exclusive mode timeout\n");
3233 			/* Don't send request since VF is inactive. */
3234 			adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3235 			adev->virt.ops = NULL;
3236 			r = -EAGAIN;
3237 			goto failed;
3238 		}
3239 		dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3240 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3241 		goto failed;
3242 	}
3243 
3244 	dev_info(adev->dev,
3245 		"SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3246 			adev->gfx.config.max_shader_engines,
3247 			adev->gfx.config.max_sh_per_se,
3248 			adev->gfx.config.max_cu_per_sh,
3249 			adev->gfx.cu_info.number);
3250 
3251 	adev->accel_working = true;
3252 
3253 	amdgpu_vm_check_compute_bug(adev);
3254 
3255 	/* Initialize the buffer migration limit. */
3256 	if (amdgpu_moverate >= 0)
3257 		max_MBps = amdgpu_moverate;
3258 	else
3259 		max_MBps = 8; /* Allow 8 MB/s. */
3260 	/* Get a log2 for easy divisions. */
3261 	adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3262 
3263 	amdgpu_fbdev_init(adev);
3264 
3265 	r = amdgpu_pm_sysfs_init(adev);
3266 	if (r) {
3267 		adev->pm_sysfs_en = false;
3268 		DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3269 	} else
3270 		adev->pm_sysfs_en = true;
3271 
3272 	r = amdgpu_ucode_sysfs_init(adev);
3273 	if (r) {
3274 		adev->ucode_sysfs_en = false;
3275 		DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3276 	} else
3277 		adev->ucode_sysfs_en = true;
3278 
3279 	if ((amdgpu_testing & 1)) {
3280 		if (adev->accel_working)
3281 			amdgpu_test_moves(adev);
3282 		else
3283 			DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3284 	}
3285 	if (amdgpu_benchmarking) {
3286 		if (adev->accel_working)
3287 			amdgpu_benchmark(adev, amdgpu_benchmarking);
3288 		else
3289 			DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3290 	}
3291 
3292 	/*
3293 	 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3294 	 * Otherwise the mgpu fan boost feature will be skipped because the
3295 	 * gpu instance count will be too low.
3296 	 */
3297 	amdgpu_register_gpu_instance(adev);
3298 
3299 	/* enable clockgating, etc. after ib tests, etc. since some blocks require
3300 	 * explicit gating rather than handling it automatically.
3301 	 */
3302 	r = amdgpu_device_ip_late_init(adev);
3303 	if (r) {
3304 		dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3305 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3306 		goto failed;
3307 	}
3308 
3309 	/* must succeed. */
3310 	amdgpu_ras_resume(adev);
3311 
3312 	queue_delayed_work(system_wq, &adev->delayed_init_work,
3313 			   msecs_to_jiffies(AMDGPU_RESUME_MS));
3314 
3315 	if (amdgpu_sriov_vf(adev))
3316 		flush_delayed_work(&adev->delayed_init_work);
3317 
3318 	r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3319 	if (r) {
3320 		dev_err(adev->dev, "Could not create amdgpu device attr\n");
3321 		return r;
3322 	}
3323 
3324 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
3325 		r = amdgpu_pmu_init(adev);
3326 	if (r)
3327 		dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3328 
3329 	return 0;
3330 
3331 failed:
3332 	amdgpu_vf_error_trans_all(adev);
3333 	if (boco)
3334 		vga_switcheroo_fini_domain_pm_ops(adev->dev);
3335 
3336 	return r;
3337 }
3338 
3339 /**
3340  * amdgpu_device_fini - tear down the driver
3341  *
3342  * @adev: amdgpu_device pointer
3343  *
3344  * Tear down the driver info (all asics).
3345  * Called at driver shutdown.
3346  */
3347 void amdgpu_device_fini(struct amdgpu_device *adev)
3348 {
3349 	int r;
3350 
3351 	DRM_INFO("amdgpu: finishing device.\n");
3352 	flush_delayed_work(&adev->delayed_init_work);
3353 	adev->shutdown = true;
3354 
3355 	/* make sure IB tests have finished before entering exclusive mode
3356 	 * to avoid preemption on the IB tests
3357 	 */
3358 	if (amdgpu_sriov_vf(adev))
3359 		amdgpu_virt_request_full_gpu(adev, false);
3360 
3361 	/* disable all interrupts */
3362 	amdgpu_irq_disable_all(adev);
3363 	if (adev->mode_info.mode_config_initialized) {
3364 		if (!amdgpu_device_has_dc_support(adev))
3365 			drm_helper_force_disable_all(adev->ddev);
3366 		else
3367 			drm_atomic_helper_shutdown(adev->ddev);
3368 	}
3369 	amdgpu_fence_driver_fini(adev);
3370 	if (adev->pm_sysfs_en)
3371 		amdgpu_pm_sysfs_fini(adev);
3372 	amdgpu_fbdev_fini(adev);
3373 	r = amdgpu_device_ip_fini(adev);
3374 	release_firmware(adev->firmware.gpu_info_fw);
3375 	adev->firmware.gpu_info_fw = NULL;
3376 	adev->accel_working = false;
3377 	/* free i2c buses */
3378 	if (!amdgpu_device_has_dc_support(adev))
3379 		amdgpu_i2c_fini(adev);
3380 
3381 	if (amdgpu_emu_mode != 1)
3382 		amdgpu_atombios_fini(adev);
3383 
3384 	kfree(adev->bios);
3385 	adev->bios = NULL;
3386 	if (amdgpu_has_atpx() &&
3387 	    (amdgpu_is_atpx_hybrid() ||
3388 	     amdgpu_has_atpx_dgpu_power_cntl()) &&
3389 	    !pci_is_thunderbolt_attached(adev->pdev))
3390 		vga_switcheroo_unregister_client(adev->pdev);
3391 	if (amdgpu_device_supports_boco(adev->ddev))
3392 		vga_switcheroo_fini_domain_pm_ops(adev->dev);
3393 	vga_client_register(adev->pdev, NULL, NULL, NULL);
3394 	if (adev->rio_mem)
3395 		pci_iounmap(adev->pdev, adev->rio_mem);
3396 	adev->rio_mem = NULL;
3397 	iounmap(adev->rmmio);
3398 	adev->rmmio = NULL;
3399 	amdgpu_device_doorbell_fini(adev);
3400 
3401 	if (adev->ucode_sysfs_en)
3402 		amdgpu_ucode_sysfs_fini(adev);
3403 
3404 	sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3405 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
3406 		amdgpu_pmu_fini(adev);
3407 	if (adev->discovery_bin)
3408 		amdgpu_discovery_fini(adev);
3409 }
3410 
3411 
3412 /*
3413  * Suspend & resume.
3414  */
3415 /**
3416  * amdgpu_device_suspend - initiate device suspend
3417  *
3418  * @dev: drm dev pointer
3419  * @fbcon : notify the fbdev of suspend
3420  *
3421  * Puts the hw in the suspend state (all asics).
3422  * Returns 0 for success or an error on failure.
3423  * Called at driver suspend.
3424  */
3425 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
3426 {
3427 	struct amdgpu_device *adev;
3428 	struct drm_crtc *crtc;
3429 	struct drm_connector *connector;
3430 	struct drm_connector_list_iter iter;
3431 	int r;
3432 
3433 	if (dev == NULL || dev->dev_private == NULL) {
3434 		return -ENODEV;
3435 	}
3436 
3437 	adev = dev->dev_private;
3438 
3439 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3440 		return 0;
3441 
3442 	adev->in_suspend = true;
3443 	drm_kms_helper_poll_disable(dev);
3444 
3445 	if (fbcon)
3446 		amdgpu_fbdev_set_suspend(adev, 1);
3447 
3448 	cancel_delayed_work_sync(&adev->delayed_init_work);
3449 
3450 	if (!amdgpu_device_has_dc_support(adev)) {
3451 		/* turn off display hw */
3452 		drm_modeset_lock_all(dev);
3453 		drm_connector_list_iter_begin(dev, &iter);
3454 		drm_for_each_connector_iter(connector, &iter)
3455 			drm_helper_connector_dpms(connector,
3456 						  DRM_MODE_DPMS_OFF);
3457 		drm_connector_list_iter_end(&iter);
3458 		drm_modeset_unlock_all(dev);
3459 		/* unpin the front buffers and cursors */
3460 		list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3461 			struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3462 			struct drm_framebuffer *fb = crtc->primary->fb;
3463 			struct amdgpu_bo *robj;
3464 
3465 			if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3466 				struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3467 				r = amdgpu_bo_reserve(aobj, true);
3468 				if (r == 0) {
3469 					amdgpu_bo_unpin(aobj);
3470 					amdgpu_bo_unreserve(aobj);
3471 				}
3472 			}
3473 
3474 			if (fb == NULL || fb->obj[0] == NULL) {
3475 				continue;
3476 			}
3477 			robj = gem_to_amdgpu_bo(fb->obj[0]);
3478 			/* don't unpin kernel fb objects */
3479 			if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
3480 				r = amdgpu_bo_reserve(robj, true);
3481 				if (r == 0) {
3482 					amdgpu_bo_unpin(robj);
3483 					amdgpu_bo_unreserve(robj);
3484 				}
3485 			}
3486 		}
3487 	}
3488 
3489 	amdgpu_ras_suspend(adev);
3490 
3491 	r = amdgpu_device_ip_suspend_phase1(adev);
3492 
3493 	amdgpu_amdkfd_suspend(adev, !fbcon);
3494 
3495 	/* evict vram memory */
3496 	amdgpu_bo_evict_vram(adev);
3497 
3498 	amdgpu_fence_driver_suspend(adev);
3499 
3500 	r = amdgpu_device_ip_suspend_phase2(adev);
3501 
3502 	/* evict remaining vram memory
3503 	 * This second call to evict vram is to evict the gart page table
3504 	 * using the CPU.
3505 	 */
3506 	amdgpu_bo_evict_vram(adev);
3507 
3508 	return 0;
3509 }
3510 
3511 /**
3512  * amdgpu_device_resume - initiate device resume
3513  *
3514  * @dev: drm dev pointer
3515  * @fbcon : notify the fbdev of resume
3516  *
3517  * Bring the hw back to operating state (all asics).
3518  * Returns 0 for success or an error on failure.
3519  * Called at driver resume.
3520  */
3521 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
3522 {
3523 	struct drm_connector *connector;
3524 	struct drm_connector_list_iter iter;
3525 	struct amdgpu_device *adev = dev->dev_private;
3526 	struct drm_crtc *crtc;
3527 	int r = 0;
3528 
3529 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3530 		return 0;
3531 
3532 	/* post card */
3533 	if (amdgpu_device_need_post(adev)) {
3534 		r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
3535 		if (r)
3536 			DRM_ERROR("amdgpu asic init failed\n");
3537 	}
3538 
3539 	r = amdgpu_device_ip_resume(adev);
3540 	if (r) {
3541 		DRM_ERROR("amdgpu_device_ip_resume failed (%d).\n", r);
3542 		return r;
3543 	}
3544 	amdgpu_fence_driver_resume(adev);
3545 
3546 
3547 	r = amdgpu_device_ip_late_init(adev);
3548 	if (r)
3549 		return r;
3550 
3551 	queue_delayed_work(system_wq, &adev->delayed_init_work,
3552 			   msecs_to_jiffies(AMDGPU_RESUME_MS));
3553 
3554 	if (!amdgpu_device_has_dc_support(adev)) {
3555 		/* pin cursors */
3556 		list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3557 			struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3558 
3559 			if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3560 				struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3561 				r = amdgpu_bo_reserve(aobj, true);
3562 				if (r == 0) {
3563 					r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
3564 					if (r != 0)
3565 						DRM_ERROR("Failed to pin cursor BO (%d)\n", r);
3566 					amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
3567 					amdgpu_bo_unreserve(aobj);
3568 				}
3569 			}
3570 		}
3571 	}
3572 	r = amdgpu_amdkfd_resume(adev, !fbcon);
3573 	if (r)
3574 		return r;
3575 
3576 	/* Make sure IB tests are flushed */
3577 	flush_delayed_work(&adev->delayed_init_work);
3578 
3579 	/* blat the mode back in */
3580 	if (fbcon) {
3581 		if (!amdgpu_device_has_dc_support(adev)) {
3582 			/* pre DCE11 */
3583 			drm_helper_resume_force_mode(dev);
3584 
3585 			/* turn on display hw */
3586 			drm_modeset_lock_all(dev);
3587 
3588 			drm_connector_list_iter_begin(dev, &iter);
3589 			drm_for_each_connector_iter(connector, &iter)
3590 				drm_helper_connector_dpms(connector,
3591 							  DRM_MODE_DPMS_ON);
3592 			drm_connector_list_iter_end(&iter);
3593 
3594 			drm_modeset_unlock_all(dev);
3595 		}
3596 		amdgpu_fbdev_set_suspend(adev, 0);
3597 	}
3598 
3599 	drm_kms_helper_poll_enable(dev);
3600 
3601 	amdgpu_ras_resume(adev);
3602 
3603 	/*
3604 	 * Most of the connector probing functions try to acquire runtime pm
3605 	 * refs to ensure that the GPU is powered on when connector polling is
3606 	 * performed. Since we're calling this from a runtime PM callback,
3607 	 * trying to acquire rpm refs will cause us to deadlock.
3608 	 *
3609 	 * Since we're guaranteed to be holding the rpm lock, it's safe to
3610 	 * temporarily disable the rpm helpers so this doesn't deadlock us.
3611 	 */
3612 #ifdef CONFIG_PM
3613 	dev->dev->power.disable_depth++;
3614 #endif
3615 	if (!amdgpu_device_has_dc_support(adev))
3616 		drm_helper_hpd_irq_event(dev);
3617 	else
3618 		drm_kms_helper_hotplug_event(dev);
3619 #ifdef CONFIG_PM
3620 	dev->dev->power.disable_depth--;
3621 #endif
3622 	adev->in_suspend = false;
3623 
3624 	return 0;
3625 }
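
/*
 * The runtime PM dance at the end of amdgpu_device_resume() is a
 * generic pattern worth calling out: when a helper is known to take
 * runtime PM references and we may already be inside a runtime PM
 * callback, temporarily bumping power.disable_depth turns those
 * acquisitions into no-ops and avoids the deadlock.  Sketch, with
 * helper_that_takes_rpm_refs() as a placeholder:
 *
 *   #ifdef CONFIG_PM
 *           dev->dev->power.disable_depth++;
 *   #endif
 *           helper_that_takes_rpm_refs(dev);
 *   #ifdef CONFIG_PM
 *           dev->dev->power.disable_depth--;
 *   #endif
 */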
3626 
3627 /**
3628  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3629  *
3630  * @adev: amdgpu_device pointer
3631  *
3632  * The list of all the hardware IPs that make up the asic is walked and
3633  * the check_soft_reset callbacks are run.  check_soft_reset determines
3634  * if the asic is still hung or not.
3635  * Returns true if any of the IPs are still in a hung state, false if not.
3636  */
3637 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
3638 {
3639 	int i;
3640 	bool asic_hang = false;
3641 
3642 	if (amdgpu_sriov_vf(adev))
3643 		return true;
3644 
3645 	if (amdgpu_asic_need_full_reset(adev))
3646 		return true;
3647 
3648 	for (i = 0; i < adev->num_ip_blocks; i++) {
3649 		if (!adev->ip_blocks[i].status.valid)
3650 			continue;
3651 		if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3652 			adev->ip_blocks[i].status.hang =
3653 				adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3654 		if (adev->ip_blocks[i].status.hang) {
3655 			DRM_INFO("IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
3656 			asic_hang = true;
3657 		}
3658 	}
3659 	return asic_hang;
3660 }
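
/*
 * Example shape of a check_soft_reset callback consumed by the loop
 * above.  The IP block, register and mask names are hypothetical;
 * real implementations live in the individual IP files:
 *
 *   static bool example_ip_check_soft_reset(void *handle)
 *   {
 *           struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 *
 *           // report "hung" while the (hypothetical) busy bit is set
 *           return !!(RREG32(mmEXAMPLE_STATUS) & EXAMPLE_STATUS__BUSY_MASK);
 *   }
 */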
3661 
3662 /**
3663  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3664  *
3665  * @adev: amdgpu_device pointer
3666  *
3667  * The list of all the hardware IPs that make up the asic is walked and the
3668  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
3669  * handles any IP specific hardware or software state changes that are
3670  * necessary for a soft reset to succeed.
3671  * Returns 0 on success, negative error code on failure.
3672  */
3673 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
3674 {
3675 	int i, r = 0;
3676 
3677 	for (i = 0; i < adev->num_ip_blocks; i++) {
3678 		if (!adev->ip_blocks[i].status.valid)
3679 			continue;
3680 		if (adev->ip_blocks[i].status.hang &&
3681 		    adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3682 			r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
3683 			if (r)
3684 				return r;
3685 		}
3686 	}
3687 
3688 	return 0;
3689 }
3690 
3691 /**
3692  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3693  *
3694  * @adev: amdgpu_device pointer
3695  *
3696  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
3697  * reset is necessary to recover.
3698  * Returns true if a full asic reset is required, false if not.
3699  */
3700 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
3701 {
3702 	int i;
3703 
3704 	if (amdgpu_asic_need_full_reset(adev))
3705 		return true;
3706 
3707 	for (i = 0; i < adev->num_ip_blocks; i++) {
3708 		if (!adev->ip_blocks[i].status.valid)
3709 			continue;
3710 		if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3711 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3712 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
3713 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3714 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3715 			if (adev->ip_blocks[i].status.hang) {
3716 				DRM_INFO("Some blocks need full reset!\n");
3717 				return true;
3718 			}
3719 		}
3720 	}
3721 	return false;
3722 }
3723 
3724 /**
3725  * amdgpu_device_ip_soft_reset - do a soft reset
3726  *
3727  * @adev: amdgpu_device pointer
3728  *
3729  * The list of all the hardware IPs that make up the asic is walked and the
3730  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
3731  * IP specific hardware or software state changes that are necessary to soft
3732  * reset the IP.
3733  * Returns 0 on success, negative error code on failure.
3734  */
3735 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
3736 {
3737 	int i, r = 0;
3738 
3739 	for (i = 0; i < adev->num_ip_blocks; i++) {
3740 		if (!adev->ip_blocks[i].status.valid)
3741 			continue;
3742 		if (adev->ip_blocks[i].status.hang &&
3743 		    adev->ip_blocks[i].version->funcs->soft_reset) {
3744 			r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
3745 			if (r)
3746 				return r;
3747 		}
3748 	}
3749 
3750 	return 0;
3751 }
3752 
3753 /**
3754  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
3755  *
3756  * @adev: amdgpu_device pointer
3757  *
3758  * The list of all the hardware IPs that make up the asic is walked and the
3759  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
3760  * handles any IP specific hardware or software state changes that are
3761  * necessary after the IP has been soft reset.
3762  * Returns 0 on success, negative error code on failure.
3763  */
3764 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
3765 {
3766 	int i, r = 0;
3767 
3768 	for (i = 0; i < adev->num_ip_blocks; i++) {
3769 		if (!adev->ip_blocks[i].status.valid)
3770 			continue;
3771 		if (adev->ip_blocks[i].status.hang &&
3772 		    adev->ip_blocks[i].version->funcs->post_soft_reset)
3773 			r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
3774 		if (r)
3775 			return r;
3776 	}
3777 
3778 	return 0;
3779 }
3780 
3781 /**
3782  * amdgpu_device_recover_vram - Recover some VRAM contents
3783  *
3784  * @adev: amdgpu_device pointer
3785  *
3786  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
3787  * restore things like GPUVM page tables after a GPU reset where
3788  * the contents of VRAM might be lost.
3789  *
3790  * Returns:
3791  * 0 on success, negative error code on failure.
3792  */
3793 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
3794 {
3795 	struct dma_fence *fence = NULL, *next = NULL;
3796 	struct amdgpu_bo *shadow;
3797 	long r = 1, tmo;
3798 
3799 	if (amdgpu_sriov_runtime(adev))
3800 		tmo = msecs_to_jiffies(8000);
3801 	else
3802 		tmo = msecs_to_jiffies(100);
3803 
3804 	DRM_INFO("recover vram bo from shadow start\n");
3805 	mutex_lock(&adev->shadow_list_lock);
3806 	list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
3807 
3808 		/* No need to recover an evicted BO */
3809 		if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
3810 		    shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
3811 		    shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
3812 			continue;
3813 
3814 		r = amdgpu_bo_restore_shadow(shadow, &next);
3815 		if (r)
3816 			break;
3817 
3818 		if (fence) {
3819 			tmo = dma_fence_wait_timeout(fence, false, tmo);
3820 			dma_fence_put(fence);
3821 			fence = next;
3822 			if (tmo == 0) {
3823 				r = -ETIMEDOUT;
3824 				break;
3825 			} else if (tmo < 0) {
3826 				r = tmo;
3827 				break;
3828 			}
3829 		} else {
3830 			fence = next;
3831 		}
3832 	}
3833 	mutex_unlock(&adev->shadow_list_lock);
3834 
3835 	if (fence)
3836 		tmo = dma_fence_wait_timeout(fence, false, tmo);
3837 	dma_fence_put(fence);
3838 
3839 	if (r < 0 || tmo <= 0) {
3840 		DRM_ERROR("recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
3841 		return -EIO;
3842 	}
3843 
3844 	DRM_INFO("recover vram bo from shadow done\n");
3845 	return 0;
3846 }
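
/*
 * The restore loop above pipelines the shadow copies: the copy for
 * buffer N is issued before we wait on the fence of buffer N-1, so
 * the copy engine keeps working while the CPU waits.  Stripped-down
 * sketch of the pattern (list, bo and issue_copy() are placeholders):
 *
 *   struct dma_fence *prev = NULL, *cur = NULL;
 *
 *   list_for_each_entry(bo, &list, head) {
 *           issue_copy(bo, &cur);
 *           if (prev) {
 *                   tmo = dma_fence_wait_timeout(prev, false, tmo);
 *                   dma_fence_put(prev);
 *           }
 *           prev = cur;
 *   }
 *   if (prev)
 *           tmo = dma_fence_wait_timeout(prev, false, tmo);
 *   dma_fence_put(prev);
 */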
3847 
3848 
3849 /**
3850  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
3851  *
3852  * @adev: amdgpu device pointer
3853  * @from_hypervisor: request from hypervisor
3854  *
3855  * do VF FLR and reinitialize Asic
3856  * return 0 means succeeded otherwise failed
3857  */
3858 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
3859 				     bool from_hypervisor)
3860 {
3861 	int r;
3862 
3863 	if (from_hypervisor)
3864 		r = amdgpu_virt_request_full_gpu(adev, true);
3865 	else
3866 		r = amdgpu_virt_reset_gpu(adev);
3867 	if (r)
3868 		return r;
3869 
3870 	amdgpu_amdkfd_pre_reset(adev);
3871 
3872 	/* Resume IP prior to SMC */
3873 	r = amdgpu_device_ip_reinit_early_sriov(adev);
3874 	if (r)
3875 		goto error;
3876 
3877 	amdgpu_virt_init_data_exchange(adev);
3878 	/* we need to recover the GART prior to resuming SMC/CP/SDMA */
3879 	amdgpu_gtt_mgr_recover(&adev->mman.bdev.man[TTM_PL_TT]);
3880 
3881 	r = amdgpu_device_fw_loading(adev);
3882 	if (r)
3883 		return r;
3884 
3885 	/* now we are okay to resume SMC/CP/SDMA */
3886 	r = amdgpu_device_ip_reinit_late_sriov(adev);
3887 	if (r)
3888 		goto error;
3889 
3890 	amdgpu_irq_gpu_reset_resume_helper(adev);
3891 	r = amdgpu_ib_ring_tests(adev);
3892 	amdgpu_amdkfd_post_reset(adev);
3893 
3894 error:
3895 	amdgpu_virt_release_full_gpu(adev, true);
3896 	if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
3897 		amdgpu_inc_vram_lost(adev);
3898 		r = amdgpu_device_recover_vram(adev);
3899 	}
3900 
3901 	return r;
3902 }
3903 
3904 /**
3905  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
3906  *
3907  * @adev: amdgpu device pointer
3908  *
3909  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
3910  * a hung GPU.
3911  */
3912 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
3913 {
3914 	if (!amdgpu_device_ip_check_soft_reset(adev)) {
3915 		DRM_INFO("Timeout, but no hardware hang detected.\n");
3916 		return false;
3917 	}
3918 
3919 	if (amdgpu_gpu_recovery == 0)
3920 		goto disabled;
3921 
3922 	if (amdgpu_sriov_vf(adev))
3923 		return true;
3924 
3925 	if (amdgpu_gpu_recovery == -1) {
3926 		switch (adev->asic_type) {
3927 		case CHIP_BONAIRE:
3928 		case CHIP_HAWAII:
3929 		case CHIP_TOPAZ:
3930 		case CHIP_TONGA:
3931 		case CHIP_FIJI:
3932 		case CHIP_POLARIS10:
3933 		case CHIP_POLARIS11:
3934 		case CHIP_POLARIS12:
3935 		case CHIP_VEGAM:
3936 		case CHIP_VEGA20:
3937 		case CHIP_VEGA10:
3938 		case CHIP_VEGA12:
3939 		case CHIP_RAVEN:
3940 		case CHIP_ARCTURUS:
3941 		case CHIP_RENOIR:
3942 		case CHIP_NAVI10:
3943 		case CHIP_NAVI14:
3944 		case CHIP_NAVI12:
3945 		case CHIP_SIENNA_CICHLID:
3946 			break;
3947 		default:
3948 			goto disabled;
3949 		}
3950 	}
3951 
3952 	return true;
3953 
3954 disabled:
3955 	DRM_INFO("GPU recovery disabled.\n");
3956 	return false;
3957 }
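
/*
 * Note: this policy is driven by the amdgpu.gpu_recovery module
 * parameter: 0 disables recovery, 1 forces it on, and the default of
 * -1 falls back to the per-ASIC allow-list above.  For example:
 *
 *   modprobe amdgpu gpu_recovery=1    # always attempt GPU recovery
 *   modprobe amdgpu gpu_recovery=0    # never attempt GPU recovery
 */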
3958 
3959 
3960 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
3961 					struct amdgpu_job *job,
3962 					bool *need_full_reset_arg)
3963 {
3964 	int i, r = 0;
3965 	bool need_full_reset = *need_full_reset_arg;
3966 
3967 	amdgpu_debugfs_wait_dump(adev);
3968 
3969 	/* block all schedulers and reset given job's ring */
3970 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
3971 		struct amdgpu_ring *ring = adev->rings[i];
3972 
3973 		if (!ring || !ring->sched.thread)
3974 			continue;
3975 
3976 		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
3977 		amdgpu_fence_driver_force_completion(ring);
3978 	}
3979 
3980 	if (job)
3981 		drm_sched_increase_karma(&job->base);
3982 
3983 	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
3984 	if (!amdgpu_sriov_vf(adev)) {
3985 
3986 		if (!need_full_reset)
3987 			need_full_reset = amdgpu_device_ip_need_full_reset(adev);
3988 
3989 		if (!need_full_reset) {
3990 			amdgpu_device_ip_pre_soft_reset(adev);
3991 			r = amdgpu_device_ip_soft_reset(adev);
3992 			amdgpu_device_ip_post_soft_reset(adev);
3993 			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
3994 				DRM_INFO("soft reset failed, will fallback to full reset!\n");
3995 				need_full_reset = true;
3996 			}
3997 		}
3998 
3999 		if (need_full_reset)
4000 			r = amdgpu_device_ip_suspend(adev);
4001 
4002 		*need_full_reset_arg = need_full_reset;
4003 	}
4004 
4005 	return r;
4006 }
4007 
4008 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
4009 			       struct list_head *device_list_handle,
4010 			       bool *need_full_reset_arg)
4011 {
4012 	struct amdgpu_device *tmp_adev = NULL;
4013 	bool need_full_reset = *need_full_reset_arg, vram_lost = false;
4014 	int r = 0;
4015 
4016 	/*
4017 	 * ASIC reset has to be done on all XGMI hive nodes ASAP
4018 	 * to allow proper link negotiation in FW (within 1 sec)
4019 	 */
4020 	if (need_full_reset) {
4021 		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4022 			/* For XGMI run all resets in parallel to speed up the process */
4023 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4024 				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4025 					r = -EALREADY;
4026 			} else
4027 				r = amdgpu_asic_reset(tmp_adev);
4028 
4029 			if (r) {
4030 				DRM_ERROR("ASIC reset failed with error %d for drm dev %s",
4031 					 r, tmp_adev->ddev->unique);
4032 				break;
4033 			}
4034 		}
4035 
4036 		/* For XGMI wait for all resets to complete before proceeding */
4037 		if (!r) {
4038 			list_for_each_entry(tmp_adev, device_list_handle,
4039 					    gmc.xgmi.head) {
4040 				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4041 					flush_work(&tmp_adev->xgmi_reset_work);
4042 					r = tmp_adev->asic_reset_res;
4043 					if (r)
4044 						break;
4045 				}
4046 			}
4047 		}
4048 	}
4049 
4050 	if (!r && amdgpu_ras_intr_triggered()) {
4051 		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4052 			if (tmp_adev->mmhub.funcs &&
4053 			    tmp_adev->mmhub.funcs->reset_ras_error_count)
4054 				tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
4055 		}
4056 
4057 		amdgpu_ras_intr_cleared();
4058 	}
4059 
4060 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4061 		if (need_full_reset) {
4062 			/* post card */
4063 			if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context))
4064 				DRM_WARN("asic atom init failed!");
4065 
4066 			if (!r) {
4067 				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4068 				r = amdgpu_device_ip_resume_phase1(tmp_adev);
4069 				if (r)
4070 					goto out;
4071 
4072 				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4073 				if (vram_lost) {
4074 					DRM_INFO("VRAM is lost due to GPU reset!\n");
4075 					amdgpu_inc_vram_lost(tmp_adev);
4076 				}
4077 
4078 				r = amdgpu_gtt_mgr_recover(
4079 					&tmp_adev->mman.bdev.man[TTM_PL_TT]);
4080 				if (r)
4081 					goto out;
4082 
4083 				r = amdgpu_device_fw_loading(tmp_adev);
4084 				if (r)
4085 					return r;
4086 
4087 				r = amdgpu_device_ip_resume_phase2(tmp_adev);
4088 				if (r)
4089 					goto out;
4090 
4091 				if (vram_lost)
4092 					amdgpu_device_fill_reset_magic(tmp_adev);
4093 
4094 				/*
4095 				 * Add this ASIC back as tracked since the reset has
4096 				 * already completed successfully.
4097 				 */
4098 				amdgpu_register_gpu_instance(tmp_adev);
4099 
4100 				r = amdgpu_device_ip_late_init(tmp_adev);
4101 				if (r)
4102 					goto out;
4103 
4104 				amdgpu_fbdev_set_suspend(tmp_adev, 0);
4105 
4106 				/* must succeed. */
4107 				amdgpu_ras_resume(tmp_adev);
4108 
4109 				/* Update PSP FW topology after reset */
4110 				if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4111 					r = amdgpu_xgmi_update_topology(hive, tmp_adev);
4112 			}
4113 		}
4114 
4115 
4116 out:
4117 		if (!r) {
4118 			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4119 			r = amdgpu_ib_ring_tests(tmp_adev);
4120 			if (r) {
4121 				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4122 				r = amdgpu_device_ip_suspend(tmp_adev);
4123 				need_full_reset = true;
4124 				r = -EAGAIN;
4125 				goto end;
4126 			}
4127 		}
4128 
4129 		if (!r)
4130 			r = amdgpu_device_recover_vram(tmp_adev);
4131 		else
4132 			tmp_adev->asic_reset_res = r;
4133 	}
4134 
4135 end:
4136 	*need_full_reset_arg = need_full_reset;
4137 	return r;
4138 }
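
/*
 * The XGMI branch above fans the per-device resets out to
 * system_unbound_wq and then flushes them, so that every node in the
 * hive resets within the firmware's link negotiation window.  The
 * generic fan-out/fan-in shape, assuming the work items were set up
 * earlier with INIT_WORK():
 *
 *   list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head)
 *           queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work);
 *
 *   list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head)
 *           flush_work(&tmp_adev->xgmi_reset_work);
 */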
4139 
4140 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
4141 {
4142 	if (trylock) {
4143 		if (!mutex_trylock(&adev->lock_reset))
4144 			return false;
4145 	} else
4146 		mutex_lock(&adev->lock_reset);
4147 
4148 	atomic_inc(&adev->gpu_reset_counter);
4149 	adev->in_gpu_reset = true;
4150 	switch (amdgpu_asic_reset_method(adev)) {
4151 	case AMD_RESET_METHOD_MODE1:
4152 		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4153 		break;
4154 	case AMD_RESET_METHOD_MODE2:
4155 		adev->mp1_state = PP_MP1_STATE_RESET;
4156 		break;
4157 	default:
4158 		adev->mp1_state = PP_MP1_STATE_NONE;
4159 		break;
4160 	}
4161 
4162 	return true;
4163 }
4164 
4165 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4166 {
4167 	amdgpu_vf_error_trans_all(adev);
4168 	adev->mp1_state = PP_MP1_STATE_NONE;
4169 	adev->in_gpu_reset = false;
4170 	mutex_unlock(&adev->lock_reset);
4171 }
4172 
4173 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4174 {
4175 	struct pci_dev *p = NULL;
4176 
4177 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4178 			adev->pdev->bus->number, 1);
4179 	if (p) {
4180 		pm_runtime_enable(&(p->dev));
4181 		pm_runtime_resume(&(p->dev));
4182 	}
4183 }
4184 
4185 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4186 {
4187 	enum amd_reset_method reset_method;
4188 	struct pci_dev *p = NULL;
4189 	u64 expires;
4190 
4191 	/*
4192 	 * For now, only BACO and mode1 reset are confirmed
4193 	 * to suffer from the audio issue without being properly suspended.
4194 	 */
4195 	reset_method = amdgpu_asic_reset_method(adev);
4196 	if ((reset_method != AMD_RESET_METHOD_BACO) &&
4197 	     (reset_method != AMD_RESET_METHOD_MODE1))
4198 		return -EINVAL;
4199 
4200 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4201 			adev->pdev->bus->number, 1);
4202 	if (!p)
4203 		return -ENODEV;
4204 
4205 	expires = pm_runtime_autosuspend_expiration(&(p->dev));
4206 	if (!expires)
4207 		/*
4208 		 * If we cannot get the audio device autosuspend delay,
4209 		 * a fixed 4s interval is used. Since 3s is the audio
4210 		 * controller's default autosuspend delay setting, the 4s
4211 		 * used here is guaranteed to cover that.
4212 		 */
4213 		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4214 
4215 	while (!pm_runtime_status_suspended(&(p->dev))) {
4216 		if (!pm_runtime_suspend(&(p->dev)))
4217 			break;
4218 
4219 		if (expires < ktime_get_mono_fast_ns()) {
4220 			dev_warn(adev->dev, "failed to suspend display audio\n");
4221 			/* TODO: abort the succeeding gpu reset? */
4222 			return -ETIMEDOUT;
4223 		}
4224 	}
4225 
4226 	pm_runtime_disable(&(p->dev));
4227 
4228 	return 0;
4229 }
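
/*
 * amdgpu_device_suspend_display_audio() and
 * amdgpu_device_resume_display_audio() are intended to be used as a
 * pair around an ASIC reset, as amdgpu_device_gpu_recover() does
 * below:
 *
 *   audio_suspended = !amdgpu_device_suspend_display_audio(adev);
 *   ...reset the ASIC...
 *   if (audio_suspended)
 *           amdgpu_device_resume_display_audio(adev);
 */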
4230 
4231 /**
4232  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4233  *
4234  * @adev: amdgpu device pointer
4235  * @job: which job triggered the hang
4236  *
4237  * Attempt to reset the GPU if it has hung (all asics).
4238  * Attempt to do soft-reset or full-reset and reinitialize the ASIC.
4239  * Returns 0 for success or an error on failure.
4240  */
4241 
4242 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4243 			      struct amdgpu_job *job)
4244 {
4245 	struct list_head device_list, *device_list_handle =  NULL;
4246 	bool need_full_reset = false;
4247 	bool job_signaled = false;
4248 	struct amdgpu_hive_info *hive = NULL;
4249 	struct amdgpu_device *tmp_adev = NULL;
4250 	int i, r = 0;
4251 	bool need_emergency_restart = false;
4252 	bool audio_suspended = false;
4253 
4254 	/**
4255 	 * Special case: RAS triggered and full reset isn't supported
4256 	 */
4257 	need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4258 
4259 	/*
4260 	 * Flush RAM to disk so that after reboot
4261 	 * the user can read the log and see why the system rebooted.
4262 	 */
4263 	if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
4264 		DRM_WARN("Emergency reboot.");
4265 
4266 		ksys_sync_helper();
4267 		emergency_restart();
4268 	}
4269 
4270 	dev_info(adev->dev, "GPU %s begin!\n",
4271 		need_emergency_restart ? "jobs stop":"reset");
4272 
4273 	/*
4274 	 * Here we trylock to avoid a chain of resets executing, triggered
4275 	 * either by jobs on different adevs in an XGMI hive or by jobs on
4276 	 * different schedulers for the same device, while this TO handler is running.
4277 	 * We always reset all schedulers for a device and all devices in an XGMI
4278 	 * hive, so that should take care of them too.
4279 	 */
4280 	hive = amdgpu_get_xgmi_hive(adev, true);
4281 	if (hive && !mutex_trylock(&hive->reset_lock)) {
4282 		DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4283 			  job ? job->base.id : -1, hive->hive_id);
4284 		mutex_unlock(&hive->hive_lock);
4285 		return 0;
4286 	}
4287 
4288 	/*
4289 	 * Build list of devices to reset.
4290 	 * In case we are in XGMI hive mode, resort the device list
4291 	 * to put adev in the 1st position.
4292 	 */
4293 	INIT_LIST_HEAD(&device_list);
4294 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
4295 		if (!hive)
4296 			return -ENODEV;
4297 		if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
4298 			list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
4299 		device_list_handle = &hive->device_list;
4300 	} else {
4301 		list_add_tail(&adev->gmc.xgmi.head, &device_list);
4302 		device_list_handle = &device_list;
4303 	}
4304 
4305 	/* block all schedulers and reset given job's ring */
4306 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4307 		if (!amdgpu_device_lock_adev(tmp_adev, !hive)) {
4308 			DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
4309 				  job ? job->base.id : -1);
4310 			mutex_unlock(&hive->hive_lock);
4311 			return 0;
4312 		}
4313 
4314 		/*
4315 		 * Try to put the audio codec into suspend state
4316 		 * before the gpu reset starts.
4317 		 *
4318 		 * The power domain of the graphics device is shared
4319 		 * with the AZ (audio) power domain. Without this,
4320 		 * we may change the audio hardware from behind
4321 		 * the audio driver's back, which will trigger
4322 		 * audio codec errors.
4323 		 */
4324 		if (!amdgpu_device_suspend_display_audio(tmp_adev))
4325 			audio_suspended = true;
4326 
4327 		amdgpu_ras_set_error_query_ready(tmp_adev, false);
4328 
4329 		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4330 
4331 		if (!amdgpu_sriov_vf(tmp_adev))
4332 			amdgpu_amdkfd_pre_reset(tmp_adev);
4333 
4334 		/*
4335 		 * Mark these ASICs to be reset as untracked first,
4336 		 * and add them back after the reset completes.
4337 		 */
4338 		amdgpu_unregister_gpu_instance(tmp_adev);
4339 
4340 		amdgpu_fbdev_set_suspend(tmp_adev, 1);
4341 
4342 		/* disable ras on ALL IPs */
4343 		if (!need_emergency_restart &&
4344 		      amdgpu_device_ip_need_full_reset(tmp_adev))
4345 			amdgpu_ras_suspend(tmp_adev);
4346 
4347 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4348 			struct amdgpu_ring *ring = tmp_adev->rings[i];
4349 
4350 			if (!ring || !ring->sched.thread)
4351 				continue;
4352 
4353 			drm_sched_stop(&ring->sched, job ? &job->base : NULL);
4354 
4355 			if (need_emergency_restart)
4356 				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
4357 		}
4358 	}
4359 
4360 	if (need_emergency_restart)
4361 		goto skip_sched_resume;
4362 
4363 	/*
4364 	 * Must check guilty signal here since after this point all old
4365 	 * HW fences are force signaled.
4366 	 *
4367 	 * job->base holds a reference to parent fence
4368 	 */
4369 	if (job && job->base.s_fence->parent &&
4370 	    dma_fence_is_signaled(job->base.s_fence->parent)) {
4371 		job_signaled = true;
4372 		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4373 		goto skip_hw_reset;
4374 	}
4375 
4376 retry:	/* Rest of adevs pre asic reset from XGMI hive. */
4377 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4378 		r = amdgpu_device_pre_asic_reset(tmp_adev,
4379 						 NULL,
4380 						 &need_full_reset);
4381 		/* TODO: Should we stop? */
4382 		if (r) {
4383 			DRM_ERROR("GPU pre asic reset failed with err %d for drm dev %s ",
4384 				  r, tmp_adev->ddev->unique);
4385 			tmp_adev->asic_reset_res = r;
4386 		}
4387 	}
4388 
4389 	/* Actual ASIC resets if needed.*/
4390 	/* TODO Implement XGMI hive reset logic for SRIOV */
4391 	if (amdgpu_sriov_vf(adev)) {
4392 		r = amdgpu_device_reset_sriov(adev, job ? false : true);
4393 		if (r)
4394 			adev->asic_reset_res = r;
4395 	} else {
4396 		r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset);
4397 		if (r && r == -EAGAIN)
4398 			goto retry;
4399 	}
4400 
4401 skip_hw_reset:
4402 
4403 	/* Post ASIC reset for all devs. */
4404 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4405 
4406 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4407 			struct amdgpu_ring *ring = tmp_adev->rings[i];
4408 
4409 			if (!ring || !ring->sched.thread)
4410 				continue;
4411 
4412 			/* No point in resubmitting jobs if we didn't HW reset */
4413 			if (!tmp_adev->asic_reset_res && !job_signaled)
4414 				drm_sched_resubmit_jobs(&ring->sched);
4415 
4416 			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4417 		}
4418 
4419 		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4420 			drm_helper_resume_force_mode(tmp_adev->ddev);
4421 		}
4422 
4423 		tmp_adev->asic_reset_res = 0;
4424 
4425 		if (r) {
4426 			/* bad news, how to tell it to userspace ? */
4427 			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
4428 			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4429 		} else {
4430 			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
4431 		}
4432 	}
4433 
4434 skip_sched_resume:
4435 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4436 		/* unlock kfd: SRIOV would do it separately */
4437 		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
4438 			amdgpu_amdkfd_post_reset(tmp_adev);
4439 		if (audio_suspended)
4440 			amdgpu_device_resume_display_audio(tmp_adev);
4441 		amdgpu_device_unlock_adev(tmp_adev);
4442 	}
4443 
4444 	if (hive) {
4445 		mutex_unlock(&hive->reset_lock);
4446 		mutex_unlock(&hive->hive_lock);
4447 	}
4448 
4449 	if (r)
4450 		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
4451 	return r;
4452 }
4453 
4454 /**
4455  * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
4456  *
4457  * @adev: amdgpu_device pointer
4458  *
4459  * Fetches and stores in the driver the PCIE capabilities (gen speed
4460  * and lanes) of the slot the device is in. Handles APUs and
4461  * virtualized environments where PCIE config space may not be available.
4462  */
4463 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
4464 {
4465 	struct pci_dev *pdev;
4466 	enum pci_bus_speed speed_cap, platform_speed_cap;
4467 	enum pcie_link_width platform_link_width;
4468 
4469 	if (amdgpu_pcie_gen_cap)
4470 		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
4471 
4472 	if (amdgpu_pcie_lane_cap)
4473 		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
4474 
4475 	/* covers APUs as well */
4476 	if (pci_is_root_bus(adev->pdev->bus)) {
4477 		if (adev->pm.pcie_gen_mask == 0)
4478 			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
4479 		if (adev->pm.pcie_mlw_mask == 0)
4480 			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
4481 		return;
4482 	}
4483 
4484 	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
4485 		return;
4486 
4487 	pcie_bandwidth_available(adev->pdev, NULL,
4488 				 &platform_speed_cap, &platform_link_width);
4489 
4490 	if (adev->pm.pcie_gen_mask == 0) {
4491 		/* asic caps */
4492 		pdev = adev->pdev;
4493 		speed_cap = pcie_get_speed_cap(pdev);
4494 		if (speed_cap == PCI_SPEED_UNKNOWN) {
4495 			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4496 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4497 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4498 		} else {
4499 			if (speed_cap == PCIE_SPEED_16_0GT)
4500 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4501 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4502 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4503 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
4504 			else if (speed_cap == PCIE_SPEED_8_0GT)
4505 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4506 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4507 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4508 			else if (speed_cap == PCIE_SPEED_5_0GT)
4509 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4510 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
4511 			else
4512 				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
4513 		}
4514 		/* platform caps */
4515 		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
4516 			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4517 						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4518 		} else {
4519 			if (platform_speed_cap == PCIE_SPEED_16_0GT)
4520 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4521 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4522 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4523 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
4524 			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
4525 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4526 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4527 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
4528 			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
4529 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4530 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4531 			else
4532 				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
4533 
4534 		}
4535 	}
4536 	if (adev->pm.pcie_mlw_mask == 0) {
4537 		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
4538 			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
4539 		} else {
4540 			switch (platform_link_width) {
4541 			case PCIE_LNK_X32:
4542 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
4543 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4544 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4545 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4546 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4547 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4548 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4549 				break;
4550 			case PCIE_LNK_X16:
4551 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4552 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4553 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4554 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4555 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4556 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4557 				break;
4558 			case PCIE_LNK_X12:
4559 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4560 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4561 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4562 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4563 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4564 				break;
4565 			case PCIE_LNK_X8:
4566 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4567 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4568 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4569 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4570 				break;
4571 			case PCIE_LNK_X4:
4572 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4573 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4574 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4575 				break;
4576 			case PCIE_LNK_X2:
4577 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4578 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4579 				break;
4580 			case PCIE_LNK_X1:
4581 				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
4582 				break;
4583 			default:
4584 				break;
4585 			}
4586 		}
4587 	}
4588 }
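
/*
 * Note: non-zero amdgpu_pcie_gen_cap / amdgpu_pcie_lane_cap module
 * parameter values (pcie_gen_cap / pcie_lane_cap) are taken verbatim
 * as CAIL masks and skip the detection above.  A mask limiting the
 * reported capabilities might look like this (values illustrative
 * only):
 *
 *   adev->pm.pcie_gen_mask = CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
 *                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2;
 *   adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
 *                            CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
 *                            CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
 */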
4589 
4590 int amdgpu_device_baco_enter(struct drm_device *dev)
4591 {
4592 	struct amdgpu_device *adev = dev->dev_private;
4593 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4594 
4595 	if (!amdgpu_device_supports_baco(adev->ddev))
4596 		return -ENOTSUPP;
4597 
4598 	if (ras && ras->supported)
4599 		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
4600 
4601 	return amdgpu_dpm_baco_enter(adev);
4602 }
4603 
4604 int amdgpu_device_baco_exit(struct drm_device *dev)
4605 {
4606 	struct amdgpu_device *adev = dev->dev_private;
4607 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4608 	int ret = 0;
4609 
4610 	if (!amdgpu_device_supports_baco(adev->ddev))
4611 		return -ENOTSUPP;
4612 
4613 	ret = amdgpu_dpm_baco_exit(adev);
4614 	if (ret)
4615 		return ret;
4616 
4617 	if (ras && ras->supported)
4618 		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
4619 
4620 	return 0;
4621 }
4622
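
/*
 * Illustrative pairing of the BACO helpers above, e.g. from a runtime
 * PM path (caller shape is a sketch; the real callers live in
 * amdgpu_drv.c):
 *
 *   r = amdgpu_device_baco_enter(drm_dev);   // on runtime suspend
 *   if (r)
 *           return r;
 *   ...
 *   r = amdgpu_device_baco_exit(drm_dev);    // on runtime resume
 */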