xref: /openbmc/linux/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c (revision 3ecb3b794e2c1793443b72a968cb09d829c01a10)
1 /*
2  * Copyright 2008 Advanced Micro Devices, Inc.
3  * Copyright 2008 Red Hat Inc.
4  * Copyright 2009 Jerome Glisse.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22  * OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors: Dave Airlie
25  *          Alex Deucher
26  *          Jerome Glisse
27  */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33 
34 #include <drm/drm_atomic_helper.h>
35 #include <drm/drm_probe_helper.h>
36 #include <drm/amdgpu_drm.h>
37 #include <linux/vgaarb.h>
38 #include <linux/vga_switcheroo.h>
39 #include <linux/efi.h>
40 #include "amdgpu.h"
41 #include "amdgpu_trace.h"
42 #include "amdgpu_i2c.h"
43 #include "atom.h"
44 #include "amdgpu_atombios.h"
45 #include "amdgpu_atomfirmware.h"
46 #include "amd_pcie.h"
47 #ifdef CONFIG_DRM_AMDGPU_SI
48 #include "si.h"
49 #endif
50 #ifdef CONFIG_DRM_AMDGPU_CIK
51 #include "cik.h"
52 #endif
53 #include "vi.h"
54 #include "soc15.h"
55 #include "nv.h"
56 #include "bif/bif_4_1_d.h"
57 #include <linux/pci.h>
58 #include <linux/firmware.h>
59 #include "amdgpu_vf_error.h"
60 
61 #include "amdgpu_amdkfd.h"
62 #include "amdgpu_pm.h"
63 
64 #include "amdgpu_xgmi.h"
65 #include "amdgpu_ras.h"
66 #include "amdgpu_pmu.h"
67 #include "amdgpu_fru_eeprom.h"
68 
69 #include <linux/suspend.h>
70 #include <drm/task_barrier.h>
71 #include <linux/pm_runtime.h>
72 
73 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
74 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
75 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
76 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
77 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
78 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
79 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
80 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
81 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
82 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
83 MODULE_FIRMWARE("amdgpu/sienna_cichlid_gpu_info.bin");
84 MODULE_FIRMWARE("amdgpu/navy_flounder_gpu_info.bin");
85 
86 #define AMDGPU_RESUME_MS		2000
87 
88 const char *amdgpu_asic_name[] = {
89 	"TAHITI",
90 	"PITCAIRN",
91 	"VERDE",
92 	"OLAND",
93 	"HAINAN",
94 	"BONAIRE",
95 	"KAVERI",
96 	"KABINI",
97 	"HAWAII",
98 	"MULLINS",
99 	"TOPAZ",
100 	"TONGA",
101 	"FIJI",
102 	"CARRIZO",
103 	"STONEY",
104 	"POLARIS10",
105 	"POLARIS11",
106 	"POLARIS12",
107 	"VEGAM",
108 	"VEGA10",
109 	"VEGA12",
110 	"VEGA20",
111 	"RAVEN",
112 	"ARCTURUS",
113 	"RENOIR",
114 	"NAVI10",
115 	"NAVI14",
116 	"NAVI12",
117 	"SIENNA_CICHLID",
118 	"NAVY_FLOUNDER",
119 	"LAST",
120 };
121 
122 /**
123  * DOC: pcie_replay_count
124  *
125  * The amdgpu driver provides a sysfs API for reporting the total number
126  * of PCIe replays (NAKs).
127  * The file pcie_replay_count is used for this and returns the total
128  * number of replays as the sum of the NAKs generated and NAKs received.
129  */
130 
131 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
132 		struct device_attribute *attr, char *buf)
133 {
134 	struct drm_device *ddev = dev_get_drvdata(dev);
135 	struct amdgpu_device *adev = ddev->dev_private;
136 	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
137 
138 	return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
139 }
140 
141 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
142 		amdgpu_device_get_pcie_replay_count, NULL);
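
/*
 * Usage sketch (illustrative only; the card index and sysfs path depend on
 * the system):
 *
 *	$ cat /sys/class/drm/card0/device/pcie_replay_count
 *	0
 */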
143 
144 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
145 
146 /**
147  * DOC: product_name
148  *
149  * The amdgpu driver provides a sysfs API for reporting the product name
150  * for the device.
151  * The file product_name is used for this and returns the product name
152  * as returned from the FRU.
153  * NOTE: This is only available for certain server cards
154  */
155 
156 static ssize_t amdgpu_device_get_product_name(struct device *dev,
157 		struct device_attribute *attr, char *buf)
158 {
159 	struct drm_device *ddev = dev_get_drvdata(dev);
160 	struct amdgpu_device *adev = ddev->dev_private;
161 
162 	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
163 }
164 
165 static DEVICE_ATTR(product_name, S_IRUGO,
166 		amdgpu_device_get_product_name, NULL);
167 
168 /**
169  * DOC: product_number
170  *
171  * The amdgpu driver provides a sysfs API for reporting the part number
172  * for the device.
173  * The file product_number is used for this and returns the part number
174  * as returned from the FRU.
175  * NOTE: This is only available for certain server cards
176  */
177 
178 static ssize_t amdgpu_device_get_product_number(struct device *dev,
179 		struct device_attribute *attr, char *buf)
180 {
181 	struct drm_device *ddev = dev_get_drvdata(dev);
182 	struct amdgpu_device *adev = ddev->dev_private;
183 
184 	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
185 }
186 
187 static DEVICE_ATTR(product_number, S_IRUGO,
188 		amdgpu_device_get_product_number, NULL);
189 
190 /**
191  * DOC: serial_number
192  *
193  * The amdgpu driver provides a sysfs API for reporting the serial number
194  * for the device.
195  * The file serial_number is used for this and returns the serial number
196  * as returned from the FRU.
197  * NOTE: This is only available for certain server cards
198  */
199 
200 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
201 		struct device_attribute *attr, char *buf)
202 {
203 	struct drm_device *ddev = dev_get_drvdata(dev);
204 	struct amdgpu_device *adev = ddev->dev_private;
205 
206 	return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
207 }
208 
209 static DEVICE_ATTR(serial_number, S_IRUGO,
210 		amdgpu_device_get_serial_number, NULL);
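
/*
 * Usage sketch for the FRU-derived attributes above (illustrative only; the
 * sysfs path depends on the system and the files are only populated on
 * cards that expose FRU data):
 *
 *	$ cd /sys/class/drm/card0/device
 *	$ cat product_name product_number serial_number
 */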
211 
212 /**
213  * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
214  *
215  * @dev: drm_device pointer
216  *
217  * Returns true if the device is a dGPU with HG/PX power control,
218  * otherwise returns false.
219  */
220 bool amdgpu_device_supports_boco(struct drm_device *dev)
221 {
222 	struct amdgpu_device *adev = dev->dev_private;
223 
224 	if (adev->flags & AMD_IS_PX)
225 		return true;
226 	return false;
227 }
228 
229 /**
230  * amdgpu_device_supports_baco - Does the device support BACO
231  *
232  * @dev: drm_device pointer
233  *
234  * Returns true if the device supports BACO,
235  * otherwise returns false.
236  */
237 bool amdgpu_device_supports_baco(struct drm_device *dev)
238 {
239 	struct amdgpu_device *adev = dev->dev_private;
240 
241 	return amdgpu_asic_supports_baco(adev);
242 }
243 
244 /**
245  * VRAM access helper functions.
246  *
247  * amdgpu_device_vram_access - read/write a buffer in vram
248  *
249  * @adev: amdgpu_device pointer
250  * @pos: offset of the buffer in vram
251  * @buf: virtual address of the buffer in system memory
252  * @size: read/write size in bytes; the buffer at @buf must be at least @size bytes
253  * @write: true - write to vram, otherwise - read from vram
254  */
255 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
256 			       uint32_t *buf, size_t size, bool write)
257 {
258 	unsigned long flags;
259 	uint32_t hi = ~0;
260 	uint64_t last;
261 
262 
263 #ifdef CONFIG_64BIT
264 	last = min(pos + size, adev->gmc.visible_vram_size);
265 	if (last > pos) {
266 		void __iomem *addr = adev->mman.aper_base_kaddr + pos;
267 		size_t count = last - pos;
268 
269 		if (write) {
270 			memcpy_toio(addr, buf, count);
271 			mb();
272 			amdgpu_asic_flush_hdp(adev, NULL);
273 		} else {
274 			amdgpu_asic_invalidate_hdp(adev, NULL);
275 			mb();
276 			memcpy_fromio(buf, addr, count);
277 		}
278 
279 		if (count == size)
280 			return;
281 
282 		pos += count;
283 		buf += count / 4;
284 		size -= count;
285 	}
286 #endif
287 
288 	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
289 	for (last = pos + size; pos < last; pos += 4) {
290 		uint32_t tmp = pos >> 31;
291 
292 		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
293 		if (tmp != hi) {
294 			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
295 			hi = tmp;
296 		}
297 		if (write)
298 			WREG32_NO_KIQ(mmMM_DATA, *buf++);
299 		else
300 			*buf++ = RREG32_NO_KIQ(mmMM_DATA);
301 	}
302 	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
303 }
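
/*
 * Calling-convention sketch for amdgpu_device_vram_access() (illustrative
 * only, not a real caller): read the first four dwords of VRAM into a local
 * buffer.
 *
 *	uint32_t data[4];
 *
 *	amdgpu_device_vram_access(adev, 0, data, sizeof(data), false);
 *
 * @size is in bytes and the last argument selects write (true) or read
 * (false); the helper uses the CPU-visible aperture when it can and falls
 * back to the MM_INDEX/MM_DATA window otherwise.
 */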
304 
305 /*
306  * MMIO register access helper functions.
307  */
308 /**
309  * amdgpu_mm_rreg - read a memory mapped IO register
310  *
311  * @adev: amdgpu_device pointer
312  * @reg: dword aligned register offset
313  * @acc_flags: access flags which require special behavior
314  *
315  * Returns the 32 bit value from the offset specified.
316  */
317 uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
318 			uint32_t acc_flags)
319 {
320 	uint32_t ret;
321 
322 	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
323 		return amdgpu_kiq_rreg(adev, reg);
324 
325 	if ((reg * 4) < adev->rmmio_size)
326 		ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
327 	else {
328 		unsigned long flags;
329 
330 		spin_lock_irqsave(&adev->mmio_idx_lock, flags);
331 		writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
332 		ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
333 		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
334 	}
335 	trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
336 	return ret;
337 }
338 
339 /*
340  * MMIO register byte read helper function
341  * @offset: byte offset from MMIO start
342  *
343 */
344 
345 /**
346  * amdgpu_mm_rreg8 - read a memory mapped IO register
347  *
348  * @adev: amdgpu_device pointer
349  * @offset: byte aligned register offset
350  *
351  * Returns the 8 bit value from the offset specified.
352  */
353 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) {
354 	if (offset < adev->rmmio_size)
355 		return (readb(adev->rmmio + offset));
356 	BUG();
357 }
358 
359 /*
360  * MMIO register byte write helper function
361  * @offset: byte offset from MMIO start
362  * @value: the value to be written to the register
363  *
364 */
365 /**
366  * amdgpu_mm_wreg8 - write to a memory mapped IO register
367  *
368  * @adev: amdgpu_device pointer
369  * @offset: byte aligned register offset
370  * @value: 8 bit value to write
371  *
372  * Writes the value specified to the offset specified.
373  */
374 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) {
375 	if (offset < adev->rmmio_size)
376 		writeb(value, adev->rmmio + offset);
377 	else
378 		BUG();
379 }
380 
381 static inline void amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags)
382 {
383 	trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);
384 
385 	if ((reg * 4) < adev->rmmio_size)
386 		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
387 	else {
388 		unsigned long flags;
389 
390 		spin_lock_irqsave(&adev->mmio_idx_lock, flags);
391 		writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
392 		writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
393 		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
394 	}
395 }
396 
397 /**
398  * amdgpu_mm_wreg - write to a memory mapped IO register
399  *
400  * @adev: amdgpu_device pointer
401  * @reg: dword aligned register offset
402  * @v: 32 bit value to write to the register
403  * @acc_flags: access flags which require special behavior
404  *
405  * Writes the value specified to the offset specified.
406  */
407 void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
408 		    uint32_t acc_flags)
409 {
410 	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
411 		return amdgpu_kiq_wreg(adev, reg, v);
412 
413 	amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
414 }
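
/*
 * Callers normally use the RREG32()/WREG32() style wrappers from amdgpu.h
 * rather than these helpers directly.  A typical read-modify-write sequence
 * then looks like (register and mask are illustrative only):
 *
 *	tmp = RREG32(reg);
 *	tmp &= ~mask;
 *	tmp |= value;
 *	WREG32(reg, tmp);
 */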
415 
416 /*
417  * amdgpu_mm_wreg_mmio_rlc - write a register either with MMIO or via the RLC path if in range
418  *
419  * This function is invoked only for debugfs register access.
420  */
421 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
422 		    uint32_t acc_flags)
423 {
424 	if (amdgpu_sriov_fullaccess(adev) &&
425 		adev->gfx.rlc.funcs &&
426 		adev->gfx.rlc.funcs->is_rlcg_access_range) {
427 
428 		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
429 			return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
430 	}
431 
432 	amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
433 }
434 
435 /**
436  * amdgpu_io_rreg - read an IO register
437  *
438  * @adev: amdgpu_device pointer
439  * @reg: dword aligned register offset
440  *
441  * Returns the 32 bit value from the offset specified.
442  */
443 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
444 {
445 	if ((reg * 4) < adev->rio_mem_size)
446 		return ioread32(adev->rio_mem + (reg * 4));
447 	else {
448 		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
449 		return ioread32(adev->rio_mem + (mmMM_DATA * 4));
450 	}
451 }
452 
453 /**
454  * amdgpu_io_wreg - write to an IO register
455  *
456  * @adev: amdgpu_device pointer
457  * @reg: dword aligned register offset
458  * @v: 32 bit value to write to the register
459  *
460  * Writes the value specified to the offset specified.
461  */
462 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
463 {
464 	if ((reg * 4) < adev->rio_mem_size)
465 		iowrite32(v, adev->rio_mem + (reg * 4));
466 	else {
467 		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
468 		iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
469 	}
470 }
471 
472 /**
473  * amdgpu_mm_rdoorbell - read a doorbell dword
474  *
475  * @adev: amdgpu_device pointer
476  * @index: doorbell index
477  *
478  * Returns the value in the doorbell aperture at the
479  * requested doorbell index (CIK).
480  */
481 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
482 {
483 	if (index < adev->doorbell.num_doorbells) {
484 		return readl(adev->doorbell.ptr + index);
485 	} else {
486 		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
487 		return 0;
488 	}
489 }
490 
491 /**
492  * amdgpu_mm_wdoorbell - write a doorbell dword
493  *
494  * @adev: amdgpu_device pointer
495  * @index: doorbell index
496  * @v: value to write
497  *
498  * Writes @v to the doorbell aperture at the
499  * requested doorbell index (CIK).
500  */
501 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
502 {
503 	if (index < adev->doorbell.num_doorbells) {
504 		writel(v, adev->doorbell.ptr + index);
505 	} else {
506 		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
507 	}
508 }
509 
510 /**
511  * amdgpu_mm_rdoorbell64 - read a doorbell Qword
512  *
513  * @adev: amdgpu_device pointer
514  * @index: doorbell index
515  *
516  * Returns the value in the doorbell aperture at the
517  * requested doorbell index (VEGA10+).
518  */
519 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
520 {
521 	if (index < adev->doorbell.num_doorbells) {
522 		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
523 	} else {
524 		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
525 		return 0;
526 	}
527 }
528 
529 /**
530  * amdgpu_mm_wdoorbell64 - write a doorbell Qword
531  *
532  * @adev: amdgpu_device pointer
533  * @index: doorbell index
534  * @v: value to write
535  *
536  * Writes @v to the doorbell aperture at the
537  * requested doorbell index (VEGA10+).
538  */
539 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
540 {
541 	if (index < adev->doorbell.num_doorbells) {
542 		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
543 	} else {
544 		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
545 	}
546 }
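
/*
 * As with the MMIO helpers, ring code normally goes through the
 * WDOORBELL32()/WDOORBELL64() style wrappers rather than calling these
 * functions directly, e.g. (illustrative only):
 *
 *	WDOORBELL64(ring->doorbell_index, ring->wptr);
 */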
547 
548 /**
549  * amdgpu_invalid_rreg - dummy reg read function
550  *
551  * @adev: amdgpu device pointer
552  * @reg: offset of register
553  *
554  * Dummy register read function.  Used for register blocks
555  * that certain asics don't have (all asics).
556  * Returns the value in the register.
557  */
558 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
559 {
560 	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
561 	BUG();
562 	return 0;
563 }
564 
565 /**
566  * amdgpu_invalid_wreg - dummy reg write function
567  *
568  * @adev: amdgpu device pointer
569  * @reg: offset of register
570  * @v: value to write to the register
571  *
572  * Dummy register write function.  Used for register blocks
573  * that certain asics don't have (all asics).
574  */
575 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
576 {
577 	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
578 		  reg, v);
579 	BUG();
580 }
581 
582 /**
583  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
584  *
585  * @adev: amdgpu device pointer
586  * @reg: offset of register
587  *
588  * Dummy register read function.  Used for register blocks
589  * that certain asics don't have (all asics).
590  * Returns the value in the register.
591  */
592 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
593 {
594 	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
595 	BUG();
596 	return 0;
597 }
598 
599 /**
600  * amdgpu_invalid_wreg64 - dummy 64 bit reg write function
601  *
602  * @adev: amdgpu device pointer
603  * @reg: offset of register
604  * @v: value to write to the register
605  *
606  * Dummy register write function.  Used for register blocks
607  * that certain asics don't have (all asics).
608  */
609 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
610 {
611 	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
612 		  reg, v);
613 	BUG();
614 }
615 
616 /**
617  * amdgpu_block_invalid_rreg - dummy reg read function
618  *
619  * @adev: amdgpu device pointer
620  * @block: offset of instance
621  * @reg: offset of register
622  *
623  * Dummy register read function.  Used for register blocks
624  * that certain asics don't have (all asics).
625  * Returns the value in the register.
626  */
627 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
628 					  uint32_t block, uint32_t reg)
629 {
630 	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
631 		  reg, block);
632 	BUG();
633 	return 0;
634 }
635 
636 /**
637  * amdgpu_block_invalid_wreg - dummy reg write function
638  *
639  * @adev: amdgpu device pointer
640  * @block: offset of instance
641  * @reg: offset of register
642  * @v: value to write to the register
643  *
644  * Dummy register write function.  Used for register blocks
645  * that certain asics don't have (all asics).
646  */
647 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
648 				      uint32_t block,
649 				      uint32_t reg, uint32_t v)
650 {
651 	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
652 		  reg, block, v);
653 	BUG();
654 }
655 
656 /**
657  * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
658  *
659  * @adev: amdgpu device pointer
660  *
661  * Allocates a scratch page of VRAM for use by various things in the
662  * driver.
663  */
664 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
665 {
666 	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
667 				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
668 				       &adev->vram_scratch.robj,
669 				       &adev->vram_scratch.gpu_addr,
670 				       (void **)&adev->vram_scratch.ptr);
671 }
672 
673 /**
674  * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
675  *
676  * @adev: amdgpu device pointer
677  *
678  * Frees the VRAM scratch page.
679  */
680 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
681 {
682 	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
683 }
684 
685 /**
686  * amdgpu_device_program_register_sequence - program an array of registers.
687  *
688  * @adev: amdgpu_device pointer
689  * @registers: pointer to the register array
690  * @array_size: size of the register array
691  *
692  * Programs an array of registers with AND and OR masks.
693  * This is a helper for setting golden registers.
694  */
695 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
696 					     const u32 *registers,
697 					     const u32 array_size)
698 {
699 	u32 tmp, reg, and_mask, or_mask;
700 	int i;
701 
702 	if (array_size % 3)
703 		return;
704 
705 	for (i = 0; i < array_size; i +=3) {
706 		reg = registers[i + 0];
707 		and_mask = registers[i + 1];
708 		or_mask = registers[i + 2];
709 
710 		if (and_mask == 0xffffffff) {
711 			tmp = or_mask;
712 		} else {
713 			tmp = RREG32(reg);
714 			tmp &= ~and_mask;
715 			if (adev->family >= AMDGPU_FAMILY_AI)
716 				tmp |= (or_mask & and_mask);
717 			else
718 				tmp |= or_mask;
719 		}
720 		WREG32(reg, tmp);
721 	}
722 }
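
/*
 * Usage sketch (register names and values are made up for illustration):
 * a golden register list is a flat array of {offset, and_mask, or_mask}
 * triplets,
 *
 *	static const u32 golden_settings_example[] = {
 *		mmSOME_REG,  0xffffffff, 0x00000001,
 *		mmOTHER_REG, 0x0000ff00, 0x00001200,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, golden_settings_example,
 *						ARRAY_SIZE(golden_settings_example));
 *
 * An and_mask of 0xffffffff writes or_mask verbatim; otherwise the and_mask
 * bits are cleared and or_mask is OR'ed in (restricted to and_mask on
 * AMDGPU_FAMILY_AI and newer).
 */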
723 
724 /**
725  * amdgpu_device_pci_config_reset - reset the GPU
726  *
727  * @adev: amdgpu_device pointer
728  *
729  * Resets the GPU using the pci config reset sequence.
730  * Only applicable to asics prior to vega10.
731  */
732 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
733 {
734 	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
735 }
736 
737 /*
738  * GPU doorbell aperture helper functions.
739  */
740 /**
741  * amdgpu_device_doorbell_init - Init doorbell driver information.
742  *
743  * @adev: amdgpu_device pointer
744  *
745  * Init doorbell driver information (CIK)
746  * Returns 0 on success, error on failure.
747  */
748 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
749 {
750 
751 	/* No doorbell on SI hardware generation */
752 	if (adev->asic_type < CHIP_BONAIRE) {
753 		adev->doorbell.base = 0;
754 		adev->doorbell.size = 0;
755 		adev->doorbell.num_doorbells = 0;
756 		adev->doorbell.ptr = NULL;
757 		return 0;
758 	}
759 
760 	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
761 		return -EINVAL;
762 
763 	amdgpu_asic_init_doorbell_index(adev);
764 
765 	/* doorbell bar mapping */
766 	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
767 	adev->doorbell.size = pci_resource_len(adev->pdev, 2);
768 
769 	adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
770 					     adev->doorbell_index.max_assignment+1);
771 	if (adev->doorbell.num_doorbells == 0)
772 		return -EINVAL;
773 
774 	/* For Vega, reserve and map two pages on doorbell BAR since SDMA
775 	 * paging queue doorbell uses the second page. The
776 	 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
777 	 * doorbells are in the first page. So with paging queue enabled,
778 	 * the max num_doorbells should be increased by one page (0x400 in dwords)
779 	 */
780 	if (adev->asic_type >= CHIP_VEGA10)
781 		adev->doorbell.num_doorbells += 0x400;
782 
783 	adev->doorbell.ptr = ioremap(adev->doorbell.base,
784 				     adev->doorbell.num_doorbells *
785 				     sizeof(u32));
786 	if (adev->doorbell.ptr == NULL)
787 		return -ENOMEM;
788 
789 	return 0;
790 }
791 
792 /**
793  * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
794  *
795  * @adev: amdgpu_device pointer
796  *
797  * Tear down doorbell driver information (CIK)
798  */
799 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
800 {
801 	iounmap(adev->doorbell.ptr);
802 	adev->doorbell.ptr = NULL;
803 }
804 
805 
806 
807 /*
808  * amdgpu_device_wb_*()
809  * Writeback is the method by which the GPU updates special pages in memory
810  * with the status of certain GPU events (fences, ring pointers, etc.).
811  */
812 
813 /**
814  * amdgpu_device_wb_fini - Disable Writeback and free memory
815  *
816  * @adev: amdgpu_device pointer
817  *
818  * Disables Writeback and frees the Writeback memory (all asics).
819  * Used at driver shutdown.
820  */
821 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
822 {
823 	if (adev->wb.wb_obj) {
824 		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
825 				      &adev->wb.gpu_addr,
826 				      (void **)&adev->wb.wb);
827 		adev->wb.wb_obj = NULL;
828 	}
829 }
830 
831 /**
832  * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
833  *
834  * @adev: amdgpu_device pointer
835  *
836  * Initializes writeback and allocates writeback memory (all asics).
837  * Used at driver startup.
838  * Returns 0 on success or a negative error code on failure.
839  */
840 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
841 {
842 	int r;
843 
844 	if (adev->wb.wb_obj == NULL) {
845 		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
846 		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
847 					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
848 					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
849 					    (void **)&adev->wb.wb);
850 		if (r) {
851 			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
852 			return r;
853 		}
854 
855 		adev->wb.num_wb = AMDGPU_MAX_WB;
856 		memset(&adev->wb.used, 0, sizeof(adev->wb.used));
857 
858 		/* clear wb memory */
859 		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
860 	}
861 
862 	return 0;
863 }
864 
865 /**
866  * amdgpu_device_wb_get - Allocate a wb entry
867  *
868  * @adev: amdgpu_device pointer
869  * @wb: wb index
870  *
871  * Allocate a wb slot for use by the driver (all asics).
872  * Returns 0 on success or -EINVAL on failure.
873  */
874 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
875 {
876 	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
877 
878 	if (offset < adev->wb.num_wb) {
879 		__set_bit(offset, adev->wb.used);
880 		*wb = offset << 3; /* convert to dw offset */
881 		return 0;
882 	} else {
883 		return -EINVAL;
884 	}
885 }
886 
887 /**
888  * amdgpu_device_wb_free - Free a wb entry
889  *
890  * @adev: amdgpu_device pointer
891  * @wb: wb index
892  *
893  * Free a wb slot allocated for use by the driver (all asics)
894  */
895 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
896 {
897 	wb >>= 3;
898 	if (wb < adev->wb.num_wb)
899 		__clear_bit(wb, adev->wb.used);
900 }
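
/*
 * Typical writeback usage sketch (error handling and the actual use of the
 * slot omitted, illustrative only): the value returned in @wb is a dword
 * offset into the writeback page, so the matching CPU and GPU addresses are
 *
 *	u32 wb;
 *
 *	if (!amdgpu_device_wb_get(adev, &wb)) {
 *		volatile u32 *cpu_addr = &adev->wb.wb[wb];
 *		u64 gpu_addr = adev->wb.gpu_addr + (wb * 4);
 *		...
 *		amdgpu_device_wb_free(adev, wb);
 *	}
 */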
901 
902 /**
903  * amdgpu_device_resize_fb_bar - try to resize FB BAR
904  *
905  * @adev: amdgpu_device pointer
906  *
907  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
908  * to fail, but if any of the BARs is not accessible after the resize we abort
909  * driver loading by returning -ENODEV.
910  */
911 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
912 {
913 	u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
914 	u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
915 	struct pci_bus *root;
916 	struct resource *res;
917 	unsigned i;
918 	u16 cmd;
919 	int r;
920 
921 	/* Bypass for VF */
922 	if (amdgpu_sriov_vf(adev))
923 		return 0;
924 
925 	/* skip if the bios has already enabled large BAR */
926 	if (adev->gmc.real_vram_size &&
927 	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
928 		return 0;
929 
930 	/* Check if the root BUS has 64bit memory resources */
931 	root = adev->pdev->bus;
932 	while (root->parent)
933 		root = root->parent;
934 
935 	pci_bus_for_each_resource(root, res, i) {
936 		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
937 		    res->start > 0x100000000ull)
938 			break;
939 	}
940 
941 	/* Trying to resize is pointless without a root hub window above 4GB */
942 	if (!res)
943 		return 0;
944 
945 	/* Disable memory decoding while we change the BAR addresses and size */
946 	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
947 	pci_write_config_word(adev->pdev, PCI_COMMAND,
948 			      cmd & ~PCI_COMMAND_MEMORY);
949 
950 	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
951 	amdgpu_device_doorbell_fini(adev);
952 	if (adev->asic_type >= CHIP_BONAIRE)
953 		pci_release_resource(adev->pdev, 2);
954 
955 	pci_release_resource(adev->pdev, 0);
956 
957 	r = pci_resize_resource(adev->pdev, 0, rbar_size);
958 	if (r == -ENOSPC)
959 		DRM_INFO("Not enough PCI address space for a large BAR.");
960 	else if (r && r != -ENOTSUPP)
961 		DRM_ERROR("Problem resizing BAR0 (%d).", r);
962 
963 	pci_assign_unassigned_bus_resources(adev->pdev->bus);
964 
965 	/* When the doorbell or fb BAR isn't available we have no chance of
966 	 * using the device.
967 	 */
968 	r = amdgpu_device_doorbell_init(adev);
969 	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
970 		return -ENODEV;
971 
972 	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
973 
974 	return 0;
975 }
976 
977 /*
978  * GPU helper functions.
979  */
980 /**
981  * amdgpu_device_need_post - check if the hw needs to be posted or not
982  *
983  * @adev: amdgpu_device pointer
984  *
985  * Check if the asic has been initialized (all asics) at driver startup
986  * or if a post is needed after a hw reset.
987  * Returns true if post is needed, false if not.
988  */
989 bool amdgpu_device_need_post(struct amdgpu_device *adev)
990 {
991 	uint32_t reg;
992 
993 	if (amdgpu_sriov_vf(adev))
994 		return false;
995 
996 	if (amdgpu_passthrough(adev)) {
997 		/* for FIJI: in the whole-GPU pass-through virtualization case, after a VM
998 		 * reboot some old SMC firmware still needs the driver to do a vPost, otherwise
999 		 * the GPU hangs; SMC firmware versions above 22.15 don't have this flaw, so
1000 		 * force vPost for SMC versions below 22.15
1001 		 */
1002 		if (adev->asic_type == CHIP_FIJI) {
1003 			int err;
1004 			uint32_t fw_ver;
1005 			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1006 			/* force vPost if an error occurred */
1007 			if (err)
1008 				return true;
1009 
1010 			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1011 			if (fw_ver < 0x00160e00)
1012 				return true;
1013 		}
1014 	}
1015 
1016 	if (adev->has_hw_reset) {
1017 		adev->has_hw_reset = false;
1018 		return true;
1019 	}
1020 
1021 	/* bios scratch used on CIK+ */
1022 	if (adev->asic_type >= CHIP_BONAIRE)
1023 		return amdgpu_atombios_scratch_need_asic_init(adev);
1024 
1025 	/* check MEM_SIZE for older asics */
1026 	reg = amdgpu_asic_get_config_memsize(adev);
1027 
1028 	if ((reg != 0) && (reg != 0xffffffff))
1029 		return false;
1030 
1031 	return true;
1032 }
1033 
1034 /* if we get transitioned to only one device, take VGA back */
1035 /**
1036  * amdgpu_device_vga_set_decode - enable/disable vga decode
1037  *
1038  * @cookie: amdgpu_device pointer
1039  * @state: enable/disable vga decode
1040  *
1041  * Enable/disable vga decode (all asics).
1042  * Returns VGA resource flags.
1043  */
1044 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
1045 {
1046 	struct amdgpu_device *adev = cookie;
1047 	amdgpu_asic_set_vga_state(adev, state);
1048 	if (state)
1049 		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1050 		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1051 	else
1052 		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1053 }
1054 
1055 /**
1056  * amdgpu_device_check_block_size - validate the vm block size
1057  *
1058  * @adev: amdgpu_device pointer
1059  *
1060  * Validates the vm block size specified via module parameter.
1061  * The vm block size defines number of bits in page table versus page directory,
1062  * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1063  * page table and the remaining bits are in the page directory.
1064  */
1065 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1066 {
1067 	/* defines number of bits in page table versus page directory,
1068 	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1069 	 * page table and the remaining bits are in the page directory */
1070 	if (amdgpu_vm_block_size == -1)
1071 		return;
1072 
1073 	if (amdgpu_vm_block_size < 9) {
1074 		dev_warn(adev->dev, "VM page table size (%d) too small\n",
1075 			 amdgpu_vm_block_size);
1076 		amdgpu_vm_block_size = -1;
1077 	}
1078 }
1079 
1080 /**
1081  * amdgpu_device_check_vm_size - validate the vm size
1082  *
1083  * @adev: amdgpu_device pointer
1084  *
1085  * Validates the vm size in GB specified via module parameter.
1086  * The VM size is the size of the GPU virtual memory space in GB.
1087  */
1088 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1089 {
1090 	/* no need to check the default value */
1091 	if (amdgpu_vm_size == -1)
1092 		return;
1093 
1094 	if (amdgpu_vm_size < 1) {
1095 		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1096 			 amdgpu_vm_size);
1097 		amdgpu_vm_size = -1;
1098 	}
1099 }
1100 
1101 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1102 {
1103 	struct sysinfo si;
1104 	bool is_os_64 = (sizeof(void *) == 8);
1105 	uint64_t total_memory;
1106 	uint64_t dram_size_seven_GB = 0x1B8000000;
1107 	uint64_t dram_size_three_GB = 0xB8000000;
1108 
1109 	if (amdgpu_smu_memory_pool_size == 0)
1110 		return;
1111 
1112 	if (!is_os_64) {
1113 		DRM_WARN("Not 64-bit OS, feature not supported\n");
1114 		goto def_value;
1115 	}
1116 	si_meminfo(&si);
1117 	total_memory = (uint64_t)si.totalram * si.mem_unit;
1118 
1119 	if ((amdgpu_smu_memory_pool_size == 1) ||
1120 		(amdgpu_smu_memory_pool_size == 2)) {
1121 		if (total_memory < dram_size_three_GB)
1122 			goto def_value1;
1123 	} else if ((amdgpu_smu_memory_pool_size == 4) ||
1124 		(amdgpu_smu_memory_pool_size == 8)) {
1125 		if (total_memory < dram_size_seven_GB)
1126 			goto def_value1;
1127 	} else {
1128 		DRM_WARN("Smu memory pool size not supported\n");
1129 		goto def_value;
1130 	}
1131 	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1132 
1133 	return;
1134 
1135 def_value1:
1136 	DRM_WARN("Not enough system memory\n");
1137 def_value:
1138 	adev->pm.smu_prv_buffer_size = 0;
1139 }
1140 
1141 /**
1142  * amdgpu_device_check_arguments - validate module params
1143  *
1144  * @adev: amdgpu_device pointer
1145  *
1146  * Validates certain module parameters and updates
1147  * the associated values used by the driver (all asics).
1148  */
1149 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1150 {
1151 	if (amdgpu_sched_jobs < 4) {
1152 		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1153 			 amdgpu_sched_jobs);
1154 		amdgpu_sched_jobs = 4;
1155 	} else if (!is_power_of_2(amdgpu_sched_jobs)){
1156 		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1157 			 amdgpu_sched_jobs);
1158 		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1159 	}
1160 
1161 	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1162 		/* gart size must be greater or equal to 32M */
1163 		dev_warn(adev->dev, "gart size (%d) too small\n",
1164 			 amdgpu_gart_size);
1165 		amdgpu_gart_size = -1;
1166 	}
1167 
1168 	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1169 		/* gtt size must be greater or equal to 32M */
1170 		dev_warn(adev->dev, "gtt size (%d) too small\n",
1171 				 amdgpu_gtt_size);
1172 		amdgpu_gtt_size = -1;
1173 	}
1174 
1175 	/* valid range is between 4 and 9 inclusive */
1176 	if (amdgpu_vm_fragment_size != -1 &&
1177 	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1178 		dev_warn(adev->dev, "valid range is between 4 and 9\n");
1179 		amdgpu_vm_fragment_size = -1;
1180 	}
1181 
1182 	if (amdgpu_sched_hw_submission < 2) {
1183 		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1184 			 amdgpu_sched_hw_submission);
1185 		amdgpu_sched_hw_submission = 2;
1186 	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1187 		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1188 			 amdgpu_sched_hw_submission);
1189 		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1190 	}
1191 
1192 	amdgpu_device_check_smu_prv_buffer_size(adev);
1193 
1194 	amdgpu_device_check_vm_size(adev);
1195 
1196 	amdgpu_device_check_block_size(adev);
1197 
1198 	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1199 
1200 	amdgpu_gmc_tmz_set(adev);
1201 
1202 	return 0;
1203 }
1204 
1205 /**
1206  * amdgpu_switcheroo_set_state - set switcheroo state
1207  *
1208  * @pdev: pci dev pointer
1209  * @state: vga_switcheroo state
1210  *
1211  * Callback for the switcheroo driver.  Suspends or resumes the
1212  * the asics before or after it is powered up using ACPI methods.
1213  * asics before or after it is powered up using ACPI methods.
1214 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switcheroo_state state)
1215 {
1216 	struct drm_device *dev = pci_get_drvdata(pdev);
1217 	int r;
1218 
1219 	if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
1220 		return;
1221 
1222 	if (state == VGA_SWITCHEROO_ON) {
1223 		pr_info("switched on\n");
1224 		/* don't suspend or resume card normally */
1225 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1226 
1227 		pci_set_power_state(dev->pdev, PCI_D0);
1228 		pci_restore_state(dev->pdev);
1229 		r = pci_enable_device(dev->pdev);
1230 		if (r)
1231 			DRM_WARN("pci_enable_device failed (%d)\n", r);
1232 		amdgpu_device_resume(dev, true);
1233 
1234 		dev->switch_power_state = DRM_SWITCH_POWER_ON;
1235 		drm_kms_helper_poll_enable(dev);
1236 	} else {
1237 		pr_info("switched off\n");
1238 		drm_kms_helper_poll_disable(dev);
1239 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1240 		amdgpu_device_suspend(dev, true);
1241 		pci_save_state(dev->pdev);
1242 		/* Shut down the device */
1243 		pci_disable_device(dev->pdev);
1244 		pci_set_power_state(dev->pdev, PCI_D3cold);
1245 		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1246 	}
1247 }
1248 
1249 /**
1250  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1251  *
1252  * @pdev: pci dev pointer
1253  *
1254  * Callback for the switcheroo driver.  Check if the switcheroo
1255  * state can be changed.
1256  * Returns true if the state can be changed, false if not.
1257  */
1258 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1259 {
1260 	struct drm_device *dev = pci_get_drvdata(pdev);
1261 
1262 	/*
1263 	* FIXME: open_count is protected by drm_global_mutex but that would lead to
1264 	* locking inversion with the driver load path. And the access here is
1265 	* completely racy anyway. So don't bother with locking for now.
1266 	*/
1267 	return atomic_read(&dev->open_count) == 0;
1268 }
1269 
1270 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1271 	.set_gpu_state = amdgpu_switcheroo_set_state,
1272 	.reprobe = NULL,
1273 	.can_switch = amdgpu_switcheroo_can_switch,
1274 };
1275 
1276 /**
1277  * amdgpu_device_ip_set_clockgating_state - set the CG state
1278  *
1279  * @dev: amdgpu_device pointer
1280  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1281  * @state: clockgating state (gate or ungate)
1282  *
1283  * Sets the requested clockgating state for all instances of
1284  * the hardware IP specified.
1285  * Returns the error code from the last instance.
1286  */
1287 int amdgpu_device_ip_set_clockgating_state(void *dev,
1288 					   enum amd_ip_block_type block_type,
1289 					   enum amd_clockgating_state state)
1290 {
1291 	struct amdgpu_device *adev = dev;
1292 	int i, r = 0;
1293 
1294 	for (i = 0; i < adev->num_ip_blocks; i++) {
1295 		if (!adev->ip_blocks[i].status.valid)
1296 			continue;
1297 		if (adev->ip_blocks[i].version->type != block_type)
1298 			continue;
1299 		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1300 			continue;
1301 		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1302 			(void *)adev, state);
1303 		if (r)
1304 			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1305 				  adev->ip_blocks[i].version->funcs->name, r);
1306 	}
1307 	return r;
1308 }
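
/*
 * Example call (illustrative only): gate clockgating for all GFX instances.
 *
 *	amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *					       AMD_CG_STATE_GATE);
 */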
1309 
1310 /**
1311  * amdgpu_device_ip_set_powergating_state - set the PG state
1312  *
1313  * @dev: amdgpu_device pointer
1314  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1315  * @state: powergating state (gate or ungate)
1316  *
1317  * Sets the requested powergating state for all instances of
1318  * the hardware IP specified.
1319  * Returns the error code from the last instance.
1320  */
1321 int amdgpu_device_ip_set_powergating_state(void *dev,
1322 					   enum amd_ip_block_type block_type,
1323 					   enum amd_powergating_state state)
1324 {
1325 	struct amdgpu_device *adev = dev;
1326 	int i, r = 0;
1327 
1328 	for (i = 0; i < adev->num_ip_blocks; i++) {
1329 		if (!adev->ip_blocks[i].status.valid)
1330 			continue;
1331 		if (adev->ip_blocks[i].version->type != block_type)
1332 			continue;
1333 		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1334 			continue;
1335 		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1336 			(void *)adev, state);
1337 		if (r)
1338 			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1339 				  adev->ip_blocks[i].version->funcs->name, r);
1340 	}
1341 	return r;
1342 }
1343 
1344 /**
1345  * amdgpu_device_ip_get_clockgating_state - get the CG state
1346  *
1347  * @adev: amdgpu_device pointer
1348  * @flags: clockgating feature flags
1349  *
1350  * Walks the list of IPs on the device and updates the clockgating
1351  * flags for each IP.
1352  * Updates @flags with the feature flags for each hardware IP where
1353  * clockgating is enabled.
1354  */
1355 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1356 					    u32 *flags)
1357 {
1358 	int i;
1359 
1360 	for (i = 0; i < adev->num_ip_blocks; i++) {
1361 		if (!adev->ip_blocks[i].status.valid)
1362 			continue;
1363 		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1364 			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1365 	}
1366 }
1367 
1368 /**
1369  * amdgpu_device_ip_wait_for_idle - wait for idle
1370  *
1371  * @adev: amdgpu_device pointer
1372  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1373  *
1374  * Waits for the requested hardware IP to be idle.
1375  * Returns 0 for success or a negative error code on failure.
1376  */
1377 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1378 				   enum amd_ip_block_type block_type)
1379 {
1380 	int i, r;
1381 
1382 	for (i = 0; i < adev->num_ip_blocks; i++) {
1383 		if (!adev->ip_blocks[i].status.valid)
1384 			continue;
1385 		if (adev->ip_blocks[i].version->type == block_type) {
1386 			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1387 			if (r)
1388 				return r;
1389 			break;
1390 		}
1391 	}
1392 	return 0;
1393 
1394 }
1395 
1396 /**
1397  * amdgpu_device_ip_is_idle - is the hardware IP idle
1398  *
1399  * @adev: amdgpu_device pointer
1400  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1401  *
1402  * Check if the hardware IP is idle or not.
1403  * Returns true if the IP is idle, false if not.
1404  */
1405 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1406 			      enum amd_ip_block_type block_type)
1407 {
1408 	int i;
1409 
1410 	for (i = 0; i < adev->num_ip_blocks; i++) {
1411 		if (!adev->ip_blocks[i].status.valid)
1412 			continue;
1413 		if (adev->ip_blocks[i].version->type == block_type)
1414 			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1415 	}
1416 	return true;
1417 
1418 }
1419 
1420 /**
1421  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1422  *
1423  * @adev: amdgpu_device pointer
1424  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1425  *
1426  * Returns a pointer to the hardware IP block structure
1427  * if it exists for the asic, otherwise NULL.
1428  */
1429 struct amdgpu_ip_block *
1430 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1431 			      enum amd_ip_block_type type)
1432 {
1433 	int i;
1434 
1435 	for (i = 0; i < adev->num_ip_blocks; i++)
1436 		if (adev->ip_blocks[i].version->type == type)
1437 			return &adev->ip_blocks[i];
1438 
1439 	return NULL;
1440 }
1441 
1442 /**
1443  * amdgpu_device_ip_block_version_cmp
1444  *
1445  * @adev: amdgpu_device pointer
1446  * @type: enum amd_ip_block_type
1447  * @major: major version
1448  * @minor: minor version
1449  *
1450  * Returns 0 if the IP block's version is equal to or greater than @major.@minor,
1451  * or 1 if it is smaller or the ip_block doesn't exist.
1452  */
1453 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1454 				       enum amd_ip_block_type type,
1455 				       u32 major, u32 minor)
1456 {
1457 	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1458 
1459 	if (ip_block && ((ip_block->version->major > major) ||
1460 			((ip_block->version->major == major) &&
1461 			(ip_block->version->minor >= minor))))
1462 		return 0;
1463 
1464 	return 1;
1465 }
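
/*
 * Example (illustrative only): check that the SMC IP block is at least
 * version 7.0 before relying on behaviour introduced in that version.
 *
 *	if (amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_SMC,
 *					       7, 0) == 0) {
 *		... feature is available ...
 *	}
 */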
1466 
1467 /**
1468  * amdgpu_device_ip_block_add
1469  *
1470  * @adev: amdgpu_device pointer
1471  * @ip_block_version: pointer to the IP to add
1472  *
1473  * Adds the IP block driver information to the collection of IPs
1474  * on the asic.
1475  */
1476 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1477 			       const struct amdgpu_ip_block_version *ip_block_version)
1478 {
1479 	if (!ip_block_version)
1480 		return -EINVAL;
1481 
1482 	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1483 		  ip_block_version->funcs->name);
1484 
1485 	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1486 
1487 	return 0;
1488 }
1489 
1490 /**
1491  * amdgpu_device_enable_virtual_display - enable virtual display feature
1492  *
1493  * @adev: amdgpu_device pointer
1494  *
1495  * Enables the virtual display feature if the user has enabled it via
1496  * the module parameter virtual_display.  This feature provides a virtual
1497  * display hardware on headless boards or in virtualized environments.
1498  * This function parses and validates the configuration string specified by
1499  * the user and configures the virtual display configuration (number of
1500  * virtual connectors, crtcs, etc.) specified; see the usage sketch after the function.
1501  */
1502 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1503 {
1504 	adev->enable_virtual_display = false;
1505 
1506 	if (amdgpu_virtual_display) {
1507 		struct drm_device *ddev = adev->ddev;
1508 		const char *pci_address_name = pci_name(ddev->pdev);
1509 		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1510 
1511 		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1512 		pciaddstr_tmp = pciaddstr;
1513 		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1514 			pciaddname = strsep(&pciaddname_tmp, ",");
1515 			if (!strcmp("all", pciaddname)
1516 			    || !strcmp(pci_address_name, pciaddname)) {
1517 				long num_crtc;
1518 				int res = -1;
1519 
1520 				adev->enable_virtual_display = true;
1521 
1522 				if (pciaddname_tmp)
1523 					res = kstrtol(pciaddname_tmp, 10,
1524 						      &num_crtc);
1525 
1526 				if (!res) {
1527 					if (num_crtc < 1)
1528 						num_crtc = 1;
1529 					if (num_crtc > 6)
1530 						num_crtc = 6;
1531 					adev->mode_info.num_crtc = num_crtc;
1532 				} else {
1533 					adev->mode_info.num_crtc = 1;
1534 				}
1535 				break;
1536 			}
1537 		}
1538 
1539 		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1540 			 amdgpu_virtual_display, pci_address_name,
1541 			 adev->enable_virtual_display, adev->mode_info.num_crtc);
1542 
1543 		kfree(pciaddstr);
1544 	}
1545 }
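
/*
 * Usage sketch for the virtual_display module parameter parsed above
 * (illustrative PCI address): entries are "pci-address,crtc-count" pairs
 * separated by ';', or "all" to match every device, e.g.
 *
 *	modprobe amdgpu virtual_display=0000:26:00.0,2
 *	modprobe amdgpu virtual_display=all,1
 */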
1546 
1547 /**
1548  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1549  *
1550  * @adev: amdgpu_device pointer
1551  *
1552  * Parses the asic configuration parameters specified in the gpu info
1553  * firmware and makes them available to the driver for use in configuring
1554  * the asic.
1555  * Returns 0 on success, -EINVAL on failure.
1556  */
1557 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1558 {
1559 	const char *chip_name;
1560 	char fw_name[40];
1561 	int err;
1562 	const struct gpu_info_firmware_header_v1_0 *hdr;
1563 
1564 	adev->firmware.gpu_info_fw = NULL;
1565 
1566 	if (adev->discovery_bin) {
1567 		amdgpu_discovery_get_gfx_info(adev);
1568 
1569 		/*
1570 		 * FIXME: The bounding box is still needed by Navi12, so
1571 		 * temporarily read it from gpu_info firmware. Should be dropped
1572 		 * when DAL no longer needs it.
1573 		 */
1574 		if (adev->asic_type != CHIP_NAVI12)
1575 			return 0;
1576 	}
1577 
1578 	switch (adev->asic_type) {
1579 #ifdef CONFIG_DRM_AMDGPU_SI
1580 	case CHIP_VERDE:
1581 	case CHIP_TAHITI:
1582 	case CHIP_PITCAIRN:
1583 	case CHIP_OLAND:
1584 	case CHIP_HAINAN:
1585 #endif
1586 #ifdef CONFIG_DRM_AMDGPU_CIK
1587 	case CHIP_BONAIRE:
1588 	case CHIP_HAWAII:
1589 	case CHIP_KAVERI:
1590 	case CHIP_KABINI:
1591 	case CHIP_MULLINS:
1592 #endif
1593 	case CHIP_TOPAZ:
1594 	case CHIP_TONGA:
1595 	case CHIP_FIJI:
1596 	case CHIP_POLARIS10:
1597 	case CHIP_POLARIS11:
1598 	case CHIP_POLARIS12:
1599 	case CHIP_VEGAM:
1600 	case CHIP_CARRIZO:
1601 	case CHIP_STONEY:
1602 	case CHIP_VEGA20:
1603 	default:
1604 		return 0;
1605 	case CHIP_VEGA10:
1606 		chip_name = "vega10";
1607 		break;
1608 	case CHIP_VEGA12:
1609 		chip_name = "vega12";
1610 		break;
1611 	case CHIP_RAVEN:
1612 		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1613 			chip_name = "raven2";
1614 		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1615 			chip_name = "picasso";
1616 		else
1617 			chip_name = "raven";
1618 		break;
1619 	case CHIP_ARCTURUS:
1620 		chip_name = "arcturus";
1621 		break;
1622 	case CHIP_RENOIR:
1623 		chip_name = "renoir";
1624 		break;
1625 	case CHIP_NAVI10:
1626 		chip_name = "navi10";
1627 		break;
1628 	case CHIP_NAVI14:
1629 		chip_name = "navi14";
1630 		break;
1631 	case CHIP_NAVI12:
1632 		chip_name = "navi12";
1633 		break;
1634 	case CHIP_SIENNA_CICHLID:
1635 		chip_name = "sienna_cichlid";
1636 		break;
1637 	case CHIP_NAVY_FLOUNDER:
1638 		chip_name = "navy_flounder";
1639 		break;
1640 	}
1641 
1642 	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1643 	err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1644 	if (err) {
1645 		dev_err(adev->dev,
1646 			"Failed to load gpu_info firmware \"%s\"\n",
1647 			fw_name);
1648 		goto out;
1649 	}
1650 	err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1651 	if (err) {
1652 		dev_err(adev->dev,
1653 			"Failed to validate gpu_info firmware \"%s\"\n",
1654 			fw_name);
1655 		goto out;
1656 	}
1657 
1658 	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1659 	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1660 
1661 	switch (hdr->version_major) {
1662 	case 1:
1663 	{
1664 		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1665 			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1666 								le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1667 
1668 		/*
1669 		 * Should be dropped when DAL no longer needs it.
1670 		 */
1671 		if (adev->asic_type == CHIP_NAVI12)
1672 			goto parse_soc_bounding_box;
1673 
1674 		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1675 		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1676 		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1677 		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1678 		adev->gfx.config.max_texture_channel_caches =
1679 			le32_to_cpu(gpu_info_fw->gc_num_tccs);
1680 		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1681 		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1682 		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1683 		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1684 		adev->gfx.config.double_offchip_lds_buf =
1685 			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1686 		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1687 		adev->gfx.cu_info.max_waves_per_simd =
1688 			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1689 		adev->gfx.cu_info.max_scratch_slots_per_cu =
1690 			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1691 		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1692 		if (hdr->version_minor >= 1) {
1693 			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1694 				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1695 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1696 			adev->gfx.config.num_sc_per_sh =
1697 				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1698 			adev->gfx.config.num_packer_per_sc =
1699 				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1700 		}
1701 
1702 parse_soc_bounding_box:
1703 		/*
1704 		 * soc bounding box info is not integrated in the discovery table,
1705 		 * so we always need to parse it from the gpu info firmware if needed.
1706 		 */
1707 		if (hdr->version_minor == 2) {
1708 			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1709 				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1710 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1711 			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1712 		}
1713 		break;
1714 	}
1715 	default:
1716 		dev_err(adev->dev,
1717 			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1718 		err = -EINVAL;
1719 		goto out;
1720 	}
1721 out:
1722 	return err;
1723 }
1724 
1725 /**
1726  * amdgpu_device_ip_early_init - run early init for hardware IPs
1727  *
1728  * @adev: amdgpu_device pointer
1729  *
1730  * Early initialization pass for hardware IPs.  The hardware IPs that make
1731  * up each asic are discovered and each IP's early_init callback is run.  This
1732  * is the first stage in initializing the asic.
1733  * Returns 0 on success, negative error code on failure.
1734  */
1735 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
1736 {
1737 	int i, r;
1738 
1739 	amdgpu_device_enable_virtual_display(adev);
1740 
1741 	if (amdgpu_sriov_vf(adev)) {
1742 		r = amdgpu_virt_request_full_gpu(adev, true);
1743 		if (r)
1744 			return r;
1745 	}
1746 
1747 	switch (adev->asic_type) {
1748 #ifdef CONFIG_DRM_AMDGPU_SI
1749 	case CHIP_VERDE:
1750 	case CHIP_TAHITI:
1751 	case CHIP_PITCAIRN:
1752 	case CHIP_OLAND:
1753 	case CHIP_HAINAN:
1754 		adev->family = AMDGPU_FAMILY_SI;
1755 		r = si_set_ip_blocks(adev);
1756 		if (r)
1757 			return r;
1758 		break;
1759 #endif
1760 #ifdef CONFIG_DRM_AMDGPU_CIK
1761 	case CHIP_BONAIRE:
1762 	case CHIP_HAWAII:
1763 	case CHIP_KAVERI:
1764 	case CHIP_KABINI:
1765 	case CHIP_MULLINS:
1766 		if (adev->flags & AMD_IS_APU)
1767 			adev->family = AMDGPU_FAMILY_KV;
1768 		else
1769 			adev->family = AMDGPU_FAMILY_CI;
1770 
1771 		r = cik_set_ip_blocks(adev);
1772 		if (r)
1773 			return r;
1774 		break;
1775 #endif
1776 	case CHIP_TOPAZ:
1777 	case CHIP_TONGA:
1778 	case CHIP_FIJI:
1779 	case CHIP_POLARIS10:
1780 	case CHIP_POLARIS11:
1781 	case CHIP_POLARIS12:
1782 	case CHIP_VEGAM:
1783 	case CHIP_CARRIZO:
1784 	case CHIP_STONEY:
1785 		if (adev->flags & AMD_IS_APU)
1786 			adev->family = AMDGPU_FAMILY_CZ;
1787 		else
1788 			adev->family = AMDGPU_FAMILY_VI;
1789 
1790 		r = vi_set_ip_blocks(adev);
1791 		if (r)
1792 			return r;
1793 		break;
1794 	case CHIP_VEGA10:
1795 	case CHIP_VEGA12:
1796 	case CHIP_VEGA20:
1797 	case CHIP_RAVEN:
1798 	case CHIP_ARCTURUS:
1799 	case CHIP_RENOIR:
1800 		if (adev->flags & AMD_IS_APU)
1801 			adev->family = AMDGPU_FAMILY_RV;
1802 		else
1803 			adev->family = AMDGPU_FAMILY_AI;
1804 
1805 		r = soc15_set_ip_blocks(adev);
1806 		if (r)
1807 			return r;
1808 		break;
1809 	case  CHIP_NAVI10:
1810 	case  CHIP_NAVI14:
1811 	case  CHIP_NAVI12:
1812 	case  CHIP_SIENNA_CICHLID:
1813 	case  CHIP_NAVY_FLOUNDER:
1814 		adev->family = AMDGPU_FAMILY_NV;
1815 
1816 		r = nv_set_ip_blocks(adev);
1817 		if (r)
1818 			return r;
1819 		break;
1820 	default:
1821 		/* FIXME: not supported yet */
1822 		return -EINVAL;
1823 	}
1824 
1825 	amdgpu_amdkfd_device_probe(adev);
1826 
1827 	adev->pm.pp_feature = amdgpu_pp_feature_mask;
1828 	if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
1829 		adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
1830 
1831 	for (i = 0; i < adev->num_ip_blocks; i++) {
1832 		if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
1833 			DRM_ERROR("disabled ip block: %d <%s>\n",
1834 				  i, adev->ip_blocks[i].version->funcs->name);
1835 			adev->ip_blocks[i].status.valid = false;
1836 		} else {
1837 			if (adev->ip_blocks[i].version->funcs->early_init) {
1838 				r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
1839 				if (r == -ENOENT) {
1840 					adev->ip_blocks[i].status.valid = false;
1841 				} else if (r) {
1842 					DRM_ERROR("early_init of IP block <%s> failed %d\n",
1843 						  adev->ip_blocks[i].version->funcs->name, r);
1844 					return r;
1845 				} else {
1846 					adev->ip_blocks[i].status.valid = true;
1847 				}
1848 			} else {
1849 				adev->ip_blocks[i].status.valid = true;
1850 			}
1851 		}
1852 		/* get the vbios after the asic_funcs are set up */
1853 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
1854 			r = amdgpu_device_parse_gpu_info_fw(adev);
1855 			if (r)
1856 				return r;
1857 
1858 			/* Read BIOS */
1859 			if (!amdgpu_get_bios(adev))
1860 				return -EINVAL;
1861 
1862 			r = amdgpu_atombios_init(adev);
1863 			if (r) {
1864 				dev_err(adev->dev, "amdgpu_atombios_init failed\n");
1865 				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
1866 				return r;
1867 			}
1868 		}
1869 	}
1870 
1871 	adev->cg_flags &= amdgpu_cg_mask;
1872 	adev->pg_flags &= amdgpu_pg_mask;
1873 
1874 	return 0;
1875 }
1876 
1877 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
1878 {
1879 	int i, r;
1880 
1881 	for (i = 0; i < adev->num_ip_blocks; i++) {
1882 		if (!adev->ip_blocks[i].status.sw)
1883 			continue;
1884 		if (adev->ip_blocks[i].status.hw)
1885 			continue;
1886 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
1887 		    (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
1888 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
1889 			r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1890 			if (r) {
1891 				DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1892 					  adev->ip_blocks[i].version->funcs->name, r);
1893 				return r;
1894 			}
1895 			adev->ip_blocks[i].status.hw = true;
1896 		}
1897 	}
1898 
1899 	return 0;
1900 }
1901 
1902 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
1903 {
1904 	int i, r;
1905 
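	/*
	 * Bring up every remaining block prepared by sw_init; anything already
	 * marked .hw (COMMON/IH from phase1, GMC early in amdgpu_device_ip_init,
	 * PSP via amdgpu_device_fw_loading) is skipped here.
	 */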
1906 	for (i = 0; i < adev->num_ip_blocks; i++) {
1907 		if (!adev->ip_blocks[i].status.sw)
1908 			continue;
1909 		if (adev->ip_blocks[i].status.hw)
1910 			continue;
1911 		r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1912 		if (r) {
1913 			DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1914 				  adev->ip_blocks[i].version->funcs->name, r);
1915 			return r;
1916 		}
1917 		adev->ip_blocks[i].status.hw = true;
1918 	}
1919 
1920 	return 0;
1921 }
1922 
1923 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
1924 {
1925 	int r = 0;
1926 	int i;
1927 	uint32_t smu_version;
1928 
1929 	if (adev->asic_type >= CHIP_VEGA10) {
1930 		for (i = 0; i < adev->num_ip_blocks; i++) {
1931 			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
1932 				continue;
1933 
1934 			/* no need to do the fw loading again if already done */
1935 			if (adev->ip_blocks[i].status.hw)
1936 				break;
1937 
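			/*
			 * On a GPU reset or resume the PSP block is resumed so
			 * it can reload firmware; on a cold init it gets a full
			 * hw_init instead.
			 */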
1938 			if (amdgpu_in_reset(adev) || adev->in_suspend) {
1939 				r = adev->ip_blocks[i].version->funcs->resume(adev);
1940 				if (r) {
1941 					DRM_ERROR("resume of IP block <%s> failed %d\n",
1942 							  adev->ip_blocks[i].version->funcs->name, r);
1943 					return r;
1944 				}
1945 			} else {
1946 				r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1947 				if (r) {
1948 					DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1949 							  adev->ip_blocks[i].version->funcs->name, r);
1950 					return r;
1951 				}
1952 			}
1953 
1954 			adev->ip_blocks[i].status.hw = true;
1955 			break;
1956 		}
1957 	}
1958 
1959 	if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
1960 		r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
1961 
1962 	return r;
1963 }
1964 
1965 /**
1966  * amdgpu_device_ip_init - run init for hardware IPs
1967  *
1968  * @adev: amdgpu_device pointer
1969  *
1970  * Main initialization pass for hardware IPs.  The list of all the hardware
1971  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
1972  * are run.  sw_init initializes the software state associated with each IP
1973  * and hw_init initializes the hardware associated with each IP.
1974  * Returns 0 on success, negative error code on failure.
1975  */
1976 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
1977 {
1978 	int i, r;
1979 
1980 	r = amdgpu_ras_init(adev);
1981 	if (r)
1982 		return r;
1983 
1984 	for (i = 0; i < adev->num_ip_blocks; i++) {
1985 		if (!adev->ip_blocks[i].status.valid)
1986 			continue;
1987 		r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
1988 		if (r) {
1989 			DRM_ERROR("sw_init of IP block <%s> failed %d\n",
1990 				  adev->ip_blocks[i].version->funcs->name, r);
1991 			goto init_failed;
1992 		}
1993 		adev->ip_blocks[i].status.sw = true;
1994 
1995 		/* need to do gmc hw init early so we can allocate gpu mem */
1996 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
1997 			r = amdgpu_device_vram_scratch_init(adev);
1998 			if (r) {
1999 				DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2000 				goto init_failed;
2001 			}
2002 			r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2003 			if (r) {
2004 				DRM_ERROR("hw_init %d failed %d\n", i, r);
2005 				goto init_failed;
2006 			}
2007 			r = amdgpu_device_wb_init(adev);
2008 			if (r) {
2009 				DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2010 				goto init_failed;
2011 			}
2012 			adev->ip_blocks[i].status.hw = true;
2013 
2014 			/* right after GMC hw init, we create CSA */
2015 			if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2016 				r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2017 								AMDGPU_GEM_DOMAIN_VRAM,
2018 								AMDGPU_CSA_SIZE);
2019 				if (r) {
2020 					DRM_ERROR("allocate CSA failed %d\n", r);
2021 					goto init_failed;
2022 				}
2023 			}
2024 		}
2025 	}
2026 
2027 	if (amdgpu_sriov_vf(adev))
2028 		amdgpu_virt_init_data_exchange(adev);
2029 
2030 	r = amdgpu_ib_pool_init(adev);
2031 	if (r) {
2032 		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2033 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2034 		goto init_failed;
2035 	}
2036 
2037 	r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init is complete */
2038 	if (r)
2039 		goto init_failed;
2040 
2041 	r = amdgpu_device_ip_hw_init_phase1(adev);
2042 	if (r)
2043 		goto init_failed;
2044 
2045 	r = amdgpu_device_fw_loading(adev);
2046 	if (r)
2047 		goto init_failed;
2048 
2049 	r = amdgpu_device_ip_hw_init_phase2(adev);
2050 	if (r)
2051 		goto init_failed;
2052 
2053 	/*
2054 	 * Retired pages will be loaded from eeprom and reserved here;
2055 	 * this must be called after amdgpu_device_ip_hw_init_phase2 since,
2056 	 * for some ASICs, the RAS EEPROM code relies on the SMU being fully
2057 	 * functional for I2C communication, which is only true at this point.
2058 	 * recovery_init may fail, but it can free all resources allocated by
2059 	 * itself and its failure should not stop the amdgpu init process.
2060 	 *
2061 	 * Note: theoretically, this should be called before all vram allocations
2062 	 * to protect retired pages from being reused.
2063 	 */
2064 	amdgpu_ras_recovery_init(adev);
2065 
2066 	if (adev->gmc.xgmi.num_physical_nodes > 1)
2067 		amdgpu_xgmi_add_device(adev);
2068 	amdgpu_amdkfd_device_init(adev);
2069 
2070 	amdgpu_fru_get_product_info(adev);
2071 
2072 init_failed:
2073 	if (amdgpu_sriov_vf(adev))
2074 		amdgpu_virt_release_full_gpu(adev, true);
2075 
2076 	return r;
2077 }
2078 
2079 /**
2080  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2081  *
2082  * @adev: amdgpu_device pointer
2083  *
2084  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2085  * this function before a GPU reset.  If the value is retained after a
2086  * GPU reset, VRAM has not been lost.  Some GPU resets may destroy VRAM contents.
2087  */
2088 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2089 {
2090 	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2091 }
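/*
 * Rough sketch of the scheme implemented by the helpers above and below:
 * AMDGPU_RESET_MAGIC_NUM bytes at the gart pointer are snapshotted into
 * adev->reset_magic before a reset and compared again afterwards; a mismatch
 * (or a BACO/mode1 reset while in reset) means VRAM is treated as lost.
 */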
2092 
2093 /**
2094  * amdgpu_device_check_vram_lost - check if vram is valid
2095  *
2096  * @adev: amdgpu_device pointer
2097  *
2098  * Checks the reset magic value written to the gart pointer in VRAM.
2099  * The driver calls this after a GPU reset to see if the contents of
2100  * VRAM have been lost or not.
2101  * Returns true if vram is lost, false if not.
2102  */
2103 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2104 {
2105 	if (memcmp(adev->gart.ptr, adev->reset_magic,
2106 			AMDGPU_RESET_MAGIC_NUM))
2107 		return true;
2108 
2109 	if (!amdgpu_in_reset(adev))
2110 		return false;
2111 
2112 	/*
2113 	 * For all ASICs with baco/mode1 reset, the VRAM is
2114 	 * always assumed to be lost.
2115 	 */
2116 	switch (amdgpu_asic_reset_method(adev)) {
2117 	case AMD_RESET_METHOD_BACO:
2118 	case AMD_RESET_METHOD_MODE1:
2119 		return true;
2120 	default:
2121 		return false;
2122 	}
2123 }
2124 
2125 /**
2126  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2127  *
2128  * @adev: amdgpu_device pointer
2129  * @state: clockgating state (gate or ungate)
2130  *
2131  * The list of all the hardware IPs that make up the asic is walked and the
2132  * set_clockgating_state callbacks are run.
2133  * During late init this pass enables clockgating for hardware IPs;
2134  * during fini or suspend it disables clockgating.
2135  * Returns 0 on success, negative error code on failure.
2136  */
2137 
2138 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2139 						enum amd_clockgating_state state)
2140 {
2141 	int i, j, r;
2142 
2143 	if (amdgpu_emu_mode == 1)
2144 		return 0;
2145 
2146 	for (j = 0; j < adev->num_ip_blocks; j++) {
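		/*
		 * Gating walks the IP list front to back; ungating walks it
		 * back to front so clockgating is torn down in reverse order.
		 */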
2147 		i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2148 		if (!adev->ip_blocks[i].status.late_initialized)
2149 			continue;
2150 		/* skip CG for UVD/VCE/VCN/JPEG, it's handled specially */
2151 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2152 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2153 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2154 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2155 		    adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2156 			/* enable clockgating to save power */
2157 			r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2158 										     state);
2159 			if (r) {
2160 				DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2161 					  adev->ip_blocks[i].version->funcs->name, r);
2162 				return r;
2163 			}
2164 		}
2165 	}
2166 
2167 	return 0;
2168 }
2169 
2170 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2171 {
2172 	int i, j, r;
2173 
2174 	if (amdgpu_emu_mode == 1)
2175 		return 0;
2176 
2177 	for (j = 0; j < adev->num_ip_blocks; j++) {
2178 		i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2179 		if (!adev->ip_blocks[i].status.late_initialized)
2180 			continue;
2181 		/* skip PG for UVD/VCE/VCN/JPEG, it's handled specially */
2182 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2183 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2184 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2185 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2186 		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
2187 			/* enable powergating to save power */
2188 			r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2189 											state);
2190 			if (r) {
2191 				DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2192 					  adev->ip_blocks[i].version->funcs->name, r);
2193 				return r;
2194 			}
2195 		}
2196 	}
2197 	return 0;
2198 }
2199 
2200 static int amdgpu_device_enable_mgpu_fan_boost(void)
2201 {
2202 	struct amdgpu_gpu_instance *gpu_ins;
2203 	struct amdgpu_device *adev;
2204 	int i, ret = 0;
2205 
2206 	mutex_lock(&mgpu_info.mutex);
2207 
2208 	/*
2209 	 * MGPU fan boost feature should be enabled
2210 	 * only when there are two or more dGPUs in
2211 	 * the system
2212 	 */
2213 	if (mgpu_info.num_dgpu < 2)
2214 		goto out;
2215 
2216 	for (i = 0; i < mgpu_info.num_dgpu; i++) {
2217 		gpu_ins = &(mgpu_info.gpu_ins[i]);
2218 		adev = gpu_ins->adev;
2219 		if (!(adev->flags & AMD_IS_APU) &&
2220 		    !gpu_ins->mgpu_fan_enabled &&
2221 		    adev->powerplay.pp_funcs &&
2222 		    adev->powerplay.pp_funcs->enable_mgpu_fan_boost) {
2223 			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2224 			if (ret)
2225 				break;
2226 
2227 			gpu_ins->mgpu_fan_enabled = 1;
2228 		}
2229 	}
2230 
2231 out:
2232 	mutex_unlock(&mgpu_info.mutex);
2233 
2234 	return ret;
2235 }
2236 
2237 /**
2238  * amdgpu_device_ip_late_init - run late init for hardware IPs
2239  *
2240  * @adev: amdgpu_device pointer
2241  *
2242  * Late initialization pass for hardware IPs.  The list of all the hardware
2243  * IPs that make up the asic is walked and the late_init callbacks are run.
2244  * late_init covers any special initialization that an IP requires
2245  * after all of the IP blocks have been initialized or something that needs to happen
2246  * late in the init process.
2247  * Returns 0 on success, negative error code on failure.
2248  */
2249 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2250 {
2251 	struct amdgpu_gpu_instance *gpu_instance;
2252 	int i = 0, r;
2253 
2254 	for (i = 0; i < adev->num_ip_blocks; i++) {
2255 		if (!adev->ip_blocks[i].status.hw)
2256 			continue;
2257 		if (adev->ip_blocks[i].version->funcs->late_init) {
2258 			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2259 			if (r) {
2260 				DRM_ERROR("late_init of IP block <%s> failed %d\n",
2261 					  adev->ip_blocks[i].version->funcs->name, r);
2262 				return r;
2263 			}
2264 		}
2265 		adev->ip_blocks[i].status.late_initialized = true;
2266 	}
2267 
2268 	amdgpu_ras_set_error_query_ready(adev, true);
2269 
2270 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2271 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2272 
2273 	amdgpu_device_fill_reset_magic(adev);
2274 
2275 	r = amdgpu_device_enable_mgpu_fan_boost();
2276 	if (r)
2277 		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2278 
2279 
2280 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
2281 		mutex_lock(&mgpu_info.mutex);
2282 
2283 		/*
2284 		 * Reset the device p-state to low, as it was booted with high.
2285 		 *
2286 		 * This should be performed only after all devices from the same
2287 		 * hive have been initialized.
2288 		 *
2289 		 * However, the number of devices in a hive is not known in
2290 		 * advance; it is counted one by one as devices are initialized.
2291 		 *
2292 		 * So we wait until all XGMI interlinked devices are initialized.
2293 		 * This may add some delay, as those devices may come from
2294 		 * different hives, but that should be OK.
2295 		 */
2296 		if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2297 			for (i = 0; i < mgpu_info.num_gpu; i++) {
2298 				gpu_instance = &(mgpu_info.gpu_ins[i]);
2299 				if (gpu_instance->adev->flags & AMD_IS_APU)
2300 					continue;
2301 
2302 				r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2303 						AMDGPU_XGMI_PSTATE_MIN);
2304 				if (r) {
2305 					DRM_ERROR("pstate setting failed (%d).\n", r);
2306 					break;
2307 				}
2308 			}
2309 		}
2310 
2311 		mutex_unlock(&mgpu_info.mutex);
2312 	}
2313 
2314 	return 0;
2315 }
2316 
2317 /**
2318  * amdgpu_device_ip_fini - run fini for hardware IPs
2319  *
2320  * @adev: amdgpu_device pointer
2321  *
2322  * Main teardown pass for hardware IPs.  The list of all the hardware
2323  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2324  * are run.  hw_fini tears down the hardware associated with each IP
2325  * and sw_fini tears down any software state associated with each IP.
2326  * Returns 0 on success, negative error code on failure.
2327  */
2328 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2329 {
2330 	int i, r;
2331 
2332 	if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2333 		amdgpu_virt_release_ras_err_handler_data(adev);
2334 
2335 	amdgpu_ras_pre_fini(adev);
2336 
2337 	if (adev->gmc.xgmi.num_physical_nodes > 1)
2338 		amdgpu_xgmi_remove_device(adev);
2339 
2340 	amdgpu_amdkfd_device_fini(adev);
2341 
2342 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2343 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2344 
2345 	/* need to disable SMC first */
2346 	for (i = 0; i < adev->num_ip_blocks; i++) {
2347 		if (!adev->ip_blocks[i].status.hw)
2348 			continue;
2349 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2350 			r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2351 			/* XXX handle errors */
2352 			if (r) {
2353 				DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2354 					  adev->ip_blocks[i].version->funcs->name, r);
2355 			}
2356 			adev->ip_blocks[i].status.hw = false;
2357 			break;
2358 		}
2359 	}
2360 
2361 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2362 		if (!adev->ip_blocks[i].status.hw)
2363 			continue;
2364 
2365 		r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2366 		/* XXX handle errors */
2367 		if (r) {
2368 			DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2369 				  adev->ip_blocks[i].version->funcs->name, r);
2370 		}
2371 
2372 		adev->ip_blocks[i].status.hw = false;
2373 	}
2374 
2375 
2376 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2377 		if (!adev->ip_blocks[i].status.sw)
2378 			continue;
2379 
2380 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2381 			amdgpu_ucode_free_bo(adev);
2382 			amdgpu_free_static_csa(&adev->virt.csa_obj);
2383 			amdgpu_device_wb_fini(adev);
2384 			amdgpu_device_vram_scratch_fini(adev);
2385 			amdgpu_ib_pool_fini(adev);
2386 		}
2387 
2388 		r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2389 		/* XXX handle errors */
2390 		if (r) {
2391 			DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2392 				  adev->ip_blocks[i].version->funcs->name, r);
2393 		}
2394 		adev->ip_blocks[i].status.sw = false;
2395 		adev->ip_blocks[i].status.valid = false;
2396 	}
2397 
2398 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2399 		if (!adev->ip_blocks[i].status.late_initialized)
2400 			continue;
2401 		if (adev->ip_blocks[i].version->funcs->late_fini)
2402 			adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2403 		adev->ip_blocks[i].status.late_initialized = false;
2404 	}
2405 
2406 	amdgpu_ras_fini(adev);
2407 
2408 	if (amdgpu_sriov_vf(adev))
2409 		if (amdgpu_virt_release_full_gpu(adev, false))
2410 			DRM_ERROR("failed to release exclusive mode on fini\n");
2411 
2412 	return 0;
2413 }
2414 
2415 /**
2416  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2417  *
2418  * @work: work_struct.
2419  */
2420 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2421 {
2422 	struct amdgpu_device *adev =
2423 		container_of(work, struct amdgpu_device, delayed_init_work.work);
2424 	int r;
2425 
2426 	r = amdgpu_ib_ring_tests(adev);
2427 	if (r)
2428 		DRM_ERROR("ib ring test failed (%d).\n", r);
2429 }
2430 
2431 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2432 {
2433 	struct amdgpu_device *adev =
2434 		container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2435 
2436 	mutex_lock(&adev->gfx.gfx_off_mutex);
2437 	if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
2438 		if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2439 			adev->gfx.gfx_off_state = true;
2440 	}
2441 	mutex_unlock(&adev->gfx.gfx_off_mutex);
2442 }
2443 
2444 /**
2445  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2446  *
2447  * @adev: amdgpu_device pointer
2448  *
2449  * First suspend pass for hardware IPs.  The list of all the hardware
2450  * IPs that make up the asic is walked, clockgating is disabled and the
2451  * suspend callbacks for the display (DCE) blocks are run.  suspend puts
2452  * the hardware and software state in each IP into a state suitable for suspend.
2453  * Returns 0 on success, negative error code on failure.
2454  */
2455 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2456 {
2457 	int i, r;
2458 
2459 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2460 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2461 
2462 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2463 		if (!adev->ip_blocks[i].status.valid)
2464 			continue;
2465 
2466 		/* displays are handled separately */
2467 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2468 			continue;
2469 
2471 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
2472 		/* XXX handle errors */
2473 		if (r) {
2474 			DRM_ERROR("suspend of IP block <%s> failed %d\n",
2475 				  adev->ip_blocks[i].version->funcs->name, r);
2476 			return r;
2477 		}
2478 
2479 		adev->ip_blocks[i].status.hw = false;
2480 	}
2481 
2482 	return 0;
2483 }
2484 
2485 /**
2486  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2487  *
2488  * @adev: amdgpu_device pointer
2489  *
2490  * Second suspend pass for hardware IPs.  The list of all the hardware
2491  * IPs that make up the asic is walked and the suspend callbacks for all
2492  * blocks except the display (DCE) blocks are run.  suspend puts the
2493  * hardware and software state in each IP into a state suitable for suspend.
2494  * Returns 0 on success, negative error code on failure.
2495  */
2496 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2497 {
2498 	int i, r;
2499 
2500 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2501 		if (!adev->ip_blocks[i].status.valid)
2502 			continue;
2503 		/* displays are handled in phase1 */
2504 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2505 			continue;
2506 		/* PSP lost connection when err_event_athub occurs */
2507 		if (amdgpu_ras_intr_triggered() &&
2508 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2509 			adev->ip_blocks[i].status.hw = false;
2510 			continue;
2511 		}
2512 		/* XXX handle errors */
2513 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
2514 		/* XXX handle errors */
2515 		if (r) {
2516 			DRM_ERROR("suspend of IP block <%s> failed %d\n",
2517 				  adev->ip_blocks[i].version->funcs->name, r);
2518 		}
2519 		adev->ip_blocks[i].status.hw = false;
2520 		/* handle putting the SMC in the appropriate state */
2521 		if (!amdgpu_sriov_vf(adev)) {
2522 			if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2523 				r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2524 				if (r) {
2525 					DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2526 							adev->mp1_state, r);
2527 					return r;
2528 				}
2529 			}
2530 		}
2532 	}
2533 
2534 	return 0;
2535 }
2536 
2537 /**
2538  * amdgpu_device_ip_suspend - run suspend for hardware IPs
2539  *
2540  * @adev: amdgpu_device pointer
2541  *
2542  * Main suspend function for hardware IPs.  The list of all the hardware
2543  * IPs that make up the asic is walked, clockgating is disabled and the
2544  * suspend callbacks are run.  suspend puts the hardware and software state
2545  * in each IP into a state suitable for suspend.
2546  * Returns 0 on success, negative error code on failure.
2547  */
2548 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2549 {
2550 	int r;
2551 
2552 	if (amdgpu_sriov_vf(adev))
2553 		amdgpu_virt_request_full_gpu(adev, false);
2554 
2555 	r = amdgpu_device_ip_suspend_phase1(adev);
2556 	if (r)
2557 		return r;
2558 	r = amdgpu_device_ip_suspend_phase2(adev);
2559 
2560 	if (amdgpu_sriov_vf(adev))
2561 		amdgpu_virt_release_full_gpu(adev, false);
2562 
2563 	return r;
2564 }
2565 
2566 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
2567 {
2568 	int i, r;
2569 
2570 	static enum amd_ip_block_type ip_order[] = {
2571 		AMD_IP_BLOCK_TYPE_GMC,
2572 		AMD_IP_BLOCK_TYPE_COMMON,
2573 		AMD_IP_BLOCK_TYPE_PSP,
2574 		AMD_IP_BLOCK_TYPE_IH,
2575 	};
2576 
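	/*
	 * Blocks are re-initialized strictly in the order listed above; the
	 * remaining IPs are handled by amdgpu_device_ip_reinit_late_sriov().
	 */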
2577 	for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2578 		int j;
2579 		struct amdgpu_ip_block *block;
2580 
2581 		for (j = 0; j < adev->num_ip_blocks; j++) {
2582 			block = &adev->ip_blocks[j];
2583 
2584 			block->status.hw = false;
2585 			if (block->version->type != ip_order[i] ||
2586 				!block->status.valid)
2587 				continue;
2588 
2589 			r = block->version->funcs->hw_init(adev);
2590 			DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r ? "failed" : "succeeded");
2591 			if (r)
2592 				return r;
2593 			block->status.hw = true;
2594 		}
2595 	}
2596 
2597 	return 0;
2598 }
2599 
2600 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
2601 {
2602 	int i, r;
2603 
2604 	static enum amd_ip_block_type ip_order[] = {
2605 		AMD_IP_BLOCK_TYPE_SMC,
2606 		AMD_IP_BLOCK_TYPE_DCE,
2607 		AMD_IP_BLOCK_TYPE_GFX,
2608 		AMD_IP_BLOCK_TYPE_SDMA,
2609 		AMD_IP_BLOCK_TYPE_UVD,
2610 		AMD_IP_BLOCK_TYPE_VCE,
2611 		AMD_IP_BLOCK_TYPE_VCN
2612 	};
2613 
2614 	for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2615 		int j;
2616 		struct amdgpu_ip_block *block;
2617 
2618 		for (j = 0; j < adev->num_ip_blocks; j++) {
2619 			block = &adev->ip_blocks[j];
2620 
2621 			if (block->version->type != ip_order[i] ||
2622 				!block->status.valid ||
2623 				block->status.hw)
2624 				continue;
2625 
2626 			if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2627 				r = block->version->funcs->resume(adev);
2628 			else
2629 				r = block->version->funcs->hw_init(adev);
2630 
2631 			DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r ? "failed" : "succeeded");
2632 			if (r)
2633 				return r;
2634 			block->status.hw = true;
2635 		}
2636 	}
2637 
2638 	return 0;
2639 }
2640 
2641 /**
2642  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2643  *
2644  * @adev: amdgpu_device pointer
2645  *
2646  * First resume function for hardware IPs.  The list of all the hardware
2647  * IPs that make up the asic is walked and the resume callbacks are run for
2648  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
2649  * after a suspend and updates the software state as necessary.  This
2650  * function is also used for restoring the GPU after a GPU reset.
2651  * Returns 0 on success, negative error code on failure.
2652  */
2653 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2654 {
2655 	int i, r;
2656 
2657 	for (i = 0; i < adev->num_ip_blocks; i++) {
2658 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2659 			continue;
2660 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2661 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2662 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2663 
2664 			r = adev->ip_blocks[i].version->funcs->resume(adev);
2665 			if (r) {
2666 				DRM_ERROR("resume of IP block <%s> failed %d\n",
2667 					  adev->ip_blocks[i].version->funcs->name, r);
2668 				return r;
2669 			}
2670 			adev->ip_blocks[i].status.hw = true;
2671 		}
2672 	}
2673 
2674 	return 0;
2675 }
2676 
2677 /**
2678  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2679  *
2680  * @adev: amdgpu_device pointer
2681  *
2682  * Second resume function for hardware IPs.  The list of all the hardware
2683  * IPs that make up the asic is walked and the resume callbacks are run for
2684  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
2685  * functional state after a suspend and updates the software state as
2686  * necessary.  This function is also used for restoring the GPU after a GPU
2687  * reset.
2688  * Returns 0 on success, negative error code on failure.
2689  */
2690 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2691 {
2692 	int i, r;
2693 
2694 	for (i = 0; i < adev->num_ip_blocks; i++) {
2695 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2696 			continue;
2697 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2698 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2699 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2700 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2701 			continue;
2702 		r = adev->ip_blocks[i].version->funcs->resume(adev);
2703 		if (r) {
2704 			DRM_ERROR("resume of IP block <%s> failed %d\n",
2705 				  adev->ip_blocks[i].version->funcs->name, r);
2706 			return r;
2707 		}
2708 		adev->ip_blocks[i].status.hw = true;
2709 	}
2710 
2711 	return 0;
2712 }
2713 
2714 /**
2715  * amdgpu_device_ip_resume - run resume for hardware IPs
2716  *
2717  * @adev: amdgpu_device pointer
2718  *
2719  * Main resume function for hardware IPs.  The hardware IPs
2720  * are split into two resume functions because they are
2721  * are split into two resume functions because they are
2722  * also used in recovering from a GPU reset and some additional
2723  * steps need to be taken between them.  In this case (S3/S4) they are
2724  * Returns 0 on success, negative error code on failure.
2725  */
2726 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
2727 {
2728 	int r;
2729 
2730 	r = amdgpu_device_ip_resume_phase1(adev);
2731 	if (r)
2732 		return r;
2733 
2734 	r = amdgpu_device_fw_loading(adev);
2735 	if (r)
2736 		return r;
2737 
2738 	r = amdgpu_device_ip_resume_phase2(adev);
2739 
2740 	return r;
2741 }
2742 
2743 /**
2744  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2745  *
2746  * @adev: amdgpu_device pointer
2747  *
2748  * Query the VBIOS data tables to determine if the board supports SR-IOV.
2749  */
2750 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
2751 {
2752 	if (amdgpu_sriov_vf(adev)) {
2753 		if (adev->is_atom_fw) {
2754 			if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2755 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2756 		} else {
2757 			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2758 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2759 		}
2760 
2761 		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2762 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
2763 	}
2764 }
2765 
2766 /**
2767  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2768  *
2769  * @asic_type: AMD asic type
2770  *
2771  * Check if there is DC (new modesetting infrastructure) support for an asic.
2772  * Returns true if DC has support, false if not.
2773  */
2774 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
2775 {
2776 	switch (asic_type) {
2777 #if defined(CONFIG_DRM_AMD_DC)
2778 	case CHIP_BONAIRE:
2779 	case CHIP_KAVERI:
2780 	case CHIP_KABINI:
2781 	case CHIP_MULLINS:
2782 		/*
2783 		 * We have systems in the wild with these ASICs that require
2784 		 * LVDS and VGA support which is not supported with DC.
2785 		 *
2786 		 * Fallback to the non-DC driver here by default so as not to
2787 		 * cause regressions.
2788 		 */
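		/*
		 * amdgpu_dc is the amdgpu.dc module parameter (assumed to
		 * default to -1, i.e. auto): these ASICs use DC only when it
		 * is explicitly set to 1, while the ASICs below use DC unless
		 * it is set to 0.
		 */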
2789 		return amdgpu_dc > 0;
2790 	case CHIP_HAWAII:
2791 	case CHIP_CARRIZO:
2792 	case CHIP_STONEY:
2793 	case CHIP_POLARIS10:
2794 	case CHIP_POLARIS11:
2795 	case CHIP_POLARIS12:
2796 	case CHIP_VEGAM:
2797 	case CHIP_TONGA:
2798 	case CHIP_FIJI:
2799 	case CHIP_VEGA10:
2800 	case CHIP_VEGA12:
2801 	case CHIP_VEGA20:
2802 #if defined(CONFIG_DRM_AMD_DC_DCN)
2803 	case CHIP_RAVEN:
2804 	case CHIP_NAVI10:
2805 	case CHIP_NAVI14:
2806 	case CHIP_NAVI12:
2807 	case CHIP_RENOIR:
2808 #endif
2809 #if defined(CONFIG_DRM_AMD_DC_DCN3_0)
2810 	case CHIP_SIENNA_CICHLID:
2811 	case CHIP_NAVY_FLOUNDER:
2812 #endif
2813 		return amdgpu_dc != 0;
2814 #endif
2815 	default:
2816 		if (amdgpu_dc > 0)
2817 			DRM_INFO("Display Core has been requested via kernel parameter "
2818 					 "but isn't supported by ASIC, ignoring\n");
2819 		return false;
2820 	}
2821 }
2822 
2823 /**
2824  * amdgpu_device_has_dc_support - check if dc is supported
2825  *
2826  * @adev: amdgpu_device pointer
2827  *
2828  * Returns true for supported, false for not supported
2829  */
2830 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
2831 {
2832 	if (amdgpu_sriov_vf(adev))
2833 		return false;
2834 
2835 	return amdgpu_device_asic_has_dc_support(adev->asic_type);
2836 }
2837 
2838 
2839 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
2840 {
2841 	struct amdgpu_device *adev =
2842 		container_of(__work, struct amdgpu_device, xgmi_reset_work);
2843 	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
2844 
2845 	/* It's a bug to not have a hive within this function */
2846 	if (WARN_ON(!hive))
2847 		return;
2848 
2849 	/*
2850 	 * Use task barrier to synchronize all xgmi reset works across the
2851 	 * hive. task_barrier_enter and task_barrier_exit will block
2852 	 * until all the threads running the xgmi reset works reach
2853 	 * those points. task_barrier_full will do both blocks.
2854 	 */
2855 	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
2856 
2857 		task_barrier_enter(&hive->tb);
2858 		adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev);
2859 
2860 		if (adev->asic_reset_res)
2861 			goto fail;
2862 
2863 		task_barrier_exit(&hive->tb);
2864 		adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev);
2865 
2866 		if (adev->asic_reset_res)
2867 			goto fail;
2868 
2869 		if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
2870 			adev->mmhub.funcs->reset_ras_error_count(adev);
2871 	} else {
2872 
2873 		task_barrier_full(&hive->tb);
2874 		adev->asic_reset_res =  amdgpu_asic_reset(adev);
2875 	}
2876 
2877 fail:
2878 	if (adev->asic_reset_res)
2879 		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
2880 			 adev->asic_reset_res, adev->ddev->unique);
2881 }
2882 
2883 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
2884 {
2885 	char *input = amdgpu_lockup_timeout;
2886 	char *timeout_setting = NULL;
2887 	int index = 0;
2888 	long timeout;
2889 	int ret = 0;
2890 
2891 	/*
2892 	 * By default the timeout for non-compute jobs is 10000 ms and there
2893 	 * is no timeout enforced on compute jobs.
2894 	 * In SR-IOV or passthrough mode, the default timeout for compute
2895 	 * jobs is 60000 ms.
2896 	 */
2897 	adev->gfx_timeout = msecs_to_jiffies(10000);
2898 	adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
2899 	if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
2900 		adev->compute_timeout =  msecs_to_jiffies(60000);
2901 	else
2902 		adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
2903 
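	/*
	 * Illustrative (made-up) example of the string parsed below:
	 * amdgpu.lockup_timeout=10000,60000,-1,5000 would yield gfx=10000ms,
	 * compute=60000ms, sdma=no timeout and video=5000ms, while a single
	 * value applies to all non-compute jobs (and to compute jobs under
	 * SR-IOV/passthrough).
	 */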
2904 	if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
2905 		while ((timeout_setting = strsep(&input, ",")) &&
2906 				strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
2907 			ret = kstrtol(timeout_setting, 0, &timeout);
2908 			if (ret)
2909 				return ret;
2910 
2911 			if (timeout == 0) {
2912 				index++;
2913 				continue;
2914 			} else if (timeout < 0) {
2915 				timeout = MAX_SCHEDULE_TIMEOUT;
2916 			} else {
2917 				timeout = msecs_to_jiffies(timeout);
2918 			}
2919 
2920 			switch (index++) {
2921 			case 0:
2922 				adev->gfx_timeout = timeout;
2923 				break;
2924 			case 1:
2925 				adev->compute_timeout = timeout;
2926 				break;
2927 			case 2:
2928 				adev->sdma_timeout = timeout;
2929 				break;
2930 			case 3:
2931 				adev->video_timeout = timeout;
2932 				break;
2933 			default:
2934 				break;
2935 			}
2936 		}
2937 		/*
2938 		 * If only one value was specified, it should apply
2939 		 * to all non-compute jobs.
2940 		 */
2941 		if (index == 1) {
2942 			adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
2943 			if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
2944 				adev->compute_timeout = adev->gfx_timeout;
2945 		}
2946 	}
2947 
2948 	return ret;
2949 }
2950 
2951 static const struct attribute *amdgpu_dev_attributes[] = {
2952 	&dev_attr_product_name.attr,
2953 	&dev_attr_product_number.attr,
2954 	&dev_attr_serial_number.attr,
2955 	&dev_attr_pcie_replay_count.attr,
2956 	NULL
2957 };
2958 
2959 /**
2960  * amdgpu_device_init - initialize the driver
2961  *
2962  * @adev: amdgpu_device pointer
2963  * @ddev: drm dev pointer
2964  * @pdev: pci dev pointer
2965  * @flags: driver flags
2966  *
2967  * Initializes the driver info and hw (all asics).
2968  * Returns 0 for success or an error on failure.
2969  * Called at driver startup.
2970  */
2971 int amdgpu_device_init(struct amdgpu_device *adev,
2972 		       struct drm_device *ddev,
2973 		       struct pci_dev *pdev,
2974 		       uint32_t flags)
2975 {
2976 	int r, i;
2977 	bool boco = false;
2978 	u32 max_MBps;
2979 
2980 	adev->shutdown = false;
2981 	adev->dev = &pdev->dev;
2982 	adev->ddev = ddev;
2983 	adev->pdev = pdev;
2984 	adev->flags = flags;
2985 
2986 	if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
2987 		adev->asic_type = amdgpu_force_asic_type;
2988 	else
2989 		adev->asic_type = flags & AMD_ASIC_MASK;
2990 
2991 	adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
2992 	if (amdgpu_emu_mode == 1)
2993 		adev->usec_timeout *= 10;
2994 	adev->gmc.gart_size = 512 * 1024 * 1024;
2995 	adev->accel_working = false;
2996 	adev->num_rings = 0;
2997 	adev->mman.buffer_funcs = NULL;
2998 	adev->mman.buffer_funcs_ring = NULL;
2999 	adev->vm_manager.vm_pte_funcs = NULL;
3000 	adev->vm_manager.vm_pte_num_scheds = 0;
3001 	adev->gmc.gmc_funcs = NULL;
3002 	adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3003 	bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3004 
3005 	adev->smc_rreg = &amdgpu_invalid_rreg;
3006 	adev->smc_wreg = &amdgpu_invalid_wreg;
3007 	adev->pcie_rreg = &amdgpu_invalid_rreg;
3008 	adev->pcie_wreg = &amdgpu_invalid_wreg;
3009 	adev->pciep_rreg = &amdgpu_invalid_rreg;
3010 	adev->pciep_wreg = &amdgpu_invalid_wreg;
3011 	adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3012 	adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3013 	adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3014 	adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3015 	adev->didt_rreg = &amdgpu_invalid_rreg;
3016 	adev->didt_wreg = &amdgpu_invalid_wreg;
3017 	adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3018 	adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3019 	adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3020 	adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3021 
3022 	DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3023 		 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3024 		 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3025 
3026 	/* mutex initializations are all done here so we
3027 	 * can call these functions again without locking issues */
3028 	atomic_set(&adev->irq.ih.lock, 0);
3029 	mutex_init(&adev->firmware.mutex);
3030 	mutex_init(&adev->pm.mutex);
3031 	mutex_init(&adev->gfx.gpu_clock_mutex);
3032 	mutex_init(&adev->srbm_mutex);
3033 	mutex_init(&adev->gfx.pipe_reserve_mutex);
3034 	mutex_init(&adev->gfx.gfx_off_mutex);
3035 	mutex_init(&adev->grbm_idx_mutex);
3036 	mutex_init(&adev->mn_lock);
3037 	mutex_init(&adev->virt.vf_errors.lock);
3038 	hash_init(adev->mn_hash);
3039 	init_rwsem(&adev->reset_sem);
3040 	atomic_set(&adev->in_gpu_reset, 0);
3041 	mutex_init(&adev->psp.mutex);
3042 	mutex_init(&adev->notifier_lock);
3043 
3044 	r = amdgpu_device_check_arguments(adev);
3045 	if (r)
3046 		return r;
3047 
3048 	spin_lock_init(&adev->mmio_idx_lock);
3049 	spin_lock_init(&adev->smc_idx_lock);
3050 	spin_lock_init(&adev->pcie_idx_lock);
3051 	spin_lock_init(&adev->uvd_ctx_idx_lock);
3052 	spin_lock_init(&adev->didt_idx_lock);
3053 	spin_lock_init(&adev->gc_cac_idx_lock);
3054 	spin_lock_init(&adev->se_cac_idx_lock);
3055 	spin_lock_init(&adev->audio_endpt_idx_lock);
3056 	spin_lock_init(&adev->mm_stats.lock);
3057 
3058 	INIT_LIST_HEAD(&adev->shadow_list);
3059 	mutex_init(&adev->shadow_list_lock);
3060 
3061 	INIT_DELAYED_WORK(&adev->delayed_init_work,
3062 			  amdgpu_device_delayed_init_work_handler);
3063 	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3064 			  amdgpu_device_delay_enable_gfx_off);
3065 
3066 	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3067 
3068 	adev->gfx.gfx_off_req_count = 1;
3069 	adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3070 
3071 	atomic_set(&adev->throttling_logging_enabled, 1);
3072 	/*
3073 	 * If throttling continues, logging will be performed every minute
3074 	 * to avoid log flooding. "-1" is subtracted since the thermal
3075 	 * throttling interrupt comes every second. Thus, the total logging
3076 	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3077 	 * for the throttling interrupt) = 60 seconds.
3078 	 */
3079 	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3080 	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3081 
3082 	/* Registers mapping */
3083 	/* TODO: block userspace mapping of io register */
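	/*
	 * Note: on BONAIRE (CIK) and newer ASICs the register aperture lives
	 * in PCI BAR 5, while the older SI-era parts expose it in BAR 2.
	 */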
3084 	if (adev->asic_type >= CHIP_BONAIRE) {
3085 		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3086 		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3087 	} else {
3088 		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3089 		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3090 	}
3091 
3092 	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3093 	if (adev->rmmio == NULL) {
3094 		return -ENOMEM;
3095 	}
3096 	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3097 	DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3098 
3099 	/* io port mapping */
3100 	for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
3101 		if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
3102 			adev->rio_mem_size = pci_resource_len(adev->pdev, i);
3103 			adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
3104 			break;
3105 		}
3106 	}
3107 	if (adev->rio_mem == NULL)
3108 		DRM_INFO("PCI I/O BAR is not found.\n");
3109 
3110 	/* enable PCIE atomic ops */
3111 	r = pci_enable_atomic_ops_to_root(adev->pdev,
3112 					  PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3113 					  PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3114 	if (r) {
3115 		adev->have_atomics_support = false;
3116 		DRM_INFO("PCIe atomic ops are not supported\n");
3117 	} else {
3118 		adev->have_atomics_support = true;
3119 	}
3120 
3121 	amdgpu_device_get_pcie_info(adev);
3122 
3123 	if (amdgpu_mcbp)
3124 		DRM_INFO("MCBP is enabled\n");
3125 
3126 	if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3127 		adev->enable_mes = true;
3128 
3129 	/* detect hw virtualization here */
3130 	amdgpu_detect_virtualization(adev);
3131 
3132 	r = amdgpu_device_get_job_timeout_settings(adev);
3133 	if (r) {
3134 		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3135 		return r;
3136 	}
3137 
3138 	/* early init functions */
3139 	r = amdgpu_device_ip_early_init(adev);
3140 	if (r)
3141 		return r;
3142 
3143 	/* doorbell bar mapping and doorbell index init */
3144 	amdgpu_device_doorbell_init(adev);
3145 
3146 	/* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3147 	/* this will fail for cards that aren't VGA class devices, just
3148 	 * ignore it */
3149 	vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3150 
3151 	if (amdgpu_device_supports_boco(ddev))
3152 		boco = true;
3153 	if (amdgpu_has_atpx() &&
3154 	    (amdgpu_is_atpx_hybrid() ||
3155 	     amdgpu_has_atpx_dgpu_power_cntl()) &&
3156 	    !pci_is_thunderbolt_attached(adev->pdev))
3157 		vga_switcheroo_register_client(adev->pdev,
3158 					       &amdgpu_switcheroo_ops, boco);
3159 	if (boco)
3160 		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3161 
3162 	if (amdgpu_emu_mode == 1) {
3163 		/* post the asic in emulation mode */
3164 		emu_soc_asic_init(adev);
3165 		goto fence_driver_init;
3166 	}
3167 
3168 	/* detect if we are with an SRIOV vbios */
3169 	amdgpu_device_detect_sriov_bios(adev);
3170 
3171 	/* check if we need to reset the asic
3172 	 *  E.g., driver was not cleanly unloaded previously, etc.
3173 	 */
3174 	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3175 		r = amdgpu_asic_reset(adev);
3176 		if (r) {
3177 			dev_err(adev->dev, "asic reset on init failed\n");
3178 			goto failed;
3179 		}
3180 	}
3181 
3182 	/* Post card if necessary */
3183 	if (amdgpu_device_need_post(adev)) {
3184 		if (!adev->bios) {
3185 			dev_err(adev->dev, "no vBIOS found\n");
3186 			r = -EINVAL;
3187 			goto failed;
3188 		}
3189 		DRM_INFO("GPU posting now...\n");
3190 		r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
3191 		if (r) {
3192 			dev_err(adev->dev, "gpu post error!\n");
3193 			goto failed;
3194 		}
3195 	}
3196 
3197 	if (adev->is_atom_fw) {
3198 		/* Initialize clocks */
3199 		r = amdgpu_atomfirmware_get_clock_info(adev);
3200 		if (r) {
3201 			dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3202 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3203 			goto failed;
3204 		}
3205 	} else {
3206 		/* Initialize clocks */
3207 		r = amdgpu_atombios_get_clock_info(adev);
3208 		if (r) {
3209 			dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3210 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3211 			goto failed;
3212 		}
3213 		/* init i2c buses */
3214 		if (!amdgpu_device_has_dc_support(adev))
3215 			amdgpu_atombios_i2c_init(adev);
3216 	}
3217 
3218 fence_driver_init:
3219 	/* Fence driver */
3220 	r = amdgpu_fence_driver_init(adev);
3221 	if (r) {
3222 		dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
3223 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3224 		goto failed;
3225 	}
3226 
3227 	/* init the mode config */
3228 	drm_mode_config_init(adev->ddev);
3229 
3230 	r = amdgpu_device_ip_init(adev);
3231 	if (r) {
3232 		/* failed in exclusive mode due to timeout */
3233 		if (amdgpu_sriov_vf(adev) &&
3234 		    !amdgpu_sriov_runtime(adev) &&
3235 		    amdgpu_virt_mmio_blocked(adev) &&
3236 		    !amdgpu_virt_wait_reset(adev)) {
3237 			dev_err(adev->dev, "VF exclusive mode timeout\n");
3238 			/* Don't send request since VF is inactive. */
3239 			adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3240 			adev->virt.ops = NULL;
3241 			r = -EAGAIN;
3242 			goto failed;
3243 		}
3244 		dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3245 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3246 		goto failed;
3247 	}
3248 
3249 	dev_info(adev->dev,
3250 		"SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3251 			adev->gfx.config.max_shader_engines,
3252 			adev->gfx.config.max_sh_per_se,
3253 			adev->gfx.config.max_cu_per_sh,
3254 			adev->gfx.cu_info.number);
3255 
3256 	adev->accel_working = true;
3257 
3258 	amdgpu_vm_check_compute_bug(adev);
3259 
3260 	/* Initialize the buffer migration limit. */
3261 	if (amdgpu_moverate >= 0)
3262 		max_MBps = amdgpu_moverate;
3263 	else
3264 		max_MBps = 8; /* Allow 8 MB/s. */
3265 	/* Get a log2 for easy divisions. */
3266 	adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3267 
3268 	amdgpu_fbdev_init(adev);
3269 
3270 	r = amdgpu_pm_sysfs_init(adev);
3271 	if (r) {
3272 		adev->pm_sysfs_en = false;
3273 		DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3274 	} else
3275 		adev->pm_sysfs_en = true;
3276 
3277 	r = amdgpu_ucode_sysfs_init(adev);
3278 	if (r) {
3279 		adev->ucode_sysfs_en = false;
3280 		DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3281 	} else
3282 		adev->ucode_sysfs_en = true;
3283 
3284 	if ((amdgpu_testing & 1)) {
3285 		if (adev->accel_working)
3286 			amdgpu_test_moves(adev);
3287 		else
3288 			DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3289 	}
3290 	if (amdgpu_benchmarking) {
3291 		if (adev->accel_working)
3292 			amdgpu_benchmark(adev, amdgpu_benchmarking);
3293 		else
3294 			DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3295 	}
3296 
3297 	/*
3298 	 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3299 	 * Otherwise the mgpu fan boost feature will be skipped because the
3300 	 * gpu instance count would be too low.
3301 	 */
3302 	amdgpu_register_gpu_instance(adev);
3303 
3304 	/* enable clockgating, etc. after ib tests, etc. since some blocks require
3305 	 * explicit gating rather than handling it automatically.
3306 	 */
3307 	r = amdgpu_device_ip_late_init(adev);
3308 	if (r) {
3309 		dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3310 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3311 		goto failed;
3312 	}
3313 
3314 	/* must succeed. */
3315 	amdgpu_ras_resume(adev);
3316 
3317 	queue_delayed_work(system_wq, &adev->delayed_init_work,
3318 			   msecs_to_jiffies(AMDGPU_RESUME_MS));
3319 
3320 	if (amdgpu_sriov_vf(adev))
3321 		flush_delayed_work(&adev->delayed_init_work);
3322 
3323 	r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3324 	if (r) {
3325 		dev_err(adev->dev, "Could not create amdgpu device attr\n");
3326 		return r;
3327 	}
3328 
3329 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
3330 		r = amdgpu_pmu_init(adev);
3331 	if (r)
3332 		dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3333 
3334 	return 0;
3335 
3336 failed:
3337 	amdgpu_vf_error_trans_all(adev);
3338 	if (boco)
3339 		vga_switcheroo_fini_domain_pm_ops(adev->dev);
3340 
3341 	return r;
3342 }
3343 
3344 /**
3345  * amdgpu_device_fini - tear down the driver
3346  *
3347  * @adev: amdgpu_device pointer
3348  *
3349  * Tear down the driver info (all asics).
3350  * Called at driver shutdown.
3351  */
3352 void amdgpu_device_fini(struct amdgpu_device *adev)
3353 {
3354 	int r;
3355 
3356 	DRM_INFO("amdgpu: finishing device.\n");
3357 	flush_delayed_work(&adev->delayed_init_work);
3358 	adev->shutdown = true;
3359 
3360 	/* make sure the IB tests have finished before entering exclusive mode
3361 	 * to avoid preemption on the IB tests
3362 	 */
3363 	if (amdgpu_sriov_vf(adev))
3364 		amdgpu_virt_request_full_gpu(adev, false);
3365 
3366 	/* disable all interrupts */
3367 	amdgpu_irq_disable_all(adev);
3368 	if (adev->mode_info.mode_config_initialized){
3369 		if (!amdgpu_device_has_dc_support(adev))
3370 			drm_helper_force_disable_all(adev->ddev);
3371 		else
3372 			drm_atomic_helper_shutdown(adev->ddev);
3373 	}
3374 	amdgpu_fence_driver_fini(adev);
3375 	if (adev->pm_sysfs_en)
3376 		amdgpu_pm_sysfs_fini(adev);
3377 	amdgpu_fbdev_fini(adev);
3378 	r = amdgpu_device_ip_fini(adev);
3379 	release_firmware(adev->firmware.gpu_info_fw);
3380 	adev->firmware.gpu_info_fw = NULL;
3381 	adev->accel_working = false;
3382 	/* free i2c buses */
3383 	if (!amdgpu_device_has_dc_support(adev))
3384 		amdgpu_i2c_fini(adev);
3385 
3386 	if (amdgpu_emu_mode != 1)
3387 		amdgpu_atombios_fini(adev);
3388 
3389 	kfree(adev->bios);
3390 	adev->bios = NULL;
3391 	if (amdgpu_has_atpx() &&
3392 	    (amdgpu_is_atpx_hybrid() ||
3393 	     amdgpu_has_atpx_dgpu_power_cntl()) &&
3394 	    !pci_is_thunderbolt_attached(adev->pdev))
3395 		vga_switcheroo_unregister_client(adev->pdev);
3396 	if (amdgpu_device_supports_boco(adev->ddev))
3397 		vga_switcheroo_fini_domain_pm_ops(adev->dev);
3398 	vga_client_register(adev->pdev, NULL, NULL, NULL);
3399 	if (adev->rio_mem)
3400 		pci_iounmap(adev->pdev, adev->rio_mem);
3401 	adev->rio_mem = NULL;
3402 	iounmap(adev->rmmio);
3403 	adev->rmmio = NULL;
3404 	amdgpu_device_doorbell_fini(adev);
3405 
3406 	if (adev->ucode_sysfs_en)
3407 		amdgpu_ucode_sysfs_fini(adev);
3408 
3409 	sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3410 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
3411 		amdgpu_pmu_fini(adev);
3412 	if (adev->discovery_bin)
3413 		amdgpu_discovery_fini(adev);
3414 }
3415 
3416 
3417 /*
3418  * Suspend & resume.
3419  */
3420 /**
3421  * amdgpu_device_suspend - initiate device suspend
3422  *
3423  * @dev: drm dev pointer
3424  * @fbcon: notify the fbdev of suspend
3425  *
3426  * Puts the hw in the suspend state (all asics).
3427  * Returns 0 for success or an error on failure.
3428  * Called at driver suspend.
3429  */
3430 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
3431 {
3432 	struct amdgpu_device *adev;
3433 	struct drm_crtc *crtc;
3434 	struct drm_connector *connector;
3435 	struct drm_connector_list_iter iter;
3436 	int r;
3437 
3438 	if (dev == NULL || dev->dev_private == NULL) {
3439 		return -ENODEV;
3440 	}
3441 
3442 	adev = dev->dev_private;
3443 
3444 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3445 		return 0;
3446 
3447 	adev->in_suspend = true;
3448 	drm_kms_helper_poll_disable(dev);
3449 
3450 	if (fbcon)
3451 		amdgpu_fbdev_set_suspend(adev, 1);
3452 
3453 	cancel_delayed_work_sync(&adev->delayed_init_work);
3454 
3455 	if (!amdgpu_device_has_dc_support(adev)) {
3456 		/* turn off display hw */
3457 		drm_modeset_lock_all(dev);
3458 		drm_connector_list_iter_begin(dev, &iter);
3459 		drm_for_each_connector_iter(connector, &iter)
3460 			drm_helper_connector_dpms(connector,
3461 						  DRM_MODE_DPMS_OFF);
3462 		drm_connector_list_iter_end(&iter);
3463 		drm_modeset_unlock_all(dev);
3464 		/* unpin the front buffers and cursors */
3465 		list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3466 			struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3467 			struct drm_framebuffer *fb = crtc->primary->fb;
3468 			struct amdgpu_bo *robj;
3469 
3470 			if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3471 				struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3472 				r = amdgpu_bo_reserve(aobj, true);
3473 				if (r == 0) {
3474 					amdgpu_bo_unpin(aobj);
3475 					amdgpu_bo_unreserve(aobj);
3476 				}
3477 			}
3478 
3479 			if (fb == NULL || fb->obj[0] == NULL) {
3480 				continue;
3481 			}
3482 			robj = gem_to_amdgpu_bo(fb->obj[0]);
3483 			/* don't unpin kernel fb objects */
3484 			if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
3485 				r = amdgpu_bo_reserve(robj, true);
3486 				if (r == 0) {
3487 					amdgpu_bo_unpin(robj);
3488 					amdgpu_bo_unreserve(robj);
3489 				}
3490 			}
3491 		}
3492 	}
3493 
3494 	amdgpu_ras_suspend(adev);
3495 
3496 	r = amdgpu_device_ip_suspend_phase1(adev);
3497 
3498 	amdgpu_amdkfd_suspend(adev, !fbcon);
3499 
3500 	/* evict vram memory */
3501 	amdgpu_bo_evict_vram(adev);
3502 
3503 	amdgpu_fence_driver_suspend(adev);
3504 
3505 	r = amdgpu_device_ip_suspend_phase2(adev);
3506 
3507 	/* evict remaining vram memory
3508 	 * This second call to evict vram is to evict the gart page table
3509 	 * using the CPU.
3510 	 */
3511 	amdgpu_bo_evict_vram(adev);
3512 
3513 	return 0;
3514 }
3515 
3516 /**
3517  * amdgpu_device_resume - initiate device resume
3518  *
3519  * @dev: drm dev pointer
3520  * @fbcon: notify the fbdev of resume
3521  *
3522  * Bring the hw back to operating state (all asics).
3523  * Returns 0 for success or an error on failure.
3524  * Called at driver resume.
3525  */
3526 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
3527 {
3528 	struct drm_connector *connector;
3529 	struct drm_connector_list_iter iter;
3530 	struct amdgpu_device *adev = dev->dev_private;
3531 	struct drm_crtc *crtc;
3532 	int r = 0;
3533 
3534 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3535 		return 0;
3536 
3537 	/* post card */
3538 	if (amdgpu_device_need_post(adev)) {
3539 		r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
3540 		if (r)
3541 			DRM_ERROR("amdgpu asic init failed\n");
3542 	}
3543 
3544 	r = amdgpu_device_ip_resume(adev);
3545 	if (r) {
3546 		DRM_ERROR("amdgpu_device_ip_resume failed (%d).\n", r);
3547 		return r;
3548 	}
3549 	amdgpu_fence_driver_resume(adev);
3550 
3551 
3552 	r = amdgpu_device_ip_late_init(adev);
3553 	if (r)
3554 		return r;
3555 
3556 	queue_delayed_work(system_wq, &adev->delayed_init_work,
3557 			   msecs_to_jiffies(AMDGPU_RESUME_MS));
3558 
3559 	if (!amdgpu_device_has_dc_support(adev)) {
3560 		/* pin cursors */
3561 		list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3562 			struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3563 
3564 			if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3565 				struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3566 				r = amdgpu_bo_reserve(aobj, true);
3567 				if (r == 0) {
3568 					r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
3569 					if (r != 0)
3570 						DRM_ERROR("Failed to pin cursor BO (%d)\n", r);
3571 					amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
3572 					amdgpu_bo_unreserve(aobj);
3573 				}
3574 			}
3575 		}
3576 	}
3577 	r = amdgpu_amdkfd_resume(adev, !fbcon);
3578 	if (r)
3579 		return r;
3580 
3581 	/* Make sure IB tests flushed */
3582 	flush_delayed_work(&adev->delayed_init_work);
3583 
3584 	/* blat the mode back in */
3585 	if (fbcon) {
3586 		if (!amdgpu_device_has_dc_support(adev)) {
3587 			/* pre DCE11 */
3588 			drm_helper_resume_force_mode(dev);
3589 
3590 			/* turn on display hw */
3591 			drm_modeset_lock_all(dev);
3592 
3593 			drm_connector_list_iter_begin(dev, &iter);
3594 			drm_for_each_connector_iter(connector, &iter)
3595 				drm_helper_connector_dpms(connector,
3596 							  DRM_MODE_DPMS_ON);
3597 			drm_connector_list_iter_end(&iter);
3598 
3599 			drm_modeset_unlock_all(dev);
3600 		}
3601 		amdgpu_fbdev_set_suspend(adev, 0);
3602 	}
3603 
3604 	drm_kms_helper_poll_enable(dev);
3605 
3606 	amdgpu_ras_resume(adev);
3607 
3608 	/*
3609 	 * Most of the connector probing functions try to acquire runtime pm
3610 	 * refs to ensure that the GPU is powered on when connector polling is
3611 	 * performed. Since we're calling this from a runtime PM callback,
3612 	 * trying to acquire rpm refs will cause us to deadlock.
3613 	 *
3614 	 * Since we're guaranteed to be holding the rpm lock, it's safe to
3615 	 * temporarily disable the rpm helpers so this doesn't deadlock us.
3616 	 */
3617 #ifdef CONFIG_PM
3618 	dev->dev->power.disable_depth++;
3619 #endif
3620 	if (!amdgpu_device_has_dc_support(adev))
3621 		drm_helper_hpd_irq_event(dev);
3622 	else
3623 		drm_kms_helper_hotplug_event(dev);
3624 #ifdef CONFIG_PM
3625 	dev->dev->power.disable_depth--;
3626 #endif
3627 	adev->in_suspend = false;
3628 
3629 	return 0;
3630 }
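
/*
 * A minimal sketch of how the two entry points above are typically wired
 * into the driver's dev_pm_ops (see amdgpu_drv.c); the wrapper names and
 * bodies below are illustrative only:
 *
 *	static int amdgpu_pmops_suspend(struct device *dev)
 *	{
 *		struct drm_device *drm_dev = dev_get_drvdata(dev);
 *
 *		return amdgpu_device_suspend(drm_dev, true);
 *	}
 *
 *	static int amdgpu_pmops_resume(struct device *dev)
 *	{
 *		struct drm_device *drm_dev = dev_get_drvdata(dev);
 *
 *		return amdgpu_device_resume(drm_dev, true);
 *	}
 */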
3631 
3632 /**
3633  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3634  *
3635  * @adev: amdgpu_device pointer
3636  *
3637  * The list of all the hardware IPs that make up the asic is walked and
3638  * the check_soft_reset callbacks are run.  check_soft_reset determines
3639  * if the asic is still hung or not.
3640  * Returns true if any of the IPs are still in a hung state, false if not.
3641  */
3642 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
3643 {
3644 	int i;
3645 	bool asic_hang = false;
3646 
3647 	if (amdgpu_sriov_vf(adev))
3648 		return true;
3649 
3650 	if (amdgpu_asic_need_full_reset(adev))
3651 		return true;
3652 
3653 	for (i = 0; i < adev->num_ip_blocks; i++) {
3654 		if (!adev->ip_blocks[i].status.valid)
3655 			continue;
3656 		if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3657 			adev->ip_blocks[i].status.hang =
3658 				adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3659 		if (adev->ip_blocks[i].status.hang) {
3660 			DRM_INFO("IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
3661 			asic_hang = true;
3662 		}
3663 	}
3664 	return asic_hang;
3665 }
3666 
3667 /**
3668  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3669  *
3670  * @adev: amdgpu_device pointer
3671  *
3672  * The list of all the hardware IPs that make up the asic is walked and the
3673  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
3674  * handles any IP specific hardware or software state changes that are
3675  * necessary for a soft reset to succeed.
3676  * Returns 0 on success, negative error code on failure.
3677  */
3678 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
3679 {
3680 	int i, r = 0;
3681 
3682 	for (i = 0; i < adev->num_ip_blocks; i++) {
3683 		if (!adev->ip_blocks[i].status.valid)
3684 			continue;
3685 		if (adev->ip_blocks[i].status.hang &&
3686 		    adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3687 			r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
3688 			if (r)
3689 				return r;
3690 		}
3691 	}
3692 
3693 	return 0;
3694 }
3695 
3696 /**
3697  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3698  *
3699  * @adev: amdgpu_device pointer
3700  *
3701  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
3702  * reset is necessary to recover.
3703  * Returns true if a full asic reset is required, false if not.
3704  */
3705 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
3706 {
3707 	int i;
3708 
3709 	if (amdgpu_asic_need_full_reset(adev))
3710 		return true;
3711 
3712 	for (i = 0; i < adev->num_ip_blocks; i++) {
3713 		if (!adev->ip_blocks[i].status.valid)
3714 			continue;
3715 		if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3716 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3717 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
3718 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3719 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3720 			if (adev->ip_blocks[i].status.hang) {
3721 				DRM_INFO("Some blocks need full reset!\n");
3722 				return true;
3723 			}
3724 		}
3725 	}
3726 	return false;
3727 }
3728 
3729 /**
3730  * amdgpu_device_ip_soft_reset - do a soft reset
3731  *
3732  * @adev: amdgpu_device pointer
3733  *
3734  * The list of all the hardware IPs that make up the asic is walked and the
3735  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
3736  * IP specific hardware or software state changes that are necessary to soft
3737  * reset the IP.
3738  * Returns 0 on success, negative error code on failure.
3739  */
3740 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
3741 {
3742 	int i, r = 0;
3743 
3744 	for (i = 0; i < adev->num_ip_blocks; i++) {
3745 		if (!adev->ip_blocks[i].status.valid)
3746 			continue;
3747 		if (adev->ip_blocks[i].status.hang &&
3748 		    adev->ip_blocks[i].version->funcs->soft_reset) {
3749 			r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
3750 			if (r)
3751 				return r;
3752 		}
3753 	}
3754 
3755 	return 0;
3756 }
3757 
3758 /**
3759  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
3760  *
3761  * @adev: amdgpu_device pointer
3762  *
3763  * The list of all the hardware IPs that make up the asic is walked and the
3764  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
3765  * handles any IP specific hardware or software state changes that are
3766  * necessary after the IP has been soft reset.
3767  * Returns 0 on success, negative error code on failure.
3768  */
3769 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
3770 {
3771 	int i, r = 0;
3772 
3773 	for (i = 0; i < adev->num_ip_blocks; i++) {
3774 		if (!adev->ip_blocks[i].status.valid)
3775 			continue;
3776 		if (adev->ip_blocks[i].status.hang &&
3777 		    adev->ip_blocks[i].version->funcs->post_soft_reset)
3778 			r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
3779 		if (r)
3780 			return r;
3781 	}
3782 
3783 	return 0;
3784 }
3785 
3786 /**
3787  * amdgpu_device_recover_vram - Recover some VRAM contents
3788  *
3789  * @adev: amdgpu_device pointer
3790  *
3791  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
3792  * restore things like GPUVM page tables after a GPU reset where
3793  * the contents of VRAM might be lost.
3794  *
3795  * Returns:
3796  * 0 on success, negative error code on failure.
3797  */
3798 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
3799 {
3800 	struct dma_fence *fence = NULL, *next = NULL;
3801 	struct amdgpu_bo *shadow;
3802 	long r = 1, tmo;
3803 
3804 	if (amdgpu_sriov_runtime(adev))
3805 		tmo = msecs_to_jiffies(8000);
3806 	else
3807 		tmo = msecs_to_jiffies(100);
3808 
3809 	DRM_INFO("recover vram bo from shadow start\n");
3810 	mutex_lock(&adev->shadow_list_lock);
3811 	list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
3812 
3813 		/* No need to recover an evicted BO */
3814 		if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
3815 		    shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
3816 		    shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
3817 			continue;
3818 
3819 		r = amdgpu_bo_restore_shadow(shadow, &next);
3820 		if (r)
3821 			break;
3822 
3823 		if (fence) {
3824 			tmo = dma_fence_wait_timeout(fence, false, tmo);
3825 			dma_fence_put(fence);
3826 			fence = next;
3827 			if (tmo == 0) {
3828 				r = -ETIMEDOUT;
3829 				break;
3830 			} else if (tmo < 0) {
3831 				r = tmo;
3832 				break;
3833 			}
3834 		} else {
3835 			fence = next;
3836 		}
3837 	}
3838 	mutex_unlock(&adev->shadow_list_lock);
3839 
3840 	if (fence)
3841 		tmo = dma_fence_wait_timeout(fence, false, tmo);
3842 	dma_fence_put(fence);
3843 
3844 	if (r < 0 || tmo <= 0) {
3845 		DRM_ERROR("recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
3846 		return -EIO;
3847 	}
3848 
3849 	DRM_INFO("recover vram bo from shadow done\n");
3850 	return 0;
3851 }
3852 
3853 
3854 /**
3855  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
3856  *
3857  * @adev: amdgpu device pointer
3858  * @from_hypervisor: request from hypervisor
3859  *
3860  * Do a VF FLR and reinitialize the ASIC.
3861  * Returns 0 on success, negative error code on failure.
3862  */
3863 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
3864 				     bool from_hypervisor)
3865 {
3866 	int r;
3867 
3868 	if (from_hypervisor)
3869 		r = amdgpu_virt_request_full_gpu(adev, true);
3870 	else
3871 		r = amdgpu_virt_reset_gpu(adev);
3872 	if (r)
3873 		return r;
3874 
3875 	amdgpu_amdkfd_pre_reset(adev);
3876 
3877 	/* Resume IP prior to SMC */
3878 	r = amdgpu_device_ip_reinit_early_sriov(adev);
3879 	if (r)
3880 		goto error;
3881 
3882 	amdgpu_virt_init_data_exchange(adev);
3883 	/* we need to recover the GART prior to resuming SMC/CP/SDMA */
3884 	amdgpu_gtt_mgr_recover(&adev->mman.bdev.man[TTM_PL_TT]);
3885 
3886 	r = amdgpu_device_fw_loading(adev);
3887 	if (r)
3888 		return r;
3889 
3890 	/* now we are okay to resume SMC/CP/SDMA */
3891 	r = amdgpu_device_ip_reinit_late_sriov(adev);
3892 	if (r)
3893 		goto error;
3894 
3895 	amdgpu_irq_gpu_reset_resume_helper(adev);
3896 	r = amdgpu_ib_ring_tests(adev);
3897 	amdgpu_amdkfd_post_reset(adev);
3898 
3899 error:
3900 	amdgpu_virt_release_full_gpu(adev, true);
3901 	if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
3902 		amdgpu_inc_vram_lost(adev);
3903 		r = amdgpu_device_recover_vram(adev);
3904 	}
3905 
3906 	return r;
3907 }
3908 
3909 /**
3910  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
3911  *
3912  * @adev: amdgpu device pointer
3913  *
3914  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
3915  * a hung GPU.
3916  */
3917 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
3918 {
3919 	if (!amdgpu_device_ip_check_soft_reset(adev)) {
3920 		DRM_INFO("Timeout, but no hardware hang detected.\n");
3921 		return false;
3922 	}
3923 
3924 	if (amdgpu_gpu_recovery == 0)
3925 		goto disabled;
3926 
3927 	if (amdgpu_sriov_vf(adev))
3928 		return true;
3929 
3930 	if (amdgpu_gpu_recovery == -1) {
3931 		switch (adev->asic_type) {
3932 		case CHIP_BONAIRE:
3933 		case CHIP_HAWAII:
3934 		case CHIP_TOPAZ:
3935 		case CHIP_TONGA:
3936 		case CHIP_FIJI:
3937 		case CHIP_POLARIS10:
3938 		case CHIP_POLARIS11:
3939 		case CHIP_POLARIS12:
3940 		case CHIP_VEGAM:
3941 		case CHIP_VEGA20:
3942 		case CHIP_VEGA10:
3943 		case CHIP_VEGA12:
3944 		case CHIP_RAVEN:
3945 		case CHIP_ARCTURUS:
3946 		case CHIP_RENOIR:
3947 		case CHIP_NAVI10:
3948 		case CHIP_NAVI14:
3949 		case CHIP_NAVI12:
3950 		case CHIP_SIENNA_CICHLID:
3951 			break;
3952 		default:
3953 			goto disabled;
3954 		}
3955 	}
3956 
3957 	return true;
3958 
3959 disabled:
3960 	DRM_INFO("GPU recovery disabled.\n");
3961 	return false;
3962 }
3963 
3964 
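/**
 * amdgpu_device_pre_asic_reset - prepare a device for ASIC reset
 *
 * @adev: amdgpu_device pointer
 * @job: the job that triggered the reset, or NULL
 * @need_full_reset_arg: in/out flag, set when a full ASIC reset is required
 *
 * Forces completion of the in-flight hardware fences, increases the karma
 * of the guilty job and, on bare metal, attempts a soft reset of the hung
 * IP blocks first.  If the soft reset fails, or a full reset is already
 * requested, the IP blocks are suspended and *need_full_reset_arg is set
 * so that the caller performs a full ASIC reset.
 * Returns 0 on success, negative error code on failure.
 */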
3965 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
3966 					struct amdgpu_job *job,
3967 					bool *need_full_reset_arg)
3968 {
3969 	int i, r = 0;
3970 	bool need_full_reset  = *need_full_reset_arg;
3971 
3972 	amdgpu_debugfs_wait_dump(adev);
3973 
3974 	/* force completion of in-flight hw fences on all rings */
3975 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
3976 		struct amdgpu_ring *ring = adev->rings[i];
3977 
3978 		if (!ring || !ring->sched.thread)
3979 			continue;
3980 
3981 		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
3982 		amdgpu_fence_driver_force_completion(ring);
3983 	}
3984 
3985 	if (job)
3986 		drm_sched_increase_karma(&job->base);
3987 
3988 	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
3989 	if (!amdgpu_sriov_vf(adev)) {
3990 
3991 		if (!need_full_reset)
3992 			need_full_reset = amdgpu_device_ip_need_full_reset(adev);
3993 
3994 		if (!need_full_reset) {
3995 			amdgpu_device_ip_pre_soft_reset(adev);
3996 			r = amdgpu_device_ip_soft_reset(adev);
3997 			amdgpu_device_ip_post_soft_reset(adev);
3998 			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
3999 				DRM_INFO("soft reset failed, will fallback to full reset!\n");
4000 				need_full_reset = true;
4001 			}
4002 		}
4003 
4004 		if (need_full_reset)
4005 			r = amdgpu_device_ip_suspend(adev);
4006 
4007 		*need_full_reset_arg = need_full_reset;
4008 	}
4009 
4010 	return r;
4011 }
4012 
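/**
 * amdgpu_do_asic_reset - perform the ASIC reset and re-initialization
 *
 * @hive: XGMI hive the device belongs to, or NULL
 * @device_list_handle: list of devices to reset (all hive members for XGMI)
 * @need_full_reset_arg: in/out flag, whether a full ASIC reset is required
 *
 * If a full reset is required, resets every device in the list (in parallel
 * for XGMI hives), re-posts the cards via atombios and brings the IP blocks
 * back up in two phases, recovering the GTT, reloading firmware and
 * restoring shadowed VRAM buffers along the way.  Finishes with IB ring
 * tests on every device.
 * Returns 0 on success, -EAGAIN if the caller should retry with a full
 * reset, or another negative error code on failure.
 */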
4013 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
4014 			       struct list_head *device_list_handle,
4015 			       bool *need_full_reset_arg)
4016 {
4017 	struct amdgpu_device *tmp_adev = NULL;
4018 	bool need_full_reset = *need_full_reset_arg, vram_lost = false;
4019 	int r = 0;
4020 
4021 	/*
4022 	 * ASIC reset has to be done on all XGMI hive nodes ASAP
4023 	 * to allow proper links negotiation in FW (within 1 sec)
4024 	 */
4025 	if (need_full_reset) {
4026 		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4027 			/* For XGMI run all resets in parallel to speed up the process */
4028 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4029 				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4030 					r = -EALREADY;
4031 			} else
4032 				r = amdgpu_asic_reset(tmp_adev);
4033 
4034 			if (r) {
4035 				DRM_ERROR("ASIC reset failed with error %d for drm dev %s",
4036 					 r, tmp_adev->ddev->unique);
4037 				break;
4038 			}
4039 		}
4040 
4041 		/* For XGMI wait for all resets to complete before proceed */
4042 		if (!r) {
4043 			list_for_each_entry(tmp_adev, device_list_handle,
4044 					    gmc.xgmi.head) {
4045 				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4046 					flush_work(&tmp_adev->xgmi_reset_work);
4047 					r = tmp_adev->asic_reset_res;
4048 					if (r)
4049 						break;
4050 				}
4051 			}
4052 		}
4053 	}
4054 
4055 	if (!r && amdgpu_ras_intr_triggered()) {
4056 		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4057 			if (tmp_adev->mmhub.funcs &&
4058 			    tmp_adev->mmhub.funcs->reset_ras_error_count)
4059 				tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
4060 		}
4061 
4062 		amdgpu_ras_intr_cleared();
4063 	}
4064 
4065 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4066 		if (need_full_reset) {
4067 			/* post card */
4068 			if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context)) {
4069 				dev_warn(tmp_adev->dev, "asic atom init failed!");
4070 				r = -EAGAIN;
4071 				goto out;
4072 			}
4073 
4074 			if (!r) {
4075 				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4076 				r = amdgpu_device_ip_resume_phase1(tmp_adev);
4077 				if (r)
4078 					goto out;
4079 
4080 				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4081 				if (vram_lost) {
4082 					DRM_INFO("VRAM is lost due to GPU reset!\n");
4083 					amdgpu_inc_vram_lost(tmp_adev);
4084 				}
4085 
4086 				r = amdgpu_gtt_mgr_recover(
4087 					&tmp_adev->mman.bdev.man[TTM_PL_TT]);
4088 				if (r)
4089 					goto out;
4090 
4091 				r = amdgpu_device_fw_loading(tmp_adev);
4092 				if (r)
4093 					return r;
4094 
4095 				r = amdgpu_device_ip_resume_phase2(tmp_adev);
4096 				if (r)
4097 					goto out;
4098 
4099 				if (vram_lost)
4100 					amdgpu_device_fill_reset_magic(tmp_adev);
4101 
4102 				/*
4103 				 * Add this ASIC back as tracked, since the
4104 				 * reset has already completed successfully.
4105 				 */
4106 				amdgpu_register_gpu_instance(tmp_adev);
4107 
4108 				r = amdgpu_device_ip_late_init(tmp_adev);
4109 				if (r)
4110 					goto out;
4111 
4112 				amdgpu_fbdev_set_suspend(tmp_adev, 0);
4113 
4114 				/* must succeed. */
4115 				amdgpu_ras_resume(tmp_adev);
4116 
4117 				/* Update PSP FW topology after reset */
4118 				if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4119 					r = amdgpu_xgmi_update_topology(hive, tmp_adev);
4120 			}
4121 		}
4122 
4123 
4124 out:
4125 		if (!r) {
4126 			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4127 			r = amdgpu_ib_ring_tests(tmp_adev);
4128 			if (r) {
4129 				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4130 				r = amdgpu_device_ip_suspend(tmp_adev);
4131 				need_full_reset = true;
4132 				r = -EAGAIN;
4133 				goto end;
4134 			}
4135 		}
4136 
4137 		if (!r)
4138 			r = amdgpu_device_recover_vram(tmp_adev);
4139 		else
4140 			tmp_adev->asic_reset_res = r;
4141 	}
4142 
4143 end:
4144 	*need_full_reset_arg = need_full_reset;
4145 	return r;
4146 }
4147 
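/**
 * amdgpu_device_lock_adev - take the per-device GPU reset locks
 *
 * @adev: amdgpu_device pointer
 *
 * Atomically marks the device as being in GPU reset, takes the reset
 * semaphore for writing, bumps the reset counter and sets the MP1 state
 * expected by the selected reset method.
 * Returns false if a reset is already in progress for this device.
 */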
4148 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev)
4149 {
4150 	if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4151 		return false;
4152 
4153 	down_write(&adev->reset_sem);
4154 
4155 	atomic_inc(&adev->gpu_reset_counter);
4156 	switch (amdgpu_asic_reset_method(adev)) {
4157 	case AMD_RESET_METHOD_MODE1:
4158 		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4159 		break;
4160 	case AMD_RESET_METHOD_MODE2:
4161 		adev->mp1_state = PP_MP1_STATE_RESET;
4162 		break;
4163 	default:
4164 		adev->mp1_state = PP_MP1_STATE_NONE;
4165 		break;
4166 	}
4167 
4168 	return true;
4169 }
4170 
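/**
 * amdgpu_device_unlock_adev - release the per-device GPU reset locks
 *
 * @adev: amdgpu_device pointer
 *
 * Counterpart of amdgpu_device_lock_adev(): flushes pending VF errors,
 * restores the MP1 state, clears the in_gpu_reset flag and releases the
 * reset semaphore.
 */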
4171 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4172 {
4173 	amdgpu_vf_error_trans_all(adev);
4174 	adev->mp1_state = PP_MP1_STATE_NONE;
4175 	atomic_set(&adev->in_gpu_reset, 0);
4176 	up_write(&adev->reset_sem);
4177 }
4178 
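/**
 * amdgpu_device_resume_display_audio - resume the GPU's display audio device
 *
 * @adev: amdgpu_device pointer
 *
 * Re-enables runtime PM for the display audio device (the audio function at
 * devfn 1 on the GPU's bus) and resumes it, undoing
 * amdgpu_device_suspend_display_audio().
 */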
4179 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4180 {
4181 	struct pci_dev *p = NULL;
4182 
4183 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4184 			adev->pdev->bus->number, 1);
4185 	if (p) {
4186 		pm_runtime_enable(&(p->dev));
4187 		pm_runtime_resume(&(p->dev));
4188 	}
4189 }
4190 
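/**
 * amdgpu_device_suspend_display_audio - suspend the GPU's display audio device
 *
 * @adev: amdgpu_device pointer
 *
 * Before a BACO or mode1 reset, runtime-suspends the display audio device
 * (the audio function at devfn 1 on the GPU's bus) and disables its runtime
 * PM, so the reset does not change the audio hardware behind the audio
 * driver's back.
 * Returns 0 on success, negative error code on failure or timeout.
 */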
4191 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4192 {
4193 	enum amd_reset_method reset_method;
4194 	struct pci_dev *p = NULL;
4195 	u64 expires;
4196 
4197 	/*
4198 	 * For now, only BACO and mode1 reset are confirmed to
4199 	 * suffer from the audio issue if audio is not properly suspended.
4200 	 */
4201 	reset_method = amdgpu_asic_reset_method(adev);
4202 	if ((reset_method != AMD_RESET_METHOD_BACO) &&
4203 	     (reset_method != AMD_RESET_METHOD_MODE1))
4204 		return -EINVAL;
4205 
4206 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4207 			adev->pdev->bus->number, 1);
4208 	if (!p)
4209 		return -ENODEV;
4210 
4211 	expires = pm_runtime_autosuspend_expiration(&(p->dev));
4212 	if (!expires)
4213 		/*
4214 		 * If we cannot get the audio device autosuspend delay,
4215 		 * a fixed 4s interval is used. Since 3s is the audio
4216 		 * controller's default autosuspend delay setting, the 4s
4217 		 * used here is guaranteed to cover it.
4218 		 */
4219 		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4220 
4221 	while (!pm_runtime_status_suspended(&(p->dev))) {
4222 		if (!pm_runtime_suspend(&(p->dev)))
4223 			break;
4224 
4225 		if (expires < ktime_get_mono_fast_ns()) {
4226 			dev_warn(adev->dev, "failed to suspend display audio\n");
4227 			/* TODO: abort the succeeding gpu reset? */
4228 			return -ETIMEDOUT;
4229 		}
4230 	}
4231 
4232 	pm_runtime_disable(&(p->dev));
4233 
4234 	return 0;
4235 }
4236 
4237 /**
4238  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4239  *
4240  * @adev: amdgpu device pointer
4241  * @job: which job triggered the hang
4242  *
4243  * Attempt to reset the GPU if it has hung (all asics).
4244  * Attempt to do a soft reset or full reset and reinitialize the ASIC.
4245  * Returns 0 for success or an error on failure.
4246  */
4247 
4248 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4249 			      struct amdgpu_job *job)
4250 {
4251 	struct list_head device_list, *device_list_handle =  NULL;
4252 	bool need_full_reset = false;
4253 	bool job_signaled = false;
4254 	struct amdgpu_hive_info *hive = NULL;
4255 	struct amdgpu_device *tmp_adev = NULL;
4256 	int i, r = 0;
4257 	bool need_emergency_restart = false;
4258 	bool audio_suspended = false;
4259 
4260 	/*
4261 	 * Special case: RAS triggered and full reset isn't supported
4262 	 */
4263 	need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4264 
4265 	/*
4266 	 * Flush RAM to disk so that after reboot
4267 	 * the user can read the log and see why the system rebooted.
4268 	 */
4269 	if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
4270 		DRM_WARN("Emergency reboot.");
4271 
4272 		ksys_sync_helper();
4273 		emergency_restart();
4274 	}
4275 
4276 	dev_info(adev->dev, "GPU %s begin!\n",
4277 		need_emergency_restart ? "jobs stop" : "reset");
4278 
4279 	/*
4280 	 * Here we trylock to avoid a chain of resets executing, triggered
4281 	 * either by jobs on different adevs in the XGMI hive or by jobs on
4282 	 * different schedulers for the same device, while this TO handler is
4283 	 * running. We always reset all schedulers for a device and all devices
4284 	 * in the XGMI hive, so that should take care of them too.
4285 	 */
4286 	hive = amdgpu_get_xgmi_hive(adev, false);
4287 	if (hive) {
4288 		if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
4289 			DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4290 				job ? job->base.id : -1, hive->hive_id);
4291 			return 0;
4292 		}
4293 		mutex_lock(&hive->hive_lock);
4294 	}
4295 
4296 	/*
4297 	 * Build list of devices to reset.
4298 	 * In case we are in XGMI hive mode, resort the device list
4299 	 * to put adev in the 1st position.
4300 	 */
4301 	INIT_LIST_HEAD(&device_list);
4302 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
4303 		if (!hive)
4304 			return -ENODEV;
4305 		if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
4306 			list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
4307 		device_list_handle = &hive->device_list;
4308 	} else {
4309 		list_add_tail(&adev->gmc.xgmi.head, &device_list);
4310 		device_list_handle = &device_list;
4311 	}
4312 
4313 	/* block all schedulers and reset given job's ring */
4314 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4315 		if (!amdgpu_device_lock_adev(tmp_adev)) {
4316 			DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
4317 				  job ? job->base.id : -1);
4318 			r = 0;
4319 			goto skip_recovery;
4320 		}
4321 
4322 		/*
4323 		 * Try to put the audio codec into suspend state
4324 		 * before gpu reset started.
4325 		 *
4326 		 * The power domain of the graphics device is shared
4327 		 * with the AZ power domain. Without this, we may end
4328 		 * up changing the audio hardware from behind the audio
4329 		 * driver's back, which will trigger some audio codec
4330 		 * errors.
4331 		 */
4332 		if (!amdgpu_device_suspend_display_audio(tmp_adev))
4333 			audio_suspended = true;
4334 
4335 		amdgpu_ras_set_error_query_ready(tmp_adev, false);
4336 
4337 		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4338 
4339 		if (!amdgpu_sriov_vf(tmp_adev))
4340 			amdgpu_amdkfd_pre_reset(tmp_adev);
4341 
4342 		/*
4343 		 * Mark these ASICs to be reset as untracked first,
4344 		 * and add them back after the reset completes.
4345 		 */
4346 		amdgpu_unregister_gpu_instance(tmp_adev);
4347 
4348 		amdgpu_fbdev_set_suspend(tmp_adev, 1);
4349 
4350 		/* disable ras on ALL IPs */
4351 		if (!need_emergency_restart &&
4352 		      amdgpu_device_ip_need_full_reset(tmp_adev))
4353 			amdgpu_ras_suspend(tmp_adev);
4354 
4355 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4356 			struct amdgpu_ring *ring = tmp_adev->rings[i];
4357 
4358 			if (!ring || !ring->sched.thread)
4359 				continue;
4360 
4361 			drm_sched_stop(&ring->sched, job ? &job->base : NULL);
4362 
4363 			if (need_emergency_restart)
4364 				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
4365 		}
4366 	}
4367 
4368 	if (need_emergency_restart)
4369 		goto skip_sched_resume;
4370 
4371 	/*
4372 	 * Must check guilty signal here since after this point all old
4373 	 * HW fences are force signaled.
4374 	 *
4375 	 * job->base holds a reference to parent fence
4376 	 */
4377 	if (job && job->base.s_fence->parent &&
4378 	    dma_fence_is_signaled(job->base.s_fence->parent)) {
4379 		job_signaled = true;
4380 		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4381 		goto skip_hw_reset;
4382 	}
4383 
4384 retry:	/* Rest of adevs pre asic reset from XGMI hive. */
4385 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4386 		r = amdgpu_device_pre_asic_reset(tmp_adev,
4387 						 NULL,
4388 						 &need_full_reset);
4389 		/* TODO: Should we stop? */
4390 		if (r) {
4391 			DRM_ERROR("GPU pre asic reset failed with err %d for drm dev %s",
4392 				  r, tmp_adev->ddev->unique);
4393 			tmp_adev->asic_reset_res = r;
4394 		}
4395 	}
4396 
4397 	/* Actual ASIC resets if needed.*/
4398 	/* TODO Implement XGMI hive reset logic for SRIOV */
4399 	if (amdgpu_sriov_vf(adev)) {
4400 		r = amdgpu_device_reset_sriov(adev, job ? false : true);
4401 		if (r)
4402 			adev->asic_reset_res = r;
4403 	} else {
4404 		r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset);
4405 		if (r == -EAGAIN)
4406 			goto retry;
4407 	}
4408 
4409 skip_hw_reset:
4410 
4411 	/* Post ASIC reset for all devs. */
4412 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4413 
4414 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4415 			struct amdgpu_ring *ring = tmp_adev->rings[i];
4416 
4417 			if (!ring || !ring->sched.thread)
4418 				continue;
4419 
4420 			/* No point in resubmitting jobs if we didn't HW reset */
4421 			if (!tmp_adev->asic_reset_res && !job_signaled)
4422 				drm_sched_resubmit_jobs(&ring->sched);
4423 
4424 			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4425 		}
4426 
4427 		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4428 			drm_helper_resume_force_mode(tmp_adev->ddev);
4429 		}
4430 
4431 		tmp_adev->asic_reset_res = 0;
4432 
4433 		if (r) {
4434 			/* bad news, how to tell it to userspace ? */
4435 			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
4436 			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4437 		} else {
4438 			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
4439 		}
4440 	}
4441 
4442 skip_sched_resume:
4443 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4444 		/* unlock kfd: SRIOV would do it separately */
4445 		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
4446 			amdgpu_amdkfd_post_reset(tmp_adev);
4447 		if (audio_suspended)
4448 			amdgpu_device_resume_display_audio(tmp_adev);
4449 		amdgpu_device_unlock_adev(tmp_adev);
4450 	}
4451 
4452 skip_recovery:
4453 	if (hive) {
4454 		atomic_set(&hive->in_reset, 0);
4455 		mutex_unlock(&hive->hive_lock);
4456 	}
4457 
4458 	if (r)
4459 		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
4460 	return r;
4461 }
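
/*
 * A minimal sketch of a typical caller, assuming the job timeout path in
 * amdgpu_job.c; the handler shown below is illustrative only:
 *
 *	static void amdgpu_job_timedout(struct drm_sched_job *s_job)
 *	{
 *		struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
 *		struct amdgpu_job *job = to_amdgpu_job(s_job);
 *
 *		if (amdgpu_device_should_recover_gpu(ring->adev))
 *			amdgpu_device_gpu_recover(ring->adev, job);
 *	}
 */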
4462 
4463 /**
4464  * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
4465  *
4466  * @adev: amdgpu_device pointer
4467  *
4468  * Fetches and stores in the driver the PCIE capabilities (gen speed
4469  * and lanes) of the slot the device is in. Handles APUs and
4470  * virtualized environments where PCIE config space may not be available.
4471  */
4472 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
4473 {
4474 	struct pci_dev *pdev;
4475 	enum pci_bus_speed speed_cap, platform_speed_cap;
4476 	enum pcie_link_width platform_link_width;
4477 
4478 	if (amdgpu_pcie_gen_cap)
4479 		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
4480 
4481 	if (amdgpu_pcie_lane_cap)
4482 		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
4483 
4484 	/* covers APUs as well */
4485 	if (pci_is_root_bus(adev->pdev->bus)) {
4486 		if (adev->pm.pcie_gen_mask == 0)
4487 			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
4488 		if (adev->pm.pcie_mlw_mask == 0)
4489 			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
4490 		return;
4491 	}
4492 
4493 	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
4494 		return;
4495 
4496 	pcie_bandwidth_available(adev->pdev, NULL,
4497 				 &platform_speed_cap, &platform_link_width);
4498 
4499 	if (adev->pm.pcie_gen_mask == 0) {
4500 		/* asic caps */
4501 		pdev = adev->pdev;
4502 		speed_cap = pcie_get_speed_cap(pdev);
4503 		if (speed_cap == PCI_SPEED_UNKNOWN) {
4504 			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4505 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4506 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4507 		} else {
4508 			if (speed_cap == PCIE_SPEED_16_0GT)
4509 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4510 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4511 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4512 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
4513 			else if (speed_cap == PCIE_SPEED_8_0GT)
4514 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4515 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4516 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4517 			else if (speed_cap == PCIE_SPEED_5_0GT)
4518 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4519 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
4520 			else
4521 				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
4522 		}
4523 		/* platform caps */
4524 		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
4525 			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4526 						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4527 		} else {
4528 			if (platform_speed_cap == PCIE_SPEED_16_0GT)
4529 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4530 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4531 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4532 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
4533 			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
4534 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4535 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4536 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
4537 			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
4538 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4539 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4540 			else
4541 				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
4542 
4543 		}
4544 	}
4545 	if (adev->pm.pcie_mlw_mask == 0) {
4546 		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
4547 			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
4548 		} else {
4549 			switch (platform_link_width) {
4550 			case PCIE_LNK_X32:
4551 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
4552 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4553 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4554 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4555 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4556 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4557 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4558 				break;
4559 			case PCIE_LNK_X16:
4560 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4561 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4562 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4563 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4564 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4565 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4566 				break;
4567 			case PCIE_LNK_X12:
4568 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4569 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4570 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4571 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4572 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4573 				break;
4574 			case PCIE_LNK_X8:
4575 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4576 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4577 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4578 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4579 				break;
4580 			case PCIE_LNK_X4:
4581 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4582 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4583 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4584 				break;
4585 			case PCIE_LNK_X2:
4586 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4587 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4588 				break;
4589 			case PCIE_LNK_X1:
4590 				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
4591 				break;
4592 			default:
4593 				break;
4594 			}
4595 		}
4596 	}
4597 }
4598 
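/**
 * amdgpu_device_baco_enter - enter the BACO (Bus Active, Chip Off) state
 *
 * @dev: drm dev pointer
 *
 * Disables the doorbell interrupt when RAS is supported and requests BACO
 * entry through the DPM interface.
 * Returns 0 for success or an error on failure.
 */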
4599 int amdgpu_device_baco_enter(struct drm_device *dev)
4600 {
4601 	struct amdgpu_device *adev = dev->dev_private;
4602 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4603 
4604 	if (!amdgpu_device_supports_baco(adev->ddev))
4605 		return -ENOTSUPP;
4606 
4607 	if (ras && ras->supported)
4608 		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
4609 
4610 	return amdgpu_dpm_baco_enter(adev);
4611 }
4612 
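/**
 * amdgpu_device_baco_exit - exit the BACO (Bus Active, Chip Off) state
 *
 * @dev: drm dev pointer
 *
 * Requests BACO exit through the DPM interface and re-enables the doorbell
 * interrupt when RAS is supported.
 * Returns 0 for success or an error on failure.
 */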
4613 int amdgpu_device_baco_exit(struct drm_device *dev)
4614 {
4615 	struct amdgpu_device *adev = dev->dev_private;
4616 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4617 	int ret = 0;
4618 
4619 	if (!amdgpu_device_supports_baco(adev->ddev))
4620 		return -ENOTSUPP;
4621 
4622 	ret = amdgpu_dpm_baco_exit(adev);
4623 	if (ret)
4624 		return ret;
4625 
4626 	if (ras && ras->supported)
4627 		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
4628 
4629 	return 0;
4630 }
4631