xref: /openbmc/linux/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c (revision 83775e158a3d2dc437132ab357ed6c9214ef0ae9)
1 // SPDX-License-Identifier: GPL-2.0
2 /* Copyright (c) 2018-2019 The Linux Foundation. All rights reserved. */
3 
4 #include <linux/ascii85.h>
5 #include "msm_gem.h"
6 #include "a6xx_gpu.h"
7 #include "a6xx_gmu.h"
8 #include "a6xx_gpu_state.h"
9 #include "a6xx_gmu.xml.h"
10 
11 struct a6xx_gpu_state_obj {
12 	const void *handle;
13 	u32 *data;
14 };
15 
16 struct a6xx_gpu_state {
17 	struct msm_gpu_state base;
18 
19 	struct a6xx_gpu_state_obj *gmu_registers;
20 	int nr_gmu_registers;
21 
22 	struct a6xx_gpu_state_obj *registers;
23 	int nr_registers;
24 
25 	struct a6xx_gpu_state_obj *shaders;
26 	int nr_shaders;
27 
28 	struct a6xx_gpu_state_obj *clusters;
29 	int nr_clusters;
30 
31 	struct a6xx_gpu_state_obj *dbgahb_clusters;
32 	int nr_dbgahb_clusters;
33 
34 	struct a6xx_gpu_state_obj *indexed_regs;
35 	int nr_indexed_regs;
36 
37 	struct a6xx_gpu_state_obj *debugbus;
38 	int nr_debugbus;
39 
40 	struct a6xx_gpu_state_obj *vbif_debugbus;
41 
42 	struct a6xx_gpu_state_obj *cx_debugbus;
43 	int nr_cx_debugbus;
44 
45 	struct msm_gpu_state_bo *gmu_log;
46 	struct msm_gpu_state_bo *gmu_hfi;
47 	struct msm_gpu_state_bo *gmu_debug;
48 
49 	s32 hfi_queue_history[2][HFI_HISTORY_SZ];
50 
51 	struct list_head objs;
52 
53 	bool gpu_initialized;
54 };
55 
/*
 * Emit a crashdumper script entry that writes @val to register @reg.
 *
 * The entry is two qwords: the value, then a control word carrying @reg in
 * bits 44 and up with bit 21 and bit 0 set (presumably "write" and a count of
 * one dword - confirm against the hardware documentation).
 *
 * Returns the number of qwords consumed so the caller can advance its cursor.
 */
static inline int CRASHDUMP_WRITE(u64 *in, u32 reg, u32 val)
{
	const u64 control = ((u64) reg << 44) | (1 << 21) | 1;

	*in = val;
	*(in + 1) = control;

	return 2;
}
63 
/*
 * Emit a crashdumper script entry that reads @dwords registers starting at
 * @reg and stores them at the GPU address @target.
 *
 * The entry is two qwords: the target address, then a control word carrying
 * @reg in bits 44 and up OR'ed with the dword count.
 *
 * Returns the number of qwords consumed so the caller can advance its cursor.
 */
static inline int CRASHDUMP_READ(u64 *in, u32 reg, u32 dwords, u64 target)
{
	const u64 control = ((u64) reg << 44) | dwords;

	*in = target;
	*(in + 1) = control;

	return 2;
}
71 
/*
 * Terminate a crashdumper script with an all-zero two-qword entry.
 *
 * Returns the number of qwords consumed.
 */
static inline int CRASHDUMP_FINI(u64 *in)
{
	in[0] = in[1] = 0;

	return 2;
}
79 
80 struct a6xx_crashdumper {
81 	void *ptr;
82 	struct drm_gem_object *bo;
83 	u64 iova;
84 };
85 
86 struct a6xx_state_memobj {
87 	struct list_head node;
88 	unsigned long long data[];
89 };
90 
91 static void *state_kcalloc(struct a6xx_gpu_state *a6xx_state, int nr, size_t objsize)
92 {
93 	struct a6xx_state_memobj *obj =
94 		kvzalloc((nr * objsize) + sizeof(*obj), GFP_KERNEL);
95 
96 	if (!obj)
97 		return NULL;
98 
99 	list_add_tail(&obj->node, &a6xx_state->objs);
100 	return &obj->data;
101 }
102 
103 static void *state_kmemdup(struct a6xx_gpu_state *a6xx_state, void *src,
104 		size_t size)
105 {
106 	void *dst = state_kcalloc(a6xx_state, 1, size);
107 
108 	if (dst)
109 		memcpy(dst, src, size);
110 	return dst;
111 }
112 
/*
 * The crashdumper scratch region is 1MB in total: the first 8k holds the
 * script and everything after that offset is available for captured data
 */
#define A6XX_CD_DATA_OFFSET 8192
#define A6XX_CD_DATA_SIZE  (SZ_1M - A6XX_CD_DATA_OFFSET)
119 
120 static int a6xx_crashdumper_init(struct msm_gpu *gpu,
121 		struct a6xx_crashdumper *dumper)
122 {
123 	dumper->ptr = msm_gem_kernel_new(gpu->dev,
124 		SZ_1M, MSM_BO_WC, gpu->aspace,
125 		&dumper->bo, &dumper->iova);
126 
127 	if (!IS_ERR(dumper->ptr))
128 		msm_gem_object_set_name(dumper->bo, "crashdump");
129 
130 	return PTR_ERR_OR_ZERO(dumper->ptr);
131 }
132 
133 static int a6xx_crashdumper_run(struct msm_gpu *gpu,
134 		struct a6xx_crashdumper *dumper)
135 {
136 	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
137 	struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
138 	u32 val;
139 	int ret;
140 
141 	if (IS_ERR_OR_NULL(dumper->ptr))
142 		return -EINVAL;
143 
144 	if (!a6xx_gmu_sptprac_is_on(&a6xx_gpu->gmu))
145 		return -EINVAL;
146 
147 	/* Make sure all pending memory writes are posted */
148 	wmb();
149 
150 	gpu_write64(gpu, REG_A6XX_CP_CRASH_SCRIPT_BASE, dumper->iova);
151 
152 	gpu_write(gpu, REG_A6XX_CP_CRASH_DUMP_CNTL, 1);
153 
154 	ret = gpu_poll_timeout(gpu, REG_A6XX_CP_CRASH_DUMP_STATUS, val,
155 		val & 0x02, 100, 10000);
156 
157 	gpu_write(gpu, REG_A6XX_CP_CRASH_DUMP_CNTL, 0);
158 
159 	return ret;
160 }
161 
162 /* read a value from the GX debug bus */
163 static int debugbus_read(struct msm_gpu *gpu, u32 block, u32 offset,
164 		u32 *data)
165 {
166 	u32 reg = A6XX_DBGC_CFG_DBGBUS_SEL_D_PING_INDEX(offset) |
167 		A6XX_DBGC_CFG_DBGBUS_SEL_D_PING_BLK_SEL(block);
168 
169 	gpu_write(gpu, REG_A6XX_DBGC_CFG_DBGBUS_SEL_A, reg);
170 	gpu_write(gpu, REG_A6XX_DBGC_CFG_DBGBUS_SEL_B, reg);
171 	gpu_write(gpu, REG_A6XX_DBGC_CFG_DBGBUS_SEL_C, reg);
172 	gpu_write(gpu, REG_A6XX_DBGC_CFG_DBGBUS_SEL_D, reg);
173 
174 	/* Wait 1 us to make sure the data is flowing */
175 	udelay(1);
176 
177 	data[0] = gpu_read(gpu, REG_A6XX_DBGC_CFG_DBGBUS_TRACE_BUF2);
178 	data[1] = gpu_read(gpu, REG_A6XX_DBGC_CFG_DBGBUS_TRACE_BUF1);
179 
180 	return 2;
181 }
182 
/*
 * Accessors for the CX debug controller.  @base is the ioremapped register
 * window and @reg is a dword register offset, hence the shift by two to turn
 * it into a byte offset.
 */
#define cxdbg_write(base, reg, value) \
	msm_writel((value), (base) + ((reg) << 2))

#define cxdbg_read(base, reg) \
	msm_readl((base) + ((reg) << 2))
188 
189 /* read a value from the CX debug bus */
190 static int cx_debugbus_read(void __iomem *cxdbg, u32 block, u32 offset,
191 		u32 *data)
192 {
193 	u32 reg = A6XX_CX_DBGC_CFG_DBGBUS_SEL_A_PING_INDEX(offset) |
194 		A6XX_CX_DBGC_CFG_DBGBUS_SEL_A_PING_BLK_SEL(block);
195 
196 	cxdbg_write(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_SEL_A, reg);
197 	cxdbg_write(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_SEL_B, reg);
198 	cxdbg_write(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_SEL_C, reg);
199 	cxdbg_write(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_SEL_D, reg);
200 
201 	/* Wait 1 us to make sure the data is flowing */
202 	udelay(1);
203 
204 	data[0] = cxdbg_read(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_TRACE_BUF2);
205 	data[1] = cxdbg_read(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_TRACE_BUF1);
206 
207 	return 2;
208 }
209 
210 /* Read a chunk of data from the VBIF debug bus */
211 static int vbif_debugbus_read(struct msm_gpu *gpu, u32 ctrl0, u32 ctrl1,
212 		u32 reg, int count, u32 *data)
213 {
214 	int i;
215 
216 	gpu_write(gpu, ctrl0, reg);
217 
218 	for (i = 0; i < count; i++) {
219 		gpu_write(gpu, ctrl1, i);
220 		data[i] = gpu_read(gpu, REG_A6XX_VBIF_TEST_BUS_OUT);
221 	}
222 
223 	return count;
224 }
225 
/* Number of blocks of each kind on the VBIF debug bus */
#define AXI_ARB_BLOCKS 2
#define XIN_AXI_BLOCKS 5
#define XIN_CORE_BLOCKS 4

/*
 * Total dwords in a VBIF debug bus capture: 16 per AXI arbiter block,
 * 18 per XIN AXI block and 12 per XIN core block
 */
#define VBIF_DEBUGBUS_BLOCK_SIZE \
	((AXI_ARB_BLOCKS * 16) + \
	 (XIN_AXI_BLOCKS * 18) + \
	 (XIN_CORE_BLOCKS * 12))
234 
235 static void a6xx_get_vbif_debugbus_block(struct msm_gpu *gpu,
236 		struct a6xx_gpu_state *a6xx_state,
237 		struct a6xx_gpu_state_obj *obj)
238 {
239 	u32 clk, *ptr;
240 	int i;
241 
242 	obj->data = state_kcalloc(a6xx_state, VBIF_DEBUGBUS_BLOCK_SIZE,
243 		sizeof(u32));
244 	if (!obj->data)
245 		return;
246 
247 	obj->handle = NULL;
248 
249 	/* Get the current clock setting */
250 	clk = gpu_read(gpu, REG_A6XX_VBIF_CLKON);
251 
252 	/* Force on the bus so we can read it */
253 	gpu_write(gpu, REG_A6XX_VBIF_CLKON,
254 		clk | A6XX_VBIF_CLKON_FORCE_ON_TESTBUS);
255 
256 	/* We will read from BUS2 first, so disable BUS1 */
257 	gpu_write(gpu, REG_A6XX_VBIF_TEST_BUS1_CTRL0, 0);
258 
259 	/* Enable the VBIF bus for reading */
260 	gpu_write(gpu, REG_A6XX_VBIF_TEST_BUS_OUT_CTRL, 1);
261 
262 	ptr = obj->data;
263 
264 	for (i = 0; i < AXI_ARB_BLOCKS; i++)
265 		ptr += vbif_debugbus_read(gpu,
266 			REG_A6XX_VBIF_TEST_BUS2_CTRL0,
267 			REG_A6XX_VBIF_TEST_BUS2_CTRL1,
268 			1 << (i + 16), 16, ptr);
269 
270 	for (i = 0; i < XIN_AXI_BLOCKS; i++)
271 		ptr += vbif_debugbus_read(gpu,
272 			REG_A6XX_VBIF_TEST_BUS2_CTRL0,
273 			REG_A6XX_VBIF_TEST_BUS2_CTRL1,
274 			1 << i, 18, ptr);
275 
276 	/* Stop BUS2 so we can turn on BUS1 */
277 	gpu_write(gpu, REG_A6XX_VBIF_TEST_BUS2_CTRL0, 0);
278 
279 	for (i = 0; i < XIN_CORE_BLOCKS; i++)
280 		ptr += vbif_debugbus_read(gpu,
281 			REG_A6XX_VBIF_TEST_BUS1_CTRL0,
282 			REG_A6XX_VBIF_TEST_BUS1_CTRL1,
283 			1 << i, 12, ptr);
284 
285 	/* Restore the VBIF clock setting */
286 	gpu_write(gpu, REG_A6XX_VBIF_CLKON, clk);
287 }
288 
289 static void a6xx_get_debugbus_block(struct msm_gpu *gpu,
290 		struct a6xx_gpu_state *a6xx_state,
291 		const struct a6xx_debugbus_block *block,
292 		struct a6xx_gpu_state_obj *obj)
293 {
294 	int i;
295 	u32 *ptr;
296 
297 	obj->data = state_kcalloc(a6xx_state, block->count, sizeof(u64));
298 	if (!obj->data)
299 		return;
300 
301 	obj->handle = block;
302 
303 	for (ptr = obj->data, i = 0; i < block->count; i++)
304 		ptr += debugbus_read(gpu, block->id, i, ptr);
305 }
306 
307 static void a6xx_get_cx_debugbus_block(void __iomem *cxdbg,
308 		struct a6xx_gpu_state *a6xx_state,
309 		const struct a6xx_debugbus_block *block,
310 		struct a6xx_gpu_state_obj *obj)
311 {
312 	int i;
313 	u32 *ptr;
314 
315 	obj->data = state_kcalloc(a6xx_state, block->count, sizeof(u64));
316 	if (!obj->data)
317 		return;
318 
319 	obj->handle = block;
320 
321 	for (ptr = obj->data, i = 0; i < block->count; i++)
322 		ptr += cx_debugbus_read(cxdbg, block->id, i, ptr);
323 }
324 
325 static void a6xx_get_debugbus(struct msm_gpu *gpu,
326 		struct a6xx_gpu_state *a6xx_state)
327 {
328 	struct resource *res;
329 	void __iomem *cxdbg = NULL;
330 	int nr_debugbus_blocks;
331 
332 	/* Set up the GX debug bus */
333 
334 	gpu_write(gpu, REG_A6XX_DBGC_CFG_DBGBUS_CNTLT,
335 		A6XX_DBGC_CFG_DBGBUS_CNTLT_SEGT(0xf));
336 
337 	gpu_write(gpu, REG_A6XX_DBGC_CFG_DBGBUS_CNTLM,
338 		A6XX_DBGC_CFG_DBGBUS_CNTLM_ENABLE(0xf));
339 
340 	gpu_write(gpu, REG_A6XX_DBGC_CFG_DBGBUS_IVTL_0, 0);
341 	gpu_write(gpu, REG_A6XX_DBGC_CFG_DBGBUS_IVTL_1, 0);
342 	gpu_write(gpu, REG_A6XX_DBGC_CFG_DBGBUS_IVTL_2, 0);
343 	gpu_write(gpu, REG_A6XX_DBGC_CFG_DBGBUS_IVTL_3, 0);
344 
345 	gpu_write(gpu, REG_A6XX_DBGC_CFG_DBGBUS_BYTEL_0, 0x76543210);
346 	gpu_write(gpu, REG_A6XX_DBGC_CFG_DBGBUS_BYTEL_1, 0xFEDCBA98);
347 
348 	gpu_write(gpu, REG_A6XX_DBGC_CFG_DBGBUS_MASKL_0, 0);
349 	gpu_write(gpu, REG_A6XX_DBGC_CFG_DBGBUS_MASKL_1, 0);
350 	gpu_write(gpu, REG_A6XX_DBGC_CFG_DBGBUS_MASKL_2, 0);
351 	gpu_write(gpu, REG_A6XX_DBGC_CFG_DBGBUS_MASKL_3, 0);
352 
353 	/* Set up the CX debug bus - it lives elsewhere in the system so do a
354 	 * temporary ioremap for the registers
355 	 */
356 	res = platform_get_resource_byname(gpu->pdev, IORESOURCE_MEM,
357 			"cx_dbgc");
358 
359 	if (res)
360 		cxdbg = ioremap(res->start, resource_size(res));
361 
362 	if (cxdbg) {
363 		cxdbg_write(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_CNTLT,
364 			A6XX_DBGC_CFG_DBGBUS_CNTLT_SEGT(0xf));
365 
366 		cxdbg_write(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_CNTLM,
367 			A6XX_DBGC_CFG_DBGBUS_CNTLM_ENABLE(0xf));
368 
369 		cxdbg_write(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_IVTL_0, 0);
370 		cxdbg_write(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_IVTL_1, 0);
371 		cxdbg_write(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_IVTL_2, 0);
372 		cxdbg_write(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_IVTL_3, 0);
373 
374 		cxdbg_write(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_BYTEL_0,
375 			0x76543210);
376 		cxdbg_write(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_BYTEL_1,
377 			0xFEDCBA98);
378 
379 		cxdbg_write(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_MASKL_0, 0);
380 		cxdbg_write(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_MASKL_1, 0);
381 		cxdbg_write(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_MASKL_2, 0);
382 		cxdbg_write(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_MASKL_3, 0);
383 	}
384 
385 	nr_debugbus_blocks = ARRAY_SIZE(a6xx_debugbus_blocks) +
386 		(a6xx_has_gbif(to_adreno_gpu(gpu)) ? 1 : 0);
387 
388 	if (adreno_is_a650_family(to_adreno_gpu(gpu)))
389 		nr_debugbus_blocks += ARRAY_SIZE(a650_debugbus_blocks);
390 
391 	a6xx_state->debugbus = state_kcalloc(a6xx_state, nr_debugbus_blocks,
392 			sizeof(*a6xx_state->debugbus));
393 
394 	if (a6xx_state->debugbus) {
395 		int i;
396 
397 		for (i = 0; i < ARRAY_SIZE(a6xx_debugbus_blocks); i++)
398 			a6xx_get_debugbus_block(gpu,
399 				a6xx_state,
400 				&a6xx_debugbus_blocks[i],
401 				&a6xx_state->debugbus[i]);
402 
403 		a6xx_state->nr_debugbus = ARRAY_SIZE(a6xx_debugbus_blocks);
404 
405 		/*
406 		 * GBIF has same debugbus as of other GPU blocks, fall back to
407 		 * default path if GPU uses GBIF, also GBIF uses exactly same
408 		 * ID as of VBIF.
409 		 */
410 		if (a6xx_has_gbif(to_adreno_gpu(gpu))) {
411 			a6xx_get_debugbus_block(gpu, a6xx_state,
412 				&a6xx_gbif_debugbus_block,
413 				&a6xx_state->debugbus[i]);
414 
415 			a6xx_state->nr_debugbus += 1;
416 		}
417 
418 
419 		if (adreno_is_a650_family(to_adreno_gpu(gpu))) {
420 			for (i = 0; i < ARRAY_SIZE(a650_debugbus_blocks); i++)
421 				a6xx_get_debugbus_block(gpu,
422 					a6xx_state,
423 					&a650_debugbus_blocks[i],
424 					&a6xx_state->debugbus[i]);
425 		}
426 	}
427 
428 	/*  Dump the VBIF debugbus on applicable targets */
429 	if (!a6xx_has_gbif(to_adreno_gpu(gpu))) {
430 		a6xx_state->vbif_debugbus =
431 			state_kcalloc(a6xx_state, 1,
432 					sizeof(*a6xx_state->vbif_debugbus));
433 
434 		if (a6xx_state->vbif_debugbus)
435 			a6xx_get_vbif_debugbus_block(gpu, a6xx_state,
436 					a6xx_state->vbif_debugbus);
437 	}
438 
439 	if (cxdbg) {
440 		a6xx_state->cx_debugbus =
441 			state_kcalloc(a6xx_state,
442 			ARRAY_SIZE(a6xx_cx_debugbus_blocks),
443 			sizeof(*a6xx_state->cx_debugbus));
444 
445 		if (a6xx_state->cx_debugbus) {
446 			int i;
447 
448 			for (i = 0; i < ARRAY_SIZE(a6xx_cx_debugbus_blocks); i++)
449 				a6xx_get_cx_debugbus_block(cxdbg,
450 					a6xx_state,
451 					&a6xx_cx_debugbus_blocks[i],
452 					&a6xx_state->cx_debugbus[i]);
453 
454 			a6xx_state->nr_cx_debugbus =
455 				ARRAY_SIZE(a6xx_cx_debugbus_blocks);
456 		}
457 
458 		iounmap(cxdbg);
459 	}
460 }
461 
/*
 * Number of registers in the inclusive (first, last) pair that starts at
 * index @idx of the register list @regs
 */
#define RANGE(regs, idx) ((regs)[(idx) + 1] - (regs)[(idx)] + 1)
463 
464 /* Read a data cluster from behind the AHB aperture */
465 static void a6xx_get_dbgahb_cluster(struct msm_gpu *gpu,
466 		struct a6xx_gpu_state *a6xx_state,
467 		const struct a6xx_dbgahb_cluster *dbgahb,
468 		struct a6xx_gpu_state_obj *obj,
469 		struct a6xx_crashdumper *dumper)
470 {
471 	u64 *in = dumper->ptr;
472 	u64 out = dumper->iova + A6XX_CD_DATA_OFFSET;
473 	size_t datasize;
474 	int i, regcount = 0;
475 
476 	for (i = 0; i < A6XX_NUM_CONTEXTS; i++) {
477 		int j;
478 
479 		in += CRASHDUMP_WRITE(in, REG_A6XX_HLSQ_DBG_READ_SEL,
480 			(dbgahb->statetype + i * 2) << 8);
481 
482 		for (j = 0; j < dbgahb->count; j += 2) {
483 			int count = RANGE(dbgahb->registers, j);
484 			u32 offset = REG_A6XX_HLSQ_DBG_AHB_READ_APERTURE +
485 				dbgahb->registers[j] - (dbgahb->base >> 2);
486 
487 			in += CRASHDUMP_READ(in, offset, count, out);
488 
489 			out += count * sizeof(u32);
490 
491 			if (i == 0)
492 				regcount += count;
493 		}
494 	}
495 
496 	CRASHDUMP_FINI(in);
497 
498 	datasize = regcount * A6XX_NUM_CONTEXTS * sizeof(u32);
499 
500 	if (WARN_ON(datasize > A6XX_CD_DATA_SIZE))
501 		return;
502 
503 	if (a6xx_crashdumper_run(gpu, dumper))
504 		return;
505 
506 	obj->handle = dbgahb;
507 	obj->data = state_kmemdup(a6xx_state, dumper->ptr + A6XX_CD_DATA_OFFSET,
508 		datasize);
509 }
510 
511 static void a6xx_get_dbgahb_clusters(struct msm_gpu *gpu,
512 		struct a6xx_gpu_state *a6xx_state,
513 		struct a6xx_crashdumper *dumper)
514 {
515 	int i;
516 
517 	a6xx_state->dbgahb_clusters = state_kcalloc(a6xx_state,
518 		ARRAY_SIZE(a6xx_dbgahb_clusters),
519 		sizeof(*a6xx_state->dbgahb_clusters));
520 
521 	if (!a6xx_state->dbgahb_clusters)
522 		return;
523 
524 	a6xx_state->nr_dbgahb_clusters = ARRAY_SIZE(a6xx_dbgahb_clusters);
525 
526 	for (i = 0; i < ARRAY_SIZE(a6xx_dbgahb_clusters); i++)
527 		a6xx_get_dbgahb_cluster(gpu, a6xx_state,
528 			&a6xx_dbgahb_clusters[i],
529 			&a6xx_state->dbgahb_clusters[i], dumper);
530 }
531 
532 /* Read a data cluster from the CP aperture with the crashdumper */
533 static void a6xx_get_cluster(struct msm_gpu *gpu,
534 		struct a6xx_gpu_state *a6xx_state,
535 		const struct a6xx_cluster *cluster,
536 		struct a6xx_gpu_state_obj *obj,
537 		struct a6xx_crashdumper *dumper)
538 {
539 	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
540 	u64 *in = dumper->ptr;
541 	u64 out = dumper->iova + A6XX_CD_DATA_OFFSET;
542 	size_t datasize;
543 	int i, regcount = 0;
544 	u32 id = cluster->id;
545 
546 	/* Skip registers that are not present on older generation */
547 	if (!adreno_is_a660_family(adreno_gpu) &&
548 			cluster->registers == a660_fe_cluster)
549 		return;
550 
551 	if (adreno_is_a650_family(adreno_gpu) &&
552 			cluster->registers == a6xx_ps_cluster)
553 		id = CLUSTER_VPC_PS;
554 
555 	/* Some clusters need a selector register to be programmed too */
556 	if (cluster->sel_reg)
557 		in += CRASHDUMP_WRITE(in, cluster->sel_reg, cluster->sel_val);
558 
559 	for (i = 0; i < A6XX_NUM_CONTEXTS; i++) {
560 		int j;
561 
562 		in += CRASHDUMP_WRITE(in, REG_A6XX_CP_APERTURE_CNTL_CD,
563 			(id << 8) | (i << 4) | i);
564 
565 		for (j = 0; j < cluster->count; j += 2) {
566 			int count = RANGE(cluster->registers, j);
567 
568 			in += CRASHDUMP_READ(in, cluster->registers[j],
569 				count, out);
570 
571 			out += count * sizeof(u32);
572 
573 			if (i == 0)
574 				regcount += count;
575 		}
576 	}
577 
578 	CRASHDUMP_FINI(in);
579 
580 	datasize = regcount * A6XX_NUM_CONTEXTS * sizeof(u32);
581 
582 	if (WARN_ON(datasize > A6XX_CD_DATA_SIZE))
583 		return;
584 
585 	if (a6xx_crashdumper_run(gpu, dumper))
586 		return;
587 
588 	obj->handle = cluster;
589 	obj->data = state_kmemdup(a6xx_state, dumper->ptr + A6XX_CD_DATA_OFFSET,
590 		datasize);
591 }
592 
593 static void a6xx_get_clusters(struct msm_gpu *gpu,
594 		struct a6xx_gpu_state *a6xx_state,
595 		struct a6xx_crashdumper *dumper)
596 {
597 	int i;
598 
599 	a6xx_state->clusters = state_kcalloc(a6xx_state,
600 		ARRAY_SIZE(a6xx_clusters), sizeof(*a6xx_state->clusters));
601 
602 	if (!a6xx_state->clusters)
603 		return;
604 
605 	a6xx_state->nr_clusters = ARRAY_SIZE(a6xx_clusters);
606 
607 	for (i = 0; i < ARRAY_SIZE(a6xx_clusters); i++)
608 		a6xx_get_cluster(gpu, a6xx_state, &a6xx_clusters[i],
609 			&a6xx_state->clusters[i], dumper);
610 }
611 
612 /* Read a shader / debug block from the HLSQ aperture with the crashdumper */
613 static void a6xx_get_shader_block(struct msm_gpu *gpu,
614 		struct a6xx_gpu_state *a6xx_state,
615 		const struct a6xx_shader_block *block,
616 		struct a6xx_gpu_state_obj *obj,
617 		struct a6xx_crashdumper *dumper)
618 {
619 	u64 *in = dumper->ptr;
620 	size_t datasize = block->size * A6XX_NUM_SHADER_BANKS * sizeof(u32);
621 	int i;
622 
623 	if (WARN_ON(datasize > A6XX_CD_DATA_SIZE))
624 		return;
625 
626 	for (i = 0; i < A6XX_NUM_SHADER_BANKS; i++) {
627 		in += CRASHDUMP_WRITE(in, REG_A6XX_HLSQ_DBG_READ_SEL,
628 			(block->type << 8) | i);
629 
630 		in += CRASHDUMP_READ(in, REG_A6XX_HLSQ_DBG_AHB_READ_APERTURE,
631 			block->size, dumper->iova + A6XX_CD_DATA_OFFSET);
632 	}
633 
634 	CRASHDUMP_FINI(in);
635 
636 	if (a6xx_crashdumper_run(gpu, dumper))
637 		return;
638 
639 	obj->handle = block;
640 	obj->data = state_kmemdup(a6xx_state, dumper->ptr + A6XX_CD_DATA_OFFSET,
641 		datasize);
642 }
643 
644 static void a6xx_get_shaders(struct msm_gpu *gpu,
645 		struct a6xx_gpu_state *a6xx_state,
646 		struct a6xx_crashdumper *dumper)
647 {
648 	int i;
649 
650 	a6xx_state->shaders = state_kcalloc(a6xx_state,
651 		ARRAY_SIZE(a6xx_shader_blocks), sizeof(*a6xx_state->shaders));
652 
653 	if (!a6xx_state->shaders)
654 		return;
655 
656 	a6xx_state->nr_shaders = ARRAY_SIZE(a6xx_shader_blocks);
657 
658 	for (i = 0; i < ARRAY_SIZE(a6xx_shader_blocks); i++)
659 		a6xx_get_shader_block(gpu, a6xx_state, &a6xx_shader_blocks[i],
660 			&a6xx_state->shaders[i], dumper);
661 }
662 
663 /* Read registers from behind the HLSQ aperture with the crashdumper */
664 static void a6xx_get_crashdumper_hlsq_registers(struct msm_gpu *gpu,
665 		struct a6xx_gpu_state *a6xx_state,
666 		const struct a6xx_registers *regs,
667 		struct a6xx_gpu_state_obj *obj,
668 		struct a6xx_crashdumper *dumper)
669 
670 {
671 	u64 *in = dumper->ptr;
672 	u64 out = dumper->iova + A6XX_CD_DATA_OFFSET;
673 	int i, regcount = 0;
674 
675 	in += CRASHDUMP_WRITE(in, REG_A6XX_HLSQ_DBG_READ_SEL, regs->val1);
676 
677 	for (i = 0; i < regs->count; i += 2) {
678 		u32 count = RANGE(regs->registers, i);
679 		u32 offset = REG_A6XX_HLSQ_DBG_AHB_READ_APERTURE +
680 			regs->registers[i] - (regs->val0 >> 2);
681 
682 		in += CRASHDUMP_READ(in, offset, count, out);
683 
684 		out += count * sizeof(u32);
685 		regcount += count;
686 	}
687 
688 	CRASHDUMP_FINI(in);
689 
690 	if (WARN_ON((regcount * sizeof(u32)) > A6XX_CD_DATA_SIZE))
691 		return;
692 
693 	if (a6xx_crashdumper_run(gpu, dumper))
694 		return;
695 
696 	obj->handle = regs;
697 	obj->data = state_kmemdup(a6xx_state, dumper->ptr + A6XX_CD_DATA_OFFSET,
698 		regcount * sizeof(u32));
699 }
700 
701 /* Read a block of registers using the crashdumper */
702 static void a6xx_get_crashdumper_registers(struct msm_gpu *gpu,
703 		struct a6xx_gpu_state *a6xx_state,
704 		const struct a6xx_registers *regs,
705 		struct a6xx_gpu_state_obj *obj,
706 		struct a6xx_crashdumper *dumper)
707 
708 {
709 	u64 *in = dumper->ptr;
710 	u64 out = dumper->iova + A6XX_CD_DATA_OFFSET;
711 	int i, regcount = 0;
712 
713 	/* Skip unsupported registers on older generations */
714 	if (!adreno_is_a660_family(to_adreno_gpu(gpu)) &&
715 			(regs->registers == a660_registers))
716 		return;
717 
718 	/* Some blocks might need to program a selector register first */
719 	if (regs->val0)
720 		in += CRASHDUMP_WRITE(in, regs->val0, regs->val1);
721 
722 	for (i = 0; i < regs->count; i += 2) {
723 		u32 count = RANGE(regs->registers, i);
724 
725 		in += CRASHDUMP_READ(in, regs->registers[i], count, out);
726 
727 		out += count * sizeof(u32);
728 		regcount += count;
729 	}
730 
731 	CRASHDUMP_FINI(in);
732 
733 	if (WARN_ON((regcount * sizeof(u32)) > A6XX_CD_DATA_SIZE))
734 		return;
735 
736 	if (a6xx_crashdumper_run(gpu, dumper))
737 		return;
738 
739 	obj->handle = regs;
740 	obj->data = state_kmemdup(a6xx_state, dumper->ptr + A6XX_CD_DATA_OFFSET,
741 		regcount * sizeof(u32));
742 }
743 
744 /* Read a block of registers via AHB */
745 static void a6xx_get_ahb_gpu_registers(struct msm_gpu *gpu,
746 		struct a6xx_gpu_state *a6xx_state,
747 		const struct a6xx_registers *regs,
748 		struct a6xx_gpu_state_obj *obj)
749 {
750 	int i, regcount = 0, index = 0;
751 
752 	/* Skip unsupported registers on older generations */
753 	if (!adreno_is_a660_family(to_adreno_gpu(gpu)) &&
754 			(regs->registers == a660_registers))
755 		return;
756 
757 	for (i = 0; i < regs->count; i += 2)
758 		regcount += RANGE(regs->registers, i);
759 
760 	obj->handle = (const void *) regs;
761 	obj->data = state_kcalloc(a6xx_state, regcount, sizeof(u32));
762 	if (!obj->data)
763 		return;
764 
765 	for (i = 0; i < regs->count; i += 2) {
766 		u32 count = RANGE(regs->registers, i);
767 		int j;
768 
769 		for (j = 0; j < count; j++)
770 			obj->data[index++] = gpu_read(gpu,
771 				regs->registers[i] + j);
772 	}
773 }
774 
775 /* Read a block of GMU registers */
776 static void _a6xx_get_gmu_registers(struct msm_gpu *gpu,
777 		struct a6xx_gpu_state *a6xx_state,
778 		const struct a6xx_registers *regs,
779 		struct a6xx_gpu_state_obj *obj,
780 		bool rscc)
781 {
782 	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
783 	struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
784 	struct a6xx_gmu *gmu = &a6xx_gpu->gmu;
785 	int i, regcount = 0, index = 0;
786 
787 	for (i = 0; i < regs->count; i += 2)
788 		regcount += RANGE(regs->registers, i);
789 
790 	obj->handle = (const void *) regs;
791 	obj->data = state_kcalloc(a6xx_state, regcount, sizeof(u32));
792 	if (!obj->data)
793 		return;
794 
795 	for (i = 0; i < regs->count; i += 2) {
796 		u32 count = RANGE(regs->registers, i);
797 		int j;
798 
799 		for (j = 0; j < count; j++) {
800 			u32 offset = regs->registers[i] + j;
801 			u32 val;
802 
803 			if (rscc)
804 				val = gmu_read_rscc(gmu, offset);
805 			else
806 				val = gmu_read(gmu, offset);
807 
808 			obj->data[index++] = val;
809 		}
810 	}
811 }
812 
813 static void a6xx_get_gmu_registers(struct msm_gpu *gpu,
814 		struct a6xx_gpu_state *a6xx_state)
815 {
816 	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
817 	struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
818 
819 	a6xx_state->gmu_registers = state_kcalloc(a6xx_state,
820 		3, sizeof(*a6xx_state->gmu_registers));
821 
822 	if (!a6xx_state->gmu_registers)
823 		return;
824 
825 	a6xx_state->nr_gmu_registers = 3;
826 
827 	/* Get the CX GMU registers from AHB */
828 	_a6xx_get_gmu_registers(gpu, a6xx_state, &a6xx_gmu_reglist[0],
829 		&a6xx_state->gmu_registers[0], false);
830 	_a6xx_get_gmu_registers(gpu, a6xx_state, &a6xx_gmu_reglist[1],
831 		&a6xx_state->gmu_registers[1], true);
832 
833 	if (!a6xx_gmu_gx_is_on(&a6xx_gpu->gmu))
834 		return;
835 
836 	/* Set the fence to ALLOW mode so we can access the registers */
837 	gpu_write(gpu, REG_A6XX_GMU_AO_AHB_FENCE_CTRL, 0);
838 
839 	_a6xx_get_gmu_registers(gpu, a6xx_state, &a6xx_gmu_reglist[2],
840 		&a6xx_state->gmu_registers[2], false);
841 }
842 
843 static struct msm_gpu_state_bo *a6xx_snapshot_gmu_bo(
844 		struct a6xx_gpu_state *a6xx_state, struct a6xx_gmu_bo *bo)
845 {
846 	struct msm_gpu_state_bo *snapshot;
847 
848 	if (!bo->size)
849 		return NULL;
850 
851 	snapshot = state_kcalloc(a6xx_state, 1, sizeof(*snapshot));
852 	if (!snapshot)
853 		return NULL;
854 
855 	snapshot->iova = bo->iova;
856 	snapshot->size = bo->size;
857 	snapshot->data = kvzalloc(snapshot->size, GFP_KERNEL);
858 	if (!snapshot->data)
859 		return NULL;
860 
861 	memcpy(snapshot->data, bo->virt, bo->size);
862 
863 	return snapshot;
864 }
865 
866 static void a6xx_snapshot_gmu_hfi_history(struct msm_gpu *gpu,
867 					  struct a6xx_gpu_state *a6xx_state)
868 {
869 	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
870 	struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
871 	struct a6xx_gmu *gmu = &a6xx_gpu->gmu;
872 	unsigned i, j;
873 
874 	BUILD_BUG_ON(ARRAY_SIZE(gmu->queues) != ARRAY_SIZE(a6xx_state->hfi_queue_history));
875 
876 	for (i = 0; i < ARRAY_SIZE(gmu->queues); i++) {
877 		struct a6xx_hfi_queue *queue = &gmu->queues[i];
878 		for (j = 0; j < HFI_HISTORY_SZ; j++) {
879 			unsigned idx = (j + queue->history_idx) % HFI_HISTORY_SZ;
880 			a6xx_state->hfi_queue_history[i][j] = queue->history[idx];
881 		}
882 	}
883 }
884 
885 #define A6XX_GBIF_REGLIST_SIZE   1
886 static void a6xx_get_registers(struct msm_gpu *gpu,
887 		struct a6xx_gpu_state *a6xx_state,
888 		struct a6xx_crashdumper *dumper)
889 {
890 	int i, count = ARRAY_SIZE(a6xx_ahb_reglist) +
891 		ARRAY_SIZE(a6xx_reglist) +
892 		ARRAY_SIZE(a6xx_hlsq_reglist) + A6XX_GBIF_REGLIST_SIZE;
893 	int index = 0;
894 	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
895 
896 	a6xx_state->registers = state_kcalloc(a6xx_state,
897 		count, sizeof(*a6xx_state->registers));
898 
899 	if (!a6xx_state->registers)
900 		return;
901 
902 	a6xx_state->nr_registers = count;
903 
904 	for (i = 0; i < ARRAY_SIZE(a6xx_ahb_reglist); i++)
905 		a6xx_get_ahb_gpu_registers(gpu,
906 			a6xx_state, &a6xx_ahb_reglist[i],
907 			&a6xx_state->registers[index++]);
908 
909 	if (a6xx_has_gbif(adreno_gpu))
910 		a6xx_get_ahb_gpu_registers(gpu,
911 				a6xx_state, &a6xx_gbif_reglist,
912 				&a6xx_state->registers[index++]);
913 	else
914 		a6xx_get_ahb_gpu_registers(gpu,
915 				a6xx_state, &a6xx_vbif_reglist,
916 				&a6xx_state->registers[index++]);
917 	if (!dumper) {
918 		/*
919 		 * We can't use the crashdumper when the SMMU is stalled,
920 		 * because the GPU has no memory access until we resume
921 		 * translation (but we don't want to do that until after
922 		 * we have captured as much useful GPU state as possible).
923 		 * So instead collect registers via the CPU:
924 		 */
925 		for (i = 0; i < ARRAY_SIZE(a6xx_reglist); i++)
926 			a6xx_get_ahb_gpu_registers(gpu,
927 				a6xx_state, &a6xx_reglist[i],
928 				&a6xx_state->registers[index++]);
929 		return;
930 	}
931 
932 	for (i = 0; i < ARRAY_SIZE(a6xx_reglist); i++)
933 		a6xx_get_crashdumper_registers(gpu,
934 			a6xx_state, &a6xx_reglist[i],
935 			&a6xx_state->registers[index++],
936 			dumper);
937 
938 	for (i = 0; i < ARRAY_SIZE(a6xx_hlsq_reglist); i++)
939 		a6xx_get_crashdumper_hlsq_registers(gpu,
940 			a6xx_state, &a6xx_hlsq_reglist[i],
941 			&a6xx_state->registers[index++],
942 			dumper);
943 }
944 
945 static u32 a6xx_get_cp_roq_size(struct msm_gpu *gpu)
946 {
947 	/* The value at [16:31] is in 4dword units. Convert it to dwords */
948 	return gpu_read(gpu, REG_A6XX_CP_ROQ_THRESHOLDS_2) >> 14;
949 }
950 
951 /* Read a block of data from an indexed register pair */
952 static void a6xx_get_indexed_regs(struct msm_gpu *gpu,
953 		struct a6xx_gpu_state *a6xx_state,
954 		struct a6xx_indexed_registers *indexed,
955 		struct a6xx_gpu_state_obj *obj)
956 {
957 	int i;
958 
959 	obj->handle = (const void *) indexed;
960 	if (indexed->count_fn)
961 		indexed->count = indexed->count_fn(gpu);
962 
963 	obj->data = state_kcalloc(a6xx_state, indexed->count, sizeof(u32));
964 	if (!obj->data)
965 		return;
966 
967 	/* All the indexed banks start at address 0 */
968 	gpu_write(gpu, indexed->addr, 0);
969 
970 	/* Read the data - each read increments the internal address by 1 */
971 	for (i = 0; i < indexed->count; i++)
972 		obj->data[i] = gpu_read(gpu, indexed->data);
973 }
974 
975 static void a6xx_get_indexed_registers(struct msm_gpu *gpu,
976 		struct a6xx_gpu_state *a6xx_state)
977 {
978 	u32 mempool_size;
979 	int count = ARRAY_SIZE(a6xx_indexed_reglist) + 1;
980 	int i;
981 
982 	a6xx_state->indexed_regs = state_kcalloc(a6xx_state, count,
983 		sizeof(*a6xx_state->indexed_regs));
984 	if (!a6xx_state->indexed_regs)
985 		return;
986 
987 	for (i = 0; i < ARRAY_SIZE(a6xx_indexed_reglist); i++)
988 		a6xx_get_indexed_regs(gpu, a6xx_state, &a6xx_indexed_reglist[i],
989 			&a6xx_state->indexed_regs[i]);
990 
991 	if (adreno_is_a650_family(to_adreno_gpu(gpu))) {
992 		u32 val;
993 
994 		val = gpu_read(gpu, REG_A6XX_CP_CHICKEN_DBG);
995 		gpu_write(gpu, REG_A6XX_CP_CHICKEN_DBG, val | 4);
996 
997 		/* Get the contents of the CP mempool */
998 		a6xx_get_indexed_regs(gpu, a6xx_state, &a6xx_cp_mempool_indexed,
999 			&a6xx_state->indexed_regs[i]);
1000 
1001 		gpu_write(gpu, REG_A6XX_CP_CHICKEN_DBG, val);
1002 		a6xx_state->nr_indexed_regs = count;
1003 		return;
1004 	}
1005 
1006 	/* Set the CP mempool size to 0 to stabilize it while dumping */
1007 	mempool_size = gpu_read(gpu, REG_A6XX_CP_MEM_POOL_SIZE);
1008 	gpu_write(gpu, REG_A6XX_CP_MEM_POOL_SIZE, 0);
1009 
1010 	/* Get the contents of the CP mempool */
1011 	a6xx_get_indexed_regs(gpu, a6xx_state, &a6xx_cp_mempool_indexed,
1012 		&a6xx_state->indexed_regs[i]);
1013 
1014 	/*
1015 	 * Offset 0x2000 in the mempool is the size - copy the saved size over
1016 	 * so the data is consistent
1017 	 */
1018 	a6xx_state->indexed_regs[i].data[0x2000] = mempool_size;
1019 
1020 	/* Restore the size in the hardware */
1021 	gpu_write(gpu, REG_A6XX_CP_MEM_POOL_SIZE, mempool_size);
1022 
1023 	a6xx_state->nr_indexed_regs = count;
1024 }
1025 
1026 struct msm_gpu_state *a6xx_gpu_state_get(struct msm_gpu *gpu)
1027 {
1028 	struct a6xx_crashdumper _dumper = { 0 }, *dumper = NULL;
1029 	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
1030 	struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
1031 	struct a6xx_gpu_state *a6xx_state = kzalloc(sizeof(*a6xx_state),
1032 		GFP_KERNEL);
1033 	bool stalled = !!(gpu_read(gpu, REG_A6XX_RBBM_STATUS3) &
1034 			A6XX_RBBM_STATUS3_SMMU_STALLED_ON_FAULT);
1035 
1036 	if (!a6xx_state)
1037 		return ERR_PTR(-ENOMEM);
1038 
1039 	INIT_LIST_HEAD(&a6xx_state->objs);
1040 
1041 	/* Get the generic state from the adreno core */
1042 	adreno_gpu_state_get(gpu, &a6xx_state->base);
1043 
1044 	if (!adreno_has_gmu_wrapper(adreno_gpu)) {
1045 		a6xx_get_gmu_registers(gpu, a6xx_state);
1046 
1047 		a6xx_state->gmu_log = a6xx_snapshot_gmu_bo(a6xx_state, &a6xx_gpu->gmu.log);
1048 		a6xx_state->gmu_hfi = a6xx_snapshot_gmu_bo(a6xx_state, &a6xx_gpu->gmu.hfi);
1049 		a6xx_state->gmu_debug = a6xx_snapshot_gmu_bo(a6xx_state, &a6xx_gpu->gmu.debug);
1050 
1051 		a6xx_snapshot_gmu_hfi_history(gpu, a6xx_state);
1052 	}
1053 
1054 	/* If GX isn't on the rest of the data isn't going to be accessible */
1055 	if (!adreno_has_gmu_wrapper(adreno_gpu) && !a6xx_gmu_gx_is_on(&a6xx_gpu->gmu))
1056 		return &a6xx_state->base;
1057 
1058 	/* Get the banks of indexed registers */
1059 	a6xx_get_indexed_registers(gpu, a6xx_state);
1060 
1061 	/*
1062 	 * Try to initialize the crashdumper, if we are not dumping state
1063 	 * with the SMMU stalled.  The crashdumper needs memory access to
1064 	 * write out GPU state, so we need to skip this when the SMMU is
1065 	 * stalled in response to an iova fault
1066 	 */
1067 	if (!stalled && !gpu->needs_hw_init &&
1068 	    !a6xx_crashdumper_init(gpu, &_dumper)) {
1069 		dumper = &_dumper;
1070 	}
1071 
1072 	a6xx_get_registers(gpu, a6xx_state, dumper);
1073 
1074 	if (dumper) {
1075 		a6xx_get_shaders(gpu, a6xx_state, dumper);
1076 		a6xx_get_clusters(gpu, a6xx_state, dumper);
1077 		a6xx_get_dbgahb_clusters(gpu, a6xx_state, dumper);
1078 
1079 		msm_gem_kernel_put(dumper->bo, gpu->aspace);
1080 	}
1081 
1082 	if (snapshot_debugbus)
1083 		a6xx_get_debugbus(gpu, a6xx_state);
1084 
1085 	a6xx_state->gpu_initialized = !gpu->needs_hw_init;
1086 
1087 	return  &a6xx_state->base;
1088 }
1089 
1090 static void a6xx_gpu_state_destroy(struct kref *kref)
1091 {
1092 	struct a6xx_state_memobj *obj, *tmp;
1093 	struct msm_gpu_state *state = container_of(kref,
1094 			struct msm_gpu_state, ref);
1095 	struct a6xx_gpu_state *a6xx_state = container_of(state,
1096 			struct a6xx_gpu_state, base);
1097 
1098 	if (a6xx_state->gmu_log)
1099 		kvfree(a6xx_state->gmu_log->data);
1100 
1101 	if (a6xx_state->gmu_hfi)
1102 		kvfree(a6xx_state->gmu_hfi->data);
1103 
1104 	if (a6xx_state->gmu_debug)
1105 		kvfree(a6xx_state->gmu_debug->data);
1106 
1107 	list_for_each_entry_safe(obj, tmp, &a6xx_state->objs, node) {
1108 		list_del(&obj->node);
1109 		kvfree(obj);
1110 	}
1111 
1112 	adreno_gpu_state_destroy(state);
1113 	kfree(a6xx_state);
1114 }
1115 
1116 int a6xx_gpu_state_put(struct msm_gpu_state *state)
1117 {
1118 	if (IS_ERR_OR_NULL(state))
1119 		return 1;
1120 
1121 	return kref_put(&state->ref, a6xx_gpu_state_destroy);
1122 }
1123 
1124 static void a6xx_show_registers(const u32 *registers, u32 *data, size_t count,
1125 		struct drm_printer *p)
1126 {
1127 	int i, index = 0;
1128 
1129 	if (!data)
1130 		return;
1131 
1132 	for (i = 0; i < count; i += 2) {
1133 		u32 count = RANGE(registers, i);
1134 		u32 offset = registers[i];
1135 		int j;
1136 
1137 		for (j = 0; j < count; index++, offset++, j++) {
1138 			if (data[index] == 0xdeafbead)
1139 				continue;
1140 
1141 			drm_printf(p, "  - { offset: 0x%06x, value: 0x%08x }\n",
1142 				offset << 2, data[index]);
1143 		}
1144 	}
1145 }
1146 
1147 static void print_ascii85(struct drm_printer *p, size_t len, u32 *data)
1148 {
1149 	char out[ASCII85_BUFSZ];
1150 	long i, l, datalen = 0;
1151 
1152 	for (i = 0; i < len >> 2; i++) {
1153 		if (data[i])
1154 			datalen = (i + 1) << 2;
1155 	}
1156 
1157 	if (datalen == 0)
1158 		return;
1159 
1160 	drm_puts(p, "    data: !!ascii85 |\n");
1161 	drm_puts(p, "      ");
1162 
1163 
1164 	l = ascii85_encode_len(datalen);
1165 
1166 	for (i = 0; i < l; i++)
1167 		drm_puts(p, ascii85_encode(data[i], out));
1168 
1169 	drm_puts(p, "\n");
1170 }
1171 
/*
 * Print @fmt immediately followed by @name and a newline.
 *
 * Both strings go through drm_puts(), so despite its name @fmt is written
 * out verbatim rather than being interpreted as a format string.
 */
static void print_name(struct drm_printer *p, const char *fmt, const char *name)
{
	drm_puts(p, fmt);	/* leading label */
	drm_puts(p, name);	/* the name itself */
	drm_puts(p, "\n");
}
1178 
1179 static void a6xx_show_shader(struct a6xx_gpu_state_obj *obj,
1180 		struct drm_printer *p)
1181 {
1182 	const struct a6xx_shader_block *block = obj->handle;
1183 	int i;
1184 
1185 	if (!obj->handle)
1186 		return;
1187 
1188 	print_name(p, "  - type: ", block->name);
1189 
1190 	for (i = 0; i < A6XX_NUM_SHADER_BANKS; i++) {
1191 		drm_printf(p, "    - bank: %d\n", i);
1192 		drm_printf(p, "      size: %d\n", block->size);
1193 
1194 		if (!obj->data)
1195 			continue;
1196 
1197 		print_ascii85(p, block->size << 2,
1198 			obj->data + (block->size * i));
1199 	}
1200 }
1201 
1202 static void a6xx_show_cluster_data(const u32 *registers, int size, u32 *data,
1203 		struct drm_printer *p)
1204 {
1205 	int ctx, index = 0;
1206 
1207 	for (ctx = 0; ctx < A6XX_NUM_CONTEXTS; ctx++) {
1208 		int j;
1209 
1210 		drm_printf(p, "    - context: %d\n", ctx);
1211 
1212 		for (j = 0; j < size; j += 2) {
1213 			u32 count = RANGE(registers, j);
1214 			u32 offset = registers[j];
1215 			int k;
1216 
1217 			for (k = 0; k < count; index++, offset++, k++) {
1218 				if (data[index] == 0xdeafbead)
1219 					continue;
1220 
1221 				drm_printf(p, "      - { offset: 0x%06x, value: 0x%08x }\n",
1222 					offset << 2, data[index]);
1223 			}
1224 		}
1225 	}
1226 }
1227 
1228 static void a6xx_show_dbgahb_cluster(struct a6xx_gpu_state_obj *obj,
1229 		struct drm_printer *p)
1230 {
1231 	const struct a6xx_dbgahb_cluster *dbgahb = obj->handle;
1232 
1233 	if (dbgahb) {
1234 		print_name(p, "  - cluster-name: ", dbgahb->name);
1235 		a6xx_show_cluster_data(dbgahb->registers, dbgahb->count,
1236 			obj->data, p);
1237 	}
1238 }
1239 
1240 static void a6xx_show_cluster(struct a6xx_gpu_state_obj *obj,
1241 		struct drm_printer *p)
1242 {
1243 	const struct a6xx_cluster *cluster = obj->handle;
1244 
1245 	if (cluster) {
1246 		print_name(p, "  - cluster-name: ", cluster->name);
1247 		a6xx_show_cluster_data(cluster->registers, cluster->count,
1248 			obj->data, p);
1249 	}
1250 }
1251 
1252 static void a6xx_show_indexed_regs(struct a6xx_gpu_state_obj *obj,
1253 		struct drm_printer *p)
1254 {
1255 	const struct a6xx_indexed_registers *indexed = obj->handle;
1256 
1257 	if (!indexed)
1258 		return;
1259 
1260 	print_name(p, "  - regs-name: ", indexed->name);
1261 	drm_printf(p, "    dwords: %d\n", indexed->count);
1262 
1263 	print_ascii85(p, indexed->count << 2, obj->data);
1264 }
1265 
1266 static void a6xx_show_debugbus_block(const struct a6xx_debugbus_block *block,
1267 		u32 *data, struct drm_printer *p)
1268 {
1269 	if (block) {
1270 		print_name(p, "  - debugbus-block: ", block->name);
1271 
1272 		/*
1273 		 * count for regular debugbus data is in quadwords,
1274 		 * but print the size in dwords for consistency
1275 		 */
1276 		drm_printf(p, "    count: %d\n", block->count << 1);
1277 
1278 		print_ascii85(p, block->count << 3, data);
1279 	}
1280 }
1281 
1282 static void a6xx_show_debugbus(struct a6xx_gpu_state *a6xx_state,
1283 		struct drm_printer *p)
1284 {
1285 	int i;
1286 
1287 	for (i = 0; i < a6xx_state->nr_debugbus; i++) {
1288 		struct a6xx_gpu_state_obj *obj = &a6xx_state->debugbus[i];
1289 
1290 		a6xx_show_debugbus_block(obj->handle, obj->data, p);
1291 	}
1292 
1293 	if (a6xx_state->vbif_debugbus) {
1294 		struct a6xx_gpu_state_obj *obj = a6xx_state->vbif_debugbus;
1295 
1296 		drm_puts(p, "  - debugbus-block: A6XX_DBGBUS_VBIF\n");
1297 		drm_printf(p, "    count: %d\n", VBIF_DEBUGBUS_BLOCK_SIZE);
1298 
1299 		/* vbif debugbus data is in dwords.  Confusing, huh? */
1300 		print_ascii85(p, VBIF_DEBUGBUS_BLOCK_SIZE << 2, obj->data);
1301 	}
1302 
1303 	for (i = 0; i < a6xx_state->nr_cx_debugbus; i++) {
1304 		struct a6xx_gpu_state_obj *obj = &a6xx_state->cx_debugbus[i];
1305 
1306 		a6xx_show_debugbus_block(obj->handle, obj->data, p);
1307 	}
1308 }
1309 
1310 void a6xx_show(struct msm_gpu *gpu, struct msm_gpu_state *state,
1311 		struct drm_printer *p)
1312 {
1313 	struct a6xx_gpu_state *a6xx_state = container_of(state,
1314 			struct a6xx_gpu_state, base);
1315 	int i;
1316 
1317 	if (IS_ERR_OR_NULL(state))
1318 		return;
1319 
1320 	drm_printf(p, "gpu-initialized: %d\n", a6xx_state->gpu_initialized);
1321 
1322 	adreno_show(gpu, state, p);
1323 
1324 	drm_puts(p, "gmu-log:\n");
1325 	if (a6xx_state->gmu_log) {
1326 		struct msm_gpu_state_bo *gmu_log = a6xx_state->gmu_log;
1327 
1328 		drm_printf(p, "    iova: 0x%016llx\n", gmu_log->iova);
1329 		drm_printf(p, "    size: %zu\n", gmu_log->size);
1330 		adreno_show_object(p, &gmu_log->data, gmu_log->size,
1331 				&gmu_log->encoded);
1332 	}
1333 
1334 	drm_puts(p, "gmu-hfi:\n");
1335 	if (a6xx_state->gmu_hfi) {
1336 		struct msm_gpu_state_bo *gmu_hfi = a6xx_state->gmu_hfi;
1337 		unsigned i, j;
1338 
1339 		drm_printf(p, "    iova: 0x%016llx\n", gmu_hfi->iova);
1340 		drm_printf(p, "    size: %zu\n", gmu_hfi->size);
1341 		for (i = 0; i < ARRAY_SIZE(a6xx_state->hfi_queue_history); i++) {
1342 			drm_printf(p, "    queue-history[%u]:", i);
1343 			for (j = 0; j < HFI_HISTORY_SZ; j++) {
1344 				drm_printf(p, " %d", a6xx_state->hfi_queue_history[i][j]);
1345 			}
1346 			drm_printf(p, "\n");
1347 		}
1348 		adreno_show_object(p, &gmu_hfi->data, gmu_hfi->size,
1349 				&gmu_hfi->encoded);
1350 	}
1351 
1352 	drm_puts(p, "gmu-debug:\n");
1353 	if (a6xx_state->gmu_debug) {
1354 		struct msm_gpu_state_bo *gmu_debug = a6xx_state->gmu_debug;
1355 
1356 		drm_printf(p, "    iova: 0x%016llx\n", gmu_debug->iova);
1357 		drm_printf(p, "    size: %zu\n", gmu_debug->size);
1358 		adreno_show_object(p, &gmu_debug->data, gmu_debug->size,
1359 				&gmu_debug->encoded);
1360 	}
1361 
1362 	drm_puts(p, "registers:\n");
1363 	for (i = 0; i < a6xx_state->nr_registers; i++) {
1364 		struct a6xx_gpu_state_obj *obj = &a6xx_state->registers[i];
1365 		const struct a6xx_registers *regs = obj->handle;
1366 
1367 		if (!obj->handle)
1368 			continue;
1369 
1370 		a6xx_show_registers(regs->registers, obj->data, regs->count, p);
1371 	}
1372 
1373 	drm_puts(p, "registers-gmu:\n");
1374 	for (i = 0; i < a6xx_state->nr_gmu_registers; i++) {
1375 		struct a6xx_gpu_state_obj *obj = &a6xx_state->gmu_registers[i];
1376 		const struct a6xx_registers *regs = obj->handle;
1377 
1378 		if (!obj->handle)
1379 			continue;
1380 
1381 		a6xx_show_registers(regs->registers, obj->data, regs->count, p);
1382 	}
1383 
1384 	drm_puts(p, "indexed-registers:\n");
1385 	for (i = 0; i < a6xx_state->nr_indexed_regs; i++)
1386 		a6xx_show_indexed_regs(&a6xx_state->indexed_regs[i], p);
1387 
1388 	drm_puts(p, "shader-blocks:\n");
1389 	for (i = 0; i < a6xx_state->nr_shaders; i++)
1390 		a6xx_show_shader(&a6xx_state->shaders[i], p);
1391 
1392 	drm_puts(p, "clusters:\n");
1393 	for (i = 0; i < a6xx_state->nr_clusters; i++)
1394 		a6xx_show_cluster(&a6xx_state->clusters[i], p);
1395 
1396 	for (i = 0; i < a6xx_state->nr_dbgahb_clusters; i++)
1397 		a6xx_show_dbgahb_cluster(&a6xx_state->dbgahb_clusters[i], p);
1398 
1399 	drm_puts(p, "debugbus:\n");
1400 	a6xx_show_debugbus(a6xx_state, p);
1401 }
1402