1 /*
2  * Copyright (c) 2008 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Eric Anholt <eric@anholt.net>
25  *    Keith Packard <keithp@keithp.com>
26  *    Mika Kuoppala <mika.kuoppala@intel.com>
27  *
28  */
29 
30 #include <linux/ascii85.h>
31 #include <linux/nmi.h>
32 #include <linux/pagevec.h>
33 #include <linux/scatterlist.h>
34 #include <linux/utsname.h>
35 #include <linux/zlib.h>
36 
37 #include <drm/drm_print.h>
38 
39 #include "display/intel_atomic.h"
40 #include "display/intel_overlay.h"
41 
42 #include "gem/i915_gem_context.h"
43 
44 #include "i915_drv.h"
45 #include "i915_gpu_error.h"
46 #include "i915_memcpy.h"
47 #include "i915_scatterlist.h"
48 #include "intel_csr.h"
49 
50 #define ALLOW_FAIL (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN)
51 #define ATOMIC_MAYFAIL (GFP_ATOMIC | __GFP_NOWARN)
52 
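/*
 * Note: the sg dma_address field is (ab)used here to store the logical byte
 * offset of each chunk within the error dump, not a real DMA address;
 * i915_gpu_state_copy_to_buffer() relies on it when seeking to an offset.
 */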
53 static void __sg_set_buf(struct scatterlist *sg,
54 			 void *addr, unsigned int len, loff_t it)
55 {
56 	sg->page_link = (unsigned long)virt_to_page(addr);
57 	sg->offset = offset_in_page(addr);
58 	sg->length = len;
59 	sg->dma_address = it;
60 }
61 
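/*
 * Ensure the error string buffer has room for at least len + 1 more bytes.
 * If not, the current kmalloc'd chunk is closed off and recorded in the
 * scatterlist (chaining on a fresh sg page when the table is full) and a
 * new chunk is allocated. Returns false and sets e->err on failure.
 */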
62 static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len)
63 {
64 	if (!len)
65 		return false;
66 
67 	if (e->bytes + len + 1 <= e->size)
68 		return true;
69 
70 	if (e->bytes) {
71 		__sg_set_buf(e->cur++, e->buf, e->bytes, e->iter);
72 		e->iter += e->bytes;
73 		e->buf = NULL;
74 		e->bytes = 0;
75 	}
76 
77 	if (e->cur == e->end) {
78 		struct scatterlist *sgl;
79 
80 		sgl = (typeof(sgl))__get_free_page(ALLOW_FAIL);
81 		if (!sgl) {
82 			e->err = -ENOMEM;
83 			return false;
84 		}
85 
86 		if (e->cur) {
87 			e->cur->offset = 0;
88 			e->cur->length = 0;
89 			e->cur->page_link =
90 				(unsigned long)sgl | SG_CHAIN;
91 		} else {
92 			e->sgl = sgl;
93 		}
94 
95 		e->cur = sgl;
96 		e->end = sgl + SG_MAX_SINGLE_ALLOC - 1;
97 	}
98 
99 	e->size = ALIGN(len + 1, SZ_64K);
100 	e->buf = kmalloc(e->size, ALLOW_FAIL);
101 	if (!e->buf) {
102 		e->size = PAGE_ALIGN(len + 1);
103 		e->buf = kmalloc(e->size, GFP_KERNEL);
104 	}
105 	if (!e->buf) {
106 		e->err = -ENOMEM;
107 		return false;
108 	}
109 
110 	return true;
111 }
112 
113 __printf(2, 0)
114 static void i915_error_vprintf(struct drm_i915_error_state_buf *e,
115 			       const char *fmt, va_list args)
116 {
117 	va_list ap;
118 	int len;
119 
120 	if (e->err)
121 		return;
122 
123 	va_copy(ap, args);
124 	len = vsnprintf(NULL, 0, fmt, ap);
125 	va_end(ap);
126 	if (len <= 0) {
127 		e->err = len;
128 		return;
129 	}
130 
131 	if (!__i915_error_grow(e, len))
132 		return;
133 
134 	GEM_BUG_ON(e->bytes >= e->size);
135 	len = vscnprintf(e->buf + e->bytes, e->size - e->bytes, fmt, args);
136 	if (len < 0) {
137 		e->err = len;
138 		return;
139 	}
140 	e->bytes += len;
141 }
142 
143 static void i915_error_puts(struct drm_i915_error_state_buf *e, const char *str)
144 {
145 	unsigned len;
146 
147 	if (e->err || !str)
148 		return;
149 
150 	len = strlen(str);
151 	if (!__i915_error_grow(e, len))
152 		return;
153 
154 	GEM_BUG_ON(e->bytes + len > e->size);
155 	memcpy(e->buf + e->bytes, str, len);
156 	e->bytes += len;
157 }
158 
159 #define err_printf(e, ...) i915_error_printf(e, __VA_ARGS__)
160 #define err_puts(e, s) i915_error_puts(e, s)
161 
162 static void __i915_printfn_error(struct drm_printer *p, struct va_format *vaf)
163 {
164 	i915_error_vprintf(p->arg, vaf->fmt, *vaf->va);
165 }
166 
167 static inline struct drm_printer
168 i915_error_printer(struct drm_i915_error_state_buf *e)
169 {
170 	struct drm_printer p = {
171 		.printfn = __i915_printfn_error,
172 		.arg = e,
173 	};
174 	return p;
175 }
176 
177 /* single-threaded page allocator with a reserved stash for emergencies */
178 static void pool_fini(struct pagevec *pv)
179 {
180 	pagevec_release(pv);
181 }
182 
183 static int pool_refill(struct pagevec *pv, gfp_t gfp)
184 {
185 	while (pagevec_space(pv)) {
186 		struct page *p;
187 
188 		p = alloc_page(gfp);
189 		if (!p)
190 			return -ENOMEM;
191 
192 		pagevec_add(pv, p);
193 	}
194 
195 	return 0;
196 }
197 
198 static int pool_init(struct pagevec *pv, gfp_t gfp)
199 {
200 	int err;
201 
202 	pagevec_init(pv);
203 
204 	err = pool_refill(pv, gfp);
205 	if (err)
206 		pool_fini(pv);
207 
208 	return err;
209 }
210 
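/* Prefer a freshly allocated page; dip into the reserved stash only on failure. */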
211 static void *pool_alloc(struct pagevec *pv, gfp_t gfp)
212 {
213 	struct page *p;
214 
215 	p = alloc_page(gfp);
216 	if (!p && pagevec_count(pv))
217 		p = pv->pages[--pv->nr];
218 
219 	return p ? page_address(p) : NULL;
220 }
221 
222 static void pool_free(struct pagevec *pv, void *addr)
223 {
224 	struct page *p = virt_to_page(addr);
225 
226 	if (pagevec_space(pv))
227 		pagevec_add(pv, p);
228 	else
229 		__free_page(p);
230 }
231 
232 #ifdef CONFIG_DRM_I915_COMPRESS_ERROR
233 
234 struct compress {
235 	struct pagevec pool;
236 	struct z_stream_s zstream;
237 	void *tmp;
238 };
239 
240 static bool compress_init(struct compress *c)
241 {
242 	struct z_stream_s *zstream = &c->zstream;
243 
244 	if (pool_init(&c->pool, ALLOW_FAIL))
245 		return false;
246 
247 	zstream->workspace =
248 		kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
249 			ALLOW_FAIL);
250 	if (!zstream->workspace) {
251 		pool_fini(&c->pool);
252 		return false;
253 	}
254 
255 	c->tmp = NULL;
256 	if (i915_has_memcpy_from_wc())
257 		c->tmp = pool_alloc(&c->pool, ALLOW_FAIL);
258 
259 	return true;
260 }
261 
262 static bool compress_start(struct compress *c)
263 {
264 	struct z_stream_s *zstream = &c->zstream;
265 	void *workspace = zstream->workspace;
266 
267 	memset(zstream, 0, sizeof(*zstream));
268 	zstream->workspace = workspace;
269 
270 	return zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) == Z_OK;
271 }
272 
273 static void *compress_next_page(struct compress *c,
274 				struct drm_i915_error_object *dst)
275 {
276 	void *page;
277 
278 	if (dst->page_count >= dst->num_pages)
279 		return ERR_PTR(-ENOSPC);
280 
281 	page = pool_alloc(&c->pool, ALLOW_FAIL);
282 	if (!page)
283 		return ERR_PTR(-ENOMEM);
284 
285 	return dst->pages[dst->page_count++] = page;
286 }
287 
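/*
 * Deflate a single page into the destination object, allocating output pages
 * from the pool as needed. The source may be WC-mapped GGTT memory, so it is
 * staged through the c->tmp bounce page using the accelerated WC memcpy when
 * that is available.
 */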
288 static int compress_page(struct compress *c,
289 			 void *src,
290 			 struct drm_i915_error_object *dst)
291 {
292 	struct z_stream_s *zstream = &c->zstream;
293 
294 	zstream->next_in = src;
295 	if (c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE))
296 		zstream->next_in = c->tmp;
297 	zstream->avail_in = PAGE_SIZE;
298 
299 	do {
300 		if (zstream->avail_out == 0) {
301 			zstream->next_out = compress_next_page(c, dst);
302 			if (IS_ERR(zstream->next_out))
303 				return PTR_ERR(zstream->next_out);
304 
305 			zstream->avail_out = PAGE_SIZE;
306 		}
307 
308 		if (zlib_deflate(zstream, Z_NO_FLUSH) != Z_OK)
309 			return -EIO;
310 	} while (zstream->avail_in);
311 
312 	/* Fall back to uncompressed if we increase the size? */
313 	if (0 && zstream->total_out > zstream->total_in)
314 		return -E2BIG;
315 
316 	return 0;
317 }
318 
319 static int compress_flush(struct compress *c,
320 			  struct drm_i915_error_object *dst)
321 {
322 	struct z_stream_s *zstream = &c->zstream;
323 
324 	do {
325 		switch (zlib_deflate(zstream, Z_FINISH)) {
326 		case Z_OK: /* more space requested */
327 			zstream->next_out = compress_next_page(c, dst);
328 			if (IS_ERR(zstream->next_out))
329 				return PTR_ERR(zstream->next_out);
330 
331 			zstream->avail_out = PAGE_SIZE;
332 			break;
333 
334 		case Z_STREAM_END:
335 			goto end;
336 
337 		default: /* any error */
338 			return -EIO;
339 		}
340 	} while (1);
341 
342 end:
343 	memset(zstream->next_out, 0, zstream->avail_out);
344 	dst->unused = zstream->avail_out;
345 	return 0;
346 }
347 
348 static void compress_finish(struct compress *c)
349 {
350 	zlib_deflateEnd(&c->zstream);
351 }
352 
353 static void compress_fini(struct compress *c)
354 {
355 	kfree(c->zstream.workspace);
356 	if (c->tmp)
357 		pool_free(&c->pool, c->tmp);
358 	pool_fini(&c->pool);
359 }
360 
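/* ':' marks the start of a zlib-deflated, ascii85-encoded object dump. */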
361 static void err_compression_marker(struct drm_i915_error_state_buf *m)
362 {
363 	err_puts(m, ":");
364 }
365 
366 #else
367 
368 struct compress {
369 	struct pagevec pool;
370 };
371 
372 static bool compress_init(struct compress *c)
373 {
374 	return pool_init(&c->pool, ALLOW_FAIL) == 0;
375 }
376 
377 static bool compress_start(struct compress *c)
378 {
379 	return true;
380 }
381 
382 static int compress_page(struct compress *c,
383 			 void *src,
384 			 struct drm_i915_error_object *dst)
385 {
386 	void *ptr;
387 
388 	ptr = pool_alloc(&c->pool, ALLOW_FAIL);
389 	if (!ptr)
390 		return -ENOMEM;
391 
392 	if (!i915_memcpy_from_wc(ptr, src, PAGE_SIZE))
393 		memcpy(ptr, src, PAGE_SIZE);
394 	dst->pages[dst->page_count++] = ptr;
395 
396 	return 0;
397 }
398 
399 static int compress_flush(struct compress *c,
400 			  struct drm_i915_error_object *dst)
401 {
402 	return 0;
403 }
404 
405 static void compress_finish(struct compress *c)
406 {
407 }
408 
409 static void compress_fini(struct compress *c)
410 {
411 	pool_fini(&c->pool);
412 }
413 
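/* '~' marks the start of an uncompressed, ascii85-encoded object dump. */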
414 static void err_compression_marker(struct drm_i915_error_state_buf *m)
415 {
416 	err_puts(m, "~");
417 }
418 
419 #endif
420 
421 static void error_print_instdone(struct drm_i915_error_state_buf *m,
422 				 const struct drm_i915_error_engine *ee)
423 {
424 	const struct sseu_dev_info *sseu = &RUNTIME_INFO(m->i915)->sseu;
425 	int slice;
426 	int subslice;
427 
428 	err_printf(m, "  INSTDONE: 0x%08x\n",
429 		   ee->instdone.instdone);
430 
431 	if (ee->engine->class != RENDER_CLASS || INTEL_GEN(m->i915) <= 3)
432 		return;
433 
434 	err_printf(m, "  SC_INSTDONE: 0x%08x\n",
435 		   ee->instdone.slice_common);
436 
437 	if (INTEL_GEN(m->i915) <= 6)
438 		return;
439 
440 	for_each_instdone_slice_subslice(m->i915, sseu, slice, subslice)
441 		err_printf(m, "  SAMPLER_INSTDONE[%d][%d]: 0x%08x\n",
442 			   slice, subslice,
443 			   ee->instdone.sampler[slice][subslice]);
444 
445 	for_each_instdone_slice_subslice(m->i915, sseu, slice, subslice)
446 		err_printf(m, "  ROW_INSTDONE[%d][%d]: 0x%08x\n",
447 			   slice, subslice,
448 			   ee->instdone.row[slice][subslice]);
449 }
450 
451 static void error_print_request(struct drm_i915_error_state_buf *m,
452 				const char *prefix,
453 				const struct drm_i915_error_request *erq,
454 				const unsigned long epoch)
455 {
456 	if (!erq->seqno)
457 		return;
458 
459 	err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n",
460 		   prefix, erq->pid, erq->context, erq->seqno,
461 		   test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
462 			    &erq->flags) ? "!" : "",
463 		   test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
464 			    &erq->flags) ? "+" : "",
465 		   erq->sched_attr.priority,
466 		   jiffies_to_msecs(erq->jiffies - epoch),
467 		   erq->start, erq->head, erq->tail);
468 }
469 
470 static void error_print_context(struct drm_i915_error_state_buf *m,
471 				const char *header,
472 				const struct drm_i915_error_context *ctx)
473 {
474 	err_printf(m, "%s%s[%d] prio %d, guilty %d active %d\n",
475 		   header, ctx->comm, ctx->pid, ctx->sched_attr.priority,
476 		   ctx->guilty, ctx->active);
477 }
478 
479 static void error_print_engine(struct drm_i915_error_state_buf *m,
480 			       const struct drm_i915_error_engine *ee,
481 			       const unsigned long epoch)
482 {
483 	int n;
484 
485 	err_printf(m, "%s command stream:\n", ee->engine->name);
486 	err_printf(m, "  IDLE?: %s\n", yesno(ee->idle));
487 	err_printf(m, "  START: 0x%08x\n", ee->start);
488 	err_printf(m, "  HEAD:  0x%08x [0x%08x]\n", ee->head, ee->rq_head);
489 	err_printf(m, "  TAIL:  0x%08x [0x%08x, 0x%08x]\n",
490 		   ee->tail, ee->rq_post, ee->rq_tail);
491 	err_printf(m, "  CTL:   0x%08x\n", ee->ctl);
492 	err_printf(m, "  MODE:  0x%08x\n", ee->mode);
493 	err_printf(m, "  HWS:   0x%08x\n", ee->hws);
494 	err_printf(m, "  ACTHD: 0x%08x %08x\n",
495 		   (u32)(ee->acthd>>32), (u32)ee->acthd);
496 	err_printf(m, "  IPEIR: 0x%08x\n", ee->ipeir);
497 	err_printf(m, "  IPEHR: 0x%08x\n", ee->ipehr);
498 
499 	error_print_instdone(m, ee);
500 
501 	if (ee->batchbuffer) {
502 		u64 start = ee->batchbuffer->gtt_offset;
503 		u64 end = start + ee->batchbuffer->gtt_size;
504 
505 		err_printf(m, "  batch: [0x%08x_%08x, 0x%08x_%08x]\n",
506 			   upper_32_bits(start), lower_32_bits(start),
507 			   upper_32_bits(end), lower_32_bits(end));
508 	}
509 	if (INTEL_GEN(m->i915) >= 4) {
510 		err_printf(m, "  BBADDR: 0x%08x_%08x\n",
511 			   (u32)(ee->bbaddr>>32), (u32)ee->bbaddr);
512 		err_printf(m, "  BB_STATE: 0x%08x\n", ee->bbstate);
513 		err_printf(m, "  INSTPS: 0x%08x\n", ee->instps);
514 	}
515 	err_printf(m, "  INSTPM: 0x%08x\n", ee->instpm);
516 	err_printf(m, "  FADDR: 0x%08x %08x\n", upper_32_bits(ee->faddr),
517 		   lower_32_bits(ee->faddr));
518 	if (INTEL_GEN(m->i915) >= 6) {
519 		err_printf(m, "  RC PSMI: 0x%08x\n", ee->rc_psmi);
520 		err_printf(m, "  FAULT_REG: 0x%08x\n", ee->fault_reg);
521 	}
522 	if (HAS_PPGTT(m->i915)) {
523 		err_printf(m, "  GFX_MODE: 0x%08x\n", ee->vm_info.gfx_mode);
524 
525 		if (INTEL_GEN(m->i915) >= 8) {
526 			int i;
527 			for (i = 0; i < 4; i++)
528 				err_printf(m, "  PDP%d: 0x%016llx\n",
529 					   i, ee->vm_info.pdp[i]);
530 		} else {
531 			err_printf(m, "  PP_DIR_BASE: 0x%08x\n",
532 				   ee->vm_info.pp_dir_base);
533 		}
534 	}
535 	err_printf(m, "  ring->head: 0x%08x\n", ee->cpu_ring_head);
536 	err_printf(m, "  ring->tail: 0x%08x\n", ee->cpu_ring_tail);
537 	err_printf(m, "  hangcheck timestamp: %dms (%lu%s)\n",
538 		   jiffies_to_msecs(ee->hangcheck_timestamp - epoch),
539 		   ee->hangcheck_timestamp,
540 		   ee->hangcheck_timestamp == epoch ? "; epoch" : "");
541 	err_printf(m, "  engine reset count: %u\n", ee->reset_count);
542 
543 	for (n = 0; n < ee->num_ports; n++) {
544 		err_printf(m, "  ELSP[%d]:", n);
545 		error_print_request(m, " ", &ee->execlist[n], epoch);
546 	}
547 
548 	error_print_context(m, "  Active context: ", &ee->context);
549 }
550 
551 void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
552 {
553 	va_list args;
554 
555 	va_start(args, f);
556 	i915_error_vprintf(e, f, args);
557 	va_end(args);
558 }
559 
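/*
 * Dump the captured pages of an object as a single ascii85 stream. If a name
 * is given, a header naming the owning engine and the GGTT offset is printed
 * first; the unused tail of the final page is not emitted.
 */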
560 static void print_error_obj(struct drm_i915_error_state_buf *m,
561 			    const struct intel_engine_cs *engine,
562 			    const char *name,
563 			    const struct drm_i915_error_object *obj)
564 {
565 	char out[ASCII85_BUFSZ];
566 	int page;
567 
568 	if (!obj)
569 		return;
570 
571 	if (name) {
572 		err_printf(m, "%s --- %s = 0x%08x %08x\n",
573 			   engine ? engine->name : "global", name,
574 			   upper_32_bits(obj->gtt_offset),
575 			   lower_32_bits(obj->gtt_offset));
576 	}
577 
578 	if (obj->gtt_page_sizes > I915_GTT_PAGE_SIZE_4K)
579 		err_printf(m, "gtt_page_sizes = 0x%08x\n", obj->gtt_page_sizes);
580 
581 	err_compression_marker(m);
582 	for (page = 0; page < obj->page_count; page++) {
583 		int i, len;
584 
585 		len = PAGE_SIZE;
586 		if (page == obj->page_count - 1)
587 			len -= obj->unused;
588 		len = ascii85_encode_len(len);
589 
590 		for (i = 0; i < len; i++)
591 			err_puts(m, ascii85_encode(obj->pages[page][i], out));
592 	}
593 	err_puts(m, "\n");
594 }
595 
596 static void err_print_capabilities(struct drm_i915_error_state_buf *m,
597 				   const struct intel_device_info *info,
598 				   const struct intel_runtime_info *runtime,
599 				   const struct intel_driver_caps *caps)
600 {
601 	struct drm_printer p = i915_error_printer(m);
602 
603 	intel_device_info_dump_flags(info, &p);
604 	intel_driver_caps_print(caps, &p);
605 	intel_device_info_dump_topology(&runtime->sseu, &p);
606 }
607 
608 static void err_print_params(struct drm_i915_error_state_buf *m,
609 			     const struct i915_params *params)
610 {
611 	struct drm_printer p = i915_error_printer(m);
612 
613 	i915_params_dump(params, &p);
614 }
615 
616 static void err_print_pciid(struct drm_i915_error_state_buf *m,
617 			    struct drm_i915_private *i915)
618 {
619 	struct pci_dev *pdev = i915->drm.pdev;
620 
621 	err_printf(m, "PCI ID: 0x%04x\n", pdev->device);
622 	err_printf(m, "PCI Revision: 0x%02x\n", pdev->revision);
623 	err_printf(m, "PCI Subsystem: %04x:%04x\n",
624 		   pdev->subsystem_vendor,
625 		   pdev->subsystem_device);
626 }
627 
628 static void err_print_uc(struct drm_i915_error_state_buf *m,
629 			 const struct i915_error_uc *error_uc)
630 {
631 	struct drm_printer p = i915_error_printer(m);
632 	const struct i915_gpu_state *error =
633 		container_of(error_uc, typeof(*error), uc);
634 
635 	if (!error->device_info.has_gt_uc)
636 		return;
637 
638 	intel_uc_fw_dump(&error_uc->guc_fw, &p);
639 	intel_uc_fw_dump(&error_uc->huc_fw, &p);
640 	print_error_obj(m, NULL, "GuC log buffer", error_uc->guc_log);
641 }
642 
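/*
 * Free a chained error-state scatterlist: release each kmalloc'd text chunk,
 * then the page backing the sg table itself, following any chain links.
 */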
643 static void err_free_sgl(struct scatterlist *sgl)
644 {
645 	while (sgl) {
646 		struct scatterlist *sg;
647 
648 		for (sg = sgl; !sg_is_chain(sg); sg++) {
649 			kfree(sg_virt(sg));
650 			if (sg_is_last(sg))
651 				break;
652 		}
653 
654 		sg = sg_is_last(sg) ? NULL : sg_chain_ptr(sg);
655 		free_page((unsigned long)sgl);
656 		sgl = sg;
657 	}
658 }
659 
660 static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
661 			       struct i915_gpu_state *error)
662 {
663 	const struct drm_i915_error_engine *ee;
664 	struct timespec64 ts;
665 	int i, j;
666 
667 	if (*error->error_msg)
668 		err_printf(m, "%s\n", error->error_msg);
669 	err_printf(m, "Kernel: %s %s\n",
670 		   init_utsname()->release,
671 		   init_utsname()->machine);
672 	err_printf(m, "Driver: %s\n", DRIVER_DATE);
673 	ts = ktime_to_timespec64(error->time);
674 	err_printf(m, "Time: %lld s %ld us\n",
675 		   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
676 	ts = ktime_to_timespec64(error->boottime);
677 	err_printf(m, "Boottime: %lld s %ld us\n",
678 		   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
679 	ts = ktime_to_timespec64(error->uptime);
680 	err_printf(m, "Uptime: %lld s %ld us\n",
681 		   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
682 	err_printf(m, "Epoch: %lu jiffies (%u HZ)\n", error->epoch, HZ);
683 	err_printf(m, "Capture: %lu jiffies; %d ms ago, %d ms after epoch\n",
684 		   error->capture,
685 		   jiffies_to_msecs(jiffies - error->capture),
686 		   jiffies_to_msecs(error->capture - error->epoch));
687 
688 	for (ee = error->engine; ee; ee = ee->next)
689 		err_printf(m, "Active process (on ring %s): %s [%d]\n",
690 			   ee->engine->name,
691 			   ee->context.comm,
692 			   ee->context.pid);
693 
694 	err_printf(m, "Reset count: %u\n", error->reset_count);
695 	err_printf(m, "Suspend count: %u\n", error->suspend_count);
696 	err_printf(m, "Platform: %s\n", intel_platform_name(error->device_info.platform));
697 	err_printf(m, "Subplatform: 0x%x\n",
698 		   intel_subplatform(&error->runtime_info,
699 				     error->device_info.platform));
700 	err_print_pciid(m, m->i915);
701 
702 	err_printf(m, "IOMMU enabled?: %d\n", error->iommu);
703 
704 	if (HAS_CSR(m->i915)) {
705 		struct intel_csr *csr = &m->i915->csr;
706 
707 		err_printf(m, "DMC loaded: %s\n",
708 			   yesno(csr->dmc_payload != NULL));
709 		err_printf(m, "DMC fw version: %d.%d\n",
710 			   CSR_VERSION_MAJOR(csr->version),
711 			   CSR_VERSION_MINOR(csr->version));
712 	}
713 
714 	err_printf(m, "GT awake: %s\n", yesno(error->awake));
715 	err_printf(m, "RPM wakelock: %s\n", yesno(error->wakelock));
716 	err_printf(m, "PM suspended: %s\n", yesno(error->suspended));
717 	err_printf(m, "EIR: 0x%08x\n", error->eir);
718 	err_printf(m, "IER: 0x%08x\n", error->ier);
719 	for (i = 0; i < error->ngtier; i++)
720 		err_printf(m, "GTIER[%d]: 0x%08x\n", i, error->gtier[i]);
721 	err_printf(m, "PGTBL_ER: 0x%08x\n", error->pgtbl_er);
722 	err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake);
723 	err_printf(m, "DERRMR: 0x%08x\n", error->derrmr);
724 	err_printf(m, "CCID: 0x%08x\n", error->ccid);
725 
726 	for (i = 0; i < error->nfence; i++)
727 		err_printf(m, "  fence[%d] = %08llx\n", i, error->fence[i]);
728 
729 	if (IS_GEN_RANGE(m->i915, 6, 11)) {
730 		err_printf(m, "ERROR: 0x%08x\n", error->error);
731 		err_printf(m, "DONE_REG: 0x%08x\n", error->done_reg);
732 	}
733 
734 	if (INTEL_GEN(m->i915) >= 8)
735 		err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n",
736 			   error->fault_data1, error->fault_data0);
737 
738 	if (IS_GEN(m->i915, 7))
739 		err_printf(m, "ERR_INT: 0x%08x\n", error->err_int);
740 
741 	if (IS_GEN_RANGE(m->i915, 8, 11))
742 		err_printf(m, "GTT_CACHE_EN: 0x%08x\n", error->gtt_cache);
743 
744 	for (ee = error->engine; ee; ee = ee->next)
745 		error_print_engine(m, ee, error->epoch);
746 
747 	for (ee = error->engine; ee; ee = ee->next) {
748 		const struct drm_i915_error_object *obj;
749 
750 		obj = ee->batchbuffer;
751 		if (obj) {
752 			err_puts(m, ee->engine->name);
753 			if (ee->context.pid)
754 				err_printf(m, " (submitted by %s [%d])",
755 					   ee->context.comm,
756 					   ee->context.pid);
757 			err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
758 				   upper_32_bits(obj->gtt_offset),
759 				   lower_32_bits(obj->gtt_offset));
760 			print_error_obj(m, ee->engine, NULL, obj);
761 		}
762 
763 		for (j = 0; j < ee->user_bo_count; j++)
764 			print_error_obj(m, ee->engine, "user", ee->user_bo[j]);
765 
766 		if (ee->num_requests) {
767 			err_printf(m, "%s --- %d requests\n",
768 				   ee->engine->name,
769 				   ee->num_requests);
770 			for (j = 0; j < ee->num_requests; j++)
771 				error_print_request(m, " ",
772 						    &ee->requests[j],
773 						    error->epoch);
774 		}
775 
776 		print_error_obj(m, ee->engine, "ringbuffer", ee->ringbuffer);
777 		print_error_obj(m, ee->engine, "HW Status", ee->hws_page);
778 		print_error_obj(m, ee->engine, "HW context", ee->ctx);
779 		print_error_obj(m, ee->engine, "WA context", ee->wa_ctx);
780 		print_error_obj(m, ee->engine,
781 				"WA batchbuffer", ee->wa_batchbuffer);
782 		print_error_obj(m, ee->engine,
783 				"NULL context", ee->default_state);
784 	}
785 
786 	if (error->overlay)
787 		intel_overlay_print_error_state(m, error->overlay);
788 
789 	if (error->display)
790 		intel_display_print_error_state(m, error->display);
791 
792 	err_print_capabilities(m, &error->device_info, &error->runtime_info,
793 			       &error->driver_caps);
794 	err_print_params(m, &error->params);
795 	err_print_uc(m, &error->uc);
796 }
797 
798 static int err_print_to_sgl(struct i915_gpu_state *error)
799 {
800 	struct drm_i915_error_state_buf m;
801 
802 	if (IS_ERR(error))
803 		return PTR_ERR(error);
804 
805 	if (READ_ONCE(error->sgl))
806 		return 0;
807 
808 	memset(&m, 0, sizeof(m));
809 	m.i915 = error->i915;
810 
811 	__err_print_to_sgl(&m, error);
812 
813 	if (m.buf) {
814 		__sg_set_buf(m.cur++, m.buf, m.bytes, m.iter);
815 		m.bytes = 0;
816 		m.buf = NULL;
817 	}
818 	if (m.cur) {
819 		GEM_BUG_ON(m.end < m.cur);
820 		sg_mark_end(m.cur - 1);
821 	}
822 	GEM_BUG_ON(m.sgl && !m.cur);
823 
824 	if (m.err) {
825 		err_free_sgl(m.sgl);
826 		return m.err;
827 	}
828 
829 	if (cmpxchg(&error->sgl, NULL, m.sgl))
830 		err_free_sgl(m.sgl);
831 
832 	return 0;
833 }
834 
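/*
 * Copy up to @rem bytes of the formatted error state into @buf, starting at
 * file offset @off. The text is generated into the sg chunk list on first
 * use; error->fit caches the last chunk read so that sequential reads do not
 * rescan the chain from the start.
 */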
835 ssize_t i915_gpu_state_copy_to_buffer(struct i915_gpu_state *error,
836 				      char *buf, loff_t off, size_t rem)
837 {
838 	struct scatterlist *sg;
839 	size_t count;
840 	loff_t pos;
841 	int err;
842 
843 	if (!error || !rem)
844 		return 0;
845 
846 	err = err_print_to_sgl(error);
847 	if (err)
848 		return err;
849 
850 	sg = READ_ONCE(error->fit);
851 	if (!sg || off < sg->dma_address)
852 		sg = error->sgl;
853 	if (!sg)
854 		return 0;
855 
856 	pos = sg->dma_address;
857 	count = 0;
858 	do {
859 		size_t len, start;
860 
861 		if (sg_is_chain(sg)) {
862 			sg = sg_chain_ptr(sg);
863 			GEM_BUG_ON(sg_is_chain(sg));
864 		}
865 
866 		len = sg->length;
867 		if (pos + len <= off) {
868 			pos += len;
869 			continue;
870 		}
871 
872 		start = sg->offset;
873 		if (pos < off) {
874 			GEM_BUG_ON(off - pos > len);
875 			len -= off - pos;
876 			start += off - pos;
877 			pos = off;
878 		}
879 
880 		len = min(len, rem);
881 		GEM_BUG_ON(!len || len > sg->length);
882 
883 		memcpy(buf, page_address(sg_page(sg)) + start, len);
884 
885 		count += len;
886 		pos += len;
887 
888 		buf += len;
889 		rem -= len;
890 		if (!rem) {
891 			WRITE_ONCE(error->fit, sg);
892 			break;
893 		}
894 	} while (!sg_is_last(sg++));
895 
896 	return count;
897 }
898 
899 static void i915_error_object_free(struct drm_i915_error_object *obj)
900 {
901 	int page;
902 
903 	if (obj == NULL)
904 		return;
905 
906 	for (page = 0; page < obj->page_count; page++)
907 		free_page((unsigned long)obj->pages[page]);
908 
909 	kfree(obj);
910 }
911 
912 
913 static void cleanup_params(struct i915_gpu_state *error)
914 {
915 	i915_params_free(&error->params);
916 }
917 
918 static void cleanup_uc_state(struct i915_gpu_state *error)
919 {
920 	struct i915_error_uc *error_uc = &error->uc;
921 
922 	kfree(error_uc->guc_fw.path);
923 	kfree(error_uc->huc_fw.path);
924 	i915_error_object_free(error_uc->guc_log);
925 }
926 
927 void __i915_gpu_state_free(struct kref *error_ref)
928 {
929 	struct i915_gpu_state *error =
930 		container_of(error_ref, typeof(*error), ref);
931 	long i;
932 
933 	while (error->engine) {
934 		struct drm_i915_error_engine *ee = error->engine;
935 
936 		error->engine = ee->next;
937 
938 		for (i = 0; i < ee->user_bo_count; i++)
939 			i915_error_object_free(ee->user_bo[i]);
940 		kfree(ee->user_bo);
941 
942 		i915_error_object_free(ee->batchbuffer);
943 		i915_error_object_free(ee->wa_batchbuffer);
944 		i915_error_object_free(ee->ringbuffer);
945 		i915_error_object_free(ee->hws_page);
946 		i915_error_object_free(ee->ctx);
947 		i915_error_object_free(ee->wa_ctx);
948 
949 		kfree(ee->requests);
950 		kfree(ee);
951 	}
952 
953 	kfree(error->overlay);
954 	kfree(error->display);
955 
956 	cleanup_params(error);
957 	cleanup_uc_state(error);
958 
959 	err_free_sgl(error->sgl);
960 	kfree(error);
961 }
962 
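/*
 * Snapshot the contents of a vma: each backing page is bound into the
 * reserved GGTT error-capture slot, mapped write-combined and then
 * compressed (or plainly copied, without CONFIG_DRM_I915_COMPRESS_ERROR)
 * into freshly allocated error pages.
 */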
963 static struct drm_i915_error_object *
964 i915_error_object_create(struct drm_i915_private *i915,
965 			 struct i915_vma *vma,
966 			 struct compress *compress)
967 {
968 	struct i915_ggtt *ggtt = &i915->ggtt;
969 	const u64 slot = ggtt->error_capture.start;
970 	struct drm_i915_error_object *dst;
971 	unsigned long num_pages;
972 	struct sgt_iter iter;
973 	dma_addr_t dma;
974 	int ret;
975 
976 	might_sleep();
977 
978 	if (!vma || !vma->pages)
979 		return NULL;
980 
981 	num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT;
982 	num_pages = DIV_ROUND_UP(10 * num_pages, 8); /* worst-case zlib growth */
983 	dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *), ALLOW_FAIL);
984 	if (!dst)
985 		return NULL;
986 
987 	if (!compress_start(compress)) {
988 		kfree(dst);
989 		return NULL;
990 	}
991 
992 	dst->gtt_offset = vma->node.start;
993 	dst->gtt_size = vma->node.size;
994 	dst->gtt_page_sizes = vma->page_sizes.gtt;
995 	dst->num_pages = num_pages;
996 	dst->page_count = 0;
997 	dst->unused = 0;
998 
999 	ret = -EINVAL;
1000 	for_each_sgt_daddr(dma, iter, vma->pages) {
1001 		void __iomem *s;
1002 
1003 		ggtt->vm.insert_page(&ggtt->vm, dma, slot, I915_CACHE_NONE, 0);
1004 
1005 		s = io_mapping_map_wc(&ggtt->iomap, slot, PAGE_SIZE);
1006 		ret = compress_page(compress, (void  __force *)s, dst);
1007 		io_mapping_unmap(s);
1008 		if (ret)
1009 			break;
1010 	}
1011 
1012 	if (ret || compress_flush(compress, dst)) {
1013 		while (dst->page_count--)
1014 			pool_free(&compress->pool, dst->pages[dst->page_count]);
1015 		kfree(dst);
1016 		dst = NULL;
1017 	}
1018 	compress_finish(compress);
1019 
1020 	return dst;
1021 }
1022 
1023 /*
1024  * Generate a semi-unique error code. The code is not meant to have meaning;
1025  * its only purpose is to try to prevent false duplicated bug reports by
1026  * grossly estimating a GPU error state.
1027  *
1028  * TODO Ideally, hashing the batchbuffer would be a very nice way to determine
1029  * the hang if we could strip the GTT offset information from it.
1030  *
1031  * It's only a small step better than a random number in its current form.
1032  */
1033 static u32 i915_error_generate_code(struct i915_gpu_state *error)
1034 {
1035 	const struct drm_i915_error_engine *ee = error->engine;
1036 
1037 	/*
1038 	 * IPEHR would be an ideal way to detect errors, as it's the gross
1039 	 * measure of "the command that hung." However, it has some very common
1040 	 * synchronization commands which almost always appear when the hang is
1041 	 * strictly a client bug. Use instdone to help differentiate those cases.
1042 	 */
1043 	return ee ? ee->ipehr ^ ee->instdone.instdone : 0;
1044 }
1045 
1046 static void gem_record_fences(struct i915_gpu_state *error)
1047 {
1048 	struct drm_i915_private *dev_priv = error->i915;
1049 	struct intel_uncore *uncore = &dev_priv->uncore;
1050 	int i;
1051 
1052 	if (INTEL_GEN(dev_priv) >= 6) {
1053 		for (i = 0; i < dev_priv->ggtt.num_fences; i++)
1054 			error->fence[i] =
1055 				intel_uncore_read64(uncore,
1056 						    FENCE_REG_GEN6_LO(i));
1057 	} else if (INTEL_GEN(dev_priv) >= 4) {
1058 		for (i = 0; i < dev_priv->ggtt.num_fences; i++)
1059 			error->fence[i] =
1060 				intel_uncore_read64(uncore,
1061 						    FENCE_REG_965_LO(i));
1062 	} else {
1063 		for (i = 0; i < dev_priv->ggtt.num_fences; i++)
1064 			error->fence[i] =
1065 				intel_uncore_read(uncore, FENCE_REG(i));
1066 	}
1067 	error->nfence = i;
1068 }
1069 
1070 static void error_record_engine_registers(struct i915_gpu_state *error,
1071 					  struct intel_engine_cs *engine,
1072 					  struct drm_i915_error_engine *ee)
1073 {
1074 	struct drm_i915_private *dev_priv = engine->i915;
1075 
1076 	if (INTEL_GEN(dev_priv) >= 6) {
1077 		ee->rc_psmi = ENGINE_READ(engine, RING_PSMI_CTL);
1078 
1079 		if (INTEL_GEN(dev_priv) >= 12)
1080 			ee->fault_reg = I915_READ(GEN12_RING_FAULT_REG);
1081 		else if (INTEL_GEN(dev_priv) >= 8)
1082 			ee->fault_reg = I915_READ(GEN8_RING_FAULT_REG);
1083 		else
1084 			ee->fault_reg = GEN6_RING_FAULT_REG_READ(engine);
1085 	}
1086 
1087 	if (INTEL_GEN(dev_priv) >= 4) {
1088 		ee->faddr = ENGINE_READ(engine, RING_DMA_FADD);
1089 		ee->ipeir = ENGINE_READ(engine, RING_IPEIR);
1090 		ee->ipehr = ENGINE_READ(engine, RING_IPEHR);
1091 		ee->instps = ENGINE_READ(engine, RING_INSTPS);
1092 		ee->bbaddr = ENGINE_READ(engine, RING_BBADDR);
1093 		if (INTEL_GEN(dev_priv) >= 8) {
1094 			ee->faddr |= (u64)ENGINE_READ(engine, RING_DMA_FADD_UDW) << 32;
1095 			ee->bbaddr |= (u64)ENGINE_READ(engine, RING_BBADDR_UDW) << 32;
1096 		}
1097 		ee->bbstate = ENGINE_READ(engine, RING_BBSTATE);
1098 	} else {
1099 		ee->faddr = ENGINE_READ(engine, DMA_FADD_I8XX);
1100 		ee->ipeir = ENGINE_READ(engine, IPEIR);
1101 		ee->ipehr = ENGINE_READ(engine, IPEHR);
1102 	}
1103 
1104 	intel_engine_get_instdone(engine, &ee->instdone);
1105 
1106 	ee->instpm = ENGINE_READ(engine, RING_INSTPM);
1107 	ee->acthd = intel_engine_get_active_head(engine);
1108 	ee->start = ENGINE_READ(engine, RING_START);
1109 	ee->head = ENGINE_READ(engine, RING_HEAD);
1110 	ee->tail = ENGINE_READ(engine, RING_TAIL);
1111 	ee->ctl = ENGINE_READ(engine, RING_CTL);
1112 	if (INTEL_GEN(dev_priv) > 2)
1113 		ee->mode = ENGINE_READ(engine, RING_MI_MODE);
1114 
1115 	if (!HWS_NEEDS_PHYSICAL(dev_priv)) {
1116 		i915_reg_t mmio;
1117 
1118 		if (IS_GEN(dev_priv, 7)) {
1119 			switch (engine->id) {
1120 			default:
1121 				MISSING_CASE(engine->id);
1122 				/* fall through */
1123 			case RCS0:
1124 				mmio = RENDER_HWS_PGA_GEN7;
1125 				break;
1126 			case BCS0:
1127 				mmio = BLT_HWS_PGA_GEN7;
1128 				break;
1129 			case VCS0:
1130 				mmio = BSD_HWS_PGA_GEN7;
1131 				break;
1132 			case VECS0:
1133 				mmio = VEBOX_HWS_PGA_GEN7;
1134 				break;
1135 			}
1136 		} else if (IS_GEN(engine->i915, 6)) {
1137 			mmio = RING_HWS_PGA_GEN6(engine->mmio_base);
1138 		} else {
1139 			/* XXX: gen8 returns to sanity */
1140 			mmio = RING_HWS_PGA(engine->mmio_base);
1141 		}
1142 
1143 		ee->hws = I915_READ(mmio);
1144 	}
1145 
1146 	ee->idle = intel_engine_is_idle(engine);
1147 	if (!ee->idle)
1148 		ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
1149 	ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error,
1150 						  engine);
1151 
1152 	if (HAS_PPGTT(dev_priv)) {
1153 		int i;
1154 
1155 		ee->vm_info.gfx_mode = ENGINE_READ(engine, RING_MODE_GEN7);
1156 
1157 		if (IS_GEN(dev_priv, 6)) {
1158 			ee->vm_info.pp_dir_base =
1159 				ENGINE_READ(engine, RING_PP_DIR_BASE_READ);
1160 		} else if (IS_GEN(dev_priv, 7)) {
1161 			ee->vm_info.pp_dir_base =
1162 				ENGINE_READ(engine, RING_PP_DIR_BASE);
1163 		} else if (INTEL_GEN(dev_priv) >= 8) {
1164 			u32 base = engine->mmio_base;
1165 
1166 			for (i = 0; i < 4; i++) {
1167 				ee->vm_info.pdp[i] =
1168 					I915_READ(GEN8_RING_PDP_UDW(base, i));
1169 				ee->vm_info.pdp[i] <<= 32;
1170 				ee->vm_info.pdp[i] |=
1171 					I915_READ(GEN8_RING_PDP_LDW(base, i));
1172 			}
1173 		}
1174 	}
1175 }
1176 
1177 static void record_request(const struct i915_request *request,
1178 			   struct drm_i915_error_request *erq)
1179 {
1180 	const struct i915_gem_context *ctx = request->gem_context;
1181 
1182 	erq->flags = request->fence.flags;
1183 	erq->context = request->fence.context;
1184 	erq->seqno = request->fence.seqno;
1185 	erq->sched_attr = request->sched.attr;
1186 	erq->jiffies = request->emitted_jiffies;
1187 	erq->start = i915_ggtt_offset(request->ring->vma);
1188 	erq->head = request->head;
1189 	erq->tail = request->tail;
1190 
1191 	rcu_read_lock();
1192 	erq->pid = ctx->pid ? pid_nr(ctx->pid) : 0;
1193 	rcu_read_unlock();
1194 }
1195 
1196 static void engine_record_requests(struct intel_engine_cs *engine,
1197 				   struct i915_request *first,
1198 				   struct drm_i915_error_engine *ee)
1199 {
1200 	struct i915_request *request;
1201 	int count;
1202 
1203 	count = 0;
1204 	request = first;
1205 	list_for_each_entry_from(request, &engine->active.requests, sched.link)
1206 		count++;
1207 	if (!count)
1208 		return;
1209 
1210 	ee->requests = kcalloc(count, sizeof(*ee->requests), ATOMIC_MAYFAIL);
1211 	if (!ee->requests)
1212 		return;
1213 
1214 	ee->num_requests = count;
1215 
1216 	count = 0;
1217 	request = first;
1218 	list_for_each_entry_from(request,
1219 				 &engine->active.requests, sched.link) {
1220 		if (count >= ee->num_requests) {
1221 			/*
1222 			 * If the ring request list was changed in
1223 			 * between the point where the error request
1224 			 * list was created and dimensioned and this
1225 			 * point then just exit early to avoid crashes.
1226 			 *
1227 			 * We don't need to communicate that the
1228 			 * request list changed state during error
1229 			 * state capture and that the error state is
1230 			 * slightly incorrect as a consequence since we
1231 			 * are typically only interested in the request
1232 			 * list state at the point of error state
1233 			 * capture, not in any changes happening during
1234 			 * the capture.
1235 			 */
1236 			break;
1237 		}
1238 
1239 		record_request(request, &ee->requests[count++]);
1240 	}
1241 	ee->num_requests = count;
1242 }
1243 
1244 static void error_record_engine_execlists(const struct intel_engine_cs *engine,
1245 					  struct drm_i915_error_engine *ee)
1246 {
1247 	const struct intel_engine_execlists * const execlists = &engine->execlists;
1248 	struct i915_request * const *port = execlists->active;
1249 	unsigned int n = 0;
1250 
1251 	while (*port)
1252 		record_request(*port++, &ee->execlist[n++]);
1253 
1254 	ee->num_ports = n;
1255 }
1256 
1257 static bool record_context(struct drm_i915_error_context *e,
1258 			   const struct i915_request *rq)
1259 {
1260 	const struct i915_gem_context *ctx = rq->gem_context;
1261 
1262 	if (ctx->pid) {
1263 		struct task_struct *task;
1264 
1265 		rcu_read_lock();
1266 		task = pid_task(ctx->pid, PIDTYPE_PID);
1267 		if (task) {
1268 			strcpy(e->comm, task->comm);
1269 			e->pid = task->pid;
1270 		}
1271 		rcu_read_unlock();
1272 	}
1273 
1274 	e->sched_attr = ctx->sched;
1275 	e->guilty = atomic_read(&ctx->guilty_count);
1276 	e->active = atomic_read(&ctx->active_count);
1277 
1278 	return i915_gem_context_no_error_capture(ctx);
1279 }
1280 
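/*
 * Capture runs in two phases: while holding the engine lock we only pin a
 * reference to each vma of interest (provided it is still active), and the
 * actual page copies are made later in gem_record_rings() once the spinlock
 * has been released and we are allowed to sleep.
 */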
1281 struct capture_vma {
1282 	struct capture_vma *next;
1283 	void **slot;
1284 };
1285 
1286 static struct capture_vma *
1287 capture_vma(struct capture_vma *next,
1288 	    struct i915_vma *vma,
1289 	    struct drm_i915_error_object **out)
1290 {
1291 	struct capture_vma *c;
1292 
1293 	*out = NULL;
1294 	if (!vma)
1295 		return next;
1296 
1297 	c = kmalloc(sizeof(*c), ATOMIC_MAYFAIL);
1298 	if (!c)
1299 		return next;
1300 
1301 	if (!i915_active_acquire_if_busy(&vma->active)) {
1302 		kfree(c);
1303 		return next;
1304 	}
1305 
1306 	c->slot = (void **)out;
1307 	*c->slot = i915_vma_get(vma);
1308 
1309 	c->next = next;
1310 	return c;
1311 }
1312 
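/*
 * Record the user buffers attached to this request's capture list (objects
 * userspace flagged for inclusion in the error dump), limiting the array to
 * a single page of pointers if the full allocation fails.
 */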
1313 static struct capture_vma *
1314 request_record_user_bo(struct i915_request *request,
1315 		       struct drm_i915_error_engine *ee,
1316 		       struct capture_vma *capture)
1317 {
1318 	struct i915_capture_list *c;
1319 	struct drm_i915_error_object **bo;
1320 	long count, max;
1321 
1322 	max = 0;
1323 	for (c = request->capture_list; c; c = c->next)
1324 		max++;
1325 	if (!max)
1326 		return capture;
1327 
1328 	bo = kmalloc_array(max, sizeof(*bo), ATOMIC_MAYFAIL);
1329 	if (!bo) {
1330 		/* If we can't capture everything, try to capture something. */
1331 		max = min_t(long, max, PAGE_SIZE / sizeof(*bo));
1332 		bo = kmalloc_array(max, sizeof(*bo), ATOMIC_MAYFAIL);
1333 	}
1334 	if (!bo)
1335 		return capture;
1336 
1337 	count = 0;
1338 	for (c = request->capture_list; c; c = c->next) {
1339 		capture = capture_vma(capture, c->vma, &bo[count]);
1340 		if (++count == max)
1341 			break;
1342 	}
1343 
1344 	ee->user_bo = bo;
1345 	ee->user_bo_count = count;
1346 
1347 	return capture;
1348 }
1349 
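/*
 * Wrap a bare GEM object (one without a vma of its own, such as the engine's
 * default context state) in a temporary on-stack vma so that
 * i915_error_object_create() can read and compress its pages.
 */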
1350 static struct drm_i915_error_object *
1351 capture_object(struct drm_i915_private *dev_priv,
1352 	       struct drm_i915_gem_object *obj,
1353 	       struct compress *compress)
1354 {
1355 	if (obj && i915_gem_object_has_pages(obj)) {
1356 		struct i915_vma fake = {
1357 			.node = { .start = U64_MAX, .size = obj->base.size },
1358 			.size = obj->base.size,
1359 			.pages = obj->mm.pages,
1360 			.obj = obj,
1361 		};
1362 
1363 		return i915_error_object_create(dev_priv, &fake, compress);
1364 	} else {
1365 		return NULL;
1366 	}
1367 }
1368 
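/*
 * For each uabi engine, find the active request under the engine lock, record
 * its context and in-flight requests, and pin the vmas to be dumped; the
 * register snapshot and the vma contents are copied once the lock is dropped.
 */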
1369 static void
1370 gem_record_rings(struct i915_gpu_state *error, struct compress *compress)
1371 {
1372 	struct drm_i915_private *i915 = error->i915;
1373 	struct intel_engine_cs *engine;
1374 	struct drm_i915_error_engine *ee;
1375 
1376 	ee = kzalloc(sizeof(*ee), GFP_KERNEL);
1377 	if (!ee)
1378 		return;
1379 
1380 	for_each_uabi_engine(engine, i915) {
1381 		struct capture_vma *capture = NULL;
1382 		struct i915_request *request;
1383 		unsigned long flags;
1384 
1385 		/* Refill our page pool before entering atomic section */
1386 		pool_refill(&compress->pool, ALLOW_FAIL);
1387 
1388 		spin_lock_irqsave(&engine->active.lock, flags);
1389 		request = intel_engine_find_active_request(engine);
1390 		if (!request) {
1391 			spin_unlock_irqrestore(&engine->active.lock, flags);
1392 			continue;
1393 		}
1394 
1395 		error->simulated |= record_context(&ee->context, request);
1396 
1397 		/*
1398 		 * We need to copy these to an anonymous buffer
1399 		 * as the simplest method to avoid being overwritten
1400 		 * by userspace.
1401 		 */
1402 		capture = capture_vma(capture,
1403 				      request->batch,
1404 				      &ee->batchbuffer);
1405 
1406 		if (HAS_BROKEN_CS_TLB(i915))
1407 			capture = capture_vma(capture,
1408 					      engine->gt->scratch,
1409 					      &ee->wa_batchbuffer);
1410 
1411 		capture = request_record_user_bo(request, ee, capture);
1412 
1413 		capture = capture_vma(capture,
1414 				      request->hw_context->state,
1415 				      &ee->ctx);
1416 
1417 		capture = capture_vma(capture,
1418 				      request->ring->vma,
1419 				      &ee->ringbuffer);
1420 
1421 		ee->cpu_ring_head = request->ring->head;
1422 		ee->cpu_ring_tail = request->ring->tail;
1423 
1424 		ee->rq_head = request->head;
1425 		ee->rq_post = request->postfix;
1426 		ee->rq_tail = request->tail;
1427 
1428 		engine_record_requests(engine, request, ee);
1429 		spin_unlock_irqrestore(&engine->active.lock, flags);
1430 
1431 		error_record_engine_registers(error, engine, ee);
1432 		error_record_engine_execlists(engine, ee);
1433 
1434 		while (capture) {
1435 			struct capture_vma *this = capture;
1436 			struct i915_vma *vma = *this->slot;
1437 
1438 			*this->slot =
1439 				i915_error_object_create(i915, vma, compress);
1440 
1441 			i915_active_release(&vma->active);
1442 			i915_vma_put(vma);
1443 
1444 			capture = this->next;
1445 			kfree(this);
1446 		}
1447 
1448 		ee->hws_page =
1449 			i915_error_object_create(i915,
1450 						 engine->status_page.vma,
1451 						 compress);
1452 
1453 		ee->wa_ctx =
1454 			i915_error_object_create(i915,
1455 						 engine->wa_ctx.vma,
1456 						 compress);
1457 
1458 		ee->default_state =
1459 			capture_object(i915, engine->default_state, compress);
1460 
1461 		ee->engine = engine;
1462 
1463 		ee->next = error->engine;
1464 		error->engine = ee;
1465 
1466 		ee = kzalloc(sizeof(*ee), GFP_KERNEL);
1467 		if (!ee)
1468 			return;
1469 	}
1470 
1471 	kfree(ee);
1472 }
1473 
1474 static void
1475 capture_uc_state(struct i915_gpu_state *error, struct compress *compress)
1476 {
1477 	struct drm_i915_private *i915 = error->i915;
1478 	struct i915_error_uc *error_uc = &error->uc;
1479 	struct intel_uc *uc = &i915->gt.uc;
1480 
1481 	/* Capturing uC state won't be useful if there is no GuC */
1482 	if (!error->device_info.has_gt_uc)
1483 		return;
1484 
1485 	memcpy(&error_uc->guc_fw, &uc->guc.fw, sizeof(uc->guc.fw));
1486 	memcpy(&error_uc->huc_fw, &uc->huc.fw, sizeof(uc->huc.fw));
1487 
1488 	/* Non-default firmware paths will be specified by the modparam.
1489 	 * As modparams are generally accessible from userspace, make
1490 	 * explicit copies of the firmware paths.
1491 	 */
1492 	error_uc->guc_fw.path = kstrdup(uc->guc.fw.path, ALLOW_FAIL);
1493 	error_uc->huc_fw.path = kstrdup(uc->huc.fw.path, ALLOW_FAIL);
1494 	error_uc->guc_log = i915_error_object_create(i915,
1495 						     uc->guc.log.vma,
1496 						     compress);
1497 }
1498 
1499 /* Capture all registers which don't fit into another category. */
1500 static void capture_reg_state(struct i915_gpu_state *error)
1501 {
1502 	struct drm_i915_private *i915 = error->i915;
1503 	struct intel_uncore *uncore = &i915->uncore;
1504 	int i;
1505 
1506 	/* General organization
1507 	 * 1. Registers specific to a single generation
1508 	 * 2. Registers which belong to multiple generations
1509 	 * 3. Feature-specific registers
1510 	 * 4. Everything else
1511 	 * Please try to follow the order.
1512 	 */
1513 
1514 	/* 1: Registers specific to a single generation */
1515 	if (IS_VALLEYVIEW(i915)) {
1516 		error->gtier[0] = intel_uncore_read(uncore, GTIER);
1517 		error->ier = intel_uncore_read(uncore, VLV_IER);
1518 		error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_VLV);
1519 	}
1520 
1521 	if (IS_GEN(i915, 7))
1522 		error->err_int = intel_uncore_read(uncore, GEN7_ERR_INT);
1523 
1524 	if (INTEL_GEN(i915) >= 12) {
1525 		error->fault_data0 = intel_uncore_read(uncore,
1526 						       GEN12_FAULT_TLB_DATA0);
1527 		error->fault_data1 = intel_uncore_read(uncore,
1528 						       GEN12_FAULT_TLB_DATA1);
1529 	} else if (INTEL_GEN(i915) >= 8) {
1530 		error->fault_data0 = intel_uncore_read(uncore,
1531 						       GEN8_FAULT_TLB_DATA0);
1532 		error->fault_data1 = intel_uncore_read(uncore,
1533 						       GEN8_FAULT_TLB_DATA1);
1534 	}
1535 
1536 	if (IS_GEN(i915, 6)) {
1537 		error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE);
1538 		error->gab_ctl = intel_uncore_read(uncore, GAB_CTL);
1539 		error->gfx_mode = intel_uncore_read(uncore, GFX_MODE);
1540 	}
1541 
1542 	/* 2: Registers which belong to multiple generations */
1543 	if (INTEL_GEN(i915) >= 7)
1544 		error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_MT);
1545 
1546 	if (INTEL_GEN(i915) >= 6) {
1547 		error->derrmr = intel_uncore_read(uncore, DERRMR);
1548 		if (INTEL_GEN(i915) < 12) {
1549 			error->error = intel_uncore_read(uncore, ERROR_GEN6);
1550 			error->done_reg = intel_uncore_read(uncore, DONE_REG);
1551 		}
1552 	}
1553 
1554 	if (INTEL_GEN(i915) >= 5)
1555 		error->ccid = intel_uncore_read(uncore, CCID(RENDER_RING_BASE));
1556 
1557 	/* 3: Feature-specific registers */
1558 	if (IS_GEN_RANGE(i915, 6, 7)) {
1559 		error->gam_ecochk = intel_uncore_read(uncore, GAM_ECOCHK);
1560 		error->gac_eco = intel_uncore_read(uncore, GAC_ECO_BITS);
1561 	}
1562 
1563 	if (IS_GEN_RANGE(i915, 8, 11))
1564 		error->gtt_cache = intel_uncore_read(uncore, HSW_GTT_CACHE_EN);
1565 
1566 	/* 4: Everything else */
1567 	if (INTEL_GEN(i915) >= 11) {
1568 		error->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER);
1569 		error->gtier[0] =
1570 			intel_uncore_read(uncore,
1571 					  GEN11_RENDER_COPY_INTR_ENABLE);
1572 		error->gtier[1] =
1573 			intel_uncore_read(uncore, GEN11_VCS_VECS_INTR_ENABLE);
1574 		error->gtier[2] =
1575 			intel_uncore_read(uncore, GEN11_GUC_SG_INTR_ENABLE);
1576 		error->gtier[3] =
1577 			intel_uncore_read(uncore,
1578 					  GEN11_GPM_WGBOXPERF_INTR_ENABLE);
1579 		error->gtier[4] =
1580 			intel_uncore_read(uncore,
1581 					  GEN11_CRYPTO_RSVD_INTR_ENABLE);
1582 		error->gtier[5] =
1583 			intel_uncore_read(uncore,
1584 					  GEN11_GUNIT_CSME_INTR_ENABLE);
1585 		error->ngtier = 6;
1586 	} else if (INTEL_GEN(i915) >= 8) {
1587 		error->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER);
1588 		for (i = 0; i < 4; i++)
1589 			error->gtier[i] = intel_uncore_read(uncore,
1590 							    GEN8_GT_IER(i));
1591 		error->ngtier = 4;
1592 	} else if (HAS_PCH_SPLIT(i915)) {
1593 		error->ier = intel_uncore_read(uncore, DEIER);
1594 		error->gtier[0] = intel_uncore_read(uncore, GTIER);
1595 		error->ngtier = 1;
1596 	} else if (IS_GEN(i915, 2)) {
1597 		error->ier = intel_uncore_read16(uncore, GEN2_IER);
1598 	} else if (!IS_VALLEYVIEW(i915)) {
1599 		error->ier = intel_uncore_read(uncore, GEN2_IER);
1600 	}
1601 	error->eir = intel_uncore_read(uncore, EIR);
1602 	error->pgtbl_er = intel_uncore_read(uncore, PGTBL_ER);
1603 }
1604 
1605 static const char *
1606 error_msg(struct i915_gpu_state *error,
1607 	  intel_engine_mask_t engines, const char *msg)
1608 {
1609 	int len;
1610 
1611 	len = scnprintf(error->error_msg, sizeof(error->error_msg),
1612 			"GPU HANG: ecode %d:%x:0x%08x",
1613 			INTEL_GEN(error->i915), engines,
1614 			i915_error_generate_code(error));
1615 	if (error->engine) {
1616 		/* Just show the first executing process; more is confusing */
1617 		len += scnprintf(error->error_msg + len,
1618 				 sizeof(error->error_msg) - len,
1619 				 ", in %s [%d]",
1620 				 error->engine->context.comm,
1621 				 error->engine->context.pid);
1622 	}
1623 	if (msg)
1624 		len += scnprintf(error->error_msg + len,
1625 				 sizeof(error->error_msg) - len,
1626 				 ", %s", msg);
1627 
1628 	return error->error_msg;
1629 }
1630 
1631 static void capture_gen_state(struct i915_gpu_state *error)
1632 {
1633 	struct drm_i915_private *i915 = error->i915;
1634 
1635 	error->awake = i915->gt.awake;
1636 	error->wakelock = atomic_read(&i915->runtime_pm.wakeref_count);
1637 	error->suspended = i915->runtime_pm.suspended;
1638 
1639 	error->iommu = -1;
1640 #ifdef CONFIG_INTEL_IOMMU
1641 	error->iommu = intel_iommu_gfx_mapped;
1642 #endif
1643 	error->reset_count = i915_reset_count(&i915->gpu_error);
1644 	error->suspend_count = i915->suspend_count;
1645 
1646 	memcpy(&error->device_info,
1647 	       INTEL_INFO(i915),
1648 	       sizeof(error->device_info));
1649 	memcpy(&error->runtime_info,
1650 	       RUNTIME_INFO(i915),
1651 	       sizeof(error->runtime_info));
1652 	error->driver_caps = i915->caps;
1653 }
1654 
1655 static void capture_params(struct i915_gpu_state *error)
1656 {
1657 	i915_params_copy(&error->params, &i915_modparams);
1658 }
1659 
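/*
 * The epoch is the oldest hangcheck timestamp amongst the captured engines
 * (defaulting to the capture time itself); request and hangcheck times in
 * the dump are reported relative to it.
 */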
1660 static unsigned long capture_find_epoch(const struct i915_gpu_state *error)
1661 {
1662 	const struct drm_i915_error_engine *ee;
1663 	unsigned long epoch = error->capture;
1664 
1665 	for (ee = error->engine; ee; ee = ee->next) {
1666 		if (ee->hangcheck_timestamp &&
1667 		    time_before(ee->hangcheck_timestamp, epoch))
1668 			epoch = ee->hangcheck_timestamp;
1669 	}
1670 
1671 	return epoch;
1672 }
1673 
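/* Clear the reserved GGTT slot used while copying objects for this capture. */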
1674 static void capture_finish(struct i915_gpu_state *error)
1675 {
1676 	struct i915_ggtt *ggtt = &error->i915->ggtt;
1677 	const u64 slot = ggtt->error_capture.start;
1678 
1679 	ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE);
1680 }
1681 
1682 #define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x))
1683 
1684 struct i915_gpu_state *
1685 i915_capture_gpu_state(struct drm_i915_private *i915)
1686 {
1687 	struct i915_gpu_state *error;
1688 	struct compress compress;
1689 
1690 	/* Check if GPU capture has been disabled */
1691 	error = READ_ONCE(i915->gpu_error.first_error);
1692 	if (IS_ERR(error))
1693 		return error;
1694 
1695 	error = kzalloc(sizeof(*error), ALLOW_FAIL);
1696 	if (!error) {
1697 		i915_disable_error_state(i915, -ENOMEM);
1698 		return ERR_PTR(-ENOMEM);
1699 	}
1700 
1701 	if (!compress_init(&compress)) {
1702 		kfree(error);
1703 		i915_disable_error_state(i915, -ENOMEM);
1704 		return ERR_PTR(-ENOMEM);
1705 	}
1706 
1707 	kref_init(&error->ref);
1708 	error->i915 = i915;
1709 
1710 	error->time = ktime_get_real();
1711 	error->boottime = ktime_get_boottime();
1712 	error->uptime = ktime_sub(ktime_get(), i915->gt.last_init_time);
1713 	error->capture = jiffies;
1714 
1715 	capture_params(error);
1716 	capture_gen_state(error);
1717 	capture_uc_state(error, &compress);
1718 	capture_reg_state(error);
1719 	gem_record_fences(error);
1720 	gem_record_rings(error, &compress);
1721 
1722 	error->overlay = intel_overlay_capture_error_state(i915);
1723 	error->display = intel_display_capture_error_state(i915);
1724 
1725 	error->epoch = capture_find_epoch(error);
1726 
1727 	capture_finish(error);
1728 	compress_fini(&compress);
1729 
1730 	return error;
1731 }
1732 
1733 /**
1734  * i915_capture_error_state - capture an error record for later analysis
1735  * @i915: i915 device
1736  * @engine_mask: the mask of engines triggering the hang
1737  * @msg: a message to insert into the error capture header
1738  *
1739  * Should be called when an error is detected (either a hang or an error
1740  * interrupt) to capture error state from the time of the error.  Fills
1741  * out a structure which becomes available in debugfs for user level tools
1742  * to pick up.
1743  */
1744 void i915_capture_error_state(struct drm_i915_private *i915,
1745 			      intel_engine_mask_t engine_mask,
1746 			      const char *msg)
1747 {
1748 	static bool warned;
1749 	struct i915_gpu_state *error;
1750 	unsigned long flags;
1751 
1752 	if (!i915_modparams.error_capture)
1753 		return;
1754 
1755 	if (READ_ONCE(i915->gpu_error.first_error))
1756 		return;
1757 
1758 	error = i915_capture_gpu_state(i915);
1759 	if (IS_ERR(error))
1760 		return;
1761 
1762 	dev_info(i915->drm.dev, "%s\n", error_msg(error, engine_mask, msg));
1763 
1764 	if (!error->simulated) {
1765 		spin_lock_irqsave(&i915->gpu_error.lock, flags);
1766 		if (!i915->gpu_error.first_error) {
1767 			i915->gpu_error.first_error = error;
1768 			error = NULL;
1769 		}
1770 		spin_unlock_irqrestore(&i915->gpu_error.lock, flags);
1771 	}
1772 
1773 	if (error) {
1774 		__i915_gpu_state_free(&error->ref);
1775 		return;
1776 	}
1777 
1778 	if (!xchg(&warned, true) &&
1779 	    ktime_get_real_seconds() - DRIVER_TIMESTAMP < DAY_AS_SECONDS(180)) {
1780 		pr_info("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n");
1781 		pr_info("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n");
1782 		pr_info("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n");
1783 		pr_info("The GPU crash dump is required to analyze GPU hangs, so please always attach it.\n");
1784 		pr_info("GPU crash dump saved to /sys/class/drm/card%d/error\n",
1785 			i915->drm.primary->index);
1786 	}
1787 }
1788 
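/*
 * Grab a reference to the currently stored error state, if any. Note this may
 * also return the ERR_PTR sentinel (without taking a reference) when error
 * capture has been disabled.
 */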
1789 struct i915_gpu_state *
1790 i915_first_error_state(struct drm_i915_private *i915)
1791 {
1792 	struct i915_gpu_state *error;
1793 
1794 	spin_lock_irq(&i915->gpu_error.lock);
1795 	error = i915->gpu_error.first_error;
1796 	if (!IS_ERR_OR_NULL(error))
1797 		i915_gpu_state_get(error);
1798 	spin_unlock_irq(&i915->gpu_error.lock);
1799 
1800 	return error;
1801 }
1802 
1803 void i915_reset_error_state(struct drm_i915_private *i915)
1804 {
1805 	struct i915_gpu_state *error;
1806 
1807 	spin_lock_irq(&i915->gpu_error.lock);
1808 	error = i915->gpu_error.first_error;
1809 	if (error != ERR_PTR(-ENODEV)) /* if disabled, always disabled */
1810 		i915->gpu_error.first_error = NULL;
1811 	spin_unlock_irq(&i915->gpu_error.lock);
1812 
1813 	if (!IS_ERR_OR_NULL(error))
1814 		i915_gpu_state_put(error);
1815 }
1816 
1817 void i915_disable_error_state(struct drm_i915_private *i915, int err)
1818 {
1819 	spin_lock_irq(&i915->gpu_error.lock);
1820 	if (!i915->gpu_error.first_error)
1821 		i915->gpu_error.first_error = ERR_PTR(err);
1822 	spin_unlock_irq(&i915->gpu_error.lock);
1823 }
1824