xref: /openbmc/linux/drivers/gpu/drm/i915/gt/selftest_hangcheck.c (revision b0e55fef624e511e060fa05e4ca96cae6d902f04)
1 /*
2  * Copyright © 2016 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24 
25 #include <linux/kthread.h>
26 
27 #include "gem/i915_gem_context.h"
28 #include "gt/intel_gt.h"
29 #include "intel_engine_pm.h"
30 
31 #include "i915_selftest.h"
32 #include "selftests/i915_random.h"
33 #include "selftests/igt_flush_test.h"
34 #include "selftests/igt_reset.h"
35 #include "selftests/igt_atomic.h"
36 
37 #include "selftests/mock_drm.h"
38 
39 #include "gem/selftests/mock_context.h"
40 #include "gem/selftests/igt_gem_utils.h"
41 
42 #define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */
43 
/*
 * State shared by the hang tests: the context used for submission plus the
 * two objects (seqno page and batch buffer) and their CPU mappings.
 */
struct hang {
	/* GT under test, recorded by hang_init() */
	struct intel_gt *gt;
	/* Page the batch writes its request seqno into */
	struct drm_i915_gem_object *hws;
	/* Current batch object; replaced by every hang_create_request() */
	struct drm_i915_gem_object *obj;
	/* Kernel context the requests are submitted from */
	struct i915_gem_context *ctx;
	/* CPU mapping of hws */
	u32 *seqno;
	/* CPU mapping of obj */
	u32 *batch;
};
52 
53 static int hang_init(struct hang *h, struct intel_gt *gt)
54 {
55 	void *vaddr;
56 	int err;
57 
58 	memset(h, 0, sizeof(*h));
59 	h->gt = gt;
60 
61 	h->ctx = kernel_context(gt->i915);
62 	if (IS_ERR(h->ctx))
63 		return PTR_ERR(h->ctx);
64 
65 	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));
66 
67 	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
68 	if (IS_ERR(h->hws)) {
69 		err = PTR_ERR(h->hws);
70 		goto err_ctx;
71 	}
72 
73 	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
74 	if (IS_ERR(h->obj)) {
75 		err = PTR_ERR(h->obj);
76 		goto err_hws;
77 	}
78 
79 	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
80 	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
81 	if (IS_ERR(vaddr)) {
82 		err = PTR_ERR(vaddr);
83 		goto err_obj;
84 	}
85 	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);
86 
87 	vaddr = i915_gem_object_pin_map(h->obj,
88 					i915_coherent_map_type(gt->i915));
89 	if (IS_ERR(vaddr)) {
90 		err = PTR_ERR(vaddr);
91 		goto err_unpin_hws;
92 	}
93 	h->batch = vaddr;
94 
95 	return 0;
96 
97 err_unpin_hws:
98 	i915_gem_object_unpin_map(h->hws);
99 err_obj:
100 	i915_gem_object_put(h->obj);
101 err_hws:
102 	i915_gem_object_put(h->hws);
103 err_ctx:
104 	kernel_context_close(h->ctx);
105 	return err;
106 }
107 
108 static u64 hws_address(const struct i915_vma *hws,
109 		       const struct i915_request *rq)
110 {
111 	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
112 }
113 
114 static int move_to_active(struct i915_vma *vma,
115 			  struct i915_request *rq,
116 			  unsigned int flags)
117 {
118 	int err;
119 
120 	i915_vma_lock(vma);
121 	err = i915_request_await_object(rq, vma->obj,
122 					flags & EXEC_OBJECT_WRITE);
123 	if (err == 0)
124 		err = i915_vma_move_to_active(vma, rq, flags);
125 	i915_vma_unlock(vma);
126 
127 	return err;
128 }
129 
/*
 * Build a request on @engine whose batch stores the request's seqno into the
 * hws page and then branches back to its own start, so that it spins until
 * the first dword of the batch is rewritten (see hang_fini()).
 *
 * A fresh batch object is allocated on every call and replaces h->obj; the
 * previous batch object is unmapped and released.
 *
 * Returns the request, not yet added (the caller is expected to call
 * i915_request_add()), or an ERR_PTR() on failure. The vm reference taken at
 * entry is dropped on every path.
 */
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct intel_gt *gt = h->gt;
	struct i915_address_space *vm = i915_gem_context_get_vm_rcu(h->ctx);
	struct drm_i915_gem_object *obj;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	void *vaddr;
	u32 *batch;
	int err;

	/* Allocate and map the replacement batch before dropping the old one */
	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(obj)) {
		i915_vm_put(vm);
		return ERR_CAST(obj);
	}

	vaddr = i915_gem_object_pin_map(obj, i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		i915_gem_object_put(obj);
		i915_vm_put(vm);
		return ERR_CAST(vaddr);
	}

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	/* From here on the new object is owned by @h and freed by hang_fini() */
	h->obj = obj;
	h->batch = vaddr;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma)) {
		i915_vm_put(vm);
		return ERR_CAST(vma);
	}

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws)) {
		i915_vm_put(vm);
		return ERR_CAST(hws);
	}

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err) {
		i915_vm_put(vm);
		return ERR_PTR(err);
	}

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = move_to_active(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = move_to_active(hws, rq, 0);
	if (err)
		goto cancel_rq;

	/*
	 * Every variant below has the same shape: store the seqno to the hws
	 * slot, 1024 bytes of zeroes bracketed by MI_ARB_CHECK, then a
	 * MI_BATCH_BUFFER_START jumping back to the start of this batch. Only
	 * the command encodings and address widths differ per generation.
	 */
	batch = h->batch;
	if (INTEL_GEN(gt->i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	intel_gt_chipset_flush(engine->gt);

	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	/* Gen5 and earlier dispatch this batch with I915_DISPATCH_SECURE */
	flags = 0;
	if (INTEL_GEN(gt->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
	/* On any error after allocation, skip the request and still add it */
	if (err) {
		i915_request_skip(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	i915_vm_put(vm);
	return err ? ERR_PTR(err) : rq;
}
279 
280 static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
281 {
282 	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
283 }
284 
285 static void hang_fini(struct hang *h)
286 {
287 	*h->batch = MI_BATCH_BUFFER_END;
288 	intel_gt_chipset_flush(h->gt);
289 
290 	i915_gem_object_unpin_map(h->obj);
291 	i915_gem_object_put(h->obj);
292 
293 	i915_gem_object_unpin_map(h->hws);
294 	i915_gem_object_put(h->hws);
295 
296 	kernel_context_close(h->ctx);
297 
298 	igt_flush_test(h->gt->i915);
299 }
300 
301 static bool wait_until_running(struct hang *h, struct i915_request *rq)
302 {
303 	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
304 					       rq->fence.seqno),
305 			     10) &&
306 		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
307 					    rq->fence.seqno),
308 			  1000));
309 }
310 
311 static int igt_hang_sanitycheck(void *arg)
312 {
313 	struct intel_gt *gt = arg;
314 	struct i915_request *rq;
315 	struct intel_engine_cs *engine;
316 	enum intel_engine_id id;
317 	struct hang h;
318 	int err;
319 
320 	/* Basic check that we can execute our hanging batch */
321 
322 	err = hang_init(&h, gt);
323 	if (err)
324 		return err;
325 
326 	for_each_engine(engine, gt, id) {
327 		struct intel_wedge_me w;
328 		long timeout;
329 
330 		if (!intel_engine_can_store_dword(engine))
331 			continue;
332 
333 		rq = hang_create_request(&h, engine);
334 		if (IS_ERR(rq)) {
335 			err = PTR_ERR(rq);
336 			pr_err("Failed to create request for %s, err=%d\n",
337 			       engine->name, err);
338 			goto fini;
339 		}
340 
341 		i915_request_get(rq);
342 
343 		*h.batch = MI_BATCH_BUFFER_END;
344 		intel_gt_chipset_flush(engine->gt);
345 
346 		i915_request_add(rq);
347 
348 		timeout = 0;
349 		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
350 			timeout = i915_request_wait(rq, 0,
351 						    MAX_SCHEDULE_TIMEOUT);
352 		if (intel_gt_is_wedged(gt))
353 			timeout = -EIO;
354 
355 		i915_request_put(rq);
356 
357 		if (timeout < 0) {
358 			err = timeout;
359 			pr_err("Wait for request failed on %s, err=%d\n",
360 			       engine->name, err);
361 			goto fini;
362 		}
363 	}
364 
365 fini:
366 	hang_fini(&h);
367 	return err;
368 }
369 
370 static bool wait_for_idle(struct intel_engine_cs *engine)
371 {
372 	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
373 }
374 
375 static int igt_reset_nop(void *arg)
376 {
377 	struct intel_gt *gt = arg;
378 	struct i915_gpu_error *global = &gt->i915->gpu_error;
379 	struct intel_engine_cs *engine;
380 	struct i915_gem_context *ctx;
381 	unsigned int reset_count, count;
382 	enum intel_engine_id id;
383 	struct drm_file *file;
384 	IGT_TIMEOUT(end_time);
385 	int err = 0;
386 
387 	/* Check that we can reset during non-user portions of requests */
388 
389 	file = mock_file(gt->i915);
390 	if (IS_ERR(file))
391 		return PTR_ERR(file);
392 
393 	ctx = live_context(gt->i915, file);
394 	if (IS_ERR(ctx)) {
395 		err = PTR_ERR(ctx);
396 		goto out;
397 	}
398 
399 	i915_gem_context_clear_bannable(ctx);
400 	reset_count = i915_reset_count(global);
401 	count = 0;
402 	do {
403 		for_each_engine(engine, gt, id) {
404 			int i;
405 
406 			for (i = 0; i < 16; i++) {
407 				struct i915_request *rq;
408 
409 				rq = igt_request_alloc(ctx, engine);
410 				if (IS_ERR(rq)) {
411 					err = PTR_ERR(rq);
412 					break;
413 				}
414 
415 				i915_request_add(rq);
416 			}
417 		}
418 
419 		igt_global_reset_lock(gt);
420 		intel_gt_reset(gt, ALL_ENGINES, NULL);
421 		igt_global_reset_unlock(gt);
422 
423 		if (intel_gt_is_wedged(gt)) {
424 			err = -EIO;
425 			break;
426 		}
427 
428 		if (i915_reset_count(global) != reset_count + ++count) {
429 			pr_err("Full GPU reset not recorded!\n");
430 			err = -EINVAL;
431 			break;
432 		}
433 
434 		err = igt_flush_test(gt->i915);
435 		if (err)
436 			break;
437 	} while (time_before(jiffies, end_time));
438 	pr_info("%s: %d resets\n", __func__, count);
439 
440 	err = igt_flush_test(gt->i915);
441 out:
442 	mock_file_free(gt->i915, file);
443 	if (intel_gt_is_wedged(gt))
444 		err = -EIO;
445 	return err;
446 }
447 
448 static int igt_reset_nop_engine(void *arg)
449 {
450 	struct intel_gt *gt = arg;
451 	struct i915_gpu_error *global = &gt->i915->gpu_error;
452 	struct intel_engine_cs *engine;
453 	struct i915_gem_context *ctx;
454 	enum intel_engine_id id;
455 	struct drm_file *file;
456 	int err = 0;
457 
458 	/* Check that we can engine-reset during non-user portions */
459 
460 	if (!intel_has_reset_engine(gt))
461 		return 0;
462 
463 	file = mock_file(gt->i915);
464 	if (IS_ERR(file))
465 		return PTR_ERR(file);
466 
467 	ctx = live_context(gt->i915, file);
468 	if (IS_ERR(ctx)) {
469 		err = PTR_ERR(ctx);
470 		goto out;
471 	}
472 
473 	i915_gem_context_clear_bannable(ctx);
474 	for_each_engine(engine, gt, id) {
475 		unsigned int reset_count, reset_engine_count;
476 		unsigned int count;
477 		IGT_TIMEOUT(end_time);
478 
479 		reset_count = i915_reset_count(global);
480 		reset_engine_count = i915_reset_engine_count(global, engine);
481 		count = 0;
482 
483 		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
484 		do {
485 			int i;
486 
487 			if (!wait_for_idle(engine)) {
488 				pr_err("%s failed to idle before reset\n",
489 				       engine->name);
490 				err = -EIO;
491 				break;
492 			}
493 
494 			for (i = 0; i < 16; i++) {
495 				struct i915_request *rq;
496 
497 				rq = igt_request_alloc(ctx, engine);
498 				if (IS_ERR(rq)) {
499 					err = PTR_ERR(rq);
500 					break;
501 				}
502 
503 				i915_request_add(rq);
504 			}
505 			err = intel_engine_reset(engine, NULL);
506 			if (err) {
507 				pr_err("i915_reset_engine failed\n");
508 				break;
509 			}
510 
511 			if (i915_reset_count(global) != reset_count) {
512 				pr_err("Full GPU reset recorded! (engine reset expected)\n");
513 				err = -EINVAL;
514 				break;
515 			}
516 
517 			if (i915_reset_engine_count(global, engine) !=
518 			    reset_engine_count + ++count) {
519 				pr_err("%s engine reset not recorded!\n",
520 				       engine->name);
521 				err = -EINVAL;
522 				break;
523 			}
524 		} while (time_before(jiffies, end_time));
525 		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
526 		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
527 
528 		if (err)
529 			break;
530 
531 		err = igt_flush_test(gt->i915);
532 		if (err)
533 			break;
534 	}
535 
536 	err = igt_flush_test(gt->i915);
537 out:
538 	mock_file_free(gt->i915, file);
539 	if (intel_gt_is_wedged(gt))
540 		err = -EIO;
541 	return err;
542 }
543 
/*
 * Repeatedly reset each engine on its own and verify the bookkeeping.
 *
 * @gt: GT under test
 * @active: when true, a hanging request is submitted and confirmed to be
 *          running before each reset; when false the engine is reset while
 *          idle
 *
 * After every reset the global reset count must be unchanged and the
 * per-engine reset count must have advanced by one.
 *
 * Returns 0 (also when engine reset is unsupported) or a negative error
 * code; -EIO if the GT ends up wedged.
 */
static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/*
	 * Check that we can issue an engine reset, either on an idle engine
	 * (no-op) or, if @active, on an engine running a hanging batch.
	 */

	if (!intel_has_reset_engine(gt))
		return 0;

	/* The hang state is only needed (and only torn down) when @active */
	if (active) {
		err = hang_init(&h, gt);
		if (err)
			return err;
	}

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count;
		IGT_TIMEOUT(end_time);

		/* The hanging batch needs to write its seqno from the engine */
		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		/* Baselines to compare against after each reset */
		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);

		intel_engine_pm_get(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			if (active) {
				struct i915_request *rq;

				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				/* Hold a reference across the wait below */
				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			/* Each pass must bump the per-engine count by one */
			if (i915_reset_engine_count(global, engine) !=
			    ++reset_engine_count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		intel_engine_pm_put(engine);

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

	/* A wedged GT overrides whatever error was recorded above */
	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (active)
		hang_fini(&h);

	return err;
}
650 
651 static int igt_reset_idle_engine(void *arg)
652 {
653 	return __igt_reset_engine(arg, false);
654 }
655 
656 static int igt_reset_active_engine(void *arg)
657 {
658 	return __igt_reset_engine(arg, true);
659 }
660 
/*
 * Per-engine bookkeeping used by __igt_reset_engines() for the background
 * submission threads it starts.
 */
struct active_engine {
	/* kthread running active_engine(), or NULL if none was started */
	struct task_struct *task;
	/* Engine the thread submits requests to */
	struct intel_engine_cs *engine;
	/* Engine reset count sampled before the test loop begins */
	unsigned long resets;
	/* TEST_* flags of the current phase */
	unsigned int flags;
};

/*
 * Phase flags for __igt_reset_engines():
 *   TEST_ACTIVE   - submit a hanging request on the engine before each reset
 *   TEST_OTHERS   - run background submission threads on the other engines
 *   TEST_SELF     - also run a background thread on the engine being reset
 *   TEST_PRIORITY - randomise the priorities of the background contexts and
 *                   raise the priority of the hanging context
 */
#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)
672 
673 static int active_request_put(struct i915_request *rq)
674 {
675 	int err = 0;
676 
677 	if (!rq)
678 		return 0;
679 
680 	if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
681 		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
682 			  rq->engine->name,
683 			  rq->fence.context,
684 			  rq->fence.seqno);
685 		GEM_TRACE_DUMP();
686 
687 		intel_gt_set_wedged(rq->engine->gt);
688 		err = -EIO;
689 	}
690 
691 	i915_request_put(rq);
692 
693 	return err;
694 }
695 
696 static int active_engine(void *data)
697 {
698 	I915_RND_STATE(prng);
699 	struct active_engine *arg = data;
700 	struct intel_engine_cs *engine = arg->engine;
701 	struct i915_request *rq[8] = {};
702 	struct i915_gem_context *ctx[ARRAY_SIZE(rq)];
703 	struct drm_file *file;
704 	unsigned long count = 0;
705 	int err = 0;
706 
707 	file = mock_file(engine->i915);
708 	if (IS_ERR(file))
709 		return PTR_ERR(file);
710 
711 	for (count = 0; count < ARRAY_SIZE(ctx); count++) {
712 		ctx[count] = live_context(engine->i915, file);
713 		if (IS_ERR(ctx[count])) {
714 			err = PTR_ERR(ctx[count]);
715 			while (--count)
716 				i915_gem_context_put(ctx[count]);
717 			goto err_file;
718 		}
719 	}
720 
721 	while (!kthread_should_stop()) {
722 		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
723 		struct i915_request *old = rq[idx];
724 		struct i915_request *new;
725 
726 		new = igt_request_alloc(ctx[idx], engine);
727 		if (IS_ERR(new)) {
728 			err = PTR_ERR(new);
729 			break;
730 		}
731 
732 		if (arg->flags & TEST_PRIORITY)
733 			ctx[idx]->sched.priority =
734 				i915_prandom_u32_max_state(512, &prng);
735 
736 		rq[idx] = i915_request_get(new);
737 		i915_request_add(new);
738 
739 		err = active_request_put(old);
740 		if (err)
741 			break;
742 
743 		cond_resched();
744 	}
745 
746 	for (count = 0; count < ARRAY_SIZE(rq); count++) {
747 		int err__ = active_request_put(rq[count]);
748 
749 		/* Keep the first error */
750 		if (!err)
751 			err = err__;
752 	}
753 
754 err_file:
755 	mock_file_free(engine->i915, file);
756 	return err;
757 }
758 
759 static int __igt_reset_engines(struct intel_gt *gt,
760 			       const char *test_name,
761 			       unsigned int flags)
762 {
763 	struct i915_gpu_error *global = &gt->i915->gpu_error;
764 	struct intel_engine_cs *engine, *other;
765 	enum intel_engine_id id, tmp;
766 	struct hang h;
767 	int err = 0;
768 
769 	/* Check that issuing a reset on one engine does not interfere
770 	 * with any other engine.
771 	 */
772 
773 	if (!intel_has_reset_engine(gt))
774 		return 0;
775 
776 	if (flags & TEST_ACTIVE) {
777 		err = hang_init(&h, gt);
778 		if (err)
779 			return err;
780 
781 		if (flags & TEST_PRIORITY)
782 			h.ctx->sched.priority = 1024;
783 	}
784 
785 	for_each_engine(engine, gt, id) {
786 		struct active_engine threads[I915_NUM_ENGINES] = {};
787 		unsigned long device = i915_reset_count(global);
788 		unsigned long count = 0, reported;
789 		IGT_TIMEOUT(end_time);
790 
791 		if (flags & TEST_ACTIVE &&
792 		    !intel_engine_can_store_dword(engine))
793 			continue;
794 
795 		if (!wait_for_idle(engine)) {
796 			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
797 			       engine->name, test_name);
798 			err = -EIO;
799 			break;
800 		}
801 
802 		memset(threads, 0, sizeof(threads));
803 		for_each_engine(other, gt, tmp) {
804 			struct task_struct *tsk;
805 
806 			threads[tmp].resets =
807 				i915_reset_engine_count(global, other);
808 
809 			if (!(flags & TEST_OTHERS))
810 				continue;
811 
812 			if (other == engine && !(flags & TEST_SELF))
813 				continue;
814 
815 			threads[tmp].engine = other;
816 			threads[tmp].flags = flags;
817 
818 			tsk = kthread_run(active_engine, &threads[tmp],
819 					  "igt/%s", other->name);
820 			if (IS_ERR(tsk)) {
821 				err = PTR_ERR(tsk);
822 				goto unwind;
823 			}
824 
825 			threads[tmp].task = tsk;
826 			get_task_struct(tsk);
827 		}
828 
829 		yield(); /* start all threads before we begin */
830 
831 		intel_engine_pm_get(engine);
832 		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
833 		do {
834 			struct i915_request *rq = NULL;
835 
836 			if (flags & TEST_ACTIVE) {
837 				rq = hang_create_request(&h, engine);
838 				if (IS_ERR(rq)) {
839 					err = PTR_ERR(rq);
840 					break;
841 				}
842 
843 				i915_request_get(rq);
844 				i915_request_add(rq);
845 
846 				if (!wait_until_running(&h, rq)) {
847 					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
848 
849 					pr_err("%s: Failed to start request %llx, at %x\n",
850 					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
851 					intel_engine_dump(engine, &p,
852 							  "%s\n", engine->name);
853 
854 					i915_request_put(rq);
855 					err = -EIO;
856 					break;
857 				}
858 			}
859 
860 			err = intel_engine_reset(engine, NULL);
861 			if (err) {
862 				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
863 				       engine->name, test_name, err);
864 				break;
865 			}
866 
867 			count++;
868 
869 			if (rq) {
870 				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
871 					struct drm_printer p =
872 						drm_info_printer(gt->i915->drm.dev);
873 
874 					pr_err("i915_reset_engine(%s:%s):"
875 					       " failed to complete request after reset\n",
876 					       engine->name, test_name);
877 					intel_engine_dump(engine, &p,
878 							  "%s\n", engine->name);
879 					i915_request_put(rq);
880 
881 					GEM_TRACE_DUMP();
882 					intel_gt_set_wedged(gt);
883 					err = -EIO;
884 					break;
885 				}
886 
887 				i915_request_put(rq);
888 			}
889 
890 			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
891 				struct drm_printer p =
892 					drm_info_printer(gt->i915->drm.dev);
893 
894 				pr_err("i915_reset_engine(%s:%s):"
895 				       " failed to idle after reset\n",
896 				       engine->name, test_name);
897 				intel_engine_dump(engine, &p,
898 						  "%s\n", engine->name);
899 
900 				err = -EIO;
901 				break;
902 			}
903 		} while (time_before(jiffies, end_time));
904 		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
905 		intel_engine_pm_put(engine);
906 		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
907 			engine->name, test_name, count);
908 
909 		reported = i915_reset_engine_count(global, engine);
910 		reported -= threads[engine->id].resets;
911 		if (reported != count) {
912 			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
913 			       engine->name, test_name, count, reported);
914 			if (!err)
915 				err = -EINVAL;
916 		}
917 
918 unwind:
919 		for_each_engine(other, gt, tmp) {
920 			int ret;
921 
922 			if (!threads[tmp].task)
923 				continue;
924 
925 			ret = kthread_stop(threads[tmp].task);
926 			if (ret) {
927 				pr_err("kthread for other engine %s failed, err=%d\n",
928 				       other->name, ret);
929 				if (!err)
930 					err = ret;
931 			}
932 			put_task_struct(threads[tmp].task);
933 
934 			if (other->uabi_class != engine->uabi_class &&
935 			    threads[tmp].resets !=
936 			    i915_reset_engine_count(global, other)) {
937 				pr_err("Innocent engine %s was reset (count=%ld)\n",
938 				       other->name,
939 				       i915_reset_engine_count(global, other) -
940 				       threads[tmp].resets);
941 				if (!err)
942 					err = -EINVAL;
943 			}
944 		}
945 
946 		if (device != i915_reset_count(global)) {
947 			pr_err("Global reset (count=%ld)!\n",
948 			       i915_reset_count(global) - device);
949 			if (!err)
950 				err = -EINVAL;
951 		}
952 
953 		if (err)
954 			break;
955 
956 		err = igt_flush_test(gt->i915);
957 		if (err)
958 			break;
959 	}
960 
961 	if (intel_gt_is_wedged(gt))
962 		err = -EIO;
963 
964 	if (flags & TEST_ACTIVE)
965 		hang_fini(&h);
966 
967 	return err;
968 }
969 
970 static int igt_reset_engines(void *arg)
971 {
972 	static const struct {
973 		const char *name;
974 		unsigned int flags;
975 	} phases[] = {
976 		{ "idle", 0 },
977 		{ "active", TEST_ACTIVE },
978 		{ "others-idle", TEST_OTHERS },
979 		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
980 		{
981 			"others-priority",
982 			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
983 		},
984 		{
985 			"self-priority",
986 			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
987 		},
988 		{ }
989 	};
990 	struct intel_gt *gt = arg;
991 	typeof(*phases) *p;
992 	int err;
993 
994 	for (p = phases; p->name; p++) {
995 		if (p->flags & TEST_PRIORITY) {
996 			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
997 				continue;
998 		}
999 
1000 		err = __igt_reset_engines(arg, p->name, p->flags);
1001 		if (err)
1002 			return err;
1003 	}
1004 
1005 	return 0;
1006 }
1007 
1008 static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
1009 {
1010 	u32 count = i915_reset_count(&gt->i915->gpu_error);
1011 
1012 	intel_gt_reset(gt, mask, NULL);
1013 
1014 	return count;
1015 }
1016 
1017 static int igt_reset_wait(void *arg)
1018 {
1019 	struct intel_gt *gt = arg;
1020 	struct i915_gpu_error *global = &gt->i915->gpu_error;
1021 	struct intel_engine_cs *engine = gt->engine[RCS0];
1022 	struct i915_request *rq;
1023 	unsigned int reset_count;
1024 	struct hang h;
1025 	long timeout;
1026 	int err;
1027 
1028 	if (!engine || !intel_engine_can_store_dword(engine))
1029 		return 0;
1030 
1031 	/* Check that we detect a stuck waiter and issue a reset */
1032 
1033 	igt_global_reset_lock(gt);
1034 
1035 	err = hang_init(&h, gt);
1036 	if (err)
1037 		goto unlock;
1038 
1039 	rq = hang_create_request(&h, engine);
1040 	if (IS_ERR(rq)) {
1041 		err = PTR_ERR(rq);
1042 		goto fini;
1043 	}
1044 
1045 	i915_request_get(rq);
1046 	i915_request_add(rq);
1047 
1048 	if (!wait_until_running(&h, rq)) {
1049 		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1050 
1051 		pr_err("%s: Failed to start request %llx, at %x\n",
1052 		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
1053 		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1054 
1055 		intel_gt_set_wedged(gt);
1056 
1057 		err = -EIO;
1058 		goto out_rq;
1059 	}
1060 
1061 	reset_count = fake_hangcheck(gt, ALL_ENGINES);
1062 
1063 	timeout = i915_request_wait(rq, 0, 10);
1064 	if (timeout < 0) {
1065 		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
1066 		       timeout);
1067 		err = timeout;
1068 		goto out_rq;
1069 	}
1070 
1071 	if (i915_reset_count(global) == reset_count) {
1072 		pr_err("No GPU reset recorded!\n");
1073 		err = -EINVAL;
1074 		goto out_rq;
1075 	}
1076 
1077 out_rq:
1078 	i915_request_put(rq);
1079 fini:
1080 	hang_fini(&h);
1081 unlock:
1082 	igt_global_reset_unlock(gt);
1083 
1084 	if (intel_gt_is_wedged(gt))
1085 		return -EIO;
1086 
1087 	return err;
1088 }
1089 
/* Argument block handed to the evict_vma() / evict_fence() kthreads */
struct evict_vma {
	/* Completed by the thread as soon as it starts running */
	struct completion completion;
	/* The vma the thread operates on */
	struct i915_vma *vma;
};
1094 
1095 static int evict_vma(void *data)
1096 {
1097 	struct evict_vma *arg = data;
1098 	struct i915_address_space *vm = arg->vma->vm;
1099 	struct drm_mm_node evict = arg->vma->node;
1100 	int err;
1101 
1102 	complete(&arg->completion);
1103 
1104 	mutex_lock(&vm->mutex);
1105 	err = i915_gem_evict_for_node(vm, &evict, 0);
1106 	mutex_unlock(&vm->mutex);
1107 
1108 	return err;
1109 }
1110 
1111 static int evict_fence(void *data)
1112 {
1113 	struct evict_vma *arg = data;
1114 	int err;
1115 
1116 	complete(&arg->completion);
1117 
1118 	/* Mark the fence register as dirty to force the mmio update. */
1119 	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
1120 	if (err) {
1121 		pr_err("Invalid Y-tiling settings; err:%d\n", err);
1122 		return err;
1123 	}
1124 
1125 	err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
1126 	if (err) {
1127 		pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
1128 		return err;
1129 	}
1130 
1131 	err = i915_vma_pin_fence(arg->vma);
1132 	i915_vma_unpin(arg->vma);
1133 	if (err) {
1134 		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
1135 		return err;
1136 	}
1137 
1138 	i915_vma_unpin_fence(arg->vma);
1139 
1140 	return 0;
1141 }
1142 
/*
 * Check that we can recover an unbind stuck on a hanging request.
 *
 * @gt: GT under test
 * @vm: address space in which the victim object is bound
 * @fn: kthread body (evict_vma() or evict_fence()) that operates on the vma
 *      while it is still in use by the hanging request
 * @flags: EXEC_OBJECT_* flags; EXEC_OBJECT_NEEDS_FENCE makes the object
 *         X-tiled and fenced, EXEC_OBJECT_WRITE is passed on when the vma is
 *         made active
 *
 * A 1MiB object is attached to a hanging request on RCS0, @fn is started in
 * a kthread, and once the thread has installed a callback on the request's
 * fence a reset of that engine is forced. The thread is then expected to
 * finish before the wedge-on-timeout fires.
 *
 * Returns 0 (also when the test does not apply), the thread's return value,
 * or a negative error code; -EIO if the GT ends up wedged.
 */
static int __igt_reset_evict_vma(struct intel_gt *gt,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	unsigned int pin_flags;
	int err;

	/* Fence variant is meaningless without fence registers */
	if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	err = hang_init(&h, gt);
	if (err)
		return err;

	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		goto out_obj;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_obj;
	}

	pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		pin_flags |= PIN_MAPPABLE;

	/* The request was created but not yet added: add it on every path */
	err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
	if (err) {
		i915_request_add(rq);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	/* Tie the victim vma to the hanging request */
	i915_vma_lock(arg.vma);
	err = i915_request_await_object(rq, arg.vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(arg.vma, rq, flags);
	i915_vma_unlock(arg.vma);

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		/* err stays 0 here; the wedged check at the end yields -EIO */
		intel_gt_set_wedged(gt);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		tsk = NULL;
		goto out_reset;
	}
	get_task_struct(tsk);

	wait_for_completion(&arg.completion);

	/* The thread is considered blocked once a fence callback appears */
	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

out_reset:
	igt_global_reset_lock(gt);
	fake_hangcheck(gt, rq->engine->mask);
	igt_global_reset_unlock(gt);

	if (tsk) {
		struct intel_wedge_me w;

		/*
		 * The reset, even indirectly, should be quick; wedge the GT
		 * if the thread has not finished within 100ms.
		 */
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}
1291 
1292 static int igt_reset_evict_ggtt(void *arg)
1293 {
1294 	struct intel_gt *gt = arg;
1295 
1296 	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1297 				     evict_vma, EXEC_OBJECT_WRITE);
1298 }
1299 
1300 static int igt_reset_evict_ppgtt(void *arg)
1301 {
1302 	struct intel_gt *gt = arg;
1303 	struct i915_gem_context *ctx;
1304 	struct i915_address_space *vm;
1305 	struct drm_file *file;
1306 	int err;
1307 
1308 	file = mock_file(gt->i915);
1309 	if (IS_ERR(file))
1310 		return PTR_ERR(file);
1311 
1312 	ctx = live_context(gt->i915, file);
1313 	if (IS_ERR(ctx)) {
1314 		err = PTR_ERR(ctx);
1315 		goto out;
1316 	}
1317 
1318 	err = 0;
1319 	vm = i915_gem_context_get_vm_rcu(ctx);
1320 	if (!i915_is_ggtt(vm)) {
1321 		/* aliasing == global gtt locking, covered above */
1322 		err = __igt_reset_evict_vma(gt, vm,
1323 					    evict_vma, EXEC_OBJECT_WRITE);
1324 	}
1325 	i915_vm_put(vm);
1326 
1327 out:
1328 	mock_file_free(gt->i915, file);
1329 	return err;
1330 }
1331 
1332 static int igt_reset_evict_fence(void *arg)
1333 {
1334 	struct intel_gt *gt = arg;
1335 
1336 	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1337 				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
1338 }
1339 
1340 static int wait_for_others(struct intel_gt *gt,
1341 			   struct intel_engine_cs *exclude)
1342 {
1343 	struct intel_engine_cs *engine;
1344 	enum intel_engine_id id;
1345 
1346 	for_each_engine(engine, gt, id) {
1347 		if (engine == exclude)
1348 			continue;
1349 
1350 		if (!wait_for_idle(engine))
1351 			return -EIO;
1352 	}
1353 
1354 	return 0;
1355 }
1356 
1357 static int igt_reset_queue(void *arg)
1358 {
1359 	struct intel_gt *gt = arg;
1360 	struct i915_gpu_error *global = &gt->i915->gpu_error;
1361 	struct intel_engine_cs *engine;
1362 	enum intel_engine_id id;
1363 	struct hang h;
1364 	int err;
1365 
1366 	/* Check that we replay pending requests following a hang */
1367 
1368 	igt_global_reset_lock(gt);
1369 
1370 	err = hang_init(&h, gt);
1371 	if (err)
1372 		goto unlock;
1373 
1374 	for_each_engine(engine, gt, id) {
1375 		struct i915_request *prev;
1376 		IGT_TIMEOUT(end_time);
1377 		unsigned int count;
1378 
1379 		if (!intel_engine_can_store_dword(engine))
1380 			continue;
1381 
1382 		prev = hang_create_request(&h, engine);
1383 		if (IS_ERR(prev)) {
1384 			err = PTR_ERR(prev);
1385 			goto fini;
1386 		}
1387 
1388 		i915_request_get(prev);
1389 		i915_request_add(prev);
1390 
1391 		count = 0;
1392 		do {
1393 			struct i915_request *rq;
1394 			unsigned int reset_count;
1395 
1396 			rq = hang_create_request(&h, engine);
1397 			if (IS_ERR(rq)) {
1398 				err = PTR_ERR(rq);
1399 				goto fini;
1400 			}
1401 
1402 			i915_request_get(rq);
1403 			i915_request_add(rq);
1404 
1405 			/*
1406 			 * XXX We don't handle resetting the kernel context
1407 			 * very well. If we trigger a device reset twice in
1408 			 * quick succession while the kernel context is
1409 			 * executing, we may end up skipping the breadcrumb.
1410 			 * This is really only a problem for the selftest as
1411 			 * normally there is a large interlude between resets
1412 			 * (hangcheck), or we focus on resetting just one
1413 			 * engine and so avoid repeatedly resetting innocents.
1414 			 */
1415 			err = wait_for_others(gt, engine);
1416 			if (err) {
1417 				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
1418 				       __func__, engine->name);
1419 				i915_request_put(rq);
1420 				i915_request_put(prev);
1421 
1422 				GEM_TRACE_DUMP();
1423 				intel_gt_set_wedged(gt);
1424 				goto fini;
1425 			}
1426 
1427 			if (!wait_until_running(&h, prev)) {
1428 				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1429 
1430 				pr_err("%s(%s): Failed to start request %llx, at %x\n",
1431 				       __func__, engine->name,
1432 				       prev->fence.seqno, hws_seqno(&h, prev));
1433 				intel_engine_dump(engine, &p,
1434 						  "%s\n", engine->name);
1435 
1436 				i915_request_put(rq);
1437 				i915_request_put(prev);
1438 
1439 				intel_gt_set_wedged(gt);
1440 
1441 				err = -EIO;
1442 				goto fini;
1443 			}
1444 
1445 			reset_count = fake_hangcheck(gt, BIT(id));
1446 
1447 			if (prev->fence.error != -EIO) {
1448 				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
1449 				       prev->fence.error);
1450 				i915_request_put(rq);
1451 				i915_request_put(prev);
1452 				err = -EINVAL;
1453 				goto fini;
1454 			}
1455 
1456 			if (rq->fence.error) {
1457 				pr_err("Fence error status not zero [%d] after unrelated reset\n",
1458 				       rq->fence.error);
1459 				i915_request_put(rq);
1460 				i915_request_put(prev);
1461 				err = -EINVAL;
1462 				goto fini;
1463 			}
1464 
1465 			if (i915_reset_count(global) == reset_count) {
1466 				pr_err("No GPU reset recorded!\n");
1467 				i915_request_put(rq);
1468 				i915_request_put(prev);
1469 				err = -EINVAL;
1470 				goto fini;
1471 			}
1472 
1473 			i915_request_put(prev);
1474 			prev = rq;
1475 			count++;
1476 		} while (time_before(jiffies, end_time));
1477 		pr_info("%s: Completed %d resets\n", engine->name, count);
1478 
1479 		*h.batch = MI_BATCH_BUFFER_END;
1480 		intel_gt_chipset_flush(engine->gt);
1481 
1482 		i915_request_put(prev);
1483 
1484 		err = igt_flush_test(gt->i915);
1485 		if (err)
1486 			break;
1487 	}
1488 
1489 fini:
1490 	hang_fini(&h);
1491 unlock:
1492 	igt_global_reset_unlock(gt);
1493 
1494 	if (intel_gt_is_wedged(gt))
1495 		return -EIO;
1496 
1497 	return err;
1498 }
1499 
1500 static int igt_handle_error(void *arg)
1501 {
1502 	struct intel_gt *gt = arg;
1503 	struct i915_gpu_error *global = &gt->i915->gpu_error;
1504 	struct intel_engine_cs *engine = gt->engine[RCS0];
1505 	struct hang h;
1506 	struct i915_request *rq;
1507 	struct i915_gpu_state *error;
1508 	int err;
1509 
1510 	/* Check that we can issue a global GPU and engine reset */
1511 
1512 	if (!intel_has_reset_engine(gt))
1513 		return 0;
1514 
1515 	if (!engine || !intel_engine_can_store_dword(engine))
1516 		return 0;
1517 
1518 	err = hang_init(&h, gt);
1519 	if (err)
1520 		return err;
1521 
1522 	rq = hang_create_request(&h, engine);
1523 	if (IS_ERR(rq)) {
1524 		err = PTR_ERR(rq);
1525 		goto err_fini;
1526 	}
1527 
1528 	i915_request_get(rq);
1529 	i915_request_add(rq);
1530 
1531 	if (!wait_until_running(&h, rq)) {
1532 		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1533 
1534 		pr_err("%s: Failed to start request %llx, at %x\n",
1535 		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
1536 		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1537 
1538 		intel_gt_set_wedged(gt);
1539 
1540 		err = -EIO;
1541 		goto err_request;
1542 	}
1543 
1544 	/* Temporarily disable error capture */
1545 	error = xchg(&global->first_error, (void *)-1);
1546 
1547 	intel_gt_handle_error(gt, engine->mask, 0, NULL);
1548 
1549 	xchg(&global->first_error, error);
1550 
1551 	if (rq->fence.error != -EIO) {
1552 		pr_err("Guilty request not identified!\n");
1553 		err = -EINVAL;
1554 		goto err_request;
1555 	}
1556 
1557 err_request:
1558 	i915_request_put(rq);
1559 err_fini:
1560 	hang_fini(&h);
1561 	return err;
1562 }
1563 
1564 static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
1565 				     const struct igt_atomic_section *p,
1566 				     const char *mode)
1567 {
1568 	struct tasklet_struct * const t = &engine->execlists.tasklet;
1569 	int err;
1570 
1571 	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
1572 		  engine->name, mode, p->name);
1573 
1574 	tasklet_disable(t);
1575 	p->critical_section_begin();
1576 
1577 	err = intel_engine_reset(engine, NULL);
1578 
1579 	p->critical_section_end();
1580 	tasklet_enable(t);
1581 
1582 	if (err)
1583 		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
1584 		       engine->name, mode, p->name);
1585 
1586 	return err;
1587 }
1588 
1589 static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
1590 				   const struct igt_atomic_section *p)
1591 {
1592 	struct i915_request *rq;
1593 	struct hang h;
1594 	int err;
1595 
1596 	err = __igt_atomic_reset_engine(engine, p, "idle");
1597 	if (err)
1598 		return err;
1599 
1600 	err = hang_init(&h, engine->gt);
1601 	if (err)
1602 		return err;
1603 
1604 	rq = hang_create_request(&h, engine);
1605 	if (IS_ERR(rq)) {
1606 		err = PTR_ERR(rq);
1607 		goto out;
1608 	}
1609 
1610 	i915_request_get(rq);
1611 	i915_request_add(rq);
1612 
1613 	if (wait_until_running(&h, rq)) {
1614 		err = __igt_atomic_reset_engine(engine, p, "active");
1615 	} else {
1616 		pr_err("%s(%s): Failed to start request %llx, at %x\n",
1617 		       __func__, engine->name,
1618 		       rq->fence.seqno, hws_seqno(&h, rq));
1619 		intel_gt_set_wedged(engine->gt);
1620 		err = -EIO;
1621 	}
1622 
1623 	if (err == 0) {
1624 		struct intel_wedge_me w;
1625 
1626 		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
1627 			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
1628 		if (intel_gt_is_wedged(engine->gt))
1629 			err = -EIO;
1630 	}
1631 
1632 	i915_request_put(rq);
1633 out:
1634 	hang_fini(&h);
1635 	return err;
1636 }
1637 
1638 static int igt_reset_engines_atomic(void *arg)
1639 {
1640 	struct intel_gt *gt = arg;
1641 	const typeof(*igt_atomic_phases) *p;
1642 	int err = 0;
1643 
1644 	/* Check that the engines resets are usable from atomic context */
1645 
1646 	if (!intel_has_reset_engine(gt))
1647 		return 0;
1648 
1649 	if (USES_GUC_SUBMISSION(gt->i915))
1650 		return 0;
1651 
1652 	igt_global_reset_lock(gt);
1653 
1654 	/* Flush any requests before we get started and check basics */
1655 	if (!igt_force_reset(gt))
1656 		goto unlock;
1657 
1658 	for (p = igt_atomic_phases; p->name; p++) {
1659 		struct intel_engine_cs *engine;
1660 		enum intel_engine_id id;
1661 
1662 		for_each_engine(engine, gt, id) {
1663 			err = igt_atomic_reset_engine(engine, p);
1664 			if (err)
1665 				goto out;
1666 		}
1667 	}
1668 
1669 out:
1670 	/* As we poke around the guts, do a full reset before continuing. */
1671 	igt_force_reset(gt);
1672 unlock:
1673 	igt_global_reset_unlock(gt);
1674 
1675 	return err;
1676 }
1677 
1678 int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
1679 {
1680 	static const struct i915_subtest tests[] = {
1681 		SUBTEST(igt_hang_sanitycheck),
1682 		SUBTEST(igt_reset_nop),
1683 		SUBTEST(igt_reset_nop_engine),
1684 		SUBTEST(igt_reset_idle_engine),
1685 		SUBTEST(igt_reset_active_engine),
1686 		SUBTEST(igt_reset_engines),
1687 		SUBTEST(igt_reset_engines_atomic),
1688 		SUBTEST(igt_reset_queue),
1689 		SUBTEST(igt_reset_wait),
1690 		SUBTEST(igt_reset_evict_ggtt),
1691 		SUBTEST(igt_reset_evict_ppgtt),
1692 		SUBTEST(igt_reset_evict_fence),
1693 		SUBTEST(igt_handle_error),
1694 	};
1695 	struct intel_gt *gt = &i915->gt;
1696 	intel_wakeref_t wakeref;
1697 	int err;
1698 
1699 	if (!intel_has_gpu_reset(gt))
1700 		return 0;
1701 
1702 	if (intel_gt_is_wedged(gt))
1703 		return -EIO; /* we're long past hope of a successful reset */
1704 
1705 	wakeref = intel_runtime_pm_get(gt->uncore->rpm);
1706 
1707 	err = intel_gt_live_subtests(tests, gt);
1708 
1709 	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
1710 
1711 	return err;
1712 }
1713